Switch to high-quality Piper TTS and add label translations

This commit is contained in:
Matthias Hinrichs
2026-01-29 23:48:19 +01:00
commit d80c619df9
3934 changed files with 1451600 additions and 0 deletions
@@ -0,0 +1,117 @@
"""Piper configuration"""
from dataclasses import dataclass, field
from enum import Enum
from typing import Any, Mapping, Optional, Sequence
# Fallback inference settings; PiperConfig uses these as field defaults and
# when the "inference" section of a voice config omits the matching key.
DEFAULT_LENGTH_SCALE = 1.0  # phoneme length scale
DEFAULT_NOISE_SCALE = 0.667  # generator noise
DEFAULT_NOISE_W_SCALE = 0.8  # phoneme width noise
class PhonemeType(str, Enum):
    """Kind of phonemes a voice uses; members are also plain ``str`` values."""

    # Value "espeak", the default when a config has no "phoneme_type" key.
    ESPEAK = "espeak"

    # Value "text".
    TEXT = "text"
@dataclass
class PiperConfig:
    """Voice-level Piper settings, convertible to and from a plain dictionary."""

    num_symbols: int
    """Count of phonemes."""

    num_speakers: int
    """Count of speakers."""

    sample_rate: int
    """Sample rate of the output audio."""

    espeak_voice: str
    """espeak-ng voice name or alphabet name."""

    phoneme_id_map: Mapping[str, Sequence[int]]
    """Maps each phoneme to its sequence of ids."""

    phoneme_type: PhonemeType
    """Either espeak or text."""

    speaker_id_map: Mapping[str, int] = field(default_factory=dict)
    """Maps each speaker to its id."""

    piper_version: Optional[str] = None

    # Inference settings
    length_scale: float = DEFAULT_LENGTH_SCALE
    noise_scale: float = DEFAULT_NOISE_SCALE
    noise_w_scale: float = DEFAULT_NOISE_W_SCALE

    @staticmethod
    def from_dict(config: dict[str, Any]) -> "PiperConfig":
        """Build a configuration object from a dictionary.

        Required keys are ``num_symbols``, ``num_speakers``,
        ``audio.sample_rate``, ``espeak.voice`` and ``phoneme_id_map``; a
        missing one raises ``KeyError``. The ``inference`` section,
        ``phoneme_type``, ``speaker_id_map`` and ``piper_version`` are
        optional and fall back to defaults.
        """
        inference_section = config.get("inference", {})

        # Lookups are kept in a fixed order so the first missing key reported
        # stays the same.
        symbol_count = config["num_symbols"]
        speaker_count = config["num_speakers"]
        rate = config["audio"]["sample_rate"]

        noise = inference_section.get("noise_scale", DEFAULT_NOISE_SCALE)
        length = inference_section.get("length_scale", DEFAULT_LENGTH_SCALE)
        # The dictionary key is "noise_w", while the attribute is noise_w_scale.
        noise_w = inference_section.get("noise_w", DEFAULT_NOISE_W_SCALE)

        voice = config["espeak"]["voice"]
        id_map = config["phoneme_id_map"]
        raw_phoneme_type = config.get("phoneme_type", PhonemeType.ESPEAK)
        kind = PhonemeType(raw_phoneme_type)
        speakers = config.get("speaker_id_map", {})

        version = config.get("piper_version")

        return PiperConfig(
            num_symbols=symbol_count,
            num_speakers=speaker_count,
            sample_rate=rate,
            espeak_voice=voice,
            phoneme_id_map=id_map,
            phoneme_type=kind,
            speaker_id_map=speakers,
            piper_version=version,
            length_scale=length,
            noise_scale=noise,
            noise_w_scale=noise_w,
        )

    def to_dict(self) -> dict[str, Any]:
        """Return the configuration as a dictionary.

        The layout mirrors what ``from_dict`` reads. The two id maps are
        passed through without copying, and ``piper_version`` is included only
        when it is truthy.
        """
        audio_section = {"sample_rate": self.sample_rate}
        espeak_section = {"voice": self.espeak_voice}
        inference_section = {
            "noise_scale": self.noise_scale,
            "length_scale": self.length_scale,
            "noise_w": self.noise_w_scale,
        }

        serialized: dict[str, Any] = {}
        serialized["audio"] = audio_section
        serialized["espeak"] = espeak_section
        serialized["phoneme_type"] = self.phoneme_type.value
        serialized["num_symbols"] = self.num_symbols
        serialized["num_speakers"] = self.num_speakers
        serialized["inference"] = inference_section
        serialized["phoneme_id_map"] = self.phoneme_id_map
        serialized["speaker_id_map"] = self.speaker_id_map

        if not self.piper_version:
            return serialized

        serialized["piper_version"] = self.piper_version
        return serialized
@dataclass
class SynthesisConfig:
    """Per-synthesis options for Piper.

    NOTE(review): a ``None`` scale or speaker presumably means "use the
    voice's own setting" -- confirm against the code that consumes this class.
    """

    speaker_id: Optional[int] = None
    """Speaker index to use; only relevant for multi-speaker voices."""

    length_scale: Optional[float] = None
    """Scale for phoneme length (below 1 is faster, above 1 is slower)."""

    noise_scale: Optional[float] = None
    """How much generator noise to add."""

    noise_w_scale: Optional[float] = None
    """How much phoneme width noise to add."""

    normalize_audio: bool = True
    """Whether audio samples are scaled to fit the full range."""

    volume: float = 1.0
    """Audio sample multiplier (below 1 is quieter, above 1 is louder)."""