Switch to high-quality Piper TTS and add label translations
This commit is contained in:
@@ -0,0 +1,117 @@
|
||||
"""Piper configuration"""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from enum import Enum
|
||||
from typing import Any, Mapping, Optional, Sequence
|
||||
|
||||
# Fallback inference settings: used as the PiperConfig field defaults and
# whenever a config dictionary omits the matching "inference" entry.
DEFAULT_LENGTH_SCALE = 1.0  # phoneme length scale
DEFAULT_NOISE_SCALE = 0.667  # generator noise
DEFAULT_NOISE_W_SCALE = 0.8  # phoneme width noise
|
||||
|
||||
|
||||
class PhonemeType(str, Enum):
    """Phoneme type of a voice: ``"espeak"`` or ``"text"``.

    Members subclass ``str``, so each one compares equal to its plain
    string value and can be built from that string.
    """

    ESPEAK = "espeak"
    TEXT = "text"
|
||||
|
||||
|
||||
@dataclass
class PiperConfig:
    """Piper configuration.

    Holds the model description and the default inference settings, and
    converts to and from the nested dictionary layout handled by
    ``from_dict`` and ``to_dict``.
    """

    num_symbols: int
    """Number of phonemes."""

    num_speakers: int
    """Number of speakers."""

    sample_rate: int
    """Sample rate of output audio."""

    espeak_voice: str
    """Name of espeak-ng voice or alphabet."""

    phoneme_id_map: Mapping[str, Sequence[int]]
    """Phoneme -> [id,]."""

    phoneme_type: PhonemeType
    """espeak or text."""

    speaker_id_map: Mapping[str, int] = field(default_factory=dict)
    """Speaker -> id"""

    piper_version: Optional[str] = None
    """Value of the optional ``piper_version`` config entry, if any."""

    # Inference settings
    length_scale: float = DEFAULT_LENGTH_SCALE
    noise_scale: float = DEFAULT_NOISE_SCALE
    noise_w_scale: float = DEFAULT_NOISE_W_SCALE

    @classmethod
    def from_dict(cls, config: dict[str, Any]) -> "PiperConfig":
        """Load configuration from a dictionary.

        Required keys are ``num_symbols``, ``num_speakers``,
        ``audio.sample_rate``, ``espeak.voice`` and ``phoneme_id_map``.
        ``inference``, ``phoneme_type``, ``speaker_id_map`` and
        ``piper_version`` are optional.

        Args:
            config: Nested configuration dictionary.

        Returns:
            A new instance of the class this method is called on.

        Raises:
            KeyError: If a required key is missing.
            ValueError: If ``phoneme_type`` is not a valid ``PhonemeType``.
        """
        # "inference" may be absent or explicitly null; either way every
        # inference setting falls back to its module-level default.
        inference = config.get("inference") or {}

        return cls(
            num_symbols=config["num_symbols"],
            num_speakers=config["num_speakers"],
            sample_rate=config["audio"]["sample_rate"],
            noise_scale=inference.get("noise_scale", DEFAULT_NOISE_SCALE),
            length_scale=inference.get("length_scale", DEFAULT_LENGTH_SCALE),
            # The dictionary key is "noise_w"; the attribute is noise_w_scale.
            noise_w_scale=inference.get("noise_w", DEFAULT_NOISE_W_SCALE),
            #
            espeak_voice=config["espeak"]["voice"],
            phoneme_id_map=config["phoneme_id_map"],
            phoneme_type=PhonemeType(config.get("phoneme_type", PhonemeType.ESPEAK)),
            speaker_id_map=config.get("speaker_id_map", {}),
            #
            piper_version=config.get("piper_version"),
        )

    def to_dict(self) -> dict[str, Any]:
        """Convert configuration to a dictionary.

        The result uses the same nested layout that ``from_dict`` reads.
        The phoneme and speaker maps are returned by reference, not copied.
        ``piper_version`` is only included when it is set to a non-empty
        value.
        """
        config_dict: dict[str, Any] = {
            "audio": {
                "sample_rate": self.sample_rate,
            },
            "espeak": {
                "voice": self.espeak_voice,
            },
            "phoneme_type": self.phoneme_type.value,
            "num_symbols": self.num_symbols,
            "num_speakers": self.num_speakers,
            "inference": {
                "noise_scale": self.noise_scale,
                "length_scale": self.length_scale,
                # Stored under "noise_w" to match what from_dict reads.
                "noise_w": self.noise_w_scale,
            },
            "phoneme_id_map": self.phoneme_id_map,
            "speaker_id_map": self.speaker_id_map,
        }

        if self.piper_version:
            config_dict["piper_version"] = self.piper_version

        return config_dict
|
||||
|
||||
|
||||
@dataclass
class SynthesisConfig:
    """Per-synthesis options for Piper.

    Every field is optional; the speaker and the three scale settings
    default to ``None``.
    """

    speaker_id: Optional[int] = None
    """Speaker index; only meaningful for multi-speaker voices."""

    length_scale: Optional[float] = None
    """Scale applied to phoneme length (below 1 is faster, above 1 is slower)."""

    noise_scale: Optional[float] = None
    """How much generator noise to add."""

    noise_w_scale: Optional[float] = None
    """How much phoneme width noise to add."""

    normalize_audio: bool = True
    """Whether audio samples are scaled to fit the full range."""

    volume: float = 1.0
    """Audio sample multiplier (below 1 is quieter, above 1 is louder)."""
|
||||
Reference in New Issue
Block a user