switching to high quality piper tts and added label translations

2026-01-29 23:48:19 +01:00
commit d80c619df9
3934 changed files with 1451600 additions and 0 deletions
@@ -0,0 +1,12 @@
+"""Piper text-to-speech engine."""
+
+from .config import PhonemeType, PiperConfig, SynthesisConfig
+from .voice import AudioChunk, PiperVoice
+
+__all__ = [  # Public names re-exported from .config and .voice
+    "AudioChunk",
+    "PhonemeType",
+    "PiperConfig",
+    "PiperVoice",
+    "SynthesisConfig",
+]
@@ -0,0 +1,230 @@
+"""Piper main script."""
+
+import argparse
+import logging
+import shutil
+import sys
+import tempfile
+import time
+import wave
+from collections.abc import Iterable
+from pathlib import Path
+
+from . import PiperVoice, SynthesisConfig
+from .audio_playback import AudioPlayer
+
+_FILE = Path(__file__)  # Path of this module's source file
+_DIR = _FILE.parent  # Directory that contains this module
+_LOGGER = logging.getLogger(_FILE.stem)  # Logger named after this file's stem
+
+
+def main() -> None:
+    """Run piper text-to-speech engine."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-m", "--model", required=True, help="Path to Onnx model file")
+    parser.add_argument("-c", "--config", help="Path to model config file")
+    parser.add_argument(
+        "-i",
+        "--input-file",
+        "--input_file",
+        action="append",
+        help="Paths to input text files",
+    )
+    parser.add_argument(
+        "-f",
+        "--output-file",
+        "--output_file",
+        help="Path to output WAV file (default: stdout)",
+    )
+    parser.add_argument(
+        "-d",
+        "--output-dir",
+        "--output_dir",
+        help="Path to output directory (default: cwd)",
+    )
+    parser.add_argument(
+        "--output-raw",
+        "--output_raw",
+        action="store_true",
+        help="Stream raw audio to stdout",
+    )
+    #
+    parser.add_argument("-s", "--speaker", type=int, help="Id of speaker (default: 0)")
+    parser.add_argument(
+        "--length-scale", "--length_scale", type=float, help="Phoneme length"
+    )
+    parser.add_argument(
+        "--noise-scale", "--noise_scale", type=float, help="Generator noise"
+    )
+    parser.add_argument(
+        "--noise-w-scale",
+        "--noise_w_scale",
+        "--noise-w",
+        "--noise_w",
+        type=float,
+        help="Phoneme width noise",
+    )
+    #
+    parser.add_argument("--cuda", action="store_true", help="Use GPU")
+    #
+    parser.add_argument(
+        "--sentence-silence",
+        "--sentence_silence",
+        type=float,
+        default=0.0,
+        help="Seconds of silence after each sentence",
+    )
+    parser.add_argument(
+        "--volume", type=float, default=1.0, help="Volume multiplier (default: 1.0)"
+    )
+    parser.add_argument(
+        "--no-normalize", action="store_true", help="Don't normalize audio"
+    )
+    #
+    parser.add_argument(
+        "--data-dir",
+        "--data_dir",
+        action="append",
+        default=[str(Path.cwd())],
+        help="Data directory to check for voice models (default: current directory)",
+    )
+    #
+    parser.add_argument(
+        "--debug", action="store_true", help="Print DEBUG messages to console"
+    )
+    args, unknown_args = parser.parse_known_args()
+    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
+    _LOGGER.debug(args)
+
+    if args.input_file:
+        # Input text from file(s)
+        def lines() -> Iterable[str]:
+            for input_path in args.input_file:
+                _LOGGER.debug("Reading text from %s", input_path)
+                with open(input_path, "r", encoding="utf-8") as input_file:
+                    for line in input_file:
+                        line = line.strip()
+                        if line:
+                            yield line
+
+    else:
+        # Input text from args or stdin
+        texts: Iterable[str]
+        if unknown_args:
+            texts = [" ".join(unknown_args)]
+        else:
+            texts = sys.stdin
+
+        def lines() -> Iterable[str]:
+            for line in texts:
+                line = line.strip()
+                if line:
+                    yield line
+
+    model_path = Path(args.model)
+    if not model_path.exists():
+        # Look in data directories
+        voice_name = args.model
+        for data_dir in args.data_dir:
+            maybe_model_path = Path(data_dir) / f"{voice_name}.onnx"
+            _LOGGER.debug("Checking '%s'", maybe_model_path)
+            if maybe_model_path.exists():
+                model_path = maybe_model_path
+                break
+
+    if not model_path.exists():
+        raise ValueError(
+            f"Unable to find voice: {model_path} (use piper.download_voices)"
+        )
+
+    # Load voice
+    _LOGGER.debug("Loading voice: '%s'", model_path)
+    voice = PiperVoice.load(model_path, use_cuda=args.cuda)
+    syn_config = SynthesisConfig(
+        speaker_id=args.speaker,
+        length_scale=args.length_scale,
+        noise_scale=args.noise_scale,
+        noise_w_scale=args.noise_w_scale,
+        normalize_audio=(not args.no_normalize),
+        volume=args.volume,
+    )
+
+    wav_file: wave.Wave_write
+
+    # 16-bit samples for silence
+    silence_int16_bytes = bytes(
+        int(voice.config.sample_rate * args.sentence_silence * 2)
+    )
+
+    def lines_to_wav() -> None:
+        wav_params_set = False
+        for line in lines():
+            for i, audio_chunk in enumerate(voice.synthesize(line, syn_config)):
+                if not wav_params_set:
+                    wav_file.setframerate(audio_chunk.sample_rate)
+                    wav_file.setsampwidth(audio_chunk.sample_width)
+                    wav_file.setnchannels(audio_chunk.sample_channels)
+                    wav_params_set = True
+
+                if i > 0:
+                    wav_file.writeframes(silence_int16_bytes)
+
+                wav_file.writeframes(audio_chunk.audio_int16_bytes)
+
+    if args.output_raw:
+        # Write raw audio to stdout as its produced
+        for line in lines():
+            audio_stream = voice.synthesize(line, syn_config)
+            for i, audio_chunk in enumerate(audio_stream):
+                if i > 0:
+                    sys.stdout.buffer.write(silence_int16_bytes)
+
+                sys.stdout.buffer.write(audio_chunk.audio_int16_bytes)
+                sys.stdout.buffer.flush()
+    elif args.output_dir:
+        # Write multiple WAV files to a directory, one per line
+        output_dir = Path(args.output_dir)
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+        for line in lines():
+            wav_path = output_dir / f"{time.monotonic_ns()}.wav"
+            wav_file = wave.open(str(wav_path), "wb")
+            with wav_file:
+                lines_to_wav()
+
+            _LOGGER.info("Wrote %s", wav_path)
+    else:
+        if args.output_file == "-":
+            # Write WAV file to stdout
+            with tempfile.NamedTemporaryFile("wb+", suffix=".wav") as temp_wav_file:
+                wav_file = wave.open(temp_wav_file.name, "wb")
+                lines_to_wav()
+
+                temp_wav_file.seek(0)
+                shutil.copyfileobj(temp_wav_file, sys.stdout.buffer)
+        elif (not args.output_file) and AudioPlayer.is_available():
+            # Play audio using ffplay
+            with AudioPlayer(voice.config.sample_rate) as player:
+                for line in lines():
+                    for i, audio_chunk in enumerate(voice.synthesize(line, syn_config)):
+                        if i > 0:
+                            player.play(silence_int16_bytes)
+
+                        player.play(audio_chunk.audio_int16_bytes)
+        else:
+            # Write to WAV file
+            if not args.output_file:
+                _LOGGER.warning(
+                    "Audio playback is not available (ffplay). Writing audio to output.wav."
+                )
+                args.output_file = "output.wav"
+
+            wav_file = wave.open(args.output_file, "wb")
+            with wav_file:
+                lines_to_wav()
+
+
+# -----------------------------------------------------------------------------
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,58 @@
+"""Audio playback using ffplay."""
+
+import shutil
+import subprocess
+from typing import Optional
+
+
+class AudioPlayer:
+    """Plays raw audio using ffplay."""
+
+    def __init__(self, sample_rate: int) -> None:
+        """Initialzes audio player."""
+        self.sample_rate = sample_rate
+        self._proc: Optional[subprocess.Popen] = None
+
+    def __enter__(self):
+        """Starts ffplay subprocess and returns player."""
+        self._proc = subprocess.Popen(
+            [
+                "ffplay",
+                "-nodisp",
+                "-autoexit",
+                "-f",
+                "s16le",
+                "-ar",
+                str(self.sample_rate),
+                "-ac",
+                "1",
+                "-",
+            ],
+            stdin=subprocess.PIPE,
+            stdout=subprocess.DEVNULL,
+            stderr=subprocess.DEVNULL,
+        )
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Stops ffplay subprocess."""
+        if self._proc:
+            try:
+                if self._proc.stdin:
+                    self._proc.stdin.close()
+            except Exception:
+                pass
+            self._proc.wait(timeout=5)
+
+    def play(self, audio_bytes: bytes) -> None:
+        """Plays raw audio using ffplay."""
+        assert self._proc is not None
+        assert self._proc.stdin is not None
+
+        self._proc.stdin.write(audio_bytes)
+        self._proc.stdin.flush()
+
+    @staticmethod
+    def is_available() -> bool:
+        """Returns true if ffplay is available."""
+        return bool(shutil.which("ffplay"))
@@ -0,0 +1,117 @@
+"""Piper configuration"""
+
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import Any, Mapping, Optional, Sequence
+
+DEFAULT_NOISE_SCALE = 0.667  # Used when a config has no inference "noise_scale"
+DEFAULT_LENGTH_SCALE = 1.0  # Used when a config has no inference "length_scale"
+DEFAULT_NOISE_W_SCALE = 0.8  # Used when a config has no inference "noise_w"
+
+
+class PhonemeType(str, Enum):
+    ESPEAK = "espeak"
+    TEXT = "text"
+
+
+@dataclass
+class PiperConfig:
+    """Piper configuration"""
+
+    num_symbols: int
+    """Number of phonemes."""
+
+    num_speakers: int
+    """Number of speakers."""
+
+    sample_rate: int
+    """Sample rate of output audio."""
+
+    espeak_voice: str
+    """Name of espeak-ng voice or alphabet."""
+
+    phoneme_id_map: Mapping[str, Sequence[int]]
+    """Phoneme -> [id,]."""
+
+    phoneme_type: PhonemeType
+    """espeak or text."""
+
+    speaker_id_map: Mapping[str, int] = field(default_factory=dict)
+    """Speaker -> id"""
+
+    piper_version: Optional[str] = None
+
+    # Inference settings
+    length_scale: float = DEFAULT_LENGTH_SCALE
+    noise_scale: float = DEFAULT_NOISE_SCALE
+    noise_w_scale: float = DEFAULT_NOISE_W_SCALE
+
+    @staticmethod
+    def from_dict(config: dict[str, Any]) -> "PiperConfig":
+        """Load configuration from a dictionary."""
+        inference = config.get("inference", {})
+
+        return PiperConfig(
+            num_symbols=config["num_symbols"],
+            num_speakers=config["num_speakers"],
+            sample_rate=config["audio"]["sample_rate"],
+            noise_scale=inference.get("noise_scale", DEFAULT_NOISE_SCALE),
+            length_scale=inference.get("length_scale", DEFAULT_LENGTH_SCALE),
+            noise_w_scale=inference.get("noise_w", DEFAULT_NOISE_W_SCALE),
+            #
+            espeak_voice=config["espeak"]["voice"],
+            phoneme_id_map=config["phoneme_id_map"],
+            phoneme_type=PhonemeType(config.get("phoneme_type", PhonemeType.ESPEAK)),
+            speaker_id_map=config.get("speaker_id_map", {}),
+            #
+            piper_version=config.get("piper_version"),
+        )
+
+    def to_dict(self) -> dict[str, Any]:
+        """Convert configuration to a dictionary."""
+        config_dict = {
+            "audio": {
+                "sample_rate": self.sample_rate,
+            },
+            "espeak": {
+                "voice": self.espeak_voice,
+            },
+            "phoneme_type": self.phoneme_type.value,
+            "num_symbols": self.num_symbols,
+            "num_speakers": self.num_speakers,
+            "inference": {
+                "noise_scale": self.noise_scale,
+                "length_scale": self.length_scale,
+                "noise_w": self.noise_w_scale,
+            },
+            "phoneme_id_map": self.phoneme_id_map,
+            "speaker_id_map": self.speaker_id_map,
+        }
+
+        if self.piper_version:
+            config_dict["piper_version"] = self.piper_version
+
+        return config_dict
+
+
+@dataclass
+class SynthesisConfig:
+    """Configuration for Piper synthesis."""
+
+    speaker_id: Optional[int] = None
+    """Index of speaker to use (multi-speaker voices only)."""
+
+    length_scale: Optional[float] = None
+    """Phoneme length scale (< 1 is faster, > 1 is slower)."""
+
+    noise_scale: Optional[float] = None
+    """Amount of generator noise to add."""
+
+    noise_w_scale: Optional[float] = None
+    """Amount of phoneme width noise to add."""
+
+    normalize_audio: bool = True
+    """Enable/disable scaling audio samples to fit full range."""
+
+    volume: float = 1.0
+    """Multiplier for audio samples (< 1 is quieter, > 1 is louder)."""
@@ -0,0 +1,5 @@
+"""Constants"""
+
+PAD = "_"  # padding (0)
+BOS = "^"  # beginning of sentence
+EOS = "$"  # end of sentence
@@ -0,0 +1,138 @@
+"""Command-line utility for downloading Piper voices."""
+
+import argparse
+import json
+import logging
+import re
+import shutil
+from pathlib import Path
+from urllib.request import urlopen
+
+URL_FORMAT = "https://huggingface.co/rhasspy/piper-voices/resolve/main/{lang_family}/{lang_code}/{voice_name}/{voice_quality}/{lang_code}-{voice_name}-{voice_quality}{extension}?download=true"  # Per-file URL template, formatted in download_voice
+VOICES_JSON = (  # URL of the voice index fetched by list_voices
+    "https://huggingface.co/rhasspy/piper-voices/resolve/main/voices.json?download=true"
+)
+VOICE_PATTERN = re.compile(  # Named groups: lang_family, lang_region, voice_name, voice_quality
+    r"^(?P<lang_family>[^-]+)_(?P<lang_region>[^-]+)-(?P<voice_name>[^-]+)-(?P<voice_quality>.+)$"
+)
+
+_LOGGER = logging.getLogger(__name__)
+
+
+def main() -> None:
+    """Download Piper voices."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "voice", nargs="*", help="Name of voice like 'en_US-lessac-medium'"
+    )
+    parser.add_argument(
+        "--download-dir",
+        "--download_dir",
+        "--data-dir",
+        "--data_dir",
+        help="Directory to download voices into (default: current directory)",
+    )
+    parser.add_argument(
+        "--force-redownload",
+        "--force_redownload",
+        action="store_true",
+        help="Force redownloading of voice files even if they exist already",
+    )
+    parser.add_argument(
+        "--debug", action="store_true", help="Print DEBUG logs to console"
+    )
+    args = parser.parse_args()
+
+    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
+
+    if not args.voice:
+        list_voices()
+        return
+
+    if args.download_dir:
+        download_dir = Path(args.download_dir)
+    else:
+        download_dir = Path.cwd()
+
+    download_dir.mkdir(parents=True, exist_ok=True)
+
+    for voice in args.voice:
+        download_voice(voice, download_dir, force_redownload=args.force_redownload)
+
+
+# -----------------------------------------------------------------------------
+
+
+def list_voices() -> None:
+    """List available voices and exit."""
+    _LOGGER.debug("Downloading voices.json file: '%s'", VOICES_JSON)
+    with urlopen(VOICES_JSON) as response:
+        voices_dict = json.load(response)
+
+    for voice in sorted(voices_dict.keys()):
+        print(voice)
+
+
+def download_voice(
+    voice: str, download_dir: Path, force_redownload: bool = False
+) -> None:
+    """Download a voice model and config file to a directory."""
+    voice = voice.strip()
+    voice_match = VOICE_PATTERN.match(voice)
+    if not voice_match:
+        raise ValueError(
+            f"Voice '{voice}' did not match pattern: <language>-<name>-<quality> like 'en_US-lessac-medium'",
+        )
+
+    lang_family = voice_match.group("lang_family")
+    lang_code = lang_family + "_" + voice_match.group("lang_region")
+    voice_name = voice_match.group("voice_name")
+    voice_quality = voice_match.group("voice_quality")
+
+    voice_code = f"{lang_code}-{voice_name}-{voice_quality}"
+    format_args = {
+        "lang_family": lang_family,
+        "lang_code": lang_code,
+        "voice_name": voice_name,
+        "voice_quality": voice_quality,
+    }
+
+    model_path = download_dir / f"{voice_code}.onnx"
+    if force_redownload or _needs_download(model_path):
+        model_url = URL_FORMAT.format(extension=".onnx", **format_args)
+        _LOGGER.debug("Downloading model from '%s' to '%s'", model_url, model_path)
+        with urlopen(model_url) as response:
+            with open(model_path, "wb") as model_file:
+                shutil.copyfileobj(response, model_file)
+
+        _LOGGER.debug("Downloaded: '%s'", model_path)
+
+    config_path = download_dir / f"{voice_code}.onnx.json"
+    if force_redownload or _needs_download(config_path):
+        config_url = URL_FORMAT.format(extension=".onnx.json", **format_args)
+        _LOGGER.debug("Downloading config from '%s' to '%s'", config_url, config_path)
+        with urlopen(config_url) as response:
+            with open(config_path, "wb") as config_file:
+                shutil.copyfileobj(response, config_file)
+
+        _LOGGER.debug("Downloaded: '%s'", config_path)
+
+    _LOGGER.info("Downloaded: %s", voice)
+
+
+def _needs_download(path: Path) -> bool:
+    """Return True if file needs to be downloaded."""
+    if not path.exists():
+        return True
+
+    if path.stat().st_size == 0:
+        # Empty
+        return True
+
+    return False
+
+
+# -----------------------------------------------------------------------------
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,8 @@
+name Vietnamese (Northern)
+language vi
+
+words 1 2
+pitch 95 175
+
+
+tone 100 225 800 100 2000 50 5400 75 8000 200
@@ -0,0 +1,9 @@
+name Vietnamese (Central)
+language vi-vn-x-central
+phonemes vi-hue
+dictrules 1
+
+words 1
+pitch 82 118   //80 118
+ voicing 90  //18
+ flutter  20
@@ -0,0 +1,9 @@
+name Vietnamese (Southern)
+language vi-vn-x-south
+phonemes vi-sgn
+dictrules 2
+
+words 1
+pitch 82 118   //80 118
+ voicing 90  //18
+ flutter  20
@@ -0,0 +1,4 @@
+name Esperanto
+language eo
+
+apostrophe 2
@@ -0,0 +1,2 @@
+name Interlingua
+language ia
@@ -0,0 +1,5 @@
+name Ido
+language io
+phonemes eo
+status testing
+ 
@@ -0,0 +1,4 @@
+name Lojban
+language jbo
+
+speed 80   // speed adjustment, percentage
@@ -0,0 +1,8 @@
+name Lingua Franca Nova
+language lfn
+
+phonemes base2
+l_unpronouncable 0
+numbers 2 3
+
+stressLength  150 140 180 180 0 0 200 200
@@ -0,0 +1,5 @@
+name Klingon
+language piqd
+status testing
+stressRule 3
+
@@ -0,0 +1,7 @@
+name Pyash
+language py
+maintainer Logan Streondj <logan@liberit.ca>
+status testing
+
+speed 80   // speed adjustment, percentage
+stressRule 0
@@ -0,0 +1,6 @@
+name Lang Belta
+language qdb
+
+numbers 4 3
+
+replace 1 t ?
@@ -0,0 +1,4 @@
+name Quenya
+language qya
+stressRule 2
+// rule=penultimate, with qya_rules for light penultimate syllables to move primary stress to the preceding (antepenultimate) syllable
@@ -0,0 +1,4 @@
+name Sindarin
+language sjn
+stressRule 2
+// rule=penultimate, with sjn_rules for light penultimate syllables to move primary stress to the preceding (antepenultimate) syllable
@@ -0,0 +1,10 @@
+name xextan-test
+language xex
+
+phonemes pt-br
+phonemes pt
+
+pitch 80 130
+
+dictrules 1
+tunes s7 c7 q7 e7
@@ -0,0 +1,6 @@
+name Nahuatl (Classical)
+language nci
+
+intonation 3
+stressRule 2
+stressLength  190  190  200  200  0  0  220  240
@@ -0,0 +1,2 @@
+name Lithuanian
+language lt
@@ -0,0 +1,12 @@
+name Latgalian
+language ltg
+maintainer Valdis Vitolins <valdis.vitolins@odo.lv>
+status testing
+phonemes lv
+dictionary lv
+dictrules 2   // Setting for Latgalian pronunciation
+words 0 2
+pitch 64 118
+tone 60 150 204 100 400 255 700 10 3000 255
+stressAmp 12 10 8 8 0 0 15 16
+stressLength 160 140 200 140 0 0 240 160
@@ -0,0 +1,9 @@
+name Latvian
+language lv
+maintainer Valdis Vitolins <valdis.vitolins@odo.lv>
+status mature
+words 0 2
+pitch 67 123
+tone 60 150 204 100 400 255 700 10 3000 255
+stressAmp 11 8 11 9 0 0 14 12
+stressLength 160 120 200 130 0 0 230 180
@@ -0,0 +1,4 @@
+name Swahili
+language sw
+
+status testing
@@ -0,0 +1,4 @@
+name Setswana
+language tn
+
+status testing
@@ -0,0 +1,3 @@
+name Georgian
+language ka
+lowercaseSentence	// A period followed by a lowercase letter is considered a sentence (mkhedruli)
@@ -0,0 +1,4 @@
+name Welsh
+language cy
+
+intonation 4
@@ -0,0 +1,4 @@
+name Gaelic (Irish)
+language ga
+
+dictrules 1  // fix for eclipsis
@@ -0,0 +1,4 @@
+name Gaelic (Scottish)
+language gd
+
+status testing
@@ -0,0 +1,4 @@
+name Oromo
+language om
+
+status testing
@@ -0,0 +1,5 @@
+name Kannada
+language kn
+
+intonation 2
+//consonants 80
@@ -0,0 +1,5 @@
+name Malayalam
+language ml
+
+intonation 2
+//consonants 80
@@ -0,0 +1,5 @@
+name Tamil
+language ta
+
+intonation 2
+consonants 80
@@ -0,0 +1,7 @@
+name Telugu
+language te
+
+status testing
+
+intonation 2
+//consonants 80
@@ -0,0 +1,3 @@
+name Greenlandic
+language kl
+
@@ -0,0 +1,5 @@
+name Basque
+language eu
+
+status testing
+stressRule 15
@@ -0,0 +1,4 @@
+name Danish
+language da
+
+tunes s2 c2 q2 e2
@@ -0,0 +1,4 @@
+name Faroese
+language fo
+maintainer iSolveIT ApS (Andras Eliassen) <andras@isolveit.net>
+status testing
@@ -0,0 +1,2 @@
+name Icelandic
+language is
@@ -0,0 +1,7 @@
+name Norwegian Bokmål
+language nb
+language no
+phonemes no
+dictionary no
+
+intonation 4
--- a/Show More
+++ b/Show More