switching to high quality piper tts and added label translations

2026-01-29 23:48:19 +01:00
commit d80c619df9
3934 changed files with 1451600 additions and 0 deletions
@@ -0,0 +1,230 @@
+"""Piper main script."""
+
+import argparse
+import logging
+import shutil
+import sys
+import tempfile
+import time
+import wave
+from collections.abc import Iterable
+from pathlib import Path
+
+from . import PiperVoice, SynthesisConfig
+from .audio_playback import AudioPlayer
+
+_FILE = Path(__file__)
+_DIR = _FILE.parent
+_LOGGER = logging.getLogger(_FILE.stem)
+
+
+def main() -> None:
+    """Run piper text-to-speech engine."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-m", "--model", required=True, help="Path to Onnx model file")
+    parser.add_argument("-c", "--config", help="Path to model config file")
+    parser.add_argument(
+        "-i",
+        "--input-file",
+        "--input_file",
+        action="append",
+        help="Paths to input text files",
+    )
+    parser.add_argument(
+        "-f",
+        "--output-file",
+        "--output_file",
+        help="Path to output WAV file (default: stdout)",
+    )
+    parser.add_argument(
+        "-d",
+        "--output-dir",
+        "--output_dir",
+        help="Path to output directory (default: cwd)",
+    )
+    parser.add_argument(
+        "--output-raw",
+        "--output_raw",
+        action="store_true",
+        help="Stream raw audio to stdout",
+    )
+    #
+    parser.add_argument("-s", "--speaker", type=int, help="Id of speaker (default: 0)")
+    parser.add_argument(
+        "--length-scale", "--length_scale", type=float, help="Phoneme length"
+    )
+    parser.add_argument(
+        "--noise-scale", "--noise_scale", type=float, help="Generator noise"
+    )
+    parser.add_argument(
+        "--noise-w-scale",
+        "--noise_w_scale",
+        "--noise-w",
+        "--noise_w",
+        type=float,
+        help="Phoneme width noise",
+    )
+    #
+    parser.add_argument("--cuda", action="store_true", help="Use GPU")
+    #
+    parser.add_argument(
+        "--sentence-silence",
+        "--sentence_silence",
+        type=float,
+        default=0.0,
+        help="Seconds of silence after each sentence",
+    )
+    parser.add_argument(
+        "--volume", type=float, default=1.0, help="Volume multiplier (default: 1.0)"
+    )
+    parser.add_argument(
+        "--no-normalize", action="store_true", help="Don't normalize audio"
+    )
+    #
+    parser.add_argument(
+        "--data-dir",
+        "--data_dir",
+        action="append",
+        default=[str(Path.cwd())],
+        help="Data directory to check for voice models (default: current directory)",
+    )
+    #
+    parser.add_argument(
+        "--debug", action="store_true", help="Print DEBUG messages to console"
+    )
+    args, unknown_args = parser.parse_known_args()
+    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
+    _LOGGER.debug(args)
+
+    if args.input_file:
+        # Input text from file(s)
+        def lines() -> Iterable[str]:
+            for input_path in args.input_file:
+                _LOGGER.debug("Reading text from %s", input_path)
+                with open(input_path, "r", encoding="utf-8") as input_file:
+                    for line in input_file:
+                        line = line.strip()
+                        if line:
+                            yield line
+
+    else:
+        # Input text from args or stdin
+        texts: Iterable[str]
+        if unknown_args:
+            texts = [" ".join(unknown_args)]
+        else:
+            texts = sys.stdin
+
+        def lines() -> Iterable[str]:
+            for line in texts:
+                line = line.strip()
+                if line:
+                    yield line
+
+    model_path = Path(args.model)
+    if not model_path.exists():
+        # Look in data directories
+        voice_name = args.model
+        for data_dir in args.data_dir:
+            maybe_model_path = Path(data_dir) / f"{voice_name}.onnx"
+            _LOGGER.debug("Checking '%s'", maybe_model_path)
+            if maybe_model_path.exists():
+                model_path = maybe_model_path
+                break
+
+    if not model_path.exists():
+        raise ValueError(
+            f"Unable to find voice: {model_path} (use piper.download_voices)"
+        )
+
+    # Load voice
+    _LOGGER.debug("Loading voice: '%s'", model_path)
+    voice = PiperVoice.load(model_path, use_cuda=args.cuda)
+    syn_config = SynthesisConfig(
+        speaker_id=args.speaker,
+        length_scale=args.length_scale,
+        noise_scale=args.noise_scale,
+        noise_w_scale=args.noise_w_scale,
+        normalize_audio=(not args.no_normalize),
+        volume=args.volume,
+    )
+
+    wav_file: wave.Wave_write
+
+    # 16-bit samples for silence
+    silence_int16_bytes = bytes(
+        int(voice.config.sample_rate * args.sentence_silence * 2)
+    )
+
+    def lines_to_wav() -> None:
+        wav_params_set = False
+        for line in lines():
+            for i, audio_chunk in enumerate(voice.synthesize(line, syn_config)):
+                if not wav_params_set:
+                    wav_file.setframerate(audio_chunk.sample_rate)
+                    wav_file.setsampwidth(audio_chunk.sample_width)
+                    wav_file.setnchannels(audio_chunk.sample_channels)
+                    wav_params_set = True
+
+                if i > 0:
+                    wav_file.writeframes(silence_int16_bytes)
+
+                wav_file.writeframes(audio_chunk.audio_int16_bytes)
+
+    if args.output_raw:
+        # Write raw audio to stdout as its produced
+        for line in lines():
+            audio_stream = voice.synthesize(line, syn_config)
+            for i, audio_chunk in enumerate(audio_stream):
+                if i > 0:
+                    sys.stdout.buffer.write(silence_int16_bytes)
+
+                sys.stdout.buffer.write(audio_chunk.audio_int16_bytes)
+                sys.stdout.buffer.flush()
+    elif args.output_dir:
+        # Write multiple WAV files to a directory, one per line
+        output_dir = Path(args.output_dir)
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+        for line in lines():
+            wav_path = output_dir / f"{time.monotonic_ns()}.wav"
+            wav_file = wave.open(str(wav_path), "wb")
+            with wav_file:
+                lines_to_wav()
+
+            _LOGGER.info("Wrote %s", wav_path)
+    else:
+        if args.output_file == "-":
+            # Write WAV file to stdout
+            with tempfile.NamedTemporaryFile("wb+", suffix=".wav") as temp_wav_file:
+                wav_file = wave.open(temp_wav_file.name, "wb")
+                lines_to_wav()
+
+                temp_wav_file.seek(0)
+                shutil.copyfileobj(temp_wav_file, sys.stdout.buffer)
+        elif (not args.output_file) and AudioPlayer.is_available():
+            # Play audio using ffplay
+            with AudioPlayer(voice.config.sample_rate) as player:
+                for line in lines():
+                    for i, audio_chunk in enumerate(voice.synthesize(line, syn_config)):
+                        if i > 0:
+                            player.play(silence_int16_bytes)
+
+                        player.play(audio_chunk.audio_int16_bytes)
+        else:
+            # Write to WAV file
+            if not args.output_file:
+                _LOGGER.warning(
+                    "Audio playback is not available (ffplay). Writing audio to output.wav."
+                )
+                args.output_file = "output.wav"
+
+            wav_file = wave.open(args.output_file, "wb")
+            with wav_file:
+                lines_to_wav()
+
+
+# -----------------------------------------------------------------------------
+
+if __name__ == "__main__":
+    main()