switching to high quality piper tts and added label translations

2026-01-29 23:48:19 +01:00
commit d80c619df9
3934 changed files with 1451600 additions and 0 deletions
@@ -0,0 +1,290 @@
+"""Flask web server with HTTP API for Piper."""
+
+import argparse
+import io
+import json
+import logging
+import wave
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+from urllib.request import urlopen
+
+from flask import Flask, request
+
+from . import PiperVoice, SynthesisConfig
+from .download_voices import VOICES_JSON, download_voice
+
+_LOGGER = logging.getLogger()
+
+
+def main() -> None:
+    """Run HTTP server."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--host", default="0.0.0.0", help="HTTP server host")
+    parser.add_argument("--port", type=int, default=5000, help="HTTP server port")
+    #
+    parser.add_argument("-m", "--model", required=True, help="Path to Onnx model file")
+    #
+    parser.add_argument("-s", "--speaker", type=int, help="Id of speaker (default: 0)")
+    parser.add_argument(
+        "--length-scale", "--length_scale", type=float, help="Phoneme length"
+    )
+    parser.add_argument(
+        "--noise-scale", "--noise_scale", type=float, help="Generator noise"
+    )
+    parser.add_argument(
+        "--noise-w-scale",
+        "--noise_w_scale",
+        "--noise-w",
+        "--noise_w",
+        type=float,
+        help="Phoneme width noise",
+    )
+    #
+    parser.add_argument("--cuda", action="store_true", help="Use GPU")
+    #
+    parser.add_argument(
+        "--sentence-silence",
+        "--sentence_silence",
+        type=float,
+        default=0.0,
+        help="Seconds of silence after each sentence",
+    )
+    #
+    parser.add_argument(
+        "--data-dir",
+        "--data_dir",
+        action="append",
+        default=[str(Path.cwd())],
+        help="Data directory to check for downloaded models (default: current directory)",
+    )
+    parser.add_argument(
+        "--download-dir",
+        "--download_dir",
+        help="Path to download voices (default: first data dir)",
+    )
+    #
+    parser.add_argument(
+        "--debug", action="store_true", help="Print DEBUG messages to console"
+    )
+    args = parser.parse_args()
+    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
+    _LOGGER.debug(args)
+
+    if not args.download_dir:
+        # Download voices to first data directory if not specified
+        args.download_dir = args.data_dir[0]
+
+    download_dir = Path(args.download_dir)
+
+    # Download voice if file doesn't exist
+    model_path = Path(args.model)
+    if not model_path.exists():
+        # Look in data directories
+        voice_name = args.model
+        for data_dir in args.data_dir:
+            maybe_model_path = Path(data_dir) / f"{voice_name}.onnx"
+            _LOGGER.debug("Checking '%s'", maybe_model_path)
+            if maybe_model_path.exists():
+                model_path = maybe_model_path
+                break
+
+    if not model_path.exists():
+        raise ValueError(
+            f"Unable to find voice: {model_path} (use piper.download_voices)"
+        )
+
+    default_model_id = model_path.name.rstrip(".onnx")
+
+    # Load voice
+    default_voice = PiperVoice.load(model_path, use_cuda=args.cuda)
+    loaded_voices: Dict[str, PiperVoice] = {default_model_id: default_voice}
+
+    # Create web server
+    app = Flask(__name__)
+
+    @app.route("/voices", methods=["GET"])
+    def app_voices() -> Dict[str, Any]:
+        """List downloaded voices.
+
+        Outputs a JSON object with the format:
+        {
+          "<voice name>": { <voice config> },
+          ...
+        }
+
+        for each voice in your data directories.
+        """
+        voices_dict: Dict[str, Any] = {}
+        config_paths: List[Path] = [Path(f"{model_path}.json")]
+
+        for data_dir in args.data_dir:
+            for onnx_path in Path(data_dir).glob("*.onnx"):
+                config_path = Path(f"{onnx_path}.json")
+                if config_path.exists():
+                    config_paths.append(config_path)
+
+        for config_path in config_paths:
+            model_id = config_path.name.rstrip(".onnx.json")
+            if model_id in voices_dict:
+                continue
+
+            with open(config_path, "r", encoding="utf-8") as config_file:
+                voices_dict[model_id] = json.load(config_file)
+
+        return voices_dict
+
+    @app.route("/all-voices", methods=["GET"])
+    def app_all_voices() -> Dict[str, Any]:
+        """List all Piper voices.
+
+        Outputs voices.json from the piper-voices repo on HuggingFace.
+        See: https://huggingface.co/rhasspy/piper-voices
+        """
+        with urlopen(VOICES_JSON) as response:
+            return json.load(response)
+
+    @app.route("/download", methods=["POST"])
+    def app_download() -> str:
+        """Download a voice.
+
+        Downloads the .onnx and .onnx.json file from piper-voices repo on HuggingFace.
+        See: https://huggingface.co/rhasspy/piper-voices
+
+        Expects a JSON object with the format:
+        {
+          "voice": "<voice name>",   (required)
+          "force_redownload": false  (optional)
+        }
+
+        Returns the name of the voice.
+        Voice format must be <language>-<name>-<quality> like "en_US-lessac-medium".
+        """
+        data = json.loads(request.data)
+        model_id = data.get("voice")
+        if not model_id:
+            raise ValueError("voice is required")
+
+        force_redownload = data.get("force_redownload", False)
+        download_voice(model_id, download_dir, force_redownload=force_redownload)
+
+        return model_id
+
+    @app.route("/", methods=["POST"])
+    def app_synthesize() -> bytes:
+        """Synthesize audio from text.
+
+        Expects a JSON object with the format:
+        {
+          "text": "Text to speak.",      (required)
+          "voice": "<voice name>",       (optional)
+          "speaker": "<speaker name>",   (optional)
+          "speaker_id": "<speaker id>",  (optional, overrides speaker)
+          "length_scale": 1.0,           (optional)
+          "noise_scale": 0.667,          (optional)
+          "length_w_scale": 0.8          (optional)
+        }
+        """
+        data = json.loads(request.data)
+        text = data.get("text", "").strip()
+        if not text:
+            raise ValueError("No text provided")
+
+        _LOGGER.debug(data)
+
+        model_id = data.get("voice", default_model_id)
+        voice = loaded_voices.get(model_id)
+        if voice is None:
+            for data_dir in args.data_dir:
+                maybe_model_path = Path(data_dir) / f"{model_id}.onnx"
+                if maybe_model_path.exists():
+                    _LOGGER.debug("Loading voice %s", model_id)
+                    voice = PiperVoice.load(maybe_model_path, use_cuda=args.cuda)
+                    loaded_voices[model_id] = voice
+                    break
+
+        if voice is None:
+            _LOGGER.warning("Voice not found: %s. Using default voice.", model_id)
+            voice = default_voice
+
+        speaker_id: Optional[int] = data.get("speaker_id")
+        if (voice.config.num_speakers > 1) and (speaker_id is None):
+            speaker = data.get("speaker")
+            if speaker:
+                speaker_id = voice.config.speaker_id_map.get(speaker)
+
+            if speaker_id is None:
+                _LOGGER.warning(
+                    "Speaker not found: '%s' in %s",
+                    speaker,
+                    voice.config.speaker_id_map.keys(),
+                )
+                speaker_id = args.speaker or 0
+
+        if (speaker_id is not None) and (speaker_id > voice.config.num_speakers):
+            speaker_id = 0
+
+        syn_config = SynthesisConfig(
+            speaker_id=speaker_id,
+            length_scale=float(
+                data.get(
+                    "length_scale",
+                    (
+                        args.length_scale
+                        if args.length_scale is not None
+                        else voice.config.length_scale
+                    ),
+                )
+            ),
+            noise_scale=float(
+                data.get(
+                    "noise_scale",
+                    (
+                        args.noise_scale
+                        if args.noise_scale is not None
+                        else voice.config.noise_scale
+                    ),
+                )
+            ),
+            noise_w_scale=float(
+                data.get(
+                    "noise_w_scale",
+                    (
+                        args.noise_w_scale
+                        if args.noise_w_scale is not None
+                        else voice.config.noise_w_scale
+                    ),
+                )
+            ),
+        )
+
+        _LOGGER.debug("Synthesizing text: '%s' with config=%s", text, syn_config)
+        with io.BytesIO() as wav_io:
+            wav_file: wave.Wave_write = wave.open(wav_io, "wb")
+            with wav_file:
+                wav_params_set = False
+                for i, audio_chunk in enumerate(voice.synthesize(text, syn_config)):
+                    if not wav_params_set:
+                        wav_file.setframerate(audio_chunk.sample_rate)
+                        wav_file.setsampwidth(audio_chunk.sample_width)
+                        wav_file.setnchannels(audio_chunk.sample_channels)
+                        wav_params_set = True
+
+                    if i > 0:
+                        wav_file.writeframes(
+                            bytes(
+                                int(
+                                    voice.config.sample_rate * args.sentence_silence * 2
+                                )
+                            )
+                        )
+
+                    wav_file.writeframes(audio_chunk.audio_int16_bytes)
+
+            return wav_io.getvalue()
+
+    app.run(host=args.host, port=args.port)
+
+
+if __name__ == "__main__":
+    main()