switching to high quality piper tts and added label translations

This commit is contained in:
Matthias Hinrichs
2026-01-29 23:48:19 +01:00
commit d80c619df9
3934 changed files with 1451600 additions and 0 deletions
@@ -0,0 +1,12 @@
"""Piper text-to-speech engine."""
from .config import PhonemeType, PiperConfig, SynthesisConfig
from .voice import AudioChunk, PiperVoice
# Public API of the package: every name listed here is imported above from
# the .config and .voice submodules and re-exported at package level.
__all__ = [
    "AudioChunk",
    "PhonemeType",
    "PiperConfig",
    "PiperVoice",
    "SynthesisConfig",
]
@@ -0,0 +1,230 @@
"""Piper main script."""
import argparse
import logging
import shutil
import sys
import tempfile
import time
import wave
from collections.abc import Iterable
from pathlib import Path
from . import PiperVoice, SynthesisConfig
from .audio_playback import AudioPlayer
# Path of this module and of its containing directory.
_FILE = Path(__file__)
_DIR = _FILE.parent
# Logger named after this file's stem (file name without extension).
_LOGGER = logging.getLogger(_FILE.stem)
def main() -> None:
    """Run piper text-to-speech engine.

    Text is taken from ``--input-file`` paths, from any unrecognised
    command-line arguments (joined with spaces), or from stdin, one
    non-empty line at a time. The synthesized audio goes to exactly one
    destination, checked in this order:

    * ``--output-raw``: raw 16-bit samples streamed to stdout.
    * ``--output-dir``: one timestamp-named WAV file per input line.
    * ``--output-file -``: a single WAV file written to stdout.
    * no ``--output-file`` and ffplay available: live playback.
    * otherwise: a single WAV file (``output.wav`` when no name was given).

    Raises:
        ValueError: if the model cannot be found directly or in any of the
            ``--data-dir`` directories.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-m", "--model", required=True, help="Path to Onnx model file")
    parser.add_argument("-c", "--config", help="Path to model config file")
    parser.add_argument(
        "-i",
        "--input-file",
        "--input_file",
        action="append",
        help="Paths to input text files",
    )
    parser.add_argument(
        "-f",
        "--output-file",
        "--output_file",
        help="Path to output WAV file (default: stdout)",
    )
    parser.add_argument(
        "-d",
        "--output-dir",
        "--output_dir",
        help="Path to output directory (default: cwd)",
    )
    parser.add_argument(
        "--output-raw",
        "--output_raw",
        action="store_true",
        help="Stream raw audio to stdout",
    )
    #
    parser.add_argument("-s", "--speaker", type=int, help="Id of speaker (default: 0)")
    parser.add_argument(
        "--length-scale", "--length_scale", type=float, help="Phoneme length"
    )
    parser.add_argument(
        "--noise-scale", "--noise_scale", type=float, help="Generator noise"
    )
    parser.add_argument(
        "--noise-w-scale",
        "--noise_w_scale",
        "--noise-w",
        "--noise_w",
        type=float,
        help="Phoneme width noise",
    )
    #
    parser.add_argument("--cuda", action="store_true", help="Use GPU")
    #
    parser.add_argument(
        "--sentence-silence",
        "--sentence_silence",
        type=float,
        default=0.0,
        help="Seconds of silence after each sentence",
    )
    parser.add_argument(
        "--volume", type=float, default=1.0, help="Volume multiplier (default: 1.0)"
    )
    parser.add_argument(
        "--no-normalize", action="store_true", help="Don't normalize audio"
    )
    #
    # With action="append" the default list is extended rather than replaced,
    # so the current directory is always searched before user-supplied ones.
    parser.add_argument(
        "--data-dir",
        "--data_dir",
        action="append",
        default=[str(Path.cwd())],
        help="Data directory to check for voice models (default: current directory)",
    )
    #
    parser.add_argument(
        "--debug", action="store_true", help="Print DEBUG messages to console"
    )

    # Anything argparse does not recognise is treated as text to speak.
    args, unknown_args = parser.parse_known_args()
    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
    _LOGGER.debug(args)

    if args.input_file:
        # Input text from file(s)
        def lines() -> Iterable[str]:
            for input_path in args.input_file:
                _LOGGER.debug("Reading text from %s", input_path)
                with open(input_path, "r", encoding="utf-8") as input_file:
                    for line in input_file:
                        line = line.strip()
                        if line:
                            yield line

    else:
        # Input text from args or stdin
        texts: Iterable[str]
        if unknown_args:
            texts = [" ".join(unknown_args)]
        else:
            texts = sys.stdin

        def lines() -> Iterable[str]:
            for line in texts:
                line = line.strip()
                if line:
                    yield line

    model_path = Path(args.model)
    if not model_path.exists():
        # Look in data directories
        voice_name = args.model
        for data_dir in args.data_dir:
            maybe_model_path = Path(data_dir) / f"{voice_name}.onnx"
            _LOGGER.debug("Checking '%s'", maybe_model_path)
            if maybe_model_path.exists():
                model_path = maybe_model_path
                break

    if not model_path.exists():
        raise ValueError(
            f"Unable to find voice: {model_path} (use piper.download_voices)"
        )

    # Load voice
    _LOGGER.debug("Loading voice: '%s'", model_path)
    voice = PiperVoice.load(model_path, use_cuda=args.cuda)
    syn_config = SynthesisConfig(
        speaker_id=args.speaker,
        length_scale=args.length_scale,
        noise_scale=args.noise_scale,
        noise_w_scale=args.noise_w_scale,
        normalize_audio=(not args.no_normalize),
        volume=args.volume,
    )

    # 16-bit samples for silence. Round to whole frames *before* converting
    # to bytes: an odd byte count would shift every following 16-bit sample
    # by one byte and corrupt the rest of the stream.
    silence_frames = int(voice.config.sample_rate * args.sentence_silence)
    silence_int16_bytes = bytes(silence_frames * 2)

    def lines_to_wav(wav_file: wave.Wave_write, text_lines: Iterable[str]) -> None:
        """Synthesize ``text_lines`` and append the audio to ``wav_file``."""
        wav_params_set = False
        for line in text_lines:
            for i, audio_chunk in enumerate(voice.synthesize(line, syn_config)):
                if not wav_params_set:
                    # WAV parameters are taken from the first audio chunk.
                    wav_file.setframerate(audio_chunk.sample_rate)
                    wav_file.setsampwidth(audio_chunk.sample_width)
                    wav_file.setnchannels(audio_chunk.sample_channels)
                    wav_params_set = True

                if i > 0:
                    # Silence goes between consecutive chunks of one line.
                    wav_file.writeframes(silence_int16_bytes)

                wav_file.writeframes(audio_chunk.audio_int16_bytes)

    if args.output_raw:
        # Write raw audio to stdout as its produced
        for line in lines():
            audio_stream = voice.synthesize(line, syn_config)
            for i, audio_chunk in enumerate(audio_stream):
                if i > 0:
                    sys.stdout.buffer.write(silence_int16_bytes)

                sys.stdout.buffer.write(audio_chunk.audio_int16_bytes)
                sys.stdout.buffer.flush()
    elif args.output_dir:
        # Write multiple WAV files to a directory, one per line
        output_dir = Path(args.output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        for line in lines():
            wav_path = output_dir / f"{time.monotonic_ns()}.wav"
            with wave.open(str(wav_path), "wb") as wav_file:
                # Only the current line goes into this file; re-reading all
                # input here would duplicate text or drain stdin.
                lines_to_wav(wav_file, [line])

            _LOGGER.info("Wrote %s", wav_path)
    else:
        if args.output_file == "-":
            # Write WAV file to stdout
            with tempfile.NamedTemporaryFile("wb+", suffix=".wav") as temp_wav_file:
                # The writer must be closed before copying so the WAV header
                # is finalized and all frames are flushed. Passing the open
                # file object avoids reopening the temp file by name.
                with wave.open(temp_wav_file, "wb") as wav_file:
                    lines_to_wav(wav_file, lines())

                temp_wav_file.seek(0)
                shutil.copyfileobj(temp_wav_file, sys.stdout.buffer)
        elif (not args.output_file) and AudioPlayer.is_available():
            # Play audio using ffplay
            with AudioPlayer(voice.config.sample_rate) as player:
                for line in lines():
                    for i, audio_chunk in enumerate(voice.synthesize(line, syn_config)):
                        if i > 0:
                            player.play(silence_int16_bytes)

                        player.play(audio_chunk.audio_int16_bytes)
        else:
            # Write to WAV file
            if not args.output_file:
                _LOGGER.warning(
                    "Audio playback is not available (ffplay). Writing audio to output.wav."
                )
                args.output_file = "output.wav"

            with wave.open(args.output_file, "wb") as wav_file:
                lines_to_wav(wav_file, lines())
# -----------------------------------------------------------------------------
# Run the command-line interface when this module is executed as a script.
if __name__ == "__main__":
    main()
@@ -0,0 +1,58 @@
"""Audio playback using ffplay."""
import shutil
import subprocess
from typing import Optional
class AudioPlayer:
    """Plays raw audio using ffplay.

    Intended to be used as a context manager: entering starts an ``ffplay``
    subprocess that reads raw audio from its stdin, ``play`` feeds it bytes,
    and leaving the context shuts the subprocess down.
    """

    def __init__(self, sample_rate: int) -> None:
        """Initializes audio player.

        Args:
            sample_rate: Sample rate passed to ffplay via ``-ar``.
        """
        self.sample_rate = sample_rate
        self._proc: Optional[subprocess.Popen] = None

    def __enter__(self):
        """Starts ffplay subprocess and returns player."""
        # ffplay is told to expect headerless signed 16-bit little-endian
        # samples ("-f s16le"), one channel ("-ac 1"), on stdin ("-").
        self._proc = subprocess.Popen(
            [
                "ffplay",
                "-nodisp",
                "-autoexit",
                "-f",
                "s16le",
                "-ar",
                str(self.sample_rate),
                "-ac",
                "1",
                "-",
            ],
            stdin=subprocess.PIPE,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Stops ffplay subprocess.

        On a clean exit this blocks until ffplay has finished and exited by
        itself. When the ``with`` body raised, ffplay is terminated instead
        so the original exception is not delayed.
        """
        proc, self._proc = self._proc, None
        if proc is None:
            return

        try:
            if proc.stdin:
                # Closing stdin signals end of input to ffplay.
                proc.stdin.close()
        except OSError:
            # Flushing on close fails if ffplay already exited (broken
            # pipe); there is nothing left to deliver in that case.
            pass

        if exc_type is None:
            # Audio already handed to ffplay may take longer to play than
            # any fixed timeout, so wait for the process to exit on its own.
            proc.wait()
            return

        proc.terminate()
        try:
            proc.wait(timeout=5)
        except subprocess.TimeoutExpired:
            # Did not react to terminate(); force it and reap the process.
            proc.kill()
            proc.wait()

    def play(self, audio_bytes: bytes) -> None:
        """Plays raw audio using ffplay.

        Args:
            audio_bytes: Raw samples in the format ffplay was started with.

        Raises:
            RuntimeError: if called outside of the ``with`` block.
        """
        proc = self._proc
        if (proc is None) or (proc.stdin is None):
            raise RuntimeError("AudioPlayer must be entered before calling play()")

        proc.stdin.write(audio_bytes)
        proc.stdin.flush()

    @staticmethod
    def is_available() -> bool:
        """Returns true if ffplay is available."""
        return bool(shutil.which("ffplay"))
@@ -0,0 +1,117 @@
"""Piper configuration"""
from dataclasses import dataclass, field
from enum import Enum
from typing import Any, Mapping, Optional, Sequence
# Fallback inference settings: used as PiperConfig field defaults and by
# PiperConfig.from_dict when the "inference" section omits a value.
DEFAULT_NOISE_SCALE = 0.667
DEFAULT_LENGTH_SCALE = 1.0
DEFAULT_NOISE_W_SCALE = 0.8
class PhonemeType(str, Enum):
    """Phoneme type named in a voice configuration ("espeak" or "text").

    Members are also ``str`` instances, so they compare equal to their
    plain string values.
    """

    ESPEAK = "espeak"
    TEXT = "text"
@dataclass
class PiperConfig:
    """Settings of a Piper voice, convertible to and from a plain dict."""

    num_symbols: int
    """Number of phonemes."""

    num_speakers: int
    """Number of speakers."""

    sample_rate: int
    """Sample rate of output audio."""

    espeak_voice: str
    """Name of espeak-ng voice or alphabet."""

    phoneme_id_map: Mapping[str, Sequence[int]]
    """Phoneme -> [id,]."""

    phoneme_type: PhonemeType
    """espeak or text."""

    speaker_id_map: Mapping[str, int] = field(default_factory=dict)
    """Speaker -> id"""

    piper_version: Optional[str] = None

    # Inference settings
    length_scale: float = DEFAULT_LENGTH_SCALE
    noise_scale: float = DEFAULT_NOISE_SCALE
    noise_w_scale: float = DEFAULT_NOISE_W_SCALE

    @staticmethod
    def from_dict(config: dict[str, Any]) -> "PiperConfig":
        """Build a PiperConfig from a nested configuration dictionary.

        Required keys are ``num_symbols``, ``num_speakers``,
        ``audio.sample_rate``, ``espeak.voice`` and ``phoneme_id_map``; a
        missing one raises ``KeyError``. Everything else falls back to a
        default. Note that the ``noise_w_scale`` field is stored under the
        key ``noise_w`` of the ``inference`` section.
        """
        inference_settings = config.get("inference", {})

        # Values are gathered in the same order as the original keyword
        # arguments so that the first error raised for a bad dict is the same.
        symbol_count = config["num_symbols"]
        speaker_count = config["num_speakers"]
        rate = config["audio"]["sample_rate"]
        noise = inference_settings.get("noise_scale", DEFAULT_NOISE_SCALE)
        length = inference_settings.get("length_scale", DEFAULT_LENGTH_SCALE)
        noise_w = inference_settings.get("noise_w", DEFAULT_NOISE_W_SCALE)
        voice_name = config["espeak"]["voice"]
        id_map = config["phoneme_id_map"]
        kind = PhonemeType(config.get("phoneme_type", PhonemeType.ESPEAK))
        speakers = config.get("speaker_id_map", {})
        version = config.get("piper_version")

        return PiperConfig(
            num_symbols=symbol_count,
            num_speakers=speaker_count,
            sample_rate=rate,
            espeak_voice=voice_name,
            phoneme_id_map=id_map,
            phoneme_type=kind,
            speaker_id_map=speakers,
            piper_version=version,
            length_scale=length,
            noise_scale=noise,
            noise_w_scale=noise_w,
        )

    def to_dict(self) -> dict[str, Any]:
        """Return the nested dictionary form accepted by ``from_dict``.

        ``piper_version`` is only included when it is set to a non-empty
        value.
        """
        audio_section = {"sample_rate": self.sample_rate}
        espeak_section = {"voice": self.espeak_voice}
        inference_section = {
            "noise_scale": self.noise_scale,
            "length_scale": self.length_scale,
            "noise_w": self.noise_w_scale,
        }

        # Key order is kept stable because it shows up in serialized output.
        result = {
            "audio": audio_section,
            "espeak": espeak_section,
            "phoneme_type": self.phoneme_type.value,
            "num_symbols": self.num_symbols,
            "num_speakers": self.num_speakers,
            "inference": inference_section,
            "phoneme_id_map": self.phoneme_id_map,
            "speaker_id_map": self.speaker_id_map,
        }

        if self.piper_version:
            result["piper_version"] = self.piper_version

        return result
@dataclass
class SynthesisConfig:
    """Per-request synthesis options.

    The optional fields default to ``None``, i.e. no explicit value given.
    """

    speaker_id: Optional[int] = None
    """Speaker index; only meaningful for multi-speaker voices."""

    length_scale: Optional[float] = None
    """Scale applied to phoneme length (below 1 speeds up, above 1 slows down)."""

    noise_scale: Optional[float] = None
    """How much generator noise is added."""

    noise_w_scale: Optional[float] = None
    """How much phoneme width noise is added."""

    normalize_audio: bool = True
    """Whether audio samples are scaled to fit the full range."""

    volume: float = 1.0
    """Factor applied to audio samples (below 1 is quieter, above 1 is louder)."""
@@ -0,0 +1,5 @@
"""Constants"""
# Special single-character symbols.
# NOTE(review): presumably these are keys of a voice's phoneme id map, with
# "(0)" meaning that padding has id 0 — confirm against the phonemizer code.
PAD = "_"  # padding (0)
BOS = "^"  # beginning of sentence
EOS = "$"  # end of sentence
@@ -0,0 +1,138 @@
"""Command-line utility for downloading Piper voices."""
import argparse
import json
import logging
import re
import shutil
from pathlib import Path
from urllib.request import urlopen
# Download URL template for a single voice file. download_voice fills in the
# language/name/quality fields parsed from the voice name plus the file
# extension (".onnx" for the model, ".onnx.json" for its config).
URL_FORMAT = "https://huggingface.co/rhasspy/piper-voices/resolve/main/{lang_family}/{lang_code}/{voice_name}/{voice_quality}/{lang_code}-{voice_name}-{voice_quality}{extension}?download=true"
# JSON index whose top-level keys are printed by list_voices.
VOICES_JSON = (
    "https://huggingface.co/rhasspy/piper-voices/resolve/main/voices.json?download=true"
)
# Splits a voice name such as 'en_US-lessac-medium' into language family
# ("en"), region ("US"), voice name ("lessac") and quality ("medium").
VOICE_PATTERN = re.compile(
    r"^(?P<lang_family>[^-]+)_(?P<lang_region>[^-]+)-(?P<voice_name>[^-]+)-(?P<voice_quality>.+)$"
)
_LOGGER = logging.getLogger(__name__)
def main() -> None:
    """Command-line entry point for downloading Piper voices.

    Without any voice names the available voices are listed instead.
    Otherwise each named voice is downloaded into the chosen directory,
    which is created when missing.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "voice", nargs="*", help="Name of voice like 'en_US-lessac-medium'"
    )
    cli.add_argument(
        "--download-dir",
        "--download_dir",
        "--data-dir",
        "--data_dir",
        help="Directory to download voices into (default: current directory)",
    )
    cli.add_argument(
        "--force-redownload",
        "--force_redownload",
        action="store_true",
        help="Force redownloading of voice files even if they exist already",
    )
    cli.add_argument(
        "--debug", action="store_true", help="Print DEBUG logs to console"
    )
    options = cli.parse_args()

    log_level = logging.INFO
    if options.debug:
        log_level = logging.DEBUG
    logging.basicConfig(level=log_level)

    if not options.voice:
        # Nothing requested: show what could be downloaded.
        list_voices()
        return

    target_dir = Path(options.download_dir) if options.download_dir else Path.cwd()
    target_dir.mkdir(parents=True, exist_ok=True)

    for voice_name in options.voice:
        download_voice(
            voice_name, target_dir, force_redownload=options.force_redownload
        )
# -----------------------------------------------------------------------------
def list_voices() -> None:
    """Print the names of all available voices, sorted, one per line.

    The names are the top-level keys of the JSON document fetched from
    ``VOICES_JSON``.
    """
    _LOGGER.debug("Downloading voices.json file: '%s'", VOICES_JSON)

    with urlopen(VOICES_JSON) as index_response:
        voice_index = json.load(index_response)

    voice_names = sorted(voice_index.keys())
    for voice_name in voice_names:
        print(voice_name)
def download_voice(
    voice: str, download_dir: Path, force_redownload: bool = False
) -> None:
    """Download a voice model and config file to a directory.

    Args:
        voice: Voice name like 'en_US-lessac-medium'; surrounding whitespace
            is ignored.
        download_dir: Existing directory that receives ``<voice>.onnx`` and
            ``<voice>.onnx.json``.
        force_redownload: Download both files even when non-empty copies are
            already present.

    Raises:
        ValueError: if ``voice`` does not match ``VOICE_PATTERN``.

    Errors raised by ``urlopen`` (e.g. for an unknown voice) propagate to the
    caller.
    """
    voice = voice.strip()
    voice_match = VOICE_PATTERN.match(voice)
    if not voice_match:
        raise ValueError(
            f"Voice '{voice}' did not match pattern: <language>-<name>-<quality> like 'en_US-lessac-medium'",
        )

    lang_family = voice_match.group("lang_family")
    lang_code = lang_family + "_" + voice_match.group("lang_region")
    voice_name = voice_match.group("voice_name")
    voice_quality = voice_match.group("voice_quality")
    voice_code = f"{lang_code}-{voice_name}-{voice_quality}"
    format_args = {
        "lang_family": lang_family,
        "lang_code": lang_code,
        "voice_name": voice_name,
        "voice_quality": voice_quality,
    }

    model_path = download_dir / f"{voice_code}.onnx"
    if force_redownload or _needs_download(model_path):
        model_url = URL_FORMAT.format(extension=".onnx", **format_args)
        _LOGGER.debug("Downloading model from '%s' to '%s'", model_url, model_path)
        _download_file(model_url, model_path)
        _LOGGER.debug("Downloaded: '%s'", model_path)

    config_path = download_dir / f"{voice_code}.onnx.json"
    if force_redownload or _needs_download(config_path):
        config_url = URL_FORMAT.format(extension=".onnx.json", **format_args)
        _LOGGER.debug("Downloading config from '%s' to '%s'", config_url, config_path)
        _download_file(config_url, config_path)
        _LOGGER.debug("Downloaded: '%s'", config_path)

    _LOGGER.info("Downloaded: %s", voice)


def _download_file(url: str, path: Path) -> None:
    """Fetch ``url`` into ``path``, which only changes once the transfer is complete."""
    # Writing straight to the final path would leave a truncated, non-empty
    # file behind if the transfer is interrupted, and such a file would be
    # treated as already downloaded on the next run. Download next to the
    # target and rename into place only after everything has been written.
    partial_path = path.with_name(path.name + ".part")
    try:
        with urlopen(url) as response:
            with open(partial_path, "wb") as partial_file:
                shutil.copyfileobj(response, partial_file)

        partial_path.replace(path)
    finally:
        # After a successful rename this is a no-op; after a failure it
        # removes the incomplete download.
        partial_path.unlink(missing_ok=True)
def _needs_download(path: Path) -> bool:
    """Tell whether ``path`` still has to be downloaded.

    That is the case when the file is missing or is present but empty.
    """
    is_missing = not path.exists()
    return is_missing or path.stat().st_size == 0
# -----------------------------------------------------------------------------
# Run the downloader when this module is executed as a script.
if __name__ == "__main__":
    main()
@@ -0,0 +1,8 @@
name Vietnamese (Northern)
language vi
words 1 2
pitch 95 175
tone 100 225 800 100 2000 50 5400 75 8000 200
@@ -0,0 +1,9 @@
name Vietnamese (Central)
language vi-vn-x-central
phonemes vi-hue
dictrules 1
words 1
pitch 82 118 //80 118
voicing 90 //18
flutter 20
@@ -0,0 +1,9 @@
name Vietnamese (Southern)
language vi-vn-x-south
phonemes vi-sgn
dictrules 2
words 1
pitch 82 118 //80 118
voicing 90 //18
flutter 20
@@ -0,0 +1,4 @@
name Esperanto
language eo
apostrophe 2
@@ -0,0 +1,2 @@
name Interlingua
language ia
@@ -0,0 +1,5 @@
name Ido
language io
phonemes eo
status testing
@@ -0,0 +1,4 @@
name Lojban
language jbo
speed 80 // speed adjustment, percentage
@@ -0,0 +1,8 @@
name Lingua Franca Nova
language lfn
phonemes base2
l_unpronouncable 0
numbers 2 3
stressLength 150 140 180 180 0 0 200 200
@@ -0,0 +1,5 @@
name Klingon
language piqd
status testing
stressRule 3
@@ -0,0 +1,7 @@
name Pyash
language py
maintainer Logan Streondj <logan@liberit.ca>
status testing
speed 80 // speed adjustment, percentage
stressRule 0
@@ -0,0 +1,6 @@
name Lang Belta
language qdb
numbers 4 3
replace 1 t ?
@@ -0,0 +1,4 @@
name Quenya
language qya
stressRule 2
// rule=penultimate, with qya_rules for light penultimate syllables to move primary stress to the preceding (antepenultimate) syllable
@@ -0,0 +1,4 @@
name Sindarin
language sjn
stressRule 2
// rule=penultimate, with sjn_rules for light penultimate syllables to move primary stress to the preceding (antepenultimate) syllable
@@ -0,0 +1,10 @@
name xextan-test
language xex
phonemes pt-br
phonemes pt
pitch 80 130
dictrules 1
tunes s7 c7 q7 e7
@@ -0,0 +1,6 @@
name Nahuatl (Classical)
language nci
intonation 3
stressRule 2
stressLength 190 190 200 200 0 0 220 240
@@ -0,0 +1,2 @@
name Lithuanian
language lt
@@ -0,0 +1,12 @@
name Latgalian
language ltg
maintainer Valdis Vitolins <valdis.vitolins@odo.lv>
status testing
phonemes lv
dictionary lv
dictrules 2 // Setting for Latgalian pronunciation
words 0 2
pitch 64 118
tone 60 150 204 100 400 255 700 10 3000 255
stressAmp 12 10 8 8 0 0 15 16
stressLength 160 140 200 140 0 0 240 160
@@ -0,0 +1,9 @@
name Latvian
language lv
maintainer Valdis Vitolins <valdis.vitolins@odo.lv>
status mature
words 0 2
pitch 67 123
tone 60 150 204 100 400 255 700 10 3000 255
stressAmp 11 8 11 9 0 0 14 12
stressLength 160 120 200 130 0 0 230 180
@@ -0,0 +1,4 @@
name Swahili
language sw
status testing
@@ -0,0 +1,4 @@
name Setswana
language tn
status testing
@@ -0,0 +1,3 @@
name Georgian
language ka
lowercaseSentence // A period followed by a lowercase letter is considered a sentence (mkhedruli)
@@ -0,0 +1,4 @@
name Welsh
language cy
intonation 4
@@ -0,0 +1,4 @@
name Gaelic (Irish)
language ga
dictrules 1 // fix for eclipsis
@@ -0,0 +1,4 @@
name Gaelic (Scottish)
language gd
status testing
@@ -0,0 +1,4 @@
name Oromo
language om
status testing
@@ -0,0 +1,5 @@
name Kannada
language kn
intonation 2
//consonants 80
@@ -0,0 +1,5 @@
name Malayalam
language ml
intonation 2
//consonants 80
@@ -0,0 +1,5 @@
name Tamil
language ta
intonation 2
consonants 80
@@ -0,0 +1,7 @@
name Telugu
language te
status testing
intonation 2
//consonants 80
@@ -0,0 +1,3 @@
name Greenlandic
language kl
@@ -0,0 +1,5 @@
name Basque
language eu
status testing
stressRule 15
@@ -0,0 +1,4 @@
name Danish
language da
tunes s2 c2 q2 e2
@@ -0,0 +1,4 @@
name Faroese
language fo
maintainer iSolveIT ApS (Andras Eliassen) <andras@isolveit.net>
status testing
@@ -0,0 +1,2 @@
name Icelandic
language is
@@ -0,0 +1,7 @@
name Norwegian Bokmål
language nb
language no
phonemes no
dictionary no
intonation 4

Some files were not shown because too many files have changed in this diff Show More