switching to high quality piper tts and added label translations
This commit is contained in:
@@ -0,0 +1,230 @@
|
||||
"""Piper main script."""
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import shutil
|
||||
import sys
|
||||
import tempfile
|
||||
import time
|
||||
import wave
|
||||
from collections.abc import Iterable
|
||||
from pathlib import Path
|
||||
|
||||
from . import PiperVoice, SynthesisConfig
|
||||
from .audio_playback import AudioPlayer
|
||||
|
||||
_FILE = Path(__file__)
|
||||
_DIR = _FILE.parent
|
||||
_LOGGER = logging.getLogger(_FILE.stem)
|
||||
|
||||
|
||||
def main() -> None:
|
||||
"""Run piper text-to-speech engine."""
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("-m", "--model", required=True, help="Path to Onnx model file")
|
||||
parser.add_argument("-c", "--config", help="Path to model config file")
|
||||
parser.add_argument(
|
||||
"-i",
|
||||
"--input-file",
|
||||
"--input_file",
|
||||
action="append",
|
||||
help="Paths to input text files",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-f",
|
||||
"--output-file",
|
||||
"--output_file",
|
||||
help="Path to output WAV file (default: stdout)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-d",
|
||||
"--output-dir",
|
||||
"--output_dir",
|
||||
help="Path to output directory (default: cwd)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output-raw",
|
||||
"--output_raw",
|
||||
action="store_true",
|
||||
help="Stream raw audio to stdout",
|
||||
)
|
||||
#
|
||||
parser.add_argument("-s", "--speaker", type=int, help="Id of speaker (default: 0)")
|
||||
parser.add_argument(
|
||||
"--length-scale", "--length_scale", type=float, help="Phoneme length"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--noise-scale", "--noise_scale", type=float, help="Generator noise"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--noise-w-scale",
|
||||
"--noise_w_scale",
|
||||
"--noise-w",
|
||||
"--noise_w",
|
||||
type=float,
|
||||
help="Phoneme width noise",
|
||||
)
|
||||
#
|
||||
parser.add_argument("--cuda", action="store_true", help="Use GPU")
|
||||
#
|
||||
parser.add_argument(
|
||||
"--sentence-silence",
|
||||
"--sentence_silence",
|
||||
type=float,
|
||||
default=0.0,
|
||||
help="Seconds of silence after each sentence",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--volume", type=float, default=1.0, help="Volume multiplier (default: 1.0)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--no-normalize", action="store_true", help="Don't normalize audio"
|
||||
)
|
||||
#
|
||||
parser.add_argument(
|
||||
"--data-dir",
|
||||
"--data_dir",
|
||||
action="append",
|
||||
default=[str(Path.cwd())],
|
||||
help="Data directory to check for voice models (default: current directory)",
|
||||
)
|
||||
#
|
||||
parser.add_argument(
|
||||
"--debug", action="store_true", help="Print DEBUG messages to console"
|
||||
)
|
||||
args, unknown_args = parser.parse_known_args()
|
||||
logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
|
||||
_LOGGER.debug(args)
|
||||
|
||||
if args.input_file:
|
||||
# Input text from file(s)
|
||||
def lines() -> Iterable[str]:
|
||||
for input_path in args.input_file:
|
||||
_LOGGER.debug("Reading text from %s", input_path)
|
||||
with open(input_path, "r", encoding="utf-8") as input_file:
|
||||
for line in input_file:
|
||||
line = line.strip()
|
||||
if line:
|
||||
yield line
|
||||
|
||||
else:
|
||||
# Input text from args or stdin
|
||||
texts: Iterable[str]
|
||||
if unknown_args:
|
||||
texts = [" ".join(unknown_args)]
|
||||
else:
|
||||
texts = sys.stdin
|
||||
|
||||
def lines() -> Iterable[str]:
|
||||
for line in texts:
|
||||
line = line.strip()
|
||||
if line:
|
||||
yield line
|
||||
|
||||
model_path = Path(args.model)
|
||||
if not model_path.exists():
|
||||
# Look in data directories
|
||||
voice_name = args.model
|
||||
for data_dir in args.data_dir:
|
||||
maybe_model_path = Path(data_dir) / f"{voice_name}.onnx"
|
||||
_LOGGER.debug("Checking '%s'", maybe_model_path)
|
||||
if maybe_model_path.exists():
|
||||
model_path = maybe_model_path
|
||||
break
|
||||
|
||||
if not model_path.exists():
|
||||
raise ValueError(
|
||||
f"Unable to find voice: {model_path} (use piper.download_voices)"
|
||||
)
|
||||
|
||||
# Load voice
|
||||
_LOGGER.debug("Loading voice: '%s'", model_path)
|
||||
voice = PiperVoice.load(model_path, use_cuda=args.cuda)
|
||||
syn_config = SynthesisConfig(
|
||||
speaker_id=args.speaker,
|
||||
length_scale=args.length_scale,
|
||||
noise_scale=args.noise_scale,
|
||||
noise_w_scale=args.noise_w_scale,
|
||||
normalize_audio=(not args.no_normalize),
|
||||
volume=args.volume,
|
||||
)
|
||||
|
||||
wav_file: wave.Wave_write
|
||||
|
||||
# 16-bit samples for silence
|
||||
silence_int16_bytes = bytes(
|
||||
int(voice.config.sample_rate * args.sentence_silence * 2)
|
||||
)
|
||||
|
||||
def lines_to_wav() -> None:
|
||||
wav_params_set = False
|
||||
for line in lines():
|
||||
for i, audio_chunk in enumerate(voice.synthesize(line, syn_config)):
|
||||
if not wav_params_set:
|
||||
wav_file.setframerate(audio_chunk.sample_rate)
|
||||
wav_file.setsampwidth(audio_chunk.sample_width)
|
||||
wav_file.setnchannels(audio_chunk.sample_channels)
|
||||
wav_params_set = True
|
||||
|
||||
if i > 0:
|
||||
wav_file.writeframes(silence_int16_bytes)
|
||||
|
||||
wav_file.writeframes(audio_chunk.audio_int16_bytes)
|
||||
|
||||
if args.output_raw:
|
||||
# Write raw audio to stdout as its produced
|
||||
for line in lines():
|
||||
audio_stream = voice.synthesize(line, syn_config)
|
||||
for i, audio_chunk in enumerate(audio_stream):
|
||||
if i > 0:
|
||||
sys.stdout.buffer.write(silence_int16_bytes)
|
||||
|
||||
sys.stdout.buffer.write(audio_chunk.audio_int16_bytes)
|
||||
sys.stdout.buffer.flush()
|
||||
elif args.output_dir:
|
||||
# Write multiple WAV files to a directory, one per line
|
||||
output_dir = Path(args.output_dir)
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
for line in lines():
|
||||
wav_path = output_dir / f"{time.monotonic_ns()}.wav"
|
||||
wav_file = wave.open(str(wav_path), "wb")
|
||||
with wav_file:
|
||||
lines_to_wav()
|
||||
|
||||
_LOGGER.info("Wrote %s", wav_path)
|
||||
else:
|
||||
if args.output_file == "-":
|
||||
# Write WAV file to stdout
|
||||
with tempfile.NamedTemporaryFile("wb+", suffix=".wav") as temp_wav_file:
|
||||
wav_file = wave.open(temp_wav_file.name, "wb")
|
||||
lines_to_wav()
|
||||
|
||||
temp_wav_file.seek(0)
|
||||
shutil.copyfileobj(temp_wav_file, sys.stdout.buffer)
|
||||
elif (not args.output_file) and AudioPlayer.is_available():
|
||||
# Play audio using ffplay
|
||||
with AudioPlayer(voice.config.sample_rate) as player:
|
||||
for line in lines():
|
||||
for i, audio_chunk in enumerate(voice.synthesize(line, syn_config)):
|
||||
if i > 0:
|
||||
player.play(silence_int16_bytes)
|
||||
|
||||
player.play(audio_chunk.audio_int16_bytes)
|
||||
else:
|
||||
# Write to WAV file
|
||||
if not args.output_file:
|
||||
_LOGGER.warning(
|
||||
"Audio playback is not available (ffplay). Writing audio to output.wav."
|
||||
)
|
||||
args.output_file = "output.wav"
|
||||
|
||||
wav_file = wave.open(args.output_file, "wb")
|
||||
with wav_file:
|
||||
lines_to_wav()
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user