"""Piper main script.""" import argparse import logging import shutil import sys import tempfile import time import wave from collections.abc import Iterable from pathlib import Path from . import PiperVoice, SynthesisConfig from .audio_playback import AudioPlayer _FILE = Path(__file__) _DIR = _FILE.parent _LOGGER = logging.getLogger(_FILE.stem) def main() -> None: """Run piper text-to-speech engine.""" parser = argparse.ArgumentParser() parser.add_argument("-m", "--model", required=True, help="Path to Onnx model file") parser.add_argument("-c", "--config", help="Path to model config file") parser.add_argument( "-i", "--input-file", "--input_file", action="append", help="Paths to input text files", ) parser.add_argument( "-f", "--output-file", "--output_file", help="Path to output WAV file (default: stdout)", ) parser.add_argument( "-d", "--output-dir", "--output_dir", help="Path to output directory (default: cwd)", ) parser.add_argument( "--output-raw", "--output_raw", action="store_true", help="Stream raw audio to stdout", ) # parser.add_argument("-s", "--speaker", type=int, help="Id of speaker (default: 0)") parser.add_argument( "--length-scale", "--length_scale", type=float, help="Phoneme length" ) parser.add_argument( "--noise-scale", "--noise_scale", type=float, help="Generator noise" ) parser.add_argument( "--noise-w-scale", "--noise_w_scale", "--noise-w", "--noise_w", type=float, help="Phoneme width noise", ) # parser.add_argument("--cuda", action="store_true", help="Use GPU") # parser.add_argument( "--sentence-silence", "--sentence_silence", type=float, default=0.0, help="Seconds of silence after each sentence", ) parser.add_argument( "--volume", type=float, default=1.0, help="Volume multiplier (default: 1.0)" ) parser.add_argument( "--no-normalize", action="store_true", help="Don't normalize audio" ) # parser.add_argument( "--data-dir", "--data_dir", action="append", default=[str(Path.cwd())], help="Data directory to check for voice models (default: current directory)", ) # parser.add_argument( "--debug", action="store_true", help="Print DEBUG messages to console" ) args, unknown_args = parser.parse_known_args() logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO) _LOGGER.debug(args) if args.input_file: # Input text from file(s) def lines() -> Iterable[str]: for input_path in args.input_file: _LOGGER.debug("Reading text from %s", input_path) with open(input_path, "r", encoding="utf-8") as input_file: for line in input_file: line = line.strip() if line: yield line else: # Input text from args or stdin texts: Iterable[str] if unknown_args: texts = [" ".join(unknown_args)] else: texts = sys.stdin def lines() -> Iterable[str]: for line in texts: line = line.strip() if line: yield line model_path = Path(args.model) if not model_path.exists(): # Look in data directories voice_name = args.model for data_dir in args.data_dir: maybe_model_path = Path(data_dir) / f"{voice_name}.onnx" _LOGGER.debug("Checking '%s'", maybe_model_path) if maybe_model_path.exists(): model_path = maybe_model_path break if not model_path.exists(): raise ValueError( f"Unable to find voice: {model_path} (use piper.download_voices)" ) # Load voice _LOGGER.debug("Loading voice: '%s'", model_path) voice = PiperVoice.load(model_path, use_cuda=args.cuda) syn_config = SynthesisConfig( speaker_id=args.speaker, length_scale=args.length_scale, noise_scale=args.noise_scale, noise_w_scale=args.noise_w_scale, normalize_audio=(not args.no_normalize), volume=args.volume, ) wav_file: wave.Wave_write # 16-bit samples for silence silence_int16_bytes = bytes( int(voice.config.sample_rate * args.sentence_silence * 2) ) def lines_to_wav() -> None: wav_params_set = False for line in lines(): for i, audio_chunk in enumerate(voice.synthesize(line, syn_config)): if not wav_params_set: wav_file.setframerate(audio_chunk.sample_rate) wav_file.setsampwidth(audio_chunk.sample_width) wav_file.setnchannels(audio_chunk.sample_channels) wav_params_set = True if i > 0: wav_file.writeframes(silence_int16_bytes) wav_file.writeframes(audio_chunk.audio_int16_bytes) if args.output_raw: # Write raw audio to stdout as its produced for line in lines(): audio_stream = voice.synthesize(line, syn_config) for i, audio_chunk in enumerate(audio_stream): if i > 0: sys.stdout.buffer.write(silence_int16_bytes) sys.stdout.buffer.write(audio_chunk.audio_int16_bytes) sys.stdout.buffer.flush() elif args.output_dir: # Write multiple WAV files to a directory, one per line output_dir = Path(args.output_dir) output_dir.mkdir(parents=True, exist_ok=True) for line in lines(): wav_path = output_dir / f"{time.monotonic_ns()}.wav" wav_file = wave.open(str(wav_path), "wb") with wav_file: lines_to_wav() _LOGGER.info("Wrote %s", wav_path) else: if args.output_file == "-": # Write WAV file to stdout with tempfile.NamedTemporaryFile("wb+", suffix=".wav") as temp_wav_file: wav_file = wave.open(temp_wav_file.name, "wb") lines_to_wav() temp_wav_file.seek(0) shutil.copyfileobj(temp_wav_file, sys.stdout.buffer) elif (not args.output_file) and AudioPlayer.is_available(): # Play audio using ffplay with AudioPlayer(voice.config.sample_rate) as player: for line in lines(): for i, audio_chunk in enumerate(voice.synthesize(line, syn_config)): if i > 0: player.play(silence_int16_bytes) player.play(audio_chunk.audio_int16_bytes) else: # Write to WAV file if not args.output_file: _LOGGER.warning( "Audio playback is not available (ffplay). Writing audio to output.wav." ) args.output_file = "output.wav" wav_file = wave.open(args.output_file, "wb") with wav_file: lines_to_wav() # ----------------------------------------------------------------------------- if __name__ == "__main__": main()