"""Piper text-to-speech voice: an ONNX VITS model plus its JSON config."""

import json
import logging
import wave
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable, List, Optional, Union

import numpy as np
import onnxruntime
from piper_phonemize import phonemize_codepoints, phonemize_espeak, tashkeel_run

from .config import PhonemeType, PiperConfig
from .const import BOS, EOS, PAD
from .util import audio_float_to_int16

_LOGGER = logging.getLogger(__name__)


@dataclass
class PiperVoice:
    """A loaded Piper voice: an ONNX inference session and its configuration."""

    # ONNX runtime session for the voice model
    session: onnxruntime.InferenceSession
    # Voice parameters: sample rate, phoneme map, default scales, speaker count
    config: PiperConfig

    @staticmethod
    def load(
        model_path: Union[str, Path],
        config_path: Optional[Union[str, Path]] = None,
        use_cuda: bool = False,
    ) -> "PiperVoice":
        """Load an ONNX model and config.

        Args:
            model_path: Path to the ``.onnx`` voice model.
            config_path: Path to the JSON config; defaults to
                ``f"{model_path}.json"`` (config next to the model).
            use_cuda: Prefer the CUDA execution provider when True.

        Returns:
            A ready-to-use ``PiperVoice``.
        """
        if config_path is None:
            config_path = f"{model_path}.json"

        with open(config_path, "r", encoding="utf-8") as config_file:
            config_dict = json.load(config_file)

        # List CPU after CUDA so loading still succeeds when the CUDA
        # runtime is unavailable (onnxruntime falls back in order).
        providers = (
            ["CUDAExecutionProvider", "CPUExecutionProvider"]
            if use_cuda
            else ["CPUExecutionProvider"]
        )

        return PiperVoice(
            config=PiperConfig.from_dict(config_dict),
            session=onnxruntime.InferenceSession(
                str(model_path),
                sess_options=onnxruntime.SessionOptions(),
                providers=providers,
            ),
        )

    def phonemize(self, text: str) -> List[List[str]]:
        """Text to phonemes grouped by sentence.

        Raises:
            ValueError: If the config's phoneme type is unrecognized.
        """
        if self.config.phoneme_type == PhonemeType.ESPEAK:
            if self.config.espeak_voice == "ar":
                # Arabic diacritization
                # https://github.com/mush42/libtashkeel/
                text = tashkeel_run(text)

            return phonemize_espeak(text, self.config.espeak_voice)

        if self.config.phoneme_type == PhonemeType.TEXT:
            return phonemize_codepoints(text)

        raise ValueError(f"Unexpected phoneme type: {self.config.phoneme_type}")

    def phonemes_to_ids(self, phonemes: List[str]) -> List[int]:
        """Phonemes to ids, wrapped with BOS/EOS and PAD-interleaved.

        Phonemes missing from the id map are logged and skipped rather
        than raising, so one bad symbol does not abort synthesis.
        """
        id_map = self.config.phoneme_id_map
        ids: List[int] = list(id_map[BOS])

        for phoneme in phonemes:
            if phoneme not in id_map:
                _LOGGER.warning("Missing phoneme from id map: %s", phoneme)
                continue

            ids.extend(id_map[phoneme])
            ids.extend(id_map[PAD])

        ids.extend(id_map[EOS])

        return ids

    def synthesize(
        self,
        text: str,
        wav_file: wave.Wave_write,
        speaker_id: Optional[int] = None,
        length_scale: Optional[float] = None,
        noise_scale: Optional[float] = None,
        noise_w: Optional[float] = None,
        sentence_silence: float = 0.0,
    ):
        """Synthesize WAV audio from text into an open wave writer.

        Configures the writer for 16-bit mono at the voice's sample rate,
        then streams each sentence's frames as they are synthesized.
        """
        wav_file.setframerate(self.config.sample_rate)
        wav_file.setsampwidth(2)  # 16-bit
        wav_file.setnchannels(1)  # mono

        for audio_bytes in self.synthesize_stream_raw(
            text,
            speaker_id=speaker_id,
            length_scale=length_scale,
            noise_scale=noise_scale,
            noise_w=noise_w,
            sentence_silence=sentence_silence,
        ):
            wav_file.writeframes(audio_bytes)

    def synthesize_stream_raw(
        self,
        text: str,
        speaker_id: Optional[int] = None,
        length_scale: Optional[float] = None,
        noise_scale: Optional[float] = None,
        noise_w: Optional[float] = None,
        sentence_silence: float = 0.0,
    ) -> Iterable[bytes]:
        """Synthesize raw 16-bit mono audio per sentence from text.

        Args:
            sentence_silence: Seconds of silence appended after each sentence.

        Yields:
            Raw little-endian 16-bit PCM bytes, one chunk per sentence.
        """
        sentence_phonemes = self.phonemize(text)

        # 16-bit mono: 2 zero bytes per silence sample
        num_silence_samples = int(sentence_silence * self.config.sample_rate)
        silence_bytes = bytes(num_silence_samples * 2)

        for phonemes in sentence_phonemes:
            phoneme_ids = self.phonemes_to_ids(phonemes)
            yield self.synthesize_ids_to_raw(
                phoneme_ids,
                speaker_id=speaker_id,
                length_scale=length_scale,
                noise_scale=noise_scale,
                noise_w=noise_w,
            ) + silence_bytes

    def synthesize_ids_to_raw(
        self,
        phoneme_ids: List[int],
        speaker_id: Optional[int] = None,
        length_scale: Optional[float] = None,
        noise_scale: Optional[float] = None,
        noise_w: Optional[float] = None,
    ) -> bytes:
        """Synthesize raw audio from phoneme ids.

        Scale arguments left as None fall back to the config defaults.
        """
        if length_scale is None:
            length_scale = self.config.length_scale

        if noise_scale is None:
            noise_scale = self.config.noise_scale

        if noise_w is None:
            noise_w = self.config.noise_w

        # Model expects a batch dimension: shape (1, num_ids)
        phoneme_ids_array = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
        phoneme_ids_lengths = np.array([phoneme_ids_array.shape[1]], dtype=np.int64)
        scales = np.array(
            [noise_scale, length_scale, noise_w],
            dtype=np.float32,
        )

        if (self.config.num_speakers > 1) and (speaker_id is None):
            # Default speaker
            speaker_id = 0

        sid = None

        if speaker_id is not None:
            sid = np.array([speaker_id], dtype=np.int64)

        # Synthesize through Onnx; output has singleton batch/channel axes,
        # dropped by squeeze((0, 1)). (A second squeeze() here was a no-op
        # and has been removed.)
        audio = self.session.run(
            None,
            {
                "input": phoneme_ids_array,
                "input_lengths": phoneme_ids_lengths,
                "scales": scales,
                "sid": sid,
            },
        )[0].squeeze((0, 1))
        audio = audio_float_to_int16(audio)
        return audio.tobytes()