import re
from io import BytesIO
from pathlib import Path
from typing import List, Literal, Optional

import librosa
import numpy as np
from scipy.io import wavfile

from .encoder.inference import Encoder, preprocess_wav
from .synthesizer.inference import Synthesizer
from .vocoder.hifigan.inference import HifiGanVocoder
from .vocoder.wavernn.inference import WaveRNNVocoder
from .log import logger

# Characters that terminate a text segment. A newline is also a separator.
_PUNCTUATION = "!,。、,?!,"
# Compiled once: one or more consecutive separators count as a single split.
_SEGMENT_SPLIT_RE = re.compile("[{}\n]+".format(re.escape(_PUNCTUATION)))


def process_text(text: str) -> List[str]:
    """Split ``text`` into cleaned, non-empty segments.

    The text is cut at every run of punctuation characters (or newlines).
    Each resulting piece is stripped of surrounding whitespace, and pieces
    that are empty after stripping are dropped.

    Args:
        text: Raw input text.

    Returns:
        The stripped, non-empty segments in their original order. The list
        is empty when ``text`` contains nothing but separators/whitespace.
    """
    # Strip *before* filtering so whitespace-only pieces (e.g. the " " in
    # "a, ,b") do not end up as empty strings in the result.
    stripped = (piece.strip() for piece in _SEGMENT_SPLIT_RE.split(text))
    return [piece for piece in stripped if piece]


class MockingBird:
    """Holds the encoder, synthesizer and vocoder models and runs them in
    sequence to turn text plus a reference recording into WAV audio."""

    def __init__(self):
        # All models start unloaded; see load_model() and set_synthesizer().
        self.encoder: Optional[Encoder] = None
        self.gan_vocoder: Optional[HifiGanVocoder] = None
        self.rnn_vocoder: Optional[WaveRNNVocoder] = None
        self.synthesizer: Optional[Synthesizer] = None

    def load_model(
        self,
        encoder_path: Path,
        gan_vocoder_path: Optional[Path] = None,
        rnn_vocoder_path: Optional[Path] = None,
    ):
        """
        Load the Encoder model and, optionally, the Vocoder models.

        Args:
            encoder_path (Path): Path to the Encoder model.
            gan_vocoder_path (Path): Path to the HifiGan Vocoder model.
                Optional, but required if the HifiGan vocoder type is used.
            rnn_vocoder_path (Path): Path to the WaveRNN Vocoder model.
                Optional, but required if the WaveRNN vocoder type is used.
        """
        self.encoder = Encoder(encoder_path)
        # A vocoder is only (re)loaded when its path is given; otherwise the
        # previously loaded one, if any, is left untouched.
        if gan_vocoder_path:
            self.gan_vocoder = HifiGanVocoder(gan_vocoder_path)
        if rnn_vocoder_path:
            self.rnn_vocoder = WaveRNNVocoder(rnn_vocoder_path)

    def set_synthesizer(self, synthesizer_path: Path):
        """
        Load the Synthesizer model.

        Args:
            synthesizer_path (Path): Path to the Synthesizer model.
        """
        self.synthesizer = Synthesizer(synthesizer_path)
        logger.info(f"using synthesizer model: {synthesizer_path}")

    def synthesize(
        self,
        text: str,
        input_wav: Path,
        vocoder_type: Literal["HifiGan", "WaveRNN"] = "HifiGan",
        style_idx: int = 0,
        min_stop_token: int = 5,
        steps: int = 1000,
    ) -> BytesIO:
        """
        Generate speech.

        Args:
            text (str): Target text.
            input_wav (Path): Path to the target (reference) recording.
            vocoder_type (HifiGan / WaveRNN): Vocoder model to use,
                HifiGan by default. Any value other than "WaveRNN" selects
                the HifiGan vocoder.
            style_idx (int, optional): Style, range -1~9, default 0.
            min_stop_token (int, optional): Accuracy, range 3~9, default 5.
            steps (int, optional): MaxLength (maximum sentence length),
                range 200~2000, default 1000.

        Returns:
            BytesIO: In-memory WAV data (float32 samples), positioned at the
            start of the buffer.

        Raises:
            Exception: If the encoder, the synthesizer, or the requested
                vocoder has not been loaded yet.
            ValueError: If ``text`` contains no segment to synthesize.
        """
        if not self.encoder:
            raise Exception("Please set encoder path first")
        if not self.synthesizer:
            raise Exception("Please set synthesizer path first")

        # Resolve the vocoder before doing any model work, so a missing
        # vocoder is reported immediately instead of after the (expensive)
        # embedding and spectrogram synthesis.
        if vocoder_type == "WaveRNN":
            vocoder = self.rnn_vocoder
            if not vocoder:
                raise Exception("Please set wavernn vocoder path first")
        else:
            vocoder = self.gan_vocoder
            if not vocoder:
                raise Exception("Please set hifigan vocoder path first")

        # Split the input text; refuse empty input explicitly rather than
        # letting np.concatenate fail later on an empty spectrogram list.
        texts = process_text(text)
        if not texts:
            raise ValueError("text contains no content to synthesize")

        # Load the reference recording and compute its speaker embedding.
        wav, sample_rate = librosa.load(input_wav)
        encoder_wav = preprocess_wav(wav, sample_rate)
        embed, _, _ = self.encoder.embed_utterance(encoder_wav, return_partials=True)

        # One spectrogram per text segment, all using the same embedding.
        embeds = [embed] * len(texts)
        specs = self.synthesizer.synthesize_spectrograms(
            texts,
            embeds,
            style_idx=style_idx,
            min_stop_token=min_stop_token,
            steps=steps,
        )
        # Join the per-segment spectrograms along axis 1 into a single one.
        spec = np.concatenate(specs, axis=1)

        # The vocoder returns both the waveform and its sample rate; the
        # latter replaces the sample rate of the input recording.
        wav, sample_rate = vocoder.infer_waveform(spec)

        # Return the generated audio as an in-memory WAV file.
        out = BytesIO()
        wavfile.write(out, sample_rate, wav.astype(np.float32))
        # Make sure callers can read the buffer from the beginning.
        out.seek(0)
        return out