"""OpenVoice text-to-speech and tone-colour conversion demo running on OpenVINO.

The script performs, in order:

1. Clone the OpenVoice repository (once) and patch its English text front-end
   to use ``anyascii`` instead of ``unidecode``.
2. Install the Python packages OpenVoice needs and import its API.
3. Download the checkpoints from the Hugging Face Hub and load the PyTorch
   models on CPU.
4. Convert the models to OpenVINO IR (with NNCF weight compression), caching
   the IR files on disk.
5. Replace the PyTorch ``infer`` / ``voice_conversion`` methods with
   OpenVINO-backed implementations.
6. Define a Gradio interface around the resulting pipeline.

Original notebook install line, kept for reference:
    %pip install -q "librosa>=0.8.1" "wavmark>=0.0.3" "faster-whisper>=0.9.0"
        "pydub>=0.25.1" "whisper-timestamped>=1.14.2" "tqdm" "inflect>=7.0.0"
        "eng_to_ipa>=0.0.2" "pypinyin>=0.50.0" "cn2an>=0.5.22" "jieba>=0.42.1"
        "langid>=1.1.6" "gradio>=4.15" "ipywebrtc" "anyascii" "openvino>=2023.3"
        "torch>=2.1" "nncf>=2.11.0"
"""
import os
import subprocess
import sys
import wave
from pathlib import Path
from typing import Callable

import gradio as gr
import ipywidgets as widgets
import langid
import nncf
import openvino as ov
import torch
from IPython.display import Audio, display  # noqa: F401  (Audio kept for notebook use)

# ---------------------------------------------------------------------------
# Clone the repo and set up the environment
# ---------------------------------------------------------------------------
repo_dir = Path("OpenVoice")

if not repo_dir.exists():
    subprocess.run(
        ["git", "clone", "https://github.com/myshell-ai/OpenVoice"],
        check=True,
    )
    # Patch only right after a fresh clone: re-running the rename on an
    # already patched tree would overwrite the pristine backup (and raises
    # FileExistsError on Windows).
    orig_english_path = Path("OpenVoice/openvoice/text/_orig_english.py")
    english_path = Path("OpenVoice/openvoice/text/english.py")
    english_path.rename(orig_english_path)
    data = orig_english_path.read_text(encoding="utf-8")
    english_path.write_text(data.replace("unidecode", "anyascii"), encoding="utf-8")

# Make the cloned ``openvoice`` package importable.
sys.path.append(str(repo_dir))

# ---------------------------------------------------------------------------
# Install the required packages (must happen before importing openvoice)
# ---------------------------------------------------------------------------
packages = [
    "librosa>=0.8.1",
    "wavmark>=0.0.3",
    "faster-whisper>=0.9.0",
    "pydub>=0.25.1",
    "whisper-timestamped>=1.14.2",
    "tqdm",
    "inflect>=7.0.0",
    "eng_to_ipa>=0.0.2",
    "pypinyin>=0.50.0",
    "ipywidgets",
    # Present in the original notebook install line; ``anyascii`` in
    # particular is required by the english.py patch applied above.
    "cn2an>=0.5.22",
    "jieba>=0.42.1",
    "ipywebrtc",
    "anyascii",
]
# Use the running interpreter's pip so packages land in the active environment.
subprocess.run([sys.executable, "-m", "pip", "install"] + packages, check=True)

from openvoice.api import BaseSpeakerTTS, ToneColorConverter, OpenVoiceBaseClass  # noqa: E402
import openvoice.se_extractor as se_extractor  # noqa: E402

core = ov.Core()

# ---------------------------------------------------------------------------
# Checkpoints
# ---------------------------------------------------------------------------
CKPT_BASE_PATH = "checkpoints"

en_suffix = f"{CKPT_BASE_PATH}/base_speakers/EN"
zh_suffix = f"{CKPT_BASE_PATH}/base_speakers/ZH"
converter_suffix = f"{CKPT_BASE_PATH}/converter"

enable_chinese_lang = False


def download_from_hf_hub(filename, local_dir="./"):
    """Download *filename* from the ``myshell-ai/OpenVoice`` hub repo.

    Args:
        filename: Path of the file inside the hub repository.
        local_dir: Directory the file is downloaded into; created if missing.
    """
    from huggingface_hub import hf_hub_download

    os.makedirs(local_dir, exist_ok=True)
    hf_hub_download(repo_id="myshell-ai/OpenVoice", filename=filename, local_dir=local_dir)


download_from_hf_hub(f"{converter_suffix}/checkpoint.pth")
download_from_hf_hub(f"{converter_suffix}/config.json")
download_from_hf_hub(f"{en_suffix}/checkpoint.pth")
download_from_hf_hub(f"{en_suffix}/config.json")

download_from_hf_hub(f"{en_suffix}/en_default_se.pth")
download_from_hf_hub(f"{en_suffix}/en_style_se.pth")

if enable_chinese_lang:
    download_from_hf_hub(f"{zh_suffix}/checkpoint.pth")
    download_from_hf_hub(f"{zh_suffix}/config.json")
    download_from_hf_hub(f"{zh_suffix}/zh_default_se.pth")

# ---------------------------------------------------------------------------
# PyTorch models
# ---------------------------------------------------------------------------
pt_device = "cpu"

en_base_speaker_tts = BaseSpeakerTTS(f"{en_suffix}/config.json", device=pt_device)
en_base_speaker_tts.load_ckpt(f"{en_suffix}/checkpoint.pth")

tone_color_converter = ToneColorConverter(f"{converter_suffix}/config.json", device=pt_device)
tone_color_converter.load_ckpt(f"{converter_suffix}/checkpoint.pth")

if enable_chinese_lang:
    zh_base_speaker_tts = BaseSpeakerTTS(f"{zh_suffix}/config.json", device=pt_device)
    zh_base_speaker_tts.load_ckpt(f"{zh_suffix}/checkpoint.pth")
else:
    zh_base_speaker_tts = None


class OVOpenVoiceBase(torch.nn.Module):
    """Wrap an OpenVoice model so it can be traced by ``ov.convert_model``."""

    def __init__(self, voice_model: OpenVoiceBaseClass):
        super().__init__()
        self.voice_model = voice_model
        # Conversion only needs inference; freeze every parameter.
        for par in voice_model.model.parameters():
            par.requires_grad = False


class OVOpenVoiceTTS(OVOpenVoiceBase):
    """Traceable wrapper whose ``forward`` is the TTS ``infer`` call."""

    def get_example_input(self):
        """Return a tuple of example tensors matching ``forward``'s arguments."""
        stn_tst = self.voice_model.get_text("this is original text", self.voice_model.hps, False)
        x_tst = stn_tst.unsqueeze(0)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
        speaker_id = torch.LongTensor([1])
        noise_scale = torch.tensor(0.667)
        length_scale = torch.tensor(1.0)
        noise_scale_w = torch.tensor(0.6)
        return (
            x_tst,
            x_tst_lengths,
            speaker_id,
            noise_scale,
            length_scale,
            noise_scale_w,
        )

    def forward(self, x, x_lengths, sid, noise_scale, length_scale, noise_scale_w):
        return self.voice_model.model.infer(x, x_lengths, sid, noise_scale, length_scale, noise_scale_w)


class OVOpenVoiceConverter(OVOpenVoiceBase):
    """Traceable wrapper whose ``forward`` is the ``voice_conversion`` call."""

    def get_example_input(self):
        """Return a tuple of example tensors matching ``forward``'s arguments."""
        y = torch.randn([1, 513, 238], dtype=torch.float32)
        y_lengths = torch.LongTensor([y.size(-1)])
        target_se = torch.randn(*(1, 256, 1))
        source_se = torch.randn(*(1, 256, 1))
        tau = torch.tensor(0.3)
        return (y, y_lengths, source_se, target_se, tau)

    def forward(self, y, y_lengths, sid_src, sid_tgt, tau):
        return self.voice_model.model.voice_conversion(y, y_lengths, sid_src, sid_tgt, tau)


# ---------------------------------------------------------------------------
# Conversion to OpenVINO IR (cached on disk)
# ---------------------------------------------------------------------------
IRS_PATH = "openvino_irs/"
EN_TTS_IR = f"{IRS_PATH}/openvoice_en_tts.xml"
ZH_TTS_IR = f"{IRS_PATH}/openvoice_zh_tts.xml"
VOICE_CONVERTER_IR = f"{IRS_PATH}/openvoice_tone_conversion.xml"

paths = [EN_TTS_IR, VOICE_CONVERTER_IR]
models = [
    OVOpenVoiceTTS(en_base_speaker_tts),
    OVOpenVoiceConverter(tone_color_converter),
]
if enable_chinese_lang:
    models.append(OVOpenVoiceTTS(zh_base_speaker_tts))
    paths.append(ZH_TTS_IR)

ov_models = []
for model, path in zip(models, paths):
    if not os.path.exists(path):
        ov_model = ov.convert_model(model, example_input=model.get_example_input())
        ov_model = nncf.compress_weights(ov_model)
        ov.save_model(ov_model, path)
    else:
        ov_model = core.read_model(path)
    ov_models.append(ov_model)

ov_en_tts, ov_voice_conversion = ov_models[:2]
# Always bind the name so later references cannot hit a NameError.
ov_zh_tts = ov_models[-1] if enable_chinese_lang else None

# ---------------------------------------------------------------------------
# Reference voice selection (notebook widgets)
# ---------------------------------------------------------------------------
REFERENCE_VOICES_PATH = f"{repo_dir}/resources/"
reference_speakers = [
    *[path for path in os.listdir(REFERENCE_VOICES_PATH) if os.path.splitext(path)[-1] == ".mp3"],
    "record_manually",
    "load_manually",
]

ref_speaker = widgets.Dropdown(
    options=reference_speakers,
    value=reference_speakers[0],
    description="reference voice from which tone color will be copied",
    disabled=False,
)
display(ref_speaker)

OUTPUT_DIR = "outputs/"
os.makedirs(OUTPUT_DIR, exist_ok=True)

ref_speaker_path = f"{REFERENCE_VOICES_PATH}/{ref_speaker.value}"
allowed_audio_types = ".mp4,.mp3,.wav,.wma,.aac,.m4a,.m4b,.webm"

if ref_speaker.value == "record_manually":
    ref_speaker_path = f"{OUTPUT_DIR}/custom_example_sample.webm"
    from ipywebrtc import AudioRecorder, CameraStream

    camera = CameraStream(constraints={"audio": True, "video": False})
    recorder = AudioRecorder(stream=camera, filename=ref_speaker_path, autosave=True)
    display(recorder)
elif ref_speaker.value == "load_manually":
    # NOTE(review): the uploaded file is never written to ``ref_speaker_path``
    # in this script; in a notebook that would happen in a later cell via
    # ``save_audio`` once the user has picked a file — confirm intended flow.
    upload_ref = widgets.FileUpload(
        accept=allowed_audio_types,
        multiple=False,
        description="Select audio with reference voice",
    )
    display(upload_ref)


def save_audio(voice_source: widgets.FileUpload, out_path: str):
    """Write the first file held by an upload widget to *out_path*.

    Args:
        voice_source: ``FileUpload`` widget holding at least one file.
        out_path: Destination path for the raw uploaded bytes.

    Raises:
        ValueError: If no file has been selected in the widget.
    """
    # Validate before opening so a failed call does not leave an empty file.
    if not voice_source.value:
        raise ValueError("Please select audio file")
    with open(out_path, "wb") as output_file:
        output_file.write(voice_source.value[0]["content"])


def _write_wav(path: str, samples, sample_rate: int) -> None:
    """Write a mono float waveform to *path* as 16-bit little-endian PCM WAV.

    Args:
        path: Destination file path.
        samples: 1-D NumPy array of floats; values are clipped to [-1, 1]
            (assumes the TTS output is already in that nominal range).
        sample_rate: Sampling rate in Hz.
    """
    pcm = (samples.clip(-1.0, 1.0) * 32767.0).astype("<i2")
    with wave.open(path, "wb") as wav_file:
        wav_file.setnchannels(1)
        wav_file.setsampwidth(2)
        wav_file.setframerate(int(sample_rate))
        wav_file.writeframes(pcm.tobytes())


# Speaker embeddings; loaded onto the same device as the PyTorch models.
en_source_default_se = torch.load(f"{en_suffix}/en_default_se.pth", map_location=pt_device)
en_source_style_se = torch.load(f"{en_suffix}/en_style_se.pth", map_location=pt_device)
zh_source_se = (
    torch.load(f"{zh_suffix}/zh_default_se.pth", map_location=pt_device) if enable_chinese_lang else None
)

target_se, audio_name = se_extractor.get_se(ref_speaker_path, tone_color_converter, target_dir=OUTPUT_DIR, vad=True)


# ---------------------------------------------------------------------------
# Swap PyTorch inference for OpenVINO inference
# ---------------------------------------------------------------------------
def get_patched_infer(ov_model: ov.Model, device: str) -> Callable:
    """Compile *ov_model* for *device* and return a drop-in ``infer`` function.

    The returned callable takes the same arguments as the wrapped TTS
    ``infer`` method and returns a 1-tuple holding the first model output as
    a ``torch.Tensor``.
    """
    compiled_model = core.compile_model(ov_model, device)

    def infer_impl(x, x_lengths, sid, noise_scale, length_scale, noise_scale_w):
        ov_output = compiled_model((x, x_lengths, sid, noise_scale, length_scale, noise_scale_w))
        return (torch.tensor(ov_output[0]),)

    return infer_impl


# Backward-compatible alias for the original (misspelled) name.
get_pathched_infer = get_patched_infer


def get_patched_voice_conversion(ov_model: ov.Model, device: str) -> Callable:
    """Compile *ov_model* for *device* and return a drop-in ``voice_conversion``.

    The returned callable takes the same arguments as the wrapped
    ``voice_conversion`` method and returns a 1-tuple holding the first model
    output as a ``torch.Tensor``.
    """
    compiled_model = core.compile_model(ov_model, device)

    def voice_conversion_impl(y, y_lengths, sid_src, sid_tgt, tau):
        ov_output = compiled_model((y, y_lengths, sid_src, sid_tgt, tau))
        return (torch.tensor(ov_output[0]),)

    return voice_conversion_impl


device = widgets.Dropdown(
    options=core.available_devices + ["AUTO"],
    value="AUTO",
    description="Device:",
    disabled=False,
)
display(device)

en_base_speaker_tts.model.infer = get_patched_infer(ov_en_tts, device.value)
tone_color_converter.model.voice_conversion = get_patched_voice_conversion(ov_voice_conversion, device.value)
if enable_chinese_lang:
    zh_base_speaker_tts.model.infer = get_patched_infer(ov_zh_tts, device.value)

supported_languages = ["zh", "en"]


# ---------------------------------------------------------------------------
# Prediction pipeline
# ---------------------------------------------------------------------------
def build_predict(
    output_dir,
    tone_color_converter,
    en_tts_model,
    zh_tts_model,
    en_source_default_se,
    en_source_style_se,
    zh_source_se,
    supported_languages,
):
    """Build the Gradio callback that synthesizes speech.

    Args:
        output_dir: Directory where intermediate and final WAV files go.
        tone_color_converter: Loaded ``ToneColorConverter``.
        en_tts_model: English ``BaseSpeakerTTS``.
        zh_tts_model: Chinese ``BaseSpeakerTTS`` or ``None`` when disabled.
        en_source_default_se: English default-style source embedding
            (accepted for interface compatibility; not used below).
        en_source_style_se: English source embedding used for conversion.
        zh_source_se: Chinese source embedding or ``None``.
        supported_languages: Language codes accepted from ``langid``.

    Returns:
        A ``predict`` function returning the path of the generated WAV file.
    """

    def _resolve_reference_path(reference_audio, speaker):
        """Return the path of the audio the target tone colour is taken from."""
        if reference_audio:
            # Gradio's ``type="filepath"`` audio component already hands over
            # a path; only upload widgets need to be written to disk first.
            if isinstance(reference_audio, (str, os.PathLike)):
                return str(reference_audio)
            ref_audio_path = f"{output_dir}/input_audio.wav"
            save_audio(reference_audio, ref_audio_path)
            return ref_audio_path
        if speaker == "record_manually":
            raise gr.Error("Manual recording is not implemented in this example.")
        if speaker == "load_manually":
            raise gr.Error("Loading a manual audio file is not implemented in this example.")
        return f"{REFERENCE_VOICES_PATH}/{speaker}"

    def predict(
        input_text,
        reference_audio,
        speaker,
        noise_scale=0.667,
        length_scale=1.0,
        noise_scale_w=0.8,
        tone_color=False,
    ):
        """Synthesize *input_text* and optionally apply tone-colour conversion.

        Args:
            input_text: Text to speak; its language is detected with ``langid``.
            reference_audio: Path (or upload widget) with the reference voice;
                only consulted when *tone_color* is true.
            speaker: Name of a bundled reference voice, used when no
                *reference_audio* is supplied.
            noise_scale, length_scale, noise_scale_w: Forwarded to the TTS
                ``infer`` call.
            tone_color: Whether to run tone-colour conversion on the output.

        Returns:
            Path of the generated WAV file.

        Raises:
            gr.Error: For empty text, unsupported/disabled languages, or an
                unusable reference voice selection.
        """
        if not input_text or not input_text.strip():
            raise gr.Error("Please enter some text to synthesize.")

        lang = langid.classify(input_text)[0]
        if lang not in supported_languages:
            # Raise instead of returning the message: the output component
            # expects a file path and would try to open the string.
            raise gr.Error(f"Unsupported language: {lang}")

        tts_model = en_tts_model if lang == "en" else zh_tts_model
        if tts_model is None:
            raise gr.Error(f"Language '{lang}' is not enabled in this example.")

        # Extracting the target embedding is expensive (VAD + encoder), so do
        # it only when conversion is actually requested.
        target_se = None
        if tone_color:
            ref_audio_path = _resolve_reference_path(reference_audio, speaker)
            target_se, _ = se_extractor.get_se(
                ref_audio_path, tone_color_converter, target_dir=output_dir, vad=True
            )

        # NOTE(review): OpenVoice's own ``tts()`` pre-processes the text
        # (sentence splitting, language markers) before ``get_text`` —
        # confirm raw text is acceptable for the configured cleaners.
        stn_tst = tts_model.get_text(input_text, tts_model.hps, False)
        x_tst = stn_tst.unsqueeze(0)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
        speaker_id = torch.LongTensor([1])

        with torch.no_grad():
            audio = tts_model.model.infer(
                x_tst,
                x_tst_lengths,
                speaker_id,
                torch.tensor(noise_scale),
                torch.tensor(length_scale),
                torch.tensor(noise_scale_w),
            )[0]

        waveform = audio.squeeze().cpu().numpy()
        sampling_rate = tts_model.hps.data.sampling_rate
        output_path = f"{output_dir}/output_audio.wav"

        if not tone_color:
            _write_wav(output_path, waveform, sampling_rate)
            return output_path

        # The converter network consumes a spectrogram (see the [1, 513, T]
        # example input used for export), not a raw waveform, so go through
        # ``ToneColorConverter.convert``, which computes the spectrogram and
        # then calls the (OpenVINO-patched) ``voice_conversion``.
        tts_path = f"{output_dir}/tts_audio.wav"
        _write_wav(tts_path, waveform, sampling_rate)
        # NOTE(review): the style embedding is used for English while
        # ``en_source_default_se`` goes unused — confirm which one matches
        # ``speaker_id`` 1.
        source_se = en_source_style_se if lang == "en" else zh_source_se
        tone_color_converter.convert(
            audio_src_path=tts_path,
            src_se=source_se,
            tgt_se=target_se,
            output_path=output_path,
        )
        return output_path

    return predict


# NOTE: rebinding OUTPUT_DIR is kept from the original script; from here on
# generated audio goes to "output_audio" rather than "outputs/".
OUTPUT_DIR = "output_audio"
os.makedirs(OUTPUT_DIR, exist_ok=True)

predict_fn = build_predict(
    OUTPUT_DIR,
    tone_color_converter,
    en_base_speaker_tts,
    zh_base_speaker_tts,
    en_source_default_se,
    en_source_style_se,
    zh_source_se,
    supported_languages,
)


def gradio_interface():
    """Build the Gradio UI around ``predict_fn`` and launch it."""
    input_text = gr.Textbox(lines=2, placeholder="Enter text here...")
    reference_audio = gr.Audio(type="filepath", label="Reference Audio")
    # Default to the first listed voice so enabling tone colour works without
    # further input ("record_manually" is not implemented in ``predict``).
    speaker = gr.Dropdown(choices=reference_speakers, value=reference_speakers[0], label="Select Speaker")
    noise_scale = gr.Slider(minimum=0.1, maximum=1.0, value=0.667, label="Noise Scale")
    length_scale = gr.Slider(minimum=0.1, maximum=2.0, value=1.0, label="Length Scale")
    noise_scale_w = gr.Slider(minimum=0.1, maximum=1.0, value=0.8, label="Noise Scale W")
    tone_color = gr.Checkbox(value=False, label="Enable Tone Color Conversion")

    gr.Interface(
        fn=predict_fn,
        inputs=[input_text, reference_audio, speaker, noise_scale, length_scale, noise_scale_w, tone_color],
        outputs=gr.Audio(type="filepath", label="Generated Audio"),
        title="Speech Generation and Tone Conversion",
        description="Generate speech and convert tone using the OpenVoice model.",
    ).launch()