"""Streamlit demo front end for ZeroVOX text-to-speech synthesis.

The page lets the user pick a language, a reference voice sample and a MEL
decoder, type some text, and listen to the synthesized result.  The loaded
model, the last synthesized waveform and the status message are kept in
``st.session_state`` so they survive Streamlit's script reruns.
"""

import time

import streamlit as st

from zerovox.tts.synthesize import ZeroVoxTTS
from zerovox.g2p.g2p import DEFAULT_G2P_MODEL_NAME_DE, DEFAULT_G2P_MODEL_NAME_EN

# Rate used for the reference-voice preview, and the playback fallback when no
# synthesis run has recorded the model's own sampling rate yet.
SAMPLE_RATE = 24000  # FIXME: still hard-coded for the voice-sample preview
DEFAULT_SPEAKER = 'en_speaker_00061.wav'

# Example sentence shown in the text box for each selectable language.
DEFAULT_TEXTS = {
    "en": "Welcome to the world of speech synthesis!",
    "de": "Willkommen in der Welt der Sprachsynthese!",
}

# Seed the session state on the first run of the script.
if "text" not in st.session_state:
    st.session_state.text = DEFAULT_TEXTS["en"]

if "message" not in st.session_state:
    st.session_state.message = "READY."

if "autoplay" not in st.session_state:
    st.session_state.autoplay = False


def update_text_input() -> None:
    """Replace the text box content with the example for the chosen language.

    Used as the ``on_change`` callback of the language selectbox.  Languages
    without an entry in ``DEFAULT_TEXTS`` leave the current text untouched.
    """
    example = DEFAULT_TEXTS.get(st.session_state['lang'])
    if example is not None:
        st.session_state.text = example


def do_synth() -> None:
    """Synthesize the current text and store the result in the session state.

    Used as the callback of the text input and of the "Synthesize!" button.
    Widget values that have a key (``lang``, ``meldec``, ``text``) are read
    from ``st.session_state``; the module-level ``status`` element and
    ``speakerref`` selection are read as plain globals.

    Side effects on ``st.session_state``:
        synth, modelcfg  -- cached model, (re)loaded when missing or when the
                            selected MEL decoder changed
        wav              -- the synthesized waveform
        wav_sample_rate  -- sampling rate from the model config, for playback
        message          -- timing summary shown in the status element
        autoplay         -- set to True so the new audio starts playing
    """
    lang = st.session_state['lang']
    meldec_model = st.session_state['meldec']

    synth = st.session_state.get('synth')
    if synth is not None:
        if synth.meldec_model != meldec_model:
            synth = None  # decoder changed: trigger a full reload below
        elif synth.language != lang:
            status.update(label=f"loading the lexicon for {lang} ...", state="running")
            synth.language = lang

    if synth is None:
        status.update(label="loading the model...", state="running")
        g2p_model = DEFAULT_G2P_MODEL_NAME_DE if lang == 'de' else DEFAULT_G2P_MODEL_NAME_EN
        st.session_state.modelcfg, st.session_state.synth = ZeroVoxTTS.load_model(
            ZeroVoxTTS.get_default_model(),
            g2p=g2p_model,
            lang=lang,
            meldec_model=meldec_model,
            infer_device='cpu',
            num_threads=-1,
            do_compile=False,
            verbose=False,
        )
        synth = st.session_state.synth

    modelcfg = st.session_state.modelcfg
    sampling_rate = modelcfg['audio']['sampling_rate']

    status.update(label="computing speaker embedding...", state="running")
    spkemb = synth.speaker_embed(ZeroVoxTTS.get_speakerref(speakerref, sampling_rate))

    status.update(label="synthesizing...", state="running")

    # perf_counter is monotonic, so the measured duration cannot go negative
    # when the wall clock is adjusted.
    started = time.perf_counter()
    wav, _phoneme, _length = synth.tts(st.session_state.text, spkemb)
    elapsed_time = time.perf_counter() - started

    wav_len = wav.shape[0] / sampling_rate
    # Guard the division in case the timer reports no measurable duration.
    real_time_factor = wav_len / elapsed_time if elapsed_time > 0 else float("inf")

    st.session_state.message = (
        f"synth time: {elapsed_time:.2f} sec"
        f", voice length: {wav_len:.2f} sec"
        f", rtf: {real_time_factor:.2f}"
    )

    st.session_state.wav = wav
    # Remember the rate the audio was produced at so that playback does not
    # depend on the hard-coded SAMPLE_RATE constant.
    st.session_state.wav_sample_rate = sampling_rate
    st.session_state.autoplay = True


st.markdown(
    "# ZeroVOX TTS Demo\n"
    "\n"
    "ZeroVOX is a zero-shot realtime TTS system, fully offline, free and open source.\n"
    "\n"
    "For more information, check out\n"
    "[https://github.com/gooofy/zerovox](https://github.com/gooofy/zerovox)\n"
)

tab1, tab2 = st.tabs(["Settings", "MEL Decoder"])

with tab1:
    lang = st.selectbox("Language", ["en", "de"], on_change=update_text_input, key='lang')

    speakers = list(ZeroVoxTTS.available_speakerrefs())
    # Fall back to the first voice when the default sample is not installed,
    # instead of failing with ValueError from list.index().
    default_speaker_idx = speakers.index(DEFAULT_SPEAKER) if DEFAULT_SPEAKER in speakers else 0
    speakerref = st.selectbox("Voice sample", speakers, index=default_speaker_idx)

    st.audio(ZeroVoxTTS.get_speakerref(speakerref, SAMPLE_RATE), sample_rate=SAMPLE_RATE)

with tab2:
    meldec = st.selectbox(
        "MEL decoder",
        ["meldec-libritts-multi-band-melgan-v2", "meldec-libritts-hifigan-v1"],
        key='meldec',
    )

status = st.status(st.session_state.message, state="complete")

text = st.text_input("Text to synthesize", key='text', on_change=do_synth)

st.button("Synthesize!", type="primary", on_click=do_synth)

if 'wav' in st.session_state:
    playback = st.audio(
        st.session_state.wav,
        sample_rate=st.session_state.get('wav_sample_rate', SAMPLE_RATE),
        autoplay=st.session_state.autoplay,
    )
else:
    playback = st.empty()