zerovox-demo / app.py
Guenter Bartsch
autoplay, column layout
78b1e75
raw
history blame
4.26 kB
import time
import streamlit as st
from zerovox.tts.synthesize import ZeroVoxTTS
from zerovox.g2p.g2p import DEFAULT_G2P_MODEL_NAME_DE, DEFAULT_G2P_MODEL_NAME_EN
# Sample rate (Hz) handed to st.audio for playback.
# FIXME: hard-coded here, while do_synth() reads the rate from the loaded
# model configuration (modelcfg['audio']['sampling_rate']).
SAMPLE_RATE = 24000

# Name of the TTS model passed to ZeroVoxTTS.load_model().
TTS_MODEL_NAME = 'tts_en_de_zerovox_alpha1'

# Seed the per-session state on the first run only; values that are already
# present (i.e. on any later rerun of the script) are left untouched.
for _key, _default in (
    ("text", "Welcome to the world of speech synthesis!"),
    ("message", "READY."),
    ("autoplay", False),
):
    if _key not in st.session_state:
        st.session_state[_key] = _default
def update_text_input():
    """Reset the text to the sample sentence of the selected language.

    Registered as the ``on_change`` callback of the language select box.
    Looks up ``st.session_state['lang']``; for "en" or "de" the matching
    sample sentence is stored in ``st.session_state.text``.  For any other
    value the stored text is left as it is.  In every case the module-level
    ``text`` variable is then synchronised with ``st.session_state.text``.
    """
    global text

    sample_sentences = {
        "en": "Welcome to the world of speech synthesis!",
        "de": "Willkommen in der Welt der Sprachsynthese!",
    }

    selected = st.session_state['lang']
    if selected in sample_sentences:
        st.session_state.text = sample_sentences[selected]

    # Keep the module-level copy in step with the session state.
    text = st.session_state.text
def do_synth():
    """Synthesize the current text and store the result in the session state.

    Registered as the ``on_click`` callback of the "Synthesize!" button.

    Inputs:
      * module-level ``text``, ``speakerref`` and ``status`` (created by the
        page script),
      * the ``lang`` and ``meldec`` widget values from ``st.session_state``.

    Effects on ``st.session_state``:
      * ``synth`` / ``modelcfg``: (re)loaded via ``ZeroVoxTTS.load_model``
        when no engine is cached yet, or when the cached engine's
        ``language`` / ``meldec_model`` differ from the current selection,
      * ``message``: timing summary of this synthesis run,
      * ``wav``: the synthesized audio returned by ``synth.tts``,
      * ``autoplay``: set to ``True``.

    The function only reads module-level names, so no ``global`` declaration
    is needed.
    """
    # Read both selections from the session state so that the cache check and
    # the model load below always agree with each other.
    selected_lang = st.session_state['lang']
    selected_meldec = st.session_state['meldec']

    # Reuse the cached engine only if it matches the current selection.
    synth = st.session_state.synth if 'synth' in st.session_state else None
    if synth is not None and (synth.language != selected_lang
                              or synth.meldec_model != selected_meldec):
        synth = None  # trigger reload

    # Explicit None test: a cached engine must never be reloaded merely
    # because the object happens to evaluate as falsy.
    if synth is None:
        status.update(label="loading the model...", state="running")

        if selected_lang == 'de':
            g2p_model = DEFAULT_G2P_MODEL_NAME_DE
        else:
            g2p_model = DEFAULT_G2P_MODEL_NAME_EN

        modelcfg, synth = ZeroVoxTTS.load_model(TTS_MODEL_NAME,
                                                g2p=g2p_model,
                                                lang=selected_lang,
                                                meldec_model=selected_meldec,
                                                infer_device='cpu',
                                                num_threads=-1,
                                                do_compile=False,
                                                verbose=False)
        st.session_state.modelcfg = modelcfg
        st.session_state.synth = synth

    modelcfg = st.session_state.modelcfg
    sampling_rate = modelcfg['audio']['sampling_rate']

    status.update(label="computing speaker embedding...", state="running")
    spkemb = synth.speaker_embed(ZeroVoxTTS.get_speakerref(speakerref, sampling_rate))

    status.update(label="synthesizing...", state="running")

    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # (or turn negative) by wall-clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    wav, _, _ = synth.tts(text, spkemb)
    elapsed_time = time.perf_counter() - start_time

    wav_len = wav.shape[0] / sampling_rate

    # "rtf" as reported here is audio length divided by synthesis time.
    # Guard the division in case the timer reports a zero interval.
    real_time_factor = wav_len / elapsed_time if elapsed_time > 0 else float("inf")

    st.session_state.message = (
        f"synth time: {elapsed_time:.2f} sec"
        f", voice length: {wav_len:.2f} sec"
        f", rtf: {real_time_factor:.2f}"
    )

    # The page script reads these values from the session state to show the
    # status line and the audio player.
    st.session_state.wav = wav
    st.session_state.autoplay = True
# ---------------------------------------------------------------------------
# Page layout.  The names lang, text, speakerref and status assigned here are
# the module-level variables that do_synth() reads.
# ---------------------------------------------------------------------------

# Intro blurb, rendered as markdown at the top of the page.
st.markdown(
    "# ZeroVOX TTS Demo\n"
    "\n"
    "ZeroVOX is a zero-shot realtime TTS system, fully offline, free and open source.\n"
    "\n"
    "For more information, check out\n"
    "[https://github.com/gooofy/zerovox](https://github.com/gooofy/zerovox)\n"
)

col1, col2 = st.columns(2)

# First column: language, text to speak, and the button that starts synthesis.
with col1:
    language_choices = ["en", "de"]
    lang = st.selectbox("Language",
                        language_choices,
                        key='lang',
                        on_change=update_text_input)

    text = st.text_input("Text to synthesize", value=st.session_state.text)

    st.button("Synthesize!", on_click=do_synth, type="primary")

# Second column: MEL decoder choice and the reference voice with a preview.
with col2:
    meldec_choices = [
        "meldec-libritts-multi-band-melgan-v2",
        "meldec-libritts-hifigan-v1",
    ]
    meldec = st.selectbox("MEL decoder", meldec_choices, key='meldec')

    speakerref = st.selectbox("Voice sample", ZeroVoxTTS.available_speakerrefs())

    # Let the user listen to the selected reference voice.
    reference_audio = ZeroVoxTTS.get_speakerref(speakerref, SAMPLE_RATE)
    st.audio(reference_audio, sample_rate=SAMPLE_RATE)

# Status line showing the message stored in the session state.
status = st.status(st.session_state.message, state="complete")

# Show the player once a synthesized waveform is available; otherwise keep an
# empty placeholder.
playback = (
    st.audio(st.session_state.wav,
             sample_rate=SAMPLE_RATE,
             autoplay=st.session_state.autoplay)
    if 'wav' in st.session_state
    else st.empty()
)