Spaces:
Running
Running
import tempfile | |
import time | |
import librosa | |
import streamlit as st | |
from zerovox.tts.synthesize import ZeroVoxTTS | |
from zerovox.g2p.g2p import DEFAULT_G2P_MODEL_NAME_DE, DEFAULT_G2P_MODEL_NAME_EN | |
# Sample rate (Hz) used by the UI for loading and playing audio.
# FIXME (kept from the original): hard-coded here, whereas do_synth() reads
# the rate from the loaded model configuration.
SAMPLE_RATE = 24000

# Speaker reference selected when the app starts.
DEFAULT_SPEAKER = 'en_speaker_00061.wav'

# Sample sentences pre-filled into the text box, one per supported language.
SAMPLE_SENTENCE_EN = "A rainbow is an optical phenomenon caused by refraction, internal reflection and dispersion of light in water droplets resulting in a continuous spectrum of light appearing in the sky."
#SAMPLE_SENTENCE_EN = "Welcome to the world of speech synthesis!"
SAMPLE_SENTENCE_DE = "Der Regenbogen ist ein atmosphärisch-optisches Phänomen, das als kreisbogenförmiges farbiges Lichtband in einem von der Sonne beschienenen Regenschauer erscheint."

# Initial values for every session-state key the app relies on.  A key is
# only seeded when it is missing, so values survive Streamlit reruns.
_SESSION_DEFAULTS = {
    "lang": "en",
    "text": SAMPLE_SENTENCE_EN,
    "message": "READY.",
    "autoplay": False,
    "speakerref": DEFAULT_SPEAKER,
    "custom_voice": False,
    "voice_wav": None,
}

for _key, _default in _SESSION_DEFAULTS.items():
    if _key not in st.session_state:
        st.session_state[_key] = _default
def update_text_input():
    """Reset the text box to the sample sentence of the selected language.

    Registered as the ``on_change`` callback of the language selectbox.
    Reads ``st.session_state['lang']`` and overwrites
    ``st.session_state.text``; any language other than "en" or "de"
    leaves the text untouched.
    """
    sample_by_lang = {
        "en": SAMPLE_SENTENCE_EN,
        "de": SAMPLE_SENTENCE_DE,
    }
    selected = st.session_state['lang']
    if selected in sample_by_lang:
        st.session_state.text = sample_by_lang[selected]
def do_synth():
    """Synthesize ``st.session_state.text`` and store the result in session state.

    Registered as the callback of both the text input and the
    "Synthesize!" button.  The TTS model is cached in
    ``st.session_state.synth`` / ``st.session_state.modelcfg`` and is
    (re)loaded when none is cached yet or when the selected MEL decoder
    differs from the cached model's.  A language change on a cached model
    only updates ``synth.language``.

    On completion writes:
      * ``st.session_state.wav``      -- the synthesized waveform
      * ``st.session_state.message``  -- timing summary shown in the status box
      * ``st.session_state.autoplay`` -- set to True
    """
    # Widget values are taken from session state, which is what Streamlit
    # guarantees to be current inside a callback.  The module-level `lang`
    # variable is only bound once the script has reached the language
    # selectbox, so it may be stale or missing here.
    lang = st.session_state.lang
    meldec_model = st.session_state['meldec']

    # `status` is the module-level status widget; it is only read here, so
    # no `global` declaration is required.
    synth = st.session_state.synth if 'synth' in st.session_state else None
    if synth is not None:
        if synth.meldec_model != meldec_model:
            synth = None  # decoder changed: trigger a full reload below
        elif synth.language != lang:
            status.update(label=f"loading the lexicon for {lang} ...", state="running")
            synth.language = lang

    if synth is None:
        status.update(label="loading the model...", state="running")
        g2p_model = DEFAULT_G2P_MODEL_NAME_DE if lang == 'de' else DEFAULT_G2P_MODEL_NAME_EN
        st.session_state.modelcfg, st.session_state.synth = ZeroVoxTTS.load_model(
            ZeroVoxTTS.get_default_model(),
            g2p=g2p_model,
            lang=lang,
            meldec_model=meldec_model,
            infer_device='cpu',
            num_threads=-1,
            do_compile=False,
            verbose=True,
        )
        synth = st.session_state.synth

    modelcfg = st.session_state.modelcfg
    sampling_rate = modelcfg['audio']['sampling_rate']

    status.update(label="computing speaker embedding...", state="running")
    # Fall back to the selected built-in voice when custom voice is enabled
    # but nothing has been uploaded yet.
    if not st.session_state.custom_voice or st.session_state.voice_wav is None:
        speakerref = ZeroVoxTTS.get_speakerref(st.session_state.speakerref, sampling_rate)
    else:
        speakerref = st.session_state.voice_wav
    spkemb = synth.speaker_embed(speakerref)

    status.update(label="synthesizing...", state="running")
    # perf_counter() is monotonic, so the measured interval cannot be
    # distorted by system clock adjustments.
    start_time = time.perf_counter()
    wav, phoneme, length = synth.tts(st.session_state.text, spkemb)
    elapsed_time = time.perf_counter() - start_time

    wav_len = wav.shape[0] / sampling_rate
    # Reported as audio seconds produced per second of compute; guard the
    # division in case the timer reports a zero interval.
    real_time_factor = wav_len / elapsed_time if elapsed_time > 0 else float('inf')

    st.session_state.message = (
        f"synth time: {elapsed_time:.2f} sec"
        f", voice length: {wav_len:.2f} sec"
        f", rtf: {real_time_factor:.2f}"
    )
    st.session_state.wav = wav
    st.session_state.autoplay = True
# ---------------------------------------------------------------------------
# Page layout.  Streamlit re-executes this script from top to bottom on every
# interaction; widget values are kept in st.session_state under their `key`.
# ---------------------------------------------------------------------------
st.set_page_config(page_title="ZeroVOX TTS Demo", page_icon=':speech_balloon:', layout="centered", initial_sidebar_state="auto", menu_items=None)

st.markdown("# ZeroVOX TTS Demo\n\nZeroVOX is a zero-shot realtime TTS system, fully offline, free and open source.\n\nFor more information, check out\n[https://github.com/gooofy/zerovox](https://github.com/gooofy/zerovox)\n")

tab1, tab2 = st.tabs(["Voice", "MEL Decoder"])

with tab1:
    st.checkbox("Custom voice", key='custom_voice')

    # Placeholder that holds either the file uploader or the voice selectbox.
    speakerref = st.empty()

    if st.session_state.custom_voice:
        # Create a file uploader that accepts only .wav files
        uploaded_file = speakerref.file_uploader("Upload your voice sample", type=["wav"])

        # Process the uploaded file
        if uploaded_file is not None:
            with tempfile.NamedTemporaryFile() as f:
                f.write(uploaded_file.read())
                # librosa reopens the file by name, so the buffered bytes
                # must reach the disk first; without the flush the loader
                # may see a truncated or empty file.
                f.flush()
                wav, _ = librosa.load(f.name, sr=SAMPLE_RATE)
            st.session_state.voice_wav = wav
            st.audio(wav, sample_rate=SAMPLE_RATE)
    else:
        speakers = list(ZeroVoxTTS.available_speakerrefs())
        speakerref.selectbox("Voice", speakers, key='speakerref')
        # Preview of the currently selected built-in voice.
        st.audio(ZeroVoxTTS.get_speakerref(st.session_state.speakerref, SAMPLE_RATE), sample_rate=SAMPLE_RATE)

with tab2:
    meldec = st.selectbox("MEL decoder",
                          ["meldec-libritts-hifigan-v1", "meldec-libritts-multi-band-melgan-v2"],
                          key='meldec')

# Status box; do_synth() updates its label while it is working.
status = st.status(st.session_state.message, state="complete")

col1, col2 = st.columns([0.8, 0.2])

with col1:
    # Committing a new text triggers synthesis immediately.
    text = st.text_input("Text to synthesize", key='text', on_change=do_synth)

with col2:
    # Switching the language swaps in the matching sample sentence.
    lang = st.selectbox("Language",
                        ["en", "de"],
                        on_change=update_text_input,
                        key='lang')

st.button("Synthesize!", type="primary", on_click=do_synth)

# Show the most recent synthesis result, if there is one.
if 'wav' in st.session_state:
    playback = st.audio(st.session_state.wav, sample_rate=SAMPLE_RATE, autoplay=st.session_state.autoplay)
else:
    playback = st.empty()