Spaces:
Running
Running
File size: 4,641 Bytes
468fe55 e7ab3a1 468fe55 41889ec 468fe55 78b1e75 468fe55 b8c03ff 468fe55 b8c03ff 468fe55 f099617 468fe55 f099617 468fe55 f099617 468fe55 b8c03ff 468fe55 78b1e75 468fe55 78b1e75 468fe55 b8c03ff 468fe55 b8c03ff 78b1e75 468fe55 41889ec 468fe55 41889ec 468fe55 41889ec b8c03ff 78b1e75 468fe55 b8c03ff 468fe55 b8c03ff 468fe55 b8c03ff 468fe55 78b1e75 468fe55 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 |
import time
import streamlit as st
from zerovox.tts.synthesize import ZeroVoxTTS
from zerovox.g2p.g2p import DEFAULT_G2P_MODEL_NAME_DE, DEFAULT_G2P_MODEL_NAME_EN
# FIXME: hard-coded sample rate used by the audio widgets of this page.
SAMPLE_RATE = 24000
# Voice sample that is pre-selected in the "Voice sample" selectbox.
DEFAULT_SPEAKER = 'en_speaker_00061.wav'

# Seed the per-session state on the first run only; later reruns must keep
# whatever values the widgets and callbacks have stored in the meantime.
_SESSION_DEFAULTS = {
    "text": "Welcome to the world of speech synthesis!",
    "message": "READY.",
    "autoplay": False,
}
for _key, _default in _SESSION_DEFAULTS.items():
    if _key not in st.session_state:
        st.session_state[_key] = _default
def update_text_input():
    """Reset the text field to the default sentence of the selected language.

    Registered as the ``on_change`` callback of the language selectbox. Reads
    ``st.session_state['lang']`` and overwrites ``st.session_state.text`` when
    the language is one of the known codes; any other value leaves the text
    untouched.
    """
    greetings = {
        "en": "Welcome to the world of speech synthesis!",
        "de": "Willkommen in der Welt der Sprachsynthese!",
    }
    greeting = greetings.get(st.session_state['lang'])
    if greeting is not None:
        st.session_state.text = greeting
def do_synth():
    """Synthesize the current text and store the result in the session state.

    Used as the callback of both the text input and the "Synthesize!" button.

    Reads the module-level names ``lang``, ``status`` and ``speakerref`` as
    well as the ``text`` and ``meldec`` session entries. Writes ``synth`` and
    ``modelcfg`` (when a model is loaded), ``message``, ``wav`` and
    ``autoplay`` to ``st.session_state``.
    """
    # Reuse the cached synthesizer unless the MEL decoder selection changed.
    synth = st.session_state.synth if 'synth' in st.session_state else None
    if synth is not None:
        if synth.meldec_model != st.session_state['meldec']:
            synth = None  # trigger reload
        elif synth.language != lang:
            status.update(label=f"loading the lexicon for {lang} ...", state="running")
            synth.language = lang

    if synth is None:
        status.update(label="loading the model...", state="running")
        g2p_model = DEFAULT_G2P_MODEL_NAME_DE if lang == 'de' else DEFAULT_G2P_MODEL_NAME_EN
        st.session_state.modelcfg, st.session_state.synth = ZeroVoxTTS.load_model(
            ZeroVoxTTS.get_default_model(),
            g2p=g2p_model,
            lang=lang,
            meldec_model=st.session_state['meldec'],
            infer_device='cpu',
            num_threads=-1,
            do_compile=False,
            verbose=False,
        )
        synth = st.session_state.synth

    modelcfg = st.session_state.modelcfg
    sampling_rate = modelcfg['audio']['sampling_rate']

    status.update(label="computing speaker embedding...", state="running")
    spkemb = synth.speaker_embed(ZeroVoxTTS.get_speakerref(speakerref, sampling_rate))

    status.update(label="synthesizing...", state="running")
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by wall-clock adjustments.
    start_time = time.perf_counter()
    wav, _phoneme, _length = synth.tts(st.session_state.text, spkemb)
    elapsed_time = time.perf_counter() - start_time

    wav_len = wav.shape[0] / sampling_rate
    # Reported as audio seconds produced per second of compute; guard against
    # a zero reading from the timer.
    real_time_factor = wav_len / elapsed_time if elapsed_time > 0 else float('inf')

    st.session_state.message = (
        f"synth time: {elapsed_time:.2f} sec"
        f", voice length: {wav_len:.2f} sec"
        f", rtf: {real_time_factor:.2f}"
    )
    st.session_state.wav = wav
    # Ask the page script to start playback of the new audio.
    st.session_state.autoplay = True
# Page header (rendered as Markdown).
st.markdown(
    "# ZeroVOX TTS Demo\n\n"
    "ZeroVOX is a zero-shot realtime TTS system, fully offline, free and open source.\n\n"
    "For more information, check out\n"
    "[https://github.com/gooofy/zerovox](https://github.com/gooofy/zerovox)\n"
)

tab1, tab2 = st.tabs(["Settings", "MEL Decoder"])

with tab1:
    # Changing the language also swaps in the matching default sentence.
    lang = st.selectbox("Language",
                        ["en", "de"],
                        on_change=update_text_input,
                        key='lang')

    speakers = list(ZeroVoxTTS.available_speakerrefs())
    # Pre-select the preferred voice; fall back to the first entry instead of
    # raising ValueError when that voice is not available.
    default_index = speakers.index(DEFAULT_SPEAKER) if DEFAULT_SPEAKER in speakers else 0
    speakerref = st.selectbox("Voice sample", speakers, index=default_index)
    # Preview of the selected reference voice.
    st.audio(ZeroVoxTTS.get_speakerref(speakerref, SAMPLE_RATE), sample_rate=SAMPLE_RATE)

with tab2:
    meldec = st.selectbox("MEL decoder",
                          ["meldec-libritts-multi-band-melgan-v2", "meldec-libritts-hifigan-v1"],
                          key='meldec')

# Status line showing the message stored by the last synthesis run.
status = st.status(st.session_state.message, state="complete")

# Both committing the text and pressing the button trigger a synthesis.
text = st.text_input("Text to synthesize", key='text', on_change=do_synth)
st.button("Synthesize!", type="primary", on_click=do_synth)

if 'wav' in st.session_state:
    # Play back at the sampling rate of the cached model configuration (the
    # same rate do_synth uses to measure the audio length); the constant is
    # only a fallback when no configuration is cached.
    if 'modelcfg' in st.session_state:
        playback_rate = st.session_state.modelcfg['audio']['sampling_rate']
    else:
        playback_rate = SAMPLE_RATE
    playback = st.audio(st.session_state.wav,
                        sample_rate=playback_rate,
                        autoplay=st.session_state.autoplay)
else:
    # Nothing synthesized yet: reserve the slot without rendering a player.
    playback = st.empty()
|