Spaces:
Running
Running
File size: 6,197 Bytes
a2d19e9 468fe55 a2d19e9 468fe55 e7ab3a1 468fe55 41889ec 3b61dcb a2d19e9 468fe55 3b61dcb 468fe55 78b1e75 a2d19e9 468fe55 3b61dcb 468fe55 3b61dcb 468fe55 a2d19e9 468fe55 f099617 468fe55 f099617 a2d19e9 468fe55 f099617 468fe55 1a38443 468fe55 a2d19e9 468fe55 b8c03ff 468fe55 78b1e75 a2d19e9 78b1e75 468fe55 a2d19e9 468fe55 b8c03ff 468fe55 a2d19e9 468fe55 a2d19e9 468fe55 41889ec b8c03ff 78b1e75 f6feaaf 78b1e75 468fe55 b8c03ff 468fe55 a2d19e9 468fe55 b8c03ff 468fe55 78b1e75 468fe55 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 |
import tempfile
import time
import librosa
import streamlit as st
from zerovox.tts.synthesize import ZeroVoxTTS
from zerovox.g2p.g2p import DEFAULT_G2P_MODEL_NAME_DE, DEFAULT_G2P_MODEL_NAME_EN
# Sample rate passed to librosa / st.audio by the page script below.
SAMPLE_RATE = 24000  # FIXME

# Speaker reference that is pre-selected in the "Voice" select box.
DEFAULT_SPEAKER = 'en_speaker_00061.wav'

# Example sentences that pre-fill the text input, one per supported language.
SAMPLE_SENTENCE_EN = "A rainbow is an optical phenomenon caused by refraction, internal reflection and dispersion of light in water droplets resulting in a continuous spectrum of light appearing in the sky."
#SAMPLE_SENTENCE_EN = "Welcome to the world of speech synthesis!"
SAMPLE_SENTENCE_DE = "Der Regenbogen ist ein atmosphärisch-optisches Phänomen, das als kreisbogenförmiges farbiges Lichtband in einem von der Sonne beschienenen Regenschauer erscheint."

# Initial values for the session-state entries used by this script. An entry
# is only written when it is missing, so values from earlier reruns survive.
_SESSION_DEFAULTS = {
    "lang": "en",
    "text": SAMPLE_SENTENCE_EN,
    "message": "READY.",
    "autoplay": False,
    "speakerref": DEFAULT_SPEAKER,
    "custom_voice": False,
    "voice_wav": None,
}

for _key, _default in _SESSION_DEFAULTS.items():
    if _key not in st.session_state:
        st.session_state[_key] = _default
def update_text_input():
    """Reset the text input to the sample sentence of the selected language.

    Registered as the ``on_change`` callback of the language select box.
    Reads ``st.session_state['lang']`` and, for "en" or "de", overwrites
    ``st.session_state.text``; any other value leaves the text untouched.
    """
    samples = {
        "en": SAMPLE_SENTENCE_EN,
        "de": SAMPLE_SENTENCE_DE,
    }
    selected = st.session_state['lang']
    if selected in samples:
        st.session_state.text = samples[selected]
def do_synth():
    """Synthesize ``st.session_state.text`` and publish the result.

    Used as the callback of the text input and of the "Synthesize!" button.

    Reads from session state: ``text``, ``lang``, ``meldec``, ``speakerref``,
    ``custom_voice``, ``voice_wav`` and, when present, the cached ``synth``
    and ``modelcfg``.

    Writes to session state: ``synth`` / ``modelcfg`` (when a model is
    loaded), ``wav`` (the synthesized audio), ``message`` (timing summary)
    and ``autoplay`` (set to ``True``).

    Progress is reported through the module-level ``status`` container that
    the page script creates with ``st.status``.
    """
    # Session state is the single source of truth for the widget values; do
    # not depend on the module-level `lang` variable the page script binds.
    lang = st.session_state.lang
    meldec_model = st.session_state['meldec']

    synth = None
    if 'synth' in st.session_state:
        synth = st.session_state.synth
        if synth.meldec_model != meldec_model:
            synth = None  # decoder changed: trigger a reload below
        elif synth.language != lang:
            status.update(label=f"loading the lexicon for {lang} ...", state="running")
            synth.language = lang

    if synth is None:
        status.update(label="loading the model...", state="running")
        g2p_model = DEFAULT_G2P_MODEL_NAME_DE if lang == 'de' else DEFAULT_G2P_MODEL_NAME_EN
        st.session_state.modelcfg, st.session_state.synth = ZeroVoxTTS.load_model(
            ZeroVoxTTS.get_default_model(),
            g2p=g2p_model,
            lang=lang,
            meldec_model=meldec_model,
            infer_device='cpu',
            num_threads=-1,
            do_compile=False,
            verbose=True,
        )
        synth = st.session_state.synth

    modelcfg = st.session_state.modelcfg
    sampling_rate = modelcfg['audio']['sampling_rate']

    status.update(label="computing speaker embedding...", state="running")
    # Fall back to the selected built-in speaker when "Custom voice" is
    # ticked but no sample has been uploaded yet.
    if not st.session_state.custom_voice or st.session_state.voice_wav is None:
        speakerref = ZeroVoxTTS.get_speakerref(st.session_state.speakerref, sampling_rate)
    else:
        speakerref = st.session_state.voice_wav
    spkemb = synth.speaker_embed(speakerref)

    status.update(label="synthesizing...", state="running")
    # perf_counter is monotonic, so the measured interval cannot be negative.
    start_time = time.perf_counter()
    wav, _phoneme, _length = synth.tts(st.session_state.text, spkemb)
    elapsed_time = time.perf_counter() - start_time

    wav_len = wav.shape[0] / sampling_rate
    # "rtf" here is audio seconds produced per second of compute.
    real_time_factor = wav_len / elapsed_time if elapsed_time > 0 else float('inf')

    st.session_state.message = (
        f"synth time: {elapsed_time:.2f} sec"
        f", voice length: {wav_len:.2f} sec"
        f", rtf: {real_time_factor:.2f}"
    )
    st.session_state.wav = wav
    st.session_state.autoplay = True
# ---------------------------------------------------------------------------
# Page layout. Streamlit re-executes this script top to bottom on every
# interaction; widget values live in st.session_state under their `key`.
# ---------------------------------------------------------------------------
st.set_page_config(page_title="ZeroVOX TTS Demo", page_icon=':speech_balloon:', layout="centered", initial_sidebar_state="auto", menu_items=None)
st.markdown("# ZeroVOX TTS Demo\n\nZeroVOX is a zero-shot realtime TTS system, fully offline, free and open source.\n\nFor more information, check out\n[https://github.com/gooofy/zerovox](https://github.com/gooofy/zerovox)\n")

tab1, tab2 = st.tabs(["Voice", "MEL Decoder"])

with tab1:
    st.checkbox("Custom voice", key='custom_voice')
    # Placeholder that holds either the file uploader or the speaker select box.
    speakerref = st.empty()
    if st.session_state.custom_voice:
        # Create a file uploader that accepts only .wav files
        uploaded_file = speakerref.file_uploader("Upload your voice sample", type=["wav"])
        # Process the uploaded file
        if uploaded_file is not None:
            with tempfile.NamedTemporaryFile() as f:
                # getvalue() returns the full upload regardless of the
                # stream position.
                f.write(uploaded_file.getvalue())
                # The file is re-opened by name below, so the buffered bytes
                # must be on disk first; otherwise the decoder may see a
                # truncated or empty file.
                f.flush()
                wav, _sr = librosa.load(f.name, sr=SAMPLE_RATE)
            st.session_state.voice_wav = wav
            st.audio(wav, sample_rate=SAMPLE_RATE)
    else:
        speakers = list(ZeroVoxTTS.available_speakerrefs())
        speakerref.selectbox("Voice", speakers, key='speakerref')
        # Preview of the currently selected built-in speaker reference.
        st.audio(ZeroVoxTTS.get_speakerref(st.session_state.speakerref, SAMPLE_RATE), sample_rate=SAMPLE_RATE)

with tab2:
    meldec = st.selectbox("MEL decoder",
                          ["meldec-libritts-hifigan-v1", "meldec-libritts-multi-band-melgan-v2"],
                          key='meldec')

# Status line; do_synth() updates it while working and stores the final
# summary in st.session_state.message.
status = st.status(st.session_state.message, state="complete")

col1, col2 = st.columns([0.8, 0.2])
with col1:
    text = st.text_input("Text to synthesize", key='text', on_change=do_synth)
with col2:
    lang = st.selectbox("Language",
                        ["en", "de"],
                        on_change=update_text_input,
                        key='lang')

st.button("Synthesize!", type="primary", on_click=do_synth)

# Show the most recent synthesis result, if there is one.
if 'wav' in st.session_state:
    playback = st.audio(st.session_state.wav, sample_rate=SAMPLE_RATE, autoplay=st.session_state.autoplay)
else:
    playback = st.empty()
|