Spaces:
Running
Running
Guenter Bartsch
committed on
Commit
·
468fe55
1
Parent(s):
e7ab3a1
first draft implementation
Browse files- .vscode/launch.json +16 -0
- app.py +106 -2
.vscode/launch.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
// Use IntelliSense to learn about possible attributes.
|
3 |
+
// Hover to view descriptions of existing attributes.
|
4 |
+
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
|
5 |
+
"version": "0.2.0",
|
6 |
+
"configurations": [
|
7 |
+
{
|
8 |
+
"name": "Streamlit: Run and Debug",
|
9 |
+
"type": "debugpy",
|
10 |
+
"request": "launch",
|
11 |
+
"program": "../../venv/bin/streamlit",
|
12 |
+
"console": "integratedTerminal",
|
13 |
+
"args": ["run", "app.py"]
|
14 |
+
}
|
15 |
+
]
|
16 |
+
}
|
app.py
CHANGED
@@ -1,4 +1,108 @@
|
|
|
|
|
|
1 |
import streamlit as st
|
2 |
|
3 |
-
|
4 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import time
|
2 |
+
|
3 |
import streamlit as st
|
4 |
|
5 |
+
from zerovox.tts.synthesize import ZeroVoxTTS
|
6 |
+
from zerovox.g2p.g2p import DEFAULT_G2P_MODEL_NAME_DE, DEFAULT_G2P_MODEL_NAME_EN
|
7 |
+
|
8 |
+
# Sample rate used for the audio widgets in the UI.
# FIXME (carried over from the original): hard-coded; presumably this should
# follow the loaded model's configuration -- confirm.
SAMPLE_RATE = 24000
TTS_MODEL_NAME = 'tts_en_de_zerovox_alpha1'

# Seed the session values the UI reads; values already present in the
# session are left untouched.
for _key, _initial in (
    ("text", "Welcome to the world of speech synthesis!"),
    ("message", "READY."),
):
    if _key not in st.session_state:
        st.session_state[_key] = _initial
|
16 |
+
|
17 |
+
def update_text_input():
    """Replace the text to synthesize with the sample sentence for the chosen language.

    Used as the ``on_change`` callback of the language select box. Reads
    ``st.session_state['lang']``; for ``"en"`` or ``"de"`` the matching sample
    sentence is stored in ``st.session_state.text``, for any other value the
    stored text is left as it is. The resulting session text is then copied
    into the module-level ``text``.
    """
    global text

    sample_sentences = {
        "en": "Welcome to the world of speech synthesis!",
        "de": "Willkommen in der Welt der Sprachsynthese!",
    }

    sentence = sample_sentences.get(st.session_state['lang'])
    if sentence is not None:
        st.session_state.text = sentence

    text = st.session_state.text
|
24 |
+
|
25 |
+
def do_synth():
    """Synthesize the current text with the selected voice and publish the result.

    Used as the ``on_click`` callback of the "Synthesize!" button. Reads the
    module-level UI values ``lang``, ``speakerref`` and ``text`` as well as
    ``st.session_state['meldec']``, and reports progress through the
    module-level ``status`` and ``playback`` elements.

    Writes to ``st.session_state``:
        synth, modelcfg: (re)loaded when no model is cached, or when the
            cached one was loaded for a different language or MEL decoder.
        message: timing summary of this run (also shown in ``status``).
        wav: the synthesized waveform.
    """
    # Reuse the cached model only if it matches the current language and
    # MEL decoder; otherwise fall through to a (re)load.
    synth = st.session_state.synth if 'synth' in st.session_state else None
    if synth is not None and (synth.language != lang
                              or synth.meldec_model != st.session_state['meldec']):
        synth = None

    # Explicit None check: reloading must not depend on the model object's truthiness.
    if synth is None:
        status.update(label="loading the model...", state="running")

        g2p_model = DEFAULT_G2P_MODEL_NAME_DE if lang == 'de' else DEFAULT_G2P_MODEL_NAME_EN

        st.session_state.modelcfg, st.session_state.synth = ZeroVoxTTS.load_model(
            TTS_MODEL_NAME,
            g2p=g2p_model,
            lang=lang,
            meldec_model=st.session_state['meldec'],
            infer_device='cpu',
            num_threads=-1,
            do_compile=False,
            verbose=False)
        synth = st.session_state.synth

    # Read the config from the session unconditionally so the cached-model
    # path has it as well.
    modelcfg = st.session_state.modelcfg
    sampling_rate = modelcfg['audio']['sampling_rate']

    status.update(label="computing speaker embedding...", state="running")
    spkemb = synth.speaker_embed(ZeroVoxTTS.get_speakerref(speakerref, sampling_rate))

    status.update(label="synthesizing...", state="running")

    # perf_counter is monotonic and high-resolution, unlike the wall clock.
    start_time = time.perf_counter()
    wav, _phoneme, _length = synth.tts(text, spkemb)
    elapsed_time = time.perf_counter() - start_time

    wav_len = wav.shape[0] / sampling_rate
    # NOTE(review): this is seconds of audio per second of compute; "RTF" is
    # often defined as the inverse -- confirm the intended convention.
    # Guard against a zero-length interval instead of raising ZeroDivisionError.
    real_time_factor = wav_len / elapsed_time if elapsed_time > 0 else float('inf')

    message = f"synth time: {elapsed_time:.2f} sec"
    message += f", voice length: {wav_len:.2f} sec"
    message += f", rtf: {real_time_factor:.2f}"

    st.session_state.message = message
    status.update(label=message, state="complete")

    st.session_state.wav = wav
    # Play at the rate the model produced the waveform at (the same rate used
    # for wav_len above) rather than the hard-coded UI constant.
    playback.audio(wav, sample_rate=sampling_rate)
|
79 |
+
|
80 |
+
|
81 |
+
|
82 |
+
# --- UI: input widgets -------------------------------------------------------

# Changing the language swaps in the matching sample sentence.
lang = st.selectbox("Language",
                    ["en", "de"],
                    on_change=update_text_input,
                    key='lang')

meldec = st.selectbox("MEL decoder",
                      ["meldec-libritts-multi-band-melgan-v2", "meldec-libritts-hifigan-v1"],
                      key='meldec')

speakerref = st.selectbox("Voice sample", ZeroVoxTTS.available_speakerrefs())

# Preview of the selected reference voice, loaded and played at SAMPLE_RATE.
st.audio(ZeroVoxTTS.get_speakerref(speakerref, SAMPLE_RATE), sample_rate=SAMPLE_RATE)

text = st.text_input("Text to synthesize", value=st.session_state.text)

st.button("Synthesize!", type="primary", on_click=do_synth)

# --- UI: output elements (do_synth updates these through the module globals) --

status = st.status(st.session_state.message, state="complete")

if 'wav' in st.session_state:
    # Replay the stored waveform at the rate recorded in the model config that
    # do_synth keeps in the session; fall back to the UI constant if no config
    # is stored.
    replay_rate = SAMPLE_RATE
    if 'modelcfg' in st.session_state:
        replay_rate = st.session_state.modelcfg['audio']['sampling_rate']

    playback = st.audio(st.session_state.wav, sample_rate=replay_rate)
else:
    # Placeholder that do_synth fills with the first synthesized audio.
    playback = st.empty()
|