Guenter Bartsch committed on
Commit
468fe55
·
1 Parent(s): e7ab3a1

first draft implementation

Browse files
Files changed (2) hide show
  1. .vscode/launch.json +16 -0
  2. app.py +106 -2
.vscode/launch.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ // Use IntelliSense to learn about possible attributes.
3
+ // Hover to view descriptions of existing attributes.
4
+ // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
5
+ "version": "0.2.0",
6
+ "configurations": [
7
+ {
8
+ "name": "Streamlit: Run and Debug",
9
+ "type": "debugpy",
10
+ "request": "launch",
11
+ "program": "../../venv/bin/streamlit",
12
+ "console": "integratedTerminal",
13
+ "args": ["run", "app.py"]
14
+ }
15
+ ]
16
+ }
app.py CHANGED
@@ -1,4 +1,108 @@
 
 
1
  import streamlit as st
2
 
3
- x = st.slider('Select a value')
4
- st.write(x, 'squared is', x * x)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+
3
  import streamlit as st
4
 
5
+ from zerovox.tts.synthesize import ZeroVoxTTS
6
+ from zerovox.g2p.g2p import DEFAULT_G2P_MODEL_NAME_DE, DEFAULT_G2P_MODEL_NAME_EN
7
+
8
# Sample rate (Hz) handed to every st.audio() call in this script.
# FIXME: hard-coded; do_synth() measures voice length with the model config's
# own modelcfg['audio']['sampling_rate'] instead.
SAMPLE_RATE = 24000

# Name of the model passed to ZeroVoxTTS.load_model().
TTS_MODEL_NAME = 'tts_en_de_zerovox_alpha1'

# Seed the per-session defaults once; later reruns keep whatever is stored.
if "text" not in st.session_state:
    st.session_state["text"] = "Welcome to the world of speech synthesis!"

if "message" not in st.session_state:
    st.session_state["message"] = "READY."
16
+
17
def update_text_input():
    """Reset the stored text to the sample sentence of the selected language.

    Registered as the ``on_change`` callback of the language select box.
    Reads the selection from ``st.session_state['lang']``; for "en" or "de"
    it overwrites ``st.session_state.text`` with that language's sample
    sentence, for any other value the stored text is left alone.  The
    module-level ``text`` is then synced to the stored value.
    """
    global text

    sample_sentences = {
        "en": "Welcome to the world of speech synthesis!",
        "de": "Willkommen in der Welt der Sprachsynthese!",
    }

    selected = st.session_state['lang']
    if selected in sample_sentences:
        st.session_state.text = sample_sentences[selected]

    text = st.session_state.text
24
+
25
def do_synth():
    """Synthesize the current text and push the audio to the playback slot.

    Registered as the ``on_click`` callback of the "Synthesize!" button.
    Steps:

    1. Reuse the synthesizer cached in ``st.session_state.synth`` if its
       language and MEL decoder still match the current selection; otherwise
       (re)load the model and cache it together with its config.
    2. Compute a speaker embedding from the selected voice sample.
    3. Run TTS, time it, and report timing statistics through the status
       widget and ``st.session_state.message``.
    4. Store the waveform in ``st.session_state.wav`` and play it.

    Relies on the module-level names ``lang``, ``status``, ``speakerref``,
    ``text`` and ``playback`` set by the top-level script.
    """
    # Only read here, never rebound; the declaration documents the dependency.
    global lang, status, speakerref, text, playback, meldec

    wanted_meldec = st.session_state['meldec']

    synth = None
    if 'synth' in st.session_state:
        synth = st.session_state.synth

    # Guard against a missing synthesizer before touching its attributes:
    # on the first click nothing has been cached yet.
    if synth is not None and (synth.language != lang
                              or synth.meldec_model != wanted_meldec):
        synth = None  # trigger reload

    if synth is None:

        status.update(label="loading the model...", state="running")

        g2p_model = DEFAULT_G2P_MODEL_NAME_DE if lang == 'de' else DEFAULT_G2P_MODEL_NAME_EN

        st.session_state.modelcfg, st.session_state.synth = ZeroVoxTTS.load_model(
            TTS_MODEL_NAME,
            g2p=g2p_model,
            lang=lang,
            meldec_model=wanted_meldec,
            infer_device='cpu',
            num_threads=-1,
            do_compile=False,
            verbose=False)

        synth = st.session_state.synth

    # Read the config outside the reload branch so it is also bound when the
    # cached synthesizer is reused.
    modelcfg = st.session_state.modelcfg
    sampling_rate = modelcfg['audio']['sampling_rate']

    status.update(label="computing speaker embedding...", state="running")

    spkemb = synth.speaker_embed(ZeroVoxTTS.get_speakerref(speakerref, sampling_rate))

    status.update(label="synthesizing...", state="running")

    # perf_counter() is monotonic, so the interval cannot be skewed by
    # wall-clock adjustments.
    start_time = time.perf_counter()

    wav, _phoneme, _length = synth.tts(text, spkemb)

    elapsed_time = time.perf_counter() - start_time

    message = f"synth time: {elapsed_time:.2f} sec"
    wav_len = wav.shape[0] / sampling_rate
    message += f", voice length: {wav_len:.2f} sec"
    # Reported as seconds of audio produced per second of compute.
    real_time_factor = wav_len / elapsed_time
    message += f", rtf: {real_time_factor:.2f}"

    st.session_state.message = message

    status.update(label=message, state="complete")

    # Keep the waveform so the top-level script can re-render it on reruns.
    st.session_state.wav = wav
    playback.audio(wav, sample_rate=SAMPLE_RATE)
79
+
80
+
81
+
82
# --- UI layout: widgets are rendered in the order they are created ---

# Language picker; changing it resets the text field to that language's
# sample sentence via update_text_input().
lang = st.selectbox(
    "Language",
    ["en", "de"],
    on_change=update_text_input,
    key='lang',
)

# MEL decoder picker; do_synth() reads the choice from st.session_state['meldec'].
meldec = st.selectbox(
    "MEL decoder",
    ["meldec-libritts-multi-band-melgan-v2", "meldec-libritts-hifigan-v1"],
    key='meldec',
)

# Voice sample picker plus a preview player for the chosen sample.
speakerref = st.selectbox("Voice sample", ZeroVoxTTS.available_speakerrefs())

st.audio(ZeroVoxTTS.get_speakerref(speakerref, SAMPLE_RATE), sample_rate=SAMPLE_RATE)

text = st.text_input("Text to synthesize", value=st.session_state.text)

st.button("Synthesize!", type="primary", on_click=do_synth)

# Status line showing the message stored by the last synthesis (or "READY.").
status = st.status(st.session_state.message, state="complete")

# Replay the last synthesized audio if there is one; otherwise reserve an
# empty slot that do_synth() fills later.
playback = (
    st.audio(st.session_state.wav, sample_rate=SAMPLE_RATE)
    if 'wav' in st.session_state
    else st.empty()
)