Spaces:
Running
Running
Guenter Bartsch
committed on
Commit
·
a2d19e9
1
Parent(s):
b8c03ff
custom voice option added
Browse files
- app.py +60 -16
- requirements.txt +1 -0
app.py
CHANGED
@@ -1,4 +1,6 @@
|
|
|
|
1 |
import time
|
|
|
2 |
|
3 |
import streamlit as st
|
4 |
|
@@ -9,6 +11,9 @@ SAMPLE_RATE=24000 # FIXME
|
|
9 |
|
10 |
DEFAULT_SPEAKER = 'en_speaker_00061.wav'
|
11 |
|
|
|
|
|
|
|
12 |
if "text" not in st.session_state:
|
13 |
st.session_state.text = "Welcome to the world of speech synthesis!"
|
14 |
|
@@ -18,6 +23,15 @@ if "message" not in st.session_state:
|
|
18 |
if "autoplay" not in st.session_state:
|
19 |
st.session_state.autoplay = False
|
20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
def update_text_input():
|
22 |
global text
|
23 |
if st.session_state['lang'] == "en":
|
@@ -28,8 +42,7 @@ def update_text_input():
|
|
28 |
|
29 |
def do_synth():
|
30 |
|
31 |
-
global
|
32 |
-
|
33 |
|
34 |
synth = None
|
35 |
if 'synth' in st.session_state:
|
@@ -38,9 +51,9 @@ def do_synth():
|
|
38 |
if synth.meldec_model != st.session_state['meldec']:
|
39 |
synth = None # trigger reload
|
40 |
else:
|
41 |
-
if synth.language != lang:
|
42 |
-
status.update(label=f"loading the lexicon for {lang} ...", state="running")
|
43 |
-
synth.language = lang
|
44 |
|
45 |
if not synth:
|
46 |
|
@@ -62,7 +75,12 @@ def do_synth():
|
|
62 |
|
63 |
status.update(label="computing speaker embedding...", state="running")
|
64 |
|
65 |
-
|
|
|
|
|
|
|
|
|
|
|
66 |
|
67 |
status.update(label="synthesizing...", state="running")
|
68 |
|
@@ -86,21 +104,40 @@ def do_synth():
|
|
86 |
st.session_state.autoplay = True
|
87 |
#playback.audio(wav, sample_rate=SAMPLE_RATE)
|
88 |
|
|
|
|
|
89 |
st.markdown("# ZeroVOX TTS Demo\n\nZeroVOX is a zero-shot realtime TTS system, fully offline, free and open source.\n\nFor more information, check out\n[https://github.com/gooofy/zerovox](https://github.com/gooofy/zerovox)\n")
|
90 |
|
91 |
-
tab1, tab2 = st.tabs(["
|
92 |
|
93 |
with tab1:
|
94 |
-
lang = st.selectbox("Language",
|
95 |
-
["en", "de"],
|
96 |
-
on_change=update_text_input,
|
97 |
-
key='lang')
|
98 |
|
99 |
-
|
|
|
|
|
100 |
|
101 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
102 |
|
103 |
-
st.audio(ZeroVoxTTS.get_speakerref(speakerref, SAMPLE_RATE), sample_rate=SAMPLE_RATE)
|
104 |
|
105 |
with tab2:
|
106 |
meldec = st.selectbox("MEL decoder",
|
@@ -110,8 +147,15 @@ with tab2:
|
|
110 |
|
111 |
status = st.status(st.session_state.message, state="complete")
|
112 |
|
113 |
-
|
114 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
115 |
|
116 |
st.button("Synthesize!", type="primary", on_click=do_synth)
|
117 |
|
|
|
1 |
+
import tempfile
|
2 |
import time
|
3 |
+
import librosa
|
4 |
|
5 |
import streamlit as st
|
6 |
|
|
|
11 |
|
12 |
DEFAULT_SPEAKER = 'en_speaker_00061.wav'
|
13 |
|
14 |
+
if "lang" not in st.session_state:
|
15 |
+
st.session_state.lang = "en"
|
16 |
+
|
17 |
if "text" not in st.session_state:
|
18 |
st.session_state.text = "Welcome to the world of speech synthesis!"
|
19 |
|
|
|
23 |
if "autoplay" not in st.session_state:
|
24 |
st.session_state.autoplay = False
|
25 |
|
26 |
+
if "speakerref" not in st.session_state:
|
27 |
+
st.session_state.speakerref = DEFAULT_SPEAKER
|
28 |
+
|
29 |
+
if "custom_voice" not in st.session_state:
|
30 |
+
st.session_state.custom_voice = False
|
31 |
+
|
32 |
+
if "voice_wav" not in st.session_state:
|
33 |
+
st.session_state.voice_wav = None
|
34 |
+
|
35 |
def update_text_input():
|
36 |
global text
|
37 |
if st.session_state['lang'] == "en":
|
|
|
42 |
|
43 |
def do_synth():
|
44 |
|
45 |
+
global status, playback, meldec
|
|
|
46 |
|
47 |
synth = None
|
48 |
if 'synth' in st.session_state:
|
|
|
51 |
if synth.meldec_model != st.session_state['meldec']:
|
52 |
synth = None # trigger reload
|
53 |
else:
|
54 |
+
if synth.language != st.session_state.lang:
|
55 |
+
status.update(label=f"loading the lexicon for {st.session_state.lang} ...", state="running")
|
56 |
+
synth.language = st.session_state.lang
|
57 |
|
58 |
if not synth:
|
59 |
|
|
|
75 |
|
76 |
status.update(label="computing speaker embedding...", state="running")
|
77 |
|
78 |
+
if not st.session_state.custom_voice or st.session_state.voice_wav is None:
|
79 |
+
speakerref = ZeroVoxTTS.get_speakerref(st.session_state.speakerref, modelcfg['audio']['sampling_rate'])
|
80 |
+
else:
|
81 |
+
speakerref = st.session_state.voice_wav
|
82 |
+
|
83 |
+
spkemb = synth.speaker_embed(speakerref)
|
84 |
|
85 |
status.update(label="synthesizing...", state="running")
|
86 |
|
|
|
104 |
st.session_state.autoplay = True
|
105 |
#playback.audio(wav, sample_rate=SAMPLE_RATE)
|
106 |
|
107 |
+
st.set_page_config(page_title="ZeroVOX TTS Demo", page_icon=':speech_balloon:', layout="centered", initial_sidebar_state="auto", menu_items=None)
|
108 |
+
|
109 |
st.markdown("# ZeroVOX TTS Demo\n\nZeroVOX is a zero-shot realtime TTS system, fully offline, free and open source.\n\nFor more information, check out\n[https://github.com/gooofy/zerovox](https://github.com/gooofy/zerovox)\n")
|
110 |
|
111 |
+
tab1, tab2 = st.tabs(["Voice", "MEL Decoder"])
|
112 |
|
113 |
with tab1:
|
|
|
|
|
|
|
|
|
114 |
|
115 |
+
st.checkbox("Custom voice", key='custom_voice')
|
116 |
+
|
117 |
+
speakerref = st.empty()
|
118 |
|
119 |
+
if st.session_state.custom_voice:
|
120 |
+
|
121 |
+
# Create a file uploader that accepts only .wav files
|
122 |
+
uploaded_file = speakerref.file_uploader("Upload your voice sample", type=["wav"])
|
123 |
+
|
124 |
+
# Process the uploaded file
|
125 |
+
if uploaded_file is not None:
|
126 |
+
with tempfile.NamedTemporaryFile() as f:
|
127 |
+
f.write(uploaded_file.read())
|
128 |
+
wav, sr = librosa.load(f.name, sr=SAMPLE_RATE)
|
129 |
+
|
130 |
+
st.session_state.voice_wav=wav
|
131 |
+
|
132 |
+
st.audio(wav, sample_rate=SAMPLE_RATE)
|
133 |
+
|
134 |
+
else:
|
135 |
+
|
136 |
+
speakers = [s for s in ZeroVoxTTS.available_speakerrefs()]
|
137 |
+
speakerref.selectbox("Voice", speakers, key='speakerref')
|
138 |
+
|
139 |
+
st.audio(ZeroVoxTTS.get_speakerref(st.session_state.speakerref, SAMPLE_RATE), sample_rate=SAMPLE_RATE)
|
140 |
|
|
|
141 |
|
142 |
with tab2:
|
143 |
meldec = st.selectbox("MEL decoder",
|
|
|
147 |
|
148 |
status = st.status(st.session_state.message, state="complete")
|
149 |
|
150 |
+
col1, col2 = st.columns([0.8, 0.2])
|
151 |
+
with col1:
|
152 |
+
text = st.text_input("Text to synthesize", key='text', on_change=do_synth)
|
153 |
+
|
154 |
+
with col2:
|
155 |
+
lang = st.selectbox("Language",
|
156 |
+
["en", "de"],
|
157 |
+
on_change=update_text_input,
|
158 |
+
key='lang')
|
159 |
|
160 |
st.button("Synthesize!", type="primary", on_click=do_synth)
|
161 |
|
requirements.txt
CHANGED
@@ -1 +1,2 @@
|
|
1 |
zerovox>=0.0.15
|
|
|
|
1 |
zerovox>=0.0.15
|
2 |
+
librosa>=0.10.2
|