Guenter Bartsch committed on
Commit
a2d19e9
·
1 Parent(s): b8c03ff

custom voice option added

Browse files
Files changed (2) hide show
  1. app.py +60 -16
  2. requirements.txt +1 -0
app.py CHANGED
@@ -1,4 +1,6 @@
 
1
  import time
 
2
 
3
  import streamlit as st
4
 
@@ -9,6 +11,9 @@ SAMPLE_RATE=24000 # FIXME
9
 
10
  DEFAULT_SPEAKER = 'en_speaker_00061.wav'
11
 
 
 
 
12
  if "text" not in st.session_state:
13
  st.session_state.text = "Welcome to the world of speech synthesis!"
14
 
@@ -18,6 +23,15 @@ if "message" not in st.session_state:
18
  if "autoplay" not in st.session_state:
19
  st.session_state.autoplay = False
20
 
 
 
 
 
 
 
 
 
 
21
  def update_text_input():
22
  global text
23
  if st.session_state['lang'] == "en":
@@ -28,8 +42,7 @@ def update_text_input():
28
 
29
  def do_synth():
30
 
31
- global lang, status, speakerref, playback, meldec
32
-
33
 
34
  synth = None
35
  if 'synth' in st.session_state:
@@ -38,9 +51,9 @@ def do_synth():
38
  if synth.meldec_model != st.session_state['meldec']:
39
  synth = None # trigger reload
40
  else:
41
- if synth.language != lang:
42
- status.update(label=f"loading the lexicon for {lang} ...", state="running")
43
- synth.language = lang
44
 
45
  if not synth:
46
 
@@ -62,7 +75,12 @@ def do_synth():
62
 
63
  status.update(label="computing speaker embedding...", state="running")
64
 
65
- spkemb = synth.speaker_embed(ZeroVoxTTS.get_speakerref(speakerref, modelcfg['audio']['sampling_rate']))
 
 
 
 
 
66
 
67
  status.update(label="synthesizing...", state="running")
68
 
@@ -86,21 +104,40 @@ def do_synth():
86
  st.session_state.autoplay = True
87
  #playback.audio(wav, sample_rate=SAMPLE_RATE)
88
 
 
 
89
  st.markdown("# ZeroVOX TTS Demo\n\nZeroVOX is a zero-shot realtime TTS system, fully offline, free and open source.\n\nFor more information, check out\n[https://github.com/gooofy/zerovox](https://github.com/gooofy/zerovox)\n")
90
 
91
- tab1, tab2 = st.tabs(["Settings", "MEL Decoder"])
92
 
93
  with tab1:
94
- lang = st.selectbox("Language",
95
- ["en", "de"],
96
- on_change=update_text_input,
97
- key='lang')
98
 
99
- speakers = [s for s in ZeroVoxTTS.available_speakerrefs()]
 
 
100
 
101
- speakerref = st.selectbox("Voice sample", speakers, index=speakers.index(DEFAULT_SPEAKER))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
 
103
- st.audio(ZeroVoxTTS.get_speakerref(speakerref, SAMPLE_RATE), sample_rate=SAMPLE_RATE)
104
 
105
  with tab2:
106
  meldec = st.selectbox("MEL decoder",
@@ -110,8 +147,15 @@ with tab2:
110
 
111
  status = st.status(st.session_state.message, state="complete")
112
 
113
- #text = st.text_input("Text to synthesize", value=st.session_state.text, key='text', on_change=do_synth)
114
- text = st.text_input("Text to synthesize", key='text', on_change=do_synth)
 
 
 
 
 
 
 
115
 
116
  st.button("Synthesize!", type="primary", on_click=do_synth)
117
 
 
1
+ import tempfile
2
  import time
3
+ import librosa
4
 
5
  import streamlit as st
6
 
 
11
 
12
  DEFAULT_SPEAKER = 'en_speaker_00061.wav'
13
 
14
+ if "lang" not in st.session_state:
15
+ st.session_state.lang = "en"
16
+
17
  if "text" not in st.session_state:
18
  st.session_state.text = "Welcome to the world of speech synthesis!"
19
 
 
23
  if "autoplay" not in st.session_state:
24
  st.session_state.autoplay = False
25
 
26
+ if "speakerref" not in st.session_state:
27
+ st.session_state.speakerref = DEFAULT_SPEAKER
28
+
29
+ if "custom_voice" not in st.session_state:
30
+ st.session_state.custom_voice = False
31
+
32
+ if "voice_wav" not in st.session_state:
33
+ st.session_state.voice_wav = None
34
+
35
  def update_text_input():
36
  global text
37
  if st.session_state['lang'] == "en":
 
42
 
43
  def do_synth():
44
 
45
+ global status, playback, meldec
 
46
 
47
  synth = None
48
  if 'synth' in st.session_state:
 
51
  if synth.meldec_model != st.session_state['meldec']:
52
  synth = None # trigger reload
53
  else:
54
+ if synth.language != st.session_state.lang:
55
+ status.update(label=f"loading the lexicon for {st.session_state.lang} ...", state="running")
56
+ synth.language = st.session_state.lang
57
 
58
  if not synth:
59
 
 
75
 
76
  status.update(label="computing speaker embedding...", state="running")
77
 
78
+ if not st.session_state.custom_voice or st.session_state.voice_wav is None:
79
+ speakerref = ZeroVoxTTS.get_speakerref(st.session_state.speakerref, modelcfg['audio']['sampling_rate'])
80
+ else:
81
+ speakerref = st.session_state.voice_wav
82
+
83
+ spkemb = synth.speaker_embed(speakerref)
84
 
85
  status.update(label="synthesizing...", state="running")
86
 
 
104
  st.session_state.autoplay = True
105
  #playback.audio(wav, sample_rate=SAMPLE_RATE)
106
 
107
+ st.set_page_config(page_title="ZeroVOX TTS Demo", page_icon=':speech_balloon:', layout="centered", initial_sidebar_state="auto", menu_items=None)
108
+
109
  st.markdown("# ZeroVOX TTS Demo\n\nZeroVOX is a zero-shot realtime TTS system, fully offline, free and open source.\n\nFor more information, check out\n[https://github.com/gooofy/zerovox](https://github.com/gooofy/zerovox)\n")
110
 
111
+ tab1, tab2 = st.tabs(["Voice", "MEL Decoder"])
112
 
113
  with tab1:
 
 
 
 
114
 
115
+ st.checkbox("Custom voice", key='custom_voice')
116
+
117
+ speakerref = st.empty()
118
 
119
+ if st.session_state.custom_voice:
120
+
121
+ # Create a file uploader that accepts only .wav files
122
+ uploaded_file = speakerref.file_uploader("Upload your voice sample", type=["wav"])
123
+
124
+ # Process the uploaded file
125
+ if uploaded_file is not None:
126
+ with tempfile.NamedTemporaryFile() as f:
127
+ f.write(uploaded_file.read())
128
+ wav, sr = librosa.load(f.name, sr=SAMPLE_RATE)
129
+
130
+ st.session_state.voice_wav=wav
131
+
132
+ st.audio(wav, sample_rate=SAMPLE_RATE)
133
+
134
+ else:
135
+
136
+ speakers = [s for s in ZeroVoxTTS.available_speakerrefs()]
137
+ speakerref.selectbox("Voice", speakers, key='speakerref')
138
+
139
+ st.audio(ZeroVoxTTS.get_speakerref(st.session_state.speakerref, SAMPLE_RATE), sample_rate=SAMPLE_RATE)
140
 
 
141
 
142
  with tab2:
143
  meldec = st.selectbox("MEL decoder",
 
147
 
148
  status = st.status(st.session_state.message, state="complete")
149
 
150
+ col1, col2 = st.columns([0.8, 0.2])
151
+ with col1:
152
+ text = st.text_input("Text to synthesize", key='text', on_change=do_synth)
153
+
154
+ with col2:
155
+ lang = st.selectbox("Language",
156
+ ["en", "de"],
157
+ on_change=update_text_input,
158
+ key='lang')
159
 
160
  st.button("Synthesize!", type="primary", on_click=do_synth)
161
 
requirements.txt CHANGED
@@ -1 +1,2 @@
1
  zerovox>=0.0.15
 
 
1
  zerovox>=0.0.15
2
+ librosa>=0.10.2