Create demo.py
demo.py
ADDED
@@ -0,0 +1,427 @@
# client_app.py
import gradio as gr
import random
import os
import numpy as np
from gradio_client import Client


class DummyClient:
    """Fallback sentinel used when the remote backend is unreachable; the
    synthesis wrappers below check for it before calling the server."""
    pass


# `src` holds the backend Space name or URL.
try:
    client = Client(os.environ['src'])
except Exception as e:
    print(f"Warning: could not connect to the backend ({e}); running with a dummy client.")
    client = DummyClient()


BASE_PATH = "Inference"
RU_RANDOM_TEXTS_PATH = os.path.join(BASE_PATH, "random_texts.txt")
EN_RANDOM_TEXTS_PATH = os.path.join(BASE_PATH, "english_random_texts.txt")
RU_PROMPT_TEXTS_PATH = os.path.join(BASE_PATH, "prompt.txt")
EN_PROMPT_TEXTS_PATH = os.path.join(BASE_PATH, "english_prompt.txt")


def load_texts(filepath):
    if not os.path.exists(os.path.dirname(filepath)) and os.path.dirname(filepath) != '':
        print(f"Warning: Directory '{os.path.dirname(filepath)}' not found.")
        return ["Example text file directory not found."]
    try:
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                return [line.strip() for line in f if line.strip()]
        except UnicodeDecodeError:
            print(f"Warning: UTF-8 decode failed for {filepath}. Trying 'cp1251' (common for Russian)...")
            with open(filepath, 'r', encoding='cp1251') as f:
                return [line.strip() for line in f if line.strip()]
    except FileNotFoundError:
        print(f"Warning: File not found - {filepath}")
        if "english" in filepath and "random" in filepath:
            return ["Example English text file not found."]
        elif "random" in filepath:
            return ["Пример русского текстового файла не найден."]
        elif "english" in filepath and "prompt" in filepath:
            return ["Speaker: Example English prompt file not found."]
        elif "prompt" in filepath:
            return ["Диктор: Пример русского файла подсказок не найден."]
        else:
            return ["Example text file not found."]
    except Exception as e:
        print(f"Error loading {filepath}: {e}")
        return ["Error loading example texts."]


ru_random_texts_list = load_texts(RU_RANDOM_TEXTS_PATH)
en_random_texts_list = load_texts(EN_RANDOM_TEXTS_PATH)
ru_prompt_texts_list = load_texts(RU_PROMPT_TEXTS_PATH)
en_prompt_texts_list = load_texts(EN_PROMPT_TEXTS_PATH)
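# Each of these files is expected to hold one example per line (blank lines are
# skipped by load_texts). The prompt files presumably follow the speaker-prefix
# convention described in the guide below, e.g. "Speaker_5: ..." / "Диктор: ...".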
def create_example_dict(text_list):
    if not text_list or not isinstance(text_list[0], str):
        return {"No examples found": ""}
    return {f"{text[:30]}...": text for text in text_list}


ru_prompt_examples = create_example_dict(ru_prompt_texts_list)
en_prompt_examples = create_example_dict(en_prompt_texts_list)
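# create_example_dict keys a 30-character preview (used as the dropdown label)
# to the full text, e.g.:
#   {"Speaker: Example English promp...": "Speaker: Example English prompt file not found."}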
VOICE_DIR = "./reference_sample_wavs"
try:
    if os.path.exists(VOICE_DIR) and os.path.isdir(VOICE_DIR):
        voicelist = sorted([v for v in os.listdir(VOICE_DIR)
                            if os.path.isfile(os.path.join(VOICE_DIR, v))
                            and v.lower().endswith(('.wav', '.mp3', '.flac'))])
        if not voicelist:
            print(f"Warning: No compatible audio files found in {VOICE_DIR}. Dropdown will be empty.")
            voicelist = ["default.wav"]
    else:
        print(f"Warning: Voice directory not found or is not a directory: {VOICE_DIR}. Using placeholder list.")
        voicelist = ["anna_studio.wav", "boris_clear.wav", "female_neutral.wav", "male_deep.wav"]
except Exception as e:
    print(f"Error listing voices in {VOICE_DIR}: {e}")
    voicelist = ["error_loading_voices"]


def update_text_input_longform(preview_key, is_english):
    examples_dict = en_prompt_examples if is_english else ru_prompt_examples
    if preview_key in examples_dict:
        return examples_dict[preview_key]
    elif examples_dict:
        return list(examples_dict.values())[0]
    else:
        return "Selected example not found or examples failed to load."


def generate_random_spk(is_english):
    if is_english:
        rand_id = random.randint(0, 3250)
        print(f"Generated random English Speaker ID: {rand_id}")
    else:
        rand_id = random.randint(0, 196)
        print(f"Generated random Russian Speaker ID: {rand_id}")
    return rand_id


def Client_Synthesize_Audio(text, voice, voice2_path, spk_id, vcsteps, embscale, beta, ros, t, language_checkbox):
    print("--- Client: Calling Synthesize_Audio ---")
    print(f"Text: {text[:50]}...")
    print(f"Default Voice: {voice}")
    print(f"Uploaded Voice Path: {voice2_path}")
    print(f"Speaker ID: {spk_id}")
    print(f"Steps: {vcsteps}, Scale: {embscale}, Beta: {beta}, RoS: {ros}, T: {t}")
    print(f"English Mode: {language_checkbox}")

    try:
        if isinstance(client, DummyClient):
            raise ConnectionError("Gradio client not connected.")

        result = client.predict(
            text,
            voice,
            voice2_path,
            spk_id,
            vcsteps,
            embscale,
            beta,
            ros,
            t,
            language_checkbox,
            api_name="/Synthesize_Audio"
        )
        print("--- Client: Synthesize_Audio call successful ---")
        return result
    except Exception as e:
        print(f"--- Client: Error calling Synthesize_Audio: {e} ---")
        # Return a silent one-sample placeholder so the gr.Audio output stays valid.
        return (44100, np.zeros(1))


def Client_PromptedSynth_Text(text, beta, t, diffusion_steps, embedding_scale, ros, language_checkbox):
    print("--- Client: Calling PromptedSynth_Text ---")
    print(f"Text: {text[:50]}...")
    print(f"Beta: {beta}, T: {t}, Steps: {diffusion_steps}, Scale: {embedding_scale}, RoS: {ros}")
    print(f"English Mode: {language_checkbox}")

    try:
        if isinstance(client, DummyClient):
            raise ConnectionError("Gradio client not connected.")

        result = client.predict(
            text,
            beta,
            t,
            diffusion_steps,
            embedding_scale,
            ros,
            language_checkbox,
            api_name="/PromptedSynth_Text"
        )
        print("--- Client: PromptedSynth_Text call successful ---")
        return result
    except Exception as e:
        print(f"--- Client: Error calling PromptedSynth_Text: {e} ---")
        # Return a silent one-sample placeholder so the gr.Audio output stays valid.
        return (44100, np.zeros(1))
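# For reference, the same backend endpoint can be called directly with
# gradio_client, outside this UI. A minimal sketch (assuming the `src` env var
# points at the running backend and the positional signature matches the
# wrapper above):
#
#   from gradio_client import Client
#   c = Client(os.environ['src'])
#   result = c.predict(
#       "Speaker: Hello there.",   # text
#       0.4,                       # beta
#       0.8,                       # t (style consistency)
#       5,                         # diffusion steps
#       1.0,                       # embedding scale
#       1.0,                       # rate of speech
#       True,                      # English mode
#       api_name="/PromptedSynth_Text",
#   )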
INTROTXT = """#
Demo for The Poor Man's TTS, running on a single RTX 3090.
Repo -> [Hugging Face - 🤗](https://huggingface.co/Respair/Project_Kanade_SpeechModel)
**Check the Tips and Model Details tabs below.** <br>
Enjoy!
"""
with gr.Blocks() as audio_inf:
    gr.Markdown("### Synthesize speech using a reference audio clip (default, uploaded, or from speaker ID).")
    with gr.Row():
        with gr.Column(scale=1):
            language_checkbox_audio = gr.Checkbox(label="English?", value=False,
                                                  info="Tick for English synthesis, leave unchecked for Russian.")
            inp = gr.Textbox(label="Text",
                             info="Enter the text for voice-guided synthesis.",
                             value=ru_random_texts_list[0],
                             interactive=True,
                             scale=5)

            voice = gr.Dropdown(choices=voicelist,
                                label="Default Reference Voice",
                                info="Select a pre-defined reference voice.",
                                value=voicelist[0] if voicelist else None,
                                interactive=True)
            # Upload a short (5-15s) clear audio clip. (gr.Audio takes no `info` kwarg.)
            voice_2 = gr.Audio(label="Upload Your Audio Reference (Overrides Default Voice & Speaker ID)",
                               sources=["upload", "microphone"],
                               interactive=True,
                               type='filepath',
                               waveform_options={'waveform_color': '#a3ffc3', 'waveform_progress_color': '#e972ab'})
            spk_id = gr.Number(label="Speaker ID (Alternative Reference)",
                               info="Input speaker ID (max 196 Ru / 3250 En) to use a random sample from that speaker on the server. 9999 disables.",
                               value=9999,
                               interactive=True)

            random_spk_btn = gr.Button("Randomize")

            with gr.Accordion("Advanced Parameters", open=False):
                beta = gr.Slider(minimum=0, maximum=1, value=0.4, step=0.1,
                                 label="Beta (Style Strength vs. Reference)",
                                 info="Diffusion parameter. Higher means LESS like the reference audio. 0 disables diffusion.",
                                 interactive=True)
                multispeakersteps = gr.Slider(minimum=3, maximum=15, value=5, step=1,
                                              label="Diffusion Steps",
                                              info="More steps can improve quality but increase inference time.",
                                              interactive=True)
                embscale = gr.Slider(minimum=1, maximum=5, value=1, step=0.1,
                                     label="Embedding Scale (Intensity)",
                                     info="Impacts expressiveness. High values (> 1.5) might cause artifacts.",
                                     interactive=True)
                rate_of_speech = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1,
                                           label="Rate of Speech",
                                           info="Adjusts speech speed. 1.0 is normal.",
                                           interactive=True)

                t = gr.Slider(minimum=0.1, maximum=2, value=1.0, step=0.1,
                              label="T (Duration / Temperature)",
                              info="Controls duration scaling and randomness (T primarily affects English).",
                              interactive=True)

        with gr.Column(scale=1):
            btn = gr.Button("Synthesize (Voice Guided)", variant="primary")
            audio = gr.Audio(interactive=False,
                             label="Synthesized Audio",
                             waveform_options={'waveform_color': '#a3ffc3', 'waveform_progress_color': '#e972ab'})

    def update_audio_inf_defaults(is_english):
        new_text_value = en_random_texts_list[0] if is_english else ru_random_texts_list[0]
        new_spk_info = ("Input speaker ID (max 3250 En) or use Randomize. 9999 disables."
                        if is_english
                        else "Input speaker ID (max 196 Ru) or use Randomize. 9999 disables.")
        new_spk_val = 9999
        return gr.update(value=new_text_value), gr.update(info=new_spk_info, value=new_spk_val)

    language_checkbox_audio.change(update_audio_inf_defaults,
                                   inputs=[language_checkbox_audio],
                                   outputs=[inp, spk_id])

    random_spk_btn.click(fn=generate_random_spk, inputs=[language_checkbox_audio], outputs=spk_id)

    btn.click(Client_Synthesize_Audio,
              inputs=[inp, voice, voice_2, spk_id, multispeakersteps, embscale, beta, rate_of_speech, t, language_checkbox_audio],
              outputs=[audio],
              concurrency_limit=4)
with gr.Blocks() as longform:
    gr.Markdown("### Synthesize speech using the text content itself to guide the style (semantic prompting).")
    with gr.Row():
        with gr.Column(scale=1):
            language_checkbox_longform = gr.Checkbox(label="English?", value=False,
                                                     info="Tick for English synthesis, leave unchecked for Russian.")
            inp_longform = gr.Textbox(label="Text",
                                      info="Enter text; check the format from the examples.",
                                      value=ru_prompt_texts_list[0],
                                      lines=5,
                                      interactive=True,
                                      scale=5)

            with gr.Row():
                example_dropdown = gr.Dropdown(choices=list(ru_prompt_examples.keys()),
                                               label="Example Prompts",
                                               info="Select an example to load into the text box.",
                                               value=list(ru_prompt_examples.keys())[0] if ru_prompt_examples else None,
                                               interactive=True)

            with gr.Accordion("Advanced Parameters", open=False):
                beta_longform = gr.Slider(minimum=0, maximum=1, value=0.4, step=0.1,
                                          label="Beta (Style Strength vs. Semantic Prompt)",
                                          info="Diffusion parameter. Higher means LESS like the inferred style from text. 0 disables diffusion.",
                                          interactive=True)
                diffusion_steps_longform = gr.Slider(minimum=3, maximum=15, value=5, step=1,
                                                     label="Diffusion Steps",
                                                     info="More steps can improve quality but increase inference time.",
                                                     interactive=True)
                embedding_scale_longform = gr.Slider(minimum=1, maximum=5, value=1, step=0.1,
                                                     label="Embedding Scale (Intensity)",
                                                     info="Impacts expressiveness. High values (> 1.5) might cause artifacts.",
                                                     interactive=True)
                rate_of_speech_longform = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1,
                                                    label="Rate of Speech",
                                                    info="Adjusts speech speed. 1.0 is normal.",
                                                    interactive=True)
                t_longform = gr.Slider(minimum=0.1, maximum=2, value=0.8, step=0.1,
                                       label="T (Style Consistency - Primarily English)",
                                       info="Controls the influence of previous sentences' style on the current one.",
                                       interactive=True)

        with gr.Column(scale=1):
            btn_longform = gr.Button("Synthesize (Text Guided)", variant="primary")
            audio_longform = gr.Audio(interactive=False,
                                      label="Synthesized Audio",
                                      waveform_options={'waveform_color': '#a3ffc3', 'waveform_progress_color': '#e972ab'})

    def update_longform_defaults(is_english):
        examples_dict = en_prompt_examples if is_english else ru_prompt_examples
        new_choices = list(examples_dict.keys())
        new_value = new_choices[0] if new_choices else None
        new_text_value = examples_dict.get(new_value,
                                           list(examples_dict.values())[0] if examples_dict
                                           else ("Speaker: Example text." if is_english else "Диктор: Пример текста."))

        return gr.update(choices=new_choices, value=new_value), gr.update(value=new_text_value)

    language_checkbox_longform.change(update_longform_defaults,
                                      inputs=[language_checkbox_longform],
                                      outputs=[example_dropdown, inp_longform])

    example_dropdown.change(fn=update_text_input_longform,
                            inputs=[example_dropdown, language_checkbox_longform],
                            outputs=[inp_longform])

    btn_longform.click(Client_PromptedSynth_Text,
                       inputs=[inp_longform,
                               beta_longform,
                               t_longform,
                               diffusion_steps_longform,
                               embedding_scale_longform,
                               rate_of_speech_longform,
                               language_checkbox_longform],
                       outputs=[audio_longform],
                       concurrency_limit=4)
# --- User Guide / Info Tab (Reformatted User Text) ---
user_guide_text = """
## Quick Notes:

Everything in this demo & the repo (coming soon) is experimental. The main idea is just playing around with different things to see what works when you're limited to training on a pair of RTX 3090s.

The data used for the English model is rough and pretty tough for any TTS model (think debates, real conversations, plus a little bit of cleaner professional performances). It mostly comes from public sources or third parties (no TOS signed). I'll probably write a blog post later with more details.

So far I've focused on English and Russian; more languages can be covered.

---

### Voice-Guided Tab (Using Audio Reference)

* **Options:**
    * **Default Voices:** Pick one from the dropdown (these are stored locally).
    * **Upload Audio:** While the data isn't nearly enough for zero-shotting, you can still test your own samples. Make sure to decrease the beta if the result doesn't sound similar.
    * **Speaker ID:** Use a number (RU: 0-196, EN: 0-3250) to grab a random clip of that speaker from the server's dataset. Hit 'Randomize' to explore. (Invalid IDs use a default voice on the server.)
* **Some notes:**
    * **Not all speakers are equal.** Randomized samples might give you a poor reference sometimes.
    * **Play with Beta:** Values from 0.2 to 0.9 can work well. Higher Beta = LESS like the reference. It works great for some voices and breaks others, so please try different values. (0 = diffusion off.)

---

### Text-Guided Tab (Using Text Meaning)

* **Intuition:** Figure out the voice style from the text itself (using semantic encoders). No audio needed, which makes it suitable for real-time use cases.
* **Speaker Prefix:** For Russian, use 'Speaker_ + number:' (e.g. `Speaker_5: Привет!`). For English, you can use any name; names were randomly assigned during the training of the Encoder.

---

### General Tips

* Punctuation matters for intonation; don't use unsupported symbols.
"""
with gr.Blocks() as info_tab:
    gr.Markdown(user_guide_text)

# --- Model Details Tab (Reformatted User Text) ---
model_details_text = """
## Model Details (The Guts)

---

### Darya (Russian Model) - More Stable

* Generally more controlled than the English one, which is also why it should sound much better in terms of acoustic quality.
* **Setup:** Non-End-to-End (separate steps).
* **Components:**
    * Style Encoder: Conformer-based.
    * Duration Predictor: Conformer-based (with cross-attention).
    * Semantic Encoder: `RuModernBERT-base` (for text-guidance).
    * Diffusion Sampler: **None currently.**
* **Vocoder:** [RiFornet](https://github.com/Respaired/RiFornet_Vocoder)
* **Training:** ~200K steps on ~320 hours of Russian data (mix of conversation & narration, hundreds of speakers).
* **Size:** Lightweight (under ~200M params).
* **Specs:** 44.1kHz output, 128 mel bins.

---

### Kalliope (English Model) - Wild

* **Overall Vibe:** More expressive potential, but also less predictable. Showed signs of overfitting on the noisy data.
* **Setup:** Non-End-to-End.
* **Components:**
    * Style Encoder: Conformer-based.
    * Text Encoder: `ConvNextV2`.
    * Duration Predictor: Conformer-based (with cross-attention).
    * Acoustic Decoder: Conformer-based.
    * Semantic Encoder: `DeBERTa V3 Base` (for text-guidance).
    * Diffusion Sampler: **Yes.**
* **Vocoder:** [RiFornet](https://github.com/Respaired/RiFornet_Vocoder).
* **Training:** ~100K steps on ~300-400 hours of *very complex & noisy* English data (conversational, whisper, narration, wide emotion range).
* **Size:** Bigger (~1.2B params total, but not all active at once - training was surprisingly doable). Hidden dim 1024, style vector 512.
* **Specs:** 44.1kHz output, 128 mel bins (though more than half the dataset was 22-24kHz or even phone-call quality).

---

*More details might show up in a blog post later.*
"""

with gr.Blocks() as model_details_tab:
    gr.Markdown(model_details_text)
# Defined but currently unused; the TabbedInterface below loads a hub theme instead.
theme = gr.themes.Base(
    font=[gr.themes.GoogleFont('Libre Franklin'), gr.themes.GoogleFont('Public Sans'), 'system-ui', 'sans-serif'],
)

app = gr.TabbedInterface(
    [longform, audio_inf, info_tab, model_details_tab],
    ['Text-guided Synthesis', 'Voice-guided Synthesis', 'Intuition & Tips', 'Model Details'],
    title="The Poor Man's TTS (Experimental)",
    theme="Respair/[email protected]"
)


if __name__ == "__main__":
    print("Launching Client Gradio App...")
    app.queue(api_open=False, max_size=15).launch(show_api=False, share=True)
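# To run this client locally, export the backend address first
# ("user/backend-space" below is a hypothetical value):
#   export src="user/backend-space"
#   python demo.py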