Respair committed on
Commit 317cbd3 · verified · 1 Parent(s): a5088be

Create demo.py

Files changed (1): demo.py +427 -0
demo.py ADDED
@@ -0,0 +1,427 @@
# client_app.py
import gradio as gr
import random
import os
import numpy as np
from gradio_client import Client


class DummyClient:
    # Stand-in used when the remote endpoint can't be reached; the synthesis
    # wrappers below check for it and return silent audio instead of calling
    # client.predict.
    pass


try:
    client = Client(os.environ['src'])
except Exception as e:
    print(f"Warning: could not connect to the remote Gradio endpoint: {e}")
    client = DummyClient()
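# The remote endpoint address is read from the `src` environment variable;
# its value is deployment-specific and intentionally not hard-coded, e.g.
#   export src=<owner>/<space-name>   # placeholder, point it at the serving Space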

BASE_PATH = "Inference"
RU_RANDOM_TEXTS_PATH = os.path.join(BASE_PATH, "random_texts.txt")
EN_RANDOM_TEXTS_PATH = os.path.join(BASE_PATH, "english_random_texts.txt")
RU_PROMPT_TEXTS_PATH = os.path.join(BASE_PATH, "prompt.txt")
EN_PROMPT_TEXTS_PATH = os.path.join(BASE_PATH, "english_prompt.txt")

def load_texts(filepath):
    if not os.path.exists(os.path.dirname(filepath)) and os.path.dirname(filepath) != '':
        print(f"Warning: Directory '{os.path.dirname(filepath)}' not found.")
        return ["Example text file directory not found."]
    try:
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                return [line.strip() for line in f if line.strip()]
        except UnicodeDecodeError:
            print(f"Warning: UTF-8 decode failed for {filepath}. Trying 'cp1251' (common for Russian)...")
            with open(filepath, 'r', encoding='cp1251') as f:
                return [line.strip() for line in f if line.strip()]
    except FileNotFoundError:
        print(f"Warning: File not found - {filepath}")
        if "english" in filepath and "random" in filepath:
            return ["Example English text file not found."]
        elif "random" in filepath:
            return ["Пример русского текстового файла не найден."]
        elif "english" in filepath and "prompt" in filepath:
            return ["Speaker: Example English prompt file not found."]
        elif "prompt" in filepath:
            return ["Диктор: Пример русского файла подсказок не найден."]
        else:
            return ["Example text file not found."]
    except Exception as e:
        print(f"Error loading {filepath}: {e}")
        return ["Error loading example texts."]

ru_random_texts_list = load_texts(RU_RANDOM_TEXTS_PATH)
en_random_texts_list = load_texts(EN_RANDOM_TEXTS_PATH)
ru_prompt_texts_list = load_texts(RU_PROMPT_TEXTS_PATH)
en_prompt_texts_list = load_texts(EN_PROMPT_TEXTS_PATH)

def create_example_dict(text_list):
    if not text_list or not isinstance(text_list[0], str):
        return {"No examples found": ""}
    return {f"{text[:30]}...": text for text in text_list}

ru_prompt_examples = create_example_dict(ru_prompt_texts_list)
en_prompt_examples = create_example_dict(en_prompt_texts_list)


VOICE_DIR = "./reference_sample_wavs"
try:
    if os.path.exists(VOICE_DIR) and os.path.isdir(VOICE_DIR):
        voicelist = sorted([v for v in os.listdir(VOICE_DIR)
                            if os.path.isfile(os.path.join(VOICE_DIR, v))
                            and v.lower().endswith(('.wav', '.mp3', '.flac'))])
        if not voicelist:
            print(f"Warning: No compatible audio files found in {VOICE_DIR}. Dropdown will be empty.")
            voicelist = ["default.wav"]
    else:
        print(f"Warning: Voice directory not found or is not a directory: {VOICE_DIR}. Using placeholder list.")
        voicelist = ["anna_studio.wav", "boris_clear.wav", "female_neutral.wav", "male_deep.wav"]
except Exception as e:
    print(f"Error listing voices in {VOICE_DIR}: {e}")
    voicelist = ["error_loading_voices"]


def update_text_input_longform(preview_key, is_english):
    examples_dict = en_prompt_examples if is_english else ru_prompt_examples
    if preview_key in examples_dict:
        return examples_dict[preview_key]
    elif examples_dict:
        return list(examples_dict.values())[0]
    else:
        return "Selected example not found or examples failed to load."


def generate_random_spk(is_english):
    if is_english:
        rand_id = random.randint(0, 3250)
        print(f"Generated random English Speaker ID: {rand_id}")
        return rand_id
    else:
        rand_id = random.randint(0, 196)
        print(f"Generated random Russian Speaker ID: {rand_id}")
        return rand_id


def Client_Synthesize_Audio(text, voice, voice2_path, spk_id, vcsteps, embscale, beta, ros, t, language_checkbox):
    print("--- Client: Calling Synthesize_Audio ---")
    print(f"Text: {text[:50]}...")
    print(f"Default Voice: {voice}")
    print(f"Uploaded Voice Path: {voice2_path}")
    print(f"Speaker ID: {spk_id}")
    print(f"Steps: {vcsteps}, Scale: {embscale}, Beta: {beta}, RoS: {ros}, T: {t}")
    print(f"English Mode: {language_checkbox}")

    voice2_arg = voice2_path

    try:
        if isinstance(client, DummyClient):
            raise ConnectionError("Gradio client not connected.")

        result = client.predict(
            text,
            voice,
            voice2_arg,
            spk_id,
            vcsteps,
            embscale,
            beta,
            ros,
            t,
            language_checkbox,
            api_name="/Synthesize_Audio"
        )
        print("--- Client: Synthesize_Audio call successful ---")
        return result
    except Exception as e:
        print(f"--- Client: Error calling Synthesize_Audio: {e} ---")
        # Fall back to a silent placeholder so the Audio component stays valid.
        return (44100, np.zeros(1))

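# A minimal sketch (not wired into the UI) of hitting the same endpoint
# directly with gradio_client. The argument order mirrors the wrapper above;
# the text and parameter values here are illustrative assumptions only.
def _example_voice_guided_call():
    return client.predict(
        "Привет, мир!",  # text
        voicelist[0],     # default reference voice
        None,             # no uploaded reference audio
        9999,             # speaker ID (9999 disables it)
        5,                # diffusion steps
        1.0,              # embedding scale
        0.4,              # beta
        1.0,              # rate of speech
        1.0,              # t
        False,            # Russian mode
        api_name="/Synthesize_Audio",
    )
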
def Client_PromptedSynth_Text(text, beta, t, diffusion_steps, embedding_scale, ros, language_checkbox):
    print("--- Client: Calling PromptedSynth_Text ---")
    print(f"Text: {text[:50]}...")
    print(f"Beta: {beta}, T: {t}, Steps: {diffusion_steps}, Scale: {embedding_scale}, RoS: {ros}")
    print(f"English Mode: {language_checkbox}")

    try:
        if isinstance(client, DummyClient):
            raise ConnectionError("Gradio client not connected.")

        result = client.predict(
            text,
            beta,
            t,
            diffusion_steps,
            embedding_scale,
            ros,
            language_checkbox,
            api_name="/PromptedSynth_Text"
        )
        print("--- Client: PromptedSynth_Text call successful ---")
        return result
    except Exception as e:
        print(f"--- Client: Error calling PromptedSynth_Text: {e} ---")
        # Same silent placeholder fallback as above.
        return (44100, np.zeros(1))

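# Companion sketch for the text-guided endpoint; the prompt follows the
# "Speaker: text" convention described in the guide further down, and the
# parameter values are illustrative assumptions.
def _example_text_guided_call():
    return client.predict(
        "Speaker_5: Привет! Как дела?",  # semantic prompt + text
        0.4,    # beta
        0.8,    # t
        5,      # diffusion steps
        1.0,    # embedding scale
        1.0,    # rate of speech
        False,  # Russian mode
        api_name="/PromptedSynth_Text",
    )
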

INTROTXT = """#
Demo for The Poor Man's TTS; this runs on a single RTX 3090.
Repo -> [Hugging Face - 🤗](https://huggingface.co/Respair/Project_Kanade_SpeechModel)
**Check the Tips and Model Details tabs below.** <br>
Enjoy!
"""


with gr.Blocks() as audio_inf:
    gr.Markdown("### Synthesize speech using a reference audio clip (default, uploaded, or from speaker ID).")
    with gr.Row():
        with gr.Column(scale=1):
            language_checkbox_audio = gr.Checkbox(label="English?", value=False,
                                                  info="Tick for English synthesis, leave unchecked for Russian.")
            inp = gr.Textbox(label="Text",
                             info="Enter the text for voice-guided synthesis.",
                             value=ru_random_texts_list[0],
                             interactive=True,
                             scale=5)

            voice = gr.Dropdown(choices=voicelist,
                                label="Default Reference Voice",
                                info="Select a pre-defined reference voice.",
                                value=voicelist[0] if voicelist else None,
                                interactive=True)
            voice_2 = gr.Audio(label="Upload Your Audio Reference (Overrides Default Voice & Speaker ID)",
                               sources=["upload", "microphone"],
                               interactive=True,
                               type='filepath',
                               info="Upload a short (5-15s) clear audio clip.",
                               waveform_options={'waveform_color': '#a3ffc3', 'waveform_progress_color': '#e972ab'})
            spk_id = gr.Number(label="Speaker ID (Alternative Reference)",
                               info="Input speaker ID (max 196 Ru / 3250 En) to use a random sample from that speaker on the server. 9999 disables.",
                               value=9999,
                               interactive=True)

            random_spk_btn = gr.Button("Random")

            with gr.Accordion("Advanced Parameters", open=False):
                beta = gr.Slider(minimum=0, maximum=1, value=0.4, step=0.1,
                                 label="Beta (Style Strength vs. Reference)",
                                 info="Diffusion parameter. Higher means LESS like the reference audio. 0 disables diffusion.",
                                 interactive=True)
                multispeakersteps = gr.Slider(minimum=3, maximum=15, value=5, step=1,
                                              label="Diffusion Steps",
                                              info="More steps can improve quality but increase inference time.",
                                              interactive=True)
                embscale = gr.Slider(minimum=1, maximum=5, value=1, step=0.1,
                                     label="Embedding Scale (Intensity)",
                                     info="Impacts expressiveness. High values (> 1.5) might cause artifacts.",
                                     interactive=True)
                rate_of_speech = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1,
                                           label="Rate of Speech",
                                           info="Adjusts speech speed. 1.0 is normal.",
                                           interactive=True)
                t = gr.Slider(minimum=0.1, maximum=2, value=1.0, step=0.1,
                              label="T (Duration / Temperature)",
                              info="Controls duration scaling and randomness (T primarily affects English).",
                              interactive=True)

        with gr.Column(scale=1):
            btn = gr.Button("Synthesize (Voice Guided)", variant="primary")
            audio = gr.Audio(interactive=False,
                             label="Synthesized Audio",
                             waveform_options={'waveform_color': '#a3ffc3', 'waveform_progress_color': '#e972ab'})

    def update_audio_inf_defaults(is_english):
        new_text_value = en_random_texts_list[0] if is_english else ru_random_texts_list[0]
        new_spk_info = ("Input speaker ID (max 3250 En) or use Randomize. 9999 disables."
                        if is_english else
                        "Input speaker ID (max 196 Ru) or use Randomize. 9999 disables.")
        new_spk_val = 9999
        return gr.update(value=new_text_value), gr.update(info=new_spk_info, value=new_spk_val)

    language_checkbox_audio.change(update_audio_inf_defaults,
                                   inputs=[language_checkbox_audio],
                                   outputs=[inp, spk_id])

    random_spk_btn.click(fn=generate_random_spk, inputs=[language_checkbox_audio], outputs=spk_id)

    btn.click(Client_Synthesize_Audio,
              inputs=[inp, voice, voice_2, spk_id, multispeakersteps, embscale, beta, rate_of_speech, t, language_checkbox_audio],
              outputs=[audio],
              concurrency_limit=4)

with gr.Blocks() as longform:
    gr.Markdown("### Synthesize speech using the text content itself to guide the style (semantic prompting).")
    with gr.Row():
        with gr.Column(scale=1):
            language_checkbox_longform = gr.Checkbox(label="English?", value=False,
                                                     info="Tick for English synthesis, leave unchecked for Russian.")
            inp_longform = gr.Textbox(label="Text",
                                      info="Enter text; check the format from the examples.",
                                      value=ru_prompt_texts_list[0],
                                      lines=5,
                                      interactive=True,
                                      scale=5)

            with gr.Row():
                example_dropdown = gr.Dropdown(choices=list(ru_prompt_examples.keys()),
                                               label="Example Prompts",
                                               info="Select an example to load into the text box.",
                                               value=list(ru_prompt_examples.keys())[0] if ru_prompt_examples else None,
                                               interactive=True)

            with gr.Accordion("Advanced Parameters", open=False):
                beta_longform = gr.Slider(minimum=0, maximum=1, value=0.4, step=0.1,
                                          label="Beta (Style Strength vs. Semantic Prompt)",
                                          info="Diffusion parameter. Higher means LESS like the inferred style from text. 0 disables diffusion.",
                                          interactive=True)
                diffusion_steps_longform = gr.Slider(minimum=3, maximum=15, value=5, step=1,
                                                     label="Diffusion Steps",
                                                     info="More steps can improve quality but increase inference time.",
                                                     interactive=True)
                embedding_scale_longform = gr.Slider(minimum=1, maximum=5, value=1, step=0.1,
                                                     label="Embedding Scale (Intensity)",
                                                     info="Impacts expressiveness. High values (> 1.5) might cause artifacts.",
                                                     interactive=True)
                rate_of_speech_longform = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1,
                                                    label="Rate of Speech",
                                                    info="Adjusts speech speed. 1.0 is normal.",
                                                    interactive=True)
                t_longform = gr.Slider(minimum=0.1, maximum=2, value=0.8, step=0.1,
                                       label="T (Style Consistency - Primarily English)",
                                       info="Controls the influence of previous sentences' style on the current one.",
                                       interactive=True)

        with gr.Column(scale=1):
            btn_longform = gr.Button("Synthesize (Text Guided)", variant="primary")
            audio_longform = gr.Audio(interactive=False,
                                      label="Synthesized Audio",
                                      waveform_options={'waveform_color': '#a3ffc3', 'waveform_progress_color': '#e972ab'})

    def update_longform_defaults(is_english):
        examples_dict = en_prompt_examples if is_english else ru_prompt_examples
        new_choices = list(examples_dict.keys())
        new_value = new_choices[0] if new_choices else None
        new_text_value = examples_dict.get(new_value, list(examples_dict.values())[0] if examples_dict else ("Speaker: Example text." if is_english else "Диктор: Пример текста."))

        return gr.update(choices=new_choices, value=new_value), gr.update(value=new_text_value)

    language_checkbox_longform.change(update_longform_defaults,
                                      inputs=[language_checkbox_longform],
                                      outputs=[example_dropdown, inp_longform])

    example_dropdown.change(fn=update_text_input_longform,
                            inputs=[example_dropdown, language_checkbox_longform],
                            outputs=[inp_longform])

    btn_longform.click(Client_PromptedSynth_Text,
                       inputs=[inp_longform,
                               beta_longform,
                               t_longform,
                               diffusion_steps_longform,
                               embedding_scale_longform,
                               rate_of_speech_longform,
                               language_checkbox_longform],
                       outputs=[audio_longform],
                       concurrency_limit=4)

# --- User Guide / Info Tab (Reformatted User Text) ---
user_guide_text = """
## Quick Notes:

Everything in this demo & the repo (coming soon) is experimental. The main idea is just playing around with different things to see what works when you're limited to training on a pair of RTX 3090s.

The data used for the English model is rough and pretty tough for any TTS model (think debates, real conversations, plus a little bit of cleaner professional performances). It mostly comes from public sources or third parties (no TOS signed). I'll probably write a blog post later with more details.

So far I've focused on English and Russian; more languages can be covered.

---

### Voice-Guided Tab (Using Audio Reference)

* **Options:**
    * **Default Voices:** Pick one from the dropdown (these are stored locally).
    * **Upload Audio:** While the data isn't nearly enough for zero-shotting, you can still test your own samples. Make sure to decrease Beta if the result doesn't sound similar.
    * **Speaker ID:** Use a number (RU: 0-196, EN: 0-3250) to grab a random clip of that speaker from the server's dataset. Hit 'Randomize' to explore. (Invalid IDs use a default voice on the server.)
* **Some notes:**
    * **Not all speakers are equal.** Randomized samples might give you a poor reference sometimes.
    * **Play with Beta:** Values from 0.2 to 0.9 can work well. Higher Beta = LESS like the reference. It works great for some voices and breaks others, so please try different values. (0 = diffusion off.)

---

### Text-Guided Tab (Using Text Meaning)

* **Intuition:** Figure out the voice style just from the text itself (using semantic encoders). No audio needed, which makes it suitable for real-time use cases.
* **Speaker Prefix:** For Russian, you can use 'Speaker_ + number:'. For English, you can use any name; names were randomly assigned during the training of the encoder. See the format examples below.

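For instance (illustrative prompts; the speaker name and ID below are placeholders):

```
Speaker_5: Привет! Как дела?
Amanda: Hello there! How is it going?
```
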
---

### General Tips

* Punctuation matters for intonation; don't use unsupported symbols.
"""

with gr.Blocks() as info_tab:
    gr.Markdown(user_guide_text)

# --- Model Details Tab (Reformatted User Text) ---
model_details_text = """
## Model Details (The Guts)

---

### Darya (Russian Model) - More Stable

* Generally more controlled than the English one, which is also why it should sound much better in terms of acoustic quality.
* **Setup:** Non-End-to-End (separate steps).
* **Components:**
    * Style Encoder: Conformer-based.
    * Duration Predictor: Conformer-based (with cross-attention).
    * Semantic Encoder: `RuModernBERT-base` (for text guidance).
    * Diffusion Sampler: **None currently.**
* **Vocoder:** [RiFornet](https://github.com/Respaired/RiFornet_Vocoder)
* **Training:** ~200K steps on ~320 hours of Russian data (mix of conversation & narration, hundreds of speakers).
* **Size:** Lightweight (under ~200M params).
* **Specs:** 44.1kHz output, 128 mel bins.

---

### Kalliope (English Model) - Wild

* **Overall Vibe:** More expressive potential, but also less predictable. Showed signs of overfitting on the noisy data.
* **Setup:** Non-End-to-End.
* **Components:**
    * Style Encoder: Conformer-based.
    * Text Encoder: `ConvNextV2`.
    * Duration Predictor: Conformer-based (with cross-attention).
    * Acoustic Decoder: Conformer-based.
    * Semantic Encoder: `DeBERTa V3 Base` (for text guidance).
    * Diffusion Sampler: **Yes.**
* **Vocoder:** [RiFornet](https://github.com/Respaired/RiFornet_Vocoder).
* **Training:** ~100K steps on ~300-400 hours of *very complex & noisy* English data (conversational, whisper, narration, wide emotion range).
* **Size:** Bigger (~1.2B params total, but not all active at once - training was surprisingly doable). Hidden dim 1024, style vector 512.
* **Specs:** 44.1kHz output, 128 mel bins (though more than half of the dataset was 22-24kHz or even phone-call quality).

---

*More details might show up in a blog post later.*
"""

with gr.Blocks() as model_details_tab:
    gr.Markdown(model_details_text)


# Note: this local theme is currently unused; TabbedInterface below loads a Hub theme.
theme = gr.themes.Base(
    font=[gr.themes.GoogleFont('Libre Franklin'), gr.themes.GoogleFont('Public Sans'), 'system-ui', 'sans-serif'],
)

app = gr.TabbedInterface(
    [longform, audio_inf, info_tab, model_details_tab],
    ['Text-guided Synthesis', 'Voice-guided Synthesis', 'Intuition & Tips', 'Model Details'],
    title="The Poor Man's TTS (Experimental)",
    theme="Respair/[email protected]"
)


if __name__ == "__main__":
    print("Launching Client Gradio App...")
    app.queue(api_open=False, max_size=15).launch(show_api=False, share=True)