Respair committed on
Commit 317cbd3 · verified · 1 Parent(s): a5088be

Create demo.py

Files changed (1): demo.py +427 -0
demo.py ADDED
@@ -0,0 +1,427 @@
# client_app.py
import gradio as gr
import random
import os
import numpy as np
from gradio_client import Client


class DummyClient:
    # Stand-in used when the remote endpoint can't be reached; the synthesis
    # wrappers below check for it and return silent audio instead of calling
    # client.predict.
    pass


try:
    client = Client(os.environ['src'])
except Exception as e:
    print(f"Warning: could not connect to the remote Gradio endpoint: {e}")
    client = DummyClient()
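# The remote endpoint address is read from the `src` environment variable;
# its value is deployment-specific and intentionally not hard-coded, e.g.
#   export src=<owner>/<space-name>   # placeholder, point it at the serving Space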

BASE_PATH = "Inference"
RU_RANDOM_TEXTS_PATH = os.path.join(BASE_PATH, "random_texts.txt")
EN_RANDOM_TEXTS_PATH = os.path.join(BASE_PATH, "english_random_texts.txt")
RU_PROMPT_TEXTS_PATH = os.path.join(BASE_PATH, "prompt.txt")
EN_PROMPT_TEXTS_PATH = os.path.join(BASE_PATH, "english_prompt.txt")

def load_texts(filepath):
    if not os.path.exists(os.path.dirname(filepath)) and os.path.dirname(filepath) != '':
        print(f"Warning: Directory '{os.path.dirname(filepath)}' not found.")
        return ["Example text file directory not found."]
    try:
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                return [line.strip() for line in f if line.strip()]
        except UnicodeDecodeError:
            print(f"Warning: UTF-8 decode failed for {filepath}. Trying 'cp1251' (common for Russian)...")
            with open(filepath, 'r', encoding='cp1251') as f:
                return [line.strip() for line in f if line.strip()]
    except FileNotFoundError:
        print(f"Warning: File not found - {filepath}")
        if "english" in filepath and "random" in filepath:
            return ["Example English text file not found."]
        elif "random" in filepath:
            return ["Пример русского текстового файла не найден."]
        elif "english" in filepath and "prompt" in filepath:
            return ["Speaker: Example English prompt file not found."]
        elif "prompt" in filepath:
            return ["Диктор: Пример русского файла подсказок не найден."]
        else:
            return ["Example text file not found."]
    except Exception as e:
        print(f"Error loading {filepath}: {e}")
        return ["Error loading example texts."]

ru_random_texts_list = load_texts(RU_RANDOM_TEXTS_PATH)
en_random_texts_list = load_texts(EN_RANDOM_TEXTS_PATH)
ru_prompt_texts_list = load_texts(RU_PROMPT_TEXTS_PATH)
en_prompt_texts_list = load_texts(EN_PROMPT_TEXTS_PATH)

def create_example_dict(text_list):
    if not text_list or not isinstance(text_list[0], str):
        return {"No examples found": ""}
    return {f"{text[:30]}...": text for text in text_list}

ru_prompt_examples = create_example_dict(ru_prompt_texts_list)
en_prompt_examples = create_example_dict(en_prompt_texts_list)


VOICE_DIR = "./reference_sample_wavs"
try:
    if os.path.exists(VOICE_DIR) and os.path.isdir(VOICE_DIR):
        voicelist = sorted([v for v in os.listdir(VOICE_DIR)
                            if os.path.isfile(os.path.join(VOICE_DIR, v))
                            and v.lower().endswith(('.wav', '.mp3', '.flac'))])
        if not voicelist:
            print(f"Warning: No compatible audio files found in {VOICE_DIR}. Dropdown will be empty.")
            voicelist = ["default.wav"]
    else:
        print(f"Warning: Voice directory not found or is not a directory: {VOICE_DIR}. Using placeholder list.")
        voicelist = ["anna_studio.wav", "boris_clear.wav", "female_neutral.wav", "male_deep.wav"]
except Exception as e:
    print(f"Error listing voices in {VOICE_DIR}: {e}")
    voicelist = ["error_loading_voices"]


def update_text_input_longform(preview_key, is_english):
    examples_dict = en_prompt_examples if is_english else ru_prompt_examples
    if preview_key in examples_dict:
        return examples_dict[preview_key]
    elif examples_dict:
        return list(examples_dict.values())[0]
    else:
        return "Selected example not found or examples failed to load."


def generate_random_spk(is_english):
    if is_english:
        rand_id = random.randint(0, 3250)
        print(f"Generated random English Speaker ID: {rand_id}")
        return rand_id
    else:
        rand_id = random.randint(0, 196)
        print(f"Generated random Russian Speaker ID: {rand_id}")
        return rand_id


def Client_Synthesize_Audio(text, voice, voice2_path, spk_id, vcsteps, embscale, beta, ros, t, language_checkbox):
    print("--- Client: Calling Synthesize_Audio ---")
    print(f"Text: {text[:50]}...")
    print(f"Default Voice: {voice}")
    print(f"Uploaded Voice Path: {voice2_path}")
    print(f"Speaker ID: {spk_id}")
    print(f"Steps: {vcsteps}, Scale: {embscale}, Beta: {beta}, RoS: {ros}, T: {t}")
    print(f"English Mode: {language_checkbox}")

    voice2_arg = voice2_path

    try:
        if isinstance(client, DummyClient):
            raise ConnectionError("Gradio client not connected.")

        result = client.predict(
            text,
            voice,
            voice2_arg,
            spk_id,
            vcsteps,
            embscale,
            beta,
            ros,
            t,
            language_checkbox,
            api_name="/Synthesize_Audio"
        )
        print("--- Client: Synthesize_Audio call successful ---")
        return result
    except Exception as e:
        print(f"--- Client: Error calling Synthesize_Audio: {e} ---")
        # Fall back to a silent placeholder so the Audio component stays valid.
        return (44100, np.zeros(1))

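# A minimal sketch (not wired into the UI) of hitting the same endpoint
# directly with gradio_client. The argument order mirrors the wrapper above;
# the text and parameter values here are illustrative assumptions only.
def _example_voice_guided_call():
    return client.predict(
        "Привет, мир!",  # text
        voicelist[0],     # default reference voice
        None,             # no uploaded reference audio
        9999,             # speaker ID (9999 disables it)
        5,                # diffusion steps
        1.0,              # embedding scale
        0.4,              # beta
        1.0,              # rate of speech
        1.0,              # t
        False,            # Russian mode
        api_name="/Synthesize_Audio",
    )
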
def Client_PromptedSynth_Text(text, beta, t, diffusion_steps, embedding_scale, ros, language_checkbox):
    print("--- Client: Calling PromptedSynth_Text ---")
    print(f"Text: {text[:50]}...")
    print(f"Beta: {beta}, T: {t}, Steps: {diffusion_steps}, Scale: {embedding_scale}, RoS: {ros}")
    print(f"English Mode: {language_checkbox}")

    try:
        if isinstance(client, DummyClient):
            raise ConnectionError("Gradio client not connected.")

        result = client.predict(
            text,
            beta,
            t,
            diffusion_steps,
            embedding_scale,
            ros,
            language_checkbox,
            api_name="/PromptedSynth_Text"
        )
        print("--- Client: PromptedSynth_Text call successful ---")
        return result
    except Exception as e:
        print(f"--- Client: Error calling PromptedSynth_Text: {e} ---")
        # Same silent placeholder fallback as above.
        return (44100, np.zeros(1))

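# Companion sketch for the text-guided endpoint; the prompt follows the
# "Speaker: text" convention described in the guide further down, and the
# parameter values are illustrative assumptions.
def _example_text_guided_call():
    return client.predict(
        "Speaker_5: Привет! Как дела?",  # semantic prompt + text
        0.4,    # beta
        0.8,    # t
        5,      # diffusion steps
        1.0,    # embedding scale
        1.0,    # rate of speech
        False,  # Russian mode
        api_name="/PromptedSynth_Text",
    )
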

INTROTXT = """#
Demo for The Poor Man's TTS; this runs on a single RTX 3090.
Repo -> [Hugging Face - 🤗](https://huggingface.co/Respair/Project_Kanade_SpeechModel)
**Check the Tips and Model Details tabs below.** <br>
Enjoy!
"""


with gr.Blocks() as audio_inf:
    gr.Markdown("### Synthesize speech using a reference audio clip (default, uploaded, or from speaker ID).")
    with gr.Row():
        with gr.Column(scale=1):
            language_checkbox_audio = gr.Checkbox(label="English?", value=False,
                                                  info="Tick for English synthesis, leave unchecked for Russian.")
            inp = gr.Textbox(label="Text",
                             info="Enter the text for voice-guided synthesis.",
                             value=ru_random_texts_list[0],
                             interactive=True,
                             scale=5)

            voice = gr.Dropdown(choices=voicelist,
                                label="Default Reference Voice",
                                info="Select a pre-defined reference voice.",
                                value=voicelist[0] if voicelist else None,
                                interactive=True)
            voice_2 = gr.Audio(label="Upload Your Audio Reference (Overrides Default Voice & Speaker ID)",
                               sources=["upload", "microphone"],
                               interactive=True,
                               type='filepath',
                               info="Upload a short (5-15s) clear audio clip.",
                               waveform_options={'waveform_color': '#a3ffc3', 'waveform_progress_color': '#e972ab'})
            spk_id = gr.Number(label="Speaker ID (Alternative Reference)",
                               info="Input speaker ID (max 196 Ru / 3250 En) to use a random sample from that speaker on the server. 9999 disables.",
                               value=9999,
                               interactive=True)

            random_spk_btn = gr.Button("Random")

            with gr.Accordion("Advanced Parameters", open=False):
                beta = gr.Slider(minimum=0, maximum=1, value=0.4, step=0.1,
                                 label="Beta (Style Strength vs. Reference)",
                                 info="Diffusion parameter. Higher means LESS like the reference audio. 0 disables diffusion.",
                                 interactive=True)
                multispeakersteps = gr.Slider(minimum=3, maximum=15, value=5, step=1,
                                              label="Diffusion Steps",
                                              info="More steps can improve quality but increase inference time.",
                                              interactive=True)
                embscale = gr.Slider(minimum=1, maximum=5, value=1, step=0.1,
                                     label="Embedding Scale (Intensity)",
                                     info="Impacts expressiveness. High values (> 1.5) might cause artifacts.",
                                     interactive=True)
                rate_of_speech = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1,
                                           label="Rate of Speech",
                                           info="Adjusts speech speed. 1.0 is normal.",
                                           interactive=True)
                t = gr.Slider(minimum=0.1, maximum=2, value=1.0, step=0.1,
                              label="T (Duration / Temperature)",
                              info="Controls duration scaling and randomness (T primarily affects English).",
                              interactive=True)

        with gr.Column(scale=1):
            btn = gr.Button("Synthesize (Voice Guided)", variant="primary")
            audio = gr.Audio(interactive=False,
                             label="Synthesized Audio",
                             waveform_options={'waveform_color': '#a3ffc3', 'waveform_progress_color': '#e972ab'})

    def update_audio_inf_defaults(is_english):
        new_text_value = en_random_texts_list[0] if is_english else ru_random_texts_list[0]
        new_spk_info = ("Input speaker ID (max 3250 En) or use Randomize. 9999 disables."
                        if is_english else
                        "Input speaker ID (max 196 Ru) or use Randomize. 9999 disables.")
        new_spk_val = 9999
        return gr.update(value=new_text_value), gr.update(info=new_spk_info, value=new_spk_val)

    language_checkbox_audio.change(update_audio_inf_defaults,
                                   inputs=[language_checkbox_audio],
                                   outputs=[inp, spk_id])

    random_spk_btn.click(fn=generate_random_spk, inputs=[language_checkbox_audio], outputs=spk_id)

    btn.click(Client_Synthesize_Audio,
              inputs=[inp, voice, voice_2, spk_id, multispeakersteps, embscale, beta, rate_of_speech, t, language_checkbox_audio],
              outputs=[audio],
              concurrency_limit=4)

with gr.Blocks() as longform:
    gr.Markdown("### Synthesize speech using the text content itself to guide the style (semantic prompting).")
    with gr.Row():
        with gr.Column(scale=1):
            language_checkbox_longform = gr.Checkbox(label="English?", value=False,
                                                     info="Tick for English synthesis, leave unchecked for Russian.")
            inp_longform = gr.Textbox(label="Text",
                                      info="Enter text; check the format from the examples.",
                                      value=ru_prompt_texts_list[0],
                                      lines=5,
                                      interactive=True,
                                      scale=5)

            with gr.Row():
                example_dropdown = gr.Dropdown(choices=list(ru_prompt_examples.keys()),
                                               label="Example Prompts",
                                               info="Select an example to load into the text box.",
                                               value=list(ru_prompt_examples.keys())[0] if ru_prompt_examples else None,
                                               interactive=True)

            with gr.Accordion("Advanced Parameters", open=False):
                beta_longform = gr.Slider(minimum=0, maximum=1, value=0.4, step=0.1,
                                          label="Beta (Style Strength vs. Semantic Prompt)",
                                          info="Diffusion parameter. Higher means LESS like the inferred style from text. 0 disables diffusion.",
                                          interactive=True)
                diffusion_steps_longform = gr.Slider(minimum=3, maximum=15, value=5, step=1,
                                                     label="Diffusion Steps",
                                                     info="More steps can improve quality but increase inference time.",
                                                     interactive=True)
                embedding_scale_longform = gr.Slider(minimum=1, maximum=5, value=1, step=0.1,
                                                     label="Embedding Scale (Intensity)",
                                                     info="Impacts expressiveness. High values (> 1.5) might cause artifacts.",
                                                     interactive=True)
                rate_of_speech_longform = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1,
                                                    label="Rate of Speech",
                                                    info="Adjusts speech speed. 1.0 is normal.",
                                                    interactive=True)
                t_longform = gr.Slider(minimum=0.1, maximum=2, value=0.8, step=0.1,
                                       label="T (Style Consistency - Primarily English)",
                                       info="Controls the influence of previous sentences' style on the current one.",
                                       interactive=True)

        with gr.Column(scale=1):
            btn_longform = gr.Button("Synthesize (Text Guided)", variant="primary")
            audio_longform = gr.Audio(interactive=False,
                                      label="Synthesized Audio",
                                      waveform_options={'waveform_color': '#a3ffc3', 'waveform_progress_color': '#e972ab'})

    def update_longform_defaults(is_english):
        examples_dict = en_prompt_examples if is_english else ru_prompt_examples
        new_choices = list(examples_dict.keys())
        new_value = new_choices[0] if new_choices else None
        new_text_value = examples_dict.get(new_value, list(examples_dict.values())[0] if examples_dict else ("Speaker: Example text." if is_english else "Диктор: Пример текста."))

        return gr.update(choices=new_choices, value=new_value), gr.update(value=new_text_value)

    language_checkbox_longform.change(update_longform_defaults,
                                      inputs=[language_checkbox_longform],
                                      outputs=[example_dropdown, inp_longform])

    example_dropdown.change(fn=update_text_input_longform,
                            inputs=[example_dropdown, language_checkbox_longform],
                            outputs=[inp_longform])

    btn_longform.click(Client_PromptedSynth_Text,
                       inputs=[inp_longform,
                               beta_longform,
                               t_longform,
                               diffusion_steps_longform,
                               embedding_scale_longform,
                               rate_of_speech_longform,
                               language_checkbox_longform],
                       outputs=[audio_longform],
                       concurrency_limit=4)

# --- User Guide / Info Tab (Reformatted User Text) ---
user_guide_text = """
## Quick Notes:

Everything in this demo & the repo (coming soon) is experimental. The main idea is just playing around with different things to see what works when you're limited to training on a pair of RTX 3090s.

The data used for the English model is rough and pretty tough for any TTS model (think debates, real conversations, plus a little bit of cleaner professional performances). It mostly comes from public sources or third parties (no TOS signed). I'll probably write a blog post later with more details.

So far I've focused on English and Russian; more languages can be covered.

---

### Voice-Guided Tab (Using Audio Reference)

* **Options:**
    * **Default Voices:** Pick one from the dropdown (these are stored locally).
    * **Upload Audio:** While the data isn't nearly enough for zero-shotting, you can still test your own samples. Make sure to decrease Beta if the result doesn't sound similar.
    * **Speaker ID:** Use a number (RU: 0-196, EN: 0-3250) to grab a random clip of that speaker from the server's dataset. Hit 'Randomize' to explore. (Invalid IDs use a default voice on the server.)
* **Some notes:**
    * **Not all speakers are equal.** Randomized samples might give you a poor reference sometimes.
    * **Play with Beta:** Values from 0.2 to 0.9 can work well. Higher Beta = LESS like the reference. It works great for some voices and breaks others, so please try different values. (0 = diffusion off.)

---

### Text-Guided Tab (Using Text Meaning)

* **Intuition:** Figure out the voice style just from the text itself (using semantic encoders). No audio needed, which makes it suitable for real-time use cases.
* **Speaker Prefix:** For Russian, you can use 'Speaker_ + number:'. For English, you can use any name; names were randomly assigned during the training of the encoder. See the format examples below.

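For instance (illustrative prompts; the speaker name and ID below are placeholders):

```
Speaker_5: Привет! Как дела?
Amanda: Hello there! How is it going?
```
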
---

### General Tips

* Punctuation matters for intonation; don't use unsupported symbols.
"""

with gr.Blocks() as info_tab:
    gr.Markdown(user_guide_text)

# --- Model Details Tab (Reformatted User Text) ---
model_details_text = """
## Model Details (The Guts)

---

### Darya (Russian Model) - More Stable

* Generally more controlled than the English one, which is also why it should sound much better in terms of acoustic quality.
* **Setup:** Non-End-to-End (separate steps).
* **Components:**
    * Style Encoder: Conformer-based.
    * Duration Predictor: Conformer-based (with cross-attention).
    * Semantic Encoder: `RuModernBERT-base` (for text guidance).
    * Diffusion Sampler: **None currently.**
* **Vocoder:** [RiFornet](https://github.com/Respaired/RiFornet_Vocoder)
* **Training:** ~200K steps on ~320 hours of Russian data (mix of conversation & narration, hundreds of speakers).
* **Size:** Lightweight (under ~200M params).
* **Specs:** 44.1kHz output, 128 mel bins.

---

### Kalliope (English Model) - Wild

* **Overall Vibe:** More expressive potential, but also less predictable. Showed signs of overfitting on the noisy data.
* **Setup:** Non-End-to-End.
* **Components:**
    * Style Encoder: Conformer-based.
    * Text Encoder: `ConvNextV2`.
    * Duration Predictor: Conformer-based (with cross-attention).
    * Acoustic Decoder: Conformer-based.
    * Semantic Encoder: `DeBERTa V3 Base` (for text guidance).
    * Diffusion Sampler: **Yes.**
* **Vocoder:** [RiFornet](https://github.com/Respaired/RiFornet_Vocoder).
* **Training:** ~100K steps on ~300-400 hours of *very complex & noisy* English data (conversational, whisper, narration, wide emotion range).
* **Size:** Bigger (~1.2B params total, but not all active at once - training was surprisingly doable). Hidden dim 1024, style vector 512.
* **Specs:** 44.1kHz output, 128 mel bins (though more than half of the dataset was 22-24kHz or even phone-call quality).

---

*More details might show up in a blog post later.*
"""

with gr.Blocks() as model_details_tab:
    gr.Markdown(model_details_text)


# Note: this local theme is currently unused; TabbedInterface below loads a Hub theme.
theme = gr.themes.Base(
    font=[gr.themes.GoogleFont('Libre Franklin'), gr.themes.GoogleFont('Public Sans'), 'system-ui', 'sans-serif'],
)

app = gr.TabbedInterface(
    [longform, audio_inf, info_tab, model_details_tab],
    ['Text-guided Synthesis', 'Voice-guided Synthesis', 'Intuition & Tips', 'Model Details'],
    title="The Poor Man's TTS (Experimental)",
    theme="Respair/[email protected]"
)


if __name__ == "__main__":
    print("Launching Client Gradio App...")
    app.queue(api_open=False, max_size=15).launch(show_api=False, share=True)