import os
import random

import gradio as gr
import numpy as np
import spaces
from gradio_client import Client

client = Client(os.environ['src'])

BASE_PATH = "Inference"
RU_RANDOM_TEXTS_PATH = os.path.join(BASE_PATH, "random_texts.txt")
EN_RANDOM_TEXTS_PATH = os.path.join(BASE_PATH, "english_random_texts.txt")
RU_PROMPT_TEXTS_PATH = os.path.join(BASE_PATH, "prompt.txt")
EN_PROMPT_TEXTS_PATH = os.path.join(BASE_PATH, "english_prompt.txt")


@spaces.GPU
def dummy():
    # No-op placeholder for the @spaces.GPU decorator (Spaces GPU allocation).
    return


def load_texts(filepath):
    """Load non-empty lines from a text file, falling back to cp1251 and to defaults."""
    dirname = os.path.dirname(filepath)
    if dirname and not os.path.exists(dirname):
        print(f"Warning: Directory '{dirname}' not found.")
        if "random" in filepath:
            return ["Default example text."]
        return ["Speaker: Default prompt text."]
    try:
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                return [line.strip() for line in f if line.strip()]
        except UnicodeDecodeError:
            print(f"Warning: UTF-8 decode failed for {filepath}. Trying 'cp1251' (common for Russian)...")
            with open(filepath, 'r', encoding='cp1251') as f:
                return [line.strip() for line in f if line.strip()]
    except FileNotFoundError:
        print(f"Warning: File not found - {filepath}")
        if "english" in filepath and "random" in filepath:
            return ["Example English text file not found."]
        elif "random" in filepath:
            return ["Пример русского текстового файла не найден."]  # "Russian example text file not found."
        elif "english" in filepath and "prompt" in filepath:
            return ["Speaker: Example English prompt file not found."]
        elif "prompt" in filepath:
            return ["Диктор: Пример русского файла подсказок не найден."]  # "Speaker: Russian example prompt file not found."
        else:
            return ["Example text file not found."]
    except Exception as e:
        print(f"Error loading {filepath}: {e}")
        return ["Error loading example texts."]


ru_random_texts_list = load_texts(RU_RANDOM_TEXTS_PATH)
en_random_texts_list = load_texts(EN_RANDOM_TEXTS_PATH)
ru_prompt_texts_list = load_texts(RU_PROMPT_TEXTS_PATH)
en_prompt_texts_list = load_texts(EN_PROMPT_TEXTS_PATH)


def create_example_dict(text_list):
    """Map a 30-character preview of each text to its full text, for dropdown choices."""
    if not text_list or not isinstance(text_list[0], str):
        return {"No examples found": ""}
    return {f"{text[:30]}...": text for text in text_list}


ru_prompt_examples = create_example_dict(ru_prompt_texts_list)
en_prompt_examples = create_example_dict(en_prompt_texts_list)

VOICE_DIR = "./reference_sample_wavs"
try:
    if os.path.exists(VOICE_DIR) and os.path.isdir(VOICE_DIR):
        voicelist = sorted([
            v for v in os.listdir(VOICE_DIR)
            if os.path.isfile(os.path.join(VOICE_DIR, v)) and v.lower().endswith(('.wav', '.mp3', '.flac'))
        ])
        if not voicelist:
            print(f"Warning: No compatible audio files found in {VOICE_DIR}. Dropdown will be empty.")
            voicelist = ["default.wav"]
    else:
        print(f"Warning: Voice directory not found or is not a directory: {VOICE_DIR}. Using placeholder list.")
        voicelist = ["anna_studio.wav", "boris_clear.wav", "female_neutral.wav", "male_deep.wav"]
except Exception as e:
    print(f"Error listing voices in {VOICE_DIR}: {e}")
    voicelist = ["error_loading_voices"]


def update_text_input_longform(preview_key, is_english):
    """Resolve a dropdown preview key back to its full example text."""
    examples_dict = en_prompt_examples if is_english else ru_prompt_examples
    if preview_key in examples_dict:
        return examples_dict[preview_key]
    elif examples_dict:
        return list(examples_dict.values())[0]
    else:
        return "Selected example not found or examples failed to load."
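# Illustrative shape of the example mappings built above (the values here are
# hypothetical; real entries come from the prompt files):
#   {"Диктор: Пример текста для син...": "Диктор: Пример текста для синтеза речи."}
# i.e. a 30-character preview key mapping to the full prompt line, which
# update_text_input_longform() resolves back to the full text for the textbox.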
def generate_random_spk(is_english):
    """Pick a random speaker ID within the valid range for the selected language."""
    if is_english:
        rand_id = random.randint(0, 2006)
        print(f"Generated random English Speaker ID: {rand_id}")
    else:
        rand_id = random.randint(0, 196)
        print(f"Generated random Russian Speaker ID: {rand_id}")
    return rand_id


def Client_Synthesize_Audio(text, voice, voice2_path, spk_id, vcsteps, embscale, beta, ros, t, language_checkbox):
    print("--- Client: Calling Synthesize_Audio ---")
    print(f"Text: {text[:50]}...")
    print(f"Default Voice: {voice}")
    print(f"Uploaded Voice Path: {voice2_path}")
    print(f"Speaker ID: {spk_id}")
    print(f"Steps: {vcsteps}, Scale: {embscale}, Beta: {beta}, RoS: {ros}, T: {t}")
    print(f"English Mode: {language_checkbox}")

    # Wrap an uploaded file path in the FileData payload that gradio_client expects.
    if voice2_path is not None:
        voice2_path = {"path": voice2_path, "meta": {"_type": "gradio.FileData"}}
    voice2_arg = voice2_path

    try:
        result = client.predict(
            text,
            voice,
            voice2_arg,
            spk_id,
            vcsteps,
            embscale,
            beta,
            ros,
            t,
            language_checkbox,
            api_name="/Synthesize_Audio"
        )
        print("--- Client: Synthesize_Audio call successful ---")
        return result
    except Exception as e:
        print(f"--- Client: Error calling Synthesize_Audio: {e} ---")
        # Return one sample of silence so the Audio component still receives valid output.
        return (44100, np.zeros(1))


def Client_PromptedSynth_Text(text, beta, t, diffusion_steps, embedding_scale, ros, language_checkbox):
    print("--- Client: Calling PromptedSynth_Text ---")
    print(f"Text: {text[:50]}...")
    print(f"Beta: {beta}, T: {t}, Steps: {diffusion_steps}, Scale: {embedding_scale}, RoS: {ros}")
    print(f"English Mode: {language_checkbox}")

    try:
        result = client.predict(
            text,
            beta,
            t,
            diffusion_steps,
            embedding_scale,
            ros,
            language_checkbox,
            api_name="/PromptedSynth_Text"
        )
        print("--- Client: PromptedSynth_Text call successful ---")
        return result
    except Exception as e:
        print(f"--- Client: Error calling PromptedSynth_Text: {e} ---")
        # Return one sample of silence so the Audio component still receives valid output.
        return (44100, np.zeros(1))


# Repo -> [Hugging Face - 🤗](https://huggingface.co/Respair/xxx) later
INTROTXT = """Update v0.01: Darya (RU) now supports style diffusion as well."""

with gr.Blocks() as audio_inf:
    with gr.Row():
        with gr.Column(scale=1):
            language_checkbox_audio = gr.Checkbox(label="English?", value=False,
                                                  info="Tick for English synthesis, leave unchecked for Russian.")
            inp = gr.Textbox(label="Text",
                             info="Enter the text for voice-guided synthesis.",
                             value=ru_random_texts_list[0],
                             interactive=True, scale=5)
            voice = gr.Dropdown(choices=voicelist,
                                label="Default Reference Voice (make sure it matches the language)",
                                info="Select a pre-defined reference voice.",
                                # Guard against placeholder lists shorter than 8 entries.
                                value=voicelist[7] if len(voicelist) > 7 else (voicelist[0] if voicelist else None),
                                interactive=True)
            voice_2 = gr.Audio(label="Upload Your Audio Reference (Overrides Default Voice & Speaker ID)",
                               sources=["upload", "microphone"],
                               interactive=True, type='filepath',
                               waveform_options={'waveform_color': '#a3ffc3', 'waveform_progress_color': '#e972ab'})
            with gr.Accordion("Advanced Parameters", open=False):
                spk_id = gr.Number(label="Speaker ID (randomly picking a sample based on the ID - may result in subpar / broken audio)",
                                   info="Input speaker ID (max 196 Ru / 2006 En) to use a random sample from that speaker on the server. 9999 disables.",
                                   value=9999, interactive=True)
                random_spk_btn = gr.Button("Random")
                beta = gr.Slider(minimum=0, maximum=1, value=0.7, step=0.1,
                                 label="Beta (Diffusion Strength vs. Reference)",
                                 info="Diffusion parameter. Higher means LESS like the reference audio. 0 disables diffusion.",
                                 interactive=True)
                multispeakersteps = gr.Slider(minimum=3, maximum=15, value=3, step=1,
                                              label="Diffusion Steps",
                                              info="More steps can improve quality but increase inference time.",
                                              interactive=True)
                embscale = gr.Slider(minimum=1, maximum=5, value=1, step=0.1,
                                     label="Embedding Scale (Intensity)",
                                     info="Impacts expressiveness. High values (> 1.5) might cause artifacts.",
                                     interactive=True)
                rate_of_speech = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1,
                                           label="Rate of Speech",
                                           info="Adjusts speech speed. 1.0 is normal.",
                                           interactive=True)
                t = gr.Slider(minimum=0.1, maximum=2, value=0.7, step=0.05,
                              label="T (Duration / Temperature)",
                              info="Influence of the previous sentence on the current one.",
                              interactive=True)
        with gr.Column(scale=1):
            btn = gr.Button("Synthesize (Voice Guided)", variant="primary")
            audio = gr.Audio(interactive=False,
                             label="Synthesized Audio",
                             waveform_options={'waveform_color': '#a3ffc3', 'waveform_progress_color': '#e972ab'})

    def update_audio_inf_defaults(is_english):
        """Swap the default text and the Speaker ID hint when the language toggles."""
        new_text_value = en_random_texts_list[0] if is_english else ru_random_texts_list[0]
        new_spk_info = ("Input speaker ID (max 2006 En) or use Randomize. 9999 disables."
                        if is_english else
                        "Input speaker ID (max 196 Ru) or use Randomize. 9999 disables.")
        new_spk_val = 9999
        return gr.update(value=new_text_value), gr.update(info=new_spk_info, value=new_spk_val)

    language_checkbox_audio.change(update_audio_inf_defaults,
                                   inputs=[language_checkbox_audio],
                                   outputs=[inp, spk_id])
    random_spk_btn.click(fn=generate_random_spk, inputs=[language_checkbox_audio], outputs=spk_id)
    btn.click(Client_Synthesize_Audio,
              inputs=[inp, voice, voice_2, spk_id, multispeakersteps, embscale, beta,
                      rate_of_speech, t, language_checkbox_audio],
              outputs=[audio],
              concurrency_limit=4)

with gr.Blocks() as longform:
    with gr.Row():
        with gr.Column(scale=1):
            language_checkbox_longform = gr.Checkbox(label="English?", value=False,
                                                     info="Tick for English synthesis, leave unchecked for Russian.")
            inp_longform = gr.Textbox(label="Text",
                                      info="Enter text; check the format from the examples.",
                                      value=ru_prompt_texts_list[0],
                                      lines=5, interactive=True, scale=5)
            with gr.Row():
                example_dropdown = gr.Dropdown(choices=list(ru_prompt_examples.keys()),
                                               label="Example Prompts",
                                               info="Select an example to load into the text box.",
                                               value=list(ru_prompt_examples.keys())[0] if ru_prompt_examples else None,
                                               interactive=True)
            with gr.Accordion("Advanced Parameters", open=False):
                beta_longform = gr.Slider(minimum=0, maximum=1, value=0.4, step=0.1,
                                          label="Beta (Diffusion Strength vs. Semantic Encoder)",
                                          info="Diffusion parameter. Higher means LESS like the style inferred from the text. 0 disables diffusion.",
                                          interactive=True)
                diffusion_steps_longform = gr.Slider(minimum=3, maximum=50, value=3, step=1,
                                                     label="Diffusion Steps",
                                                     info="More steps can improve diversity but increase inference time; more isn't necessarily better.",
                                                     interactive=True)
                embedding_scale_longform = gr.Slider(minimum=1, maximum=10, value=1, step=0.1,
                                                     label="Embedding Scale (Intensity)",
                                                     info="Impacts expressiveness.",
                                                     interactive=True)
                rate_of_speech_longform = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1,
                                                    label="Rate of Speech",
                                                    info="Adjusts speech speed. 1.0 is normal; it may not respond to tiny adjustments.",
                                                    interactive=True)
                t_longform = gr.Slider(minimum=0.1, maximum=2, value=0.8, step=0.1,
                                       label="T (Style Consistency - Primarily English)",
                                       info="Controls the influence of previous sentences' style on the current one.",
                                       interactive=True)
        with gr.Column(scale=1):
            btn_longform = gr.Button("Synthesize (Text Guided)", variant="primary")
            audio_longform = gr.Audio(interactive=False,
                                      label="Synthesized Audio",
                                      waveform_options={'waveform_color': '#a3ffc3', 'waveform_progress_color': '#e972ab'})

    def update_longform_defaults(is_english):
        """Swap the example dropdown and text box contents when the language toggles."""
        examples_dict = en_prompt_examples if is_english else ru_prompt_examples
        new_choices = list(examples_dict.keys())
        new_value = new_choices[0] if new_choices else None
        new_text_value = examples_dict.get(
            new_value,
            list(examples_dict.values())[0] if examples_dict
            else ("Speaker: Example text." if is_english else "Диктор: Пример текста.")
        )
        return gr.update(choices=new_choices, value=new_value), gr.update(value=new_text_value)

    language_checkbox_longform.change(update_longform_defaults,
                                      inputs=[language_checkbox_longform],
                                      outputs=[example_dropdown, inp_longform])
    example_dropdown.change(fn=update_text_input_longform,
                            inputs=[example_dropdown, language_checkbox_longform],
                            outputs=[inp_longform])
    btn_longform.click(Client_PromptedSynth_Text,
                       inputs=[inp_longform, beta_longform, t_longform, diffusion_steps_longform,
                               embedding_scale_longform, rate_of_speech_longform, language_checkbox_longform],
                       outputs=[audio_longform],
                       concurrency_limit=4)

user_guide_html = f"""

Quick Notes:

This demo runs on a single RTX 3090.

These networks can only generate natural speech with correct intonation (i.e. NSFW content, non-speech sounds, stutters, etc. don't work).

Make sure your inputs are not too short (more than a sentence long).

I will gradually post updates here and on GitHub.

Everything in this demo & the repo (coming soon) is experimental. The main idea is just playing around with different things to see what works when you're limited to training on a pair of RTX 3090s.

The data used for the English model is rough and pretty tough for any TTS model (think debates, real conversations, plus a small amount of cleaner professional performances). It mostly comes from public sources or third parties (no TOS signed). I'll probably write a blog post later with more details.

So far I have focused on English and Russian; more languages can be covered later.


Voice-Guided Tab (Using Audio Reference)

Options:

Default Reference Voice: pick one of the bundled reference samples; make sure it matches the language.
Upload Your Audio Reference: your own recording overrides both the default voice and the Speaker ID.
Speaker ID: uses a random sample from that speaker on the server (max 196 Ru / 2006 En); 9999 disables it.
Beta: diffusion strength; higher means LESS like the reference audio, and 0 disables diffusion.
Diffusion Steps: more steps can improve quality but increase inference time.
Embedding Scale: intensity / expressiveness; high values (> 1.5) might cause artifacts.
Rate of Speech: speech speed; 1.0 is normal.
T: influence of the previous sentence on the current one.

Some notes:

Picking a sample by Speaker ID is random on the server side and may result in subpar or broken audio.


Text-Guided Tab (the style is conditioned on the content of the text itself)

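The text box expects the prompt format used by the bundled examples: each line starts with a speaker tag followed by the text. An illustrative line (the wording is made up; the format mirrors the defaults):

Speaker: Hello there, this is a test sentence.

(Russian prompts use "Диктор:" instead of "Speaker:".)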

General Tips

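The two endpoints this demo calls can also be driven programmatically with gradio_client. This hosted Space keeps its own API closed (api_open=False), so the sketch below assumes you point the client at your own deployment of the backend; the Space id is a placeholder, and the argument order matches this demo's own calls:

    from gradio_client import Client

    client = Client("owner/backend-space")  # placeholder Space id
    result = client.predict(
        "Speaker: Hello there.",  # text
        0.4,    # beta
        0.8,    # t
        10,     # diffusion steps
        1.2,    # embedding scale
        1.0,    # rate of speech
        True,   # English mode
        api_name="/PromptedSynth_Text"
    )
    # result is whatever the backend's Audio output yields (typically a file path)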
""" with gr.Blocks() as info_tab: gr.HTML(user_guide_html) # Use HTML component # --- Model Details Tab (Reformatted User Text) --- # Convert Markdown-like text to basic HTML for styling model_details_html = """

Model Details (The Guts)


Darya (Russian Model) - More Stable

Generally more controlled than the English model, which is also why it should sound much better in terms of acoustic quality.


Kalliope (English Model) - Wild

More expressive potential, but also less predictable; it showed signs of overfitting on the noisy data.


More details might show up in a blog post later.

""" with gr.Blocks() as model_details_tab: gr.HTML(model_details_html) # theme = gr.themes.Base( # font=[gr.themes.GoogleFont('Libre Franklin'), gr.themes.GoogleFont('Public Sans'), 'system-ui', 'sans-serif'], # ) # app = gr.TabbedInterface( # [longform, audio_inf, info_tab, model_details_tab], # ['Text-guided Synthesis', 'Voice-guided Synthesis', 'Intuition & Tips', 'Model Details'], # title="The Poor Man's TTS (Experimental)", # theme="Respair/Shiki@2.1.4" # ) # if __name__ == "__main__": # print("Launching Client Gradio App...") # app.queue(api_open=False, max_size=15).launch(show_api=False, share=True) with gr.Blocks(title="The Poor Man's TTS (Experimental 🔧)", theme="Respair/Shiki@2.1.4") as demo: # gr.DuplicateButton("Duplicate Space") # gr.Markdown(INTROTXT) gr.TabbedInterface( [audio_inf, longform, info_tab, model_details_tab], ['Reference-guided Synthesis','Text-guided Synthesis', 'Intuition & Tips', 'Model Details'], title="The Poor Man's TTS (Experimental)", theme="Respair/Shiki@2.1.4" ) if __name__ == "__main__": demo.queue(api_open=False, max_size=15).launch(show_api=False, share=False)