import gradio as gr
import random
import os
import numpy as np
from gradio_client import Client, file

# The upstream inference Space is supplied through the `src` environment variable.
client = Client(os.environ['src'])
BASE_PATH = "Inference"
RU_RANDOM_TEXTS_PATH = os.path.join(BASE_PATH, "random_texts.txt")
EN_RANDOM_TEXTS_PATH = os.path.join(BASE_PATH, "english_random_texts.txt")
RU_PROMPT_TEXTS_PATH = os.path.join(BASE_PATH, "prompt.txt")
EN_PROMPT_TEXTS_PATH = os.path.join(BASE_PATH, "english_prompt.txt")
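
# Text-loading helper: tries UTF-8 first, falls back to cp1251 (common for Russian),
# and returns language-appropriate placeholder lines when a file is missing.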
def load_texts(filepath):
    dirname = os.path.dirname(filepath)
    if dirname and not os.path.exists(dirname):
        print(f"Warning: Directory '{dirname}' not found.")
        if "random" in filepath:
            return ["Default example text."]
        return ["Speaker: Default prompt text."]
try:
try:
with open(filepath, 'r', encoding='utf-8') as f:
return [line.strip() for line in f if line.strip()]
except UnicodeDecodeError:
print(f"Warning: UTF-8 decode failed for {filepath}. Trying 'cp1251' (common for Russian)...")
with open(filepath, 'r', encoding='cp1251') as f:
return [line.strip() for line in f if line.strip()]
except FileNotFoundError:
print(f"Warning: File not found - {filepath}")
if "english" in filepath and "random" in filepath:
return ["Example English text file not found."]
elif "random" in filepath:
return ["Пример русского текстового файла не найден."]
elif "english" in filepath and "prompt" in filepath:
return ["Speaker: Example English prompt file not found."]
elif "prompt" in filepath:
return ["Диктор: Пример русского файла подсказок не найден."]
else:
return ["Example text file not found."]
except Exception as e:
print(f"Error loading {filepath}: {e}")
return ["Error loading example texts."]
ru_random_texts_list = load_texts(RU_RANDOM_TEXTS_PATH)
en_random_texts_list = load_texts(EN_RANDOM_TEXTS_PATH)
ru_prompt_texts_list = load_texts(RU_PROMPT_TEXTS_PATH)
en_prompt_texts_list = load_texts(EN_PROMPT_TEXTS_PATH)
def create_example_dict(text_list):
    if not text_list or not isinstance(text_list[0], str):
        return {"No examples found": ""}
    # Truncate long texts for dropdown labels; add an ellipsis only when the text was actually cut.
    return {(text[:30] + "..." if len(text) > 30 else text): text for text in text_list}
ru_prompt_examples = create_example_dict(ru_prompt_texts_list)
en_prompt_examples = create_example_dict(en_prompt_texts_list)
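
# Reference voice discovery: scan VOICE_DIR for audio files to offer in the
# dropdown, with placeholder names if the directory is missing or empty.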
VOICE_DIR = "./reference_sample_wavs"
try:
if os.path.exists(VOICE_DIR) and os.path.isdir(VOICE_DIR):
        voicelist = sorted(
            v for v in os.listdir(VOICE_DIR)
            if os.path.isfile(os.path.join(VOICE_DIR, v))
            and v.lower().endswith(('.wav', '.mp3', '.flac'))
        )
if not voicelist:
print(f"Warning: No compatible audio files found in {VOICE_DIR}. Dropdown will be empty.")
voicelist = ["default.wav"]
else:
print(f"Warning: Voice directory not found or is not a directory: {VOICE_DIR}. Using placeholder list.")
voicelist = ["anna_studio.wav", "boris_clear.wav", "female_neutral.wav", "male_deep.wav"]
except Exception as e:
print(f"Error listing voices in {VOICE_DIR}: {e}")
voicelist = ["error_loading_voices"]
def update_text_input_longform(preview_key, is_english):
examples_dict = en_prompt_examples if is_english else ru_prompt_examples
if preview_key in examples_dict:
return examples_dict[preview_key]
elif examples_dict:
return list(examples_dict.values())[0]
else:
return "Selected example not found or examples failed to load."
def generate_random_spk(is_english):
if is_english:
rand_id = random.randint(0, 2006)
print(f"Generated random English Speaker ID: {rand_id}")
return rand_id
else:
rand_id = random.randint(0, 196)
print(f"Generated random Russian Speaker ID: {rand_id}")
return rand_id
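
# Thin client-side wrappers around the remote Space's named endpoints.
# On failure they log the error and return one sample of silence at 44.1 kHz
# so the Gradio Audio component still receives a valid (sample_rate, array) tuple.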
def Client_Synthesize_Audio(text, voice, voice2_path, spk_id, vcsteps, embscale, beta, ros, t, language_checkbox):
print("--- Client: Calling Synthesize_Audio ---")
print(f"Text: {text[:50]}...")
print(f"Default Voice: {voice}")
print(f"Uploaded Voice Path: {voice2_path}")
print(f"Speaker ID: {spk_id}")
print(f"Steps: {vcsteps}, Scale: {embscale}, Beta: {beta}, RoS: {ros}, T: {t}")
print(f"English Mode: {language_checkbox}")
    # Wrap the uploaded path with gradio_client's file() so it is uploaded to the remote Space;
    # pass None when no reference audio was provided.
    voice2_arg = file(voice2_path) if voice2_path else None
try:
result = client.predict(
text,
voice,
voice2_arg,
spk_id,
vcsteps,
embscale,
beta,
ros,
t,
language_checkbox,
api_name="/Synthesize_Audio"
)
print("--- Client: Synthesize_Audio call successful ---")
return result
    except Exception as e:
        print(f"--- Client: Error calling Synthesize_Audio: {e} ---")
        # Fall back to one sample of silence at 44.1 kHz so the Audio component still gets valid output.
        return (44100, np.zeros(1))
def Client_PromptedSynth_Text(text, beta, t, diffusion_steps, embedding_scale, ros, language_checkbox):
print("--- Client: Calling PromptedSynth_Text ---")
print(f"Text: {text[:50]}...")
print(f"Beta: {beta}, T: {t}, Steps: {diffusion_steps}, Scale: {embedding_scale}, RoS: {ros}")
print(f"English Mode: {language_checkbox}")
try:
result = client.predict(
text,
beta,
t,
diffusion_steps,
embedding_scale,
ros,
language_checkbox,
api_name="/PromptedSynth_Text"
)
print("--- Client: PromptedSynth_Text call successful ---")
return result
    except Exception as e:
        print(f"--- Client: Error calling PromptedSynth_Text: {e} ---")
        # Fall back to one sample of silence at 44.1 kHz so the Audio component still gets valid output.
        return (44100, np.zeros(1))
# Repo -> [Hugging Face - 🤗](https://huggingface.co/Respair/xxx) later
INTROTXT = """#
Demo for The Poor Man's TTS; this runs on a single RTX 3090.
These networks can only generate natural speech with correct intonation (i.e. generating NSFW, non-speech sounds, stutters, etc. doesn't work).
Repo -> [Github](https://github.com/Respaired/Project_Kalliope)
"""
with gr.Blocks() as audio_inf:
with gr.Row():
with gr.Column(scale=1):
language_checkbox_audio = gr.Checkbox(label="English?", value=False,
info="Tick for English synthesis, leave unchecked for Russian.")
inp = gr.Textbox(label="Text",
info="Enter the text for voice-guided synthesis.",
value=ru_random_texts_list[0],
interactive=True,
scale=5)
voice = gr.Dropdown(choices=voicelist,
label="Default Reference Voice (make sure it matches the language)",
info="Select a pre-defined reference voice.",
value=voicelist[0] if voicelist else None,
interactive=True)
voice_2 = gr.Audio(label="Upload Your Audio Reference (Overrides Default Voice & Speaker ID)",
sources=["upload", "microphone"],
interactive=True,
type='filepath',
waveform_options={'waveform_color': '#a3ffc3', 'waveform_progress_color': '#e972ab'})
            spk_id = gr.Number(label="Speaker ID (picks a random sample from that speaker - may result in subpar / broken audio)",
                               info="Enter a speaker ID (max 196 Ru / 2006 En) to use a random sample from that speaker on the server; 9999 disables this.",
value=9999,
interactive=True)
random_spk_btn = gr.Button("Random")
with gr.Accordion("Advanced Parameters", open=False):
beta = gr.Slider(minimum=0, maximum=1, value=0.7, step=0.1,
label="Beta (Diffusion Strength vs. Reference) - Kalliope Only",
info="Diffusion parameter. Higher means LESS like the reference audio. 0 disables diffusion.",
interactive=True)
multispeakersteps = gr.Slider(minimum=3, maximum=15, value=5, step=1,
label="Diffusion Steps - Kalliope Only",
info="More steps can improve quality but increase inference time.",
interactive=True)
embscale = gr.Slider(minimum=1, maximum=5, value=1, step=0.1,
label="Embedding Scale (Intensity) - Kalliope Only",
info="Impacts expressiveness. High values (> 1.5) might cause artifacts.",
interactive=True)
rate_of_speech = gr.Slider(minimum=0.5, maximum=2,
value=1,
step=0.1,
label="Rate of Speech",
info="Adjusts speech speed. 1.0 is normal.",
interactive=True)
t = gr.Slider(minimum=0.1, maximum=2, value=1.0, step=0.1,
label="T (Duration / Temperature)",
info="Controls duration scaling and randomness (T primarily affects English).",
interactive=True)
with gr.Column(scale=1):
btn = gr.Button("Synthesize (Voice Guided)", variant="primary")
audio = gr.Audio(interactive=False,
label="Synthesized Audio",
waveform_options={'waveform_color': '#a3ffc3', 'waveform_progress_color': '#e972ab'})
def update_audio_inf_defaults(is_english):
new_text_value = en_random_texts_list[0] if is_english else ru_random_texts_list[0]
        new_spk_info = "Input a speaker ID (max 2006 En) or use the Random button; 9999 disables this." if is_english else "Input a speaker ID (max 196 Ru) or use the Random button; 9999 disables this."
new_spk_val = 9999
return gr.update(value=new_text_value), gr.update(info=new_spk_info, value=new_spk_val)
language_checkbox_audio.change(update_audio_inf_defaults,
inputs=[language_checkbox_audio],
outputs=[inp, spk_id])
random_spk_btn.click(fn=generate_random_spk, inputs=[language_checkbox_audio], outputs=spk_id)
btn.click(Client_Synthesize_Audio,
inputs=[inp, voice, voice_2, spk_id, multispeakersteps, embscale, beta, rate_of_speech, t, language_checkbox_audio],
outputs=[audio],
concurrency_limit=4)
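
# Text-guided (longform) synthesis tab: the style is inferred from the text prompt itself instead of a reference voice.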
with gr.Blocks() as longform:
with gr.Row():
with gr.Column(scale=1):
language_checkbox_longform = gr.Checkbox(label="English?", value=False,
info="Tick for English synthesis, leave unchecked for Russian.")
inp_longform = gr.Textbox(label="Text",
info="Enter text; check the format from the examples.",
value=ru_prompt_texts_list[0],
lines=5,
interactive=True,
scale=5)
with gr.Row():
example_dropdown = gr.Dropdown(choices=list(ru_prompt_examples.keys()),
label="Example Prompts",
info="Select an example to load into the text box.",
value=list(ru_prompt_examples.keys())[0] if ru_prompt_examples else None,
interactive=True)
with gr.Accordion("Advanced Parameters", open=False):
beta_longform = gr.Slider(minimum=0, maximum=1, value=0.4, step=0.1,
label="Beta (Diffusion Strength vs. Semantic Encoder) - Kalliope Only",
info="Diffusion parameter. Higher means LESS like the inferred style from text. 0 disables diffusion.",
interactive=True)
diffusion_steps_longform = gr.Slider(minimum=3, maximum=15, value=5, step=1,
label="Diffusion Steps - Kalliope Only",
info="More steps can improve quality but increase inference time.",
interactive=True)
embedding_scale_longform = gr.Slider(minimum=1, maximum=5, value=1, step=0.1,
label="Embedding Scale (Intensity) - Kalliope Only",
info="Impacts expressiveness. High values (> 1.5) might cause artifacts.",
interactive=True)
rate_of_speech_longform = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1,
label="Rate of Speech",
info="Adjusts speech speed. 1.0 is normal.",
interactive=True)
t_longform = gr.Slider(minimum=0.1, maximum=2, value=0.8, step=0.1,
label="T (Style Consistency - Primarily English)",
info="Controls the influence of previous sentences' style on the current one.",
interactive=True)
with gr.Column(scale=1):
btn_longform = gr.Button("Synthesize (Text Guided)", variant="primary")
audio_longform = gr.Audio(interactive=False,
label="Synthesized Audio",
waveform_options={'waveform_color': '#a3ffc3', 'waveform_progress_color': '#e972ab'})
    def update_longform_defaults(is_english):
        examples_dict = en_prompt_examples if is_english else ru_prompt_examples
        new_choices = list(examples_dict.keys())
        new_value = new_choices[0] if new_choices else None
        if new_value is not None:
            new_text_value = examples_dict[new_value]
        else:
            new_text_value = "Speaker: Example text." if is_english else "Диктор: Пример текста."
        return gr.update(choices=new_choices, value=new_value), gr.update(value=new_text_value)
language_checkbox_longform.change(update_longform_defaults,
inputs=[language_checkbox_longform],
outputs=[example_dropdown, inp_longform])
example_dropdown.change(fn=update_text_input_longform,
inputs=[example_dropdown, language_checkbox_longform],
outputs=[inp_longform])
btn_longform.click(Client_PromptedSynth_Text,
inputs=[inp_longform,
beta_longform,
t_longform,
diffusion_steps_longform,
embedding_scale_longform,
rate_of_speech_longform,
language_checkbox_longform],
outputs=[audio_longform],
concurrency_limit=4)
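
# A sketch of how both tabs could be served together (assumed; the actual launch code
# is expected to live further down in this file and may differ):
# demo = gr.TabbedInterface([audio_inf, longform], ["Voice-Guided", "Text-Guided"], title=INTROTXT)
# demo.queue(max_size=20).launch()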
user_guide_html = """
This is run on a single RTX 3090.
These networks can only generate natural speech with correct intonation (i.e. generating NSFW, non-speech sounds, stutters, etc. doesn't work).
Make sure your inputs are not too short (more than a sentence long).
I will gradually update here and on -> [Github](https://github.com/Respaired/Project_Kalliope)
Everything in this demo & the repo (coming soon) is experimental. The main idea is just playing around with different things to see what works when you're limited to training on a pair of RTX 3090s.
The data used for the English model is rough and pretty tough for any TTS model (think debates, real conversations, plus a little bit of cleaner professional performances). It mostly comes from public sources or third parties (no TOS signed). I'll probably write a blog post later with more details.
So far I have focused on English and Russian; more languages can be covered.

Russian model: generally more controlled than the English one. That's also why, in terms of acoustic quality, it should sound much better. Uses RuModernBERT-base (for text guidance).
English model: more expressive potential, but also less predictable; showed signs of overfitting on the noisy data. Uses ConvNextV2 and DeBERTa V3 Base (for text guidance).

More details might show up in a blog post later.