|
import torch |
|
import torchaudio |
|
import gradio as gr |
|
from os import getenv |
|
|
|
from zonos.model import Zonos, DEFAULT_BACKBONE_CLS as ZonosBackbone |
|
from zonos.conditioning import make_cond_dict, supported_language_codes |
|
from zonos.utils import DEFAULT_DEVICE as device |
|
|
|
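# Module-level caches so the loaded model and speaker embedding persist across Gradio calls.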
CURRENT_MODEL_TYPE = None |
|
CURRENT_MODEL = None |
|
|
|
SPEAKER_EMBEDDING = None |
|
SPEAKER_AUDIO_PATH = None |
|
|
|
|
|
def load_model_if_needed(model_choice: str): |
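    """Load the requested model, reusing the cached instance when the choice is unchanged."""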
|
global CURRENT_MODEL_TYPE, CURRENT_MODEL |
|
if CURRENT_MODEL_TYPE != model_choice: |
|
if CURRENT_MODEL is not None: |
|
del CURRENT_MODEL |
|
torch.cuda.empty_cache() |
|
print(f"Loading {model_choice} model...") |
|
CURRENT_MODEL = Zonos.from_pretrained(model_choice, device=device) |
|
CURRENT_MODEL.requires_grad_(False).eval() |
|
CURRENT_MODEL_TYPE = model_choice |
|
print(f"{model_choice} model loaded successfully!") |
|
return CURRENT_MODEL |
|
|
|
|
|
def update_ui(model_choice): |
|
""" |
|
Dynamically show/hide UI elements based on the model's conditioners. |
|
We do NOT display 'language_id' or 'ctc_loss' even if they exist in the model. |
|
""" |
|
model = load_model_if_needed(model_choice) |
|
cond_names = [c.name for c in model.prefix_conditioner.conditioners] |
|
print("Conditioners in this model:", cond_names) |
|
|
|
text_update = gr.update(visible=("espeak" in cond_names)) |
|
language_update = gr.update(visible=("espeak" in cond_names)) |
|
speaker_audio_update = gr.update(visible=("speaker" in cond_names)) |
|
prefix_audio_update = gr.update(visible=True) |
|
emotion1_update = gr.update(visible=("emotion" in cond_names)) |
|
emotion2_update = gr.update(visible=("emotion" in cond_names)) |
|
emotion3_update = gr.update(visible=("emotion" in cond_names)) |
|
emotion4_update = gr.update(visible=("emotion" in cond_names)) |
|
emotion5_update = gr.update(visible=("emotion" in cond_names)) |
|
emotion6_update = gr.update(visible=("emotion" in cond_names)) |
|
emotion7_update = gr.update(visible=("emotion" in cond_names)) |
|
emotion8_update = gr.update(visible=("emotion" in cond_names)) |
|
vq_single_slider_update = gr.update(visible=("vqscore_8" in cond_names)) |
|
fmax_slider_update = gr.update(visible=("fmax" in cond_names)) |
|
pitch_std_slider_update = gr.update(visible=("pitch_std" in cond_names)) |
|
speaking_rate_slider_update = gr.update(visible=("speaking_rate" in cond_names)) |
|
dnsmos_slider_update = gr.update(visible=("dnsmos_ovrl" in cond_names)) |
|
speaker_noised_checkbox_update = gr.update(visible=("speaker_noised" in cond_names)) |
|
unconditional_keys_update = gr.update( |
|
choices=[name for name in cond_names if name not in ("espeak", "language_id")] |
|
) |
|
|
|
return ( |
|
text_update, |
|
language_update, |
|
speaker_audio_update, |
|
prefix_audio_update, |
|
emotion1_update, |
|
emotion2_update, |
|
emotion3_update, |
|
emotion4_update, |
|
emotion5_update, |
|
emotion6_update, |
|
emotion7_update, |
|
emotion8_update, |
|
vq_single_slider_update, |
|
fmax_slider_update, |
|
pitch_std_slider_update, |
|
speaking_rate_slider_update, |
|
dnsmos_slider_update, |
|
speaker_noised_checkbox_update, |
|
unconditional_keys_update, |
|
) |
|
|
|
|
|
def generate_audio( |
|
model_choice, |
|
text, |
|
language, |
|
speaker_audio, |
|
prefix_audio, |
|
e1, |
|
e2, |
|
e3, |
|
e4, |
|
e5, |
|
e6, |
|
e7, |
|
e8, |
|
vq_single, |
|
fmax, |
|
pitch_std, |
|
speaking_rate, |
|
dnsmos_ovrl, |
|
speaker_noised, |
|
cfg_scale, |
|
top_p, |
|
top_k, |
|
min_p, |
|
linear, |
|
confidence, |
|
quadratic, |
|
seed, |
|
randomize_seed, |
|
unconditional_keys, |
|
progress=gr.Progress(), |
|
): |
|
""" |
|
Generates audio based on the provided UI parameters. |
|
We do NOT use language_id or ctc_loss even if the model has them. |
|
""" |
|
selected_model = load_model_if_needed(model_choice) |
|
|
|
speaker_noised_bool = bool(speaker_noised) |
|
fmax = float(fmax) |
|
pitch_std = float(pitch_std) |
|
speaking_rate = float(speaking_rate) |
|
dnsmos_ovrl = float(dnsmos_ovrl) |
|
cfg_scale = float(cfg_scale) |
|
top_p = float(top_p) |
|
top_k = int(top_k) |
|
min_p = float(min_p) |
|
linear = float(linear) |
|
confidence = float(confidence) |
|
quadratic = float(quadratic) |
|
seed = int(seed) |
|
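    # Cap output length: roughly 86 codec frames per second, for up to ~30 seconds of audio.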
max_new_tokens = 86 * 30 |
|
|
|
|
|
global SPEAKER_AUDIO_PATH, SPEAKER_EMBEDDING |
|
|
|
if randomize_seed: |
|
seed = torch.randint(0, 2**32 - 1, (1,)).item() |
|
torch.manual_seed(seed) |
|
|
|
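    # Only recompute the speaker embedding when the reference audio has changed since the last call.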
if speaker_audio is not None and "speaker" not in unconditional_keys: |
|
if speaker_audio != SPEAKER_AUDIO_PATH: |
|
print("Recomputed speaker embedding") |
|
wav, sr = torchaudio.load(speaker_audio) |
|
SPEAKER_EMBEDDING = selected_model.make_speaker_embedding(wav, sr) |
|
SPEAKER_EMBEDDING = SPEAKER_EMBEDDING.to(device, dtype=torch.bfloat16) |
|
SPEAKER_AUDIO_PATH = speaker_audio |
|
|
|
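    # Encode the optional prefix audio into codec tokens so generation continues from it.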
audio_prefix_codes = None |
|
if prefix_audio is not None: |
|
wav_prefix, sr_prefix = torchaudio.load(prefix_audio) |
|
wav_prefix = wav_prefix.mean(0, keepdim=True) |
|
wav_prefix = selected_model.autoencoder.preprocess(wav_prefix, sr_prefix) |
|
wav_prefix = wav_prefix.to(device, dtype=torch.float32) |
|
audio_prefix_codes = selected_model.autoencoder.encode(wav_prefix.unsqueeze(0)) |
|
|
|
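    # Pack the eight emotion sliders into a single conditioning vector.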
emotion_tensor = torch.tensor(list(map(float, [e1, e2, e3, e4, e5, e6, e7, e8])), device=device) |
|
|
|
vq_val = float(vq_single) |
|
vq_tensor = torch.tensor([vq_val] * 8, device=device).unsqueeze(0) |
|
|
|
cond_dict = make_cond_dict( |
|
text=text, |
|
language=language, |
|
speaker=SPEAKER_EMBEDDING, |
|
emotion=emotion_tensor, |
|
vqscore_8=vq_tensor, |
|
fmax=fmax, |
|
pitch_std=pitch_std, |
|
speaking_rate=speaking_rate, |
|
dnsmos_ovrl=dnsmos_ovrl, |
|
speaker_noised=speaker_noised_bool, |
|
device=device, |
|
unconditional_keys=unconditional_keys, |
|
) |
|
conditioning = selected_model.prepare_conditioning(cond_dict) |
|
|
|
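    # Rough progress estimate: assume ~400 characters of text per ~30 seconds of speech, at ~86 steps per second.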
estimated_generation_duration = 30 * len(text) / 400 |
|
estimated_total_steps = int(estimated_generation_duration * 86) |
|
|
|
def update_progress(_frame: torch.Tensor, step: int, _total_steps: int) -> bool: |
|
progress((step, estimated_total_steps)) |
|
return True |
|
|
|
codes = selected_model.generate( |
|
prefix_conditioning=conditioning, |
|
audio_prefix_codes=audio_prefix_codes, |
|
max_new_tokens=max_new_tokens, |
|
cfg_scale=cfg_scale, |
|
batch_size=1, |
|
sampling_params=dict(top_p=top_p, top_k=top_k, min_p=min_p, linear=linear, conf=confidence, quad=quadratic), |
|
callback=update_progress, |
|
) |
|
|
|
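    # Decode the generated codes back to a waveform and keep only the first channel if several are returned.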
wav_out = selected_model.autoencoder.decode(codes).cpu().detach() |
|
sr_out = selected_model.autoencoder.sampling_rate |
|
if wav_out.dim() == 2 and wav_out.size(0) > 1: |
|
wav_out = wav_out[0:1, :] |
|
return (sr_out, wav_out.squeeze().numpy()), seed |
|
|
|
|
|
def build_interface(): |
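    """Build the Gradio Blocks interface and wire up its callbacks."""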
|
supported_models = [] |
|
if "transformer" in ZonosBackbone.supported_architectures: |
|
supported_models.append("Zyphra/Zonos-v0.1-transformer") |
|
|
|
if "hybrid" in ZonosBackbone.supported_architectures: |
|
supported_models.append("Zyphra/Zonos-v0.1-hybrid") |
|
else: |
|
        print(
            "| The current ZonosBackbone does not support the hybrid architecture, so only the transformer model will be available in the model selector.\n"
            "| This probably means the mamba-ssm library has not been installed."
        )
|
|
|
with gr.Blocks() as demo: |
|
with gr.Row(): |
|
with gr.Column(): |
|
model_choice = gr.Dropdown( |
|
choices=supported_models, |
|
value=supported_models[0], |
|
label="Zonos Model Type", |
|
info="Select the model variant to use.", |
|
) |
|
text = gr.Textbox( |
|
label="Text to Synthesize", |
|
value="Zonos uses eSpeak for text to phoneme conversion!", |
|
lines=4, |
|
max_length=500, |
|
) |
|
language = gr.Dropdown( |
|
choices=supported_language_codes, |
|
value="en-us", |
|
label="Language Code", |
|
info="Select a language code.", |
|
) |
|
prefix_audio = gr.Audio( |
|
value="assets/silence_100ms.wav", |
|
label="Optional Prefix Audio (continue from this audio)", |
|
type="filepath", |
|
) |
|
with gr.Column(): |
|
speaker_audio = gr.Audio( |
|
label="Optional Speaker Audio (for cloning)", |
|
type="filepath", |
|
) |
|
speaker_noised_checkbox = gr.Checkbox(label="Denoise Speaker?", value=False) |
|
|
|
with gr.Row(): |
|
with gr.Column(): |
|
gr.Markdown("## Conditioning Parameters") |
|
dnsmos_slider = gr.Slider(1.0, 5.0, value=4.0, step=0.1, label="DNSMOS Overall") |
|
fmax_slider = gr.Slider(0, 24000, value=24000, step=1, label="Fmax (Hz)") |
|
vq_single_slider = gr.Slider(0.5, 0.8, 0.78, 0.01, label="VQ Score") |
|
pitch_std_slider = gr.Slider(0.0, 300.0, value=45.0, step=1, label="Pitch Std") |
|
speaking_rate_slider = gr.Slider(5.0, 30.0, value=15.0, step=0.5, label="Speaking Rate") |
|
|
|
with gr.Column(): |
|
gr.Markdown("## Generation Parameters") |
|
cfg_scale_slider = gr.Slider(1.0, 5.0, 2.0, 0.1, label="CFG Scale") |
|
seed_number = gr.Number(label="Seed", value=420, precision=0) |
|
randomize_seed_toggle = gr.Checkbox(label="Randomize Seed (before generation)", value=True) |
|
|
|
with gr.Accordion("Sampling", open=False): |
|
with gr.Row(): |
|
with gr.Column(): |
|
                    gr.Markdown("### NovelAI's unified sampler")
|
linear_slider = gr.Slider(-2.0, 2.0, 0.5, 0.01, label="Linear (set to 0 to disable unified sampling)", info="High values make the output less random.") |
|
|
|
                    confidence_slider = gr.Slider(-2.0, 2.0, 0.40, 0.01, label="Confidence", info="Low values make already-random outputs even more random.")
|
                    quadratic_slider = gr.Slider(-2.0, 2.0, 0.00, 0.01, label="Quadratic", info="High values make low probabilities much lower.")
|
with gr.Column(): |
|
gr.Markdown("### Legacy sampling") |
|
top_p_slider = gr.Slider(0.0, 1.0, 0, 0.01, label="Top P") |
|
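                    # Note: this "Min K" slider is wired to the top_k sampling parameter in generate_audio.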
min_k_slider = gr.Slider(0.0, 1024, 0, 1, label="Min K") |
|
min_p_slider = gr.Slider(0.0, 1.0, 0, 0.01, label="Min P") |
|
|
|
with gr.Accordion("Advanced Parameters", open=False): |
|
gr.Markdown( |
|
"### Unconditional Toggles\n" |
|
"Checking a box will make the model ignore the corresponding conditioning value and make it unconditional.\n" |
|
'Practically this means the given conditioning feature will be unconstrained and "filled in automatically".' |
|
) |
|
with gr.Row(): |
|
unconditional_keys = gr.CheckboxGroup( |
|
[ |
|
"speaker", |
|
"emotion", |
|
"vqscore_8", |
|
"fmax", |
|
"pitch_std", |
|
"speaking_rate", |
|
"dnsmos_ovrl", |
|
"speaker_noised", |
|
], |
|
value=["emotion"], |
|
label="Unconditional Keys", |
|
) |
|
|
|
gr.Markdown( |
|
"### Emotion Sliders\n" |
|
"Warning: The way these sliders work is not intuitive and may require some trial and error to get the desired effect.\n" |
|
"Certain configurations can cause the model to become unstable. Setting emotion to unconditional may help." |
|
) |
|
with gr.Row(): |
|
emotion1 = gr.Slider(0.0, 1.0, 1.0, 0.05, label="Happiness") |
|
emotion2 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Sadness") |
|
emotion3 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Disgust") |
|
emotion4 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Fear") |
|
with gr.Row(): |
|
emotion5 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Surprise") |
|
emotion6 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Anger") |
|
emotion7 = gr.Slider(0.0, 1.0, 0.1, 0.05, label="Other") |
|
emotion8 = gr.Slider(0.0, 1.0, 0.2, 0.05, label="Neutral") |
|
|
|
with gr.Column(): |
|
generate_button = gr.Button("Generate Audio") |
|
output_audio = gr.Audio(label="Generated Audio", type="numpy", autoplay=True) |
|
|
|
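        # Re-run update_ui whenever the model selection changes so only supported conditioners are shown.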
model_choice.change( |
|
fn=update_ui, |
|
inputs=[model_choice], |
|
outputs=[ |
|
text, |
|
language, |
|
speaker_audio, |
|
prefix_audio, |
|
emotion1, |
|
emotion2, |
|
emotion3, |
|
emotion4, |
|
emotion5, |
|
emotion6, |
|
emotion7, |
|
emotion8, |
|
vq_single_slider, |
|
fmax_slider, |
|
pitch_std_slider, |
|
speaking_rate_slider, |
|
dnsmos_slider, |
|
speaker_noised_checkbox, |
|
unconditional_keys, |
|
], |
|
) |
|
|
|
|
|
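        # Also run update_ui once on page load to initialize visibility for the default model.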
demo.load( |
|
fn=update_ui, |
|
inputs=[model_choice], |
|
outputs=[ |
|
text, |
|
language, |
|
speaker_audio, |
|
prefix_audio, |
|
emotion1, |
|
emotion2, |
|
emotion3, |
|
emotion4, |
|
emotion5, |
|
emotion6, |
|
emotion7, |
|
emotion8, |
|
vq_single_slider, |
|
fmax_slider, |
|
pitch_std_slider, |
|
speaking_rate_slider, |
|
dnsmos_slider, |
|
speaker_noised_checkbox, |
|
unconditional_keys, |
|
], |
|
) |
|
|
|
|
|
generate_button.click( |
|
fn=generate_audio, |
|
inputs=[ |
|
model_choice, |
|
text, |
|
language, |
|
speaker_audio, |
|
prefix_audio, |
|
emotion1, |
|
emotion2, |
|
emotion3, |
|
emotion4, |
|
emotion5, |
|
emotion6, |
|
emotion7, |
|
emotion8, |
|
vq_single_slider, |
|
fmax_slider, |
|
pitch_std_slider, |
|
speaking_rate_slider, |
|
dnsmos_slider, |
|
speaker_noised_checkbox, |
|
cfg_scale_slider, |
|
top_p_slider, |
|
min_k_slider, |
|
min_p_slider, |
|
linear_slider, |
|
confidence_slider, |
|
quadratic_slider, |
|
seed_number, |
|
randomize_seed_toggle, |
|
unconditional_keys, |
|
], |
|
outputs=[output_audio, seed_number], |
|
) |
|
|
|
return demo |
|
|
|
|
|
if __name__ == "__main__": |
|
demo = build_interface() |
|
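    # Enable a public share link when the GRADIO_SHARE environment variable is truthy.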
share = getenv("GRADIO_SHARE", "False").lower() in ("true", "1", "t") |
|
demo.launch(server_name="0.0.0.0", server_port=7860, share=share) |
|
|