VoiceClone-TTS

Running on Zero

App Files Files Community

Steveeeeeeen HF Staff committed on Feb 11

Commit

15961ae

verified ·

1 Parent(s): d7ca016

Update app.py

Browse files

Files changed (1) hide show

app.py +37 -34

app.py CHANGED Viewed

@@ -5,44 +5,42 @@ import gradio as gr
 from zonos.model import Zonos
 from zonos.conditioning import make_cond_dict, supported_language_codes
-# Global cache to hold the loaded model
-MODEL = None
 device = "cuda"
-def load_model():
     """
-    Loads the Zonos model once and caches it globally.
-    Adjust the model name if you want to switch from hybrid to transformer, etc.
     """
-    global MODEL
-    if MODEL is None:
-        model_name = "Zyphra/Zonos-v0.1-hybrid"
         print(f"Loading model: {model_name}")
-        MODEL = Zonos.from_pretrained(model_name, device="cuda")
-        MODEL = MODEL.requires_grad_(False).eval()
-        MODEL.bfloat16()  # optional if your GPU supports bfloat16
-        print("Model loaded successfully!")
-    return MODEL
-def tts(text, speaker_audio, selected_language):
     """
-    text: str
     speaker_audio: (sample_rate, numpy_array) from Gradio if type="numpy"
-    selected_language: str (e.g., "en-us", "es-es", etc.)
     Returns (sample_rate, waveform) for Gradio audio output.
     """
-    model = load_model()
-    # If no text, return None
     if not text:
         return None
-    # If no reference audio, return None
     if speaker_audio is None:
         return None
-    # Gradio provides audio in (sample_rate, numpy_array)
     sr, wav_np = speaker_audio
     # Convert to Torch tensor: shape (1, num_samples)
@@ -58,9 +56,9 @@ def tts(text, speaker_audio, selected_language):
     # Prepare conditioning dictionary
     cond_dict = make_cond_dict(
-        text=text,                   # The text prompt
-        speaker=spk_embedding,       # Speaker embedding
-        language=selected_language,  # Language from the Dropdown
         device=device,
     )
     conditioning = model.prepare_conditioning(cond_dict)
@@ -77,7 +75,7 @@ def tts(text, speaker_audio, selected_language):
 def build_demo():
     with gr.Blocks() as demo:
-        gr.Markdown("# Simple Zonos TTS Demo (Text + Reference Audio + Language)")
         with gr.Row():
             text_input = gr.Textbox(
@@ -89,23 +87,28 @@ def build_demo():
                 label="Reference Audio (Speaker Cloning)",
                 type="numpy"
             )
-        # Add a dropdown for language selection
         language_dropdown = gr.Dropdown(
-            label="Language",
-            choices=supported_language_codes,
             value="en-us",
-            interactive=True
         )
         generate_button = gr.Button("Generate")
-        # The output is an audio widget that Gradio will play
         audio_output = gr.Audio(label="Synthesized Output", type="numpy")
-        # Bind the generate button: pass text, reference audio, and selected language
         generate_button.click(
             fn=tts,
-            inputs=[text_input, ref_audio_input, language_dropdown],
             outputs=audio_output,
         )

 from zonos.model import Zonos
 from zonos.conditioning import make_cond_dict, supported_language_codes
+# We'll keep a global dictionary of loaded models to avoid reloading
+MODELS_CACHE = {}
 device = "cuda"
+def load_model(model_name: str):
     """
+    Loads or retrieves a cached Zonos model, sets it to eval and bfloat16.
     """
+    global MODELS_CACHE
+    if model_name not in MODELS_CACHE:
         print(f"Loading model: {model_name}")
+        model = Zonos.from_pretrained(model_name, device=device)
+        model = model.requires_grad_(False).eval()
+        model.bfloat16()  # optional if GPU supports bfloat16
+        MODELS_CACHE[model_name] = model
+        print(f"Model loaded successfully: {model_name}")
+    return MODELS_CACHE[model_name]
+def tts(text, speaker_audio, selected_language, model_choice):
     """
+    text: str (Text prompt to synthesize)
     speaker_audio: (sample_rate, numpy_array) from Gradio if type="numpy"
+    selected_language: str (language code)
+    model_choice: str (which Zonos model to use, e.g., "Zyphra/Zonos-v0.1-hybrid")
     Returns (sample_rate, waveform) for Gradio audio output.
     """
+    # Load the selected model
+    model = load_model(model_choice)
     if not text:
         return None
     if speaker_audio is None:
         return None
+    # Gradio gives audio in the format (sample_rate, numpy_array)
     sr, wav_np = speaker_audio
     # Convert to Torch tensor: shape (1, num_samples)
     # Prepare conditioning dictionary
     cond_dict = make_cond_dict(
+        text=text,
+        speaker=spk_embedding,
+        language=selected_language,
         device=device,
     )
     conditioning = model.prepare_conditioning(cond_dict)
 def build_demo():
     with gr.Blocks() as demo:
+        gr.Markdown("# Simple Zonos TTS Demo")
         with gr.Row():
             text_input = gr.Textbox(
                 label="Reference Audio (Speaker Cloning)",
                 type="numpy"
             )
+        # Model dropdown
+        model_dropdown = gr.Dropdown(
+            label="Model Choice",
+            choices=["Zyphra/Zonos-v0.1-transformer", "Zyphra/Zonos-v0.1-hybrid"],
+            value="Zyphra/Zonos-v0.1-hybrid",
+            interactive=True,
+        )
+        # Language dropdown (you can filter or use all from supported_language_codes)
         language_dropdown = gr.Dropdown(
+            label="Language Code",
+            choices=["en-us", "es-es", "fr-fr", "de-de", "it"],
             value="en-us",
+            interactive=True,
         )
         generate_button = gr.Button("Generate")
         audio_output = gr.Audio(label="Synthesized Output", type="numpy")
         generate_button.click(
             fn=tts,
+            inputs=[text_input, ref_audio_input, language_dropdown, model_dropdown],
             outputs=audio_output,
         )