sudoping01 committed on
Commit 254ef61 · verified · 1 Parent(s): 379fa64

Update app.py

Files changed (1):
  1. app.py +186 -198
app.py CHANGED
@@ -1,75 +1,59 @@
- import os
-
- os.environ["TORCHDYNAMO_DISABLE"] = "1"
- os.environ["TORCH_COMPILE_DISABLE"] = "1"
- os.environ["PYTORCH_DISABLE_CUDNN_BENCHMARK"] = "1"
- os.environ["TOKENIZERS_PARALLELISM"] = "false"
-
- import torch
  import gradio as gr
  import numpy as np
  import spaces
- import logging
  from huggingface_hub import login
- import threading
-

- torch._dynamo.config.disable = True
- torch._dynamo.config.suppress_errors = True
-
-
- logging.basicConfig(level=logging.INFO)
- logger = logging.getLogger(__name__)

  hf_token = os.getenv("HF_TOKEN")
  if hf_token:
      login(token=hf_token)


- tts_model = None
- speakers_dict = None
- model_initialized = False
- model_initialized_lock = threading.Lock()
-
- @spaces.GPU()
- def initialize_model():
-     """Initialize the TTS model and speakers - called once with GPU context"""
-     global tts_model, speakers_dict, model_initialized, model_initialized_lock
-
-     with model_initialized_lock :
-         if not model_initialized:
-             logger.info("Initializing Bambara TTS model...")
-
-             try:
-                 from maliba_ai.tts.inference import BambaraTTSInference
-                 from maliba_ai.config.speakers import Adame, Moussa, Bourama, Modibo, Seydou
-
-                 tts_model = BambaraTTSInference()
-
-                 speakers_dict = {
-                     "Adame": Adame,
-                     "Moussa": Moussa,
-                     "Bourama": Bourama,
-                     "Modibo": Modibo,
-                     "Seydou": Seydou
-                 }
-
-                 model_initialized = True
-                 logger.info("Model initialized successfully!")
-
-             except Exception as e:
-                 logger.error(f"Failed to initialize model: {e}")
-                 raise e

-     return tts_model, speakers_dict

  def validate_inputs(text, temperature, top_k, top_p, max_tokens):
      if not text or not text.strip():
          return False, "Please enter some Bambara text."

-     if not (0.001 <= temperature <= 2.0):
-         return False, "Temperature must be between 0.001 and 2.0"

      if not (1 <= top_k <= 100):
          return False, "Top-K must be between 1 and 100"
@@ -77,20 +61,24 @@ def validate_inputs(text, temperature, top_k, top_p, max_tokens):
      if not (0.1 <= top_p <= 1.0):
          return False, "Top-P must be between 0.1 and 1.0"

      return True, ""

  @spaces.GPU()
  def generate_speech(text, speaker_name, use_advanced, temperature, top_k, top_p, max_tokens):
-     global tts_model, speakers_dict, model_initialized, model_initialized_lock

      if not text.strip():
          return None, "Please enter some Bambara text."

      try:
-         with model_initialized_lock :
-             if not model_initialized :
-                 tts, speakers = initialize_model()
-
-         speaker = speakers[speaker_name]

          if use_advanced:
              is_valid, error_msg = validate_inputs(text, temperature, top_k, top_p, max_tokens)
@@ -106,6 +94,7 @@ def generate_speech(text, speaker_name, use_advanced, temperature, top_k, top_p,
                  max_new_audio_tokens=int(max_tokens)
              )
          else:
              waveform = tts.generate_speech(
                  text=text.strip(),
                  speaker_id=speaker
@@ -118,12 +107,9 @@ def generate_speech(text, speaker_name, use_advanced, temperature, top_k, top_p,
          return (sample_rate, waveform), f"✅ Audio generated successfully"

      except Exception as e:
-         logger.error(f"Speech generation failed: {e}")
          return None, f"❌ Error: {str(e)}"


- SPEAKER_NAMES = ["Adame", "Moussa", "Bourama", "Modibo", "Seydou"]
-
  examples = [
      ["Aw ni ce", "Adame"],
      ["Mali bɛna diya kɔsɛbɛ, ka a da a kan baara bɛ ka kɛ.", "Moussa"],
@@ -137,157 +123,159 @@ examples = [

  ]

- def build_interface():
-     """Build the Gradio interface for Bambara TTS"""
-
-     with gr.Blocks(title="Bambara TTS - EXPERIMENTAL") as demo:
-         gr.Markdown("""
-         # 🎤 Bambara Text-to-Speech ⚠️ EXPERIMENTAL
-
-         **Powered by MALIBA-AI**
-
-         Convert Bambara text to speech. This model is currently experimental.
-
-         **Bambara** is spoken by millions of people in Mali and West Africa.
-         .
-         """)
-

-         with gr.Row():
-             with gr.Column(scale=2):
-                 text_input = gr.Textbox(
-                     label="📝 Bambara Text",
-                     placeholder="Type your Bambara text here...",
-                     lines=3,
-                     max_lines=10,
-                     value="I ni ce"
-                 )

-                 speaker_dropdown = gr.Dropdown(
-                     choices=SPEAKER_NAMES,
-                     value="Adame",
-                     label="🗣️ Speaker Voice"
                  )

-                 generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")

-             with gr.Column(scale=1):
-                 use_advanced = gr.Checkbox(
-                     label="⚙️ Use Advanced Settings",
-                     value=False,
-                     info="Enable to customize generation parameters"
                  )

-                 with gr.Group(visible=False) as advanced_group:
-                     gr.Markdown("**Advanced Parameters:**")
-
-                     temperature = gr.Slider(
-                         minimum=0.1,
-                         maximum=2.0,
-                         value=0.8,
-                         step=0.1,
-                         label="Temperature",
-                         info="Higher = more varied"
-                     )
-
-                     top_k = gr.Slider(
-                         minimum=1,
-                         maximum=100,
-                         value=50,
-                         step=5,
-                         label="Top-K"
-                     )
-
-                     top_p = gr.Slider(
-                         minimum=0.1,
-                         maximum=1.0,
-                         value=0.9,
-                         step=0.05,
-                         label="Top-P"
-                     )
-
-                     max_tokens = gr.Slider(
-                         minimum=256,
-                         maximum=4096,
-                         value=2048,
-                         step=256,
-                         label="Max Length"
-                     )
-
-         gr.Markdown("### 🔊 Generated Audio")
-
-         audio_output = gr.Audio(
-             label="Generated Speech",
-             type="numpy",
-             interactive=False
-         )
-
-         status_output = gr.Textbox(
-             label="Status",
-             interactive=False,
-             show_label=False,
-             container=False
-         )
-
-         with gr.Accordion("Try These Examples", open=True):
-             def load_example(text, speaker):
-                 return text, speaker, False, 0.8, 50, 0.9, 2048
-
-             gr.Markdown("**Click any example below:**")
-
-             for i, (text, speaker) in enumerate(examples):
-                 btn = gr.Button(f" {text[:30]}{'...' if len(text) > 30 else ''}", size="sm")
-                 btn.click(
-                     fn=lambda t=text, s=speaker: load_example(t, s),
-                     outputs=[text_input, speaker_dropdown, use_advanced, temperature, top_k, top_p, max_tokens]
                  )

-         with gr.Accordion(" About", open=False):
-             gr.Markdown("""
-             **⚠️ This is an experimental Bambara TTS model.**
-             - **Languages**: Bambara (bm)
-             - **Speakers**: 5 different voice options
-             - **Sample Rate**: 16kHz
-             """)

-         def toggle_advanced(use_adv):
-             return gr.Group(visible=use_adv)

-         use_advanced.change(
-             fn=toggle_advanced,
-             inputs=[use_advanced],
-             outputs=[advanced_group]
-         )

-         generate_btn.click(
-             fn=generate_speech,
-             inputs=[text_input, speaker_dropdown, use_advanced, temperature, top_k, top_p, max_tokens],
-             outputs=[audio_output, status_output],
-             show_progress=True
-         )

-         text_input.submit(
-             fn=generate_speech,
-             inputs=[text_input, speaker_dropdown, use_advanced, temperature, top_k, top_p, max_tokens],
-             outputs=[audio_output, status_output],
-             show_progress=True
-         )

-     return demo
-
- def main():
-     """Main function to launch the Gradio interface"""
-     logger.info("Starting Bambara TTS Gradio interface.")

-     interface = build_interface()
-     interface.launch(
-         server_name="0.0.0.0",
-         server_port=7860,
-         share=False
      )

-     logger.info("Gradio interface launched successfully.")

  if __name__ == "__main__":
-     main()
app.py (new version):

  import gradio as gr
  import numpy as np
+ import os
  import spaces
+ import sys
  from huggingface_hub import login


  hf_token = os.getenv("HF_TOKEN")
  if hf_token:
      login(token=hf_token)


+ try:
+     from maliba_ai.tts.inference import BambaraTTSInference
+     from maliba_ai.config.speakers import Adame, Moussa, Bourama, Modibo, Seydou

+
+     print("Loading Bambara TTS model...")
+     tts = BambaraTTSInference()
+     print("Model loaded successfully!")

+     MODEL_LOADED = True
+ except Exception as e:
+     print(f"Error loading model: {e}")
+     MODEL_LOADED = False
+     tts = None
+
+
+ if MODEL_LOADED:
+     SPEAKERS = {
+         "Adame": Adame,
+         "Moussa": Moussa,
+         "Bourama": Bourama,
+         "Modibo": Modibo,
+         "Seydou": Seydou
+     }
+ else:
+     SPEAKERS = {
+         "Adame": "Adame",
+         "Moussa": "Moussa",
+         "Bourama": "Bourama",
+         "Modibo": "Modibo",
+         "Seydou": "Seydou"
+     }

  def validate_inputs(text, temperature, top_k, top_p, max_tokens):
48
+ """Validate user inputs"""
49
  if not text or not text.strip():
50
  return False, "Please enter some Bambara text."
51
 
52
+ if len(text.strip()) > 1000:
53
+ return False, "Text is too long. Please use shorter text (max 1000 characters)."
54
+
55
+ if not (0.1 <= temperature <= 2.0):
56
+ return False, "Temperature must be between 0.1 and 2.0"
57
 
58
  if not (1 <= top_k <= 100):
59
  return False, "Top-K must be between 1 and 100"
 
61
  if not (0.1 <= top_p <= 1.0):
62
  return False, "Top-P must be between 0.1 and 1.0"
63
 
64
+ if not (256 <= max_tokens <= 4096):
65
+ return False, "Max tokens must be between 256 and 4096"
66
+
67
  return True, ""
68
 
69
  @spaces.GPU()
70
  def generate_speech(text, speaker_name, use_advanced, temperature, top_k, top_p, max_tokens):
71
+ """Generate speech from Bambara text"""
72
+
73
+ if not MODEL_LOADED:
74
+ return None, "❌ Model not loaded. Please check the logs for errors."
75
+
76
  if not text.strip():
77
  return None, "Please enter some Bambara text."
78
 
79
  try:
80
+
81
+ speaker = SPEAKERS[speaker_name]
 
 
 
82
 
83
  if use_advanced:
84
  is_valid, error_msg = validate_inputs(text, temperature, top_k, top_p, max_tokens)
 
94
  max_new_audio_tokens=int(max_tokens)
95
  )
96
  else:
97
+
98
  waveform = tts.generate_speech(
99
  text=text.strip(),
100
  speaker_id=speaker
 
107
  return (sample_rate, waveform), f"✅ Audio generated successfully"
108
 
109
  except Exception as e:
 
110
  return None, f"❌ Error: {str(e)}"
111
 
112
 
 
 
113
  examples = [
114
  ["Aw ni ce", "Adame"],
115
  ["Mali bɛna diya kɔsɛbɛ, ka a da a kan baara bɛ ka kɛ.", "Moussa"],
 
123
 
124
  ]
125
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+ with gr.Blocks(title="Bambara TTS - EXPERIMENTAL", theme=gr.themes.Soft()) as demo:
+     gr.Markdown("""
+     # 🎤 Bambara Text-to-Speech ⚠️ EXPERIMENTAL
+
+     Convert Bambara text to speech using AI. This model is currently experimental.
+
+     **Bambara** is spoken by millions of people in Mali and West Africa.
+     """)
+
+     with gr.Row():
+         with gr.Column(scale=2):
+             # Input section
+             text_input = gr.Textbox(
+                 label="📝 Bambara Text",
+                 placeholder="Type your Bambara text here...",
+                 lines=3,
+                 max_lines=6,
+                 value="Aw ni ce"
+             )
+
+             # Speaker selection
+             speaker_dropdown = gr.Dropdown(
+                 choices=list(SPEAKERS.keys()),
+                 value="Adame",
+                 label="🗣️ Speaker Voice"
+             )
+
+             # Generation button
+             generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")
+
+         with gr.Column(scale=1):
+             # Advanced settings toggle
+             use_advanced = gr.Checkbox(
+                 label="⚙️ Use Advanced Settings",
+                 value=False,
+                 info="Enable to customize generation parameters"
+             )
+
+             # Advanced settings (hidden by default)
+             with gr.Group(visible=False) as advanced_group:
+                 gr.Markdown("**Advanced Parameters:**")

+                 temperature = gr.Slider(
+                     minimum=0.1,
+                     maximum=2.0,
+                     value=0.8,
+                     step=0.1,
+                     label="Temperature",
+                     info="Higher = more varied"
                  )

+                 top_k = gr.Slider(
+                     minimum=1,
+                     maximum=100,
+                     value=50,
+                     step=5,
+                     label="Top-K"
+                 )

+                 top_p = gr.Slider(
+                     minimum=0.1,
+                     maximum=1.0,
+                     value=0.9,
+                     step=0.05,
+                     label="Top-P"
                  )

+                 max_tokens = gr.Slider(
+                     minimum=256,
+                     maximum=4096,
+                     value=2048,
+                     step=256,
+                     label="Max Length"
                  )
+
+     # Output section
+     gr.Markdown("### 🔊 Generated Audio")
+
+     audio_output = gr.Audio(
+         label="Generated Speech",
+         type="numpy",
+         interactive=False
+     )

+     status_output = gr.Textbox(
+         label="Status",
+         interactive=False,
+         show_label=False,
+         container=False
+     )
+
+     # Examples section
+     with gr.Accordion("📚 Try These Examples", open=True):
+         def load_example(text, speaker):
+             return text, speaker, False, 0.8, 50, 0.9, 2048

+         gr.Markdown("**Click any example below:**")
+         example_buttons = []

+         for i, (text, speaker) in enumerate(examples):
+             btn = gr.Button(f"🎯 {text[:30]}{'...' if len(text) > 30 else ''}", size="sm")
+             btn.click(
+                 fn=lambda t=text, s=speaker: load_example(t, s),
+                 outputs=[text_input, speaker_dropdown, use_advanced, temperature, top_k, top_p, max_tokens]
+             )
+
+     # Information section
+     with gr.Accordion("ℹ️ About", open=False):
+         gr.Markdown("""
+         **⚠️ This is an experimental Bambara TTS model.**
+
+         **Common Bambara Phrases:**
+         - **Aw ni ce** - Hello (formal)
+         - **I ni ce** - Hello (informal)
+         - **I ka kene wa?** - How are you?
+         - **Aw ni tile** - Good afternoon

+         **Available Speakers:** Adame, Moussa, Bourama, Modibo, Seydou

+         **Tips:**
+         - Start with default settings
+         - Use shorter texts for better results
+         - Try different speakers for variety
+         """)

+     # Toggle advanced settings visibility
+     def toggle_advanced(use_adv):
+         return gr.Group(visible=use_adv)

+     use_advanced.change(
+         fn=toggle_advanced,
+         inputs=[use_advanced],
+         outputs=[advanced_group]
+     )
+
+     # Wire up the interface
+     generate_btn.click(
+         fn=generate_speech,
+         inputs=[text_input, speaker_dropdown, use_advanced, temperature, top_k, top_p, max_tokens],
+         outputs=[audio_output, status_output]
      )

+     # Auto-generate on Enter key in text input
+     text_input.submit(
+         fn=generate_speech,
+         inputs=[text_input, speaker_dropdown, use_advanced, temperature, top_k, top_p, max_tokens],
+         outputs=[audio_output, status_output]
+     )

  if __name__ == "__main__":
+     demo.launch(
+         server_name="0.0.0.0",
+         server_port=7860,
+         share=False
+     )
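
A quick way to exercise the refactored module outside the Space is to import it and call generate_speech directly. The sketch below is not part of the commit: it assumes the new app.py is on the import path, that the maliba_ai package is installed, and that the spaces.GPU() decorator degrades to a no-op outside a ZeroGPU Space; the file name smoke_test.py is hypothetical.

# smoke_test.py -- hypothetical local check, not part of this commit.
# Importing app runs the module-level model load introduced by this change;
# demo.launch() stays guarded behind __name__ == "__main__" and does not run.
import app

if app.MODEL_LOADED:
    # Positional arguments mirror the Gradio inputs wired to generate_speech,
    # using the interface's default slider values.
    audio, status = app.generate_speech(
        "Aw ni ce",          # text
        "Adame",             # speaker_name
        False,               # use_advanced
        0.8, 50, 0.9, 2048,  # temperature, top_k, top_p, max_tokens
    )
    print(status)
    if audio is not None:
        sample_rate, waveform = audio
        print(f"{len(waveform)} samples at {sample_rate} Hz")
else:
    print("Model failed to load; see the messages printed during import.")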