# Hugging Face Space: MALIBA-AI/BambaraText2Speech
# Status: Running on Zero
# File size: 8,093 bytes

import os
import warnings

# Set environment variables BEFORE any imports to prevent CUDA initialization.
# These assignments must stay above the gradio / maliba_ai imports that follow.
# generate_speech() removes CUDA_VISIBLE_DEVICES again at request time when
# torch reports CUDA as available.
os.environ["CUDA_VISIBLE_DEVICES"] = ""  # Hide CUDA during startup
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is a debugging aid (synchronous kernel
# launches) and normally costs performance — confirm it is wanted in production.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  # For debugging

# Suppress warnings
# This silences every warning category for the whole process, not only
# warnings raised by the imports below.
warnings.filterwarnings("ignore")

import gradio as gr
import numpy as np
import spaces
from huggingface_hub import login

# These imports should now work without CUDA errors
from maliba_ai.tts.inference import BambaraTTSInference
from maliba_ai.config.speakers import Adame, Moussa, Bourama, Modibo, Seydou

# Log in to the Hugging Face Hub only when an HF_TOKEN environment variable is
# set; without it the login step is skipped entirely.
# NOTE(review): presumably needed to download the model weights — confirm.
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    login(token=hf_token)

# Initialize TTS model (this will use CPU during startup)
# This runs at import time, so importing the module blocks until the model
# object has been constructed. `tts` is the shared instance used by
# generate_speech().
print("Loading Bambara TTS model...")
tts = BambaraTTSInference()
print("Model loaded successfully!")

# Maps the voice name shown in the speaker dropdown to the speaker definition
# imported from maliba_ai.config.speakers. generate_speech() looks the selected
# name up here and passes the value to the model as `speaker_id`.
SPEAKERS = {
    "Adame": Adame,
    "Moussa": Moussa, 
    "Bourama": Bourama,
    "Modibo": Modibo,
    "Seydou": Seydou
}

def validate_inputs(text, temperature, top_k, top_p, max_tokens):
    """Validate the text and sampling parameters supplied by the user.

    Args:
        text: Text to synthesize; must contain at least one non-whitespace
            character (``None`` and empty strings are rejected).
        temperature: Sampling temperature; accepted range is 0.001 to 1
            inclusive.
        top_k: Top-K cutoff; accepted range is 1 to 100 inclusive.
        top_p: Top-P (nucleus) cutoff; accepted range is 0.1 to 1.0 inclusive.
        max_tokens: Maximum number of audio tokens to generate; must be at
            least 1.

    Returns:
        A ``(is_valid, error_message)`` tuple. ``error_message`` is the empty
        string when ``is_valid`` is True, otherwise a human-readable reason for
        the first check that failed.
    """
    if not text or not text.strip():
        return False, "Please enter some Bambara text."

    if not (0.001 <= temperature <= 1):
        # The message now states the range that is actually enforced.
        return False, "Temperature must be between 0.001 and 1"

    if not (1 <= top_k <= 100):
        return False, "Top-K must be between 1 and 100"

    if not (0.1 <= top_p <= 1.0):
        return False, "Top-P must be between 0.1 and 1.0"

    # Previously accepted unchecked; a missing or non-positive limit cannot
    # produce any audio, so reject it here with a clear message.
    if max_tokens is None or max_tokens < 1:
        return False, "Max length must be at least 1"

    return True, ""

@spaces.GPU()
def generate_speech(text, speaker_name, use_advanced, temperature, top_k, top_p, max_tokens):
    """Synthesize speech for the given Bambara text; Gradio event handler.

    Args:
        text: Bambara text to synthesize. ``None``, empty, or whitespace-only
            input is rejected with a status message.
        speaker_name: Key of ``SPEAKERS`` selecting the voice.
        use_advanced: When true, the sampling parameters below are validated
            with ``validate_inputs`` and forwarded to the model; when false
            the model is called with only the text and the speaker.
        temperature: Sampling temperature (advanced mode only).
        top_k: Top-K cutoff, converted to ``int`` (advanced mode only).
        top_p: Top-P cutoff (advanced mode only).
        max_tokens: Forwarded as ``max_new_audio_tokens`` after conversion to
            ``int`` (advanced mode only).

    Returns:
        A ``(audio, status)`` tuple. ``audio`` is ``(sample_rate, waveform)``
        on success and ``None`` on any failure; ``status`` is the message
        shown to the user. This function does not raise: every exception is
        converted into a status message.
    """
    # The textbox value may be None or empty; guard before calling .strip().
    cleaned_text = text.strip() if text else ""
    if not cleaned_text:
        return None, "Please enter some Bambara text."

    try:
        if speaker_name not in SPEAKERS:
            # Report an unknown or cleared speaker explicitly, not as a bare KeyError.
            return None, f"❌ Unknown speaker: {speaker_name}"
        speaker = SPEAKERS[speaker_name]

        # Imported here, not at module level, so torch is only loaded inside
        # the GPU-decorated call.
        import torch

        # Re-enable CUDA for GPU context: drop the startup-time restriction.
        # NOTE(review): this only runs when torch already reports CUDA as
        # available — confirm that holds in the ZeroGPU worker while
        # CUDA_VISIBLE_DEVICES is still "".
        if torch.cuda.is_available():
            os.environ.pop("CUDA_VISIBLE_DEVICES", None)

        generation_kwargs = {"text": cleaned_text, "speaker_id": speaker}

        if use_advanced:
            is_valid, error_msg = validate_inputs(text, temperature, top_k, top_p, max_tokens)
            if not is_valid:
                return None, f"❌ {error_msg}"

            generation_kwargs.update(
                temperature=temperature,
                top_k=int(top_k),
                top_p=top_p,
                max_new_audio_tokens=int(max_tokens),
            )

        waveform = tts.generate_speech(**generation_kwargs)

        # Treat both "no result" and an empty array as a failed generation.
        if waveform is None or waveform.size == 0:
            return None, "Failed to generate audio. Please try again."

        # NOTE(review): assumes the model emits 16 kHz audio — confirm against
        # the maliba_ai inference configuration.
        sample_rate = 16000
        return (sample_rate, waveform), "✅ Audio generated successfully"

    except Exception as e:
        # UI boundary: surface any failure as a status message so the Gradio
        # event never crashes.
        return None, f"❌ Error: {e}"

# Example prompts rendered as quick-fill buttons in the UI. Each entry is
# [Bambara text, speaker name]; the speaker name is written into the speaker
# dropdown, so it is expected to be one of the keys of SPEAKERS.
examples = [
    ["Aw ni ce", "Adame"],
    ["I ni ce", "Moussa"],
    ["Aw ni tile", "Bourama"],
    ["I ka kene wa?", "Modibo"],
    ["Ala ka Mali suma", "Adame"],
    ["sigikafɔ kɔnɔ jamanaw ni ɲɔgɔn cɛ, olu ye a haminankow ye, wa o ko ninnu ka kan ka kɛ sariya ani tilennenya kɔnɔ", "Seydou"],
    ["Aw ni ce. Ne tɔgɔ ye Kaya Magan. Aw Sanbe Sanbe.", "Moussa"],
    ["An dɔlakelen bɛ masike bilenman don ka tɔw gɛn.", "Bourama"],
    ["Aw ni ce. Seidu bɛ aw fo wa aw ka yafa a ma, ka da a kan tuma dɔw la kow ka can.", "Modibo"],
]

# Create Gradio interface
# Builds the whole UI in one Blocks context: text and speaker inputs, an
# optional group of advanced sampling sliders, the audio and status outputs,
# one button per example, and the event wiring at the end. Components are
# created in the order they should appear, so keep the statement order intact.
with gr.Blocks(title="Bambara TTS - EXPERIMENTAL", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🎤 Bambara Text-to-Speech ⚠️ EXPERIMENTAL
    
    Convert Bambara text to speech using AI. This model is currently experimental.
    
    **Bambara** is spoken by millions of people in Mali and West Africa.
    
    ⚡ **Note**: Model loads on CPU during startup, then uses GPU for generation.
    """)
    
    with gr.Row():
        with gr.Column(scale=2):
            # Input section
            text_input = gr.Textbox(
                label="📝 Bambara Text",
                placeholder="Type your Bambara text here...",
                lines=3,
                max_lines=6,
                value="Aw ni ce"
            )
            
            # Choices are the keys of SPEAKERS, so the selected value can be
            # used directly as a lookup key in generate_speech().
            speaker_dropdown = gr.Dropdown(
                choices=list(SPEAKERS.keys()),
                value="Adame",
                label="🗣️ Speaker Voice"
            )
            
            generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")
            
        with gr.Column(scale=1):
            # When unchecked, generate_speech() ignores the slider values below.
            use_advanced = gr.Checkbox(
                label="⚙️ Use Advanced Settings", 
                value=False,
                info="Enable to customize generation parameters"
            )
            
            # Hidden initially; shown/hidden by toggle_advanced() when the
            # checkbox changes.
            with gr.Group(visible=False) as advanced_group:
                gr.Markdown("**Advanced Parameters:**")
                
                # NOTE(review): this slider allows values up to 2.0, but
                # validate_inputs() rejects any temperature above 1, so values
                # in (1, 2] produce a validation error — align one with the other.
                temperature = gr.Slider(
                    minimum=0.1, 
                    maximum=2.0, 
                    value=0.8, 
                    step=0.1,
                    label="Temperature",
                    info="Higher = more varied"
                )
                
                top_k = gr.Slider(
                    minimum=1, 
                    maximum=100, 
                    value=50, 
                    step=5,
                    label="Top-K"
                )
                
                top_p = gr.Slider(
                    minimum=0.1, 
                    maximum=1.0, 
                    value=0.9, 
                    step=0.05,
                    label="Top-P"
                )
                
                # Passed to the model as max_new_audio_tokens in advanced mode.
                max_tokens = gr.Slider(
                    minimum=256, 
                    maximum=4096, 
                    value=2048, 
                    step=256,
                    label="Max Length"
                )
    
    gr.Markdown("### 🔊 Generated Audio")
    
    # Receives the (sample_rate, waveform) tuple returned by generate_speech().
    audio_output = gr.Audio(
        label="Generated Speech",
        type="numpy",
        interactive=False
    )
        
    # Receives the status / error message returned by generate_speech().
    status_output = gr.Textbox(
        label="Status",
        interactive=False,
        show_label=False,
        container=False
    )
    
    with gr.Accordion("📚 Try These Examples", open=True):
        def load_example(text, speaker):
            """Return the values for one example: its text and speaker, with
            advanced mode switched off and every slider back at its initial
            value (0.8, 50, 0.9, 2048)."""
            return text, speaker, False, 0.8, 50, 0.9, 2048
        
        gr.Markdown("**Click any example below:**")
        # NOTE(review): example_buttons is never appended to or read, and the
        # enumerate() index `i` below is unused.
        example_buttons = []
        
        for i, (text, speaker) in enumerate(examples):
            # Button label is the example text truncated to 30 characters.
            btn = gr.Button(f"🎯 {text[:30]}{'...' if len(text) > 30 else ''}", size="sm")
            # Default arguments bind this iteration's text/speaker to the
            # lambda; a plain closure would see only the last example.
            btn.click(
                fn=lambda t=text, s=speaker: load_example(t, s),
                outputs=[text_input, speaker_dropdown, use_advanced, temperature, top_k, top_p, max_tokens]
            )
    
    # Information section
    with gr.Accordion("ℹ️ About", open=False):
        gr.Markdown("""
        **⚠️ This is an experimental Bambara TTS model.**
        
        The model loads on CPU during startup to avoid CUDA initialization errors,
        then switches to GPU during speech generation for optimal performance.
        """)
    
    def toggle_advanced(use_adv):
        """Return a Group update whose visibility matches the checkbox state."""
        return gr.Group(visible=use_adv)
    
    use_advanced.change(
        fn=toggle_advanced,
        inputs=[use_advanced],
        outputs=[advanced_group]
    )
    
    # The button click and the textbox submit event run the same handler with
    # the same inputs and outputs.
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, speaker_dropdown, use_advanced, temperature, top_k, top_p, max_tokens],
        outputs=[audio_output, status_output]
    )
    
    text_input.submit(
        fn=generate_speech,
        inputs=[text_input, speaker_dropdown, use_advanced, temperature, top_k, top_p, max_tokens],
        outputs=[audio_output, status_output]
    )

if __name__ == "__main__":
    # Start the app only when run as a script: listen on every network
    # interface on port 7860, with Gradio's share option turned off.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
    }
    demo.launch(**launch_options)