"""Gradio demo for the experimental Bambara text-to-speech model.

Importing this module logs in to the Hugging Face Hub (when ``HF_TOKEN`` is
set), loads the TTS model once, and builds the ``demo`` Blocks app. Running it
as a script launches the app on port 7860.
"""

import os

import gradio as gr
import numpy as np
import spaces
from huggingface_hub import login
from maliba_ai.tts.inference import BambaraTTSInference
from maliba_ai.config.speakers import Adame, Moussa, Bourama, Modibo, Seydou

# Authenticate only when a token is provided in the environment.
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    login(token=hf_token)

# The model is loaded once at import time and shared by every request.
print("Loading Bambara TTS model...")
tts = BambaraTTSInference()
print("Model loaded successfully!")

# Maps the names shown in the speaker dropdown to the speaker objects that
# are passed to the model as ``speaker_id``.
SPEAKERS = {
    "Adame": Adame,
    "Moussa": Moussa,
    "Bourama": Bourama,
    "Modibo": Modibo,
    "Seydou": Seydou,
}

# Sample rate reported to the audio widget.
# NOTE(review): assumed to match the model's output rate -- confirm.
SAMPLE_RATE = 16000

# Ranges accepted by validate_inputs(). The sliders below are kept inside
# these ranges so the UI cannot offer a value that validation always rejects.
MIN_TEMPERATURE = 0.001
MAX_TEMPERATURE = 1.0
MIN_TOP_K = 1
MAX_TOP_K = 100
MIN_TOP_P = 0.1
MAX_TOP_P = 1.0

# Slider defaults; also the values restored when an example is loaded.
DEFAULT_TEMPERATURE = 0.8
DEFAULT_TOP_K = 50
DEFAULT_TOP_P = 0.9
DEFAULT_MAX_TOKENS = 2048

# Example buttons show at most this many characters of the example text.
EXAMPLE_LABEL_LENGTH = 30


def validate_inputs(text, temperature, top_k, top_p, max_tokens):
    """Check the advanced generation settings before calling the model.

    Args:
        text: Text to synthesize; must contain non-whitespace characters.
        temperature: Must lie in [MIN_TEMPERATURE, MAX_TEMPERATURE].
        top_k: Must lie in [MIN_TOP_K, MAX_TOP_K].
        top_p: Must lie in [MIN_TOP_P, MAX_TOP_P].
        max_tokens: Must be at least 1.

    Returns:
        A ``(is_valid, message)`` tuple. ``message`` is empty when the inputs
        are valid, otherwise it describes the first failed check.
    """
    if not text or not text.strip():
        return False, "Please enter some Bambara text."

    if not (MIN_TEMPERATURE <= temperature <= MAX_TEMPERATURE):
        return False, (
            f"Temperature must be between {MIN_TEMPERATURE} and {MAX_TEMPERATURE}"
        )

    if not (MIN_TOP_K <= top_k <= MAX_TOP_K):
        return False, f"Top-K must be between {MIN_TOP_K} and {MAX_TOP_K}"

    if not (MIN_TOP_P <= top_p <= MAX_TOP_P):
        return False, f"Top-P must be between {MIN_TOP_P} and {MAX_TOP_P}"

    if max_tokens < 1:
        return False, "Max length must be at least 1"

    return True, ""


@spaces.GPU()
def generate_speech(text, speaker_name, use_advanced, temperature, top_k, top_p, max_tokens):
    """Synthesize speech for ``text`` with the selected speaker.

    Args:
        text: Bambara text to synthesize.
        speaker_name: A key of ``SPEAKERS``.
        use_advanced: When true, the sampling settings below are validated
            and forwarded to the model; otherwise the model is called with
            only the text and the speaker.
        temperature: Sampling temperature (advanced mode only).
        top_k: Top-K setting, converted to ``int`` (advanced mode only).
        top_p: Top-P setting (advanced mode only).
        max_tokens: Forwarded as ``max_new_audio_tokens`` after conversion to
            ``int`` (advanced mode only).

    Returns:
        ``(audio, status)`` where ``audio`` is ``(SAMPLE_RATE, waveform)`` on
        success and ``None`` on failure, and ``status`` is a message for the
        status box. Failures are reported through ``status``, not raised.
    """
    # Guard against None as well as blank input before touching the model.
    if not text or not text.strip():
        return None, "Please enter some Bambara text."

    if speaker_name not in SPEAKERS:
        return None, f"❌ Unknown speaker: {speaker_name}"

    try:
        generation_kwargs = {}
        if use_advanced:
            is_valid, error_msg = validate_inputs(
                text, temperature, top_k, top_p, max_tokens
            )
            if not is_valid:
                return None, f"❌ {error_msg}"
            generation_kwargs = {
                "temperature": temperature,
                "top_k": int(top_k),
                "top_p": top_p,
                "max_new_audio_tokens": int(max_tokens),
            }

        waveform = tts.generate_speech(
            text=text.strip(),
            speaker_id=SPEAKERS[speaker_name],
            **generation_kwargs,
        )

        # NOTE(review): presumably a NumPy array (only ``.size`` is read
        # here) -- confirm against BambaraTTSInference.generate_speech.
        if waveform is None or waveform.size == 0:
            return None, "Failed to generate audio. Please try again."

        return (SAMPLE_RATE, waveform), "✅ Audio generated successfully"

    except Exception as e:
        # UI boundary: show the failure in the status box instead of letting
        # the request crash.
        return None, f"❌ Error: {e}"


# (text, speaker name) pairs offered as one-click examples.
examples = [
    ["Aw ni ce", "Adame"],
    ["I ni ce", "Moussa"],
    ["Aw ni tile", "Bourama"],
    ["I ka kene wa?", "Modibo"],
    ["Ala ka Mali suma", "Adame"],
    ["sigikafɔ kɔnɔ jamanaw ni ɲɔgɔn cɛ, olu ye a haminankow ye, wa o ko ninnu ka kan ka kɛ sariya ani tilennenya kɔnɔ", "Seydou"],
    ["Aw ni ce. Ne tɔgɔ ye Kaya Magan. Aw Sanbe Sanbe.", "Moussa"],
    ["An dɔlakelen bɛ masike bilenman don ka tɔw gɛn.", "Bourama"],
    ["Aw ni ce. Seidu bɛ aw fo wa aw ka yafa a ma, ka da a kan tuma dɔw la kow ka can.", "Modibo"],
]


def _example_label(text):
    """Return the button label for an example, truncating long text."""
    suffix = "..." if len(text) > EXAMPLE_LABEL_LENGTH else ""
    return f"🎯 {text[:EXAMPLE_LABEL_LENGTH]}{suffix}"


# Create Gradio interface
with gr.Blocks(title="Bambara TTS - EXPERIMENTAL", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🎤 Bambara Text-to-Speech ⚠️ EXPERIMENTAL

    Convert Bambara text to speech using AI. This model is currently experimental.

    **Bambara** is spoken by millions of people in Mali and West Africa.
    """)

    with gr.Row():
        with gr.Column(scale=2):
            # Input section
            text_input = gr.Textbox(
                label="📝 Bambara Text",
                placeholder="Type your Bambara text here...",
                lines=3,
                max_lines=6,
                value="Aw ni ce",
            )

            speaker_dropdown = gr.Dropdown(
                choices=list(SPEAKERS.keys()),
                value="Adame",
                label="🗣️ Speaker Voice",
            )

            generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")

        with gr.Column(scale=1):
            use_advanced = gr.Checkbox(
                label="⚙️ Use Advanced Settings",
                value=False,
                info="Enable to customize generation parameters",
            )

            # Hidden until the checkbox above is ticked (see toggle_advanced).
            with gr.Group(visible=False) as advanced_group:
                gr.Markdown("**Advanced Parameters:**")

                temperature = gr.Slider(
                    minimum=0.1,
                    # Capped at the validator's limit: validate_inputs()
                    # rejects any temperature above MAX_TEMPERATURE.
                    maximum=MAX_TEMPERATURE,
                    value=DEFAULT_TEMPERATURE,
                    step=0.1,
                    label="Temperature",
                    info="Higher = more varied",
                )

                top_k = gr.Slider(
                    minimum=MIN_TOP_K,
                    maximum=MAX_TOP_K,
                    value=DEFAULT_TOP_K,
                    step=5,
                    label="Top-K",
                )

                top_p = gr.Slider(
                    minimum=MIN_TOP_P,
                    maximum=MAX_TOP_P,
                    value=DEFAULT_TOP_P,
                    step=0.05,
                    label="Top-P",
                )

                max_tokens = gr.Slider(
                    minimum=256,
                    maximum=4096,
                    value=DEFAULT_MAX_TOKENS,
                    step=256,
                    label="Max Length",
                )

    gr.Markdown("### 🔊 Generated Audio")

    audio_output = gr.Audio(
        label="Generated Speech",
        type="numpy",
        interactive=False,
    )

    status_output = gr.Textbox(
        label="Status",
        interactive=False,
        show_label=False,
        container=False,
    )

    with gr.Accordion("📚 Try These Examples", open=True):

        def load_example(text, speaker):
            """Fill the form with an example and reset the advanced settings."""
            return (
                text,
                speaker,
                False,
                DEFAULT_TEMPERATURE,
                DEFAULT_TOP_K,
                DEFAULT_TOP_P,
                DEFAULT_MAX_TOKENS,
            )

        gr.Markdown("**Click any example below:**")

        for text, speaker in examples:
            btn = gr.Button(_example_label(text), size="sm")
            btn.click(
                # Default arguments bind this iteration's values; a plain
                # closure would see only the last example.
                fn=lambda t=text, s=speaker: load_example(t, s),
                outputs=[
                    text_input,
                    speaker_dropdown,
                    use_advanced,
                    temperature,
                    top_k,
                    top_p,
                    max_tokens,
                ],
            )

    # Information section
    with gr.Accordion("ℹ️ About", open=False):
        gr.Markdown("""
        **⚠️ This is an experimental Bambara TTS model.**
        """)

    def toggle_advanced(use_adv):
        """Show the advanced parameter group only while the box is ticked."""
        return gr.Group(visible=use_adv)

    use_advanced.change(
        fn=toggle_advanced,
        inputs=[use_advanced],
        outputs=[advanced_group],
    )

    # The button click and pressing Enter in the textbox run the same
    # generation with the same components.
    generation_inputs = [
        text_input,
        speaker_dropdown,
        use_advanced,
        temperature,
        top_k,
        top_p,
        max_tokens,
    ]
    generation_outputs = [audio_output, status_output]

    generate_btn.click(
        fn=generate_speech,
        inputs=generation_inputs,
        outputs=generation_outputs,
    )

    text_input.submit(
        fn=generate_speech,
        inputs=generation_inputs,
        outputs=generation_outputs,
    )


if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
    )