import logging
import os

import gradio as gr
import spaces
import torch
import torchaudio

from whosper import WhosperTranscriber

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Pick the best available accelerator.
# NOTE(review): `device` is computed but never passed to WhosperTranscriber
# below -- confirm whether the library selects the device on its own.
if torch.cuda.is_available():
    device = "cuda"
    logger.info("Using CUDA for inference.")
elif torch.backends.mps.is_available():
    device = "mps"
    logger.info("Using MPS for inference.")
else:
    device = "cpu"
    logger.info("Using CPU for inference.")

model_id = "sudoping01/maliba-asr-v1"
# Model is loaded once at import time so Gradio callbacks can reuse it.
transcriber = WhosperTranscriber(model_id=model_id)
logger.info("Transcriber initialized with model: %s", model_id)

# Recognized example-file extensions (lowercase; compared case-insensitively).
AUDIO_EXTENSIONS = ('.wav', '.mp3', '.m4a', '.flac', '.ogg')


def resample_audio(audio_path, target_sample_rate=16000):
    """
    Convert the audio file to the target sampling rate (default 16000 Hz).

    Args:
        audio_path (str): Path to the audio file.
        target_sample_rate (int): The desired sample rate.

    Returns:
        A tensor containing the (possibly resampled) audio data and the
        target sample rate.

    Raises:
        Exception: Re-raises any load/resample error after logging it.
    """
    try:
        waveform, original_sample_rate = torchaudio.load(audio_path)
        if original_sample_rate != target_sample_rate:
            resampler = torchaudio.transforms.Resample(
                orig_freq=original_sample_rate,
                new_freq=target_sample_rate,
            )
            waveform = resampler(waveform)
        return waveform, target_sample_rate
    except Exception as e:
        logger.error(f"Error resampling audio: {e}")
        # Bare `raise` keeps the original traceback (`raise e` would reset it).
        raise


@spaces.GPU()
def transcribe_audio(audio_file):
    """
    Transcribe the provided audio file into Bambara text using Whosper.

    Args:
        audio_file: Path to the audio file to transcribe, or None.

    Returns:
        str: The transcribed Bambara text, a prompt when no file was given,
        or an error message if transcription failed.
    """
    if audio_file is None:
        return "Please provide an audio file for transcription."

    try:
        logger.info("Transcribing audio file: %s", audio_file)
        result = transcriber.transcribe_audio(audio_file)
        logger.info("Transcription successful.")
        # assumes the transcriber returns a dict with a "text" key -- TODO confirm
        return result.get("text", "")
    except Exception as e:
        logger.error(f"Transcription failed: {e}")
        return f"Error during transcription: {str(e)}"


def get_example_files(directory="./examples"):
    """
    Return up to five audio files from the examples directory.

    Args:
        directory (str): The directory to search for audio files.

    Returns:
        list: Absolute paths of matching audio files (at most five, sorted
        by name for a deterministic selection); empty when the directory is
        missing or unreadable.
    """
    if not os.path.exists(directory):
        logger.warning(f"Examples directory {directory} not found.")
        return []

    try:
        # sorted() makes the "first five" selection deterministic across runs
        # (os.listdir order is arbitrary).
        audio_files = [
            os.path.abspath(os.path.join(directory, name))
            for name in sorted(os.listdir(directory))
            if name.lower().endswith(AUDIO_EXTENSIONS)
        ]
        logger.info("Found %d example audio files.", len(audio_files))
        return audio_files[:5]
    except Exception as e:
        logger.error(f"Error reading examples directory: {e}")
        return []


def build_interface():
    """
    Build the Gradio interface for Bambara speech recognition.

    Returns:
        gr.Blocks: The assembled (not yet launched) demo.
    """
    example_files = get_example_files()

    with gr.Blocks(title="Bambara Speech Recognition") as demo:
        gr.Markdown(
            """
            # 🎤 Bambara Automatic Speech Recognition

            **Powered by MALIBA-AI**

            Convert Bambara speech to text using our state-of-the-art ASR model.
            You can either:
            - đŸŽ™ī¸ **Record** your voice directly
            - 📁 **Upload** an audio file
            - đŸŽĩ **Try** our example audio files

            ## Supported Audio Formats
            WAV, MP3, M4A, FLAC, OGG
            """
        )

        with gr.Row():
            with gr.Column():
                audio_input = gr.Audio(
                    label="🎤 Record or Upload Audio",
                    type="filepath",
                    sources=["microphone", "upload"]
                )

                transcribe_btn = gr.Button(
                    "🔄 Transcribe Audio",
                    variant="primary",
                    size="lg"
                )
                clear_btn = gr.Button("đŸ—‘ī¸ Clear", variant="secondary")

            with gr.Column():
                output_text = gr.Textbox(
                    label="📝 Transcribed Text (Bambara)",
                    lines=8,
                    placeholder="Your transcribed Bambara text will appear here...",
                    interactive=False
                )

        if example_files:
            gr.Markdown("## đŸŽĩ Try These Examples")
            gr.Examples(
                examples=[[f] for f in example_files],
                inputs=[audio_input],
                outputs=output_text,
                fn=transcribe_audio,
                cache_examples=False,
                label="Example Audio Files"
            )

        gr.Markdown(
            """
            ---
            ## â„šī¸ About This Model
            - **Model:** [sudoping01/maliba-asr-v1](https://huggingface.co/sudoping01/maliba-asr-v1)
            - **Developer:** MALIBA-AI
            - **Language:** Bambara (bm)
            - **Task:** Automatic Speech Recognition (ASR)
            - **Sample Rate:** 16kHz (automatically resampled)

            ## 🚀 How to Use
            1. **Record Audio:** Click the microphone button and speak in Bambara
            2. **Upload File:** Click the upload button to select an audio file
            3. **Transcribe:** Click the "Transcribe Audio" button
            4. **View Results:** See your transcribed text in Bambara

            ## 📊 Performance Notes
            - Best results with clear speech and minimal background noise
            - Supports various audio formats and durations
            - Optimized for Bambara language patterns and phonetics
            """
        )

        transcribe_btn.click(
            fn=transcribe_audio,
            inputs=[audio_input],
            outputs=output_text,
            show_progress=True
        )

        # Clear both the audio widget and the transcription box.
        clear_btn.click(
            fn=lambda: (None, ""),
            outputs=[audio_input, output_text]
        )

        # NOTE(review): auto-transcribes on every upload/recording; combined
        # with the button above, the same clip can be transcribed twice.
        # Confirm this double wiring is the intended UX before removing either.
        audio_input.change(
            fn=transcribe_audio,
            inputs=[audio_input],
            outputs=output_text,
            show_progress=True
        )

    return demo


def main():
    """
    Launch the Gradio interface. Blocks until the server shuts down.
    """
    logger.info("Starting Bambara ASR Gradio interface.")
    interface = build_interface()
    # launch() blocks until the server stops, so log readiness first --
    # the original post-launch "success" message only appeared at shutdown.
    logger.info("Gradio interface launched successfully.")
    interface.launch(
        share=False,
        server_name="0.0.0.0",
        server_port=7860
    )


if __name__ == "__main__":
    main()