# app.py — Bambara speech recognition demo (Gradio Space, MALIBA-AI).
import os
import spaces
import torch
import torchaudio
import gradio as gr
import logging
from whosper import WhosperTranscriber
# Configure module-level logging once for the whole app.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Select the best available inference device: prefer CUDA, then Apple
# Silicon MPS, and fall back to CPU.
if torch.cuda.is_available():
    device = "cuda"
    logger.info("Using CUDA for inference.")
elif torch.backends.mps.is_available():
    device = "mps"
    logger.info("Using MPS for inference.")
else:
    device = "cpu"
    logger.info("Using CPU for inference.")

# Load the ASR model once at import time so every request shares it.
# NOTE(review): `device` is computed above but never passed to
# WhosperTranscriber here — confirm the library selects the device itself.
model_id = "sudoping01/maliba-asr-v1"
transcriber = WhosperTranscriber(model_id=model_id)
logger.info(f"Transcriber initialized with model: {model_id}")
def resample_audio(audio_path, target_sample_rate=16000):
    """
    Load an audio file and resample it to the target sample rate.

    Args:
        audio_path (str): Path to the audio file.
        target_sample_rate (int): Desired sample rate in Hz (default 16000).

    Returns:
        tuple: ``(waveform, target_sample_rate)`` where ``waveform`` is a
        torch.Tensor, resampled only if the source rate differs.

    Raises:
        Exception: Re-raises any error from loading or resampling after
        logging it.
    """
    try:
        waveform, original_sample_rate = torchaudio.load(audio_path)
        if original_sample_rate != target_sample_rate:
            resampler = torchaudio.transforms.Resample(
                orig_freq=original_sample_rate,
                new_freq=target_sample_rate
            )
            waveform = resampler(waveform)
        return waveform, target_sample_rate
    except Exception as e:
        logger.error(f"Error resampling audio: {e}")
        # Bare `raise` preserves the original traceback (unlike `raise e`,
        # which appends this frame to it).
        raise
@spaces.GPU()
def transcribe_audio(audio_file):
    """
    Run Bambara speech-to-text on one audio file via Whosper.

    Args:
        audio_file: Path to the audio file, or None if nothing was provided.

    Returns:
        str: The transcribed Bambara text, or a user-facing message when
        input is missing or transcription fails.
    """
    if audio_file is None:
        return "Please provide an audio file for transcription."
    try:
        logger.info(f"Transcribing audio file: {audio_file}")
        transcription = transcriber.transcribe_audio(audio_file)
        logger.info("Transcription successful.")
    except Exception as e:
        logger.error(f"Transcription failed: {e}")
        return f"Error during transcription: {str(e)}"
    # Whosper returns a dict; fall back to an empty string if "text" is absent.
    return transcription.get("text", "")
def get_example_files(directory="./examples"):
    """
    Collect up to five example audio files from a directory.

    Args:
        directory (str): Directory to scan for audio files.

    Returns:
        list: Absolute paths to at most five audio files, sorted by
        filename so the UI shows a stable set across runs. Empty list when
        the directory is missing or unreadable.
    """
    if not os.path.exists(directory):
        logger.warning(f"Examples directory {directory} not found.")
        return []
    # str.endswith accepts a tuple, so one call covers every extension.
    audio_extensions = ('.wav', '.mp3', '.m4a', '.flac', '.ogg')
    audio_files = []
    try:
        # Sort the listing: os.listdir order is filesystem-dependent, which
        # would otherwise make the displayed examples nondeterministic.
        for file in sorted(os.listdir(directory)):
            if file.lower().endswith(audio_extensions):
                audio_files.append(os.path.abspath(os.path.join(directory, file)))
        logger.info(f"Found {len(audio_files)} example audio files.")
        return audio_files[:5]
    except Exception as e:
        logger.error(f"Error reading examples directory: {e}")
        return []
def build_interface():
    """
    Build the Gradio Blocks interface for Bambara speech recognition.

    Returns:
        gr.Blocks: The assembled, ready-to-launch demo.
    """
    example_files = get_example_files()

    def _transcribe_on_change(audio_file):
        # The .change event also fires when the Clear button resets the
        # audio value to None; returning "" then keeps the cleared output
        # empty instead of overwriting it with the "please provide an
        # audio file" help message.
        if audio_file is None:
            return ""
        return transcribe_audio(audio_file)

    with gr.Blocks(title="Bambara Speech Recognition") as demo:
        gr.Markdown(
            """
            # 🎀 Bambara Automatic Speech Recognition
            **Powered by MALIBA-AI**
            Convert Bambara speech to text using our state-of-the-art ASR model. You can either:
            - πŸŽ™οΈ **Record** your voice directly
            - πŸ“ **Upload** an audio file
            - 🎡 **Try** our example audio files
            ## Supported Audio Formats
            WAV, MP3, M4A, FLAC, OGG
            """
        )
        with gr.Row():
            with gr.Column():
                # Accepts both microphone capture and file upload; the
                # component hands transcribe_audio a filesystem path.
                audio_input = gr.Audio(
                    label="🎀 Record or Upload Audio",
                    type="filepath",
                    sources=["microphone", "upload"]
                )
                transcribe_btn = gr.Button(
                    "πŸ”„ Transcribe Audio",
                    variant="primary",
                    size="lg"
                )
                clear_btn = gr.Button("πŸ—‘οΈ Clear", variant="secondary")
            with gr.Column():
                output_text = gr.Textbox(
                    label="πŸ“ Transcribed Text (Bambara)",
                    lines=8,
                    placeholder="Your transcribed Bambara text will appear here...",
                    interactive=False
                )
        if example_files:
            gr.Markdown("## 🎡 Try These Examples")
            gr.Examples(
                examples=[[f] for f in example_files],
                inputs=[audio_input],
                outputs=output_text,
                fn=transcribe_audio,
                cache_examples=False,
                label="Example Audio Files"
            )
        gr.Markdown(
            """
            ---
            ## ℹ️ About This Model
            - **Model:** [sudoping01/maliba-asr-v1](https://huggingface.co/sudoping01/maliba-asr-v1)
            - **Developer:** MALIBA-AI
            - **Language:** Bambara (bm)
            - **Task:** Automatic Speech Recognition (ASR)
            - **Sample Rate:** 16kHz (automatically resampled)
            ## πŸš€ How to Use
            1. **Record Audio:** Click the microphone button and speak in Bambara
            2. **Upload File:** Click the upload button to select an audio file
            3. **Transcribe:** Click the "Transcribe Audio" button
            4. **View Results:** See your transcribed text in Bambara
            ## πŸ“Š Performance Notes
            - Best results with clear speech and minimal background noise
            - Supports various audio formats and durations
            - Optimized for Bambara language patterns and phonetics
            """
        )
        transcribe_btn.click(
            fn=transcribe_audio,
            inputs=[audio_input],
            outputs=output_text,
            show_progress=True
        )
        clear_btn.click(
            fn=lambda: (None, ""),
            outputs=[audio_input, output_text]
        )
        # Auto-transcribe as soon as a recording/upload finishes, via the
        # None-tolerant wrapper so clearing does not repopulate the output.
        audio_input.change(
            fn=_transcribe_on_change,
            inputs=[audio_input],
            outputs=output_text,
            show_progress=True
        )
    return demo
def main():
    """
    Build and launch the Gradio interface.

    Binds to all network interfaces on port 7860 (the standard Hugging
    Face Spaces port) without creating a public share link.
    """
    logger.info("Starting Bambara ASR Gradio interface.")
    interface = build_interface()
    # Log before launch(): it blocks until the server shuts down, so a
    # message placed after it would only appear on exit.
    logger.info("Launching Gradio interface on 0.0.0.0:7860.")
    interface.launch(
        share=False,
        server_name="0.0.0.0",
        server_port=7860
    )
# Run the app only when executed directly (not when imported).
if __name__ == "__main__":
    main()