# Source: HuggingFace Space by oyemade — "Update app.py", commit f0c2087 (verified)
from transformers import pipeline
import gradio as gr
import numpy as np
import librosa

# Initialize the speech recognition pipeline once at module load (downloads the
# model on first run). Fine-tuned Wav2Vec2-BERT for Yoruba (Common Voice 17.0);
# the model expects 16 kHz mono audio — inputs are resampled in transcribe().
pipe = pipeline("automatic-speech-recognition", model="oyemade/w2v-bert-2.0-yoruba-CV17.0")
def transcribe(audio):
    """Transcribe Yoruba speech to text.

    Parameters
    ----------
    audio : str | tuple[int, np.ndarray] | None
        Either a filesystem path to an audio file (Gradio ``type="filepath"``)
        or a ``(sample_rate, samples)`` tuple as produced by Gradio's numpy
        audio mode. ``None`` means no audio was captured.

    Returns
    -------
    str
        The transcription, or a human-readable error message.
    """
    if audio is None:
        return "No audio detected. Please try again."
    try:
        if isinstance(audio, str):
            # Uploaded file path: load and resample to the model's 16 kHz rate.
            audio, sr = librosa.load(audio, sr=16000)
        elif isinstance(audio, tuple):  # Gradio numpy audio: (sample_rate, samples)
            sr, audio = audio
            # Gradio delivers integer PCM (e.g. int16); normalize to float32 in
            # [-1, 1] so both the silence check below and the model see the
            # expected amplitude scale.
            if np.issubdtype(audio.dtype, np.integer):
                audio = audio.astype(np.float32) / np.iinfo(audio.dtype).max
            if sr != 16000:
                # librosa >= 0.10 made orig_sr/target_sr keyword-only; the
                # positional form raises TypeError.
                audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
        else:
            return "Invalid audio format. Please try again."
        # Reject near-silent clips instead of feeding noise to the model.
        if np.max(np.abs(audio)) < 0.01:
            return "Audio is too quiet. Please speak louder or choose a different file and try again."
        text = pipe(audio)["text"]
        return text
    except Exception as e:
        # Surface the failure to the UI rather than crashing the app.
        return f"An error occurred: {str(e)}"
# Create the Gradio interface. type="filepath" means uploads arrive as a path
# string; transcribe() also tolerates the (sr, samples) tuple form for safety.
iface = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources=["microphone", "upload"], type="filepath", label="Audio Input"),
    ],
    outputs="text",
    title="Neoform AI: Yoruba Speech Recognition",
    description="Realtime demo for Yoruba speech recognition using a fine-tuned Wav2Vec-Bert model. "
    "You can either use your microphone or upload an MP3 file. "
    "https://neoformai.com",
)
# Launch the interface (blocks; starts the local Gradio server).
iface.launch()