|
from transformers import pipeline |
|
import gradio as gr |
|
import numpy as np |
|
import librosa |
|
|
|
|
|
# Shared ASR pipeline: a Wav2Vec2-BERT checkpoint fine-tuned for Yoruba
# (Common Voice 17.0, per the model id).  Loaded once at module import;
# NOTE(review): first run downloads the weights from the Hugging Face Hub.
pipe = pipeline("automatic-speech-recognition", model="oyemade/w2v-bert-2.0-yoruba-CV17.0")
|
|
|
def transcribe(audio):
    """Transcribe Yoruba speech to text with the module-level ASR pipeline.

    Args:
        audio: One of
            - a filesystem path to an audio file (str) — what Gradio passes
              when the Audio component uses ``type="filepath"``;
            - a ``(sample_rate, samples)`` tuple, as produced by a
              numpy-typed Gradio Audio component;
            - ``None`` when no audio was captured.

    Returns:
        The transcribed text, or a human-readable error-message string.
        Never raises: unexpected failures are caught and reported as text.
    """
    if audio is None:
        return "No audio detected. Please try again."

    try:
        if isinstance(audio, str):
            # Load from file, resampling to the 16 kHz rate the model expects.
            audio, sr = librosa.load(audio, sr=16000)
        elif isinstance(audio, tuple):
            sr, audio = audio
            # Gradio numpy audio arrives as integer PCM (e.g. int16).
            # Normalize to float in [-1, 1]; otherwise the quiet-audio
            # threshold below compares against raw integer amplitudes.
            if np.issubdtype(audio.dtype, np.integer):
                audio = audio.astype(np.float32) / np.iinfo(audio.dtype).max
            if audio.ndim > 1:
                # Down-mix multi-channel recordings to mono for the model.
                audio = audio.mean(axis=1)
            if sr != 16000:
                # BUG FIX: librosa >= 0.10 made the rates keyword-only;
                # the old positional form `resample(audio, sr, 16000)`
                # raises TypeError.
                audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
        else:
            return "Invalid audio format. Please try again."

        # Reject near-silent input before spending time on inference.
        if np.max(np.abs(audio)) < 0.01:
            return "Audio is too quiet. Please speak louder or choose a different file and try again."

        return pipe(audio)["text"]
    except Exception as e:
        # Surface any unexpected failure to the UI instead of crashing the app.
        return f"An error occurred: {str(e)}"
|
|
|
|
|
# Gradio UI: one audio input (microphone or file upload) -> transcribed text.
iface = gr.Interface(
    fn=transcribe,
    inputs=[
        # type="filepath" means transcribe() receives a path string,
        # so the function's str branch is the one exercised by this UI.
        gr.Audio(sources=["microphone", "upload"], type="filepath", label="Audio Input"),
    ],
    outputs="text",
    title="Neoform AI: Yoruba Speech Recognition",
    description="Realtime demo for Yoruba speech recognition using a fine-tuned Wav2Vec-Bert model. "
    "You can either use your microphone or upload an MP3 file. "
    "https://neoformai.com",
)

# Start the Gradio server; blocks until the app is stopped.
iface.launch()