# Spaces: Sleeping
import logging

import gradio as gr
import librosa
import torch
from transformers import AutoModelForCTC, AutoModelForSpeechSeq2Seq, AutoProcessor
# Load the Wav2Vec2 checkpoint and its matching processor once at import time
# so every transcription request reuses the same objects.
MODEL_NAME = "eleferrand/xlsr53_Amis"
processor = AutoProcessor.from_pretrained(MODEL_NAME)
# transcribe() reads `.logits` from a single forward pass and greedy-decodes
# them with argmax + batch_decode, i.e. it expects a CTC head. The seq2seq
# auto class targets encoder-decoder speech models and does not fit that usage.
# NOTE(review): assumes the checkpoint is a Wav2Vec2 (XLSR-53) CTC fine-tune,
# as its name and the original comment suggest - confirm against the model card.
model = AutoModelForCTC.from_pretrained(MODEL_NAME)
_LOGGER = logging.getLogger(__name__)

# Sampling rate (Hz) the audio is resampled to and declared to the processor.
_TARGET_SAMPLE_RATE = 16000


def transcribe(audio_file):
    """Transcribe speech from an uploaded audio file or a microphone recording.

    Args:
        audio_file: Path to the audio file to transcribe. May be ``None`` or
            empty when there is no audio to process (presumably what the UI
            passes when the input is cleared - verify against Gradio).

    Returns:
        The decoded transcription of the audio; an empty string when
        ``audio_file`` is empty; or ``"Error processing file"`` when loading
        the audio or running the model fails.
    """
    # No audio is not an error: return an empty transcription rather than
    # showing a misleading failure message.
    if not audio_file:
        return ""

    try:
        # Load the audio and resample it to the target rate.
        audio, _ = librosa.load(audio_file, sr=_TARGET_SAMPLE_RATE)

        # Convert the waveform into the tensor format the model consumes.
        input_values = processor(
            audio,
            sampling_rate=_TARGET_SAMPLE_RATE,
            return_tensors="pt",
        ).input_values

        # Inference only: skip gradient tracking.
        with torch.no_grad():
            logits = model(input_values).logits

        # Greedy decoding: pick the most likely token per frame, then let the
        # processor turn the token ids into text.
        predicted_ids = torch.argmax(logits, dim=-1)
        return processor.batch_decode(predicted_ids)[0]
    except Exception:
        # UI boundary: keep the user-facing message generic, but record the
        # full traceback so failures can actually be diagnosed.
        _LOGGER.exception("Failed to transcribe %r", audio_file)
        return "Error processing file"
# Build the Gradio UI.
# One audio component serves both input modes (microphone recording and file
# upload) and hands the callback a path on disk.
_audio_input = gr.Audio(
    sources=["microphone", "upload"],
    type="filepath",
    label="Speak or Upload Audio",
)

interface = gr.Interface(
    fn=transcribe,
    inputs=_audio_input,
    outputs="text",
    title="Wav2Vec2 Speech-to-Text Transcription",
    description="Speak into your microphone or upload an audio file to get an automatic transcription.",
    live=True,  # re-run the transcription automatically whenever the input changes
)

# Start the app and request a public share link.
interface.launch(share=True)