import gradio as gr
import torch
import librosa
from transformers import Wav2Vec2Processor, AutoModelForCTC
# Load Wav2Vec2 Model
MODEL_NAME = "eleferrand/xlsr53_Amis"
processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
model = AutoModelForCTC.from_pretrained(MODEL_NAME)
def transcribe(audio_file):
    """
    Transcribes speech from an uploaded audio file or live microphone input.
    """
    try:
        # Load the audio and resample it to the 16 kHz rate the model expects
        audio, _ = librosa.load(audio_file, sr=16000)

        # Convert the waveform into the tensor format Wav2Vec2 expects
        input_values = processor(audio, sampling_rate=16000, return_tensors="pt").input_values

        # Run the model without gradient tracking (inference only)
        with torch.no_grad():
            logits = model(input_values).logits

        # Greedy-decode the predicted token IDs into text
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.batch_decode(predicted_ids)[0]

        # Strip unknown-token markers from the output
        return transcription.replace("[UNK]", "")
    except Exception as e:
        return f"Error processing file: {e}"
# UI Build
interface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath", label="Speak or Upload Audio"),
    outputs="text",
    title="Wav2Vec2 Speech-to-Text Transcription",
    description="Speak into your microphone or upload an audio file to get an automatic transcription.",
    live=True  # Re-run transcription automatically as the audio input changes
)
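# share=True asks Gradio to create a temporary public URL in addition to the local server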
interface.launch(share=True)