# Hugging Face Spaces page residue (commented out so the file is valid Python):
# Norphel's picture
# Update app.py
# 83dc08b verified
# raw
# history blame
# 1.45 kB
import numpy as np
import torch
import gradio as gr
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
# Load the ASR model & processor: an MMS-1B checkpoint fine-tuned for
# Dzongkha ("dzo") — both the adapter (target_lang) and the tokenizer
# must be switched to the same language.
asr_model_id = "Norphel/wav2vec2-large-mms-1b-dzo-colab"
# Fall back to CPU so the app still starts on hosts without a GPU
# (a hard-coded "cuda" raises at load time on CPU-only machines).
device = "cuda" if torch.cuda.is_available() else "cpu"
asr_model = Wav2Vec2ForCTC.from_pretrained(asr_model_id, target_lang="dzo").to(device)
asr_model.eval()  # inference only — disable dropout etc.
asr_processor = Wav2Vec2Processor.from_pretrained(asr_model_id)
asr_processor.tokenizer.set_target_lang("dzo")
# Function to process audio & generate text
def generate_text(audio):
    """Transcribe a Gradio microphone recording into Dzongkha text.

    Args:
        audio: ``(sample_rate, np.ndarray)`` tuple as produced by
            ``gr.Audio(type="numpy")``, or ``None`` if nothing was recorded.

    Returns:
        The decoded transcription string, or a message when no audio was given.
    """
    if audio is None:
        return "No audio recorded."
    sr, data = audio  # Gradio provides (sample_rate, waveform)

    # Gradio records integer PCM (e.g. int16); torchaudio's resampler and the
    # model both need float input, so convert unconditionally.  This also fixes
    # the original bug where, at sr == 16000, `data` stayed an np.ndarray and
    # the later `.numpy()` call raised AttributeError.
    wav = torch.tensor(np.asarray(data), dtype=torch.float32)

    # Mix multi-channel recordings down to mono (last axis presumed to be
    # channels for a (samples, channels) array — TODO confirm against Gradio).
    if wav.ndim > 1:
        wav = wav.mean(dim=-1)

    # Resample to the 16 kHz rate the wav2vec2 feature extractor expects.
    if sr != 16000:
        wav = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(wav)

    # Convert to model input format (padding/normalization handled here).
    input_dict = asr_processor(wav.numpy(), sampling_rate=16000,
                               return_tensors="pt", padding=True)

    # Send inputs to wherever the model actually lives instead of assuming CUDA.
    model_device = next(asr_model.parameters()).device
    with torch.no_grad():
        logits = asr_model(input_dict.input_values.to(model_device)).logits
    pred_ids = torch.argmax(logits, dim=-1)[0]

    # Greedy CTC decode of the best token per frame.
    return asr_processor.decode(pred_ids)
# Gradio interface
demo = gr.Interface(
fn=generate_text,
inputs=gr.Audio(type="numpy"), # Automatically enables recording
outputs="text",
title="Dzongkha Speech-to-Text",
description="Record your voice and get transcriptions in Dzongkha."
)
if __name__ == "__main__":
demo.launch()