# Dzongkha speech-to-text Gradio demo.
# NOTE(review): "Spaces: / Runtime error / Runtime error" here was Hugging Face
# Spaces page residue captured during extraction, not part of the program.
import numpy as np
import torch
import gradio as gr
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Use the GPU when one exists; the original unconditional .to("cuda") raises
# at import time on a CPU-only host, crashing the app before launch.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Load the Dzongkha fine-tuned MMS ASR checkpoint and its matching processor,
# and point both at the Dzongkha ("dzo") adapter/vocabulary.
asr_model_id = "Norphel/wav2vec2-large-mms-1b-dzo-colab"
asr_model = Wav2Vec2ForCTC.from_pretrained(asr_model_id, target_lang="dzo").to(DEVICE)
asr_processor = Wav2Vec2Processor.from_pretrained(asr_model_id)
asr_processor.tokenizer.set_target_lang("dzo")
# Function to process recorded audio and generate a transcription.
def generate_text(audio):
    """Transcribe a recorded clip to Dzongkha text.

    Args:
        audio: Gradio numpy-audio payload — a ``(sample_rate, waveform)``
            tuple, or ``None`` when nothing was recorded.

    Returns:
        The decoded transcription string, or a short notice when no audio
        was provided.
    """
    if audio is None:
        return "No audio recorded."
    sr, data = audio  # Gradio provides (sample_rate, waveform)

    # Gradio typically records integer PCM; the model expects float input.
    waveform = torch.tensor(np.asarray(data), dtype=torch.float32)
    # Down-mix multi-channel audio to mono (assumes channel-last layout
    # from Gradio — TODO confirm against the running Gradio version).
    if waveform.ndim > 1:
        waveform = waveform.mean(dim=-1)

    # Resample to the 16 kHz rate the model was trained on.
    if sr != 16000:
        waveform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(waveform)

    # Convert to model input format. Bug fix: the original called
    # data.numpy() unconditionally, but data was only a torch tensor when
    # resampling happened — 16 kHz input stayed a numpy array and the call
    # raised AttributeError.
    input_dict = asr_processor(waveform.numpy(), sampling_rate=16000, return_tensors="pt", padding=True)

    # Run inference on whatever device the model lives on (was hard-coded
    # to "cuda", which fails on CPU-only hosts).
    device = next(asr_model.parameters()).device
    with torch.no_grad():
        logits = asr_model(input_dict.input_values.to(device)).logits
    pred_ids = torch.argmax(logits, dim=-1)[0]
    # Decode the greedy (argmax) token ids back to text.
    return asr_processor.decode(pred_ids)
# Gradio interface: microphone/file audio in, transcription text out.
demo = gr.Interface(
    fn=generate_text,
    inputs=gr.Audio(type="numpy"),  # "numpy" delivers (sample_rate, waveform) tuples
    outputs="text",
    title="Dzongkha Speech-to-Text",
    description="Record your voice and get transcriptions in Dzongkha.",
)

# Launch only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()