"""Gradio demo: live microphone transcription with a fine-tuned Wav2Vec2 CTC model."""

import gradio as gr
import numpy as np
import torch
from transformers import AutoModelForCTC, Wav2Vec2Processor

# Load model and processor once at import time so every request reuses them.
model_name = "nada15/wav2vec2-large-xls-r-300m-dm32"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = AutoModelForCTC.from_pretrained(model_name, ignore_mismatched_sizes=True)

# Sample rate handed to the processor. The original code hard-coded 16000;
# NOTE(review): presumably this matches the checkpoint's feature extractor —
# confirm against processor.feature_extractor.sampling_rate.
TARGET_SAMPLE_RATE = 16000


def _to_model_waveform(audio):
    """Convert Gradio's ``(sample_rate, samples)`` tuple to mono float32 at 16 kHz.

    Args:
        audio: Tuple ``(sample_rate, samples)`` as produced by
            ``gr.Audio(type="numpy")``.

    Returns:
        A 1-D ``float32`` NumPy array sampled at ``TARGET_SAMPLE_RATE``, or
        ``None`` when the recording contains no samples.
    """
    sample_rate, samples = audio
    samples = np.asarray(samples)
    if samples.size == 0:
        return None

    # Integer PCM (Gradio documents int16) is scaled into [-1.0, 1.0].
    if np.issubdtype(samples.dtype, np.integer):
        full_scale = float(np.iinfo(samples.dtype).max)
        samples = samples.astype(np.float32) / full_scale
    else:
        samples = samples.astype(np.float32)

    # Downmix multi-channel recordings to mono.
    # NOTE(review): assumes a (samples, channels) layout — confirm for the
    # installed Gradio version.
    if samples.ndim > 1:
        samples = samples.mean(axis=1)

    if sample_rate != TARGET_SAMPLE_RATE:
        # Linear-interpolation resampling keeps the script free of extra audio
        # dependencies; it does no anti-alias filtering, so swap in a proper
        # resampler if transcription quality matters.
        target_len = int(round(samples.shape[0] * TARGET_SAMPLE_RATE / sample_rate))
        if target_len == 0:
            return None
        source_times = np.arange(samples.shape[0]) / sample_rate
        target_times = np.arange(target_len) / TARGET_SAMPLE_RATE
        samples = np.interp(target_times, source_times, samples).astype(np.float32)

    return samples


def transcribe(audio):
    """Transcribe one microphone recording to text.

    Args:
        audio: ``(sample_rate, samples)`` tuple from the Gradio audio
            component, or ``None`` when no recording is available (for example
            after the user clears the input while ``live=True``).

    Returns:
        The decoded transcription, or an empty string when there is no audio.
    """
    if audio is None:
        return ""

    waveform = _to_model_waveform(audio)
    if waveform is None:
        return ""

    inputs = processor(
        waveform,
        sampling_rate=TARGET_SAMPLE_RATE,
        return_tensors="pt",
        padding=True,
    )
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        logits = model(inputs.input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)
    return transcription[0]


def _microphone_input():
    """Build the microphone audio component across Gradio major versions."""
    try:
        # Keyword used by the original script (Gradio 3.x).
        return gr.Audio(source="microphone", type="numpy")
    except TypeError:
        # Gradio 4 renamed ``source`` to ``sources`` (a list) and rejects the
        # old keyword.
        return gr.Audio(sources=["microphone"], type="numpy")


# Gradio Interface
interface = gr.Interface(
    fn=transcribe,
    inputs=_microphone_input(),
    outputs="text",
    live=True,
)

interface.launch()