import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import soundfile as sf
import librosa  # used for resampling; it must be listed in the Space's requirements.txt
import gradio as gr

# Load the pre-trained processor and model
processor = Wav2Vec2Processor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-chinese-zh-cn")
model = Wav2Vec2ForCTC.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-chinese-zh-cn")

def speech_to_text(audio):
    # Load the audio file from the path provided by Gradio
    speech, sample_rate = sf.read(audio)

    # The model expects 16 kHz mono audio: downmix stereo and resample if needed
    if speech.ndim > 1:
        speech = speech.mean(axis=1)
    if sample_rate != 16000:
        speech = librosa.resample(speech, orig_sr=sample_rate, target_sr=16000)
        sample_rate = 16000

    # Preprocess the audio for the model
    inputs = processor(speech, sampling_rate=sample_rate, return_tensors="pt", padding=True)

    # Perform inference
    with torch.no_grad():
        logits = model(**inputs).logits

    # Decode the predicted ids to text
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)
    return transcription[0]

# Create the Gradio interface
iface = gr.Interface(
    fn=speech_to_text,
    inputs=gr.Audio(type="filepath"),
    outputs=gr.Textbox(),
    title="Chinese Speech Recognition",
    description="Upload an audio file and get the transcribed text using the wav2vec2-large-xlsr-53-chinese-zh-cn model.",
)

if __name__ == "__main__":
    iface.launch()
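
# --- Optional local sanity check (a minimal sketch, not part of the original app) ---
# "sample.wav" below is a hypothetical path to a short Chinese speech clip;
# uncomment these lines to transcribe it directly instead of launching the Gradio UI:
#
# if __name__ == "__main__":
#     print(speech_to_text("sample.wav"))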