"""Gradio demo: transcribe Mandarin speech with a fine-tuned wav2vec2 CTC model."""

import math
from typing import Optional

import gradio as gr
import soundfile as sf
import torch
from scipy.signal import resample_poly
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-chinese-zh-cn"

# Load the pre-trained processor and model once, at import time, so every
# request served by the interface reuses the same weights.
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)

# The feature extractor only accepts audio at the rate it was configured
# with; read that rate from the processor instead of hard-coding it.
TARGET_SAMPLE_RATE = processor.feature_extractor.sampling_rate


def speech_to_text(audio: Optional[str]) -> str:
    """Transcribe an audio file to text.

    Args:
        audio: Path to an audio file readable by ``soundfile``. Gradio passes
            ``None`` when the user submits without providing audio.

    Returns:
        The decoded transcription, or an empty string when no audio was
        provided or the file contains no samples.
    """
    if not audio:
        return ""

    # always_2d gives a (frames, channels) array regardless of channel count,
    # so mono and multi-channel files go through the same path below.
    speech, sample_rate = sf.read(audio, dtype="float32", always_2d=True)
    if speech.shape[0] == 0:
        return ""

    # The model consumes a single waveform: down-mix channels by averaging.
    speech = speech.mean(axis=1)

    # Resample to the rate the processor expects; passing any other rate to
    # the processor would be rejected.
    if sample_rate != TARGET_SAMPLE_RATE:
        divisor = math.gcd(sample_rate, TARGET_SAMPLE_RATE)
        speech = resample_poly(
            speech,
            TARGET_SAMPLE_RATE // divisor,
            sample_rate // divisor,
        )

    # Preprocess the waveform into model inputs.
    inputs = processor(
        speech,
        sampling_rate=TARGET_SAMPLE_RATE,
        return_tensors="pt",
        padding=True,
    )

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Greedy CTC decoding: take the most likely token at each frame and let
    # the processor turn the token ids into text.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)
    return transcription[0]


# Create the Gradio interface
iface = gr.Interface(
    fn=speech_to_text,
    inputs=gr.Audio(type="filepath"),
    outputs=gr.Textbox(),
    title="Chinese Speech Recognition",
    description="Upload an audio file and get the transcribed text using the wav2vec2-large-xlsr-53-chinese-zh-cn model.",
)

if __name__ == "__main__":
    iface.launch()