aikitty committed on
Commit
dc7d091
·
verified ·
1 Parent(s): 2ffbef3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -4
app.py CHANGED
@@ -1,6 +1,37 @@
1
- from huggingsound import SpeechRecognitionModel
 
 
 
2
 
3
- model = SpeechRecognitionModel("jonatasgrosman/wav2vec2-large-xlsr-53-chinese-zh-cn")
4
- audio_paths = ["/path/to/file.mp3", "/path/to/another_file.wav"]
 
5
 
6
- transcriptions = model.transcribe(audio_paths)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
3
+ import soundfile as sf
4
+ import gradio as gr
5
 
6
# Fetch the pre-trained checkpoint once at import time; the processor and the
# CTC model must come from the same checkpoint so their vocabularies match.
_CHECKPOINT = "jonatasgrosman/wav2vec2-large-xlsr-53-chinese-zh-cn"
processor = Wav2Vec2Processor.from_pretrained(_CHECKPOINT)
model = Wav2Vec2ForCTC.from_pretrained(_CHECKPOINT)
9
 
10
def _resample_linear(waveform, orig_rate, target_rate):
    """Resample a mono 1-D waveform from orig_rate to target_rate.

    Uses plain linear interpolation from ``torch`` (already a dependency of
    this file). NOTE(review): this applies no anti-aliasing filter; a
    dedicated resampler would give better quality if one becomes available.
    """
    if orig_rate == target_rate:
        return waveform
    target_len = max(1, round(len(waveform) * target_rate / orig_rate))
    # interpolate() expects (batch, channels, length) for 1-D signals.
    signal = torch.as_tensor(waveform, dtype=torch.float32).reshape(1, 1, -1)
    resampled = torch.nn.functional.interpolate(
        signal, size=target_len, mode="linear", align_corners=False
    )
    return resampled.reshape(-1).numpy()


def speech_to_text(audio):
    """Transcribe an audio file to text with the wav2vec2 CTC model.

    Args:
        audio: Path of the audio file to transcribe. A falsy value (``None``
            or an empty string, e.g. when nothing was uploaded) is accepted.

    Returns:
        The decoded transcription, or an empty string when there is no audio
        to transcribe.
    """
    # Nothing uploaded: return an empty result instead of crashing in sf.read.
    if not audio:
        return ""

    # Load audio file as float32 samples.
    speech, sample_rate = sf.read(audio, dtype="float32")

    # soundfile returns (frames, channels) for multi-channel files, while the
    # processor expects a single 1-D waveform: down-mix to mono.
    if speech.ndim > 1:
        speech = speech.mean(axis=1)
    if speech.size == 0:
        return ""

    # The feature extractor only accepts audio at the rate the model was
    # trained with, so convert the file's native rate before preprocessing.
    target_rate = processor.feature_extractor.sampling_rate
    speech = _resample_linear(speech, sample_rate, target_rate)

    # Preprocess the waveform into model inputs.
    inputs = processor(
        speech, sampling_rate=target_rate, return_tensors="pt", padding=True
    )

    # Perform inference without tracking gradients.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Greedy CTC decoding: most likely token per frame, then collapse to text.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)

    return transcription[0]
26
+
27
# Create the Gradio interface: one uploaded audio file in, transcribed text out.
# The legacy ``gr.inputs`` namespace is deprecated and no longer exists in
# newer Gradio releases, so the top-level ``gr.Audio`` component is used.
try:
    # Newer Gradio releases take a list of allowed sources.
    _audio_input = gr.Audio(sources=["upload"], type="filepath")
except TypeError:
    # Older Gradio releases spell the keyword ``source`` -- TODO confirm
    # against the Gradio version pinned for this app.
    _audio_input = gr.Audio(source="upload", type="filepath")

iface = gr.Interface(
    fn=speech_to_text,
    inputs=_audio_input,
    outputs="text",
    title="Chinese Speech Recognition",
    description="Upload an audio file and get the transcribed text using the wav2vec2-large-xlsr-53-chinese-zh-cn model.",
)

if __name__ == "__main__":
    iface.launch()