Spaces:
Running
Running
Update asr.py
Browse files
asr.py
CHANGED
@@ -1,24 +1,38 @@
|
|
1 |
import torchaudio
|
2 |
import torch
|
3 |
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
|
|
|
4 |
|
5 |
# Load processor and model
|
6 |
processor = AutoProcessor.from_pretrained("ixxan/whisper-small-ug-cv-15")
|
7 |
model = AutoModelForSpeechSeq2Seq.from_pretrained("ixxan/whisper-small-ug-cv-15")
|
8 |
|
9 |
|
10 |
-
def transcribe(audio_data
|
11 |
"""
|
12 |
Transcribes audio to text using the Whisper model for Uyghur.
|
13 |
Args:
|
14 |
-
- audio_data
|
15 |
Returns:
|
16 |
- str: The transcription of the audio.
|
17 |
"""
|
18 |
-
audio_path = audio_data[0] # Extract the file path from the tuple
|
19 |
|
20 |
# Load audio file
|
21 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
|
23 |
# Resample if needed
|
24 |
if sampling_rate != processor.feature_extractor.sampling_rate:
|
|
|
1 |
import torchaudio
|
2 |
import torch
|
3 |
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
|
4 |
+
import numpy as np
|
5 |
|
6 |
# Pretrained Whisper-small checkpoint fine-tuned for Uyghur (Common Voice 15).
_MODEL_ID = "ixxan/whisper-small-ug-cv-15"

# Load the processor (feature extractor + tokenizer) and the seq2seq model
# once at import time so every transcribe() call reuses them.
processor = AutoProcessor.from_pretrained(_MODEL_ID)
model = AutoModelForSpeechSeq2Seq.from_pretrained(_MODEL_ID)
|
9 |
|
10 |
|
11 |
+
def transcribe(audio_data) -> str:
|
12 |
"""
|
13 |
Transcribes audio to text using the Whisper model for Uyghur.
|
14 |
Args:
|
15 |
+
- audio_data: Gradio audio input
|
16 |
Returns:
|
17 |
- str: The transcription of the audio.
|
18 |
"""
|
|
|
19 |
|
20 |
# Load audio file
|
21 |
+
if not audio_data:
|
22 |
+
return "<<ERROR: Empty Audio Input>>"
|
23 |
+
|
24 |
+
if isinstance(audio_data, tuple):
|
25 |
+
# microphone
|
26 |
+
sampling_rate, audio_input = audio_data
|
27 |
+
audio_input = (audio_input / 32768.0).astype(np.float32)
|
28 |
+
|
29 |
+
if isinstance(audio_data, str):
|
30 |
+
# file upload
|
31 |
+
audio_input, sampling_rate = torchaudio.load(audio_data)
|
32 |
+
|
33 |
+
else:
|
34 |
+
return "<<ERROR: Invalid Audio Input Instance: {}>>".format(type(audio_data))
|
35 |
+
|
36 |
|
37 |
# Resample if needed
|
38 |
if sampling_rate != processor.feature_extractor.sampling_rate:
|