Irpan committed
Commit 3a18b3b · 1 Parent(s): 1959ce1

Files changed (2):
  1. app.py +8 -2
  2. asr.py +49 -11
app.py CHANGED
@@ -1,11 +1,17 @@
 import gradio as gr
-from asr import transcribe
+import asr
 # from tts import synthesize


 mms_transcribe = gr.Interface(
-    fn=transcribe,
+    fn=asr.transcribe,
     inputs=[
+        gr.Dropdown(
+            choices=list(asr.models_info.keys()),  # registry keys are the model ids
+            label="Select Model for ASR",
+            value="ixxan/whisper-small-uyghur-common-voice",  # must be an uncommented registry key
+            interactive=True
+        ),
        gr.Audio()
     ],
     outputs="text",
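
Note: asr.models_info (defined in asr.py below) is a plain dict keyed by model id, so the dropdown's choices are its keys, and the default value has to be one of the uncommented registry entries or the models_info[model_id] lookup inside transcribe raises a KeyError. A quick check of that assumption:

    import asr

    # The registry keys double as the model ids shown in the dropdown.
    print(list(asr.models_info.keys()))
    # ['openai/whisper-small-uzbek', 'ixxan/whisper-small-thugy20',
    #  'ixxan/whisper-small-uyghur-common-voice', 'facebook/mms-1b-all']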
asr.py CHANGED
@@ -1,15 +1,45 @@
 import torchaudio
 import torch
-from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
+from transformers import (
+    WhisperProcessor,
+    AutoProcessor,
+    AutoModelForSpeechSeq2Seq,
+    AutoModelForCTC,
+    Wav2Vec2Processor,
+    Wav2Vec2ForCTC
+)
 import numpy as np

 # Load processor and model
-processor = AutoProcessor.from_pretrained("ixxan/whisper-small-common-voice-ug")
-model = AutoModelForSpeechSeq2Seq.from_pretrained("ixxan/whisper-small-common-voice-ug")
+models_info = {
+    "openai/whisper-small-uzbek": {
+        "processor": WhisperProcessor.from_pretrained("openai/whisper-small", language="uzbek", task="transcribe"),
+        "model": AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-small"),
+        "ctc_model": False
+    },
+    "ixxan/whisper-small-thugy20": {
+        "processor": AutoProcessor.from_pretrained("ixxan/whisper-small-thugy20"),
+        "model": AutoModelForSpeechSeq2Seq.from_pretrained("ixxan/whisper-small-thugy20"),
+        "ctc_model": False
+    },
+    "ixxan/whisper-small-uyghur-common-voice": {
+        "processor": AutoProcessor.from_pretrained("ixxan/whisper-small-uyghur-common-voice"),
+        "model": AutoModelForSpeechSeq2Seq.from_pretrained("ixxan/whisper-small-uyghur-common-voice"),
+        "ctc_model": False
+    },
+    "facebook/mms-1b-all": {
+        "processor": AutoProcessor.from_pretrained("facebook/mms-1b-all", target_lang='uig-script_arabic'),
+        "model": AutoModelForCTC.from_pretrained("facebook/mms-1b-all", target_lang='uig-script_arabic', ignore_mismatched_sizes=True),
+        "ctc_model": True
+    },
+    # "ixxan/wav2vec2-large-mms-1b-uyghur-latin": {
+    #     "processor": Wav2Vec2Processor.from_pretrained("ixxan/wav2vec2-large-mms-1b-uyghur-latin", target_lang='uig-script_latin'),
+    #     "model": Wav2Vec2ForCTC.from_pretrained("ixxan/wav2vec2-large-mms-1b-uyghur-latin"),
+    #     "ctc_model": True
+    # },
+}

-target_sr = processor.feature_extractor.sampling_rate
-
-def transcribe(audio_data) -> str:
+def transcribe(audio_data, model_id) -> str:
     """
     Transcribes audio to text using the Whisper model for Uyghur.
     Args:
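
Note: the registry above instantiates every processor/model pair at import time, so all checkpoints, including the 1B-parameter MMS model, are downloaded and held in memory before the first request. A lazy-loading alternative is sketched below under stated assumptions; load_model is an illustrative name, not part of this commit, and it covers only the seq2seq checkpoints:

    from functools import lru_cache

    from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor

    @lru_cache(maxsize=None)
    def load_model(model_id: str):
        # Hypothetical lazy loader: fetch a checkpoint the first time it is
        # selected, then serve the cached pair on every later call.
        processor = AutoProcessor.from_pretrained(model_id)
        model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id)
        return processor, model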
@@ -35,13 +65,18 @@ def transcribe(audio_data) -> str:
         return "<<ERROR: Invalid Audio Input Instance: {}>>".format(type(audio_data))


+    model = models_info[model_id]["model"]
+    processor = models_info[model_id]["processor"]
+    target_sr = processor.feature_extractor.sampling_rate
+    ctc_model = models_info[model_id]["ctc_model"]
+
     # Resample if needed
     if sampling_rate != target_sr:
         resampler = torchaudio.transforms.Resample(sampling_rate, target_sr)
         audio_input = resampler(audio_input)

     # Preprocess the audio input
-    inputs = processor(audio_input.squeeze(), sampling_rate=target_sr, return_tensors="pt")
+    inputs = processor(audio_input.squeeze(), sampling_rate=target_sr, return_tensors="pt", padding=True)

     # Move model to GPU if available
     device = "cuda" if torch.cuda.is_available() else "cpu"
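
Note: target_sr is now taken from the selected model's feature extractor; both the Whisper and MMS checkpoints expect 16 kHz input. The resampling step in isolation, using an assumed 44.1 kHz stand-in waveform:

    import torch
    import torchaudio

    waveform = torch.zeros(1, 44100)  # one second at 44.1 kHz (illustrative)
    resampler = torchaudio.transforms.Resample(44100, 16000)
    print(resampler(waveform).shape)  # torch.Size([1, 16000])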
@@ -50,9 +85,12 @@ def transcribe(audio_data) -> str:

     # Generate transcription
     with torch.no_grad():
-        generated_ids = model.generate(inputs["input_features"], max_length=225)
-
-        # Decode the output to get the transcription text
-        transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+        if ctc_model:
+            logits = model(**inputs).logits
+            predicted_ids = torch.argmax(logits, dim=-1)
+            transcription = processor.batch_decode(predicted_ids)[0]
+        else:
+            generated_ids = model.generate(inputs["input_features"], max_length=225)
+            transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

     return transcription
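
Note: the branch mirrors the two decoding styles in the registry: the MMS checkpoint is a CTC model, so a single forward pass yields per-frame logits that are greedily argmax-decoded, while the Whisper checkpoints generate token ids autoregressively through generate(). A hedged usage sketch of the updated entry point, assuming transcribe accepts gr.Audio's default (sample_rate, numpy array) tuple:

    import numpy as np

    import asr

    # One second of silence as a stand-in waveform (illustrative only).
    audio = (16000, np.zeros(16000, dtype=np.float32))
    print(asr.transcribe(audio, "facebook/mms-1b-all"))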
 