Irpan committed
Commit 6502e85 · 1 Parent(s): 2b3019f
Files changed (2)
  1. app.py +10 -6
  2. asr.py +6 -11
app.py CHANGED
@@ -6,10 +6,14 @@ import util
 mms_transcribe = gr.Interface(
     fn=asr.transcribe,
     inputs=[
-        gr.Audio(),
+        gr.Audio(
+            label="Record or Upload Uyghur Audio",
+            sources=["microphone", "upload"],
+            type="filepath",
+        ),
         gr.Dropdown(
             choices=[model for model in asr.models_info],
-            label="Select a Model for ASR",
+            label="Select a Model",
             value="ixxan/wav2vec2-large-mms-1b-uyghur-latin",
             interactive=True
         ),
@@ -19,7 +23,7 @@ mms_transcribe = gr.Interface(
         gr.Textbox(label="Uyghur Latin Transcription"),
     ],
     examples=util.asr_examples,
-    title="Speech-to-text",
+    title="Speech-To-Text",
     description=(
         "Transcribe Uyghur speech audio from a microphone or input file."
     ),
@@ -32,7 +36,7 @@ mms_synthesize = gr.Interface(
         gr.Text(label="Input text"),
         gr.Dropdown(
             choices=[model for model in tts.models_info],
-            label="Select a Model for TTS",
+            label="Select a Model",
             value="Meta-MMS",
             interactive=True
         )
@@ -41,7 +45,7 @@ mms_synthesize = gr.Interface(
         gr.Audio(label="Generated Audio"),
     ],
     examples=util.tts_examples,
-    title="Text-to-speech",
+    title="Text-To-Speech",
     description=(
         "Generate audio from input Uyghur text."
     ),
@@ -50,7 +54,7 @@ mms_synthesize = gr.Interface(
 
 tabbed_interface = gr.TabbedInterface(
     [mms_transcribe, mms_synthesize],
-    ["Speech-to-text", "Text-to-speech"],
+    ["Speech-To-Text", "Text-To-Speech"],
 )
 
 with gr.Blocks() as demo:
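
Note on the gr.Audio change: with type="filepath", Gradio saves microphone recordings to a temporary file as well, so asr.transcribe should now always receive a str path (or None when no audio is supplied), and the tuple branch in asr.py becomes unreachable. A minimal standalone sketch of that behavior, not part of this commit; describe_input and its wiring are illustrative only:

# Standalone sketch (illustrative, not from this repo): what a callback
# receives from gr.Audio(type="filepath"). Both microphone recordings and
# uploads arrive as a filepath string; a missing input arrives as None.
import gradio as gr

def describe_input(audio_path):
    if audio_path is None:
        return "No audio provided"
    return "Got a {}: {}".format(type(audio_path).__name__, audio_path)

demo = gr.Interface(
    fn=describe_input,
    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
    outputs=gr.Textbox(),
)

if __name__ == "__main__":
    demo.launch()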
asr.py CHANGED
@@ -58,36 +58,31 @@ models_info = {
     # return transcriptions
 
 def transcribe(audio_data, model_id) -> str:
-    # Load audio file
-    if not audio_data:
-        return "<<ERROR: Empty Audio Input>>"
-
+    # Load user audio
     if isinstance(audio_data, tuple):
         # microphone
         sampling_rate, audio_input = audio_data
         audio_input = (audio_input / 32768.0).astype(np.float32)
-
     elif isinstance(audio_data, str):
         # file upload
         audio_input, sampling_rate = torchaudio.load(audio_data)
-
-    else:
+    else:
         return "<<ERROR: Invalid Audio Input Instance: {}>>".format(type(audio_data))
 
-
     model = models_info[model_id]["model"]
     processor = models_info[model_id]["processor"]
-    target_sr = 16000 #processor.feature_extractor.sampling_rate
+    target_sr = processor.feature_extractor.sampling_rate
     ctc_model = models_info[model_id]["ctc_model"]
-
+    print(target_sr)
 
     # Resample if needed
    if sampling_rate != target_sr:
         resampler = torchaudio.transforms.Resample(sampling_rate, target_sr)
         audio_input = resampler(audio_input)
+        sampling_rate = target_sr
 
     # Preprocess the audio input
-    inputs = processor(audio_input.squeeze(), sampling_rate=target_sr, return_tensors="pt")
+    inputs = processor(audio_input.squeeze(), sampling_rate=sampling_rate, return_tensors="pt")
 
     # Move model to GPU if available
     device = "cuda" if torch.cuda.is_available() else "cpu"
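
The asr.py change replaces the hardcoded 16 kHz target with the rate the processor actually expects and keeps sampling_rate in sync after resampling, so the later processor call cannot silently use a stale rate. A standalone sketch of that resample-then-preprocess path, assuming the checkpoint app.py already defaults to; "example.wav" is illustrative:

# Standalone sketch of the preprocessing step this commit touches.
# The checkpoint name is the one used in app.py; the input file is made up.
import torchaudio
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("ixxan/wav2vec2-large-mms-1b-uyghur-latin")
target_sr = processor.feature_extractor.sampling_rate  # 16000 for MMS-style checkpoints

waveform, sampling_rate = torchaudio.load("example.wav")  # shape: (channels, frames)
if sampling_rate != target_sr:
    waveform = torchaudio.transforms.Resample(sampling_rate, target_sr)(waveform)
    sampling_rate = target_sr  # keep the variable in sync, as the commit now does

inputs = processor(waveform.squeeze(), sampling_rate=sampling_rate, return_tensors="pt")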