Spaces:

cdactvm
/

punjabi-asr-quantized

Runtime error

App Files Files Community

cdactvm committed on Feb 13

Commit

82814b2

verified ·

1 Parent(s): 50710a8

Update app.py

Browse files

Files changed (1) hide show

app.py +62 -44

app.py CHANGED Viewed

@@ -9,9 +9,43 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
 # Load processor & model
 model_name = "cdactvm/w2v-bert-punjabi"  # Change if using a Punjabi ASR model
 processor = Wav2Vec2BertProcessor.from_pretrained(model_name)
-model = Wav2Vec2BertForCTC.from_pretrained(model_name, torch_dtype=torch.bfloat16).to(device)
-def transcribe(audio_path):
     # Load audio file
     waveform, sample_rate = torchaudio.load(audio_path)
@@ -29,53 +63,37 @@ def transcribe(audio_path):
     # Get logits & transcribe
     with torch.no_grad():
-        logits = model(**inputs).logits
     predicted_ids = torch.argmax(logits, dim=-1)
     transcription = processor.batch_decode(predicted_ids)[0]
     return transcription
-# Gradio Interface
-app = gr.Interface(
-    fn=transcribe,
-    inputs=gr.Audio(sources="upload", type="filepath"),
-    outputs="text",
-    title="Punjabi Speech-to-Text",
-    description="Upload an audio file and get the transcription in Punjabi."
-)
 if __name__ == "__main__":
     app.launch()
-# import gradio as gr
-# import torch
-# from transformers import pipeline
-# # Set device
-# device = "cuda" if torch.cuda.is_available() else "cpu"
-# # Load ASR pipeline
-# asr_pipeline = pipeline(
-#     "automatic-speech-recognition",
-#     model="cdactvm/w2v-bert-punjabi",  # Replace with a Punjabi ASR model if available
-#     torch_dtype=torch.bfloat16,
-#     device=0 if torch.cuda.is_available() else -1  # GPU (0) or CPU (-1)
-# )
-# def transcribe(audio_path):
-#     # Run inference
-#     result = asr_pipeline(audio_path)
-#     return result["text"]
-# # Gradio Interface
-# app = gr.Interface(
-#     fn=transcribe,
-#     inputs=gr.Audio(sources="upload", type="filepath"),
-#     outputs="text",
-#     title="Punjabi Speech-to-Text",
-#     description="Upload an audio file and get the transcription in Punjabi."
-# )
-# if __name__ == "__main__":
-#     app.launch()

 # Load processor & model
 model_name = "cdactvm/w2v-bert-punjabi"  # Change if using a Punjabi ASR model
 processor = Wav2Vec2BertProcessor.from_pretrained(model_name)
+# Load the original (non-quantized) model.
+# NOTE(review): no torch_dtype is passed and the model is not moved to `device`
+# here, so code that feeds it must match whatever dtype/device it ends up with.
+original_model=Wav2Vec2BertForCTC.from_pretrained(model_name)
+# Explicitly allow Wav2Vec2BertForCTC during unpickling.
+# NOTE(review): the safe-globals allowlist is documented for weights_only=True
+# loads; with weights_only=False below it presumably has no effect - confirm.
+torch.serialization.add_safe_globals([Wav2Vec2BertForCTC])
+# Load the full quantized model (a whole pickled module, not a state dict).
+# NOTE(review): the path is the string literal "model_name", not the
+# `model_name` variable - verify that this is the intended checkpoint file.
+# NOTE(review): weights_only=False performs full unpickling; only load a
+# checkpoint file that is trusted.
+quantized_model = torch.load("model_name", weights_only=False)
+# Switch the loaded module to inference mode.
+quantized_model.eval()
+#####################################################
+# recognize speech using original model
+def transcribe_original_model(audio_path):
+    """Transcribe an audio file with the original (non-quantized) model.
+
+    The audio is down-mixed to mono, resampled to 16 kHz, converted to model
+    inputs by ``processor``, run through ``original_model`` and decoded by
+    taking the argmax over the logits and passing it to
+    ``processor.batch_decode``.
+
+    Args:
+        audio_path: Path of the audio file, passed to ``torchaudio.load``.
+
+    Returns:
+        The decoded text of the first (only) item in the batch.
+    """
+    # Load audio file
+    waveform, sample_rate = torchaudio.load(audio_path)
+    # Convert stereo to mono (if needed)
+    if waveform.shape[0] > 1:
+        waveform = torch.mean(waveform, dim=0, keepdim=True)
+    # Resample to 16kHz
+    if sample_rate != 16000:
+        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
+        waveform = resampler(waveform)
+    # Process audio
+    inputs = processor(waveform.squeeze(0), sampling_rate=16000, return_tensors="pt")
+    # Follow the model's own device and dtype instead of hard-coding
+    # `device`/bfloat16: original_model is loaded without an explicit
+    # torch_dtype or .to(device), so a fixed cast can mismatch its weights.
+    # Non-floating tensors (e.g. masks) are moved but keep their dtype.
+    model_device = original_model.device
+    model_dtype = original_model.dtype
+    inputs = {
+        key: (
+            val.to(model_device, dtype=model_dtype)
+            if val.is_floating_point()
+            else val.to(model_device)
+        )
+        for key, val in inputs.items()
+    }
+    # Get logits & transcribe
+    with torch.no_grad():
+        logits = original_model(**inputs).logits
+    predicted_ids = torch.argmax(logits, dim=-1)
+    transcription = processor.batch_decode(predicted_ids)[0]
+    return transcription
+# recognize speech using quantized model.
+def transcribe_quantized_model(audio_path):
     # Load audio file
     waveform, sample_rate = torchaudio.load(audio_path)
     # Get logits & transcribe
     with torch.no_grad():
+        logits = quantized_model(**inputs).logits
     predicted_ids = torch.argmax(logits, dim=-1)
     transcription = processor.batch_decode(predicted_ids)[0]
     return transcription
+def select_lng(lng, mic=None, file=None):
+    """Route an audio input to the transcription function chosen by ``lng``.
+
+    Args:
+        lng: Model selector, either "original_model" or "quantized_model".
+        mic: Path of a microphone recording, or None.
+        file: Path of an uploaded file, or None; used only when ``mic`` is None.
+
+    Returns:
+        The transcription text, or a user-facing message when no audio was
+        supplied or ``lng`` does not name a known model.
+    """
+    # A microphone recording takes precedence over an uploaded file.
+    audio = mic if mic is not None else file
+    if audio is None:
+        return "You must either provide a mic recording or a file"
+    if lng == "original_model":
+        return transcribe_original_model(audio)
+    if lng == "quantized_model":
+        return transcribe_quantized_model(audio)
+    # An unrecognized or unselected choice used to fall through and return
+    # None, leaving the output empty; report the problem instead.
+    return "You must select either original_model or quantized_model"
+# Gradio Interface
+# Two input components are declared (model choice, audio file path); their
+# values are handed to select_lng in that order.
+demo = gr.Interface(
+    fn=select_lng,
+    inputs=[
+        gr.Dropdown(["original_model", "quantized_model"], label="Select Model"),
+        gr.Audio(sources=["microphone", "upload"], type="filepath"),
+    ],
+    outputs=["textbox"],
+    title="Automatic Speech Recognition",
+    description="Upload an audio file and get the transcription in Punjabi.",
+)
+# The launch guard below calls app.launch(); bind that name to the interface
+# so it resolves instead of raising NameError.
+app = demo
 if __name__ == "__main__":
     app.launch()