rohitp1 committed
Commit abc4a4a · 1 Parent(s): ffa230f

Create app.py

Files changed (1)
  1. app.py +170 -0
app.py ADDED
@@ -0,0 +1,170 @@
# import gradio as gr

# gr.Interface.load("models/rohitp1/kkkh_whisper_small_distillation_att_loss_libri360_epochs_100_batch_4_concat_dataset").launch()

import gradio as gr
import os
import transformers
from transformers import pipeline, Wav2Vec2ForCTC, Wav2Vec2Processor
import time
import torch

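# This Space compares several noise-robust wav2vec2 ASR checkpoints: a
# teacher-student distilled model, two fine-tuned teachers, and a dynamically
# quantised variant. The user records or uploads audio, picks a model, and
# gets back the transcription plus the inference time.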
# def greet_from_secret(ignored_param):
#     name = os.environ.get('TOKEN')
#     return

# The Hugging Face access token is read from the Space's TOKEN secret.
auth_token = os.environ.get('TOKEN')

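# The three checkpoints served by the "Model Type" dropdown below:
#   M1 -> default "RobustDistillation" choice (teacher-student distilled model)
#   M2 -> "NoisyFinetuned" (teacher fine-tuned on babble-noise Mozilla data)
#   M3 -> "CleanFinetuned" (teacher fine-tuned on clean Mozilla data)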
M1 = "rohitp1/dgx1_w2v2_base_teacher_student_distillation_mozilla_epochs_100_batch_16_concatenate_datasets"
M2 = "rohitp1/finetune_teacher_babble_noise_mozilla_200_epochs"
M3 = "rohitp1/finetune_teacher_clean_mozilla_200_epochs"

model1 = Wav2Vec2ForCTC.from_pretrained(M1, use_auth_token=auth_token)
processor1 = Wav2Vec2Processor.from_pretrained(M1, use_auth_token=auth_token)

model2 = Wav2Vec2ForCTC.from_pretrained(M2, use_auth_token=auth_token)
processor2 = Wav2Vec2Processor.from_pretrained(M2, use_auth_token=auth_token)

model3 = Wav2Vec2ForCTC.from_pretrained(M3, use_auth_token=auth_token)
processor3 = Wav2Vec2Processor.from_pretrained(M3, use_auth_token=auth_token)

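# torch.quantization.quantize_dynamic swaps the model's Linear layers for
# dynamically quantised int8 versions, shrinking the weights and speeding up
# CPU inference at a small cost in accuracy.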
# make quantized model from the distilled checkpoint (used by the
# "DistilledAndQuantised" option below)
quantized_model1 = torch.quantization.quantize_dynamic(
    model1, {torch.nn.Linear}, dtype=torch.qint8
)

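# Wrap each model in an ASR pipeline. The pipeline factory takes the tokenizer
# and feature extractor separately, so each Wav2Vec2Processor is split into its
# two components.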
p1 = pipeline('automatic-speech-recognition', model=model1, tokenizer=processor1.tokenizer, feature_extractor=processor1.feature_extractor)
p2 = pipeline('automatic-speech-recognition', model=model2, tokenizer=processor2.tokenizer, feature_extractor=processor2.feature_extractor)
p3 = pipeline('automatic-speech-recognition', model=model3, tokenizer=processor3.tokenizer, feature_extractor=processor3.feature_extractor)
p1_quant = pipeline('automatic-speech-recognition', model=quantized_model1, tokenizer=processor1.tokenizer, feature_extractor=processor1.feature_extractor)

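# transcribe() prefers the microphone clip over the uploaded file, dispatches
# to the pipeline selected in the "Model Type" dropdown, and returns the
# transcription together with the elapsed inference time.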
def transcribe(mic_input, upl_input, model_type):
    if mic_input:
        audio = mic_input
    else:
        audio = upl_input
    time.sleep(3)
    st_time = time.time()
    if model_type == 'NoisyFinetuned':
        text = p2(audio)["text"]
    elif model_type == 'CleanFinetuned':
        text = p3(audio)["text"]
    elif model_type == 'DistilledAndQuantised':
        text = p1_quant(audio)['text']
    else:
        text = p1(audio)["text"]
    end_time = time.time()
    # state = text + " "
    # report seconds to match the "Time Taken (in sec)" label
    time_taken = round(end_time - st_time, 4)
    return text, time_taken


# gr.Interface(
#     fn=transcribe,
#     inputs=[
#         gr.inputs.Audio(source="microphone", type="filepath"),
#         'state'
#     ],
#     outputs=[
#         "textbox",
#         "state"
#     ],
#     live=False).launch()


# demo = gr.load(
#     "huggingface/rohitp1/kkkh_whisper_small_distillation_att_loss_libri360_epochs_100_batch_4_concat_dataset",
#     title="Speech-to-text",
#     inputs="mic",
#     description="Let me try to guess what you're saying!",
#     api_key="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"
# )

# demo.launch()

# Values used to reset each component when the Clear button is pressed.
def clear_inputs_and_outputs():
    return [None, None, "CleanFinetuned", None, None]

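# The Gradio Blocks UI below has an input column (microphone/upload audio,
# model selector, Clear/Predict buttons), an output column (transcription and
# timing labels), a bundled example clip, and a credits section.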
# Main function
if __name__ == "__main__":
    demo = gr.Blocks()

    with demo:
        gr.Markdown(
            """
            <center><h1>Noise-Robust English Automatic Speech Recognition: LibriSpeech Dataset</h1></center>
            This Space is a demo of an English ASR model using Hugging Face.<br>
            In this Space, you can record your voice or upload a wav file, and the model will predict the text spoken in the audio.<br><br>
            """
        )
        with gr.Row():
            ## Input
            with gr.Column():
                mic_input = gr.Audio(source="microphone", type="filepath", label="Record your own voice")
                upl_input = gr.Audio(
                    source="upload", type="filepath", label="Upload a wav file"
                )

                with gr.Row():
                    model_type = gr.Dropdown(
                        ["RobustDistillation", "NoisyFinetuned", "CleanFinetuned", "DistilledAndQuantised"],
                        label='Model Type'
                    )

                with gr.Row():
                    clr_btn = gr.Button(value="Clear", variant="secondary")
                    prd_btn = gr.Button(value="Predict")

            # Outputs
            with gr.Column():
                lbl_output = gr.Label(label="Transcription")

                with gr.Row():
                    time_output = gr.Label(label="Time Taken (in sec)")
                # with gr.Group():
                #     gr.Markdown("<center>Prediction per time slot</center>")
                #     plt_output = gr.Plot(
                #         label="Prediction per time slot", show_label=False
                #     )

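        # Example clip expected to ship with the Space; selecting it fills the
        # upload input above.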
        with gr.Row():
            gr.Examples(
                [
                    # os.path.join(os.path.dirname(__file__), "audio/sample1.wav"),
                    # os.path.join(os.path.dirname(__file__), "audio/sample2.wav"),
                    os.path.join(os.path.dirname(__file__), "audio/sample3.wav"),
                ],
                upl_input,
                [lbl_output, time_output],
                transcribe,
            )
        # Credits
        with gr.Row():
            gr.Markdown(
                """
                <h4>Credits</h4>
                Author: Rohit Prasad <br>
                Check out the model <a href="https://huggingface.co/rohitp1/subh_whisper_small_distil_att_loss_mozilla_epochs_50_batch_8">here</a>
                """
            )

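        # Wire up the buttons: Clear resets every input and output, Predict
        # runs transcribe() on the current audio and model selection.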
        clr_btn.click(
            fn=clear_inputs_and_outputs,
            inputs=[],
            outputs=[mic_input, upl_input, model_type, lbl_output, time_output],
        )
        prd_btn.click(
            fn=transcribe,
            inputs=[mic_input, upl_input, model_type],
            outputs=[lbl_output, time_output],
        )

    demo.launch(debug=True)