Spaces:
Runtime error
Runtime error
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,170 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# import gradio as gr
|
2 |
+
|
3 |
+
# gr.Interface.load("models/rohitp1/kkkh_whisper_small_distillation_att_loss_libri360_epochs_100_batch_4_concat_dataset").launch()
|
4 |
+
|
5 |
+
|
6 |
+
import gradio as gr
|
7 |
+
import os
|
8 |
+
import transformers
|
9 |
+
from transformers import pipeline, Wav2Vec2ForCTC,Wav2Vec2Processor
|
10 |
+
import time
|
11 |
+
import torch
|
12 |
+
|
13 |
+
# def greet_from_secret(ignored_param):
|
14 |
+
# name = os.environ.get('TOKEN')
|
15 |
+
# return
|
16 |
+
|
17 |
+
|
18 |
+
# Hugging Face access token, read from the TOKEN environment variable
# (None when the variable is not set).
auth_token = os.environ.get('TOKEN')


# Checkpoint identifiers on the Hugging Face Hub.
M1 = "rohitp1/dgx1_w2v2_base_teacher_student_distillation_mozilla_epochs_100_batch_16_concatenate_datasets"
M2 = "rohitp1/finetune_teacher_babble_noise_mozilla_200_epochs"
M3 = "rohitp1/finetune_teacher_clean_mozilla_200_epochs"

# Each checkpoint is loaded as a CTC model plus its matching processor.
model1 = Wav2Vec2ForCTC.from_pretrained(M1, use_auth_token=auth_token)
processor1 = Wav2Vec2Processor.from_pretrained(M1, use_auth_token=auth_token)

model2 = Wav2Vec2ForCTC.from_pretrained(M2, use_auth_token=auth_token)
processor2 = Wav2Vec2Processor.from_pretrained(M2, use_auth_token=auth_token)

model3 = Wav2Vec2ForCTC.from_pretrained(M3, use_auth_token=auth_token)
processor3 = Wav2Vec2Processor.from_pretrained(M3, use_auth_token=auth_token)


# Dynamically quantized (int8 Linear layers) copy of model 1.
# The source must be model1: the result is named quantized_model1 and is
# paired with processor1 below. Quantizing model3 here would silently serve
# a different checkpoint under the quantized option.
quantized_model1 = torch.quantization.quantize_dynamic(
    model1, {torch.nn.Linear}, dtype=torch.qint8
)


def _build_asr_pipeline(model, processor):
    """Wrap an already-loaded model and its processor in an ASR pipeline.

    The tokenizer and feature extractor are passed explicitly: when `model`
    is an object rather than a checkpoint name, pipeline() has no name to
    load them from.
    NOTE(review): support for a `processor=` keyword depends on the installed
    transformers release; the explicit form does not rely on it.
    """
    return pipeline(
        'automatic-speech-recognition',
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
    )


p1 = _build_asr_pipeline(model1, processor1)
p2 = _build_asr_pipeline(model2, processor2)
p3 = _build_asr_pipeline(model3, processor3)
p1_quant = _build_asr_pipeline(quantized_model1, processor1)
|
48 |
+
|
49 |
+
def transcribe(mic_input, upl_input, model_type):
    """Transcribe a recorded or uploaded audio clip with the selected model.

    Args:
        mic_input: Microphone recording; used whenever it is truthy.
        upl_input: Uploaded audio; used when there is no recording.
        model_type: Model name chosen in the UI. 'NoisyFinetuned' selects
            p2, 'CleanFinetuned' selects p3, and 'DistilledQuantised' or
            'DistilledAndQuantised' selects p1_quant. Any other value
            (including None) falls back to p1.

    Returns:
        A ``(text, time_taken)`` tuple: the transcription and the inference
        time in seconds, rounded to 4 decimal places.

    Raises:
        ValueError: If neither a recording nor an upload was supplied.
    """
    audio = mic_input or upl_input
    if not audio:
        # Fail with a clear message instead of an opaque pipeline error.
        raise ValueError("No audio provided: record a clip or upload a file first.")

    # NOTE(review): fixed delay kept from the original implementation; its
    # purpose is not evident from this file -- confirm whether it is needed.
    time.sleep(3)

    # Both spellings of the quantized option are accepted: the dropdown
    # emits "DistilledAndQuantised", while earlier code compared against
    # 'DistilledQuantised' and therefore never reached p1_quant.
    pipelines = {
        'NoisyFinetuned': p2,
        'CleanFinetuned': p3,
        'DistilledQuantised': p1_quant,
        'DistilledAndQuantised': p1_quant,
    }
    asr = pipelines.get(model_type, p1)

    started = time.perf_counter()
    text = asr(audio)["text"]
    elapsed = time.perf_counter() - started

    # Reported in seconds, matching the "Time Taken (in sec)" label; the
    # previous code divided by 60 and so displayed minutes.
    time_taken = round(elapsed, 4)
    return text, time_taken
|
68 |
+
|
69 |
+
|
70 |
+
|
71 |
+
# gr.Interface(
|
72 |
+
# fn=transcribe,
|
73 |
+
# inputs=[
|
74 |
+
# gr.inputs.Audio(source="microphone", type="filepath"),
|
75 |
+
# 'state'
|
76 |
+
# ],
|
77 |
+
# outputs=[
|
78 |
+
# "textbox",
|
79 |
+
# "state"
|
80 |
+
# ],
|
81 |
+
# live=False).launch()
|
82 |
+
|
83 |
+
|
84 |
+
# demo = gr.load(
|
85 |
+
# "huggingface/rohitp1/kkkh_whisper_small_distillation_att_loss_libri360_epochs_100_batch_4_concat_dataset",
|
86 |
+
# title="Speech-to-text",
|
87 |
+
# inputs="mic",
|
88 |
+
# description="Let me try to guess what you're saying!",
|
89 |
+
# api_key=os.environ.get("TOKEN")  # read the access token from the environment; never commit a literal token
|
90 |
+
# )
|
91 |
+
|
92 |
+
# demo.launch()
|
93 |
+
|
94 |
+
def clear_inputs_and_outputs():
    """Return the reset values applied by the Clear button.

    The list has five entries: four ``None`` placeholders with the model
    name "CleanFinetuned" in the third position.
    """
    blank = None
    default_model = "CleanFinetuned"
    return [blank, blank, default_model, blank, blank]
|
96 |
+
|
97 |
+
# Main function
|
98 |
+
if __name__ == "__main__":
    # Build the Gradio Blocks UI and launch it.
    # NOTE(review): the Row/Column nesting below was reconstructed from a
    # listing that had lost its indentation -- confirm the intended layout.
    demo = gr.Blocks()

    with demo:
        gr.Markdown(
            """
            <center><h1> Noise Robust English Automatic Speech Recognition LibriSpeech Dataset</h1></center> \
            This space is a demo of an English ASR model using Huggingface.<br> \
            In this space, you can record your voice or upload a wav file and the model will predict the text spoken in the audio<br><br>
            """
        )
        with gr.Row():
            # Inputs
            with gr.Column():
                mic_input = gr.Audio(source="microphone", type="filepath", label="Record your own voice")
                upl_input = gr.Audio(
                    source="upload", type="filepath", label="Upload a wav file"
                )

                with gr.Row():
                    # gr.Dropdown replaces the deprecated gr.inputs.Dropdown.
                    model_type = gr.Dropdown(
                        ["RobustDistillation", "NoisyFinetuned", "CleanFinetuned", "DistilledAndQuantised"],
                        label='Model Type',
                    )

                with gr.Row():
                    clr_btn = gr.Button(value="Clear", variant="secondary")
                    prd_btn = gr.Button(value="Predict")

            # Outputs
            with gr.Column():
                lbl_output = gr.Label(label="Transcription")

                with gr.Row():
                    time_output = gr.Label(label="Time Taken (in sec)")
                # with gr.Group():
                #     gr.Markdown("<center>Prediction per time slot</center>")
                #     plt_output = gr.Plot(
                #         label="Prediction per time slot", show_label=False
                #     )

        with gr.Row():
            # NOTE(review): only upl_input is listed as an input here, while
            # transcribe takes three arguments; this matters if example
            # caching or run-on-click is ever enabled -- confirm.
            gr.Examples(
                [
                    # os.path.join(os.path.dirname(__file__), "audio/sample1.wav"),
                    # os.path.join(os.path.dirname(__file__), "audio/sample2.wav"),
                    os.path.join(os.path.dirname(__file__), "audio/sample3.wav"),
                ],
                upl_input,
                [lbl_output, time_output],
                transcribe
            )
        # Credits
        with gr.Row():
            gr.Markdown(
                """
                <h4>Credits</h4>
                Author: Rohit Prasad <br>
                Check out the model <a href="https://huggingface.co/rohitp1/subh_whisper_small_distil_att_loss_mozilla_epochs_50_batch_8">here</a>
                """
            )

        # Clear resets both audio inputs, the model selector and both outputs.
        clr_btn.click(
            fn=clear_inputs_and_outputs,
            inputs=[],
            outputs=[mic_input, upl_input, model_type, lbl_output, time_output],
        )
        # Predict runs the transcription and fills the two output labels.
        prd_btn.click(
            fn=transcribe,
            inputs=[mic_input, upl_input, model_type],
            outputs=[lbl_output, time_output],
        )

    demo.launch(debug=True)
|