rohitp1 committed
Commit abc4a4a · 1 Parent(s): ffa230f

Create app.py

Files changed (1)
  1. app.py +170 -0
app.py ADDED
@@ -0,0 +1,170 @@
# import gradio as gr

# gr.Interface.load("models/rohitp1/kkkh_whisper_small_distillation_att_loss_libri360_epochs_100_batch_4_concat_dataset").launch()

import gradio as gr
import os
import transformers
from transformers import pipeline, Wav2Vec2ForCTC, Wav2Vec2Processor
import time
import torch

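# This Space compares several noise-robust wav2vec2 ASR checkpoints: a
# teacher-student distilled model, two fine-tuned teachers, and a dynamically
# quantised variant. The user records or uploads audio, picks a model, and
# gets back the transcription plus the inference time.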
# def greet_from_secret(ignored_param):
#     name = os.environ.get('TOKEN')
#     return

# The Hugging Face access token is read from the Space's TOKEN secret.
auth_token = os.environ.get('TOKEN')

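# The three checkpoints served by the "Model Type" dropdown below:
#   M1 -> default "RobustDistillation" choice (teacher-student distilled model)
#   M2 -> "NoisyFinetuned" (teacher fine-tuned on babble-noise Mozilla data)
#   M3 -> "CleanFinetuned" (teacher fine-tuned on clean Mozilla data)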
M1 = "rohitp1/dgx1_w2v2_base_teacher_student_distillation_mozilla_epochs_100_batch_16_concatenate_datasets"
M2 = "rohitp1/finetune_teacher_babble_noise_mozilla_200_epochs"
M3 = "rohitp1/finetune_teacher_clean_mozilla_200_epochs"

model1 = Wav2Vec2ForCTC.from_pretrained(M1, use_auth_token=auth_token)
processor1 = Wav2Vec2Processor.from_pretrained(M1, use_auth_token=auth_token)

model2 = Wav2Vec2ForCTC.from_pretrained(M2, use_auth_token=auth_token)
processor2 = Wav2Vec2Processor.from_pretrained(M2, use_auth_token=auth_token)

model3 = Wav2Vec2ForCTC.from_pretrained(M3, use_auth_token=auth_token)
processor3 = Wav2Vec2Processor.from_pretrained(M3, use_auth_token=auth_token)

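# torch.quantization.quantize_dynamic swaps the model's Linear layers for
# dynamically quantised int8 versions, shrinking the weights and speeding up
# CPU inference at a small cost in accuracy.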
# make quantized model from the distilled checkpoint (used by the
# "DistilledAndQuantised" option below)
quantized_model1 = torch.quantization.quantize_dynamic(
    model1, {torch.nn.Linear}, dtype=torch.qint8
)

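# Wrap each model in an ASR pipeline. The pipeline factory takes the tokenizer
# and feature extractor separately, so each Wav2Vec2Processor is split into its
# two components.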
p1 = pipeline('automatic-speech-recognition', model=model1, tokenizer=processor1.tokenizer, feature_extractor=processor1.feature_extractor)
p2 = pipeline('automatic-speech-recognition', model=model2, tokenizer=processor2.tokenizer, feature_extractor=processor2.feature_extractor)
p3 = pipeline('automatic-speech-recognition', model=model3, tokenizer=processor3.tokenizer, feature_extractor=processor3.feature_extractor)
p1_quant = pipeline('automatic-speech-recognition', model=quantized_model1, tokenizer=processor1.tokenizer, feature_extractor=processor1.feature_extractor)

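# transcribe() prefers the microphone clip over the uploaded file, dispatches
# to the pipeline selected in the "Model Type" dropdown, and returns the
# transcription together with the elapsed inference time.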
def transcribe(mic_input, upl_input, model_type):
    if mic_input:
        audio = mic_input
    else:
        audio = upl_input
    time.sleep(3)
    st_time = time.time()
    if model_type == 'NoisyFinetuned':
        text = p2(audio)["text"]
    elif model_type == 'CleanFinetuned':
        text = p3(audio)["text"]
    elif model_type == 'DistilledAndQuantised':
        text = p1_quant(audio)['text']
    else:
        text = p1(audio)["text"]
    end_time = time.time()
    # state = text + " "
    # report seconds to match the "Time Taken (in sec)" label
    time_taken = round(end_time - st_time, 4)
    return text, time_taken


# gr.Interface(
#     fn=transcribe,
#     inputs=[
#         gr.inputs.Audio(source="microphone", type="filepath"),
#         'state'
#     ],
#     outputs=[
#         "textbox",
#         "state"
#     ],
#     live=False).launch()


# demo = gr.load(
#     "huggingface/rohitp1/kkkh_whisper_small_distillation_att_loss_libri360_epochs_100_batch_4_concat_dataset",
#     title="Speech-to-text",
#     inputs="mic",
#     description="Let me try to guess what you're saying!",
#     api_key="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"
# )

# demo.launch()

# Values used to reset each component when the Clear button is pressed.
def clear_inputs_and_outputs():
    return [None, None, "CleanFinetuned", None, None]

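# The Gradio Blocks UI below has an input column (microphone/upload audio,
# model selector, Clear/Predict buttons), an output column (transcription and
# timing labels), a bundled example clip, and a credits section.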
# Main function
if __name__ == "__main__":
    demo = gr.Blocks()

    with demo:
        gr.Markdown(
            """
            <center><h1>Noise-Robust English Automatic Speech Recognition: LibriSpeech Dataset</h1></center>
            This Space is a demo of an English ASR model using Hugging Face.<br>
            In this Space, you can record your voice or upload a wav file, and the model will predict the text spoken in the audio.<br><br>
            """
        )
        with gr.Row():
            ## Input
            with gr.Column():
                mic_input = gr.Audio(source="microphone", type="filepath", label="Record your own voice")
                upl_input = gr.Audio(
                    source="upload", type="filepath", label="Upload a wav file"
                )

                with gr.Row():
                    model_type = gr.Dropdown(
                        ["RobustDistillation", "NoisyFinetuned", "CleanFinetuned", "DistilledAndQuantised"],
                        label='Model Type'
                    )

                with gr.Row():
                    clr_btn = gr.Button(value="Clear", variant="secondary")
                    prd_btn = gr.Button(value="Predict")

            # Outputs
            with gr.Column():
                lbl_output = gr.Label(label="Transcription")

                with gr.Row():
                    time_output = gr.Label(label="Time Taken (in sec)")
                # with gr.Group():
                #     gr.Markdown("<center>Prediction per time slot</center>")
                #     plt_output = gr.Plot(
                #         label="Prediction per time slot", show_label=False
                #     )

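        # Example clip expected to ship with the Space; selecting it fills the
        # upload input above.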
        with gr.Row():
            gr.Examples(
                [
                    # os.path.join(os.path.dirname(__file__), "audio/sample1.wav"),
                    # os.path.join(os.path.dirname(__file__), "audio/sample2.wav"),
                    os.path.join(os.path.dirname(__file__), "audio/sample3.wav"),
                ],
                upl_input,
                [lbl_output, time_output],
                transcribe,
            )
        # Credits
        with gr.Row():
            gr.Markdown(
                """
                <h4>Credits</h4>
                Author: Rohit Prasad <br>
                Check out the model <a href="https://huggingface.co/rohitp1/subh_whisper_small_distil_att_loss_mozilla_epochs_50_batch_8">here</a>
                """
            )

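        # Wire up the buttons: Clear resets every input and output, Predict
        # runs transcribe() on the current audio and model selection.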
        clr_btn.click(
            fn=clear_inputs_and_outputs,
            inputs=[],
            outputs=[mic_input, upl_input, model_type, lbl_output, time_output],
        )
        prd_btn.click(
            fn=transcribe,
            inputs=[mic_input, upl_input, model_type],
            outputs=[lbl_output, time_output],
        )

    demo.launch(debug=True)