Harshad Bhandwaldar committed
Commit a606014 · 1 Parent(s): 0a18830

model added

Files changed (1): app.py +23 -23
app.py CHANGED
@@ -10,21 +10,21 @@ model = nemo_asr.models.EncDecCTCModel.from_pretrained(
     model_name="stt_en_quartznet15x5"
 )
 
-# def speech_youtube(x):
-#     data = pytube.YouTube(x)
-#     audio = data.streams.get_audio_only()
-#     text = model.transcribe(audio.download())
-#     return text['text']
+def speech_youtube(x):
+    data = pytube.YouTube(x)
+    audio = data.streams.get_audio_only()
+    text = model.transcribe([audio.download()])
+    return text
 
 def speech_file(x):
-    print(x)
+    # print(x)
     text = model.transcribe([f"{x}"])
-    print(text)
+    # print(text)
     return text
 
-# def speech_record(x):
-#     text = model.transcribe(x)
-#     return text['text']
+def speech_record(x):
+    text = model.transcribe([f"{x}"])
+    return text
 
 css = """
 .gradio-container {
@@ -112,23 +112,23 @@ css = """
 with gr.Blocks(css = css) as demo:
     gr.Markdown(
     """
-    # Speech to Text Transcriptions!
-    This demo uses the OpenAI whisper model which is trained on a large dataset of diverse audio that can perform multilingual speech recognition. The computation time is dependent on the length of the audio.
+    # Speech to Text - NVIDIA QuartzNet15x5 (English)
+    QuartzNet is a Jasper-like network that uses separable convolutions and larger filter sizes. It has comparable accuracy to Jasper while having far fewer parameters. This particular model has 15 blocks, each repeated 5 times.
     """)
-    # with gr.Tab("YouTube"):
-    #     audio_input = gr.Textbox(label="YouTube Link", placeholder="paste the youtube link here")
-    #     text_output = gr.Textbox(label="Transcription", show_label=False)
-    #     youtube_button = gr.Button("Transcribe")
+    with gr.Tab("YouTube"):
+        audio_input = gr.Textbox(label="YouTube Link", placeholder="paste the youtube link here")
+        text_output = gr.Textbox(label="Transcription", show_label=False)
+        youtube_button = gr.Button("Transcribe")
     with gr.Tab("Audio File"):
         with gr.Row().style(equal_height=True):
             audio_input2 = gr.Audio(label="Audio File", type="filepath")
             text_output2 = gr.Textbox(label="Transcription", show_label=False)
         file_button = gr.Button("Transcribe")
-    # with gr.Tab("Record"):
-    #     with gr.Row().style(equal_height=True):
-    #         audio_input3 = gr.Audio(label="Input Audio", source="microphone", type="filepath")
-    #         text_output3 = gr.Textbox(label="Transcription", show_label=False)
-    #     rec_button = gr.Button("Transcribe")
+    with gr.Tab("Record"):
+        with gr.Row().style(equal_height=True):
+            audio_input3 = gr.Audio(label="Input Audio", source="microphone", type="filepath")
+            text_output3 = gr.Textbox(label="Transcription", show_label=False)
+        rec_button = gr.Button("Transcribe")
     gr.HTML('''
     <div class="footer">
         <p></a>
@@ -136,8 +136,8 @@ with gr.Blocks(css = css) as demo:
     </div>
     ''')
 
-    # youtube_button.click(speech_youtube, inputs=audio_input, outputs=text_output)
+    youtube_button.click(speech_youtube, inputs=audio_input, outputs=text_output)
     file_button.click(speech_file, inputs=audio_input2, outputs=text_output2)
-    # rec_button.click(speech_record, inputs=audio_input3, outputs=text_output3)
+    rec_button.click(speech_record, inputs=audio_input3, outputs=text_output3)
 
 demo.launch()
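
For reference, a minimal standalone sketch of the YouTube path this commit enables, outside of Gradio. It assumes pytube and nemo_toolkit[asr] are installed and that ffmpeg is on PATH; transcribe_youtube and the scratch file name are hypothetical, not part of app.py. pytube.YouTube takes a plain URL string, and EncDecCTCModel.transcribe expects a list of 16 kHz mono WAV paths, so the downloaded audio is converted first:

# Sketch only: mirrors the speech_youtube() flow from app.py, with an
# explicit ffmpeg resample step since QuartzNet expects 16 kHz mono WAV.
import subprocess

import pytube
import nemo.collections.asr as nemo_asr

model = nemo_asr.models.EncDecCTCModel.from_pretrained(
    model_name="stt_en_quartznet15x5"
)

def transcribe_youtube(url):
    # pytube.YouTube takes a URL string (not a list)
    audio = pytube.YouTube(url).streams.get_audio_only()
    path = audio.download()  # typically an .mp4 audio-only file
    wav = "downloaded.wav"   # hypothetical scratch file
    subprocess.run(
        ["ffmpeg", "-y", "-i", path, "-ar", "16000", "-ac", "1", wav],
        check=True,
    )
    # transcribe() takes a list of file paths and returns a list of strings
    return model.transcribe([wav])[0]

print(transcribe_youtube("https://www.youtube.com/watch?v=..."))  # placeholder URL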
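The size claim in the new description is easy to sanity-check, since NeMo models subclass torch.nn.Module. The figure below (~19M parameters for QuartzNet15x5, versus roughly 330M for the largest Jasper variant) matches the QuartzNet paper; treat the snippet as an illustrative sketch using the same checkpoint:

import nemo.collections.asr as nemo_asr

model = nemo_asr.models.EncDecCTCModel.from_pretrained(
    model_name="stt_en_quartznet15x5"
)
# parameters() comes from torch.nn.Module, which NeMo models subclass
n_params = sum(p.numel() for p in model.parameters())
print(f"QuartzNet15x5: {n_params / 1e6:.1f}M parameters")  # ~18.9M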