Harshad Bhandwaldar committed
Commit a606014 · 1 Parent(s): 0a18830

model added

Files changed (1): app.py +23 -23
app.py CHANGED
@@ -10,21 +10,21 @@ model = nemo_asr.models.EncDecCTCModel.from_pretrained(
     model_name="stt_en_quartznet15x5"
 )
 
-# def speech_youtube(x):
-#     data = pytube.YouTube(x)
-#     audio = data.streams.get_audio_only()
-#     text = model.transcribe(audio.download())
-#     return text['text']
+def speech_youtube(x):
+    data = pytube.YouTube(x)
+    audio = data.streams.get_audio_only()
+    text = model.transcribe([audio.download()])
+    return text
 
 def speech_file(x):
-    print(x)
+    # print(x)
     text = model.transcribe([f"{x}"])
-    print(text)
+    # print(text)
     return text
 
-# def speech_record(x):
-#     text = model.transcribe(x)
-#     return text['text']
+def speech_record(x):
+    text = model.transcribe([f"{x}"])
+    return text
 
 css = """
 .gradio-container {
@@ -112,23 +112,23 @@ css = """
 with gr.Blocks(css = css) as demo:
     gr.Markdown(
     """
-    # Speech to Text Transcriptions!
-    This demo uses the OpenAI whisper model which is trained on a large dataset of diverse audio that can perform multilingual speech recognition. The computation time is dependent on the length of the audio.
+    # Speech to Text - NVIDIA QuartzNet15x5 (English)
+    QuartzNet is a Jasper-like network that uses separable convolutions and larger filter sizes. It has comparable accuracy to Jasper while having far fewer parameters. This particular model has 15 blocks, each repeated 5 times.
     """)
-    # with gr.Tab("YouTube"):
-    #     audio_input = gr.Textbox(label="YouTube Link", placeholder="paste the youtube link here")
-    #     text_output = gr.Textbox(label="Transcription", show_label=False)
-    #     youtube_button = gr.Button("Transcribe")
+    with gr.Tab("YouTube"):
+        audio_input = gr.Textbox(label="YouTube Link", placeholder="paste the youtube link here")
+        text_output = gr.Textbox(label="Transcription", show_label=False)
+        youtube_button = gr.Button("Transcribe")
     with gr.Tab("Audio File"):
         with gr.Row().style(equal_height=True):
             audio_input2 = gr.Audio(label="Audio File", type="filepath")
             text_output2 = gr.Textbox(label="Transcription", show_label=False)
         file_button = gr.Button("Transcribe")
-    # with gr.Tab("Record"):
-    #     with gr.Row().style(equal_height=True):
-    #         audio_input3 = gr.Audio(label="Input Audio", source="microphone", type="filepath")
-    #         text_output3 = gr.Textbox(label="Transcription", show_label=False)
-    #     rec_button = gr.Button("Transcribe")
+    with gr.Tab("Record"):
+        with gr.Row().style(equal_height=True):
+            audio_input3 = gr.Audio(label="Input Audio", source="microphone", type="filepath")
+            text_output3 = gr.Textbox(label="Transcription", show_label=False)
+        rec_button = gr.Button("Transcribe")
     gr.HTML('''
     <div class="footer">
         <p></a>
@@ -136,8 +136,8 @@ with gr.Blocks(css = css) as demo:
     </div>
     ''')
 
-    # youtube_button.click(speech_youtube, inputs=audio_input, outputs=text_output)
+    youtube_button.click(speech_youtube, inputs=audio_input, outputs=text_output)
     file_button.click(speech_file, inputs=audio_input2, outputs=text_output2)
-    # rec_button.click(speech_record, inputs=audio_input3, outputs=text_output3)
+    rec_button.click(speech_record, inputs=audio_input3, outputs=text_output3)
 
 demo.launch()
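
For reference, a minimal standalone sketch of the YouTube path this commit enables, outside of Gradio. It assumes pytube and nemo_toolkit[asr] are installed and that ffmpeg is on PATH; transcribe_youtube and the scratch file name are hypothetical, not part of app.py. pytube.YouTube takes a plain URL string, and EncDecCTCModel.transcribe expects a list of 16 kHz mono WAV paths, so the downloaded audio is converted first:

# Sketch only: mirrors the speech_youtube() flow from app.py, with an
# explicit ffmpeg resample step since QuartzNet expects 16 kHz mono WAV.
import subprocess

import pytube
import nemo.collections.asr as nemo_asr

model = nemo_asr.models.EncDecCTCModel.from_pretrained(
    model_name="stt_en_quartznet15x5"
)

def transcribe_youtube(url):
    # pytube.YouTube takes a URL string (not a list)
    audio = pytube.YouTube(url).streams.get_audio_only()
    path = audio.download()  # typically an .mp4 audio-only file
    wav = "downloaded.wav"   # hypothetical scratch file
    subprocess.run(
        ["ffmpeg", "-y", "-i", path, "-ar", "16000", "-ac", "1", wav],
        check=True,
    )
    # transcribe() takes a list of file paths and returns a list of strings
    return model.transcribe([wav])[0]

print(transcribe_youtube("https://www.youtube.com/watch?v=..."))  # placeholder URL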
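The size claim in the new description is easy to sanity-check, since NeMo models subclass torch.nn.Module. The figure below (~19M parameters for QuartzNet15x5, versus roughly 330M for the largest Jasper variant) matches the QuartzNet paper; treat the snippet as an illustrative sketch using the same checkpoint:

import nemo.collections.asr as nemo_asr

model = nemo_asr.models.EncDecCTCModel.from_pretrained(
    model_name="stt_en_quartznet15x5"
)
# parameters() comes from torch.nn.Module, which NeMo models subclass
n_params = sum(p.numel() for p in model.parameters())
print(f"QuartzNet15x5: {n_params / 1e6:.1f}M parameters")  # ~18.9M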