pyf98 committed
Commit
638c588
1 Parent(s): e41dd19

update app

Files changed (1)
  1. app.py +9 -8
app.py CHANGED
@@ -20,8 +20,6 @@ OWSM v3 has 889M parameters and is trained on 180k hours of paired speech data.
 
 For more details, please check out our [paper](https://arxiv.org/abs/2309.13876) (Peng et al., ASRU 2023).
 
-We also have a [Colab demo](https://colab.research.google.com/drive/1zKI3ZY_OtZd6YmVeED6Cxy1QwT1mqv9O?usp=sharing) where you can use a free GPU.
-
 ```
 @article{peng2023owsm,
   title={Reproducing Whisper-Style Training Using an Open-Source Toolkit and Publicly Available Data},
@@ -31,6 +29,9 @@ We also have a [Colab demo](https://colab.research.google.com/drive/1zKI3ZY_OtZd
 }
 ```
 
+Since this is a demo, the input speech should not exceed 2 minutes. We also limit the maximum number of generated tokens.
+Please try our [Colab demo](https://colab.research.google.com/drive/1zKI3ZY_OtZd6YmVeED6Cxy1QwT1mqv9O?usp=sharing) if you want to explore more features.
+
 Disclaimer: OWSM has not been thoroughly evaluated in all tasks. Due to limited training data, it may not perform well for certain language directions.
 '''
 
@@ -114,7 +115,7 @@ def predict(audio_path, src_lang: str, task: str, beam_size, long_form: bool, te
     # ASR or ST
     if long_form:  # speech will be padded in decode_long()
         try:
-            speech2text.maxlenratio = 0.0
+            speech2text.maxlenratio = -300
            utts = speech2text.decode_long(
                 speech,
                 segment_sec=_dur,
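This change swaps automatic end-detection (`maxlenratio = 0.0`) for a fixed 300-token cap per decoded segment. In ESPnet's beam search, `maxlenratio` has three documented regimes, summarized in this minimal sketch (the helper name is ours, not code from this app):

```python
# Sketch of how ESPnet's BeamSearch interprets maxlenratio, based on its
# documented behavior; max_output_length is our own illustrative name.
def max_output_length(maxlenratio: float, encoder_frames: int) -> int:
    if maxlenratio == 0.0:
        # No fixed cap: an end-detect heuristic decides when to stop,
        # bounded by the encoder output length.
        return encoder_frames
    if maxlenratio < 0.0:
        # Negative: the absolute value is a constant cap on generated tokens.
        return -int(maxlenratio)
    # Positive: the cap scales with the encoder output length.
    return max(1, int(maxlenratio * encoder_frames))

print(max_output_length(-300, 1500))  # -> 300 tokens, regardless of input length
```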
@@ -124,7 +125,7 @@ def predict(audio_path, src_lang: str, task: str, beam_size, long_form: bool, te
                 start_time="<0.00>",
                 end_time_threshold="<29.50>",
             )
-
+
            text = []
             for t1, t2, res in utts:
                 text.append(f"[{format_timestamp(seconds=t1)} --> {format_timestamp(seconds=t2)}] {res}")
@@ -132,9 +133,9 @@ def predict(audio_path, src_lang: str, task: str, beam_size, long_form: bool, te
 
             return code2lang[lang_code], text
         except:
-            print("An exception occurred in long-form decoding. Falling back to short-form decoding (only first 30s)")
+            print("An exception occurred in long-form decoding. Falling back to standard decoding (only the first 30s)")
 
-    speech2text.maxlenratio = -min(450, int((len(speech) / rate) * 15))  # assuming 15 tokens per second
+    speech2text.maxlenratio = -min(300, int((len(speech) / rate) * 10))  # assuming 10 tokens per second
     speech = librosa.util.fix_length(speech, size=(_sr * _dur))
     text = speech2text(speech, text_prev)[0][3]
 
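The fallback path now budgets roughly 10 generated tokens per second of audio, capped at 300, and stores the value negated so that, per the semantics above, it acts as a constant limit. A small illustration with a hypothetical helper (`fallback_token_cap` is our name; the app inlines the expression):

```python
# Hypothetical helper showing the fallback token budget; the app inlines this
# and assigns the negated value, since negative maxlenratio means a fixed cap.
def fallback_token_cap(num_samples: int, rate: int) -> int:
    # Roughly 10 generated tokens per second of audio,
    # never more than 300 tokens in total.
    return min(300, int((num_samples / rate) * 10))

print(fallback_token_cap(16000 * 30, 16000))  # 30s at 16 kHz -> 300
print(fallback_token_cap(16000 * 12, 16000))  # 12s at 16 kHz -> 120
```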
@@ -144,11 +145,11 @@ def predict(audio_path, src_lang: str, task: str, beam_size, long_form: bool, te
 demo = gr.Interface(
     predict,
     inputs=[
-        gr.Audio(type="filepath", label="Speech Input", max_length=150, sources=["microphone", "upload"]),
+        gr.Audio(type="filepath", label="Input Speech (<120s)", max_length=120, sources=["microphone", "upload"], show_download_button=True, show_share_button=True),
         gr.Dropdown(choices=list(lang2code), value="English", label="Language", info="Language of input speech. Select 'Unknown' (1st option) to detect it automatically."),
         gr.Dropdown(choices=list(task2code), value="Automatic Speech Recognition", label="Task", info="Task to perform on input speech."),
         gr.Slider(minimum=1, maximum=5, step=1, value=5, label="Beam Size", info="Beam size used in beam search."),
-        gr.Checkbox(label="Long Form (Experimental)", info="Whether to perform long-form decoding (experimental feature)."),
+        gr.Checkbox(label="Long Form (Experimental)", info="Perform long-form decoding for audio longer than 30s. If an exception occurs, it falls back to standard decoding on the first 30s."),
         gr.Text(label="Text Prompt (Optional)", info="Generation will be conditioned on this prompt if provided"),
     ],
     outputs=[
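The new `gr.Audio` arguments (`max_length`, `show_download_button`, `show_share_button`) assume a Gradio 4.x API, where `max_length` is a duration in seconds enforced by the widget itself. A minimal, self-contained sketch of the tightened input; `echo` is a placeholder for the app's `predict`:

```python
import gradio as gr

def echo(audio_path, long_form):
    # Placeholder for predict(); just reports what the UI passed in.
    return f"audio={audio_path}, long_form={long_form}"

demo = gr.Interface(
    echo,
    inputs=[
        # max_length is in seconds: recordings or uploads over 120s are rejected
        # by the widget before they ever reach the model.
        gr.Audio(type="filepath", label="Input Speech (<120s)", max_length=120,
                 sources=["microphone", "upload"]),
        gr.Checkbox(label="Long Form (Experimental)"),
    ],
    outputs=gr.Text(),
)

if __name__ == "__main__":
    demo.launch()
```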
 