pyf98 committed
Commit
638c588
1 Parent(s): e41dd19

update app

Files changed (1)
  1. app.py +9 -8
app.py CHANGED
@@ -20,8 +20,6 @@ OWSM v3 has 889M parameters and is trained on 180k hours of paired speech data.
 
 For more details, please check out our [paper](https://arxiv.org/abs/2309.13876) (Peng et al., ASRU 2023).
 
-We also have a [Colab demo](https://colab.research.google.com/drive/1zKI3ZY_OtZd6YmVeED6Cxy1QwT1mqv9O?usp=sharing) where you can use a free GPU.
-
 ```
 @article{peng2023owsm,
   title={Reproducing Whisper-Style Training Using an Open-Source Toolkit and Publicly Available Data},
@@ -31,6 +29,9 @@ We also have a [Colab demo](https://colab.research.google.com/drive/1zKI3ZY_OtZd
 }
 ```
 
+Since this is a demo, the input speech should not exceed 2 minutes. We also limit the maximum number of generated tokens.
+Please try our [Colab demo](https://colab.research.google.com/drive/1zKI3ZY_OtZd6YmVeED6Cxy1QwT1mqv9O?usp=sharing) if you want to explore more features.
+
 Disclaimer: OWSM has not been thoroughly evaluated in all tasks. Due to limited training data, it may not perform well for certain language directions.
 '''
 
@@ -114,7 +115,7 @@ def predict(audio_path, src_lang: str, task: str, beam_size, long_form: bool, te
     # ASR or ST
     if long_form:  # speech will be padded in decode_long()
         try:
-            speech2text.maxlenratio = 0.0
+            speech2text.maxlenratio = -300
            utts = speech2text.decode_long(
                 speech,
                 segment_sec=_dur,
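This change swaps automatic end-detection (`maxlenratio = 0.0`) for a fixed 300-token cap per decoded segment. In ESPnet's beam search, `maxlenratio` has three documented regimes, summarized in this minimal sketch (the helper name is ours, not code from this app):

```python
# Sketch of how ESPnet's BeamSearch interprets maxlenratio, based on its
# documented behavior; max_output_length is our own illustrative name.
def max_output_length(maxlenratio: float, encoder_frames: int) -> int:
    if maxlenratio == 0.0:
        # No fixed cap: an end-detect heuristic decides when to stop,
        # bounded by the encoder output length.
        return encoder_frames
    if maxlenratio < 0.0:
        # Negative: the absolute value is a constant cap on generated tokens.
        return -int(maxlenratio)
    # Positive: the cap scales with the encoder output length.
    return max(1, int(maxlenratio * encoder_frames))

print(max_output_length(-300, 1500))  # -> 300 tokens, regardless of input length
```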
@@ -124,7 +125,7 @@ def predict(audio_path, src_lang: str, task: str, beam_size, long_form: bool, te
                 start_time="<0.00>",
                 end_time_threshold="<29.50>",
             )
-
+
            text = []
             for t1, t2, res in utts:
                 text.append(f"[{format_timestamp(seconds=t1)} --> {format_timestamp(seconds=t2)}] {res}")
@@ -132,9 +133,9 @@ def predict(audio_path, src_lang: str, task: str, beam_size, long_form: bool, te
 
             return code2lang[lang_code], text
         except:
-            print("An exception occurred in long-form decoding. Falling back to short-form decoding (only first 30s)")
+            print("An exception occurred in long-form decoding. Falling back to standard decoding (only the first 30s)")
 
-    speech2text.maxlenratio = -min(450, int((len(speech) / rate) * 15))  # assuming 15 tokens per second
+    speech2text.maxlenratio = -min(300, int((len(speech) / rate) * 10))  # assuming 10 tokens per second
     speech = librosa.util.fix_length(speech, size=(_sr * _dur))
     text = speech2text(speech, text_prev)[0][3]
 
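The fallback path now budgets roughly 10 generated tokens per second of audio, capped at 300, and stores the value negated so that, per the semantics above, it acts as a constant limit. A small illustration with a hypothetical helper (`fallback_token_cap` is our name; the app inlines the expression):

```python
# Hypothetical helper showing the fallback token budget; the app inlines this
# and assigns the negated value, since negative maxlenratio means a fixed cap.
def fallback_token_cap(num_samples: int, rate: int) -> int:
    # Roughly 10 generated tokens per second of audio,
    # never more than 300 tokens in total.
    return min(300, int((num_samples / rate) * 10))

print(fallback_token_cap(16000 * 30, 16000))  # 30s at 16 kHz -> 300
print(fallback_token_cap(16000 * 12, 16000))  # 12s at 16 kHz -> 120
```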
@@ -144,11 +145,11 @@ def predict(audio_path, src_lang: str, task: str, beam_size, long_form: bool, te
 demo = gr.Interface(
     predict,
     inputs=[
-        gr.Audio(type="filepath", label="Speech Input", max_length=150, sources=["microphone", "upload"]),
+        gr.Audio(type="filepath", label="Input Speech (<120s)", max_length=120, sources=["microphone", "upload"], show_download_button=True, show_share_button=True),
         gr.Dropdown(choices=list(lang2code), value="English", label="Language", info="Language of input speech. Select 'Unknown' (1st option) to detect it automatically."),
         gr.Dropdown(choices=list(task2code), value="Automatic Speech Recognition", label="Task", info="Task to perform on input speech."),
         gr.Slider(minimum=1, maximum=5, step=1, value=5, label="Beam Size", info="Beam size used in beam search."),
-        gr.Checkbox(label="Long Form (Experimental)", info="Whether to perform long-form decoding (experimental feature)."),
+        gr.Checkbox(label="Long Form (Experimental)", info="Perform long-form decoding for audio longer than 30s. If an exception occurs, it falls back to standard decoding on the first 30s."),
         gr.Text(label="Text Prompt (Optional)", info="Generation will be conditioned on this prompt if provided"),
     ],
     outputs=[
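The new `gr.Audio` arguments (`max_length`, `show_download_button`, `show_share_button`) assume a Gradio 4.x API, where `max_length` is a duration in seconds enforced by the widget itself. A minimal, self-contained sketch of the tightened input; `echo` is a placeholder for the app's `predict`:

```python
import gradio as gr

def echo(audio_path, long_form):
    # Placeholder for predict(); just reports what the UI passed in.
    return f"audio={audio_path}, long_form={long_form}"

demo = gr.Interface(
    echo,
    inputs=[
        # max_length is in seconds: recordings or uploads over 120s are rejected
        # by the widget before they ever reach the model.
        gr.Audio(type="filepath", label="Input Speech (<120s)", max_length=120,
                 sources=["microphone", "upload"]),
        gr.Checkbox(label="Long Form (Experimental)"),
    ],
    outputs=gr.Text(),
)

if __name__ == "__main__":
    demo.launch()
```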
 