update app
app.py
CHANGED
@@ -20,8 +20,6 @@ OWSM v3 has 889M parameters and is trained on 180k hours of paired speech data.
 
 For more details, please check out our [paper](https://arxiv.org/abs/2309.13876) (Peng et al., ASRU 2023).
 
-We also have a [Colab demo](https://colab.research.google.com/drive/1zKI3ZY_OtZd6YmVeED6Cxy1QwT1mqv9O?usp=sharing) where you can use a free GPU.
-
 ```
 @article{peng2023owsm,
   title={Reproducing Whisper-Style Training Using an Open-Source Toolkit and Publicly Available Data},
@@ -31,6 +29,9 @@ We also have a [Colab demo](https://colab.research.google.com/drive/1zKI3ZY_OtZd
 }
 ```
 
+As a demo, the input speech should not exceed 2 minutes. We also limit the maximum number of tokens to be generated.
+Please try our [Colab demo](https://colab.research.google.com/drive/1zKI3ZY_OtZd6YmVeED6Cxy1QwT1mqv9O?usp=sharing) if you want to explore more features.
+
 Disclaimer: OWSM has not been thoroughly evaluated in all tasks. Due to limited training data, it may not perform well for certain language directions.
 '''
 
@@ -114,7 +115,7 @@ def predict(audio_path, src_lang: str, task: str, beam_size, long_form: bool, te
     # ASR or ST
     if long_form:  # speech will be padded in decode_long()
         try:
-            speech2text.maxlenratio =
+            speech2text.maxlenratio = -300
             utts = speech2text.decode_long(
                 speech,
                 segment_sec=_dur,
@@ -124,7 +125,7 @@ def predict(audio_path, src_lang: str, task: str, beam_size, long_form: bool, te
                 start_time="<0.00>",
                 end_time_threshold="<29.50>",
             )
-
+
             text = []
             for t1, t2, res in utts:
                 text.append(f"[{format_timestamp(seconds=t1)} --> {format_timestamp(seconds=t2)}] {res}")
@@ -132,9 +133,9 @@ def predict(audio_path, src_lang: str, task: str, beam_size, long_form: bool, te
 
             return code2lang[lang_code], text
         except:
-            print("An exception occurred in long-form decoding.
+            print("An exception occurred in long-form decoding. Fall back to standard decoding (only first 30s)")
 
-    speech2text.maxlenratio = -min(
+    speech2text.maxlenratio = -min(300, int((len(speech) / rate) * 10))  # assuming 10 tokens per second
     speech = librosa.util.fix_length(speech, size=(_sr * _dur))
     text = speech2text(speech, text_prev)[0][3]
 
@@ -144,11 +145,11 @@ def predict(audio_path, src_lang: str, task: str, beam_size, long_form: bool, te
 demo = gr.Interface(
     predict,
     inputs=[
-        gr.Audio(type="filepath", label="Speech
+        gr.Audio(type="filepath", label="Input Speech (<120s)", max_length=120, sources=["microphone", "upload"], show_download_button=True, show_share_button=True,),
         gr.Dropdown(choices=list(lang2code), value="English", label="Language", info="Language of input speech. Select 'Unknown' (1st option) to detect it automatically."),
         gr.Dropdown(choices=list(task2code), value="Automatic Speech Recognition", label="Task", info="Task to perform on input speech."),
         gr.Slider(minimum=1, maximum=5, step=1, value=5, label="Beam Size", info="Beam size used in beam search."),
-        gr.Checkbox(label="Long Form (Experimental)", info="
+        gr.Checkbox(label="Long Form (Experimental)", info="Perform long-form decoding for audios that are longer than 30s. If an exception happens, it will fall back to standard decoding on the initial 30s."),
         gr.Text(label="Text Prompt (Optional)", info="Generation will be conditioned on this prompt if provided"),
     ],
     outputs=[
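A note on the `maxlenratio` values introduced above: in ESPnet's beam search, a positive `maxlenratio` scales the encoder output length, zero lets end detection decide, and a negative value is read as an absolute cap on the number of generated tokens. A minimal sketch of that convention (the `max_output_tokens` helper is hypothetical, written only to illustrate the rule, not part of app.py):

```python
def max_output_tokens(maxlenratio: float, enc_len: int) -> int:
    """Hypothetical helper mirroring ESPnet's maxlenratio handling."""
    if maxlenratio == 0:
        return enc_len                     # let end detection decide
    if maxlenratio < 0:
        return -int(maxlenratio)           # negative => constant token cap
    return max(1, int(maxlenratio * enc_len))

# -300 caps long-form generation at 300 tokens regardless of input length.
print(max_output_tokens(-300, enc_len=1500))  # 300
# The fallback path budgets ~10 tokens per second of audio, capped at 300.
seconds = 20.0
print(max_output_tokens(-min(300, int(seconds * 10)), enc_len=1500))  # 200
```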
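The standard (non-long-form) path pads or trims the waveform to a fixed window before decoding. A small self-contained sketch of what `librosa.util.fix_length` does on that line, assuming the usual values of 16 kHz and 30 s for `_sr` and `_dur` (the diff does not show their definitions):

```python
import numpy as np
import librosa

_sr, _dur = 16000, 30                          # assumed demo constants
speech = np.zeros(_sr * 12, dtype=np.float32)  # a 12 s example signal
speech = librosa.util.fix_length(speech, size=_sr * _dur)
print(speech.shape)                            # (480000,): zero-padded to 30 s
# A signal longer than 30 s would be truncated to the same 480000 samples.
```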
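For context, the `gr.Interface` block wires `predict` to the Gradio components changed above. The diff is cut off at `outputs=[`, so the output side below is an assumption; this is a runnable skeleton with a stub in place of the real OWSM model:

```python
import gradio as gr

# Stub standing in for the real predict(); the actual function runs OWSM.
def predict_stub(audio_path, language, task, beam_size, long_form, text_prompt):
    return language, f"(transcript of {audio_path} would appear here)"

demo = gr.Interface(
    predict_stub,
    inputs=[
        gr.Audio(type="filepath", label="Input Speech (<120s)", max_length=120,
                 sources=["microphone", "upload"]),
        gr.Dropdown(choices=["English", "Japanese"], value="English", label="Language"),
        gr.Dropdown(choices=["Automatic Speech Recognition"],
                    value="Automatic Speech Recognition", label="Task"),
        gr.Slider(minimum=1, maximum=5, step=1, value=5, label="Beam Size"),
        gr.Checkbox(label="Long Form (Experimental)"),
        gr.Text(label="Text Prompt (Optional)"),
    ],
    outputs=[  # assumed: the diff does not show the real output components
        gr.Text(label="Predicted Language"),
        gr.Text(label="Predicted Text"),
    ],
)

if __name__ == "__main__":
    demo.launch()
```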