update app
app.py
CHANGED
@@ -6,24 +6,53 @@ from espnet2.bin.s2t_inference import Speech2Text
 from espnet2.bin.s2t_inference_language import Speech2Text as Speech2Lang
 
 
-
-
+TITLE="OWSM v3: An Open Whisper-style Speech Model from CMU WAVLab"
+DESCRIPTION='''
+OWSM is an Open Whisper-style Speech Model from [CMU WAVLab](https://www.wavlab.org/).
+It reproduces Whisper-style training using publicly available data and an open-source toolkit [ESPnet](https://github.com/espnet/espnet).
+
+OWSM v3 is trained on 180k hours of paired speech data. It supports various speech-to-text tasks:
+- Speech recognition for 151 languages
+- Any-to-any language speech translation
+- Timestamp prediction
+- Long-form transcription
+- Language identification
+
+For more details, please check out our [paper](https://arxiv.org/abs/2309.13876) (Peng et al., ASRU 2023).
+
+```
+@article{peng2023owsm,
+  title={Reproducing Whisper-Style Training Using an Open-Source Toolkit and Publicly Available Data},
+  author={Yifan Peng and Jinchuan Tian and Brian Yan and Dan Berrebbi and Xuankai Chang and Xinjian Li and Jiatong Shi and Siddhant Arora and William Chen and Roshan Sharma and Wangyou Zhang and Yui Sudo and Muhammad Shakeel and Jee-weon Jung and Soumi Maiti and Shinji Watanabe},
+  journal={arXiv preprint arXiv:2309.13876},
+  year={2023}
+}
+```
+
+Disclaimer: OWSM has not been thoroughly evaluated in all tasks. Due to limited training data, it may not perform well for certain language directions.
+'''
+
+if not torch.cuda.is_available():
+    raise RuntimeError("Please use GPU for better speed")
+
+model_path = "owsm_v3/exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/valid.acc.ave_5best.till50epoch.pth"
+device = "cuda" # if torch.cuda.is_available() else "cpu"
 
 speech2text = Speech2Text.from_pretrained(
-
+    s2t_model_file=model_path,
     device=device,
     category_sym="<eng>",
     beam_size=5,
-    quantize_s2t_model=not torch.cuda.is_available(),
-    quantize_dtype="float16",
+    # quantize_s2t_model=not torch.cuda.is_available(),
+    # quantize_dtype="float16",
 )
 
 speech2lang = Speech2Lang.from_pretrained(
-
+    s2t_model_file=model_path,
     device=device,
     nbest=1,
-    quantize_s2t_model=not torch.cuda.is_available(),
-    quantize_dtype="float16",
+    # quantize_s2t_model=not torch.cuda.is_available(),
+    # quantize_dtype="float16",
 )
 
 
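The hunk above switches both ESPnet wrappers from a quantized CPU setup to loading a local OWSM v3 checkpoint on GPU: `speech2text` handles decoding and `speech2lang` handles language identification. As a rough sketch of how these objects are typically driven (the Space's actual `predict` is not shown in this hunk), assuming the ESPnet s2t wrappers are callable on a 16 kHz waveform and return an n-best list whose first entry begins with the decoded text:

```python
# Sketch only: the resampling rate, the n-best tuple layout, and the use of
# librosa are assumptions, not details confirmed by this commit.
import librosa

def transcribe_sketch(audio_path: str):
    # OWSM is trained on 16 kHz audio, so resample while loading.
    speech, _ = librosa.load(audio_path, sr=16000)

    # Assumed: the best hypothesis from speech2lang starts with a language
    # token such as "<eng>".
    lang_token = speech2lang(speech)[0][0]

    # Assumed: the best hypothesis from speech2text starts with the decoded text.
    text = speech2text(speech)[0][0]
    return lang_token, text
```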
@@ -110,37 +139,10 @@ def predict(audio_path, src_lang: str, task: str, beam_size, long_form: bool, te
     return code2lang[lang_code], text
 
 
-_DESCRIPTION=r'''
-OWSM is an Open Whisper-style Speech Model from [CMU WAVLab](https://www.wavlab.org/).
-It reproduces Whisper-style training using publicly available data and an open-source toolkit [ESPnet](https://github.com/espnet/espnet).
-
-OWSM v3 is trained on 180k hours of paired speech data. It supports various speech-to-text tasks:
-- Speech recognition for 151 languages
-- Any-to-any language speech translation
-- Timestamp prediction
-- Long-form transcription
-- Language identification
-
-For more details about OWSM, please check out our paper [here](https://arxiv.org/abs/2309.13876) (Peng et al., ASRU 2023).
-
-```
-@article{peng2023owsm,
-  title={Reproducing Whisper-Style Training Using an Open-Source Toolkit and Publicly Available Data},
-  author={Yifan Peng and Jinchuan Tian and Brian Yan and Dan Berrebbi and Xuankai Chang and Xinjian Li and Jiatong Shi and Siddhant Arora and William Chen and Roshan Sharma and Wangyou Zhang and Yui Sudo and Muhammad Shakeel and Jee-weon Jung and Soumi Maiti and Shinji Watanabe},
-  journal={arXiv preprint arXiv:2309.13876},
-  year={2023}
-}
-```
-
-Disclaimer: OWSM has not been thoroughly evaluated in all tasks. Due to limited training data, it may not perform well for certain language directions.
-
-
-'''
-
 demo = gr.Interface(
     predict,
     inputs=[
-        gr.Audio(type="filepath", label="Speech Input"),
+        gr.Audio(type="filepath", label="Speech Input", max_length=300, sources=["microphone", "upload"]),
         gr.Dropdown(choices=list(lang2code), value="English", label="Language", info="Language of input speech. Select 'Unknown' (1st option) to detect it automatically."),
         gr.Dropdown(choices=list(task2code), value="Automatic Speech Recognition", label="Task", info="Task to perform on input speech."),
         gr.Slider(minimum=1, maximum=20, step=1, value=5, label="Beam Size", info="Beam size used in beam search."),
@@ -151,13 +153,15 @@ demo = gr.Interface(
         gr.Text(label="Predicted Language", info="Language identification is performed if language is unknown."),
         gr.Text(label="Predicted Text", info="Best hypothesis."),
     ],
-    title=
-    description=
+    title=TITLE,
+    description=DESCRIPTION,
+    allow_flagging="never",
 )
 
 
 if __name__ == "__main__":
     demo.launch(
         show_api=False,
-
+        share=True,
+        # debug=True,
     )
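Taken together, the last two hunks wire the new `TITLE`, `DESCRIPTION`, and `allow_flagging="never"` into the `gr.Interface` call and add `share=True` to `demo.launch`. A stripped-down, self-contained sketch of the same pattern, with a stub in place of the Space's real `predict` and simplified dropdown choices (the component arguments follow the Gradio 4.x `Audio` API the diff itself uses; the stub names and choices here are illustrative, not from the Space):

```python
import gradio as gr

def fake_predict(audio_path, language, task, beam_size):
    # Stand-in for the real OWSM predict(): returns (detected language, text).
    return language, f"(decoded with beam size {beam_size} for task '{task}')"

demo = gr.Interface(
    fake_predict,
    inputs=[
        gr.Audio(type="filepath", label="Speech Input", max_length=300, sources=["microphone", "upload"]),
        gr.Dropdown(choices=["Unknown", "English"], value="English", label="Language"),
        gr.Dropdown(choices=["Automatic Speech Recognition"], value="Automatic Speech Recognition", label="Task"),
        gr.Slider(minimum=1, maximum=20, step=1, value=5, label="Beam Size"),
    ],
    outputs=[
        gr.Text(label="Predicted Language"),
        gr.Text(label="Predicted Text"),
    ],
    title="OWSM demo (sketch)",
    description="Minimal stand-in for the Space's interface.",
    allow_flagging="never",
)

if __name__ == "__main__":
    # share=True asks Gradio for a temporary public link, as in the diff.
    demo.launch(show_api=False, share=True)
```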