pyf98 committed on
Commit
5d9c695
1 Parent(s): 8851f17

update app

Browse files
Files changed (1) hide show
  1. app.py +43 -39
app.py CHANGED
@@ -6,24 +6,53 @@ from espnet2.bin.s2t_inference import Speech2Text
6
  from espnet2.bin.s2t_inference_language import Speech2Text as Speech2Lang
7
 
8
 
9
- model_name_or_path = "espnet/owsm_v3"
10
- device = "cuda" if torch.cuda.is_available() else "cpu"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
  speech2text = Speech2Text.from_pretrained(
13
- model_name_or_path,
14
  device=device,
15
  category_sym="<eng>",
16
  beam_size=5,
17
- quantize_s2t_model=not torch.cuda.is_available(),
18
- quantize_dtype="float16",
19
  )
20
 
21
  speech2lang = Speech2Lang.from_pretrained(
22
- model_name_or_path,
23
  device=device,
24
  nbest=1,
25
- quantize_s2t_model=not torch.cuda.is_available(),
26
- quantize_dtype="float16",
27
  )
28
 
29
 
@@ -110,37 +139,10 @@ def predict(audio_path, src_lang: str, task: str, beam_size, long_form: bool, te
110
  return code2lang[lang_code], text
111
 
112
 
113
- _DESCRIPTION=r'''
114
- OWSM is an Open Whisper-style Speech Model from [CMU WAVLab](https://www.wavlab.org/).
115
- It reproduces Whisper-style training using publicly available data and an open-source toolkit [ESPnet](https://github.com/espnet/espnet).
116
-
117
- OWSM v3 is trained on 180k hours of paired speech data. It supports various speech-to-text tasks:
118
- - Speech recognition for 151 languages
119
- - Any-to-any language speech translation
120
- - Timestamp prediction
121
- - Long-form transcription
122
- - Language identification
123
-
124
- For more details about OWSM, please check out our paper [here](https://arxiv.org/abs/2309.13876) (Peng et al., ASRU 2023).
125
-
126
- ```
127
- @article{peng2023owsm,
128
- title={Reproducing Whisper-Style Training Using an Open-Source Toolkit and Publicly Available Data},
129
- author={Yifan Peng and Jinchuan Tian and Brian Yan and Dan Berrebbi and Xuankai Chang and Xinjian Li and Jiatong Shi and Siddhant Arora and William Chen and Roshan Sharma and Wangyou Zhang and Yui Sudo and Muhammad Shakeel and Jee-weon Jung and Soumi Maiti and Shinji Watanabe},
130
- journal={arXiv preprint arXiv:2309.13876},
131
- year={2023}
132
- }
133
- ```
134
-
135
- Disclaimer: OWSM has not been thoroughly evaluated in all tasks. Due to limited training data, it may not perform well for certain language directions.
136
-
137
-
138
- '''
139
-
140
  demo = gr.Interface(
141
  predict,
142
  inputs=[
143
- gr.Audio(type="filepath", label="Speech Input"),
144
  gr.Dropdown(choices=list(lang2code), value="English", label="Language", info="Language of input speech. Select 'Unknown' (1st option) to detect it automatically."),
145
  gr.Dropdown(choices=list(task2code), value="Automatic Speech Recognition", label="Task", info="Task to perform on input speech."),
146
  gr.Slider(minimum=1, maximum=20, step=1, value=5, label="Beam Size", info="Beam size used in beam search."),
@@ -151,13 +153,15 @@ demo = gr.Interface(
151
  gr.Text(label="Predicted Language", info="Language identification is performed if language is unknown."),
152
  gr.Text(label="Predicted Text", info="Best hypothesis."),
153
  ],
154
- title="Demo of OWSM v3: An Open Whisper-style Speech Model from CMU WAVLab",
155
- description=_DESCRIPTION,
 
156
  )
157
 
158
 
159
  if __name__ == "__main__":
160
  demo.launch(
161
  show_api=False,
162
- # debug=True
 
163
  )
 
6
  from espnet2.bin.s2t_inference_language import Speech2Text as Speech2Lang
7
 
8
 
9
+ TITLE="OWSM v3: An Open Whisper-style Speech Model from CMU WAVLab"
10
+ DESCRIPTION='''
11
+ OWSM is an Open Whisper-style Speech Model from [CMU WAVLab](https://www.wavlab.org/).
12
+ It reproduces Whisper-style training using publicly available data and an open-source toolkit [ESPnet](https://github.com/espnet/espnet).
13
+
14
+ OWSM v3 is trained on 180k hours of paired speech data. It supports various speech-to-text tasks:
15
+ - Speech recognition for 151 languages
16
+ - Any-to-any language speech translation
17
+ - Timestamp prediction
18
+ - Long-form transcription
19
+ - Language identification
20
+
21
+ For more details, please check out our [paper](https://arxiv.org/abs/2309.13876) (Peng et al., ASRU 2023).
22
+
23
+ ```
24
+ @article{peng2023owsm,
25
+ title={Reproducing Whisper-Style Training Using an Open-Source Toolkit and Publicly Available Data},
26
+ author={Yifan Peng and Jinchuan Tian and Brian Yan and Dan Berrebbi and Xuankai Chang and Xinjian Li and Jiatong Shi and Siddhant Arora and William Chen and Roshan Sharma and Wangyou Zhang and Yui Sudo and Muhammad Shakeel and Jee-weon Jung and Soumi Maiti and Shinji Watanabe},
27
+ journal={arXiv preprint arXiv:2309.13876},
28
+ year={2023}
29
+ }
30
+ ```
31
+
32
+ Disclaimer: OWSM has not been thoroughly evaluated in all tasks. Due to limited training data, it may not perform well for certain language directions.
33
+ '''
34
+
35
+ if not torch.cuda.is_available():
36
+ raise RuntimeError("Please use GPU for better speed")
37
+
38
+ model_path = "owsm_v3/exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/valid.acc.ave_5best.till50epoch.pth"
39
+ device = "cuda" # if torch.cuda.is_available() else "cpu"
40
 
41
  speech2text = Speech2Text.from_pretrained(
42
+ s2t_model_file=model_path,
43
  device=device,
44
  category_sym="<eng>",
45
  beam_size=5,
46
+ # quantize_s2t_model=not torch.cuda.is_available(),
47
+ # quantize_dtype="float16",
48
  )
49
 
50
  speech2lang = Speech2Lang.from_pretrained(
51
+ s2t_model_file=model_path,
52
  device=device,
53
  nbest=1,
54
+ # quantize_s2t_model=not torch.cuda.is_available(),
55
+ # quantize_dtype="float16",
56
  )
57
 
58
 
 
139
  return code2lang[lang_code], text
140
 
141
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
  demo = gr.Interface(
143
  predict,
144
  inputs=[
145
+ gr.Audio(type="filepath", label="Speech Input", max_length=300, sources=["microphone", "upload"]),
146
  gr.Dropdown(choices=list(lang2code), value="English", label="Language", info="Language of input speech. Select 'Unknown' (1st option) to detect it automatically."),
147
  gr.Dropdown(choices=list(task2code), value="Automatic Speech Recognition", label="Task", info="Task to perform on input speech."),
148
  gr.Slider(minimum=1, maximum=20, step=1, value=5, label="Beam Size", info="Beam size used in beam search."),
 
153
  gr.Text(label="Predicted Language", info="Language identification is performed if language is unknown."),
154
  gr.Text(label="Predicted Text", info="Best hypothesis."),
155
  ],
156
+ title=TITLE,
157
+ description=DESCRIPTION,
158
+ allow_flagging="never",
159
  )
160
 
161
 
162
  if __name__ == "__main__":
163
  demo.launch(
164
  show_api=False,
165
+ share=True,
166
+ # debug=True,
167
  )