pyf98 committed on
Commit
ec44f6b
1 Parent(s): 582bb63

upgrade to new interface

Files changed (42)
  1. app.py +41 -47
  2. owsm_v3.1_ebf/README.md +0 -80
  3. owsm_v3.1_ebf/data/token_list/bpe_unigram50000/bpe.model +0 -3
  4. owsm_v3.1_ebf/data/token_list/bpe_unigram50000/tokens.txt +0 -0
  5. owsm_v3.1_ebf/exp/s2t_stats_raw_bpe50000/train/feats_stats.npz +0 -3
  6. owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/config.yaml +0 -0
  7. owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/images/acc.png +0 -0
  8. owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/images/backward_time.png +0 -0
  9. owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/images/cer.png +0 -0
  10. owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/images/cer_ctc.png +0 -0
  11. owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/images/clip.png +0 -0
  12. owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/images/forward_time.png +0 -0
  13. owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/images/gpu_max_cached_mem_GB.png +0 -0
  14. owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/images/grad_norm.png +0 -0
  15. owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/images/iter_time.png +0 -0
  16. owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/images/loss.png +0 -0
  17. owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/images/loss_att.png +0 -0
  18. owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/images/loss_ctc.png +0 -0
  19. owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/images/loss_scale.png +0 -0
  20. owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/images/optim0_lr0.png +0 -0
  21. owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/images/optim_step_time.png +0 -0
  22. owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/images/train_time.png +0 -0
  23. owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/images/wer.png +0 -0
  24. owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/train.1.log +0 -0
  25. owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/train.10.log +0 -0
  26. owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/train.13.log +0 -0
  27. owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/train.2.log +0 -0
  28. owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/train.3.log +0 -0
  29. owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/train.4.log +0 -0
  30. owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/train.5.log +0 -0
  31. owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/train.6.log +0 -0
  32. owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/train.7.log +0 -0
  33. owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/train.8.log +0 -0
  34. owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/train.9.log +0 -0
  35. owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/train.log +0 -0
  36. owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/valid.total_count.ave_5best.till45epoch.pth +0 -3
  37. owsm_v3.1_ebf/meta.yaml +0 -8
  38. owsm_v3/data/token_list/bpe_unigram50000/bpe.model +0 -3
  39. owsm_v3/exp/s2t_stats_raw_bpe50000/train/feats_stats.npz +0 -3
  40. owsm_v3/exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/config.yaml +0 -0
  41. owsm_v3/exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/valid.acc.ave_5best.till50epoch.pth +0 -3
  42. requirements.txt +1 -1
app.py CHANGED
@@ -2,31 +2,32 @@ import torch
  import gradio as gr
  import librosa
  
+ from espnet2.bin.s2t_inference_language import Speech2Language
  from espnet2.bin.s2t_inference import Speech2Text
- from espnet2.bin.s2t_inference_language import Speech2Text as Speech2Lang
  
  
- TITLE="OWSM: An Open Whisper-style Speech Model from CMU WAVLab"
+ TITLE="OWSM: Open Whisper-style Speech Model from CMU WAVLab"
  
  DESCRIPTION='''
- OWSM is an Open Whisper-style Speech Model from [CMU WAVLab](https://www.wavlab.org/).
- It reproduces Whisper-style training using publicly available data and an open-source toolkit [ESPnet](https://github.com/espnet/espnet).
- For more details, please check out our [paper](https://arxiv.org/abs/2309.13876) (Peng et al., ASRU 2023).
+ OWSM (pronounced as "awesome") is a series of Open Whisper-style Speech Models from [CMU WAVLab](https://www.wavlab.org/).
+ We reproduce Whisper-style training using publicly available data and an open-source toolkit [ESPnet](https://github.com/espnet/espnet).
+ For more details, please check our [website](https://www.wavlab.org/activities/2024/owsm/) or [paper](https://arxiv.org/abs/2309.13876) (Peng et al., ASRU 2023).
  
- OWSM v3.1 is an improved version of OWSM v3. It significantly outperforms OWSM v3 in almost all evaluation benchmarks.
- We do not include any new training data. Instead, we utilize a state-of-the-art speech encoder, [E-Branchformer](https://arxiv.org/abs/2210.00077).
+ The latest demo uses OWSM v3.1, an improved version of OWSM v3.
+ OWSM v3.1 outperforms OWSM v3 in almost all evaluation benchmarks while being faster during inference.
+ Note that we do not use extra training data. Instead, we utilize a state-of-the-art speech encoder, [E-Branchformer](https://arxiv.org/abs/2210.00077), to enhance the speech modeling capability.
  
- OWSM v3.1 has 1.02B parameters and is trained on 180k hours of paired speech data. It supports various speech-to-text tasks:
+ OWSM v3.1 has 1.02B parameters and is trained on 180k hours of labelled data. It supports various speech-to-text tasks:
  - Speech recognition for 151 languages
  - Any-to-any language speech translation
- - Timestamp prediction
+ - Utterance-level timestamp prediction
  - Long-form transcription
  - Language identification
  
  As a demo, the input speech should not exceed 2 minutes. We also limit the maximum number of tokens to be generated.
  Please try our [Colab demo](https://colab.research.google.com/drive/1zKI3ZY_OtZd6YmVeED6Cxy1QwT1mqv9O?usp=sharing) if you want to explore more features.
  
- Disclaimer: OWSM has not been thoroughly evaluated in all tasks. Due to limited training data, it may not perform well for certain language directions.
+ Disclaimer: OWSM has not been thoroughly evaluated in all tasks. Due to limited training data, it may not perform well for certain languages.
  
  Please consider citing the following related papers if you find our work helpful.
  
@@ -66,30 +67,29 @@ Please consider citing the following related papers if you find our work helpful
  '''
  
  if not torch.cuda.is_available():
-     raise RuntimeError("Please use GPU for better speed")
+     raise RuntimeError("Please use GPU for better inference speed.")
  
- # model_path = "owsm_v3/exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/valid.acc.ave_5best.till50epoch.pth"
- model_path = "owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/valid.total_count.ave_5best.till45epoch.pth"
- device = "cuda" # if torch.cuda.is_available() else "cpu"
+ model_tag = "espnet/owsm_v3.1_ebf"
+ device = "cuda"
  
- speech2text = Speech2Text.from_pretrained(
-     s2t_model_file=model_path,
+ s2l = Speech2Language.from_pretrained(
+     model_tag=model_tag,
      device=device,
-     category_sym="<eng>",
-     beam_size=5,
-     # quantize_s2t_model=not torch.cuda.is_available(),
-     # quantize_dtype="float16",
+     nbest=1,
  )
  
- speech2lang = Speech2Lang.from_pretrained(
-     s2t_model_file=model_path,
+ s2t = Speech2Text.from_pretrained(
+     model_tag=model_tag,
      device=device,
-     nbest=1,
-     # quantize_s2t_model=not torch.cuda.is_available(),
-     # quantize_dtype="float16",
+     beam_size=5,
+     ctc_weight=0.0,
+     maxlenratio=0.0,
+     # below are default values which can be overwritten in __call__
+     lang_sym="<eng>",
+     task_sym="<asr>",
+     predict_time=False,
  )
  
- 
  iso_codes = ['abk', 'afr', 'amh', 'ara', 'asm', 'ast', 'aze', 'bak', 'bas', 'bel', 'ben', 'bos', 'bre', 'bul', 'cat', 'ceb', 'ces', 'chv', 'ckb', 'cmn', 'cnh', 'cym', 'dan', 'deu', 'dgd', 'div', 'ell', 'eng', 'epo', 'est', 'eus', 'fas', 'fil', 'fin', 'fra', 'frr', 'ful', 'gle', 'glg', 'grn', 'guj', 'hat', 'hau', 'heb', 'hin', 'hrv', 'hsb', 'hun', 'hye', 'ibo', 'ina', 'ind', 'isl', 'ita', 'jav', 'jpn', 'kab', 'kam', 'kan', 'kat', 'kaz', 'kea', 'khm', 'kin', 'kir', 'kmr', 'kor', 'lao', 'lav', 'lga', 'lin', 'lit', 'ltz', 'lug', 'luo', 'mal', 'mar', 'mas', 'mdf', 'mhr', 'mkd', 'mlt', 'mon', 'mri', 'mrj', 'mya', 'myv', 'nan', 'nep', 'nld', 'nno', 'nob', 'npi', 'nso', 'nya', 'oci', 'ori', 'orm', 'ory', 'pan', 'pol', 'por', 'pus', 'quy', 'roh', 'ron', 'rus', 'sah', 'sat', 'sin', 'skr', 'slk', 'slv', 'sna', 'snd', 'som', 'sot', 'spa', 'srd', 'srp', 'sun', 'swa', 'swe', 'swh', 'tam', 'tat', 'tel', 'tgk', 'tgl', 'tha', 'tig', 'tir', 'tok', 'tpi', 'tsn', 'tuk', 'tur', 'twi', 'uig', 'ukr', 'umb', 'urd', 'uzb', 'vie', 'vot', 'wol', 'xho', 'yor', 'yue', 'zho', 'zul']
  lang_names = ['Abkhazian', 'Afrikaans', 'Amharic', 'Arabic', 'Assamese', 'Asturian', 'Azerbaijani', 'Bashkir', 'Basa (Cameroon)', 'Belarusian', 'Bengali', 'Bosnian', 'Breton', 'Bulgarian', 'Catalan', 'Cebuano', 'Czech', 'Chuvash', 'Central Kurdish', 'Mandarin Chinese', 'Hakha Chin', 'Welsh', 'Danish', 'German', 'Dagaari Dioula', 'Dhivehi', 'Modern Greek (1453-)', 'English', 'Esperanto', 'Estonian', 'Basque', 'Persian', 'Filipino', 'Finnish', 'French', 'Northern Frisian', 'Fulah', 'Irish', 'Galician', 'Guarani', 'Gujarati', 'Haitian', 'Hausa', 'Hebrew', 'Hindi', 'Croatian', 'Upper Sorbian', 'Hungarian', 'Armenian', 'Igbo', 'Interlingua (International Auxiliary Language Association)', 'Indonesian', 'Icelandic', 'Italian', 'Javanese', 'Japanese', 'Kabyle', 'Kamba (Kenya)', 'Kannada', 'Georgian', 'Kazakh', 'Kabuverdianu', 'Khmer', 'Kinyarwanda', 'Kirghiz', 'Northern Kurdish', 'Korean', 'Lao', 'Latvian', 'Lungga', 'Lingala', 'Lithuanian', 'Luxembourgish', 'Ganda', 'Luo (Kenya and Tanzania)', 'Malayalam', 'Marathi', 'Masai', 'Moksha', 'Eastern Mari', 'Macedonian', 'Maltese', 'Mongolian', 'Maori', 'Western Mari', 'Burmese', 'Erzya', 'Min Nan Chinese', 'Nepali (macrolanguage)', 'Dutch', 'Norwegian Nynorsk', 'Norwegian Bokmål', 'Nepali (individual language)', 'Pedi', 'Nyanja', 'Occitan (post 1500)', 'Oriya (macrolanguage)', 'Oromo', 'Odia', 'Panjabi', 'Polish', 'Portuguese', 'Pushto', 'Ayacucho Quechua', 'Romansh', 'Romanian', 'Russian', 'Yakut', 'Santali', 'Sinhala', 'Saraiki', 'Slovak', 'Slovenian', 'Shona', 'Sindhi', 'Somali', 'Southern Sotho', 'Spanish', 'Sardinian', 'Serbian', 'Sundanese', 'Swahili (macrolanguage)', 'Swedish', 'Swahili (individual language)', 'Tamil', 'Tatar', 'Telugu', 'Tajik', 'Tagalog', 'Thai', 'Tigre', 'Tigrinya', 'Toki Pona', 'Tok Pisin', 'Tswana', 'Turkmen', 'Turkish', 'Twi', 'Uighur', 'Ukrainian', 'Umbundu', 'Urdu', 'Uzbek', 'Vietnamese', 'Votic', 'Wolof', 'Xhosa', 'Yoruba', 'Yue Chinese', 'Chinese', 'Zulu']
  
@@ -127,34 +127,29 @@ def format_timestamp(
  
  
  def predict(audio_path, src_lang: str, task: str, beam_size, long_form: bool, text_prev: str,):
-     speech2text.task_id = speech2text.converter.token2id[f'<{task2code[task]}>']
-     speech2text.beam_search.beam_size = int(beam_size)
+     task_sym = f'<{task2code[task]}>'
+     s2t.beam_search.beam_size = int(beam_size)
  
      # Our model is trained on 30s and 16kHz
-     _sr = 16000
-     _dur = 30
-     speech, rate = librosa.load(audio_path, sr=_sr) # speech has shape (len,); resample to 16k Hz
+     speech, rate = librosa.load(audio_path, sr=16000) # speech has shape (len,); resample to 16k Hz
  
-     # Detect language using the first 30s of speech
      lang_code = lang2code[src_lang]
      if lang_code == 'none':
-         lang_code = speech2lang(
-             librosa.util.fix_length(speech, size=(_sr * _dur))
-         )[0][0].strip()[1:-1]
-     speech2text.category_id = speech2text.converter.token2id[f'<{lang_code}>']
+         # Detect language using the first 30s of speech
+         lang_code = s2l(speech)[0][0].strip()[1:-1]
+     lang_sym = f'<{lang_code}>'
  
      # ASR or ST
-     if long_form: # speech will be padded in decode_long()
+     if long_form:
          try:
-             speech2text.maxlenratio = -300
-             utts = speech2text.decode_long(
+             s2t.maxlenratio = -300
+             utts = s2t.decode_long(
                  speech,
-                 segment_sec=_dur,
-                 fs=_sr,
                  condition_on_prev_text=False,
                  init_text=text_prev,
-                 start_time="<0.00>",
-                 end_time_threshold="<29.50>",
+                 end_time_threshold="<29.00>",
+                 lang_sym=lang_sym,
+                 task_sym=task_sym,
              )
  
              text = []
@@ -166,9 +161,8 @@ def predict(audio_path, src_lang: str, task: str, beam_size, long_form: bool, text_prev: str,):
          except:
              print("An exception occurred in long-form decoding. Fall back to standard decoding (only first 30s)")
  
-     speech2text.maxlenratio = -min(300, int((len(speech) / rate) * 10)) # assuming 10 tokens per second
-     speech = librosa.util.fix_length(speech, size=(_sr * _dur))
-     text = speech2text(speech, text_prev)[0][3]
+     s2t.maxlenratio = -min(300, int((len(speech) / rate) * 10)) # assuming 10 tokens per second
+     text = s2t(speech, text_prev, lang_sym=lang_sym, task_sym=task_sym)[0][-2]
  
      return code2lang[lang_code], text
  
@@ -180,7 +174,7 @@ demo = gr.Interface(
          gr.Dropdown(choices=list(lang2code), value="English", label="Language", info="Language of input speech. Select 'Unknown' (1st option) to detect it automatically."),
          gr.Dropdown(choices=list(task2code), value="Automatic Speech Recognition", label="Task", info="Task to perform on input speech."),
          gr.Slider(minimum=1, maximum=5, step=1, value=5, label="Beam Size", info="Beam size used in beam search."),
-         gr.Checkbox(label="Long Form (Experimental)", info="Perform long-form decoding for audios that are longer than 30s. If an exception happens, it will fall back to standard decoding on the initial 30s."),
+         gr.Checkbox(label="Long Form (Experimental)", info="Perform long-form decoding. If an exception happens, it will fall back to standard decoding on the initial 30s."),
          gr.Text(label="Text Prompt (Optional)", info="Generation will be conditioned on this prompt if provided"),
      ],
      outputs=[
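
For reference, the core of the new interface can be exercised outside the Gradio app. The sketch below is assembled from the calls this commit adds to app.py (the `espnet/owsm_v3.1_ebf` model tag, `Speech2Language`/`Speech2Text.from_pretrained`, and the `lang_sym`/`task_sym` call arguments); the audio file name and the CUDA assumption are placeholders, not part of the commit.

```python
import librosa
from espnet2.bin.s2t_inference import Speech2Text
from espnet2.bin.s2t_inference_language import Speech2Language

model_tag = "espnet/owsm_v3.1_ebf"

# Language identification: new interface loads directly from a Hub model tag.
s2l = Speech2Language.from_pretrained(model_tag=model_tag, device="cuda", nbest=1)

# Speech-to-text: lang_sym/task_sym set defaults that can be overridden per call.
s2t = Speech2Text.from_pretrained(
    model_tag=model_tag,
    device="cuda",
    beam_size=5,
    ctc_weight=0.0,
    maxlenratio=0.0,
    lang_sym="<eng>",
    task_sym="<asr>",
    predict_time=False,
)

# "example.wav" is a hypothetical input file; OWSM expects 16 kHz audio.
speech, rate = librosa.load("example.wav", sr=16000)

# Detected token looks like "<eng>"; strip the angle brackets as app.py does.
lang_code = s2l(speech)[0][0].strip()[1:-1]

# Short-form ASR; as in app.py, the text is the second-to-last field of the first hypothesis.
text = s2t(speech, "", lang_sym=f"<{lang_code}>", task_sym="<asr>")[0][-2]
print(lang_code, text)
```

Loading by `model_tag` replaces the locally bundled checkpoints, which is why the `owsm_v3` and `owsm_v3.1_ebf` directories are deleted in this commit.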
owsm_v3.1_ebf/README.md DELETED
@@ -1,80 +0,0 @@
- ---
- tags:
- - espnet
- - audio
- - automatic-speech-recognition
- - speech-translation
- language: multilingual
- datasets:
- - owsm_v3.1
- license: cc-by-4.0
- ---
-
- ## OWSM: Open Whisper-style Speech Model
-
- [OWSM](https://arxiv.org/abs/2309.13876) is an Open Whisper-style Speech Model from [CMU WAVLab](https://www.wavlab.org/). It reproduces Whisper-style training using publicly available data and an open-source toolkit [ESPnet](https://github.com/espnet/espnet).
-
- Our demo is available [here](https://huggingface.co/spaces/pyf98/OWSM_v3_demo).
-
- **OWSM v3.1 is an improved version of OWSM v3. It significantly outperforms OWSM v3 in almost all evaluation benchmarks.**
- We do not include any new training data. Instead, we utilize a state-of-the-art speech encoder, [E-Branchformer](https://arxiv.org/abs/2210.00077).
-
- OWSM v3.1 has 1.02B parameters in total and is trained on 180k hours of public speech data.
- Specifically, it supports the following speech-to-text tasks:
- - Speech recognition
- - Any-to-any-language speech translation
- - Utterance-level alignment
- - Long-form transcription
- - Language identification
-
-
- ### Citing OWSM, Branchformers and ESPnet
-
- ```BibTex
- @article{peng2023owsm,
-   title={Reproducing Whisper-Style Training Using an Open-Source Toolkit and Publicly Available Data},
-   author={Yifan Peng and Jinchuan Tian and Brian Yan and Dan Berrebbi and Xuankai Chang and Xinjian Li and Jiatong Shi and Siddhant Arora and William Chen and Roshan Sharma and Wangyou Zhang and Yui Sudo and Muhammad Shakeel and Jee-weon Jung and Soumi Maiti and Shinji Watanabe},
-   journal={arXiv preprint arXiv:2309.13876},
-   year={2023}
- }
- @inproceedings{peng23b_interspeech,
-   author={Yifan Peng and Kwangyoun Kim and Felix Wu and Brian Yan and Siddhant Arora and William Chen and Jiyang Tang and Suwon Shon and Prashant Sridhar and Shinji Watanabe},
-   title={{A Comparative Study on E-Branchformer vs Conformer in Speech Recognition, Translation, and Understanding Tasks}},
-   year=2023,
-   booktitle={Proc. INTERSPEECH 2023},
-   pages={2208--2212},
-   doi={10.21437/Interspeech.2023-1194}
- }
- @inproceedings{kim2023branchformer,
-   title={E-branchformer: Branchformer with enhanced merging for speech recognition},
-   author={Kim, Kwangyoun and Wu, Felix and Peng, Yifan and Pan, Jing and Sridhar, Prashant and Han, Kyu J and Watanabe, Shinji},
-   booktitle={2022 IEEE Spoken Language Technology Workshop (SLT)},
-   pages={84--91},
-   year={2023},
-   organization={IEEE}
- }
- @InProceedings{pmlr-v162-peng22a,
-   title = {Branchformer: Parallel {MLP}-Attention Architectures to Capture Local and Global Context for Speech Recognition and Understanding},
-   author = {Peng, Yifan and Dalmia, Siddharth and Lane, Ian and Watanabe, Shinji},
-   booktitle = {Proceedings of the 39th International Conference on Machine Learning},
-   pages = {17627--17643},
-   year = {2022},
-   editor = {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},
-   volume = {162},
-   series = {Proceedings of Machine Learning Research},
-   month = {17--23 Jul},
-   publisher = {PMLR},
-   pdf = {https://proceedings.mlr.press/v162/peng22a/peng22a.pdf},
-   url = {https://proceedings.mlr.press/v162/peng22a.html},
-   abstract = {Conformer has proven to be effective in many speech processing tasks. It combines the benefits of extracting local dependencies using convolutions and global dependencies using self-attention. Inspired by this, we propose a more flexible, interpretable and customizable encoder alternative, Branchformer, with parallel branches for modeling various ranged dependencies in end-to-end speech processing. In each encoder layer, one branch employs self-attention or its variant to capture long-range dependencies, while the other branch utilizes an MLP module with convolutional gating (cgMLP) to extract local relationships. We conduct experiments on several speech recognition and spoken language understanding benchmarks. Results show that our model outperforms both Transformer and cgMLP. It also matches with or outperforms state-of-the-art results achieved by Conformer. Furthermore, we show various strategies to reduce computation thanks to the two-branch architecture, including the ability to have variable inference complexity in a single trained model. The weights learned for merging branches indicate how local and global dependencies are utilized in different layers, which benefits model designing.}
- }
- @inproceedings{watanabe2018espnet,
-   author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
-   title={{ESPnet}: End-to-End Speech Processing Toolkit},
-   year={2018},
-   booktitle={Proceedings of Interspeech},
-   pages={2207--2211},
-   doi={10.21437/Interspeech.2018-1456},
-   url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
- }
- ```
owsm_v3.1_ebf/data/token_list/bpe_unigram50000/bpe.model DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:5d6327da127e870bcb8c737dceb3bd47ccbce63da74ddb094f64afe313d68c8c
- size 1041297
owsm_v3.1_ebf/data/token_list/bpe_unigram50000/tokens.txt DELETED
The diff for this file is too large to render. See raw diff
 
owsm_v3.1_ebf/exp/s2t_stats_raw_bpe50000/train/feats_stats.npz DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:7ef4b5e465110edf32eec024cf2427eedd677f5733bb87d6b2131e6984a6e13f
- size 1402
owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/config.yaml DELETED
The diff for this file is too large to render. See raw diff
 
owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/images/acc.png DELETED
Binary file (32.8 kB)
 
owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/images/backward_time.png DELETED
Binary file (46.4 kB)
 
owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/images/cer.png DELETED
Binary file (31.3 kB)
 
owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/images/cer_ctc.png DELETED
Binary file (24.1 kB)
 
owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/images/clip.png DELETED
Binary file (14.7 kB)
 
owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/images/forward_time.png DELETED
Binary file (39.9 kB)
 
owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/images/gpu_max_cached_mem_GB.png DELETED
Binary file (44.9 kB)
 
owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/images/grad_norm.png DELETED
Binary file (28.3 kB)
 
owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/images/iter_time.png DELETED
Binary file (43.7 kB)
 
owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/images/loss.png DELETED
Binary file (31.4 kB)
 
owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/images/loss_att.png DELETED
Binary file (33.2 kB)
 
owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/images/loss_ctc.png DELETED
Binary file (32.7 kB)
 
owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/images/loss_scale.png DELETED
Binary file (32.4 kB)
 
owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/images/optim0_lr0.png DELETED
Binary file (33.8 kB)
 
owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/images/optim_step_time.png DELETED
Binary file (31.4 kB)
 
owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/images/train_time.png DELETED
Binary file (42 kB)
 
owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/images/wer.png DELETED
Binary file (43.3 kB)
 
owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/train.1.log DELETED
The diff for this file is too large to render. See raw diff
 
owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/train.10.log DELETED
The diff for this file is too large to render. See raw diff
 
owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/train.13.log DELETED
The diff for this file is too large to render. See raw diff
 
owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/train.2.log DELETED
The diff for this file is too large to render. See raw diff
 
owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/train.3.log DELETED
The diff for this file is too large to render. See raw diff
 
owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/train.4.log DELETED
The diff for this file is too large to render. See raw diff
 
owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/train.5.log DELETED
The diff for this file is too large to render. See raw diff
 
owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/train.6.log DELETED
The diff for this file is too large to render. See raw diff
 
owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/train.7.log DELETED
The diff for this file is too large to render. See raw diff
 
owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/train.8.log DELETED
The diff for this file is too large to render. See raw diff
 
owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/train.9.log DELETED
The diff for this file is too large to render. See raw diff
 
owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/train.log DELETED
The diff for this file is too large to render. See raw diff
 
owsm_v3.1_ebf/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/valid.total_count.ave_5best.till45epoch.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:dfb6d34e9f03af6113ada55463d3abe26f133ff2c64e56c65419f9a469313ad3
- size 4068122375
owsm_v3.1_ebf/meta.yaml DELETED
@@ -1,8 +0,0 @@
- espnet: '202308'
- files:
-   s2t_model_file: exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/valid.total_count.ave_5best.till45epoch.pth
- python: 3.10.10 (main, Mar 21 2023, 18:45:11) [GCC 11.2.0]
- timestamp: 1703273348.000399
- torch: 1.13.1
- yaml_files:
-   s2t_train_config: exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/config.yaml
owsm_v3/data/token_list/bpe_unigram50000/bpe.model DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:623b8767f80bd60036d8c207b96b25306a8181aa5a702cdac2bf5e90348da174
- size 1042418
owsm_v3/exp/s2t_stats_raw_bpe50000/train/feats_stats.npz DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:904a9739b6cdd17afdb4b677627a21d3d1f8ffc99148d8cce07b65395b7e543d
- size 1402
owsm_v3/exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/config.yaml DELETED
The diff for this file is too large to render. See raw diff
 
owsm_v3/exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/valid.acc.ave_5best.till50epoch.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:0cae90cc63ba655d6e265f60543162cf5a7f8f92205efafeb89547f19403c977
- size 3554533303
requirements.txt CHANGED
@@ -1,3 +1,3 @@
  torch==2.1.0
  torchaudio
- espnet @ git+https://github.com/espnet/espnet@d3254133c595ea8271072ee49a1b4ceb3ed4fd7a
+ espnet @ git+https://github.com/espnet/espnet@7bcb169291f5d4a9b1fd00f8bfe554de84e50024
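
The updated espnet pin is what provides the `model_tag`-based `from_pretrained` interface used in the new app.py. A minimal sanity check after installing the requirements might look like the sketch below (a hypothetical check, not part of the commit):

```python
# Confirm the pinned espnet commit exposes the two entry points imported by app.py.
from espnet2.bin.s2t_inference import Speech2Text
from espnet2.bin.s2t_inference_language import Speech2Language

print("Speech2Text and Speech2Language are importable:",
      Speech2Text is not None and Speech2Language is not None)
```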