Spaces:
Sleeping
Sleeping
File size: 10,672 Bytes
96e69cb bf9aad7 ec44f6b bf9aad7 ec44f6b d49740e 5d9c695 ec44f6b d49740e a853413 ec44f6b a853413 5d9c695 ec44f6b 5d9c695 582bb63 ec44f6b 582bb63 5d9c695 a853413 582bb63 5d9c695 582bb63 5d9c695 582bb63 638c588 5d9c695 ec44f6b 5d9c695 ec44f6b bf9aad7 ec44f6b bf9aad7 ec44f6b bf9aad7 ec44f6b bf9aad7 ec44f6b bf9aad7 ec44f6b bf9aad7 ec44f6b bf9aad7 ec44f6b bf9aad7 ec44f6b bf9aad7 ec44f6b bf9aad7 ec44f6b bf9aad7 638c588 bf9aad7 638c588 bf9aad7 ec44f6b bf9aad7 638c588 bf9aad7 e41dd19 ec44f6b bf9aad7 ea91f53 bf9aad7 5d9c695 bf9aad7 5d9c695 bf9aad7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 |
import torch
import gradio as gr
import librosa
from espnet2.bin.s2t_inference_language import Speech2Language
from espnet2.bin.s2t_inference import Speech2Text
# Heading text; passed as `title` to the gr.Interface defined later in this file.
TITLE="OWSM: Open Whisper-style Speech Model from CMU WAVLab"
# Markdown/HTML blurb passed as `description` to the same gr.Interface.
# It covers the model summary, supported tasks, the demo's input limit,
# a disclaimer, and a collapsible <details> section holding BibTeX citations.
DESCRIPTION='''
OWSM (pronounced as "awesome") is a series of Open Whisper-style Speech Models from [CMU WAVLab](https://www.wavlab.org/).
We reproduce Whisper-style training using publicly available data and an open-source toolkit [ESPnet](https://github.com/espnet/espnet).
For more details, please check our [website](https://www.wavlab.org/activities/2024/owsm/) or [paper](https://arxiv.org/abs/2309.13876) (Peng et al., ASRU 2023).
The latest demo uses OWSM v3.1 based on [E-Branchformer](https://arxiv.org/abs/2210.00077).
OWSM v3.1 has 1.02B parameters and is trained on 180k hours of labelled data. It supports various speech-to-text tasks:
- Speech recognition in 151 languages
- Any-to-any language speech translation
- Utterance-level timestamp prediction
- Long-form transcription
- Language identification
As a demo, the input speech should not exceed 2 minutes. We also limit the maximum number of tokens to be generated.
Please try our [Colab demo](https://colab.research.google.com/drive/1zKI3ZY_OtZd6YmVeED6Cxy1QwT1mqv9O?usp=sharing) if you want to explore more features.
Disclaimer: OWSM has not been thoroughly evaluated in all tasks. Due to limited training data, it may not perform well for certain languages.
Please consider citing the following related papers if you find our work helpful.
<details><summary>citations</summary>
<p>
```
@inproceedings{peng2024owsm31,
title={OWSM v3.1: Better and Faster Open Whisper-Style Speech Models based on E-Branchformer},
author={Yifan Peng and Jinchuan Tian and William Chen and Siddhant Arora and Brian Yan and Yui Sudo and Muhammad Shakeel and Kwanghee Choi and Jiatong Shi and Xuankai Chang and Jee-weon Jung and Shinji Watanabe},
booktitle={Proc. INTERSPEECH},
year={2024}
}
@inproceedings{peng2023owsm,
title={Reproducing Whisper-Style Training Using an Open-Source Toolkit and Publicly Available Data},
author={Yifan Peng and Jinchuan Tian and Brian Yan and Dan Berrebbi and Xuankai Chang and Xinjian Li and Jiatong Shi and Siddhant Arora and William Chen and Roshan Sharma and Wangyou Zhang and Yui Sudo and Muhammad Shakeel and Jee-weon Jung and Soumi Maiti and Shinji Watanabe},
booktitle={Proc. ASRU},
year={2023}
}
```
</p>
</details>
'''
# Identifier of the pretrained model handed to both from_pretrained() calls.
model_tag = "espnet/owsm_v3.1_ebf"

# The demo refuses to start without a CUDA device.
if torch.cuda.is_available():
    device = "cuda"
else:
    raise RuntimeError("Please use GPU for better inference speed.")

# Language-identification wrapper; keeps only the single best hypothesis.
s2l = Speech2Language.from_pretrained(model_tag=model_tag, device=device, nbest=1)

# Speech-to-text wrapper used for both recognition and translation.
s2t = Speech2Text.from_pretrained(
    model_tag=model_tag,
    device=device,
    beam_size=5,
    ctc_weight=0.0,
    maxlenratio=0.0,
    # The remaining values are defaults that can be overwritten in __call__.
    lang_sym="<eng>",
    task_sym="<asr>",
    predict_time=False,
)
# Language codes and their display names; the two lists are index-aligned.
iso_codes = ['abk', 'afr', 'amh', 'ara', 'asm', 'ast', 'aze', 'bak', 'bas', 'bel', 'ben', 'bos', 'bre', 'bul', 'cat', 'ceb', 'ces', 'chv', 'ckb', 'cmn', 'cnh', 'cym', 'dan', 'deu', 'dgd', 'div', 'ell', 'eng', 'epo', 'est', 'eus', 'fas', 'fil', 'fin', 'fra', 'frr', 'ful', 'gle', 'glg', 'grn', 'guj', 'hat', 'hau', 'heb', 'hin', 'hrv', 'hsb', 'hun', 'hye', 'ibo', 'ina', 'ind', 'isl', 'ita', 'jav', 'jpn', 'kab', 'kam', 'kan', 'kat', 'kaz', 'kea', 'khm', 'kin', 'kir', 'kmr', 'kor', 'lao', 'lav', 'lga', 'lin', 'lit', 'ltz', 'lug', 'luo', 'mal', 'mar', 'mas', 'mdf', 'mhr', 'mkd', 'mlt', 'mon', 'mri', 'mrj', 'mya', 'myv', 'nan', 'nep', 'nld', 'nno', 'nob', 'npi', 'nso', 'nya', 'oci', 'ori', 'orm', 'ory', 'pan', 'pol', 'por', 'pus', 'quy', 'roh', 'ron', 'rus', 'sah', 'sat', 'sin', 'skr', 'slk', 'slv', 'sna', 'snd', 'som', 'sot', 'spa', 'srd', 'srp', 'sun', 'swa', 'swe', 'swh', 'tam', 'tat', 'tel', 'tgk', 'tgl', 'tha', 'tig', 'tir', 'tok', 'tpi', 'tsn', 'tuk', 'tur', 'twi', 'uig', 'ukr', 'umb', 'urd', 'uzb', 'vie', 'vot', 'wol', 'xho', 'yor', 'yue', 'zho', 'zul']
lang_names = ['Abkhazian', 'Afrikaans', 'Amharic', 'Arabic', 'Assamese', 'Asturian', 'Azerbaijani', 'Bashkir', 'Basa (Cameroon)', 'Belarusian', 'Bengali', 'Bosnian', 'Breton', 'Bulgarian', 'Catalan', 'Cebuano', 'Czech', 'Chuvash', 'Central Kurdish', 'Mandarin Chinese', 'Hakha Chin', 'Welsh', 'Danish', 'German', 'Dagaari Dioula', 'Dhivehi', 'Modern Greek (1453-)', 'English', 'Esperanto', 'Estonian', 'Basque', 'Persian', 'Filipino', 'Finnish', 'French', 'Northern Frisian', 'Fulah', 'Irish', 'Galician', 'Guarani', 'Gujarati', 'Haitian', 'Hausa', 'Hebrew', 'Hindi', 'Croatian', 'Upper Sorbian', 'Hungarian', 'Armenian', 'Igbo', 'Interlingua (International Auxiliary Language Association)', 'Indonesian', 'Icelandic', 'Italian', 'Javanese', 'Japanese', 'Kabyle', 'Kamba (Kenya)', 'Kannada', 'Georgian', 'Kazakh', 'Kabuverdianu', 'Khmer', 'Kinyarwanda', 'Kirghiz', 'Northern Kurdish', 'Korean', 'Lao', 'Latvian', 'Lungga', 'Lingala', 'Lithuanian', 'Luxembourgish', 'Ganda', 'Luo (Kenya and Tanzania)', 'Malayalam', 'Marathi', 'Masai', 'Moksha', 'Eastern Mari', 'Macedonian', 'Maltese', 'Mongolian', 'Maori', 'Western Mari', 'Burmese', 'Erzya', 'Min Nan Chinese', 'Nepali (macrolanguage)', 'Dutch', 'Norwegian Nynorsk', 'Norwegian Bokmål', 'Nepali (individual language)', 'Pedi', 'Nyanja', 'Occitan (post 1500)', 'Oriya (macrolanguage)', 'Oromo', 'Odia', 'Panjabi', 'Polish', 'Portuguese', 'Pushto', 'Ayacucho Quechua', 'Romansh', 'Romanian', 'Russian', 'Yakut', 'Santali', 'Sinhala', 'Saraiki', 'Slovak', 'Slovenian', 'Shona', 'Sindhi', 'Somali', 'Southern Sotho', 'Spanish', 'Sardinian', 'Serbian', 'Sundanese', 'Swahili (macrolanguage)', 'Swedish', 'Swahili (individual language)', 'Tamil', 'Tatar', 'Telugu', 'Tajik', 'Tagalog', 'Thai', 'Tigre', 'Tigrinya', 'Toki Pona', 'Tok Pisin', 'Tswana', 'Turkmen', 'Turkish', 'Twi', 'Uighur', 'Ukrainian', 'Umbundu', 'Urdu', 'Uzbek', 'Vietnamese', 'Votic', 'Wolof', 'Xhosa', 'Yoruba', 'Yue Chinese', 'Chinese', 'Zulu']

# Task codes and their display names; the two lists are index-aligned.
task_codes = ['asr', 'st_ara', 'st_cat', 'st_ces', 'st_cym', 'st_deu', 'st_eng', 'st_est', 'st_fas', 'st_fra', 'st_ind', 'st_ita', 'st_jpn', 'st_lav', 'st_mon', 'st_nld', 'st_por', 'st_ron', 'st_rus', 'st_slv', 'st_spa', 'st_swe', 'st_tam', 'st_tur', 'st_vie', 'st_zho']
task_names = ['Automatic Speech Recognition', 'Translate to Arabic', 'Translate to Catalan', 'Translate to Czech', 'Translate to Welsh', 'Translate to German', 'Translate to English', 'Translate to Estonian', 'Translate to Persian', 'Translate to French', 'Translate to Indonesian', 'Translate to Italian', 'Translate to Japanese', 'Translate to Latvian', 'Translate to Mongolian', 'Translate to Dutch', 'Translate to Portuguese', 'Translate to Romanian', 'Translate to Russian', 'Translate to Slovenian', 'Translate to Spanish', 'Translate to Swedish', 'Translate to Tamil', 'Translate to Turkish', 'Translate to Vietnamese', 'Translate to Chinese']

# Display name -> language code. The 'Unknown' entry comes first; the real
# languages follow in alphabetical order of their display names.
lang2code = {'Unknown': 'none'}
lang2code.update(sorted(zip(lang_names, iso_codes), key=lambda pair: pair[0]))

# Display name -> task code, in alphabetical order of the display names.
task2code = {
    name: code
    for name, code in sorted(zip(task_names, task_codes), key=lambda pair: pair[0])
}

# Reverse mapping: language code -> display name (includes 'none' -> 'Unknown').
code2lang = {code: name for name, code in lang2code.items()}
# Copied from Whisper utils
def format_timestamp(
    seconds: float, always_include_hours: bool = False, decimal_marker: str = "."
):
    """Render a duration in seconds as ``[HH:]MM:SS<marker>mmm``.

    The input is rounded to the nearest millisecond before being split into
    fields. The hours field is emitted only when it is non-zero or when
    ``always_include_hours`` is set.

    Args:
        seconds: Non-negative duration in seconds.
        always_include_hours: Emit the ``HH:`` prefix even when hours is 0.
        decimal_marker: Separator placed between seconds and milliseconds.

    Returns:
        The formatted timestamp string.

    Raises:
        AssertionError: If ``seconds`` is negative (when assertions are enabled).
    """
    assert seconds >= 0, "non-negative timestamp expected"

    # Work in whole milliseconds and peel off one unit at a time.
    remaining_ms = round(seconds * 1000.0)
    hours, remaining_ms = divmod(remaining_ms, 3_600_000)
    minutes, remaining_ms = divmod(remaining_ms, 60_000)
    whole_seconds, millis = divmod(remaining_ms, 1_000)

    if always_include_hours or hours > 0:
        hours_prefix = f"{hours:02d}:"
    else:
        hours_prefix = ""

    return f"{hours_prefix}{minutes:02d}:{whole_seconds:02d}{decimal_marker}{millis:03d}"
def predict(audio_path, src_lang: str, task: str, beam_size, long_form: bool, text_prev: str,):
    """Transcribe or translate one audio file, detecting its language if asked.

    Args:
        audio_path: Path of the audio file to decode.
        src_lang: Display name of the input language (a key of ``lang2code``).
            The 'Unknown' entry triggers automatic language identification.
        task: Display name of the task (a key of ``task2code``).
        beam_size: Beam size for beam search; converted to ``int``.
        long_form: If true, try long-form decoding first and fall back to
            standard decoding when it fails.
        text_prev: Optional text prompt passed to the decoder.

    Returns:
        A ``(language_name, text)`` tuple. In long-form mode ``text`` holds
        one ``[start --> end] segment`` line per decoded utterance.

    Raises:
        gr.Error: If no audio was provided.
    """
    # Without this guard a missing input fails inside librosa with an opaque error.
    if audio_path is None:
        raise gr.Error("Please record or upload an audio clip first.")

    task_sym = f'<{task2code[task]}>'
    # NOTE(review): this mutates the shared module-level decoder, so concurrent
    # requests would see each other's settings — confirm requests are serialized.
    s2t.beam_search.beam_size = int(beam_size)

    # Our model is trained on 30s and 16kHz
    speech, rate = librosa.load(audio_path, sr=16000)  # speech has shape (len,); resample to 16k Hz

    lang_code = lang2code[src_lang]
    if lang_code == 'none':
        # Detect language using the first 30s of speech.
        # The best hypothesis is a "<xxx>" symbol; strip the angle brackets.
        lang_code = s2l(speech)[0][0].strip()[1:-1]
    lang_sym = f'<{lang_code}>'

    # ASR or ST
    if long_form:
        try:
            # Presumably a negative maxlenratio is read by the decoder as an
            # absolute cap on output length — TODO confirm against ESPnet.
            s2t.maxlenratio = -300
            utts = s2t.decode_long(
                speech,
                condition_on_prev_text=False,
                init_text=text_prev,
                end_time_threshold="<29.00>",
                lang_sym=lang_sym,
                task_sym=task_sym,
            )
            text = '\n'.join(
                f"[{format_timestamp(seconds=t1)} --> {format_timestamp(seconds=t2)}] {res}"
                for t1, t2, res in utts
            )
            return code2lang[lang_code], text
        except Exception as exc:
            # Deliberate best-effort: report the cause, then fall through to
            # standard decoding. KeyboardInterrupt/SystemExit are no longer
            # swallowed, unlike with a bare ``except``.
            print("An exception occurred in long-form decoding. Fall back to standard decoding (only first 30s)")
            print(f"Long-form decoding error: {exc!r}")

    s2t.maxlenratio = -min(300, int((len(speech) / rate) * 10))  # assuming 10 tokens per second
    text = s2t(speech, text_prev, lang_sym=lang_sym, task_sym=task_sym)[0][-2]
    return code2lang[lang_code], text
# Gradio UI. The six input components map positionally onto predict()'s
# parameters; the two text outputs receive its (language, text) return value.
demo = gr.Interface(
    fn=predict,
    inputs=[
        gr.Audio(
            type="filepath",
            label="Input Speech (<120s)",
            max_length=120,
            sources=["microphone", "upload"],
            show_download_button=True,
            show_share_button=True,
        ),
        gr.Dropdown(
            choices=list(lang2code),
            value="English",
            label="Language",
            info="Language of input speech. Select 'Unknown' (1st option) to detect it automatically.",
        ),
        gr.Dropdown(
            choices=list(task2code),
            value="Automatic Speech Recognition",
            label="Task",
            info="Task to perform on input speech.",
        ),
        gr.Slider(
            minimum=1,
            maximum=5,
            step=1,
            value=5,
            label="Beam Size",
            info="Beam size used in beam search.",
        ),
        gr.Checkbox(
            label="Long Form (Experimental)",
            info="Perform long-form decoding. If an exception happens, it will fall back to standard decoding on the initial 30s.",
        ),
        gr.Text(
            label="Text Prompt (Optional)",
            info="Generation will be conditioned on this prompt if provided",
        ),
    ],
    outputs=[
        gr.Text(
            label="Predicted Language",
            info="Language identification is performed if language is unknown.",
        ),
        gr.Text(
            label="Predicted Text",
            info="Best hypothesis.",
        ),
    ],
    title=TITLE,
    description=DESCRIPTION,
    allow_flagging="never",
)
# Start the web UI only when the file is executed as a script.
if __name__ == "__main__":
    launch_options = {
        "show_api": False,
        "share": True,
        # "debug": True,
    }
    demo.launch(**launch_options)
|