diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..91e4ac4d38b618f618c6ed7d94e827c724107678 100644 --- a/.gitattributes +++ b/.gitattributes @@ -2,34 +2,28 @@ *.arrow filter=lfs diff=lfs merge=lfs -text *.bin filter=lfs diff=lfs merge=lfs -text *.bz2 filter=lfs diff=lfs merge=lfs -text -*.ckpt filter=lfs diff=lfs merge=lfs -text *.ftz filter=lfs diff=lfs merge=lfs -text *.gz filter=lfs diff=lfs merge=lfs -text *.h5 filter=lfs diff=lfs merge=lfs -text *.joblib filter=lfs diff=lfs merge=lfs -text *.lfs.* filter=lfs diff=lfs merge=lfs -text -*.mlmodel filter=lfs diff=lfs merge=lfs -text *.model filter=lfs diff=lfs merge=lfs -text *.msgpack filter=lfs diff=lfs merge=lfs -text -*.npy filter=lfs diff=lfs merge=lfs -text -*.npz filter=lfs diff=lfs merge=lfs -text *.onnx filter=lfs diff=lfs merge=lfs -text *.ot filter=lfs diff=lfs merge=lfs -text *.parquet filter=lfs diff=lfs merge=lfs -text *.pb filter=lfs diff=lfs merge=lfs -text -*.pickle filter=lfs diff=lfs merge=lfs -text -*.pkl filter=lfs diff=lfs merge=lfs -text *.pt filter=lfs diff=lfs merge=lfs -text *.pth filter=lfs diff=lfs merge=lfs -text *.rar filter=lfs diff=lfs merge=lfs -text -*.safetensors filter=lfs diff=lfs merge=lfs -text saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.tar.* filter=lfs diff=lfs merge=lfs -text -*.tar filter=lfs diff=lfs merge=lfs -text *.tflite filter=lfs diff=lfs merge=lfs -text *.tgz filter=lfs diff=lfs merge=lfs -text *.wasm filter=lfs diff=lfs merge=lfs -text *.xz filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text -*.zst filter=lfs diff=lfs merge=lfs -text +*.zstandard filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +*.wav filter=lfs diff=lfs merge=lfs -text +*.opus filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md index 3c963d8f7b9b2320c24edd0e2e69b2fc63f2507d..75a23e21ef3d4c1cd9b05bed84452fddb85c0755 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,11 @@ --- -title: Automatic Speech Recognition 2 -emoji: 🐢 -colorFrom: blue -colorTo: red +title: Automatic Speech Recognition +emoji: 🌍 +colorFrom: yellow +colorTo: pink sdk: gradio -sdk_version: 5.25.2 +sdk_version: 4.44.1 +python_version: 3.10.0 app_file: app.py pinned: false license: apache-2.0 diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..b84e887e87cb6773af20cb6d7c0730fbd6f125e8 --- /dev/null +++ b/app.py @@ -0,0 +1,436 @@ +#!/usr/bin/env python3 +# +# Copyright 2022 Xiaomi Corp. (authors: Fangjun Kuang) +# +# See LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# References: +# https://gradio.app/docs/#dropdown + +import logging +import os +import tempfile +import time +import urllib.request +from datetime import datetime + +import gradio as gr +import torch +import torchaudio + +from examples import examples +from model import ( + decode, + get_pretrained_model, + get_punct_model, + language_to_models, + sample_rate, +) + +languages = list(language_to_models.keys()) + + +def MyPrint(s): + now = datetime.now() + date_time = now.strftime("%Y-%m-%d %H:%M:%S.%f") + print(f"{date_time}: {s}") + + +def convert_to_wav(in_filename: str) -> str: + """Convert the input audio file to a 16 kHz mono wave file""" + out_filename = f"{in_filename}.wav" + + MyPrint(f"Converting '{in_filename}' to '{out_filename}'") + _ = os.system( + f"ffmpeg -hide_banner -loglevel error -i '{in_filename}' -ar 16000 -ac 1 '{out_filename}' -y" + ) + + return out_filename + + +def build_html_output(s: str, style: str = "result_item_success"): + return f""" + <div class='result'>
+ <div class='result_item {style}'> + {s} + </div> + </div>
+ """ + + +def process_url( + language: str, + repo_id: str, + decoding_method: str, + num_active_paths: int, + add_punct: str, + url: str, +): + MyPrint(f"Processing URL: {url}") + with tempfile.NamedTemporaryFile() as f: + try: + urllib.request.urlretrieve(url, f.name) + + return process( + in_filename=f.name, + language=language, + repo_id=repo_id, + decoding_method=decoding_method, + num_active_paths=num_active_paths, + add_punct=add_punct, + ) + except Exception as e: + MyPrint(str(e)) + return "", build_html_output(str(e), "result_item_error") + + +def process_uploaded_file( + language: str, + repo_id: str, + decoding_method: str, + num_active_paths: int, + add_punct: str, + in_filename: str, +): + if in_filename is None or in_filename == "": + return "", build_html_output( + "Please first upload a file and then click " + 'the button "submit for recognition"', + "result_item_error", + ) + + MyPrint(f"Processing uploaded file: {in_filename}") + try: + return process( + in_filename=in_filename, + language=language, + repo_id=repo_id, + decoding_method=decoding_method, + num_active_paths=num_active_paths, + add_punct=add_punct, + ) + except Exception as e: + MyPrint(str(e)) + return "", build_html_output(str(e), "result_item_error") + + +def process_microphone( + language: str, + repo_id: str, + decoding_method: str, + num_active_paths: int, + add_punct: str, + in_filename: str, +): + if in_filename is None or in_filename == "": + return "", build_html_output( + "Please first click 'Record from microphone', speak, " + "click 'Stop recording', and then " + "click the button 'submit for recognition'", + "result_item_error", + ) + + MyPrint(f"Processing microphone: {in_filename}") + try: + return process( + in_filename=in_filename, + language=language, + repo_id=repo_id, + decoding_method=decoding_method, + num_active_paths=num_active_paths, + add_punct=add_punct, + ) + except Exception as e: + MyPrint(str(e)) + return "", build_html_output(str(e), "result_item_error") + + +@torch.no_grad() +def process( + language: str, + repo_id: str, + decoding_method: str, + num_active_paths: int, + add_punct: str, + in_filename: str, +): + MyPrint(f"language: {language}") + MyPrint(f"repo_id: {repo_id}") + MyPrint(f"decoding_method: {decoding_method}") + MyPrint(f"num_active_paths: {num_active_paths}") + MyPrint(f"in_filename: {in_filename}") + + filename = convert_to_wav(in_filename) + + now = datetime.now() + date_time = now.strftime("%Y-%m-%d %H:%M:%S.%f") + MyPrint(f"Started at {date_time}") + + start = time.time() + + recognizer = get_pretrained_model( + repo_id, + decoding_method=decoding_method, + num_active_paths=num_active_paths, + ) + + text = decode(recognizer, filename) + if add_punct == "Yes" and language == "Chinese": + punct = get_punct_model() + text = punct.add_punctuation(text) + + date_time = now.strftime("%Y-%m-%d %H:%M:%S.%f") + end = time.time() + + metadata = torchaudio.info(filename) + duration = metadata.num_frames / sample_rate + rtf = (end - start) / duration + + MyPrint(f"Finished at {date_time} s. Elapsed: {end - start: .3f} s") + + info = f""" + Wave duration : {duration: .3f} s
<br/> Processing time: {end - start: .3f} s
<br/> RTF: {end - start: .3f}/{duration: .3f} = {rtf:.3f}
+ """ + if ( + rtf > 1 + and repo_id != "csukuangfj/sherpa-onnx-fire-red-asr-large-zh_en-2025-02-16" + ): + info += ( + "
<br/>We are loading the model for the first run. " + "Please run again to measure the real RTF.<br/>
" + ) + + MyPrint(info) + MyPrint(f"\nrepo_id: {repo_id}\nhyp: {text}") + + return text, build_html_output(info) + + +title = "# Automatic Speech Recognition with Next-gen Kaldi" +description = """ +This space shows how to do automatic speech recognition with Next-gen Kaldi. + +Please visit + +for streaming speech recognition with **Next-gen Kaldi** using WebAssembly. + +It is running on CPU within a docker container provided by Hugging Face. + +Please input audio files less than 30 seconds in this space. + +Please see +if you want to try files longer than 30 seconds. + +For text to speech, please see + + +See more information by visiting the following links: + +- +- +- +- +- +- + +If you want to deploy it locally, please see + +""" + +# css style is copied from +# https://huggingface.co/spaces/alphacep/asr/blob/main/app.py#L113 +css = """ +.result {display:flex;flex-direction:column} +.result_item {padding:15px;margin-bottom:8px;border-radius:15px;width:100%} +.result_item_success {background-color:mediumaquamarine;color:white;align-self:start} +.result_item_error {background-color:#ff7070;color:white;align-self:start} +""" + + +def update_model_dropdown(language: str): + if language in language_to_models: + choices = language_to_models[language] + return gr.Dropdown( + choices=choices, + value=choices[0], + interactive=True, + ) + + raise ValueError(f"Unsupported language: {language}") + + +demo = gr.Blocks(css=css) + + +with demo: + gr.Markdown(title) + language_choices = list(language_to_models.keys()) + + language_radio = gr.Radio( + label="Language", + choices=language_choices, + value=language_choices[0], + ) + model_dropdown = gr.Dropdown( + choices=language_to_models[language_choices[0]], + label="Select a model", + value=language_to_models[language_choices[0]][0], + ) + + language_radio.change( + update_model_dropdown, + inputs=language_radio, + outputs=model_dropdown, + ) + + decoding_method_radio = gr.Radio( + label="Decoding method", + choices=["greedy_search", "modified_beam_search"], + value="greedy_search", + ) + + num_active_paths_slider = gr.Slider( + minimum=1, + value=4, + step=1, + label="Number of active paths for modified_beam_search", + ) + + punct_radio = gr.Radio( + label="Whether to add punctuation (Only for Chinese)", + choices=["Yes", "No"], + value="Yes", + ) + + with gr.Tabs(): + with gr.TabItem("Upload from disk"): + uploaded_file = gr.Audio( + sources=["upload"], # Choose between "microphone", "upload" + type="filepath", + label="Upload from disk", + ) + upload_button = gr.Button("Submit for recognition") + uploaded_output = gr.Textbox(label="Recognized speech from uploaded file") + uploaded_html_info = gr.HTML(label="Info") + + # gr.Examples( + # examples=examples, + # inputs=[ + # language_radio, + # model_dropdown, + # decoding_method_radio, + # num_active_paths_slider, + # punct_radio, + # uploaded_file, + # ], + # outputs=[uploaded_output, uploaded_html_info], + # fn=process_uploaded_file, + # ) + + with gr.TabItem("Record from microphone"): + microphone = gr.Audio( + sources=["microphone"], # Choose between "microphone", "upload" + type="filepath", + label="Record from microphone", + ) + + record_button = gr.Button("Submit for recognition") + recorded_output = gr.Textbox(label="Recognized speech from recordings") + recorded_html_info = gr.HTML(label="Info") + + # gr.Examples( + # examples=examples, + # inputs=[ + # language_radio, + # model_dropdown, + # decoding_method_radio, + # num_active_paths_slider, + # punct_radio, + # microphone, + # ], + # 
outputs=[recorded_output, recorded_html_info], + # fn=process_microphone, + # ) + + with gr.TabItem("From URL"): + url_textbox = gr.Textbox( + max_lines=1, + placeholder="URL to an audio file", + label="URL", + interactive=True, + ) + + url_button = gr.Button("Submit for recognition") + url_output = gr.Textbox(label="Recognized speech from URL") + url_html_info = gr.HTML(label="Info") + + upload_button.click( + process_uploaded_file, + inputs=[ + language_radio, + model_dropdown, + decoding_method_radio, + num_active_paths_slider, + punct_radio, + uploaded_file, + ], + outputs=[uploaded_output, uploaded_html_info], + ) + + record_button.click( + process_microphone, + inputs=[ + language_radio, + model_dropdown, + decoding_method_radio, + num_active_paths_slider, + punct_radio, + microphone, + ], + outputs=[recorded_output, recorded_html_info], + ) + + url_button.click( + process_url, + inputs=[ + language_radio, + model_dropdown, + decoding_method_radio, + num_active_paths_slider, + punct_radio, + url_textbox, + ], + outputs=[url_output, url_html_info], + ) + + gr.Markdown(description) + +torch.set_num_threads(1) +torch.set_num_interop_threads(1) + +torch._C._jit_set_profiling_executor(False) +torch._C._jit_set_profiling_mode(False) +torch._C._set_graph_executor_optimize(False) + +if __name__ == "__main__": + formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" + + logging.basicConfig(format=formatter, level=logging.INFO) + + demo.launch() diff --git a/decode.py b/decode.py new file mode 100644 index 0000000000000000000000000000000000000000..9e593d57457b10dd47bac4c2747811eb7a64d243 --- /dev/null +++ b/decode.py @@ -0,0 +1,121 @@ +# Copyright 2022 Xiaomi Corp. (authors: Fangjun Kuang) +# +# Copied from https://github.com/k2-fsa/sherpa/blob/master/sherpa/bin/conformer_rnnt/decode.py +# +# See LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from typing import List + +import torch +from sherpa import RnntConformerModel, greedy_search, modified_beam_search +from torch.nn.utils.rnn import pad_sequence + +LOG_EPS = math.log(1e-10) + + +@torch.no_grad() +def run_model_and_do_greedy_search( + model: RnntConformerModel, + features: List[torch.Tensor], +) -> List[List[int]]: + """Run RNN-T model with the given features and use greedy search + to decode the output of the model. + + Args: + model: + The RNN-T model. + features: + A list of 2-D tensors. Each entry is of shape + (num_frames, feature_dim). + Returns: + Return a list-of-list containing the decoding token IDs. 
+ """ + features_length = torch.tensor( + [f.size(0) for f in features], + dtype=torch.int64, + ) + features = pad_sequence( + features, + batch_first=True, + padding_value=LOG_EPS, + ) + + device = model.device + features = features.to(device) + features_length = features_length.to(device) + + encoder_out, encoder_out_length = model.encoder( + features=features, + features_length=features_length, + ) + + hyp_tokens = greedy_search( + model=model, + encoder_out=encoder_out, + encoder_out_length=encoder_out_length.cpu(), + ) + return hyp_tokens + + +@torch.no_grad() +def run_model_and_do_modified_beam_search( + model: RnntConformerModel, + features: List[torch.Tensor], + num_active_paths: int, +) -> List[List[int]]: + """Run RNN-T model with the given features and use greedy search + to decode the output of the model. + + Args: + model: + The RNN-T model. + features: + A list of 2-D tensors. Each entry is of shape + (num_frames, feature_dim). + num_active_paths: + Used only when decoding_method is modified_beam_search. + It specifies number of active paths for each utterance. Due to + merging paths with identical token sequences, the actual number + may be less than "num_active_paths". + Returns: + Return a list-of-list containing the decoding token IDs. + """ + features_length = torch.tensor( + [f.size(0) for f in features], + dtype=torch.int64, + ) + features = pad_sequence( + features, + batch_first=True, + padding_value=LOG_EPS, + ) + + device = model.device + features = features.to(device) + features_length = features_length.to(device) + + encoder_out, encoder_out_length = model.encoder( + features=features, + features_length=features_length, + ) + + hyp_tokens = modified_beam_search( + model=model, + encoder_out=encoder_out, + encoder_out_length=encoder_out_length.cpu(), + num_active_paths=num_active_paths, + ) + return hyp_tokens diff --git a/examples.py b/examples.py new file mode 100644 index 0000000000000000000000000000000000000000..712c1c8e2beb6a34cfc7487ff80378f1589878c9 --- /dev/null +++ b/examples.py @@ -0,0 +1,544 @@ +#!/usr/bin/env python3 +# +# Copyright 2022 Xiaomi Corp. (authors: Fangjun Kuang) +# +# See LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+examples = [ + [ + "Chinese+English", + "csukuangfj/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20", + "greedy_search", + 4, + "Yes", + "./test_wavs/tal_csasr/0.wav", + ], + [ + "Chinese+English+Cantonese", + "csukuangfj/sherpa-onnx-paraformer-trilingual-zh-cantonese-en", + "greedy_search", + 4, + "Yes", + "./test_wavs/cantonese/2.wav", + ], + [ + "Chinese+English+Cantonese+Japanese+Korean", + "csukuangfj/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17", + "greedy_search", + 4, + "Yes", + "./test_wavs/sense_voice/yue.wav", + ], + [ + "Cantonese", + "zrjin/icefall-asr-mdcc-zipformer-2024-03-11", + "greedy_search", + 4, + "Yes", + "./test_wavs/cantonese/1.wav", + ], + [ + "English", + "whisper-base.en", + "greedy_search", + 4, + "Yes", + "./test_wavs/librispeech/1089-134686-0001.wav", + ], + [ + "Chinese", + "csukuangfj/sherpa-onnx-paraformer-zh-2024-03-09", + "greedy_search", + 4, + "Yes", + "./test_wavs/paraformer-zh/四川话.wav", + ], + [ + "Japanese", + "reazon-research/reazonspeech-k2-v2", + "greedy_search", + 4, + "No", + "./test_wavs/japanese/1.wav", + ], + [ + "Korean", + "k2-fsa/sherpa-onnx-zipformer-korean-2024-06-24", + "greedy_search", + 4, + "No", + "./test_wavs/korean/0.wav", + ], + [ + "Russian", + "csukuangfj/sherpa-onnx-nemo-transducer-giga-am-russian-2024-10-24", + "greedy_search", + 4, + "No", + "./test_wavs/russian/russian-i-love-you.wav", + ], + [ + "Thai", + "yfyeung/icefall-asr-gigaspeech2-th-zipformer-2024-06-20", + "greedy_search", + 4, + "No", + "./test_wavs/thai/0.wav", + ], + # [ + # "Russian", + # "alphacep/vosk-model-ru", + # "greedy_search", + # 4, + # "No", + # "./test_wavs/russian/test.wav", + # ], + # [ + # "German", + # "csukuangfj/wav2vec2.0-torchaudio", + # "greedy_search", + # 4, + # "No", + # "./test_wavs/german/20170517-0900-PLENARY-16-de_20170517.wav", + # ], + # [ + # "Arabic", + # "AmirHussein/icefall-asr-mgb2-conformer_ctc-2022-27-06", + # "greedy_search", + # 4, + # "No", + # "./test_wavs/arabic/a.wav", + # ], + # [ + # "Tibetan", + # "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless7-2022-12-02", + # "greedy_search", + # 4, + # "No", + # "./test_wavs/tibetan/a_0_cacm-A70_31117.wav", + # ], + # [ + # "French", + # "shaojieli/sherpa-onnx-streaming-zipformer-fr-2023-04-14", + # "greedy_search", + # 4, + # "No", + # "./test_wavs/french/common_voice_fr_19364697.wav", + # ], + # [ + # "Chinese", + # "desh2608/icefall-asr-alimeeting-pruned-transducer-stateless7", + # "greedy_search", + # 4, + # "Yes", + # "./test_wavs/alimeeting/R8003_M8001-8004-165.wav", + # ], + # [ + # "Chinese", + # "csukuangfj/sherpa-onnx-paraformer-zh-2024-03-09", + # "greedy_search", + # 4, + # "Yes", + # "./test_wavs/paraformer-zh/天津话.wav", + # ], + # [ + # "Chinese", + # "csukuangfj/sherpa-onnx-paraformer-zh-2024-03-09", + # "greedy_search", + # 4, + # "Yes", + # "./test_wavs/paraformer-zh/郑州话.wav", + # ], + # [ + # "Chinese", + # "desh2608/icefall-asr-alimeeting-pruned-transducer-stateless7", + # "greedy_search", + # 4, + # "Yes", + # "./test_wavs/alimeeting/R8008_M8013-8049-74.wav", + # ], + # [ + # "Chinese", + # "desh2608/icefall-asr-alimeeting-pruned-transducer-stateless7", + # "greedy_search", + # 4, + # "Yes", + # "./test_wavs/alimeeting/R8009_M8020_N_SPK8026-8026-209.wav", + # ], + # [ + # "English", + # "videodanchik/icefall-asr-tedlium3-conformer-ctc2", + # "greedy_search", + # 4, + # "Yes", + # "./test_wavs/tedlium3/DanBarber_2010-219.wav", + # ], + # [ + # "English", + # "whisper-base.en", + # "greedy_search", + # 4, + # "Yes", + # 
"./test_wavs/tedlium3/DanielKahneman_2010-157.wav", + # ], + # [ + # "English", + # "videodanchik/icefall-asr-tedlium3-conformer-ctc2", + # "greedy_search", + # 4, + # "Yes", + # "./test_wavs/tedlium3/RobertGupta_2010U-15.wav", + # ], + # # librispeech + # # https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless5-2022-05-13/tree/main/test_wavs + # [ + # "English", + # "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13", + # "greedy_search", + # 4, + # "Yes", + # "./test_wavs/librispeech/1089-134686-0001.wav", + # ], + # [ + # "English", + # "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13", + # "greedy_search", + # 4, + # "Yes", + # "./test_wavs/librispeech/1221-135766-0001.wav", + # ], + # [ + # "English", + # "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13", + # "greedy_search", + # 4, + # "Yes", + # "./test_wavs/librispeech/1221-135766-0002.wav", + # ], + # # gigaspeech + # [ + # "English", + # "wgb14/icefall-asr-gigaspeech-pruned-transducer-stateless2", + # "greedy_search", + # 4, + # "Yes", + # "./test_wavs/gigaspeech/1-minute-audiobook.opus", + # ], + # [ + # "English", + # "wgb14/icefall-asr-gigaspeech-pruned-transducer-stateless2", + # "greedy_search", + # 4, + # "Yes", + # "./test_wavs/gigaspeech/100-seconds-podcast.opus", + # ], + # [ + # "English", + # "wgb14/icefall-asr-gigaspeech-pruned-transducer-stateless2", + # "greedy_search", + # 4, + # "Yes", + # "./test_wavs/gigaspeech/100-seconds-youtube.opus", + # ], + # # wenetspeech + # # https://huggingface.co/luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless2/tree/main/test_wavs + # [ + # "Chinese", + # "luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless2", + # "greedy_search", + # 4, + # "Yes", + # "./test_wavs/wenetspeech/DEV_T0000000000.opus", + # ], + # [ + # "Chinese", + # "luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless2", + # "greedy_search", + # 4, + # "Yes", + # "./test_wavs/wenetspeech/DEV_T0000000001.opus", + # ], + # [ + # "Chinese", + # "luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless2", + # "greedy_search", + # 4, + # "Yes", + # "./test_wavs/wenetspeech/DEV_T0000000002.opus", + # ], + # # aishell2-A + # # https://huggingface.co/yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-A-2022-07-12/tree/main/test_wavs + # [ + # "Chinese", + # "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-A-2022-07-12", + # "greedy_search", + # 4, + # "Yes", + # "./test_wavs/aishell2/ID0012W0030.wav", + # ], + # [ + # "Chinese", + # "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-A-2022-07-12", + # "greedy_search", + # 4, + # "Yes", + # "./test_wavs/aishell2/ID0012W0162.wav", + # ], + # [ + # "Chinese", + # "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-A-2022-07-12", + # "greedy_search", + # 4, + # "Yes", + # "./test_wavs/aishell2/ID0012W0215.wav", + # ], + # # aishell2-B + # # https://huggingface.co/yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-A-2022-07-12/tree/main/test_wavs + # [ + # "Chinese", + # "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-B-2022-07-12", + # "greedy_search", + # 4, + # "Yes", + # "./test_wavs/aishell2/ID0012W0030.wav", + # ], + # [ + # "Chinese", + # "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-B-2022-07-12", + # "greedy_search", + # 4, + # "Yes", + # "./test_wavs/aishell2/ID0012W0162.wav", + # ], + # [ + # "Chinese", + # 
"yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-B-2022-07-12", + # "greedy_search", + # 4, + # "Yes", + # "./test_wavs/aishell2/ID0012W0215.wav", + # ], + # # aishell2-B + # # https://huggingface.co/luomingshuang/icefall_asr_aidatatang-200zh_pruned_transducer_stateless2/tree/main/test_wavs + # [ + # "Chinese", + # "luomingshuang/icefall_asr_aidatatang-200zh_pruned_transducer_stateless2", + # "greedy_search", + # 4, + # "Yes", + # "./test_wavs/aidatatang_200zh/T0055G0036S0002.wav", + # ], + # [ + # "Chinese", + # "luomingshuang/icefall_asr_aidatatang-200zh_pruned_transducer_stateless2", + # "greedy_search", + # 4, + # "Yes", + # "./test_wavs/aidatatang_200zh/T0055G0036S0003.wav", + # ], + # [ + # "Chinese", + # "luomingshuang/icefall_asr_aidatatang-200zh_pruned_transducer_stateless2", + # "greedy_search", + # 4, + # "Yes", + # "./test_wavs/aidatatang_200zh/T0055G0036S0004.wav", + # ], + # # tal_csasr + # [ + # "Chinese+English", + # "ptrnull/icefall-asr-conv-emformer-transducer-stateless2-zh", + # "greedy_search", + # 4, + # "Yes", + # "./test_wavs/tal_csasr/210_36476_210_8341_1_1533271973_7057520_132.wav", + # ], + # [ + # "Chinese+English", + # "ptrnull/icefall-asr-conv-emformer-transducer-stateless2-zh", + # "greedy_search", + # 4, + # "Yes", + # "./test_wavs/tal_csasr/210_36476_210_8341_1_1533271973_7057520_138.wav", + # ], + # [ + # "Chinese+English", + # "ptrnull/icefall-asr-conv-emformer-transducer-stateless2-zh", + # "greedy_search", + # 4, + # "Yes", + # "./test_wavs/tal_csasr/210_36476_210_8341_1_1533271973_7057520_145.wav", + # ], + # [ + # "Tibetan", + # "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless7-2022-12-02", + # "greedy_search", + # 4, + # "No", + # "./test_wavs/tibetan/a_0_cacm-A70_31116.wav", + # ], + # [ + # "Tibetan", + # "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless7-2022-12-02", + # "greedy_search", + # 4, + # "No", + # "./test_wavs/tibetan/a_0_cacm-A70_31118.wav", + # ], + # # arabic + # [ + # "Arabic", + # "AmirHussein/icefall-asr-mgb2-conformer_ctc-2022-27-06", + # "greedy_search", + # 4, + # "No", + # "./test_wavs/arabic/b.wav", + # ], + # [ + # "Arabic", + # "AmirHussein/icefall-asr-mgb2-conformer_ctc-2022-27-06", + # "greedy_search", + # 4, + # "No", + # "./test_wavs/arabic/c.wav", + # ], + # [ + # "German", + # "csukuangfj/wav2vec2.0-torchaudio", + # "greedy_search", + # 4, + # "No", + # "./test_wavs/german/20120315-0900-PLENARY-14-de_20120315.wav", + # ], + # [ + # "French", + # "shaojieli/sherpa-onnx-streaming-zipformer-fr-2023-04-14", + # "greedy_search", + # 4, + # "No", + # "./test_wavs/french/common_voice_fr_19738183.wav", + # ], + # [ + # "French", + # "shaojieli/sherpa-onnx-streaming-zipformer-fr-2023-04-14", + # "greedy_search", + # 4, + # "No", + # "./test_wavs/french/common_voice_fr_27024649.wav", + # ], + # [ + # "Korean", + # "k2-fsa/sherpa-onnx-zipformer-korean-2024-06-24", + # "greedy_search", + # 4, + # "No", + # "./test_wavs/korean/1.wav", + # ], + # [ + # "Korean", + # "k2-fsa/sherpa-onnx-zipformer-korean-2024-06-24", + # "greedy_search", + # 4, + # "No", + # "./test_wavs/korean/2.wav", + # ], + # [ + # "Korean", + # "k2-fsa/sherpa-onnx-zipformer-korean-2024-06-24", + # "greedy_search", + # 4, + # "No", + # "./test_wavs/korean/3.wav", + # ], + # [ + # "Thai", + # "yfyeung/icefall-asr-gigaspeech2-th-zipformer-2024-06-20", + # "greedy_search", + # 4, + # "No", + # "./test_wavs/thai/1.wav", + # ], + # [ + # "Thai", + # "yfyeung/icefall-asr-gigaspeech2-th-zipformer-2024-06-20", + # "greedy_search", + # 4, + # 
"No", + # "./test_wavs/thai/2.wav", + # ], + # [ + # "Chinese+English+Cantonese+Japanese+Korean", + # "csukuangfj/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17", + # "greedy_search", + # 4, + # "Yes", + # "./test_wavs/sense_voice/zh.wav", + # ], + # [ + # "Chinese+English+Cantonese+Japanese+Korean", + # "csukuangfj/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17", + # "greedy_search", + # 4, + # "Yes", + # "./test_wavs/sense_voice/en.wav", + # ], + # [ + # "Chinese+English+Cantonese+Japanese+Korean", + # "csukuangfj/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17", + # "greedy_search", + # 4, + # "Yes", + # "./test_wavs/sense_voice/ja.wav", + # ], + # [ + # "Chinese+English+Cantonese+Japanese+Korean", + # "csukuangfj/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17", + # "greedy_search", + # 4, + # "Yes", + # "./test_wavs/sense_voice/ko.wav", + # ], + # [ + # "Japanese", + # "reazon-research/reazonspeech-k2-v2", + # "greedy_search", + # 4, + # "No", + # "./test_wavs/japanese/2.wav", + # ], + # [ + # "Japanese", + # "reazon-research/reazonspeech-k2-v2", + # "greedy_search", + # 4, + # "No", + # "./test_wavs/japanese/3.wav", + # ], + # [ + # "Japanese", + # "reazon-research/reazonspeech-k2-v2", + # "greedy_search", + # 4, + # "No", + # "./test_wavs/japanese/4.wav", + # ], + # [ + # "Japanese", + # "reazon-research/reazonspeech-k2-v2", + # "greedy_search", + # 4, + # "No", + # "./test_wavs/japanese/5.wav", + # ], +] diff --git a/giga-tokens.txt b/giga-tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..ba1721eac8064c61856da0d0ea71d53f47f011db --- /dev/null +++ b/giga-tokens.txt @@ -0,0 +1,500 @@ + 0 + 1 + 2 +S 3 +T 4 +▁THE 5 +▁A 6 +E 7 +▁AND 8 +▁TO 9 +N 10 +D 11 +▁OF 12 +' 13 +ING 14 +▁I 15 +Y 16 +▁IN 17 +ED 18 +▁THAT 19 +▁ 20 +P 21 +R 22 +▁YOU 23 +M 24 +RE 25 +ER 26 +C 27 +O 28 +▁IT 29 +L 30 +A 31 +U 32 +G 33 +▁WE 34 +▁IS 35 +▁SO 36 +AL 37 +I 38 +▁S 39 +▁RE 40 +AR 41 +B 42 +▁FOR 43 +▁C 44 +▁BE 45 +LE 46 +F 47 +W 48 +▁E 49 +▁HE 50 +LL 51 +▁WAS 52 +LY 53 +OR 54 +IN 55 +▁F 56 +VE 57 +▁THIS 58 +TH 59 +K 60 +▁ON 61 +IT 62 +▁B 63 +▁WITH 64 +▁BUT 65 +EN 66 +CE 67 +RI 68 +▁DO 69 +UR 70 +▁HAVE 71 +▁DE 72 +▁ME 73 +▁T 74 +ENT 75 +CH 76 +▁THEY 77 +▁NOT 78 +ES 79 +V 80 +▁AS 81 +RA 82 +▁P 83 +ON 84 +TER 85 +▁ARE 86 +▁WHAT 87 +IC 88 +▁ST 89 +▁LIKE 90 +ATION 91 +▁OR 92 +▁CA 93 +▁AT 94 +H 95 +▁KNOW 96 +▁G 97 +AN 98 +▁CON 99 +IL 100 +ND 101 +RO 102 +▁HIS 103 +▁CAN 104 +▁ALL 105 +TE 106 +▁THERE 107 +▁SU 108 +▁MO 109 +▁MA 110 +LI 111 +▁ONE 112 +▁ABOUT 113 +LA 114 +▁CO 115 +- 116 +▁MY 117 +▁HAD 118 +CK 119 +NG 120 +▁NO 121 +MENT 122 +AD 123 +LO 124 +ME 125 +▁AN 126 +▁FROM 127 +NE 128 +▁IF 129 +VER 130 +▁JUST 131 +▁PRO 132 +ION 133 +▁PA 134 +▁WHO 135 +▁SE 136 +EL 137 +IR 138 +▁US 139 +▁UP 140 +▁YOUR 141 +CI 142 +RY 143 +▁GO 144 +▁SHE 145 +▁LE 146 +▁OUT 147 +▁PO 148 +▁HO 149 +ATE 150 +▁BO 151 +▁BY 152 +▁FA 153 +▁MI 154 +AS 155 +MP 156 +▁HER 157 +VI 158 +▁THINK 159 +▁SOME 160 +▁WHEN 161 +▁AH 162 +▁PEOPLE 163 +IG 164 +▁WA 165 +▁TE 166 +▁LA 167 +▁WERE 168 +▁LI 169 +▁WOULD 170 +▁SEE 171 +▁WHICH 172 +DE 173 +GE 174 +▁K 175 +IGHT 176 +▁HA 177 +▁OUR 178 +UN 179 +▁HOW 180 +▁GET 181 +IS 182 +UT 183 +Z 184 +CO 185 +ET 186 +UL 187 +IES 188 +IVE 189 +AT 190 +▁O 191 +▁DON 192 +LU 193 +▁TIME 194 +▁WILL 195 +▁MORE 196 +▁SP 197 +▁NOW 198 +RU 199 +▁THEIR 200 +▁UN 201 +ITY 202 +OL 203 +X 204 +TI 205 +US 206 +▁VERY 207 +TION 208 +▁FI 209 +▁SAY 210 +▁BECAUSE 211 +▁EX 212 +▁RO 213 +ERS 214 +IST 215 +▁DA 216 +TING 217 +▁EN 218 +OM 219 +▁BA 220 +▁BEEN 221 +▁LO 222 +▁UM 223 +AGE 224 +ABLE 225 +▁WO 
226 +▁RA 227 +▁OTHER 228 +▁REALLY 229 +ENCE 230 +▁GOING 231 +▁HIM 232 +▁HAS 233 +▁THEM 234 +▁DIS 235 +▁WANT 236 +ID 237 +TA 238 +▁LOOK 239 +KE 240 +▁DID 241 +▁SA 242 +▁VI 243 +▁SAID 244 +▁RIGHT 245 +▁THESE 246 +▁WORK 247 +▁COM 248 +ALLY 249 +FF 250 +QU 251 +AC 252 +▁DR 253 +▁WAY 254 +▁INTO 255 +MO 256 +TED 257 +EST 258 +▁HERE 259 +OK 260 +▁COULD 261 +▁WELL 262 +MA 263 +▁PRE 264 +▁DI 265 +MAN 266 +▁COMP 267 +▁THEN 268 +IM 269 +▁PER 270 +▁NA 271 +▁WHERE 272 +▁TWO 273 +▁WI 274 +▁FE 275 +INE 276 +▁ANY 277 +TURE 278 +▁OVER 279 +BO 280 +ACH 281 +OW 282 +▁MAKE 283 +▁TRA 284 +HE 285 +UND 286 +▁EVEN 287 +ANCE 288 +▁YEAR 289 +HO 290 +AM 291 +▁CHA 292 +▁BACK 293 +VO 294 +ANT 295 +DI 296 +▁ALSO 297 +▁THOSE 298 +▁MAN 299 +CTION 300 +ICAL 301 +▁JO 302 +▁OP 303 +▁NEW 304 +▁MU 305 +▁HU 306 +▁KIND 307 +▁NE 308 +CA 309 +END 310 +TIC 311 +FUL 312 +▁YEAH 313 +SH 314 +▁APP 315 +▁THINGS 316 +SIDE 317 +▁GOOD 318 +ONE 319 +▁TAKE 320 +CU 321 +▁EVERY 322 +▁MEAN 323 +▁FIRST 324 +OP 325 +▁TH 326 +▁MUCH 327 +▁PART 328 +UGH 329 +▁COME 330 +J 331 +▁THAN 332 +▁EXP 333 +▁AGAIN 334 +▁LITTLE 335 +MB 336 +▁NEED 337 +▁TALK 338 +IF 339 +FOR 340 +▁SH 341 +ISH 342 +▁STA 343 +ATED 344 +▁GU 345 +▁LET 346 +IA 347 +▁MAR 348 +▁DOWN 349 +▁DAY 350 +▁GA 351 +▁SOMETHING 352 +▁BU 353 +DUC 354 +HA 355 +▁LOT 356 +▁RU 357 +▁THOUGH 358 +▁GREAT 359 +AIN 360 +▁THROUGH 361 +▁THING 362 +OUS 363 +▁PRI 364 +▁GOT 365 +▁SHOULD 366 +▁AFTER 367 +▁HEAR 368 +▁TA 369 +▁ONLY 370 +▁CHI 371 +IOUS 372 +▁SHA 373 +▁MOST 374 +▁ACTUALLY 375 +▁START 376 +LIC 377 +▁VA 378 +▁RI 379 +DAY 380 +IAN 381 +▁DOES 382 +ROW 383 +▁GRA 384 +ITION 385 +▁MANY 386 +▁BEFORE 387 +▁GIVE 388 +PORT 389 +QUI 390 +▁LIFE 391 +▁WORLD 392 +▁PI 393 +▁LONG 394 +▁THREE 395 +IZE 396 +NESS 397 +▁SHOW 398 +PH 399 +▁WHY 400 +▁QUESTION 401 +WARD 402 +▁THANK 403 +▁PH 404 +▁DIFFERENT 405 +▁OWN 406 +▁FEEL 407 +▁MIGHT 408 +▁HAPPEN 409 +▁MADE 410 +▁BRO 411 +IBLE 412 +▁HI 413 +▁STATE 414 +▁HAND 415 +▁NEVER 416 +▁PLACE 417 +▁LOVE 418 +▁DU 419 +▁POINT 420 +▁HELP 421 +▁COUNT 422 +▁STILL 423 +▁MR 424 +▁FIND 425 +▁PERSON 426 +▁CAME 427 +▁SAME 428 +▁LAST 429 +▁HIGH 430 +▁OLD 431 +▁UNDER 432 +▁FOUR 433 +▁AROUND 434 +▁SORT 435 +▁CHANGE 436 +▁YES 437 +SHIP 438 +▁ANOTHER 439 +ATIVE 440 +▁FOUND 441 +▁JA 442 +▁ALWAYS 443 +▁NEXT 444 +▁TURN 445 +▁JU 446 +▁SIX 447 +▁FACT 448 +▁INTEREST 449 +▁WORD 450 +▁THOUSAND 451 +▁HUNDRED 452 +▁NUMBER 453 +▁IDEA 454 +▁PLAN 455 +▁COURSE 456 +▁SCHOOL 457 +▁HOUSE 458 +▁TWENTY 459 +▁JE 460 +▁PLAY 461 +▁AWAY 462 +▁LEARN 463 +▁HARD 464 +▁WEEK 465 +▁BETTER 466 +▁WHILE 467 +▁FRIEND 468 +▁OKAY 469 +▁NINE 470 +▁UNDERSTAND 471 +▁KEEP 472 +▁GONNA 473 +▁SYSTEM 474 +▁AMERICA 475 +▁POWER 476 +▁IMPORTANT 477 +▁WITHOUT 478 +▁MAYBE 479 +▁SEVEN 480 +▁BETWEEN 481 +▁BUILD 482 +▁CERTAIN 483 +▁PROBLEM 484 +▁MONEY 485 +▁BELIEVE 486 +▁SECOND 487 +▁REASON 488 +▁TOGETHER 489 +▁PUBLIC 490 +▁ANYTHING 491 +▁SPEAK 492 +▁BUSINESS 493 +▁EVERYTHING 494 +▁CLOSE 495 +▁QUITE 496 +▁ANSWER 497 +▁ENOUGH 498 +Q 499 diff --git a/model.py b/model.py new file mode 100644 index 0000000000000000000000000000000000000000..1fcc3ea08a7497ae2f49c491ceff28536253b79e --- /dev/null +++ b/model.py @@ -0,0 +1,1940 @@ +# Copyright 2022 Xiaomi Corp. (authors: Fangjun Kuang) +# +# See LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from functools import lru_cache +from typing import Union + +import torch +import torchaudio +from huggingface_hub import hf_hub_download + +os.system( + "cp -v /usr/local/lib/python3.8/site-packages/k2/lib/*.so //usr/local/lib/python3.8/site-packages/sherpa/lib/" +) + +os.system( + "cp -v /home/user/.local/lib/python3.8/site-packages/k2/lib/*.so /home/user/.local/lib/python3.8/site-packages/sherpa/lib/" +) + +import k2 # noqa +import sherpa +import sherpa_onnx +import numpy as np +from typing import Tuple +import wave + +sample_rate = 16000 + + +def read_wave(wave_filename: str) -> Tuple[np.ndarray, int]: + """ + Args: + wave_filename: + Path to a wave file. It should be single channel and each sample should + be 16-bit. Its sample rate does not need to be 16kHz. + Returns: + Return a tuple containing: + - A 1-D array of dtype np.float32 containing the samples, which are + normalized to the range [-1, 1]. + - sample rate of the wave file + """ + + with wave.open(wave_filename) as f: + assert f.getnchannels() == 1, f.getnchannels() + assert f.getsampwidth() == 2, f.getsampwidth() # it is in bytes + num_samples = f.getnframes() + samples = f.readframes(num_samples) + samples_int16 = np.frombuffer(samples, dtype=np.int16) + samples_float32 = samples_int16.astype(np.float32) + + samples_float32 = samples_float32 / 32768 + return samples_float32, f.getframerate() + + +def decode_offline_recognizer( + recognizer: sherpa.OfflineRecognizer, + filename: str, +) -> str: + s = recognizer.create_stream() + + s.accept_wave_file(filename) + recognizer.decode_stream(s) + + text = s.result.text.strip() + # return text.lower() + return text + + +def decode_online_recognizer( + recognizer: sherpa.OnlineRecognizer, + filename: str, +) -> str: + samples, actual_sample_rate = torchaudio.load(filename) + assert sample_rate == actual_sample_rate, ( + sample_rate, + actual_sample_rate, + ) + samples = samples[0].contiguous() + + s = recognizer.create_stream() + + tail_padding = torch.zeros(int(sample_rate * 0.3), dtype=torch.float32) + s.accept_waveform(sample_rate, samples) + s.accept_waveform(sample_rate, tail_padding) + s.input_finished() + + while recognizer.is_ready(s): + recognizer.decode_stream(s) + + text = recognizer.get_result(s).text + # return text.strip().lower() + return text.strip() + + +def decode_offline_recognizer_sherpa_onnx( + recognizer: sherpa_onnx.OfflineRecognizer, + filename: str, +) -> str: + s = recognizer.create_stream() + samples, sample_rate = read_wave(filename) + s.accept_waveform(sample_rate, samples) + recognizer.decode_stream(s) + + # return s.result.text.lower() + return s.result.text + + +def decode_online_recognizer_sherpa_onnx( + recognizer: sherpa_onnx.OnlineRecognizer, + filename: str, +) -> str: + s = recognizer.create_stream() + samples, sample_rate = read_wave(filename) + s.accept_waveform(sample_rate, samples) + + tail_paddings = np.zeros(int(0.3 * sample_rate), dtype=np.float32) + s.accept_waveform(sample_rate, tail_paddings) + s.input_finished() + + while recognizer.is_ready(s): + recognizer.decode_stream(s) + + # return 
recognizer.get_result(s).lower() + return recognizer.get_result(s) + + +def decode( + recognizer: Union[ + sherpa.OfflineRecognizer, + sherpa.OnlineRecognizer, + sherpa_onnx.OfflineRecognizer, + sherpa_onnx.OnlineRecognizer, + ], + filename: str, +) -> str: + if isinstance(recognizer, sherpa.OfflineRecognizer): + return decode_offline_recognizer(recognizer, filename) + elif isinstance(recognizer, sherpa.OnlineRecognizer): + return decode_online_recognizer(recognizer, filename) + elif isinstance(recognizer, sherpa_onnx.OfflineRecognizer): + return decode_offline_recognizer_sherpa_onnx(recognizer, filename) + elif isinstance(recognizer, sherpa_onnx.OnlineRecognizer): + return decode_online_recognizer_sherpa_onnx(recognizer, filename) + else: + raise ValueError(f"Unknown recognizer type {type(recognizer)}") + + +@lru_cache(maxsize=30) +def get_pretrained_model( + repo_id: str, + decoding_method: str, + num_active_paths: int, +) -> Union[sherpa.OfflineRecognizer, sherpa.OnlineRecognizer]: + if repo_id in multi_lingual_models: + return multi_lingual_models[repo_id]( + repo_id, decoding_method=decoding_method, num_active_paths=num_active_paths + ) + elif repo_id in chinese_models: + return chinese_models[repo_id]( + repo_id, decoding_method=decoding_method, num_active_paths=num_active_paths + ) + elif repo_id in chinese_dialect_models: + return chinese_dialect_models[repo_id]( + repo_id, decoding_method=decoding_method, num_active_paths=num_active_paths + ) + elif repo_id in english_models: + return english_models[repo_id]( + repo_id, decoding_method=decoding_method, num_active_paths=num_active_paths + ) + elif repo_id in chinese_english_mixed_models: + return chinese_english_mixed_models[repo_id]( + repo_id, decoding_method=decoding_method, num_active_paths=num_active_paths + ) + elif repo_id in chinese_cantonese_english_models: + return chinese_cantonese_english_models[repo_id]( + repo_id, decoding_method=decoding_method, num_active_paths=num_active_paths + ) + elif repo_id in chinese_cantonese_english_japanese_korean_models: + return chinese_cantonese_english_japanese_korean_models[repo_id]( + repo_id, decoding_method=decoding_method, num_active_paths=num_active_paths + ) + elif repo_id in cantonese_models: + return cantonese_models[repo_id]( + repo_id, decoding_method=decoding_method, num_active_paths=num_active_paths + ) + elif repo_id in tibetan_models: + return tibetan_models[repo_id]( + repo_id, decoding_method=decoding_method, num_active_paths=num_active_paths + ) + elif repo_id in arabic_models: + return arabic_models[repo_id]( + repo_id, decoding_method=decoding_method, num_active_paths=num_active_paths + ) + elif repo_id in german_models: + return german_models[repo_id]( + repo_id, decoding_method=decoding_method, num_active_paths=num_active_paths + ) + elif repo_id in french_models: + return french_models[repo_id]( + repo_id, decoding_method=decoding_method, num_active_paths=num_active_paths + ) + elif repo_id in japanese_models: + return japanese_models[repo_id]( + repo_id, decoding_method=decoding_method, num_active_paths=num_active_paths + ) + elif repo_id in russian_models: + return russian_models[repo_id]( + repo_id, decoding_method=decoding_method, num_active_paths=num_active_paths + ) + elif repo_id in korean_models: + return korean_models[repo_id]( + repo_id, decoding_method=decoding_method, num_active_paths=num_active_paths + ) + elif repo_id in thai_models: + return thai_models[repo_id]( + repo_id, decoding_method=decoding_method, num_active_paths=num_active_paths + ) + 
else: + raise ValueError(f"Unsupported repo_id: {repo_id}") + + +def _get_nn_model_filename( + repo_id: str, + filename: str, + subfolder: str = "exp", +) -> str: + nn_model_filename = hf_hub_download( + repo_id=repo_id, + filename=filename, + subfolder=subfolder, + ) + return nn_model_filename + + +def _get_bpe_model_filename( + repo_id: str, + filename: str = "bpe.model", + subfolder: str = "data/lang_bpe_500", +) -> str: + bpe_model_filename = hf_hub_download( + repo_id=repo_id, + filename=filename, + subfolder=subfolder, + ) + return bpe_model_filename + + +def _get_token_filename( + repo_id: str, + filename: str = "tokens.txt", + subfolder: str = "data/lang_char", +) -> str: + token_filename = hf_hub_download( + repo_id=repo_id, + filename=filename, + subfolder=subfolder, + ) + return token_filename + + +@lru_cache(maxsize=10) +def _get_aishell2_pretrained_model( + repo_id: str, + decoding_method: str, + num_active_paths: int, +) -> sherpa.OfflineRecognizer: + assert repo_id in [ + # context-size 1 + "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-A-2022-07-12", # noqa + # context-size 2 + "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-B-2022-07-12", # noqa + ], repo_id + + nn_model = _get_nn_model_filename( + repo_id=repo_id, + filename="cpu_jit.pt", + ) + tokens = _get_token_filename(repo_id=repo_id) + + feat_config = sherpa.FeatureConfig() + feat_config.fbank_opts.frame_opts.samp_freq = sample_rate + feat_config.fbank_opts.mel_opts.num_bins = 80 + feat_config.fbank_opts.frame_opts.dither = 0 + + config = sherpa.OfflineRecognizerConfig( + nn_model=nn_model, + tokens=tokens, + use_gpu=False, + feat_config=feat_config, + decoding_method=decoding_method, + num_active_paths=num_active_paths, + ) + + recognizer = sherpa.OfflineRecognizer(config) + + return recognizer + + +@lru_cache(maxsize=10) +def _get_offline_pre_trained_model( + repo_id: str, decoding_method: str, num_active_paths: int +) -> sherpa_onnx.OfflineRecognizer: + assert repo_id in ( + "k2-fsa/sherpa-onnx-zipformer-korean-2024-06-24", + "reazon-research/reazonspeech-k2-v2", + ), repo_id + + encoder_model = _get_nn_model_filename( + repo_id=repo_id, + filename="encoder-epoch-99-avg-1.int8.onnx", + subfolder=".", + ) + + decoder_model = _get_nn_model_filename( + repo_id=repo_id, + filename="decoder-epoch-99-avg-1.onnx", + subfolder=".", + ) + + joiner_model = _get_nn_model_filename( + repo_id=repo_id, + filename="joiner-epoch-99-avg-1.onnx", + subfolder=".", + ) + + tokens = _get_token_filename(repo_id=repo_id, subfolder=".") + + recognizer = sherpa_onnx.OfflineRecognizer.from_transducer( + tokens=tokens, + encoder=encoder_model, + decoder=decoder_model, + joiner=joiner_model, + num_threads=2, + sample_rate=16000, + feature_dim=80, + decoding_method=decoding_method, + ) + + return recognizer + + +@lru_cache(maxsize=10) +def _get_yifan_thai_pretrained_model( + repo_id: str, decoding_method: str, num_active_paths: int +) -> sherpa_onnx.OfflineRecognizer: + assert repo_id in ( + "yfyeung/icefall-asr-gigaspeech2-th-zipformer-2024-06-20", + ), repo_id + + encoder_model = _get_nn_model_filename( + repo_id=repo_id, + filename="encoder-epoch-12-avg-5.int8.onnx", + subfolder="exp", + ) + + decoder_model = _get_nn_model_filename( + repo_id=repo_id, + filename="decoder-epoch-12-avg-5.onnx", + subfolder="exp", + ) + + joiner_model = _get_nn_model_filename( + repo_id=repo_id, + filename="joiner-epoch-12-avg-5.int8.onnx", + subfolder="exp", + ) + + tokens = _get_token_filename(repo_id=repo_id, 
subfolder="data/lang_bpe_2000") + + recognizer = sherpa_onnx.OfflineRecognizer.from_transducer( + tokens=tokens, + encoder=encoder_model, + decoder=decoder_model, + joiner=joiner_model, + num_threads=2, + sample_rate=16000, + feature_dim=80, + decoding_method=decoding_method, + ) + + return recognizer + + +@lru_cache(maxsize=10) +def _get_zrjin_cantonese_pre_trained_model( + repo_id: str, decoding_method: str, num_active_paths: int +) -> sherpa_onnx.OfflineRecognizer: + assert repo_id in ("zrjin/icefall-asr-mdcc-zipformer-2024-03-11",), repo_id + + encoder_model = _get_nn_model_filename( + repo_id=repo_id, + filename="encoder-epoch-45-avg-35.int8.onnx", + subfolder="exp", + ) + + decoder_model = _get_nn_model_filename( + repo_id=repo_id, + filename="decoder-epoch-45-avg-35.onnx", + subfolder="exp", + ) + + joiner_model = _get_nn_model_filename( + repo_id=repo_id, + filename="joiner-epoch-45-avg-35.int8.onnx", + subfolder="exp", + ) + + tokens = _get_token_filename(repo_id=repo_id, subfolder="data/lang_char") + + recognizer = sherpa_onnx.OfflineRecognizer.from_transducer( + tokens=tokens, + encoder=encoder_model, + decoder=decoder_model, + joiner=joiner_model, + num_threads=2, + sample_rate=16000, + feature_dim=80, + decoding_method=decoding_method, + ) + + return recognizer + + +@lru_cache(maxsize=10) +def _get_russian_pre_trained_model_ctc( + repo_id: str, decoding_method: str, num_active_paths: int +) -> sherpa_onnx.OfflineRecognizer: + assert repo_id in ( + "csukuangfj/sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24", + "csukuangfj/sherpa-onnx-nemo-ctc-giga-am-v2-russian-2025-04-19", + ), repo_id + + model = _get_nn_model_filename( + repo_id=repo_id, + filename="model.int8.onnx", + subfolder=".", + ) + + tokens = _get_token_filename(repo_id=repo_id, subfolder=".") + + recognizer = sherpa_onnx.OfflineRecognizer.from_nemo_ctc( + model=model, + tokens=tokens, + num_threads=2, + ) + + return recognizer + + +@lru_cache(maxsize=10) +def _get_russian_pre_trained_model( + repo_id: str, decoding_method: str, num_active_paths: int +) -> sherpa_onnx.OfflineRecognizer: + assert repo_id in ( + "alphacep/vosk-model-ru", + "alphacep/vosk-model-small-ru", + "csukuangfj/sherpa-onnx-nemo-transducer-giga-am-russian-2024-10-24", + "csukuangfj/sherpa-onnx-nemo-transducer-giga-am-v2-russian-2025-04-19", + ), repo_id + + if repo_id == "alphacep/vosk-model-ru": + model_dir = "am-onnx" + encoder = "encoder.onnx" + model_type = "transducer" + elif repo_id == "alphacep/vosk-model-small-ru": + model_dir = "am" + encoder = "encoder.onnx" + model_type = "transducer" + elif repo_id in ( + "csukuangfj/sherpa-onnx-nemo-transducer-giga-am-russian-2024-10-24", + "csukuangfj/sherpa-onnx-nemo-transducer-giga-am-v2-russian-2025-04-19", + ): + model_dir = "." 
+ encoder = "encoder.int8.onnx" + model_type = "nemo_transducer" + + encoder_model = _get_nn_model_filename( + repo_id=repo_id, + filename=encoder, + subfolder=model_dir, + ) + + decoder_model = _get_nn_model_filename( + repo_id=repo_id, + filename="decoder.onnx", + subfolder=model_dir, + ) + + joiner_model = _get_nn_model_filename( + repo_id=repo_id, + filename="joiner.onnx", + subfolder=model_dir, + ) + + if repo_id in ( + "csukuangfj/sherpa-onnx-nemo-transducer-giga-am-russian-2024-10-24", + "csukuangfj/sherpa-onnx-nemo-transducer-giga-am-v2-russian-2025-04-19", + ): + tokens = _get_token_filename(repo_id=repo_id, subfolder=".") + else: + tokens = _get_token_filename(repo_id=repo_id, subfolder="lang") + + recognizer = sherpa_onnx.OfflineRecognizer.from_transducer( + tokens=tokens, + encoder=encoder_model, + decoder=decoder_model, + joiner=joiner_model, + num_threads=2, + sample_rate=16000, + feature_dim=80, + decoding_method=decoding_method, + model_type=model_type, + ) + + return recognizer + + +@lru_cache(maxsize=10) +def _get_moonshine_model( + repo_id: str, decoding_method: str, num_active_paths: int +) -> sherpa_onnx.OfflineRecognizer: + assert repo_id in ("moonshine-tiny", "moonshine-base"), repo_id + + if repo_id == "moonshine-tiny": + full_repo_id = "csukuangfj/sherpa-onnx-moonshine-tiny-en-int8" + elif repo_id == "moonshine-base": + full_repo_id = "csukuangfj/sherpa-onnx-moonshine-base-en-int8" + else: + raise ValueError(f"Unknown repo_id: {repo_id}") + + preprocessor = _get_nn_model_filename( + repo_id=full_repo_id, + filename=f"preprocess.onnx", + subfolder=".", + ) + + encoder = _get_nn_model_filename( + repo_id=full_repo_id, + filename=f"encode.int8.onnx", + subfolder=".", + ) + + uncached_decoder = _get_nn_model_filename( + repo_id=full_repo_id, + filename=f"uncached_decode.int8.onnx", + subfolder=".", + ) + + cached_decoder = _get_nn_model_filename( + repo_id=full_repo_id, + filename=f"cached_decode.int8.onnx", + subfolder=".", + ) + + tokens = _get_token_filename( + repo_id=full_repo_id, + subfolder=".", + filename="tokens.txt", + ) + + recognizer = sherpa_onnx.OfflineRecognizer.from_moonshine( + preprocessor=preprocessor, + encoder=encoder, + uncached_decoder=uncached_decoder, + cached_decoder=cached_decoder, + tokens=tokens, + num_threads=2, + ) + + return recognizer + + +@lru_cache(maxsize=10) +def _get_whisper_model( + repo_id: str, decoding_method: str, num_active_paths: int +) -> sherpa_onnx.OfflineRecognizer: + name = repo_id.split("-")[1] + assert name in ("tiny.en", "base.en", "small.en", "medium.en"), repo_id + full_repo_id = "csukuangfj/sherpa-onnx-whisper-" + name + encoder = _get_nn_model_filename( + repo_id=full_repo_id, + filename=f"{name}-encoder.int8.onnx", + subfolder=".", + ) + + decoder = _get_nn_model_filename( + repo_id=full_repo_id, + filename=f"{name}-decoder.int8.onnx", + subfolder=".", + ) + + tokens = _get_token_filename( + repo_id=full_repo_id, subfolder=".", filename=f"{name}-tokens.txt" + ) + + recognizer = sherpa_onnx.OfflineRecognizer.from_whisper( + encoder=encoder, + decoder=decoder, + tokens=tokens, + num_threads=2, + ) + + return recognizer + + +@lru_cache(maxsize=10) +def _get_gigaspeech_pre_trained_model( + repo_id: str, + decoding_method: str, + num_active_paths: int, +) -> sherpa.OfflineRecognizer: + assert repo_id in [ + "wgb14/icefall-asr-gigaspeech-pruned-transducer-stateless2", + ], repo_id + + nn_model = _get_nn_model_filename( + repo_id=repo_id, + filename="cpu_jit-iter-3488000-avg-20.pt", + ) + tokens = "./giga-tokens.txt" + 
+ feat_config = sherpa.FeatureConfig() + feat_config.fbank_opts.frame_opts.samp_freq = sample_rate + feat_config.fbank_opts.mel_opts.num_bins = 80 + feat_config.fbank_opts.frame_opts.dither = 0 + + config = sherpa.OfflineRecognizerConfig( + nn_model=nn_model, + tokens=tokens, + use_gpu=False, + feat_config=feat_config, + decoding_method=decoding_method, + num_active_paths=num_active_paths, + ) + + recognizer = sherpa.OfflineRecognizer(config) + + return recognizer + + +@lru_cache(maxsize=10) +def _get_english_model( + repo_id: str, + decoding_method: str, + num_active_paths: int, +) -> sherpa.OfflineRecognizer: + assert repo_id in [ + "WeijiZhuang/icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02", # noqa + "yfyeung/icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04", # noqa + "yfyeung/icefall-asr-finetune-mux-pruned_transducer_stateless7-2023-05-19", # noqa + "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13", # noqa + "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11", # noqa + "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless8-2022-11-14", # noqa + "Zengwei/icefall-asr-librispeech-zipformer-large-2023-05-16", # noqa + "Zengwei/icefall-asr-librispeech-zipformer-2023-05-15", # noqa + "Zengwei/icefall-asr-librispeech-zipformer-small-2023-05-16", # noqa + "videodanchik/icefall-asr-tedlium3-conformer-ctc2", + "pkufool/icefall_asr_librispeech_conformer_ctc", + "WayneWiser/icefall-asr-librispeech-conformer-ctc2-jit-bpe-500-2022-07-21", + ], repo_id + + filename = "cpu_jit.pt" + if ( + repo_id + == "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11" + ): + filename = "cpu_jit-torch-1.10.0.pt" + + if ( + repo_id + == "WeijiZhuang/icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02" + ): + filename = "cpu_jit-torch-1.10.pt" + + if ( + repo_id + == "yfyeung/icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04" + ): + filename = "cpu_jit-epoch-30-avg-4.pt" + + if ( + repo_id + == "yfyeung/icefall-asr-finetune-mux-pruned_transducer_stateless7-2023-05-19" + ): + filename = "cpu_jit-epoch-20-avg-5.pt" + + if repo_id in ( + "Zengwei/icefall-asr-librispeech-zipformer-large-2023-05-16", + "Zengwei/icefall-asr-librispeech-zipformer-2023-05-15", + "Zengwei/icefall-asr-librispeech-zipformer-small-2023-05-16", + ): + filename = "jit_script.pt" + + nn_model = _get_nn_model_filename( + repo_id=repo_id, + filename=filename, + ) + subfolder = "data/lang_bpe_500" + + if repo_id in ( + "videodanchik/icefall-asr-tedlium3-conformer-ctc2", + "pkufool/icefall_asr_librispeech_conformer_ctc", + ): + subfolder = "data/lang_bpe" + + tokens = _get_token_filename(repo_id=repo_id, subfolder=subfolder) + + feat_config = sherpa.FeatureConfig() + feat_config.fbank_opts.frame_opts.samp_freq = sample_rate + feat_config.fbank_opts.mel_opts.num_bins = 80 + feat_config.fbank_opts.frame_opts.dither = 0 + + config = sherpa.OfflineRecognizerConfig( + nn_model=nn_model, + tokens=tokens, + use_gpu=False, + feat_config=feat_config, + decoding_method=decoding_method, + num_active_paths=num_active_paths, + ) + + recognizer = sherpa.OfflineRecognizer(config) + + return recognizer + + +@lru_cache(maxsize=10) +def _get_wenetspeech_pre_trained_model( + repo_id: str, + decoding_method: str, + num_active_paths: int, +): + assert repo_id in [ + "luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless2", + ], repo_id + + nn_model = _get_nn_model_filename( + repo_id=repo_id, + 
filename="cpu_jit_epoch_10_avg_2_torch_1.7.1.pt", + ) + tokens = _get_token_filename(repo_id=repo_id) + + feat_config = sherpa.FeatureConfig() + feat_config.fbank_opts.frame_opts.samp_freq = sample_rate + feat_config.fbank_opts.mel_opts.num_bins = 80 + feat_config.fbank_opts.frame_opts.dither = 0 + + config = sherpa.OfflineRecognizerConfig( + nn_model=nn_model, + tokens=tokens, + use_gpu=False, + feat_config=feat_config, + decoding_method=decoding_method, + num_active_paths=num_active_paths, + ) + + recognizer = sherpa.OfflineRecognizer(config) + + return recognizer + + +@lru_cache(maxsize=1) +def _get_fire_red_asr_models(repo_id: str, decoding_method: str, num_active_paths: int): + assert repo_id in ( + "csukuangfj/sherpa-onnx-fire-red-asr-large-zh_en-2025-02-16", + ), repo_id + + encoder = _get_nn_model_filename( + repo_id=repo_id, + filename="encoder.int8.onnx", + subfolder=".", + ) + + decoder = _get_nn_model_filename( + repo_id=repo_id, + filename="decoder.int8.onnx", + subfolder=".", + ) + + tokens = _get_nn_model_filename( + repo_id=repo_id, + filename="tokens.txt", + subfolder=".", + ) + + return sherpa_onnx.OfflineRecognizer.from_fire_red_asr( + encoder=encoder, + decoder=decoder, + tokens=tokens, + num_threads=2, + ) + + +@lru_cache(maxsize=10) +def _get_chinese_english_mixed_model_onnx( + repo_id: str, + decoding_method: str, + num_active_paths: int, +) -> sherpa_onnx.OfflineRecognizer: + assert repo_id in [ + "zrjin/icefall-asr-zipformer-multi-zh-en-2023-11-22", + ], repo_id + + encoder_model = _get_nn_model_filename( + repo_id=repo_id, + filename="encoder-epoch-34-avg-19.int8.onnx", + subfolder="exp", + ) + + decoder_model = _get_nn_model_filename( + repo_id=repo_id, + filename="decoder-epoch-34-avg-19.onnx", + subfolder="exp", + ) + + joiner_model = _get_nn_model_filename( + repo_id=repo_id, + filename="joiner-epoch-34-avg-19.int8.onnx", + subfolder="exp", + ) + + tokens = _get_token_filename(repo_id=repo_id, subfolder="data/lang_bbpe_2000") + + recognizer = sherpa_onnx.OfflineRecognizer.from_transducer( + tokens=tokens, + encoder=encoder_model, + decoder=decoder_model, + joiner=joiner_model, + num_threads=2, + sample_rate=16000, + feature_dim=80, + decoding_method=decoding_method, + max_active_paths=num_active_paths, + ) + + return recognizer + + +@lru_cache(maxsize=10) +def _get_chinese_english_mixed_model( + repo_id: str, + decoding_method: str, + num_active_paths: int, +) -> sherpa.OfflineRecognizer: + assert repo_id in [ + "luomingshuang/icefall_asr_tal-csasr_pruned_transducer_stateless5", + "ptrnull/icefall-asr-conv-emformer-transducer-stateless2-zh", + ], repo_id + + if repo_id == "luomingshuang/icefall_asr_tal-csasr_pruned_transducer_stateless5": + filename = "cpu_jit.pt" + subfolder = "data/lang_char" + elif repo_id == "ptrnull/icefall-asr-conv-emformer-transducer-stateless2-zh": + filename = "cpu_jit-epoch-11-avg-1.pt" + subfolder = "data/lang_char_bpe" + + nn_model = _get_nn_model_filename( + repo_id=repo_id, + filename=filename, + ) + tokens = _get_token_filename(repo_id=repo_id, subfolder=subfolder) + + feat_config = sherpa.FeatureConfig() + feat_config.fbank_opts.frame_opts.samp_freq = sample_rate + feat_config.fbank_opts.mel_opts.num_bins = 80 + feat_config.fbank_opts.frame_opts.dither = 0 + + config = sherpa.OfflineRecognizerConfig( + nn_model=nn_model, + tokens=tokens, + use_gpu=False, + feat_config=feat_config, + decoding_method=decoding_method, + num_active_paths=num_active_paths, + ) + + recognizer = sherpa.OfflineRecognizer(config) + + return recognizer + 
+ +@lru_cache(maxsize=10) +def _get_alimeeting_pre_trained_model( + repo_id: str, + decoding_method: str, + num_active_paths: int, +): + assert repo_id in [ + "desh2608/icefall-asr-alimeeting-pruned-transducer-stateless7", + "luomingshuang/icefall_asr_alimeeting_pruned_transducer_stateless2", + ], repo_id + + if repo_id == "desh2608/icefall-asr-alimeeting-pruned-transducer-stateless7": + filename = "cpu_jit.pt" + elif repo_id == "luomingshuang/icefall_asr_alimeeting_pruned_transducer_stateless2": + filename = "cpu_jit_torch_1.7.1.pt" + + nn_model = _get_nn_model_filename( + repo_id=repo_id, + filename=filename, + ) + tokens = _get_token_filename(repo_id=repo_id) + + feat_config = sherpa.FeatureConfig() + feat_config.fbank_opts.frame_opts.samp_freq = sample_rate + feat_config.fbank_opts.mel_opts.num_bins = 80 + feat_config.fbank_opts.frame_opts.dither = 0 + + config = sherpa.OfflineRecognizerConfig( + nn_model=nn_model, + tokens=tokens, + use_gpu=False, + feat_config=feat_config, + decoding_method=decoding_method, + num_active_paths=num_active_paths, + ) + + recognizer = sherpa.OfflineRecognizer(config) + + return recognizer + + +@lru_cache(maxsize=4) +def _get_dolphin_ctc_models(repo_id: str, decoding_method: str, num_active_paths: int): + assert repo_id in [ + "csukuangfj/sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02", + "csukuangfj/sherpa-onnx-dolphin-small-ctc-multi-lang-int8-2025-04-02", + "csukuangfj/sherpa-onnx-dolphin-base-ctc-multi-lang-2025-04-02", + "csukuangfj/sherpa-onnx-dolphin-small-ctc-multi-lang-2025-04-02", + ], repo_id + + if repo_id in [ + "csukuangfj/sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02", + "csukuangfj/sherpa-onnx-dolphin-small-ctc-multi-lang-int8-2025-04-02", + ]: + use_int8 = True + else: + use_int8 = False + + nn_model = _get_nn_model_filename( + repo_id=repo_id, + filename="model.int8.onnx" if use_int8 else "model.onnx", + subfolder=".", + ) + tokens = _get_token_filename( + repo_id=repo_id, + filename="tokens.txt", + subfolder=".", + ) + + recognizer = sherpa_onnx.OfflineRecognizer.from_dolphin_ctc( + tokens=tokens, + model=nn_model, + num_threads=2, + ) + + return recognizer + + +@lru_cache(maxsize=10) +def _get_wenet_model( + repo_id: str, + decoding_method: str, + num_active_paths: int, +): + assert repo_id in [ + "csukuangfj/wenet-chinese-model", + "csukuangfj/wenet-english-model", + ], repo_id + + nn_model = _get_nn_model_filename( + repo_id=repo_id, + filename="final.zip", + subfolder=".", + ) + tokens = _get_token_filename( + repo_id=repo_id, + filename="units.txt", + subfolder=".", + ) + + feat_config = sherpa.FeatureConfig(normalize_samples=False) + feat_config.fbank_opts.frame_opts.samp_freq = sample_rate + feat_config.fbank_opts.mel_opts.num_bins = 80 + feat_config.fbank_opts.frame_opts.dither = 0 + + config = sherpa.OfflineRecognizerConfig( + nn_model=nn_model, + tokens=tokens, + use_gpu=False, + feat_config=feat_config, + decoding_method=decoding_method, + num_active_paths=num_active_paths, + ) + + recognizer = sherpa.OfflineRecognizer(config) + + return recognizer + + +@lru_cache(maxsize=10) +def _get_aidatatang_200zh_pretrained_mode( + repo_id: str, + decoding_method: str, + num_active_paths: int, +): + assert repo_id in [ + "luomingshuang/icefall_asr_aidatatang-200zh_pruned_transducer_stateless2", + ], repo_id + + nn_model = _get_nn_model_filename( + repo_id=repo_id, + filename="cpu_jit_torch.1.7.1.pt", + ) + tokens = _get_token_filename(repo_id=repo_id) + + feat_config = sherpa.FeatureConfig() + 
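+    # The same feature setup as the other k2-sherpa getters below: 80-dim
+    # fbank at the app-wide 16 kHz rate, with dither disabled so that
+    # feature extraction (and hence decoding) is deterministic.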
feat_config.fbank_opts.frame_opts.samp_freq = sample_rate + feat_config.fbank_opts.mel_opts.num_bins = 80 + feat_config.fbank_opts.frame_opts.dither = 0 + + config = sherpa.OfflineRecognizerConfig( + nn_model=nn_model, + tokens=tokens, + use_gpu=False, + feat_config=feat_config, + decoding_method=decoding_method, + num_active_paths=num_active_paths, + ) + + recognizer = sherpa.OfflineRecognizer(config) + + return recognizer + + +@lru_cache(maxsize=10) +def _get_tibetan_pre_trained_model( + repo_id: str, + decoding_method: str, + num_active_paths: int, +): + assert repo_id in [ + "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless7-2022-12-02", + "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless5-2022-11-29", + ], repo_id + + filename = "cpu_jit.pt" + if ( + repo_id + == "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless5-2022-11-29" + ): + filename = "cpu_jit-epoch-28-avg-23-torch-1.10.0.pt" + + nn_model = _get_nn_model_filename( + repo_id=repo_id, + filename=filename, + ) + + tokens = _get_token_filename(repo_id=repo_id, subfolder="data/lang_bpe_500") + + feat_config = sherpa.FeatureConfig() + feat_config.fbank_opts.frame_opts.samp_freq = sample_rate + feat_config.fbank_opts.mel_opts.num_bins = 80 + feat_config.fbank_opts.frame_opts.dither = 0 + + config = sherpa.OfflineRecognizerConfig( + nn_model=nn_model, + tokens=tokens, + use_gpu=False, + feat_config=feat_config, + decoding_method=decoding_method, + num_active_paths=num_active_paths, + ) + + recognizer = sherpa.OfflineRecognizer(config) + + return recognizer + + +@lru_cache(maxsize=10) +def _get_arabic_pre_trained_model( + repo_id: str, + decoding_method: str, + num_active_paths: int, +): + assert repo_id in [ + "AmirHussein/icefall-asr-mgb2-conformer_ctc-2022-27-06", + ], repo_id + + nn_model = _get_nn_model_filename( + repo_id=repo_id, + filename="cpu_jit.pt", + ) + + tokens = _get_token_filename(repo_id=repo_id, subfolder="data/lang_bpe_5000") + + feat_config = sherpa.FeatureConfig() + feat_config.fbank_opts.frame_opts.samp_freq = sample_rate + feat_config.fbank_opts.mel_opts.num_bins = 80 + feat_config.fbank_opts.frame_opts.dither = 0 + + config = sherpa.OfflineRecognizerConfig( + nn_model=nn_model, + tokens=tokens, + use_gpu=False, + feat_config=feat_config, + decoding_method=decoding_method, + num_active_paths=num_active_paths, + ) + + recognizer = sherpa.OfflineRecognizer(config) + + return recognizer + + +@lru_cache(maxsize=10) +def _get_german_pre_trained_model( + repo_id: str, + decoding_method: str, + num_active_paths: int, +): + assert repo_id in [ + "csukuangfj/wav2vec2.0-torchaudio", + ], repo_id + + nn_model = _get_nn_model_filename( + repo_id=repo_id, + filename="voxpopuli_asr_base_10k_de.pt", + subfolder=".", + ) + + tokens = _get_token_filename( + repo_id=repo_id, + filename="tokens-de.txt", + subfolder=".", + ) + + config = sherpa.OfflineRecognizerConfig( + nn_model=nn_model, + tokens=tokens, + use_gpu=False, + decoding_method=decoding_method, + num_active_paths=num_active_paths, + ) + + recognizer = sherpa.OfflineRecognizer(config) + + return recognizer + + +@lru_cache(maxsize=10) +def _get_french_pre_trained_model( + repo_id: str, + decoding_method: str, + num_active_paths: int, +) -> sherpa_onnx.OnlineRecognizer: + assert repo_id in [ + "shaojieli/sherpa-onnx-streaming-zipformer-fr-2023-04-14", + ], repo_id + + encoder_model = _get_nn_model_filename( + repo_id=repo_id, + filename="encoder-epoch-29-avg-9-with-averaged-model.onnx", + subfolder=".", + ) + + decoder_model = 
_get_nn_model_filename( + repo_id=repo_id, + filename="decoder-epoch-29-avg-9-with-averaged-model.onnx", + subfolder=".", + ) + + joiner_model = _get_nn_model_filename( + repo_id=repo_id, + filename="joiner-epoch-29-avg-9-with-averaged-model.onnx", + subfolder=".", + ) + + tokens = _get_token_filename(repo_id=repo_id, subfolder=".") + + recognizer = sherpa_onnx.OnlineRecognizer.from_transducer( + tokens=tokens, + encoder=encoder_model, + decoder=decoder_model, + joiner=joiner_model, + num_threads=2, + sample_rate=16000, + feature_dim=80, + decoding_method=decoding_method, + max_active_paths=num_active_paths, + ) + + return recognizer + + +@lru_cache(maxsize=10) +def _get_sherpa_onnx_nemo_transducer_models( + repo_id: str, + decoding_method: str, + num_active_paths: int, +) -> sherpa_onnx.OfflineRecognizer: + assert repo_id in [ + "csukuangfj/sherpa-onnx-nemo-parakeet_tdt_transducer_110m-en-36000", + ], repo_id + + encoder_model = _get_nn_model_filename( + repo_id=repo_id, + filename="encoder.onnx", + subfolder=".", + ) + + decoder_model = _get_nn_model_filename( + repo_id=repo_id, + filename="decoder.onnx", + subfolder=".", + ) + + joiner_model = _get_nn_model_filename( + repo_id=repo_id, + filename="joiner.onnx", + subfolder=".", + ) + + tokens = _get_token_filename(repo_id=repo_id, subfolder=".") + + recognizer = sherpa_onnx.OfflineRecognizer.from_transducer( + tokens=tokens, + encoder=encoder_model, + decoder=decoder_model, + joiner=joiner_model, + num_threads=2, + sample_rate=16000, + feature_dim=80, + model_type="nemo_transducer", + decoding_method=decoding_method, + max_active_paths=num_active_paths, + ) + + return recognizer + + +@lru_cache(maxsize=10) +def _get_sherpa_onnx_nemo_ctc_models( + repo_id: str, + decoding_method: str, + num_active_paths: int, +) -> sherpa_onnx.OfflineRecognizer: + assert repo_id in [ + "csukuangfj/sherpa-onnx-nemo-parakeet_tdt_ctc_110m-en-36000", + ], repo_id + + model = _get_nn_model_filename( + repo_id=repo_id, + filename="model.onnx", + subfolder=".", + ) + + tokens = _get_token_filename(repo_id=repo_id, subfolder=".") + + recognizer = sherpa_onnx.OfflineRecognizer.from_nemo_ctc( + tokens=tokens, + model=model, + num_threads=2, + sample_rate=16000, + feature_dim=80, + ) + + return recognizer + + +@lru_cache(maxsize=10) +def _get_sherpa_onnx_offline_zipformer_pre_trained_model( + repo_id: str, + decoding_method: str, + num_active_paths: int, +) -> sherpa_onnx.OfflineRecognizer: + assert repo_id in [ + "csukuangfj/sherpa-onnx-zipformer-en-libriheavy-20230926-large", + "csukuangfj/sherpa-onnx-zipformer-en-libriheavy-20230926-medium", + "csukuangfj/sherpa-onnx-zipformer-en-libriheavy-20230926-small", + "csukuangfj/sherpa-onnx-zipformer-en-libriheavy-20230830-large-punct-case", + "csukuangfj/sherpa-onnx-zipformer-en-libriheavy-20230830-medium-punct-case", + "csukuangfj/sherpa-onnx-zipformer-en-libriheavy-20230830-small-punct-case", + ], repo_id + + if repo_id == "csukuangfj/sherpa-onnx-zipformer-en-libriheavy-20230926-large": + epoch = 16 + avg = 3 + elif repo_id == "csukuangfj/sherpa-onnx-zipformer-en-libriheavy-20230926-medium": + epoch = 60 + avg = 20 + elif repo_id == "csukuangfj/sherpa-onnx-zipformer-en-libriheavy-20230926-small": + epoch = 90 + avg = 20 + elif ( + repo_id + == "csukuangfj/sherpa-onnx-zipformer-en-libriheavy-20230830-large-punct-case" + ): + epoch = 16 + avg = 2 + elif ( + repo_id + == "csukuangfj/sherpa-onnx-zipformer-en-libriheavy-20230830-medium-punct-case" + ): + epoch = 50 + avg = 15 + elif ( + repo_id + == 
"csukuangfj/sherpa-onnx-zipformer-en-libriheavy-20230830-small-punct-case" + ): + epoch = 88 + avg = 41 + + encoder_model = _get_nn_model_filename( + repo_id=repo_id, + filename=f"encoder-epoch-{epoch}-avg-{avg}.int8.onnx", + subfolder=".", + ) + + decoder_model = _get_nn_model_filename( + repo_id=repo_id, + filename=f"decoder-epoch-{epoch}-avg-{avg}.onnx", + subfolder=".", + ) + + joiner_model = _get_nn_model_filename( + repo_id=repo_id, + filename=f"joiner-epoch-{epoch}-avg-{avg}.int8.onnx", + subfolder=".", + ) + + tokens = _get_token_filename(repo_id=repo_id, subfolder=".") + + recognizer = sherpa_onnx.OfflineRecognizer.from_transducer( + tokens=tokens, + encoder=encoder_model, + decoder=decoder_model, + joiner=joiner_model, + num_threads=2, + sample_rate=16000, + feature_dim=80, + decoding_method=decoding_method, + max_active_paths=num_active_paths, + ) + + return recognizer + + +@lru_cache(maxsize=10) +def _get_streaming_zipformer_pre_trained_model( + repo_id: str, + decoding_method: str, + num_active_paths: int, +) -> sherpa_onnx.OnlineRecognizer: + assert repo_id in [ + "csukuangfj/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20", + "k2-fsa/sherpa-onnx-streaming-zipformer-korean-2024-06-16", + ], repo_id + + encoder_model = _get_nn_model_filename( + repo_id=repo_id, + filename="encoder-epoch-99-avg-1.onnx", + subfolder=".", + ) + + decoder_model = _get_nn_model_filename( + repo_id=repo_id, + filename="decoder-epoch-99-avg-1.onnx", + subfolder=".", + ) + + joiner_model = _get_nn_model_filename( + repo_id=repo_id, + filename="joiner-epoch-99-avg-1.onnx", + subfolder=".", + ) + + tokens = _get_token_filename(repo_id=repo_id, subfolder=".") + + recognizer = sherpa_onnx.OnlineRecognizer.from_transducer( + tokens=tokens, + encoder=encoder_model, + decoder=decoder_model, + joiner=joiner_model, + num_threads=2, + sample_rate=16000, + feature_dim=80, + decoding_method=decoding_method, + max_active_paths=num_active_paths, + ) + + return recognizer + + +@lru_cache(maxsize=10) +def _get_japanese_pre_trained_model( + repo_id: str, + decoding_method: str, + num_active_paths: int, +) -> sherpa.OnlineRecognizer: + repo_id, kind = repo_id.rsplit("-", maxsplit=1) + + assert repo_id in [ + "TeoWenShen/icefall-asr-csj-pruned-transducer-stateless7-streaming-230208" + ], repo_id + assert kind in ("fluent", "disfluent"), kind + + encoder_model = _get_nn_model_filename( + repo_id=repo_id, filename="encoder_jit_trace.pt", subfolder=f"exp_{kind}" + ) + + decoder_model = _get_nn_model_filename( + repo_id=repo_id, filename="decoder_jit_trace.pt", subfolder=f"exp_{kind}" + ) + + joiner_model = _get_nn_model_filename( + repo_id=repo_id, filename="joiner_jit_trace.pt", subfolder=f"exp_{kind}" + ) + + tokens = _get_token_filename(repo_id=repo_id) + + feat_config = sherpa.FeatureConfig() + feat_config.fbank_opts.frame_opts.samp_freq = sample_rate + feat_config.fbank_opts.mel_opts.num_bins = 80 + feat_config.fbank_opts.frame_opts.dither = 0 + + config = sherpa.OnlineRecognizerConfig( + nn_model="", + encoder_model=encoder_model, + decoder_model=decoder_model, + joiner_model=joiner_model, + tokens=tokens, + use_gpu=False, + feat_config=feat_config, + decoding_method=decoding_method, + num_active_paths=num_active_paths, + chunk_size=32, + ) + + recognizer = sherpa.OnlineRecognizer(config) + + return recognizer + + +@lru_cache(maxsize=10) +def _get_gigaspeech_pre_trained_model_onnx( + repo_id: str, + decoding_method: str, + num_active_paths: int, +) -> sherpa_onnx.OfflineRecognizer: + assert repo_id in [ + 
"yfyeung/icefall-asr-gigaspeech-zipformer-2023-10-17", + ], repo_id + + encoder_model = _get_nn_model_filename( + repo_id=repo_id, + filename="encoder-epoch-30-avg-9.onnx", + subfolder="exp", + ) + + decoder_model = _get_nn_model_filename( + repo_id=repo_id, + filename="decoder-epoch-30-avg-9.onnx", + subfolder="exp", + ) + + joiner_model = _get_nn_model_filename( + repo_id=repo_id, + filename="joiner-epoch-30-avg-9.onnx", + subfolder="exp", + ) + + tokens = _get_token_filename(repo_id=repo_id, subfolder="data/lang_bpe_500") + + recognizer = sherpa_onnx.OfflineRecognizer.from_transducer( + tokens=tokens, + encoder=encoder_model, + decoder=decoder_model, + joiner=joiner_model, + num_threads=2, + sample_rate=16000, + feature_dim=80, + decoding_method=decoding_method, + max_active_paths=num_active_paths, + ) + + return recognizer + + +@lru_cache(maxsize=10) +def _get_streaming_paraformer_zh_yue_en_pre_trained_model( + repo_id: str, + decoding_method: str, + num_active_paths: int, +) -> sherpa_onnx.OnlineRecognizer: + assert repo_id in [ + "csukuangfj/sherpa-onnx-streaming-paraformer-trilingual-zh-cantonese-en", + ], repo_id + + encoder_model = _get_nn_model_filename( + repo_id=repo_id, + filename="encoder.int8.onnx", + subfolder=".", + ) + + decoder_model = _get_nn_model_filename( + repo_id=repo_id, + filename="decoder.int8.onnx", + subfolder=".", + ) + + tokens = _get_token_filename(repo_id=repo_id, subfolder=".") + + recognizer = sherpa_onnx.OnlineRecognizer.from_paraformer( + tokens=tokens, + encoder=encoder_model, + decoder=decoder_model, + num_threads=2, + sample_rate=16000, + feature_dim=80, + decoding_method=decoding_method, + ) + + return recognizer + + +@lru_cache(maxsize=10) +def _get_paraformer_en_pre_trained_model( + repo_id: str, + decoding_method: str, + num_active_paths: int, +) -> sherpa_onnx.OfflineRecognizer: + assert repo_id in [ + "yujinqiu/sherpa-onnx-paraformer-en-2023-10-24", + ], repo_id + + nn_model = _get_nn_model_filename( + repo_id=repo_id, + filename="model.int8.onnx", + subfolder=".", + ) + + tokens = _get_token_filename( + repo_id=repo_id, filename="new_tokens.txt", subfolder="." 
+ ) + + recognizer = sherpa_onnx.OfflineRecognizer.from_paraformer( + paraformer=nn_model, + tokens=tokens, + num_threads=2, + sample_rate=sample_rate, + feature_dim=80, + decoding_method="greedy_search", + debug=False, + ) + + return recognizer + + +@lru_cache(maxsize=5) +def _get_chinese_dialect_models( + repo_id: str, decoding_method: str, num_active_paths: int +) -> sherpa_onnx.OfflineRecognizer: + assert repo_id in [ + "csukuangfj/sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04", + ], repo_id + + nn_model = _get_nn_model_filename( + repo_id=repo_id, + filename="model.int8.onnx", + subfolder=".", + ) + + tokens = _get_token_filename(repo_id=repo_id, subfolder=".") + + recognizer = sherpa_onnx.OfflineRecognizer.from_telespeech_ctc( + model=nn_model, + tokens=tokens, + num_threads=2, + ) + + return recognizer + + +@lru_cache(maxsize=10) +def _get_sense_voice_pre_trained_model( + repo_id: str, + decoding_method: str, + num_active_paths: int, +) -> sherpa_onnx.OfflineRecognizer: + assert repo_id in [ + "csukuangfj/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17", + ], repo_id + + nn_model = _get_nn_model_filename( + repo_id=repo_id, + filename="model.int8.onnx", + subfolder=".", + ) + + tokens = _get_token_filename(repo_id=repo_id, subfolder=".") + + recognizer = sherpa_onnx.OfflineRecognizer.from_sense_voice( + model=nn_model, + tokens=tokens, + num_threads=2, + sample_rate=sample_rate, + feature_dim=80, + decoding_method="greedy_search", + debug=True, + use_itn=True, + ) + + return recognizer + + +@lru_cache(maxsize=10) +def _get_paraformer_pre_trained_model( + repo_id: str, + decoding_method: str, + num_active_paths: int, +) -> sherpa_onnx.OfflineRecognizer: + assert repo_id in [ + "csukuangfj/sherpa-onnx-paraformer-zh-2023-03-28", + "csukuangfj/sherpa-onnx-paraformer-zh-2024-03-09", + "csukuangfj/sherpa-onnx-paraformer-zh-small-2024-03-09", + "csukuangfj/sherpa-onnx-paraformer-trilingual-zh-cantonese-en", + "csukuangfj/sherpa-onnx-paraformer-en-2024-03-09", + ], repo_id + + nn_model = _get_nn_model_filename( + repo_id=repo_id, + filename="model.int8.onnx", + subfolder=".", + ) + + tokens = _get_token_filename(repo_id=repo_id, subfolder=".") + + recognizer = sherpa_onnx.OfflineRecognizer.from_paraformer( + paraformer=nn_model, + tokens=tokens, + num_threads=2, + sample_rate=sample_rate, + feature_dim=80, + decoding_method="greedy_search", + debug=False, + ) + + return recognizer + + +def _get_aishell_pre_trained_model( + repo_id: str, + decoding_method: str, + num_active_paths: int, +) -> sherpa_onnx.OfflineRecognizer: + assert repo_id in ( + "zrjin/icefall-asr-aishell-zipformer-large-2023-10-24", + "zrjin/icefall-asr-aishell-zipformer-small-2023-10-24", + "zrjin/icefall-asr-aishell-zipformer-2023-10-24", + ), repo_id + if repo_id == "zrjin/icefall-asr-aishell-zipformer-large-2023-10-24": + epoch = 56 + avg = 23 + elif repo_id == "zrjin/icefall-asr-aishell-zipformer-small-2023-10-24": + epoch = 55 + avg = 21 + elif repo_id == "zrjin/icefall-asr-aishell-zipformer-2023-10-24": + epoch = 55 + avg = 17 + + encoder_model = _get_nn_model_filename( + repo_id=repo_id, + filename=f"encoder-epoch-{epoch}-avg-{avg}.onnx", + subfolder="exp", + ) + + decoder_model = _get_nn_model_filename( + repo_id=repo_id, + filename=f"decoder-epoch-{epoch}-avg-{avg}.onnx", + subfolder="exp", + ) + + joiner_model = _get_nn_model_filename( + repo_id=repo_id, + filename=f"joiner-epoch-{epoch}-avg-{avg}.onnx", + subfolder="exp", + ) + + tokens = _get_token_filename(repo_id=repo_id, subfolder="data/lang_char") + + 
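+    # Build the offline transducer recognizer from the exported
+    # encoder/decoder/joiner ONNX models selected above.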
recognizer = sherpa_onnx.OfflineRecognizer.from_transducer( + tokens=tokens, + encoder=encoder_model, + decoder=decoder_model, + joiner=joiner_model, + num_threads=2, + sample_rate=16000, + feature_dim=80, + decoding_method=decoding_method, + max_active_paths=num_active_paths, + ) + + return recognizer + + +@lru_cache(maxsize=2) +def get_punct_model() -> sherpa_onnx.OfflinePunctuation: + model = _get_nn_model_filename( + repo_id="csukuangfj/sherpa-onnx-punct-ct-transformer-zh-en-vocab272727-2024-04-12", + filename="model.onnx", + subfolder=".", + ) + config = sherpa_onnx.OfflinePunctuationConfig( + model=sherpa_onnx.OfflinePunctuationModelConfig(ct_transformer=model), + ) + + punct = sherpa_onnx.OfflinePunctuation(config) + return punct + + +def _get_multi_zh_hans_pre_trained_model( + repo_id: str, + decoding_method: str, + num_active_paths: int, +) -> sherpa_onnx.OfflineRecognizer: + assert repo_id in ("zrjin/sherpa-onnx-zipformer-multi-zh-hans-2023-9-2",), repo_id + + encoder_model = _get_nn_model_filename( + repo_id=repo_id, + filename="encoder-epoch-20-avg-1.onnx", + subfolder=".", + ) + + decoder_model = _get_nn_model_filename( + repo_id=repo_id, + filename="decoder-epoch-20-avg-1.onnx", + subfolder=".", + ) + + joiner_model = _get_nn_model_filename( + repo_id=repo_id, + filename="joiner-epoch-20-avg-1.onnx", + subfolder=".", + ) + + tokens = _get_token_filename(repo_id=repo_id, subfolder=".") + + recognizer = sherpa_onnx.OfflineRecognizer.from_transducer( + tokens=tokens, + encoder=encoder_model, + decoder=decoder_model, + joiner=joiner_model, + num_threads=2, + sample_rate=16000, + feature_dim=80, + decoding_method=decoding_method, + max_active_paths=num_active_paths, + ) + + return recognizer + + +chinese_dialect_models = { + "csukuangfj/sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04": _get_chinese_dialect_models, +} + +chinese_models = { + "csukuangfj/sherpa-onnx-paraformer-zh-2024-03-09": _get_paraformer_pre_trained_model, + "luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless2": _get_wenetspeech_pre_trained_model, # noqa + "csukuangfj/sherpa-onnx-paraformer-zh-small-2024-03-09": _get_paraformer_pre_trained_model, + "zrjin/sherpa-onnx-zipformer-multi-zh-hans-2023-9-2": _get_multi_zh_hans_pre_trained_model, # noqa + "zrjin/icefall-asr-aishell-zipformer-large-2023-10-24": _get_aishell_pre_trained_model, # noqa + "zrjin/icefall-asr-aishell-zipformer-small-2023-10-24": _get_aishell_pre_trained_model, # noqa + "zrjin/icefall-asr-aishell-zipformer-2023-10-24": _get_aishell_pre_trained_model, # noqa + "desh2608/icefall-asr-alimeeting-pruned-transducer-stateless7": _get_alimeeting_pre_trained_model, + "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-A-2022-07-12": _get_aishell2_pretrained_model, # noqa + "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-B-2022-07-12": _get_aishell2_pretrained_model, # noqa + "luomingshuang/icefall_asr_aidatatang-200zh_pruned_transducer_stateless2": _get_aidatatang_200zh_pretrained_mode, # noqa + "luomingshuang/icefall_asr_alimeeting_pruned_transducer_stateless2": _get_alimeeting_pre_trained_model, # noqa + "csukuangfj/wenet-chinese-model": _get_wenet_model, + # "csukuangfj/icefall-asr-wenetspeech-lstm-transducer-stateless-2022-10-14": _get_lstm_transducer_model, +} + +english_models = { + "whisper-tiny.en": _get_whisper_model, + "moonshine-tiny": _get_moonshine_model, + "moonshine-base": _get_moonshine_model, + "whisper-base.en": _get_whisper_model, + "whisper-small.en": _get_whisper_model, + 
"csukuangfj/sherpa-onnx-nemo-parakeet_tdt_ctc_110m-en-36000": _get_sherpa_onnx_nemo_ctc_models, + "csukuangfj/sherpa-onnx-nemo-parakeet_tdt_transducer_110m-en-36000": _get_sherpa_onnx_nemo_transducer_models, + # "whisper-medium.en": _get_whisper_model, + "csukuangfj/sherpa-onnx-zipformer-en-libriheavy-20230926-large": _get_sherpa_onnx_offline_zipformer_pre_trained_model, + "csukuangfj/sherpa-onnx-zipformer-en-libriheavy-20230926-medium": _get_sherpa_onnx_offline_zipformer_pre_trained_model, + "csukuangfj/sherpa-onnx-zipformer-en-libriheavy-20230926-small": _get_sherpa_onnx_offline_zipformer_pre_trained_model, + "csukuangfj/sherpa-onnx-zipformer-en-libriheavy-20230830-large-punct-case": _get_sherpa_onnx_offline_zipformer_pre_trained_model, + "csukuangfj/sherpa-onnx-zipformer-en-libriheavy-20230830-medium-punct-case": _get_sherpa_onnx_offline_zipformer_pre_trained_model, + "csukuangfj/sherpa-onnx-zipformer-en-libriheavy-20230830-small-punct-case": _get_sherpa_onnx_offline_zipformer_pre_trained_model, + "csukuangfj/sherpa-onnx-paraformer-en-2024-03-09": _get_paraformer_pre_trained_model, + "yfyeung/icefall-asr-gigaspeech-zipformer-2023-10-17": _get_gigaspeech_pre_trained_model_onnx, # noqa + "wgb14/icefall-asr-gigaspeech-pruned-transducer-stateless2": _get_gigaspeech_pre_trained_model, # noqa + "yfyeung/icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04": _get_english_model, # noqa + "yfyeung/icefall-asr-finetune-mux-pruned_transducer_stateless7-2023-05-19": _get_english_model, # noqa + "WeijiZhuang/icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02": _get_english_model, # noqa + "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless8-2022-11-14": _get_english_model, # noqa + "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11": _get_english_model, # noqa + "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13": _get_english_model, # noqa + "yujinqiu/sherpa-onnx-paraformer-en-2023-10-24": _get_paraformer_en_pre_trained_model, + "Zengwei/icefall-asr-librispeech-zipformer-large-2023-05-16": _get_english_model, # noqa + "Zengwei/icefall-asr-librispeech-zipformer-2023-05-15": _get_english_model, # noqa + "Zengwei/icefall-asr-librispeech-zipformer-small-2023-05-16": _get_english_model, # noqa + "videodanchik/icefall-asr-tedlium3-conformer-ctc2": _get_english_model, + "pkufool/icefall_asr_librispeech_conformer_ctc": _get_english_model, + "WayneWiser/icefall-asr-librispeech-conformer-ctc2-jit-bpe-500-2022-07-21": _get_english_model, + "csukuangfj/wenet-english-model": _get_wenet_model, +} + +multi_lingual_models = { + "csukuangfj/sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02": _get_dolphin_ctc_models, + "csukuangfj/sherpa-onnx-dolphin-small-ctc-multi-lang-int8-2025-04-02": _get_dolphin_ctc_models, + "csukuangfj/sherpa-onnx-dolphin-base-ctc-multi-lang-2025-04-02": _get_dolphin_ctc_models, + "csukuangfj/sherpa-onnx-dolphin-small-ctc-multi-lang-2025-04-02": _get_dolphin_ctc_models, +} + +chinese_english_mixed_models = { + "csukuangfj/sherpa-onnx-fire-red-asr-large-zh_en-2025-02-16": _get_fire_red_asr_models, + "csukuangfj/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20": _get_streaming_zipformer_pre_trained_model, + "zrjin/icefall-asr-zipformer-multi-zh-en-2023-11-22": _get_chinese_english_mixed_model_onnx, + "csukuangfj/sherpa-onnx-paraformer-zh-2023-03-28": _get_paraformer_pre_trained_model, + "ptrnull/icefall-asr-conv-emformer-transducer-stateless2-zh": _get_chinese_english_mixed_model, + 
"luomingshuang/icefall_asr_tal-csasr_pruned_transducer_stateless5": _get_chinese_english_mixed_model, # noqa +} + +tibetan_models = { + "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless7-2022-12-02": _get_tibetan_pre_trained_model, # noqa + "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless5-2022-11-29": _get_tibetan_pre_trained_model, # noqa +} + +arabic_models = { + "AmirHussein/icefall-asr-mgb2-conformer_ctc-2022-27-06": _get_arabic_pre_trained_model, # noqa +} + +german_models = { + "csukuangfj/wav2vec2.0-torchaudio": _get_german_pre_trained_model, +} + +french_models = { + "shaojieli/sherpa-onnx-streaming-zipformer-fr-2023-04-14": _get_french_pre_trained_model, +} + +japanese_models = { + "reazon-research/reazonspeech-k2-v2": _get_offline_pre_trained_model, + # "TeoWenShen/icefall-asr-csj-pruned-transducer-stateless7-streaming-230208-fluent": _get_japanese_pre_trained_model, + # "TeoWenShen/icefall-asr-csj-pruned-transducer-stateless7-streaming-230208-disfluent": _get_japanese_pre_trained_model, +} + +russian_models = { + "csukuangfj/sherpa-onnx-nemo-transducer-giga-am-v2-russian-2025-04-19": _get_russian_pre_trained_model, + "csukuangfj/sherpa-onnx-nemo-ctc-giga-am-v2-russian-2025-04-19": _get_russian_pre_trained_model_ctc, + "csukuangfj/sherpa-onnx-nemo-transducer-giga-am-russian-2024-10-24": _get_russian_pre_trained_model, + "csukuangfj/sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24": _get_russian_pre_trained_model_ctc, + "alphacep/vosk-model-ru": _get_russian_pre_trained_model, + "alphacep/vosk-model-small-ru": _get_russian_pre_trained_model, +} + +chinese_cantonese_english_models = { + "csukuangfj/sherpa-onnx-paraformer-trilingual-zh-cantonese-en": _get_paraformer_pre_trained_model, + "csukuangfj/sherpa-onnx-streaming-paraformer-trilingual-zh-cantonese-en": _get_streaming_paraformer_zh_yue_en_pre_trained_model, +} + +chinese_cantonese_english_japanese_korean_models = { + "csukuangfj/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17": _get_sense_voice_pre_trained_model, +} + +cantonese_models = { + "zrjin/icefall-asr-mdcc-zipformer-2024-03-11": _get_zrjin_cantonese_pre_trained_model, +} + +korean_models = { + "k2-fsa/sherpa-onnx-zipformer-korean-2024-06-24": _get_offline_pre_trained_model, + "k2-fsa/sherpa-onnx-streaming-zipformer-korean-2024-06-16": _get_streaming_zipformer_pre_trained_model, +} + +thai_models = { + "yfyeung/icefall-asr-gigaspeech2-th-zipformer-2024-06-20": _get_yifan_thai_pretrained_model, +} + + +all_models = { + **multi_lingual_models, + **chinese_models, + **english_models, + **chinese_english_mixed_models, + **chinese_cantonese_english_models, + **chinese_cantonese_english_japanese_korean_models, + **cantonese_models, + **japanese_models, + **tibetan_models, + **arabic_models, + **german_models, + **french_models, + **russian_models, + **korean_models, + **thai_models, +} + +language_to_models = { + "Multi-lingual (east aisa)": list(multi_lingual_models.keys()), + "超多种中文方言": list(chinese_dialect_models.keys()), + "Chinese": list(chinese_models.keys()), + "English": list(english_models.keys()), + "Chinese+English": list(chinese_english_mixed_models.keys()), + "Chinese+English+Cantonese": list(chinese_cantonese_english_models.keys()), + "Chinese+English+Cantonese+Japanese+Korean": list( + chinese_cantonese_english_japanese_korean_models.keys() + ), + "Cantonese": list(cantonese_models.keys()), + "Japanese": list(japanese_models.keys()), + "Tibetan": list(tibetan_models.keys()), + "Arabic": list(arabic_models.keys()), + "German": 
list(german_models.keys()), + "French": list(french_models.keys()), + "Russian": list(russian_models.keys()), + "Korean": list(korean_models.keys()), + "Thai": list(thai_models.keys()), +} diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..14b288d64fa32f92bacf6425910dddfca51546e3 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,15 @@ +https://download.pytorch.org/whl/cpu/torch-1.13.1%2Bcpu-cp310-cp310-linux_x86_64.whl +https://download.pytorch.org/whl/cpu/torchaudio-0.13.1%2Bcpu-cp310-cp310-linux_x86_64.whl + +https://huggingface.co/csukuangfj/k2/resolve/main/cpu/1.24.4.dev20250307/linux-x64/k2-1.24.4.dev20250307+cpu.torch1.13.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl +https://huggingface.co/csukuangfj/sherpa/resolve/main/cpu/1.4.0.dev20250307/linux-x64/k2_sherpa-1.4.0.dev20250307+cpu.torch1.13.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl +https://huggingface.co/csukuangfj/kaldifeat/resolve/main/cpu/1.25.5.dev20250307/linux-x64/kaldifeat-1.25.5.dev20250307+cpu.torch1.13.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + +sentencepiece>=0.1.96 +numpy<2 + +huggingface_hub + +#https://huggingface.co/csukuangfj/sherpa-onnx-wheels/resolve/main/cpu/1.11.3/sherpa_onnx-1.11.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + +sherpa-onnx>=1.11.3 diff --git a/test_wavs/aidatatang_200zh/README.md b/test_wavs/aidatatang_200zh/README.md new file mode 100644 index 0000000000000000000000000000000000000000..25d41e363682054f55476e217e2f262b89cb33dd --- /dev/null +++ b/test_wavs/aidatatang_200zh/README.md @@ -0,0 +1,2 @@ +Files are downloaded from +https://huggingface.co/luomingshuang/icefall_asr_aidatatang-200zh_pruned_transducer_stateless2/tree/main/test_wavs diff --git a/test_wavs/aidatatang_200zh/T0055G0036S0002.wav b/test_wavs/aidatatang_200zh/T0055G0036S0002.wav new file mode 100644 index 0000000000000000000000000000000000000000..b446cef1f96a4dc9f48bccb327e6fd50e2aac26b --- /dev/null +++ b/test_wavs/aidatatang_200zh/T0055G0036S0002.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c7bf25a97de0819064c05952d40d93047da474d1e927424b3f27fb71bca403e +size 67630 diff --git a/test_wavs/aidatatang_200zh/T0055G0036S0003.wav b/test_wavs/aidatatang_200zh/T0055G0036S0003.wav new file mode 100644 index 0000000000000000000000000000000000000000..194a1b1d612dbd6d0c560216d4797b05e533759d --- /dev/null +++ b/test_wavs/aidatatang_200zh/T0055G0036S0003.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88e2e8ef9cc009305e3cb42ddd806c757a7ffc1b85a4402c39e2b59e81ab9ec8 +size 94174 diff --git a/test_wavs/aidatatang_200zh/T0055G0036S0004.wav b/test_wavs/aidatatang_200zh/T0055G0036S0004.wav new file mode 100644 index 0000000000000000000000000000000000000000..2c1bb29488f0b63dbb021cf11f0903fd2378589f --- /dev/null +++ b/test_wavs/aidatatang_200zh/T0055G0036S0004.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea822f7873b89443191e4a3b4b08c62b81de3a0a4a7b806d273da975a0b9e9fc +size 70460 diff --git a/test_wavs/aishell2/ID0012W0030.wav b/test_wavs/aishell2/ID0012W0030.wav new file mode 100644 index 0000000000000000000000000000000000000000..0113391a8fc0096c10ac9a5a3382889b9c8e9ffd --- /dev/null +++ b/test_wavs/aishell2/ID0012W0030.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f042c6cd8cb7fc745f37805565b5ce41b9a4f38a54b267e1a9afd806d5216a38 +size 112878 diff --git a/test_wavs/aishell2/ID0012W0162.wav 
b/test_wavs/aishell2/ID0012W0162.wav new file mode 100644 index 0000000000000000000000000000000000000000..9908691638b238131edf10ccc8ea57952a0b6699 --- /dev/null +++ b/test_wavs/aishell2/ID0012W0162.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aacdc76fc8b37bc2bdd1c05a4bfd42a5ac3333a53c06088abe9814fb1e5e0912 +size 114124 diff --git a/test_wavs/aishell2/ID0012W0215.wav b/test_wavs/aishell2/ID0012W0215.wav new file mode 100644 index 0000000000000000000000000000000000000000..c241b6ea30f4d5a25475049b60f083e283160467 --- /dev/null +++ b/test_wavs/aishell2/ID0012W0215.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f48eb860503ec691d7d6b99dfc1491a88f30a0930676b3c5dc9170edce041c46 +size 104368 diff --git a/test_wavs/aishell2/README.md b/test_wavs/aishell2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..40a16b2ac43de0a40248b86e198e7077b8e44ee6 --- /dev/null +++ b/test_wavs/aishell2/README.md @@ -0,0 +1,2 @@ +Files are downloaded from +https://huggingface.co/yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-B-2022-07-12/tree/main/test_wavs diff --git a/test_wavs/aishell2/trans.txt b/test_wavs/aishell2/trans.txt new file mode 100644 index 0000000000000000000000000000000000000000..755cf07a765a5732ccfa42a36afefdb4b66355ba --- /dev/null +++ b/test_wavs/aishell2/trans.txt @@ -0,0 +1,3 @@ +ID0012W0162 立法机关采纳了第二种意见 +ID0012W0215 大家都愿意牺牲自己的生命 +ID0012W0030 完全是典型的军事侵略 \ No newline at end of file diff --git a/test_wavs/alimeeting/165.wav b/test_wavs/alimeeting/165.wav new file mode 100644 index 0000000000000000000000000000000000000000..b9a004b21005393b548d83f24eb3e91dc0810cb6 --- /dev/null +++ b/test_wavs/alimeeting/165.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48c131d205a0d93acdcdfc0d81e2ee839f4f3261ca7654e3e3ce175a0ec6098d +size 262764 diff --git a/test_wavs/alimeeting/209.wav b/test_wavs/alimeeting/209.wav new file mode 100644 index 0000000000000000000000000000000000000000..6bf339fdb1fb572a68a342e294f2b46e961d2900 --- /dev/null +++ b/test_wavs/alimeeting/209.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9374efff5517fd624ceee8551cd8cd3680fc3ed8ff964fe5f17c1064f05ebfb +size 154604 diff --git a/test_wavs/alimeeting/74.wav b/test_wavs/alimeeting/74.wav new file mode 100644 index 0000000000000000000000000000000000000000..1bf165f1ca563f5ce58a09bfac0cffffa130fd83 --- /dev/null +++ b/test_wavs/alimeeting/74.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c371dd14ff73d7128e1508c71dd6eef934f91c082e5946bf4bdd87761ae44a13 +size 120364 diff --git a/test_wavs/alimeeting/R8003_M8001-8004-165.wav b/test_wavs/alimeeting/R8003_M8001-8004-165.wav new file mode 100644 index 0000000000000000000000000000000000000000..dda41565dff7bcb0ecb68023a75959eff972a95f --- /dev/null +++ b/test_wavs/alimeeting/R8003_M8001-8004-165.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b10ddaddabeb905a7915f670502773328d3321beda436907fb0f36c52b2d04e +size 525498 diff --git a/test_wavs/alimeeting/R8008_M8013-8049-74.wav b/test_wavs/alimeeting/R8008_M8013-8049-74.wav new file mode 100644 index 0000000000000000000000000000000000000000..a52c668e006bc60469988a8aa1c502c0cd23bc3f --- /dev/null +++ b/test_wavs/alimeeting/R8008_M8013-8049-74.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9cc97f90e46825e8d6783ea0d41112165c5fffb33d5519fd0d3c6860a43cac70 +size 240698 diff --git 
a/test_wavs/alimeeting/R8009_M8020_N_SPK8026-8026-209.wav b/test_wavs/alimeeting/R8009_M8020_N_SPK8026-8026-209.wav new file mode 100644 index 0000000000000000000000000000000000000000..797d31d4007b1f9bb143713f18769a866ae0c179 --- /dev/null +++ b/test_wavs/alimeeting/R8009_M8020_N_SPK8026-8026-209.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f825ce6a99b00ec30cb276ee821099b63b1594a6782b88aa5117bd578b61f5a +size 309178 diff --git a/test_wavs/alimeeting/trans.txt b/test_wavs/alimeeting/trans.txt new file mode 100644 index 0000000000000000000000000000000000000000..497af7b9cdb3c56f919b9e11378cbe289cd9b833 --- /dev/null +++ b/test_wavs/alimeeting/trans.txt @@ -0,0 +1,3 @@ +R8009_M8020_N_SPK8026-8026-209 并不是说一天的话就一定要对一个人进行一个了解这样的话 +R8003_M8001-8004-165 如果他要是不愿意提供地址也不愿意接收礼物那么第二个这个分支可能就省省下了 +R8008_M8013-8049-74 面试的话五月五号到五月十号吧面试 diff --git a/test_wavs/arabic/a.wav b/test_wavs/arabic/a.wav new file mode 100644 index 0000000000000000000000000000000000000000..68bb32af5660a709a66847de91d475cf8f9bf95d --- /dev/null +++ b/test_wavs/arabic/a.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09d4ef01e713b5ea57459dcb8e31631816bc8acdc0833dc41ad3b1ff000a4da5 +size 252846 diff --git a/test_wavs/arabic/b.wav b/test_wavs/arabic/b.wav new file mode 100644 index 0000000000000000000000000000000000000000..67ba665347811dfe2ae9810748069c842826870d --- /dev/null +++ b/test_wavs/arabic/b.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:faecc4e69fb4a1b64b47edada3a6a84c8ff7216027c2490b105b4481bef4b12c +size 243244 diff --git a/test_wavs/arabic/c.wav b/test_wavs/arabic/c.wav new file mode 100644 index 0000000000000000000000000000000000000000..4817939203c5aa447649cab7d7bf24ab2b2f3a09 --- /dev/null +++ b/test_wavs/arabic/c.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62f08f3c5148e8c69c1607cb067e66034820c4a4322c80e7b396b1bd4360de8b +size 149804 diff --git a/test_wavs/arabic/trans.txt b/test_wavs/arabic/trans.txt new file mode 100644 index 0000000000000000000000000000000000000000..4f6e20c7e2921a1113d55345fdcd2eed184fe869 --- /dev/null +++ b/test_wavs/arabic/trans.txt @@ -0,0 +1,3 @@ +94D37D38-B203-4FC0-9F3A-538F5C174920_spk-0001_seg-0053813:0054281 بعد أن عجز وبدأ يصدر مشكلات شعبه ومشكلات مصر +94D37D38-B203-4FC0-9F3A-538F5C174920_spk-0001_seg-0051454:0052244 وهؤلاء أولياء الشيطان ها هو ذا أحدهم الآن ضيفا عليكم على قناة الجزيرة ولا يستحي في ذلك +94D37D38-B203-4FC0-9F3A-538F5C174920_spk-0001_seg-0052244:0053004 عندما استغاث الليبيون بالعالم استغاثوا لرفع الظلم وليس لقهر إرادة الأمة ومصادرة الحياة الدستورية diff --git a/test_wavs/cantonese/1.wav b/test_wavs/cantonese/1.wav new file mode 100644 index 0000000000000000000000000000000000000000..8fb8573fc2429890daf90ff1527e7103a624ad59 --- /dev/null +++ b/test_wavs/cantonese/1.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22568f57d298bea915f263dea7f41d628eea096e80a85b81ce88b7689ef3eee4 +size 191276 diff --git a/test_wavs/cantonese/2.wav b/test_wavs/cantonese/2.wav new file mode 100644 index 0000000000000000000000000000000000000000..9f5c30894c7c250f221a90ce83bb6f4d09e1269d --- /dev/null +++ b/test_wavs/cantonese/2.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d75fcd99f9693e91ce3303c97d312594a2a95659db5d43bdcefa87e2256e0de +size 139052 diff --git a/test_wavs/french/common_voice_fr_19364697.wav b/test_wavs/french/common_voice_fr_19364697.wav new file mode 100644 index 
0000000000000000000000000000000000000000..f7aa0bb86010fe026bf1a564c6af0e1ac1cad5c6 --- /dev/null +++ b/test_wavs/french/common_voice_fr_19364697.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b057a0b3badb2b5e1352b6b058726dc03a063e74794232ed266d5b3ad573f9ca +size 228174 diff --git a/test_wavs/french/common_voice_fr_19738183.wav b/test_wavs/french/common_voice_fr_19738183.wav new file mode 100644 index 0000000000000000000000000000000000000000..2d6aa7e54984a20671ad160081df9549f5eee416 --- /dev/null +++ b/test_wavs/french/common_voice_fr_19738183.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af7487e23134c3fcc6d74627dcefb5c3c45a2bfa24b4290758efd89139a43884 +size 122190 diff --git a/test_wavs/french/common_voice_fr_27024649.wav b/test_wavs/french/common_voice_fr_27024649.wav new file mode 100644 index 0000000000000000000000000000000000000000..f67bbaceb4c34dc01f1ef1d17b6a8fcba0d947c9 --- /dev/null +++ b/test_wavs/french/common_voice_fr_27024649.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76021a91ebbe9110d8cbd19a091cea4c305c417ba0c25f32d6f995c362b0b9f2 +size 202830 diff --git a/test_wavs/french/trans.txt b/test_wavs/french/trans.txt new file mode 100644 index 0000000000000000000000000000000000000000..16e2b4fc4b5833466996f7461c471048125e264d --- /dev/null +++ b/test_wavs/french/trans.txt @@ -0,0 +1,3 @@ +common_voice_fr_19738183 CE DERNIER A ÉVOLUÉ TOUT AU LONG DE L'HISTOIRE ROMAINE +common_voice_fr_27024649 SON ACTIONNAIRE MAJORITAIRE EST LE CONSEIL TERRITORIAL DE SAINT PIERRE ET MIQUELON +common_voice_fr_19364697 CE SITE CONTIENT QUATRE TOMBEAUX DE LA DYNASTIE ACHÉMÉNIDE ET SEPT DES SASSANIDES diff --git a/test_wavs/german/20120315-0900-PLENARY-14-de_20120315.wav b/test_wavs/german/20120315-0900-PLENARY-14-de_20120315.wav new file mode 100644 index 0000000000000000000000000000000000000000..6fddecfb7317ea4b18fdabf40d310f5768900fc5 --- /dev/null +++ b/test_wavs/german/20120315-0900-PLENARY-14-de_20120315.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:edc4f5a2c3e4f6ce99d11490087ef23fa55806a5e32575d3528bf599e0deb711 +size 381356 diff --git a/test_wavs/german/20170517-0900-PLENARY-16-de_20170517.wav b/test_wavs/german/20170517-0900-PLENARY-16-de_20170517.wav new file mode 100644 index 0000000000000000000000000000000000000000..d91c7394eb1473ba673807d1693db33a58b08f88 --- /dev/null +++ b/test_wavs/german/20170517-0900-PLENARY-16-de_20170517.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c3b63669e92c6df5bfa3aae0843c64f9eef1be2e85e652b0991a25ebc4e30bb +size 282284 diff --git a/test_wavs/gigaspeech/1-minute-audiobook.opus b/test_wavs/gigaspeech/1-minute-audiobook.opus new file mode 100644 index 0000000000000000000000000000000000000000..2122e1cbe795792b15541d9e017c69052106f3ab --- /dev/null +++ b/test_wavs/gigaspeech/1-minute-audiobook.opus @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:759d82de055d12fdfd6bdc74990ad32943a5a061565c457a7eeef73feba6d47f +size 579661 diff --git a/test_wavs/gigaspeech/100-seconds-podcast.opus b/test_wavs/gigaspeech/100-seconds-podcast.opus new file mode 100644 index 0000000000000000000000000000000000000000..25e9caf93d74fe99cafb0cc46c2d4bb069314d0e --- /dev/null +++ b/test_wavs/gigaspeech/100-seconds-podcast.opus @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ecb3d5ab9c5eafdc7dc95de7a6e3a0ea6656b524ab0650427cdff829fe3347a0 +size 954991 diff --git 
a/test_wavs/gigaspeech/100-seconds-youtube.opus b/test_wavs/gigaspeech/100-seconds-youtube.opus new file mode 100644 index 0000000000000000000000000000000000000000..31fa19d3258f4d72af3f1f85f4f5f9933e952407 --- /dev/null +++ b/test_wavs/gigaspeech/100-seconds-youtube.opus @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4f0d18ddb1e0b45ef0a3ffdeee1045fa465d39bde77bcc027f5788e72fef646 +size 947770 diff --git a/test_wavs/japanese/1.wav b/test_wavs/japanese/1.wav new file mode 100644 index 0000000000000000000000000000000000000000..0016bb851847d68b7d7c7245564b035495c6561a --- /dev/null +++ b/test_wavs/japanese/1.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c8ccaa1878720165a8034763f2f3fa4fc3333472b09b75d71cdf1017db7af32 +size 429934 diff --git a/test_wavs/japanese/2.wav b/test_wavs/japanese/2.wav new file mode 100644 index 0000000000000000000000000000000000000000..00f3359a70298a81169a977d31b973bd2b9bccc7 --- /dev/null +++ b/test_wavs/japanese/2.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aea37375438a3d285b7c4b80434d23c2647b5d988c4373933c817308313f14fe +size 211996 diff --git a/test_wavs/japanese/3.wav b/test_wavs/japanese/3.wav new file mode 100644 index 0000000000000000000000000000000000000000..93113e90fe52600ef69fda04274865219577a51c --- /dev/null +++ b/test_wavs/japanese/3.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8195ae4c0b5e3cad89e5e92aa7e19d681cea73ca8cf193649e423ecb5a19a0c7 +size 199452 diff --git a/test_wavs/japanese/4.wav b/test_wavs/japanese/4.wav new file mode 100644 index 0000000000000000000000000000000000000000..3dcb311d02f7a632d3d5c804d504027ebfacefdb --- /dev/null +++ b/test_wavs/japanese/4.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fed64f1cdd19a72c4ef66053d2a0a66e8b35a46b6d98a359acacd3bd81478cfa +size 328468 diff --git a/test_wavs/japanese/5.wav b/test_wavs/japanese/5.wav new file mode 100644 index 0000000000000000000000000000000000000000..c6eb0ca45e39cb78bf894df7fecb2ed021258472 --- /dev/null +++ b/test_wavs/japanese/5.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eaa18f4be5e77a340bea3d0bc25f84feaa352b3d5cba541197c2b2740e7f1dd1 +size 446868 diff --git a/test_wavs/japanese/transcript.txt b/test_wavs/japanese/transcript.txt new file mode 100644 index 0000000000000000000000000000000000000000..d33154426027407cdfabf84bbea0080c16c0749c --- /dev/null +++ b/test_wavs/japanese/transcript.txt @@ -0,0 +1,5 @@ +1.wav 気象庁は、雪や路面の凍結による交通への影響、暴風雪や高波に警戒するとともに、雪崩や屋根からの落雪にも十分注意するよう呼びかけています。 +2.wav はやくおじいさんにあのおとこのはなしをきかせたかったのです。 +3.wav ヤンバルクイナとの出会いは18歳の時だった。 +4.wav H2Aは、打ち上げの成功率は高い一方、1回の打ち上げ費用がおよそ100億円と、高額であることが課題となっていました。 +5.wav 持ち主とはぐれた傘が風で舞い看板もなぎ倒されてしまったようです。 diff --git a/test_wavs/korean/0.wav b/test_wavs/korean/0.wav new file mode 100644 index 0000000000000000000000000000000000000000..a028defd15e3470106cd77964a1edc62f9751747 --- /dev/null +++ b/test_wavs/korean/0.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0faf0b037efe428e5e561195f4d2aa148b2a0a2a5fc540b2c184b9d5c241e984 +size 112892 diff --git a/test_wavs/korean/1.wav b/test_wavs/korean/1.wav new file mode 100644 index 0000000000000000000000000000000000000000..32c88e20f2d8c8ffa2be4cd3f390999c53886047 --- /dev/null +++ b/test_wavs/korean/1.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b59bf1209d0d37088335d94f21394f31d794743bc9c849e3a4c9932a985c0bae +size 108992 diff --git a/test_wavs/korean/2.wav 
b/test_wavs/korean/2.wav new file mode 100644 index 0000000000000000000000000000000000000000..e17e61e2b7846ac9facb75ee9d7ea61e745fec1d --- /dev/null +++ b/test_wavs/korean/2.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed95184720061842e8f0f5df7e5826f97b0b26cd3c9bff18709f5be07ff18728 +size 212142 diff --git a/test_wavs/korean/3.wav b/test_wavs/korean/3.wav new file mode 100644 index 0000000000000000000000000000000000000000..d458a12da49575e05ceb163b66228cf5f52938de --- /dev/null +++ b/test_wavs/korean/3.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1d2de5f90c73dfacddc1d6ab93a41427c89573f261ed2d425a6a37b3ee32931 +size 85834 diff --git a/test_wavs/korean/trans.txt b/test_wavs/korean/trans.txt new file mode 100644 index 0000000000000000000000000000000000000000..b50b66e57d867c58766c48db8820acea46744b33 --- /dev/null +++ b/test_wavs/korean/trans.txt @@ -0,0 +1,4 @@ +0.wav 그는 괜찮은 척하려고 애쓰는 것 같았다. +1.wav 지하철에서 다리를 벌리고 앉지 마라. +2.wav 부모가 저지르는 큰 실수 중 하나는 자기 아이를 다른 집 아이와 비교하는 것이다. +3.wav 주민등록증을 보여 주시겠어요? diff --git a/test_wavs/librispeech/1089-134686-0001.wav b/test_wavs/librispeech/1089-134686-0001.wav new file mode 100644 index 0000000000000000000000000000000000000000..f58b5dd584aceb425bb6c46e62d434a5e5a171eb --- /dev/null +++ b/test_wavs/librispeech/1089-134686-0001.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6bc58a4efdf20daac252b6b1502632601a71efe0308f6757dc1eda34891a7e4f +size 212044 diff --git a/test_wavs/librispeech/1221-135766-0001.wav b/test_wavs/librispeech/1221-135766-0001.wav new file mode 100644 index 0000000000000000000000000000000000000000..32f8e24ed1c40a1ea1039728b40742541b37716f --- /dev/null +++ b/test_wavs/librispeech/1221-135766-0001.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5143a6ba93c4b274e2c4ac22deb75c2c48936c853f0519add1de828b6c79cc5a +size 534924 diff --git a/test_wavs/librispeech/1221-135766-0002.wav b/test_wavs/librispeech/1221-135766-0002.wav new file mode 100644 index 0000000000000000000000000000000000000000..0f4ab4796500e2a529063deb400e32adea91f1f0 --- /dev/null +++ b/test_wavs/librispeech/1221-135766-0002.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65b222837919ccbb924a4e1077413ea7cc6af3e68b663b012a9539d5c05850f0 +size 154444 diff --git a/test_wavs/librispeech/README.md b/test_wavs/librispeech/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c5076b0ba5843e6fad94fdb935c8f321170f9ae1 --- /dev/null +++ b/test_wavs/librispeech/README.md @@ -0,0 +1,2 @@ +Files are downloaded from +https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless5-2022-05-13/tree/main/test_wavs diff --git a/test_wavs/librispeech/trans.txt b/test_wavs/librispeech/trans.txt new file mode 100644 index 0000000000000000000000000000000000000000..07661742c41af18d8e06b81510df68fadc381f1c --- /dev/null +++ b/test_wavs/librispeech/trans.txt @@ -0,0 +1,3 @@ +1089-134686-0001 AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS +1221-135766-0001 GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONOURED BOSOM TO CONNECT HER PARENT FOR EVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN +1221-135766-0002 YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION diff --git 
"a/test_wavs/paraformer-zh/\345\233\233\345\267\235\350\257\235.wav" "b/test_wavs/paraformer-zh/\345\233\233\345\267\235\350\257\235.wav" new file mode 100644 index 0000000000000000000000000000000000000000..bef94074c0a92f5885684ef4db9bb2f5c5402f35 --- /dev/null +++ "b/test_wavs/paraformer-zh/\345\233\233\345\267\235\350\257\235.wav" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:150b75ada9f5b4011018e961e56427e4af6849fcc69764321556586bc4790c49 +size 250764 diff --git "a/test_wavs/paraformer-zh/\345\244\251\346\264\245\350\257\235.wav" "b/test_wavs/paraformer-zh/\345\244\251\346\264\245\350\257\235.wav" new file mode 100644 index 0000000000000000000000000000000000000000..63f7275d942967d9db5c77d760989ea177980a2a --- /dev/null +++ "b/test_wavs/paraformer-zh/\345\244\251\346\264\245\350\257\235.wav" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1127451bad4c45953a5b788ab4977170388e91454ef90fee55754c4587628782 +size 249932 diff --git "a/test_wavs/paraformer-zh/\351\203\221\345\267\236\350\257\235.wav" "b/test_wavs/paraformer-zh/\351\203\221\345\267\236\350\257\235.wav" new file mode 100644 index 0000000000000000000000000000000000000000..378c56dc3acedaca363c493ff48f5cb3ef334036 --- /dev/null +++ "b/test_wavs/paraformer-zh/\351\203\221\345\267\236\350\257\235.wav" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed767dc9a89489888060c65bc246e702b2ab25e16977998be10edc459c9e4397 +size 255724 diff --git a/test_wavs/russian/russian-i-love-you.wav b/test_wavs/russian/russian-i-love-you.wav new file mode 100644 index 0000000000000000000000000000000000000000..cbc1722c5c48ab647cb9212aa9a3fc0bb806296e --- /dev/null +++ b/test_wavs/russian/russian-i-love-you.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3ac4f6e5b818ec89bdd884f60637daa32ef0ed19a11981b7e02e3e7799dfd79 +size 295758 diff --git a/test_wavs/russian/test.wav b/test_wavs/russian/test.wav new file mode 100644 index 0000000000000000000000000000000000000000..d2ca4aba0268846f57bed312d9068485386611e1 --- /dev/null +++ b/test_wavs/russian/test.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6e768803b3bc3afcb08326677f3bb872c9beeed29af40d3579bcc14e74484f8 +size 226604 diff --git a/test_wavs/sense_voice/en.wav b/test_wavs/sense_voice/en.wav new file mode 100644 index 0000000000000000000000000000000000000000..bc726367bd0aa4bab950332f59d07212fd1a3df7 --- /dev/null +++ b/test_wavs/sense_voice/en.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b9dd4301ebb66df8937cb217c5f13cc30f5106c842249e1e7fcbf6be29e23de +size 228908 diff --git a/test_wavs/sense_voice/ja.wav b/test_wavs/sense_voice/ja.wav new file mode 100644 index 0000000000000000000000000000000000000000..a654dfbb85196c79e8010b79988f85285fde7995 --- /dev/null +++ b/test_wavs/sense_voice/ja.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0a91ca754be0f7c646f78f07e8f560aff510e632abcd937fd933cf3be950b54 +size 230444 diff --git a/test_wavs/sense_voice/ko.wav b/test_wavs/sense_voice/ko.wav new file mode 100644 index 0000000000000000000000000000000000000000..f57e841f6d3c689718b355783ac6f5ad2305675a --- /dev/null +++ b/test_wavs/sense_voice/ko.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7402c57e1b368a58b463fa7c33860bb60be19f960cfc11ad15719b075c9e76f1 +size 147500 diff --git a/test_wavs/sense_voice/yue.wav b/test_wavs/sense_voice/yue.wav new file mode 100644 index 
0000000000000000000000000000000000000000..f20a7eb6c00ff94569e10cc04a732ef5f1c2bad0 --- /dev/null +++ b/test_wavs/sense_voice/yue.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d4ae92ca47165d20a63ea172ae743aecd336e7af4ad15827689e752681604b4 +size 164780 diff --git a/test_wavs/sense_voice/zh.wav b/test_wavs/sense_voice/zh.wav new file mode 100644 index 0000000000000000000000000000000000000000..a9e058ae2ca47d9d0741e5ac73daad057a3e9967 --- /dev/null +++ b/test_wavs/sense_voice/zh.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68590b46b07ec7e99810590d24bc8a371dc733c001ddb8097833e948b7b23987 +size 178988 diff --git a/test_wavs/tal_csasr/0.wav b/test_wavs/tal_csasr/0.wav new file mode 100644 index 0000000000000000000000000000000000000000..1d9c8972ae4f8245d819146fafb501f3be41f3bd --- /dev/null +++ b/test_wavs/tal_csasr/0.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eddf384a906bd6d905c9d9d652d614def1857608b88c2eee663ceeccbb31f7a3 +size 259278 diff --git a/test_wavs/tal_csasr/210_36476_210_8341_1_1533271973_7057520_132.wav b/test_wavs/tal_csasr/210_36476_210_8341_1_1533271973_7057520_132.wav new file mode 100644 index 0000000000000000000000000000000000000000..6a72a4d7d33d069c8b858047964402c0bfb26a6e --- /dev/null +++ b/test_wavs/tal_csasr/210_36476_210_8341_1_1533271973_7057520_132.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bfb42c963e623ebab31b81ff4404867d07d3102507c87ac14577c4c61663b8c +size 163244 diff --git a/test_wavs/tal_csasr/210_36476_210_8341_1_1533271973_7057520_138.wav b/test_wavs/tal_csasr/210_36476_210_8341_1_1533271973_7057520_138.wav new file mode 100644 index 0000000000000000000000000000000000000000..96c6d818f4936e39e5d11485afd441fb949fd114 --- /dev/null +++ b/test_wavs/tal_csasr/210_36476_210_8341_1_1533271973_7057520_138.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20805bcc696b9b65f3357a2508a419d4f33b81006d47e1312e219b0fad934d1d +size 150124 diff --git a/test_wavs/tal_csasr/210_36476_210_8341_1_1533271973_7057520_145.wav b/test_wavs/tal_csasr/210_36476_210_8341_1_1533271973_7057520_145.wav new file mode 100644 index 0000000000000000000000000000000000000000..9a4fa6b2c8786388906257d56aa64b9db4e38653 --- /dev/null +++ b/test_wavs/tal_csasr/210_36476_210_8341_1_1533271973_7057520_145.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5851e31f3e8e5f741635da9466e19fccfbabd117ebcdc92db242c92483aee064 +size 282604 diff --git a/test_wavs/tal_csasr/README.md b/test_wavs/tal_csasr/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bd1d534036b9aa2f98fc42740e67c6c0100415a2 --- /dev/null +++ b/test_wavs/tal_csasr/README.md @@ -0,0 +1,2 @@ +Files are downloaded from +https://huggingface.co/luomingshuang/icefall_asr_tal-csasr_pruned_transducer_stateless5/tree/main/test_wavs diff --git a/test_wavs/tedlium3/DanBarber_2010-219.wav b/test_wavs/tedlium3/DanBarber_2010-219.wav new file mode 100644 index 0000000000000000000000000000000000000000..a86439e1a6d3215829cbe0c3b4d904c86e20c8c6 --- /dev/null +++ b/test_wavs/tedlium3/DanBarber_2010-219.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:667b031ad4f6ca87c00a152fe8d0005f6e703465662ed87815cb53d2e74a9bb1 +size 116908 diff --git a/test_wavs/tedlium3/DanielKahneman_2010-157.wav b/test_wavs/tedlium3/DanielKahneman_2010-157.wav new file mode 100644 index 
0000000000000000000000000000000000000000..bc5ce7e4294292817fbc5f0f9b42d10ed2192a2b --- /dev/null +++ b/test_wavs/tedlium3/DanielKahneman_2010-157.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38adf05dc46341aac948d7d43f5dfc9d8a3627989dbe9d43ab8548fa060612d9 +size 403564 diff --git a/test_wavs/tedlium3/RobertGupta_2010U-15.wav b/test_wavs/tedlium3/RobertGupta_2010U-15.wav new file mode 100644 index 0000000000000000000000000000000000000000..fba230219ebcbb680809b672e6de7085f03468dc --- /dev/null +++ b/test_wavs/tedlium3/RobertGupta_2010U-15.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:abadb1c7a252fd1c5781f8ee0a4378db9ec99924048b594b30b416dd2cf2013a +size 554604 diff --git a/test_wavs/tedlium3/trans.txt b/test_wavs/tedlium3/trans.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e2420a329b27fc9f4741ab75195ae123442a83c --- /dev/null +++ b/test_wavs/tedlium3/trans.txt @@ -0,0 +1,3 @@ +DanBarber_2010-219 well the last year this property had six hundred thousand birds on it +RobertGupta_2010U-15 and he was talking about invisible demons and smoke and how someone was poisoning him in his sleep and i was afraid not for myself but i was afraid that i was going to lose him that he was going to sink into one of his states +DanielKahneman_2010-157 goes very different ways depending on how you think and whether you think of the remembering self or you think of the experiencing self this is going to influence policy i think in years to come in the united states efforts are being made diff --git a/test_wavs/thai/0.wav b/test_wavs/thai/0.wav new file mode 100644 index 0000000000000000000000000000000000000000..2618976dd76bbe7330e173e6223396cc885fa182 --- /dev/null +++ b/test_wavs/thai/0.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf49ed282c1e7562cec9f237a72b9ccea1465f5ef286531443c9842c3f78af41 +size 143916 diff --git a/test_wavs/thai/1.wav b/test_wavs/thai/1.wav new file mode 100644 index 0000000000000000000000000000000000000000..1948b05651a70074a3e8a082ec236b684276a684 --- /dev/null +++ b/test_wavs/thai/1.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea040f01c9afacdd9c6c87424ad2cb966e0628817d458a3b8b990eb502b73675 +size 137260 diff --git a/test_wavs/thai/2.wav b/test_wavs/thai/2.wav new file mode 100644 index 0000000000000000000000000000000000000000..8d1d712dd5116f7476963359fb67db740cd5b77c --- /dev/null +++ b/test_wavs/thai/2.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e55272ea0bba30ef92d46dbc3636f6fcfa579786ee9ebc629e6d8122354cc61e +size 316140 diff --git a/test_wavs/thai/trans.txt b/test_wavs/thai/trans.txt new file mode 100644 index 0000000000000000000000000000000000000000..7f38f54aecbdd83c8c799f786798dfd6edaafac1 --- /dev/null +++ b/test_wavs/thai/trans.txt @@ -0,0 +1,3 @@ +0 ก็เดี๋ยวเกมในนัดต่อไปต้องไปเจอกับทางอินโดนีเซียนะครับ +1 ก็ไม่ได้เน้นเรื่องของผลการแข่งขันอยู่แล้วครับเหมือนที่คาร์ลอสเซซาร์นั้นได้บอกไว้นะครับ +2 เกมในเกมที่แล้วเนี่ยตอนพักครึ่งหลังเนี่ยเหมือนคาร์ลอสจะบอกว่าจริงจริงจะไม่ส่งมูฮัมหมัดลงด้วยซ้ําแล้วนะครับแต่ว่าเหมือนกับท้ายเกมเนี่ยส่งไปด้วยความมั่นใจแล้วโอ้โหประตูที่สาม \ No newline at end of file diff --git a/test_wavs/tibetan/a_0_cacm-A70_31116.wav b/test_wavs/tibetan/a_0_cacm-A70_31116.wav new file mode 100644 index 0000000000000000000000000000000000000000..49fc3f50fe282ed973ddd5f2da0a9f6a16340025 --- /dev/null +++ b/test_wavs/tibetan/a_0_cacm-A70_31116.wav @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:4887c92e193a422cdb44306d807b7cfb5077bb4474e1449342974a9510f9549e +size 97358 diff --git a/test_wavs/tibetan/a_0_cacm-A70_31117.wav b/test_wavs/tibetan/a_0_cacm-A70_31117.wav new file mode 100644 index 0000000000000000000000000000000000000000..a982875c3adc8fc76fabd7a07f2d12c2ed5c5285 --- /dev/null +++ b/test_wavs/tibetan/a_0_cacm-A70_31117.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d335c49df8536f106c062e3b9a44da9085f02da76fdebee784e261eb0097d94c +size 128078 diff --git a/test_wavs/tibetan/a_0_cacm-A70_31118.wav b/test_wavs/tibetan/a_0_cacm-A70_31118.wav new file mode 100644 index 0000000000000000000000000000000000000000..9a9175dcff98f19a572e842b9a3479c4840cdf6d --- /dev/null +++ b/test_wavs/tibetan/a_0_cacm-A70_31118.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a15845fba46c12d8afca1203ad0a66f29e2a614bfe555ec8d1807b0eaddbfd30 +size 87118 diff --git a/test_wavs/tibetan/trans.txt b/test_wavs/tibetan/trans.txt new file mode 100644 index 0000000000000000000000000000000000000000..11d5bb9766f6e0d8b2a5035cba3fcb4dc8268a79 --- /dev/null +++ b/test_wavs/tibetan/trans.txt @@ -0,0 +1,3 @@ +a_0_cacm-A70_31116.wav ལོ བཅུ ཙམ མ འདང བའི དུས སྐབས ནང +a_0_cacm-A70_31117.wav དྲག པོའི ངོ ལོག ཟིང འཁྲུག སྒྲིག འཛུགས དང ངན བཀོད བྱས ཡོད +a_0_cacm-A70_31118.wav གནས བབ འདིའི རིགས གང མགྱོགས འགྱུར བ གཏོང དགོས diff --git a/test_wavs/wenetspeech/DEV_T0000000000.opus b/test_wavs/wenetspeech/DEV_T0000000000.opus new file mode 100644 index 0000000000000000000000000000000000000000..b94876449fd72d4ccb74f60217d8357fe2d636cf --- /dev/null +++ b/test_wavs/wenetspeech/DEV_T0000000000.opus @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f6829a6caf4b8a932aaaeac81a407157af4fd7e64ab1b713dca902b84261f61 +size 23061 diff --git a/test_wavs/wenetspeech/DEV_T0000000001.opus b/test_wavs/wenetspeech/DEV_T0000000001.opus new file mode 100644 index 0000000000000000000000000000000000000000..83f550d7dc632890e82cd2ed80d4ff02a98ccacd --- /dev/null +++ b/test_wavs/wenetspeech/DEV_T0000000001.opus @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c82c520e331610288cee9431992aa47ae763a64c9bfc77efdce8d9f823a66b36 +size 21487 diff --git a/test_wavs/wenetspeech/DEV_T0000000002.opus b/test_wavs/wenetspeech/DEV_T0000000002.opus new file mode 100644 index 0000000000000000000000000000000000000000..fbeb246a370d774984743c16e46236230c2bd4b1 --- /dev/null +++ b/test_wavs/wenetspeech/DEV_T0000000002.opus @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0973cf6116863dcc97ce1071cff751107df5cef68269ed2ac537a56259f8d9a +size 18837 diff --git a/test_wavs/wenetspeech/README.md b/test_wavs/wenetspeech/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6a2aac877892426c5fa3c90a1dfc4cac93fa2ed8 --- /dev/null +++ b/test_wavs/wenetspeech/README.md @@ -0,0 +1,2 @@ +Files are downloaded from +https://huggingface.co/luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless2/tree/main/test_wavs
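
A note on the entries above: each audio file in this patch is stored as a Git LFS pointer, a three-line stub giving the spec version, a sha256 oid, and the payload size in bytes, while the actual audio lives in LFS storage. The sketch below is not part of this repo ("pointer.txt" and "blob.wav" are hypothetical paths); it shows one way to parse such a pointer and check a separately downloaded blob against its oid and size.

#!/usr/bin/env python3
# Sketch only: parse a Git LFS v1 pointer file and verify a downloaded blob.
# "pointer.txt" and "blob.wav" are hypothetical paths, not files in this repo.

import hashlib
from pathlib import Path


def parse_lfs_pointer(path: str) -> dict:
    """Read the 'key value' lines of a Git LFS pointer file into a dict."""
    fields = {}
    for line in Path(path).read_text().splitlines():
        if not line:
            continue
        key, _, value = line.partition(" ")
        fields[key] = value
    # All pointers in this patch declare the v1 spec.
    assert fields["version"] == "https://git-lfs.github.com/spec/v1"
    return fields


def verify_blob(pointer: dict, blob_path: str) -> bool:
    """True iff the blob's byte size and sha256 digest match the pointer."""
    data = Path(blob_path).read_bytes()
    algo, _, expected = pointer["oid"].partition(":")
    assert algo == "sha256"
    return len(data) == int(pointer["size"]) and (
        hashlib.sha256(data).hexdigest() == expected
    )


if __name__ == "__main__":
    ptr = parse_lfs_pointer("pointer.txt")
    print(verify_blob(ptr, "blob.wav"))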
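
The tedlium3, thai, and tibetan directories also add a trans.txt in which each line pairs an utterance id with its reference transcript, separated by the first space. A minimal reader, assuming only that layout, could look like this (load_transcripts is an illustrative helper, not an API of this repo):

# Sketch only, assuming the '<utt-id> <transcript>' layout seen above.

from pathlib import Path


def load_transcripts(path: str) -> dict:
    """Map utterance ids to reference texts, one 'id text' pair per line."""
    refs = {}
    for line in Path(path).read_text(encoding="utf-8").splitlines():
        utt_id, _, text = line.strip().partition(" ")
        if utt_id:
            refs[utt_id] = text
    return refs


if __name__ == "__main__":
    # trans.txt path taken from this patch; ids in other directories may
    # include a .wav suffix, which the parser passes through unchanged.
    for utt_id, text in load_transcripts("test_wavs/tedlium3/trans.txt").items():
        print(utt_id, "->", text)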