csukuangfj committed
Commit 9eb140c · 1 Parent(s): bb0448c

first commit

This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full change set.

Files changed (50)
  1. .gitattributes +3 -9
  2. README.md +6 -5
  3. app.py +436 -0
  4. decode.py +121 -0
  5. examples.py +544 -0
  6. giga-tokens.txt +500 -0
  7. model.py +1940 -0
  8. requirements.txt +15 -0
  9. test_wavs/aidatatang_200zh/README.md +2 -0
  10. test_wavs/aidatatang_200zh/T0055G0036S0002.wav +3 -0
  11. test_wavs/aidatatang_200zh/T0055G0036S0003.wav +3 -0
  12. test_wavs/aidatatang_200zh/T0055G0036S0004.wav +3 -0
  13. test_wavs/aishell2/ID0012W0030.wav +3 -0
  14. test_wavs/aishell2/ID0012W0162.wav +3 -0
  15. test_wavs/aishell2/ID0012W0215.wav +3 -0
  16. test_wavs/aishell2/README.md +2 -0
  17. test_wavs/aishell2/trans.txt +3 -0
  18. test_wavs/alimeeting/165.wav +3 -0
  19. test_wavs/alimeeting/209.wav +3 -0
  20. test_wavs/alimeeting/74.wav +3 -0
  21. test_wavs/alimeeting/R8003_M8001-8004-165.wav +3 -0
  22. test_wavs/alimeeting/R8008_M8013-8049-74.wav +3 -0
  23. test_wavs/alimeeting/R8009_M8020_N_SPK8026-8026-209.wav +3 -0
  24. test_wavs/alimeeting/trans.txt +3 -0
  25. test_wavs/arabic/a.wav +3 -0
  26. test_wavs/arabic/b.wav +3 -0
  27. test_wavs/arabic/c.wav +3 -0
  28. test_wavs/arabic/trans.txt +3 -0
  29. test_wavs/cantonese/1.wav +3 -0
  30. test_wavs/cantonese/2.wav +3 -0
  31. test_wavs/french/common_voice_fr_19364697.wav +3 -0
  32. test_wavs/french/common_voice_fr_19738183.wav +3 -0
  33. test_wavs/french/common_voice_fr_27024649.wav +3 -0
  34. test_wavs/french/trans.txt +3 -0
  35. test_wavs/german/20120315-0900-PLENARY-14-de_20120315.wav +3 -0
  36. test_wavs/german/20170517-0900-PLENARY-16-de_20170517.wav +3 -0
  37. test_wavs/gigaspeech/1-minute-audiobook.opus +3 -0
  38. test_wavs/gigaspeech/100-seconds-podcast.opus +3 -0
  39. test_wavs/gigaspeech/100-seconds-youtube.opus +3 -0
  40. test_wavs/japanese/1.wav +3 -0
  41. test_wavs/japanese/2.wav +3 -0
  42. test_wavs/japanese/3.wav +3 -0
  43. test_wavs/japanese/4.wav +3 -0
  44. test_wavs/japanese/5.wav +3 -0
  45. test_wavs/japanese/transcript.txt +5 -0
  46. test_wavs/korean/0.wav +3 -0
  47. test_wavs/korean/1.wav +3 -0
  48. test_wavs/korean/2.wav +3 -0
  49. test_wavs/korean/3.wav +3 -0
  50. test_wavs/korean/trans.txt +4 -0
.gitattributes CHANGED
@@ -2,34 +2,28 @@
 *.arrow filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
 *.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
 *.ftz filter=lfs diff=lfs merge=lfs -text
 *.gz filter=lfs diff=lfs merge=lfs -text
 *.h5 filter=lfs diff=lfs merge=lfs -text
 *.joblib filter=lfs diff=lfs merge=lfs -text
 *.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
 *.model filter=lfs diff=lfs merge=lfs -text
 *.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
 *.onnx filter=lfs diff=lfs merge=lfs -text
 *.ot filter=lfs diff=lfs merge=lfs -text
 *.parquet filter=lfs diff=lfs merge=lfs -text
 *.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
 *.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
 saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text
 *.tgz filter=lfs diff=lfs merge=lfs -text
 *.wasm filter=lfs diff=lfs merge=lfs -text
 *.xz filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
+*.zstandard filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.wav filter=lfs diff=lfs merge=lfs -text
+*.opus filter=lfs diff=lfs merge=lfs -text
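The commit's main change here is adding `*.wav` and `*.opus` so the bundled test audio goes through Git LFS. As a minimal illustrative sketch (not part of the commit; `fnmatch` approximates but does not exactly match gitattributes glob semantics, e.g. `saved_model/**/*`), this is how such patterns select files:

```python
# Sketch: check which LFS patterns from .gitattributes would match a path.
# The helper and pattern subset are illustrative only.
from fnmatch import fnmatch

LFS_PATTERNS = ["*.wav", "*.opus", "*.onnx", "*.pt", "*.zstandard"]

def is_lfs_tracked(path: str) -> bool:
    name = path.rsplit("/", 1)[-1]  # gitattributes matches basenames for bare globs
    return any(fnmatch(name, pat) for pat in LFS_PATTERNS)

assert is_lfs_tracked("test_wavs/korean/0.wav")
assert not is_lfs_tracked("app.py")
```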
README.md CHANGED
@@ -1,10 +1,11 @@
 ---
-title: Automatic Speech Recognition 2
-emoji: 🐢
-colorFrom: blue
-colorTo: red
+title: Automatic Speech Recognition
+emoji: 🌍
+colorFrom: yellow
+colorTo: pink
 sdk: gradio
-sdk_version: 5.25.2
+sdk_version: 4.44.1
+python_version: 3.10.0
 app_file: app.py
 pinned: false
 license: apache-2.0
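This YAML front matter is what Hugging Face Spaces reads to configure the runtime (Gradio 4.44.1 on Python 3.10.0, entry point `app.py`). A minimal sketch of extracting it yourself (assumes the `pyyaml` package; not part of the commit):

```python
import yaml  # pip install pyyaml

def read_front_matter(readme_path: str = "README.md") -> dict:
    """Return the YAML block between the first two '---' markers."""
    with open(readme_path, encoding="utf-8") as f:
        text = f.read()
    _, block, _ = text.split("---", 2)
    return yaml.safe_load(block)

cfg = read_front_matter()
print(cfg["sdk"], cfg["sdk_version"])  # e.g. "gradio 4.44.1"
```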
app.py ADDED
@@ -0,0 +1,436 @@
+#!/usr/bin/env python3
+#
+# Copyright 2022 Xiaomi Corp. (authors: Fangjun Kuang)
+#
+# See LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# References:
+# https://gradio.app/docs/#dropdown
+
+import logging
+import os
+import tempfile
+import time
+import urllib.request
+from datetime import datetime
+
+import gradio as gr
+import torch
+import torchaudio
+
+from examples import examples
+from model import (
+    decode,
+    get_pretrained_model,
+    get_punct_model,
+    language_to_models,
+    sample_rate,
+)
+
+languages = list(language_to_models.keys())
+
+
+def MyPrint(s):
+    now = datetime.now()
+    date_time = now.strftime("%Y-%m-%d %H:%M:%S.%f")
+    print(f"{date_time}: {s}")
+
+
+def convert_to_wav(in_filename: str) -> str:
+    """Convert the input audio file to a 16 kHz mono wave file via ffmpeg."""
+    out_filename = f"{in_filename}.wav"
+
+    MyPrint(f"Converting '{in_filename}' to '{out_filename}'")
+    _ = os.system(
+        f"ffmpeg -hide_banner -loglevel error -i '{in_filename}' -ar 16000 -ac 1 '{out_filename}' -y"
+    )
+
+    return out_filename
+
+
+def build_html_output(s: str, style: str = "result_item_success"):
+    return f"""
+    <div class='result'>
+        <div class='result_item {style}'>
+            {s}
+        </div>
+    </div>
+    """
+
+
+def process_url(
+    language: str,
+    repo_id: str,
+    decoding_method: str,
+    num_active_paths: int,
+    add_punct: str,
+    url: str,
+):
+    MyPrint(f"Processing URL: {url}")
+    with tempfile.NamedTemporaryFile() as f:
+        try:
+            urllib.request.urlretrieve(url, f.name)
+
+            return process(
+                in_filename=f.name,
+                language=language,
+                repo_id=repo_id,
+                decoding_method=decoding_method,
+                num_active_paths=num_active_paths,
+                add_punct=add_punct,
+            )
+        except Exception as e:
+            MyPrint(str(e))
+            return "", build_html_output(str(e), "result_item_error")
+
+
+def process_uploaded_file(
+    language: str,
+    repo_id: str,
+    decoding_method: str,
+    num_active_paths: int,
+    add_punct: str,
+    in_filename: str,
+):
+    if in_filename is None or in_filename == "":
+        return "", build_html_output(
+            "Please first upload a file and then click "
+            'the button "submit for recognition"',
+            "result_item_error",
+        )
+
+    MyPrint(f"Processing uploaded file: {in_filename}")
+    try:
+        return process(
+            in_filename=in_filename,
+            language=language,
+            repo_id=repo_id,
+            decoding_method=decoding_method,
+            num_active_paths=num_active_paths,
+            add_punct=add_punct,
+        )
+    except Exception as e:
+        MyPrint(str(e))
+        return "", build_html_output(str(e), "result_item_error")
+
+
+def process_microphone(
+    language: str,
+    repo_id: str,
+    decoding_method: str,
+    num_active_paths: int,
+    add_punct: str,
+    in_filename: str,
+):
+    if in_filename is None or in_filename == "":
+        return "", build_html_output(
+            "Please first click 'Record from microphone', speak, "
+            "click 'Stop recording', and then "
+            "click the button 'submit for recognition'",
+            "result_item_error",
+        )
+
+    MyPrint(f"Processing microphone: {in_filename}")
+    try:
+        return process(
+            in_filename=in_filename,
+            language=language,
+            repo_id=repo_id,
+            decoding_method=decoding_method,
+            num_active_paths=num_active_paths,
+            add_punct=add_punct,
+        )
+    except Exception as e:
+        MyPrint(str(e))
+        return "", build_html_output(str(e), "result_item_error")
+
+
+@torch.no_grad()
+def process(
+    language: str,
+    repo_id: str,
+    decoding_method: str,
+    num_active_paths: int,
+    add_punct: str,
+    in_filename: str,
+):
+    MyPrint(f"language: {language}")
+    MyPrint(f"repo_id: {repo_id}")
+    MyPrint(f"decoding_method: {decoding_method}")
+    MyPrint(f"num_active_paths: {num_active_paths}")
+    MyPrint(f"in_filename: {in_filename}")
+
+    filename = convert_to_wav(in_filename)
+
+    now = datetime.now()
+    date_time = now.strftime("%Y-%m-%d %H:%M:%S.%f")
+    MyPrint(f"Started at {date_time}")
+
+    start = time.time()
+
+    recognizer = get_pretrained_model(
+        repo_id,
+        decoding_method=decoding_method,
+        num_active_paths=num_active_paths,
+    )
+
+    text = decode(recognizer, filename)
+    if add_punct == "Yes" and language == "Chinese":
+        punct = get_punct_model()
+        text = punct.add_punctuation(text)
+
+    date_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")
+    end = time.time()
+
+    metadata = torchaudio.info(filename)
+    duration = metadata.num_frames / sample_rate
+    rtf = (end - start) / duration
+
+    MyPrint(f"Finished at {date_time}. Elapsed: {end - start:.3f} s")
+
+    info = f"""
+    Wave duration  : {duration:.3f} s <br/>
+    Processing time: {end - start:.3f} s <br/>
+    RTF: {end - start:.3f}/{duration:.3f} = {rtf:.3f} <br/>
+    """
+    if (
+        rtf > 1
+        and repo_id != "csukuangfj/sherpa-onnx-fire-red-asr-large-zh_en-2025-02-16"
+    ):
+        info += (
+            "<br/>We are loading the model for the first run. "
+            "Please run again to measure the real RTF.<br/>"
+        )
+
+    MyPrint(info)
+    MyPrint(f"\nrepo_id: {repo_id}\nhyp: {text}")
+
+    return text, build_html_output(info)
+
+
+title = "# Automatic Speech Recognition with Next-gen Kaldi"
+description = """
+This space shows how to do automatic speech recognition with Next-gen Kaldi.
+
+Please visit
+<https://k2-fsa.github.io/sherpa/ncnn/wasm/hf-spaces.html>
+for streaming speech recognition with **Next-gen Kaldi** using WebAssembly.
+
+It runs on CPU within a Docker container provided by Hugging Face.
+
+Please upload audio files shorter than 30 seconds in this space.
+
+Please see <https://huggingface.co/spaces/k2-fsa/generate-subtitles-for-videos>
+if you want to try files longer than 30 seconds.
+
+For text-to-speech, please see
+<https://huggingface.co/spaces/k2-fsa/text-to-speech>
+
+You can find more information by visiting the following links:
+
+- <https://github.com/k2-fsa/icefall>
+- <https://github.com/k2-fsa/sherpa>
+- <https://github.com/k2-fsa/sherpa-onnx>
+- <https://github.com/k2-fsa/sherpa-ncnn>
+- <https://github.com/k2-fsa/k2>
+- <https://github.com/lhotse-speech/lhotse>
+
+If you want to deploy it locally, please see
+<https://k2-fsa.github.io/sherpa/>
+"""
+
+# css style is copied from
+# https://huggingface.co/spaces/alphacep/asr/blob/main/app.py#L113
+css = """
+.result {display:flex;flex-direction:column}
+.result_item {padding:15px;margin-bottom:8px;border-radius:15px;width:100%}
+.result_item_success {background-color:mediumaquamarine;color:white;align-self:start}
+.result_item_error {background-color:#ff7070;color:white;align-self:start}
+"""
+
+
+def update_model_dropdown(language: str):
+    if language in language_to_models:
+        choices = language_to_models[language]
+        return gr.Dropdown(
+            choices=choices,
+            value=choices[0],
+            interactive=True,
+        )
+
+    raise ValueError(f"Unsupported language: {language}")
+
+
+demo = gr.Blocks(css=css)
+
+
+with demo:
+    gr.Markdown(title)
+    language_choices = list(language_to_models.keys())
+
+    language_radio = gr.Radio(
+        label="Language",
+        choices=language_choices,
+        value=language_choices[0],
+    )
+    model_dropdown = gr.Dropdown(
+        choices=language_to_models[language_choices[0]],
+        label="Select a model",
+        value=language_to_models[language_choices[0]][0],
+    )
+
+    language_radio.change(
+        update_model_dropdown,
+        inputs=language_radio,
+        outputs=model_dropdown,
+    )
+
+    decoding_method_radio = gr.Radio(
+        label="Decoding method",
+        choices=["greedy_search", "modified_beam_search"],
+        value="greedy_search",
+    )
+
+    num_active_paths_slider = gr.Slider(
+        minimum=1,
+        value=4,
+        step=1,
+        label="Number of active paths for modified_beam_search",
+    )
+
+    punct_radio = gr.Radio(
+        label="Whether to add punctuation (Only for Chinese)",
+        choices=["Yes", "No"],
+        value="Yes",
+    )
+
+    with gr.Tabs():
+        with gr.TabItem("Upload from disk"):
+            uploaded_file = gr.Audio(
+                sources=["upload"],  # Choose between "microphone", "upload"
+                type="filepath",
+                label="Upload from disk",
+            )
+            upload_button = gr.Button("Submit for recognition")
+            uploaded_output = gr.Textbox(label="Recognized speech from uploaded file")
+            uploaded_html_info = gr.HTML(label="Info")
+
+            # gr.Examples(
+            #     examples=examples,
+            #     inputs=[
+            #         language_radio,
+            #         model_dropdown,
+            #         decoding_method_radio,
+            #         num_active_paths_slider,
+            #         punct_radio,
+            #         uploaded_file,
+            #     ],
+            #     outputs=[uploaded_output, uploaded_html_info],
+            #     fn=process_uploaded_file,
+            # )
+
+        with gr.TabItem("Record from microphone"):
+            microphone = gr.Audio(
+                sources=["microphone"],  # Choose between "microphone", "upload"
+                type="filepath",
+                label="Record from microphone",
+            )
+
+            record_button = gr.Button("Submit for recognition")
+            recorded_output = gr.Textbox(label="Recognized speech from recordings")
+            recorded_html_info = gr.HTML(label="Info")
+
+            # gr.Examples(
+            #     examples=examples,
+            #     inputs=[
+            #         language_radio,
+            #         model_dropdown,
+            #         decoding_method_radio,
+            #         num_active_paths_slider,
+            #         punct_radio,
+            #         microphone,
+            #     ],
+            #     outputs=[recorded_output, recorded_html_info],
+            #     fn=process_microphone,
+            # )
+
+        with gr.TabItem("From URL"):
+            url_textbox = gr.Textbox(
+                max_lines=1,
+                placeholder="URL to an audio file",
+                label="URL",
+                interactive=True,
+            )
+
+            url_button = gr.Button("Submit for recognition")
+            url_output = gr.Textbox(label="Recognized speech from URL")
+            url_html_info = gr.HTML(label="Info")
+
+    upload_button.click(
+        process_uploaded_file,
+        inputs=[
+            language_radio,
+            model_dropdown,
+            decoding_method_radio,
+            num_active_paths_slider,
+            punct_radio,
+            uploaded_file,
+        ],
+        outputs=[uploaded_output, uploaded_html_info],
+    )
+
+    record_button.click(
+        process_microphone,
+        inputs=[
+            language_radio,
+            model_dropdown,
+            decoding_method_radio,
+            num_active_paths_slider,
+            punct_radio,
+            microphone,
+        ],
+        outputs=[recorded_output, recorded_html_info],
+    )
+
+    url_button.click(
+        process_url,
+        inputs=[
+            language_radio,
+            model_dropdown,
+            decoding_method_radio,
+            num_active_paths_slider,
+            punct_radio,
+            url_textbox,
+        ],
+        outputs=[url_output, url_html_info],
+    )
+
+    gr.Markdown(description)
+
+torch.set_num_threads(1)
+torch.set_num_interop_threads(1)
+
+torch._C._jit_set_profiling_executor(False)
+torch._C._jit_set_profiling_mode(False)
+torch._C._set_graph_executor_optimize(False)
+
+if __name__ == "__main__":
+    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+
+    logging.basicConfig(format=formatter, level=logging.INFO)
+
+    demo.launch()
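`process` reports a real-time factor (RTF): processing time divided by audio duration, with values above 1 on a first run attributed to model loading. A minimal standalone sketch of the same computation (the `recognize` callable and file name are hypothetical):

```python
import time

import torchaudio

def measure_rtf(filename: str, recognize) -> float:
    """Return processing_time / audio_duration for one decode call."""
    info = torchaudio.info(filename)
    duration = info.num_frames / info.sample_rate
    start = time.time()
    recognize(filename)  # any callable that decodes the file
    return (time.time() - start) / duration

# rtf < 1 means faster than real time; the app warns when rtf > 1.
```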
decode.py ADDED
@@ -0,0 +1,121 @@
+# Copyright 2022 Xiaomi Corp. (authors: Fangjun Kuang)
+#
+# Copied from https://github.com/k2-fsa/sherpa/blob/master/sherpa/bin/conformer_rnnt/decode.py
+#
+# See LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from typing import List
+
+import torch
+from sherpa import RnntConformerModel, greedy_search, modified_beam_search
+from torch.nn.utils.rnn import pad_sequence
+
+LOG_EPS = math.log(1e-10)
+
+
+@torch.no_grad()
+def run_model_and_do_greedy_search(
+    model: RnntConformerModel,
+    features: List[torch.Tensor],
+) -> List[List[int]]:
+    """Run the RNN-T model on the given features and use greedy search
+    to decode the output of the model.
+
+    Args:
+      model:
+        The RNN-T model.
+      features:
+        A list of 2-D tensors. Each entry is of shape
+        (num_frames, feature_dim).
+    Returns:
+      Return a list-of-lists containing the decoded token IDs.
+    """
+    features_length = torch.tensor(
+        [f.size(0) for f in features],
+        dtype=torch.int64,
+    )
+    features = pad_sequence(
+        features,
+        batch_first=True,
+        padding_value=LOG_EPS,
+    )
+
+    device = model.device
+    features = features.to(device)
+    features_length = features_length.to(device)
+
+    encoder_out, encoder_out_length = model.encoder(
+        features=features,
+        features_length=features_length,
+    )
+
+    hyp_tokens = greedy_search(
+        model=model,
+        encoder_out=encoder_out,
+        encoder_out_length=encoder_out_length.cpu(),
+    )
+    return hyp_tokens
+
+
+@torch.no_grad()
+def run_model_and_do_modified_beam_search(
+    model: RnntConformerModel,
+    features: List[torch.Tensor],
+    num_active_paths: int,
+) -> List[List[int]]:
+    """Run the RNN-T model on the given features and use modified beam search
+    to decode the output of the model.
+
+    Args:
+      model:
+        The RNN-T model.
+      features:
+        A list of 2-D tensors. Each entry is of shape
+        (num_frames, feature_dim).
+      num_active_paths:
+        Used only when decoding_method is modified_beam_search.
+        It specifies the number of active paths for each utterance. Due to
+        merging paths with identical token sequences, the actual number
+        may be less than "num_active_paths".
+    Returns:
+      Return a list-of-lists containing the decoded token IDs.
+    """
+    features_length = torch.tensor(
+        [f.size(0) for f in features],
+        dtype=torch.int64,
+    )
+    features = pad_sequence(
+        features,
+        batch_first=True,
+        padding_value=LOG_EPS,
+    )
+
+    device = model.device
+    features = features.to(device)
+    features_length = features_length.to(device)
+
+    encoder_out, encoder_out_length = model.encoder(
+        features=features,
+        features_length=features_length,
+    )
+
+    hyp_tokens = modified_beam_search(
+        model=model,
+        encoder_out=encoder_out,
+        encoder_out_length=encoder_out_length.cpu(),
+        num_active_paths=num_active_paths,
+    )
+    return hyp_tokens
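Both helpers batch variable-length utterances by padding the per-utterance feature matrices with `LOG_EPS` before running the encoder. A minimal sketch of that step in isolation (random features, 80-dim, purely illustrative):

```python
import math

import torch
from torch.nn.utils.rnn import pad_sequence

LOG_EPS = math.log(1e-10)  # same padding value as decode.py

# Two "utterances" with different numbers of frames.
features = [torch.randn(95, 80), torch.randn(63, 80)]

features_length = torch.tensor([f.size(0) for f in features], dtype=torch.int64)
padded = pad_sequence(features, batch_first=True, padding_value=LOG_EPS)

print(padded.shape)     # torch.Size([2, 95, 80]) -- padded to the longest utterance
print(features_length)  # tensor([95, 63]) -- lengths let the encoder ignore padding
```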
examples.py ADDED
@@ -0,0 +1,544 @@
+#!/usr/bin/env python3
+#
+# Copyright 2022 Xiaomi Corp. (authors: Fangjun Kuang)
+#
+# See LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+examples = [
+    [
+        "Chinese+English",
+        "csukuangfj/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20",
+        "greedy_search",
+        4,
+        "Yes",
+        "./test_wavs/tal_csasr/0.wav",
+    ],
+    [
+        "Chinese+English+Cantonese",
+        "csukuangfj/sherpa-onnx-paraformer-trilingual-zh-cantonese-en",
+        "greedy_search",
+        4,
+        "Yes",
+        "./test_wavs/cantonese/2.wav",
+    ],
+    [
+        "Chinese+English+Cantonese+Japanese+Korean",
+        "csukuangfj/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17",
+        "greedy_search",
+        4,
+        "Yes",
+        "./test_wavs/sense_voice/yue.wav",
+    ],
+    [
+        "Cantonese",
+        "zrjin/icefall-asr-mdcc-zipformer-2024-03-11",
+        "greedy_search",
+        4,
+        "Yes",
+        "./test_wavs/cantonese/1.wav",
+    ],
+    [
+        "English",
+        "whisper-base.en",
+        "greedy_search",
+        4,
+        "Yes",
+        "./test_wavs/librispeech/1089-134686-0001.wav",
+    ],
+    [
+        "Chinese",
+        "csukuangfj/sherpa-onnx-paraformer-zh-2024-03-09",
+        "greedy_search",
+        4,
+        "Yes",
+        "./test_wavs/paraformer-zh/四川话.wav",
+    ],
+    [
+        "Japanese",
+        "reazon-research/reazonspeech-k2-v2",
+        "greedy_search",
+        4,
+        "No",
+        "./test_wavs/japanese/1.wav",
+    ],
+    [
+        "Korean",
+        "k2-fsa/sherpa-onnx-zipformer-korean-2024-06-24",
+        "greedy_search",
+        4,
+        "No",
+        "./test_wavs/korean/0.wav",
+    ],
+    [
+        "Russian",
+        "csukuangfj/sherpa-onnx-nemo-transducer-giga-am-russian-2024-10-24",
+        "greedy_search",
+        4,
+        "No",
+        "./test_wavs/russian/russian-i-love-you.wav",
+    ],
+    [
+        "Thai",
+        "yfyeung/icefall-asr-gigaspeech2-th-zipformer-2024-06-20",
+        "greedy_search",
+        4,
+        "No",
+        "./test_wavs/thai/0.wav",
+    ],
+    # [
+    #     "Russian",
+    #     "alphacep/vosk-model-ru",
+    #     "greedy_search",
+    #     4,
+    #     "No",
+    #     "./test_wavs/russian/test.wav",
+    # ],
+    # [
+    #     "German",
+    #     "csukuangfj/wav2vec2.0-torchaudio",
+    #     "greedy_search",
+    #     4,
+    #     "No",
+    #     "./test_wavs/german/20170517-0900-PLENARY-16-de_20170517.wav",
+    # ],
+    # [
+    #     "Arabic",
+    #     "AmirHussein/icefall-asr-mgb2-conformer_ctc-2022-27-06",
+    #     "greedy_search",
+    #     4,
+    #     "No",
+    #     "./test_wavs/arabic/a.wav",
+    # ],
+    # [
+    #     "Tibetan",
+    #     "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless7-2022-12-02",
+    #     "greedy_search",
+    #     4,
+    #     "No",
+    #     "./test_wavs/tibetan/a_0_cacm-A70_31117.wav",
+    # ],
+    # [
+    #     "French",
+    #     "shaojieli/sherpa-onnx-streaming-zipformer-fr-2023-04-14",
+    #     "greedy_search",
+    #     4,
+    #     "No",
+    #     "./test_wavs/french/common_voice_fr_19364697.wav",
+    # ],
+    # [
+    #     "Chinese",
+    #     "desh2608/icefall-asr-alimeeting-pruned-transducer-stateless7",
+    #     "greedy_search",
+    #     4,
+    #     "Yes",
+    #     "./test_wavs/alimeeting/R8003_M8001-8004-165.wav",
+    # ],
+    # [
+    #     "Chinese",
+    #     "csukuangfj/sherpa-onnx-paraformer-zh-2024-03-09",
+    #     "greedy_search",
+    #     4,
+    #     "Yes",
+    #     "./test_wavs/paraformer-zh/天津话.wav",
+    # ],
+    # [
+    #     "Chinese",
+    #     "csukuangfj/sherpa-onnx-paraformer-zh-2024-03-09",
+    #     "greedy_search",
+    #     4,
+    #     "Yes",
+    #     "./test_wavs/paraformer-zh/郑州话.wav",
+    # ],
+    # [
+    #     "Chinese",
+    #     "desh2608/icefall-asr-alimeeting-pruned-transducer-stateless7",
+    #     "greedy_search",
+    #     4,
+    #     "Yes",
+    #     "./test_wavs/alimeeting/R8008_M8013-8049-74.wav",
+    # ],
+    # [
+    #     "Chinese",
+    #     "desh2608/icefall-asr-alimeeting-pruned-transducer-stateless7",
+    #     "greedy_search",
+    #     4,
+    #     "Yes",
+    #     "./test_wavs/alimeeting/R8009_M8020_N_SPK8026-8026-209.wav",
+    # ],
+    # [
+    #     "English",
+    #     "videodanchik/icefall-asr-tedlium3-conformer-ctc2",
+    #     "greedy_search",
+    #     4,
+    #     "Yes",
+    #     "./test_wavs/tedlium3/DanBarber_2010-219.wav",
+    # ],
+    # [
+    #     "English",
+    #     "whisper-base.en",
+    #     "greedy_search",
+    #     4,
+    #     "Yes",
+    #     "./test_wavs/tedlium3/DanielKahneman_2010-157.wav",
+    # ],
+    # [
+    #     "English",
+    #     "videodanchik/icefall-asr-tedlium3-conformer-ctc2",
+    #     "greedy_search",
+    #     4,
+    #     "Yes",
+    #     "./test_wavs/tedlium3/RobertGupta_2010U-15.wav",
+    # ],
+    # # librispeech
+    # # https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless5-2022-05-13/tree/main/test_wavs
+    # [
+    #     "English",
+    #     "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13",
+    #     "greedy_search",
+    #     4,
+    #     "Yes",
+    #     "./test_wavs/librispeech/1089-134686-0001.wav",
+    # ],
+    # [
+    #     "English",
+    #     "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13",
+    #     "greedy_search",
+    #     4,
+    #     "Yes",
+    #     "./test_wavs/librispeech/1221-135766-0001.wav",
+    # ],
+    # [
+    #     "English",
+    #     "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13",
+    #     "greedy_search",
+    #     4,
+    #     "Yes",
+    #     "./test_wavs/librispeech/1221-135766-0002.wav",
+    # ],
+    # # gigaspeech
+    # [
+    #     "English",
+    #     "wgb14/icefall-asr-gigaspeech-pruned-transducer-stateless2",
+    #     "greedy_search",
+    #     4,
+    #     "Yes",
+    #     "./test_wavs/gigaspeech/1-minute-audiobook.opus",
+    # ],
+    # [
+    #     "English",
+    #     "wgb14/icefall-asr-gigaspeech-pruned-transducer-stateless2",
+    #     "greedy_search",
+    #     4,
+    #     "Yes",
+    #     "./test_wavs/gigaspeech/100-seconds-podcast.opus",
+    # ],
+    # [
+    #     "English",
+    #     "wgb14/icefall-asr-gigaspeech-pruned-transducer-stateless2",
+    #     "greedy_search",
+    #     4,
+    #     "Yes",
+    #     "./test_wavs/gigaspeech/100-seconds-youtube.opus",
+    # ],
+    # # wenetspeech
+    # # https://huggingface.co/luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless2/tree/main/test_wavs
+    # [
+    #     "Chinese",
+    #     "luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless2",
+    #     "greedy_search",
+    #     4,
+    #     "Yes",
+    #     "./test_wavs/wenetspeech/DEV_T0000000000.opus",
+    # ],
+    # [
+    #     "Chinese",
+    #     "luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless2",
+    #     "greedy_search",
+    #     4,
+    #     "Yes",
+    #     "./test_wavs/wenetspeech/DEV_T0000000001.opus",
+    # ],
+    # [
+    #     "Chinese",
+    #     "luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless2",
+    #     "greedy_search",
+    #     4,
+    #     "Yes",
+    #     "./test_wavs/wenetspeech/DEV_T0000000002.opus",
+    # ],
+    # # aishell2-A
+    # # https://huggingface.co/yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-A-2022-07-12/tree/main/test_wavs
+    # [
+    #     "Chinese",
+    #     "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-A-2022-07-12",
+    #     "greedy_search",
+    #     4,
+    #     "Yes",
+    #     "./test_wavs/aishell2/ID0012W0030.wav",
+    # ],
+    # [
+    #     "Chinese",
+    #     "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-A-2022-07-12",
+    #     "greedy_search",
+    #     4,
+    #     "Yes",
+    #     "./test_wavs/aishell2/ID0012W0162.wav",
+    # ],
+    # [
+    #     "Chinese",
+    #     "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-A-2022-07-12",
+    #     "greedy_search",
+    #     4,
+    #     "Yes",
+    #     "./test_wavs/aishell2/ID0012W0215.wav",
+    # ],
+    # # aishell2-B
+    # # https://huggingface.co/yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-A-2022-07-12/tree/main/test_wavs
+    # [
+    #     "Chinese",
+    #     "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-B-2022-07-12",
+    #     "greedy_search",
+    #     4,
+    #     "Yes",
+    #     "./test_wavs/aishell2/ID0012W0030.wav",
+    # ],
+    # [
+    #     "Chinese",
+    #     "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-B-2022-07-12",
+    #     "greedy_search",
+    #     4,
+    #     "Yes",
+    #     "./test_wavs/aishell2/ID0012W0162.wav",
+    # ],
+    # [
+    #     "Chinese",
+    #     "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-B-2022-07-12",
+    #     "greedy_search",
+    #     4,
+    #     "Yes",
+    #     "./test_wavs/aishell2/ID0012W0215.wav",
+    # ],
+    # # aidatatang-200zh
+    # # https://huggingface.co/luomingshuang/icefall_asr_aidatatang-200zh_pruned_transducer_stateless2/tree/main/test_wavs
+    # [
+    #     "Chinese",
+    #     "luomingshuang/icefall_asr_aidatatang-200zh_pruned_transducer_stateless2",
+    #     "greedy_search",
+    #     4,
+    #     "Yes",
+    #     "./test_wavs/aidatatang_200zh/T0055G0036S0002.wav",
+    # ],
+    # [
+    #     "Chinese",
+    #     "luomingshuang/icefall_asr_aidatatang-200zh_pruned_transducer_stateless2",
+    #     "greedy_search",
+    #     4,
+    #     "Yes",
+    #     "./test_wavs/aidatatang_200zh/T0055G0036S0003.wav",
+    # ],
+    # [
+    #     "Chinese",
+    #     "luomingshuang/icefall_asr_aidatatang-200zh_pruned_transducer_stateless2",
+    #     "greedy_search",
+    #     4,
+    #     "Yes",
+    #     "./test_wavs/aidatatang_200zh/T0055G0036S0004.wav",
+    # ],
+    # # tal_csasr
+    # [
+    #     "Chinese+English",
+    #     "ptrnull/icefall-asr-conv-emformer-transducer-stateless2-zh",
+    #     "greedy_search",
+    #     4,
+    #     "Yes",
+    #     "./test_wavs/tal_csasr/210_36476_210_8341_1_1533271973_7057520_132.wav",
+    # ],
+    # [
+    #     "Chinese+English",
+    #     "ptrnull/icefall-asr-conv-emformer-transducer-stateless2-zh",
+    #     "greedy_search",
+    #     4,
+    #     "Yes",
+    #     "./test_wavs/tal_csasr/210_36476_210_8341_1_1533271973_7057520_138.wav",
+    # ],
+    # [
+    #     "Chinese+English",
+    #     "ptrnull/icefall-asr-conv-emformer-transducer-stateless2-zh",
+    #     "greedy_search",
+    #     4,
+    #     "Yes",
+    #     "./test_wavs/tal_csasr/210_36476_210_8341_1_1533271973_7057520_145.wav",
+    # ],
+    # [
+    #     "Tibetan",
+    #     "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless7-2022-12-02",
+    #     "greedy_search",
+    #     4,
+    #     "No",
+    #     "./test_wavs/tibetan/a_0_cacm-A70_31116.wav",
+    # ],
+    # [
+    #     "Tibetan",
+    #     "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless7-2022-12-02",
+    #     "greedy_search",
+    #     4,
+    #     "No",
+    #     "./test_wavs/tibetan/a_0_cacm-A70_31118.wav",
+    # ],
+    # # arabic
+    # [
+    #     "Arabic",
+    #     "AmirHussein/icefall-asr-mgb2-conformer_ctc-2022-27-06",
+    #     "greedy_search",
+    #     4,
+    #     "No",
+    #     "./test_wavs/arabic/b.wav",
+    # ],
+    # [
+    #     "Arabic",
+    #     "AmirHussein/icefall-asr-mgb2-conformer_ctc-2022-27-06",
+    #     "greedy_search",
+    #     4,
+    #     "No",
+    #     "./test_wavs/arabic/c.wav",
+    # ],
+    # [
+    #     "German",
+    #     "csukuangfj/wav2vec2.0-torchaudio",
+    #     "greedy_search",
+    #     4,
+    #     "No",
+    #     "./test_wavs/german/20120315-0900-PLENARY-14-de_20120315.wav",
+    # ],
+    # [
+    #     "French",
+    #     "shaojieli/sherpa-onnx-streaming-zipformer-fr-2023-04-14",
+    #     "greedy_search",
+    #     4,
+    #     "No",
+    #     "./test_wavs/french/common_voice_fr_19738183.wav",
+    # ],
+    # [
+    #     "French",
+    #     "shaojieli/sherpa-onnx-streaming-zipformer-fr-2023-04-14",
+    #     "greedy_search",
+    #     4,
+    #     "No",
+    #     "./test_wavs/french/common_voice_fr_27024649.wav",
+    # ],
+    # [
+    #     "Korean",
+    #     "k2-fsa/sherpa-onnx-zipformer-korean-2024-06-24",
+    #     "greedy_search",
+    #     4,
+    #     "No",
+    #     "./test_wavs/korean/1.wav",
+    # ],
+    # [
+    #     "Korean",
+    #     "k2-fsa/sherpa-onnx-zipformer-korean-2024-06-24",
+    #     "greedy_search",
+    #     4,
+    #     "No",
+    #     "./test_wavs/korean/2.wav",
+    # ],
+    # [
+    #     "Korean",
+    #     "k2-fsa/sherpa-onnx-zipformer-korean-2024-06-24",
+    #     "greedy_search",
+    #     4,
+    #     "No",
+    #     "./test_wavs/korean/3.wav",
+    # ],
+    # [
+    #     "Thai",
+    #     "yfyeung/icefall-asr-gigaspeech2-th-zipformer-2024-06-20",
+    #     "greedy_search",
+    #     4,
+    #     "No",
+    #     "./test_wavs/thai/1.wav",
+    # ],
+    # [
+    #     "Thai",
+    #     "yfyeung/icefall-asr-gigaspeech2-th-zipformer-2024-06-20",
+    #     "greedy_search",
+    #     4,
+    #     "No",
+    #     "./test_wavs/thai/2.wav",
+    # ],
+    # [
+    #     "Chinese+English+Cantonese+Japanese+Korean",
+    #     "csukuangfj/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17",
+    #     "greedy_search",
+    #     4,
+    #     "Yes",
+    #     "./test_wavs/sense_voice/zh.wav",
+    # ],
+    # [
+    #     "Chinese+English+Cantonese+Japanese+Korean",
+    #     "csukuangfj/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17",
+    #     "greedy_search",
+    #     4,
+    #     "Yes",
+    #     "./test_wavs/sense_voice/en.wav",
+    # ],
+    # [
+    #     "Chinese+English+Cantonese+Japanese+Korean",
+    #     "csukuangfj/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17",
+    #     "greedy_search",
+    #     4,
+    #     "Yes",
+    #     "./test_wavs/sense_voice/ja.wav",
+    # ],
+    # [
+    #     "Chinese+English+Cantonese+Japanese+Korean",
+    #     "csukuangfj/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17",
+    #     "greedy_search",
+    #     4,
+    #     "Yes",
+    #     "./test_wavs/sense_voice/ko.wav",
+    # ],
+    # [
+    #     "Japanese",
+    #     "reazon-research/reazonspeech-k2-v2",
+    #     "greedy_search",
+    #     4,
+    #     "No",
+    #     "./test_wavs/japanese/2.wav",
+    # ],
+    # [
+    #     "Japanese",
+    #     "reazon-research/reazonspeech-k2-v2",
+    #     "greedy_search",
+    #     4,
+    #     "No",
+    #     "./test_wavs/japanese/3.wav",
+    # ],
+    # [
+    #     "Japanese",
+    #     "reazon-research/reazonspeech-k2-v2",
+    #     "greedy_search",
+    #     4,
+    #     "No",
+    #     "./test_wavs/japanese/4.wav",
+    # ],
+    # [
+    #     "Japanese",
+    #     "reazon-research/reazonspeech-k2-v2",
+    #     "greedy_search",
+    #     4,
+    #     "No",
+    #     "./test_wavs/japanese/5.wav",
+    # ],
+]
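Each row of `examples` lines up positionally with the Gradio inputs that app.py's (currently commented-out) `gr.Examples` blocks wire up: language, model repo, decoding method, number of active paths, punctuation flag, and a wave file. A minimal sketch of consuming one row:

```python
from examples import examples

# Unpack one row in the same order the gr.Examples inputs expect.
language, repo_id, decoding_method, num_active_paths, add_punct, wav = examples[0]
print(language, repo_id, wav)
# -> Chinese+English csukuangfj/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20 ./test_wavs/tal_csasr/0.wav
```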
giga-tokens.txt ADDED
@@ -0,0 +1,500 @@
+<blk> 0
+<sos/eos> 1
+<unk> 2
+S 3
+T 4
+▁THE 5
+▁A 6
+E 7
+▁AND 8
+▁TO 9
+N 10
+D 11
+▁OF 12
+' 13
+ING 14
+▁I 15
+Y 16
+▁IN 17
+ED 18
+▁THAT 19
+▁ 20
+P 21
+R 22
+▁YOU 23
+M 24
+RE 25
+ER 26
+C 27
+O 28
+▁IT 29
+L 30
+A 31
+U 32
+G 33
+▁WE 34
+▁IS 35
+▁SO 36
+AL 37
+I 38
+▁S 39
+▁RE 40
+AR 41
+B 42
+▁FOR 43
+▁C 44
+▁BE 45
+LE 46
+F 47
+W 48
+▁E 49
+▁HE 50
+LL 51
+▁WAS 52
+LY 53
+OR 54
+IN 55
+▁F 56
+VE 57
+▁THIS 58
+TH 59
+K 60
+▁ON 61
+IT 62
+▁B 63
+▁WITH 64
+▁BUT 65
+EN 66
+CE 67
+RI 68
+▁DO 69
+UR 70
+▁HAVE 71
+▁DE 72
+▁ME 73
+▁T 74
+ENT 75
+CH 76
+▁THEY 77
+▁NOT 78
+ES 79
+V 80
+▁AS 81
+RA 82
+▁P 83
+ON 84
+TER 85
+▁ARE 86
+▁WHAT 87
+IC 88
+▁ST 89
+▁LIKE 90
+ATION 91
+▁OR 92
+▁CA 93
+▁AT 94
+H 95
+▁KNOW 96
+▁G 97
+AN 98
+▁CON 99
+IL 100
+ND 101
+RO 102
+▁HIS 103
+▁CAN 104
+▁ALL 105
+TE 106
+▁THERE 107
+▁SU 108
+▁MO 109
+▁MA 110
+LI 111
+▁ONE 112
+▁ABOUT 113
+LA 114
+▁CO 115
+- 116
+▁MY 117
+▁HAD 118
+CK 119
+NG 120
+▁NO 121
+MENT 122
+AD 123
+LO 124
+ME 125
+▁AN 126
+▁FROM 127
+NE 128
+▁IF 129
+VER 130
+▁JUST 131
+▁PRO 132
+ION 133
+▁PA 134
+▁WHO 135
+▁SE 136
+EL 137
+IR 138
+▁US 139
+▁UP 140
+▁YOUR 141
+CI 142
+RY 143
+▁GO 144
+▁SHE 145
+▁LE 146
+▁OUT 147
+▁PO 148
+▁HO 149
+ATE 150
+▁BO 151
+▁BY 152
+▁FA 153
+▁MI 154
+AS 155
+MP 156
+▁HER 157
+VI 158
+▁THINK 159
+▁SOME 160
+▁WHEN 161
+▁AH 162
+▁PEOPLE 163
+IG 164
+▁WA 165
+▁TE 166
+▁LA 167
+▁WERE 168
+▁LI 169
+▁WOULD 170
+▁SEE 171
+▁WHICH 172
+DE 173
+GE 174
+▁K 175
+IGHT 176
+▁HA 177
+▁OUR 178
+UN 179
+▁HOW 180
+▁GET 181
+IS 182
+UT 183
+Z 184
+CO 185
+ET 186
+UL 187
+IES 188
+IVE 189
+AT 190
+▁O 191
+▁DON 192
+LU 193
+▁TIME 194
+▁WILL 195
+▁MORE 196
+▁SP 197
+▁NOW 198
+RU 199
+▁THEIR 200
+▁UN 201
+ITY 202
+OL 203
+X 204
+TI 205
+US 206
+▁VERY 207
+TION 208
+▁FI 209
+▁SAY 210
+▁BECAUSE 211
+▁EX 212
+▁RO 213
+ERS 214
+IST 215
+▁DA 216
+TING 217
+▁EN 218
+OM 219
+▁BA 220
+▁BEEN 221
+▁LO 222
+▁UM 223
+AGE 224
+ABLE 225
+▁WO 226
+▁RA 227
+▁OTHER 228
+▁REALLY 229
+ENCE 230
+▁GOING 231
+▁HIM 232
+▁HAS 233
+▁THEM 234
+▁DIS 235
+▁WANT 236
+ID 237
+TA 238
+▁LOOK 239
+KE 240
+▁DID 241
+▁SA 242
+▁VI 243
+▁SAID 244
+▁RIGHT 245
+▁THESE 246
+▁WORK 247
+▁COM 248
+ALLY 249
+FF 250
+QU 251
+AC 252
+▁DR 253
+▁WAY 254
+▁INTO 255
+MO 256
+TED 257
+EST 258
+▁HERE 259
+OK 260
+▁COULD 261
+▁WELL 262
+MA 263
+▁PRE 264
+▁DI 265
+MAN 266
+▁COMP 267
+▁THEN 268
+IM 269
+▁PER 270
+▁NA 271
+▁WHERE 272
+▁TWO 273
+▁WI 274
+▁FE 275
+INE 276
+▁ANY 277
+TURE 278
+▁OVER 279
+BO 280
+ACH 281
+OW 282
+▁MAKE 283
+▁TRA 284
+HE 285
+UND 286
+▁EVEN 287
+ANCE 288
+▁YEAR 289
+HO 290
+AM 291
+▁CHA 292
+▁BACK 293
+VO 294
+ANT 295
+DI 296
+▁ALSO 297
+▁THOSE 298
+▁MAN 299
+CTION 300
+ICAL 301
+▁JO 302
+▁OP 303
+▁NEW 304
+▁MU 305
+▁HU 306
+▁KIND 307
+▁NE 308
+CA 309
+END 310
+TIC 311
+FUL 312
+▁YEAH 313
+SH 314
+▁APP 315
+▁THINGS 316
+SIDE 317
+▁GOOD 318
+ONE 319
+▁TAKE 320
+CU 321
+▁EVERY 322
+▁MEAN 323
+▁FIRST 324
+OP 325
+▁TH 326
+▁MUCH 327
+▁PART 328
+UGH 329
+▁COME 330
+J 331
+▁THAN 332
+▁EXP 333
+▁AGAIN 334
+▁LITTLE 335
+MB 336
+▁NEED 337
+▁TALK 338
+IF 339
+FOR 340
+▁SH 341
+ISH 342
+▁STA 343
+ATED 344
+▁GU 345
+▁LET 346
+IA 347
+▁MAR 348
+▁DOWN 349
+▁DAY 350
+▁GA 351
+▁SOMETHING 352
+▁BU 353
+DUC 354
+HA 355
+▁LOT 356
+▁RU 357
+▁THOUGH 358
+▁GREAT 359
+AIN 360
+▁THROUGH 361
+▁THING 362
+OUS 363
+▁PRI 364
+▁GOT 365
+▁SHOULD 366
+▁AFTER 367
+▁HEAR 368
+▁TA 369
+▁ONLY 370
+▁CHI 371
+IOUS 372
+▁SHA 373
+▁MOST 374
+▁ACTUALLY 375
+▁START 376
+LIC 377
+▁VA 378
+▁RI 379
+DAY 380
+IAN 381
+▁DOES 382
+ROW 383
+▁GRA 384
+ITION 385
+▁MANY 386
+▁BEFORE 387
+▁GIVE 388
+PORT 389
+QUI 390
+▁LIFE 391
+▁WORLD 392
+▁PI 393
+▁LONG 394
+▁THREE 395
+IZE 396
+NESS 397
+▁SHOW 398
+PH 399
+▁WHY 400
+▁QUESTION 401
+WARD 402
+▁THANK 403
+▁PH 404
+▁DIFFERENT 405
+▁OWN 406
+▁FEEL 407
+▁MIGHT 408
+▁HAPPEN 409
+▁MADE 410
+▁BRO 411
+IBLE 412
+▁HI 413
+▁STATE 414
+▁HAND 415
+▁NEVER 416
+▁PLACE 417
+▁LOVE 418
+▁DU 419
+▁POINT 420
+▁HELP 421
+▁COUNT 422
+▁STILL 423
+▁MR 424
+▁FIND 425
+▁PERSON 426
+▁CAME 427
+▁SAME 428
+▁LAST 429
+▁HIGH 430
+▁OLD 431
+▁UNDER 432
+▁FOUR 433
+▁AROUND 434
+▁SORT 435
+▁CHANGE 436
+▁YES 437
+SHIP 438
+▁ANOTHER 439
+ATIVE 440
+▁FOUND 441
+▁JA 442
+▁ALWAYS 443
+▁NEXT 444
+▁TURN 445
+▁JU 446
+▁SIX 447
+▁FACT 448
+▁INTEREST 449
+▁WORD 450
+▁THOUSAND 451
+▁HUNDRED 452
+▁NUMBER 453
+▁IDEA 454
+▁PLAN 455
+▁COURSE 456
+▁SCHOOL 457
+▁HOUSE 458
+▁TWENTY 459
+▁JE 460
+▁PLAY 461
+▁AWAY 462
+▁LEARN 463
+▁HARD 464
+▁WEEK 465
+▁BETTER 466
+▁WHILE 467
+▁FRIEND 468
+▁OKAY 469
+▁NINE 470
+▁UNDERSTAND 471
+▁KEEP 472
+▁GONNA 473
+▁SYSTEM 474
+▁AMERICA 475
+▁POWER 476
+▁IMPORTANT 477
+▁WITHOUT 478
+▁MAYBE 479
+▁SEVEN 480
+▁BETWEEN 481
+▁BUILD 482
+▁CERTAIN 483
+▁PROBLEM 484
+▁MONEY 485
+▁BELIEVE 486
+▁SECOND 487
+▁REASON 488
+▁TOGETHER 489
+▁PUBLIC 490
+▁ANYTHING 491
+▁SPEAK 492
+▁BUSINESS 493
+▁EVERYTHING 494
+▁CLOSE 495
+▁QUITE 496
+▁ANSWER 497
+▁ENOUGH 498
+Q 499
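giga-tokens.txt is a BPE token table: one token per line followed by its integer ID, with `▁` marking the start of a word. A minimal sketch of loading it and turning a sequence of token IDs back into text:

```python
def load_tokens(path: str = "giga-tokens.txt") -> dict:
    """Map integer token IDs to token strings."""
    id2token = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            token, idx = line.rsplit(maxsplit=1)
            id2token[int(idx)] = token
    return id2token

def ids_to_text(ids, id2token) -> str:
    # BPE pieces concatenate directly; "▁" marks a word boundary.
    return "".join(id2token[i] for i in ids).replace("▁", " ").strip()

# ids_to_text([5, 403], load_tokens()) -> "THE THANK"  (IDs taken from the table above)
```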
model.py ADDED
@@ -0,0 +1,1940 @@
1
+ # Copyright 2022 Xiaomi Corp. (authors: Fangjun Kuang)
2
+ #
3
+ # See LICENSE for clarification regarding multiple authors
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ import os
18
+ from functools import lru_cache
19
+ from typing import Union
20
+
21
+ import torch
22
+ import torchaudio
23
+ from huggingface_hub import hf_hub_download
24
+
25
+ os.system(
26
+ "cp -v /usr/local/lib/python3.8/site-packages/k2/lib/*.so //usr/local/lib/python3.8/site-packages/sherpa/lib/"
27
+ )
28
+
29
+ os.system(
30
+ "cp -v /home/user/.local/lib/python3.8/site-packages/k2/lib/*.so /home/user/.local/lib/python3.8/site-packages/sherpa/lib/"
31
+ )
32
+
33
+ import k2 # noqa
34
+ import sherpa
35
+ import sherpa_onnx
36
+ import numpy as np
37
+ from typing import Tuple
38
+ import wave
39
+
40
+ sample_rate = 16000
41
+
42
+
43
+ def read_wave(wave_filename: str) -> Tuple[np.ndarray, int]:
44
+ """
45
+ Args:
46
+ wave_filename:
47
+ Path to a wave file. It should be single channel and each sample should
48
+ be 16-bit. Its sample rate does not need to be 16kHz.
49
+ Returns:
50
+ Return a tuple containing:
51
+ - A 1-D array of dtype np.float32 containing the samples, which are
52
+ normalized to the range [-1, 1].
53
+ - sample rate of the wave file
54
+ """
55
+
56
+ with wave.open(wave_filename) as f:
57
+ assert f.getnchannels() == 1, f.getnchannels()
58
+ assert f.getsampwidth() == 2, f.getsampwidth() # it is in bytes
59
+ num_samples = f.getnframes()
60
+ samples = f.readframes(num_samples)
61
+ samples_int16 = np.frombuffer(samples, dtype=np.int16)
62
+ samples_float32 = samples_int16.astype(np.float32)
63
+
64
+ samples_float32 = samples_float32 / 32768
65
+ return samples_float32, f.getframerate()
66
+
67
+
68
+ def decode_offline_recognizer(
69
+ recognizer: sherpa.OfflineRecognizer,
70
+ filename: str,
71
+ ) -> str:
72
+ s = recognizer.create_stream()
73
+
74
+ s.accept_wave_file(filename)
75
+ recognizer.decode_stream(s)
76
+
77
+ text = s.result.text.strip()
78
+ # return text.lower()
79
+ return text
80
+
81
+
82
+ def decode_online_recognizer(
83
+ recognizer: sherpa.OnlineRecognizer,
84
+ filename: str,
85
+ ) -> str:
86
+ samples, actual_sample_rate = torchaudio.load(filename)
87
+ assert sample_rate == actual_sample_rate, (
88
+ sample_rate,
89
+ actual_sample_rate,
90
+ )
91
+ samples = samples[0].contiguous()
92
+
93
+ s = recognizer.create_stream()
94
+
95
+ tail_padding = torch.zeros(int(sample_rate * 0.3), dtype=torch.float32)
96
+ s.accept_waveform(sample_rate, samples)
97
+ s.accept_waveform(sample_rate, tail_padding)
98
+ s.input_finished()
99
+
100
+ while recognizer.is_ready(s):
101
+ recognizer.decode_stream(s)
102
+
103
+ text = recognizer.get_result(s).text
104
+ # return text.strip().lower()
105
+ return text.strip()
106
+
107
+
108
+ def decode_offline_recognizer_sherpa_onnx(
109
+ recognizer: sherpa_onnx.OfflineRecognizer,
110
+ filename: str,
111
+ ) -> str:
112
+ s = recognizer.create_stream()
113
+ samples, sample_rate = read_wave(filename)
114
+ s.accept_waveform(sample_rate, samples)
115
+ recognizer.decode_stream(s)
116
+
117
+ # return s.result.text.lower()
118
+ return s.result.text
119
+
120
+
121
+ def decode_online_recognizer_sherpa_onnx(
122
+ recognizer: sherpa_onnx.OnlineRecognizer,
123
+ filename: str,
124
+ ) -> str:
125
+ s = recognizer.create_stream()
126
+ samples, sample_rate = read_wave(filename)
127
+ s.accept_waveform(sample_rate, samples)
128
+
129
+ tail_paddings = np.zeros(int(0.3 * sample_rate), dtype=np.float32)
130
+ s.accept_waveform(sample_rate, tail_paddings)
131
+ s.input_finished()
132
+
133
+ while recognizer.is_ready(s):
134
+ recognizer.decode_stream(s)
135
+
136
+ # return recognizer.get_result(s).lower()
137
+ return recognizer.get_result(s)
138
+
139
+
140
+ def decode(
+     recognizer: Union[
+         sherpa.OfflineRecognizer,
+         sherpa.OnlineRecognizer,
+         sherpa_onnx.OfflineRecognizer,
+         sherpa_onnx.OnlineRecognizer,
+     ],
+     filename: str,
+ ) -> str:
+     if isinstance(recognizer, sherpa.OfflineRecognizer):
+         return decode_offline_recognizer(recognizer, filename)
+     elif isinstance(recognizer, sherpa.OnlineRecognizer):
+         return decode_online_recognizer(recognizer, filename)
+     elif isinstance(recognizer, sherpa_onnx.OfflineRecognizer):
+         return decode_offline_recognizer_sherpa_onnx(recognizer, filename)
+     elif isinstance(recognizer, sherpa_onnx.OnlineRecognizer):
+         return decode_online_recognizer_sherpa_onnx(recognizer, filename)
+     else:
+         raise ValueError(f"Unknown recognizer type {type(recognizer)}")
+
+
+ @lru_cache(maxsize=30)
+ def get_pretrained_model(
+     repo_id: str,
+     decoding_method: str,
+     num_active_paths: int,
+ ) -> Union[
+     sherpa.OfflineRecognizer,
+     sherpa.OnlineRecognizer,
+     sherpa_onnx.OfflineRecognizer,
+     sherpa_onnx.OnlineRecognizer,
+ ]:
+     if repo_id in multi_lingual_models:
+         return multi_lingual_models[repo_id](
+             repo_id, decoding_method=decoding_method, num_active_paths=num_active_paths
+         )
+     elif repo_id in chinese_models:
+         return chinese_models[repo_id](
+             repo_id, decoding_method=decoding_method, num_active_paths=num_active_paths
+         )
+     elif repo_id in chinese_dialect_models:
+         return chinese_dialect_models[repo_id](
+             repo_id, decoding_method=decoding_method, num_active_paths=num_active_paths
+         )
+     elif repo_id in english_models:
+         return english_models[repo_id](
+             repo_id, decoding_method=decoding_method, num_active_paths=num_active_paths
+         )
+     elif repo_id in chinese_english_mixed_models:
+         return chinese_english_mixed_models[repo_id](
+             repo_id, decoding_method=decoding_method, num_active_paths=num_active_paths
+         )
+     elif repo_id in chinese_cantonese_english_models:
+         return chinese_cantonese_english_models[repo_id](
+             repo_id, decoding_method=decoding_method, num_active_paths=num_active_paths
+         )
+     elif repo_id in chinese_cantonese_english_japanese_korean_models:
+         return chinese_cantonese_english_japanese_korean_models[repo_id](
+             repo_id, decoding_method=decoding_method, num_active_paths=num_active_paths
+         )
+     elif repo_id in cantonese_models:
+         return cantonese_models[repo_id](
+             repo_id, decoding_method=decoding_method, num_active_paths=num_active_paths
+         )
+     elif repo_id in tibetan_models:
+         return tibetan_models[repo_id](
+             repo_id, decoding_method=decoding_method, num_active_paths=num_active_paths
+         )
+     elif repo_id in arabic_models:
+         return arabic_models[repo_id](
+             repo_id, decoding_method=decoding_method, num_active_paths=num_active_paths
+         )
+     elif repo_id in german_models:
+         return german_models[repo_id](
+             repo_id, decoding_method=decoding_method, num_active_paths=num_active_paths
+         )
+     elif repo_id in french_models:
+         return french_models[repo_id](
+             repo_id, decoding_method=decoding_method, num_active_paths=num_active_paths
+         )
+     elif repo_id in japanese_models:
+         return japanese_models[repo_id](
+             repo_id, decoding_method=decoding_method, num_active_paths=num_active_paths
+         )
+     elif repo_id in russian_models:
+         return russian_models[repo_id](
+             repo_id, decoding_method=decoding_method, num_active_paths=num_active_paths
+         )
+     elif repo_id in korean_models:
+         return korean_models[repo_id](
+             repo_id, decoding_method=decoding_method, num_active_paths=num_active_paths
+         )
+     elif repo_id in thai_models:
+         return thai_models[repo_id](
+             repo_id, decoding_method=decoding_method, num_active_paths=num_active_paths
+         )
+     else:
+         raise ValueError(f"Unsupported repo_id: {repo_id}")
+
+
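
get_pretrained_model plus decode is the whole inference path used by the app. A minimal sketch (the repo_id and wav file both come from this commit; "greedy_search" with num_active_paths=4 is a typical choice):

    recognizer = get_pretrained_model(
        "csukuangfj/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17",
        decoding_method="greedy_search",
        num_active_paths=4,
    )
    print(decode(recognizer, "test_wavs/cantonese/1.wav"))

Because get_pretrained_model is wrapped in lru_cache(maxsize=30), repeated requests with the same (repo_id, decoding_method, num_active_paths) triple reuse the already-constructed recognizer instead of reloading the model.
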
+ def _get_nn_model_filename(
+     repo_id: str,
+     filename: str,
+     subfolder: str = "exp",
+ ) -> str:
+     nn_model_filename = hf_hub_download(
+         repo_id=repo_id,
+         filename=filename,
+         subfolder=subfolder,
+     )
+     return nn_model_filename
+
+
+ def _get_bpe_model_filename(
+     repo_id: str,
+     filename: str = "bpe.model",
+     subfolder: str = "data/lang_bpe_500",
+ ) -> str:
+     bpe_model_filename = hf_hub_download(
+         repo_id=repo_id,
+         filename=filename,
+         subfolder=subfolder,
+     )
+     return bpe_model_filename
+
+
+ def _get_token_filename(
+     repo_id: str,
+     filename: str = "tokens.txt",
+     subfolder: str = "data/lang_char",
+ ) -> str:
+     token_filename = hf_hub_download(
+         repo_id=repo_id,
+         filename=filename,
+         subfolder=subfolder,
+     )
+     return token_filename
+
+
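
All three helpers delegate to huggingface_hub.hf_hub_download, which downloads the file once, stores it in the local Hugging Face cache (~/.cache/huggingface/hub by default), and returns the cached local path on every later call. For example:

    tokens = _get_token_filename(
        repo_id="csukuangfj/sherpa-onnx-paraformer-zh-2024-03-09",
        subfolder=".",
    )
    # -> local path to tokens.txt inside the cached snapshot of that repo
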
+ @lru_cache(maxsize=10)
+ def _get_aishell2_pretrained_model(
+     repo_id: str,
+     decoding_method: str,
+     num_active_paths: int,
+ ) -> sherpa.OfflineRecognizer:
+     assert repo_id in [
+         # context-size 1
+         "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-A-2022-07-12",  # noqa
+         # context-size 2
+         "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-B-2022-07-12",  # noqa
+     ], repo_id
+
+     nn_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="cpu_jit.pt",
+     )
+     tokens = _get_token_filename(repo_id=repo_id)
+
+     feat_config = sherpa.FeatureConfig()
+     feat_config.fbank_opts.frame_opts.samp_freq = sample_rate
+     feat_config.fbank_opts.mel_opts.num_bins = 80
+     feat_config.fbank_opts.frame_opts.dither = 0
+
+     config = sherpa.OfflineRecognizerConfig(
+         nn_model=nn_model,
+         tokens=tokens,
+         use_gpu=False,
+         feat_config=feat_config,
+         decoding_method=decoding_method,
+         num_active_paths=num_active_paths,
+     )
+
+     recognizer = sherpa.OfflineRecognizer(config)
+
+     return recognizer
+
+
+ @lru_cache(maxsize=10)
+ def _get_offline_pre_trained_model(
+     repo_id: str, decoding_method: str, num_active_paths: int
+ ) -> sherpa_onnx.OfflineRecognizer:
+     assert repo_id in (
+         "k2-fsa/sherpa-onnx-zipformer-korean-2024-06-24",
+         "reazon-research/reazonspeech-k2-v2",
+     ), repo_id
+
+     encoder_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="encoder-epoch-99-avg-1.int8.onnx",
+         subfolder=".",
+     )
+
+     decoder_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="decoder-epoch-99-avg-1.onnx",
+         subfolder=".",
+     )
+
+     joiner_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="joiner-epoch-99-avg-1.onnx",
+         subfolder=".",
+     )
+
+     tokens = _get_token_filename(repo_id=repo_id, subfolder=".")
+
+     recognizer = sherpa_onnx.OfflineRecognizer.from_transducer(
+         tokens=tokens,
+         encoder=encoder_model,
+         decoder=decoder_model,
+         joiner=joiner_model,
+         num_threads=2,
+         sample_rate=16000,
+         feature_dim=80,
+         decoding_method=decoding_method,
+     )
+
+     return recognizer
+
+
+ @lru_cache(maxsize=10)
+ def _get_yifan_thai_pretrained_model(
+     repo_id: str, decoding_method: str, num_active_paths: int
+ ) -> sherpa_onnx.OfflineRecognizer:
+     assert repo_id in (
+         "yfyeung/icefall-asr-gigaspeech2-th-zipformer-2024-06-20",
+     ), repo_id
+
+     encoder_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="encoder-epoch-12-avg-5.int8.onnx",
+         subfolder="exp",
+     )
+
+     decoder_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="decoder-epoch-12-avg-5.onnx",
+         subfolder="exp",
+     )
+
+     joiner_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="joiner-epoch-12-avg-5.int8.onnx",
+         subfolder="exp",
+     )
+
+     tokens = _get_token_filename(repo_id=repo_id, subfolder="data/lang_bpe_2000")
+
+     recognizer = sherpa_onnx.OfflineRecognizer.from_transducer(
+         tokens=tokens,
+         encoder=encoder_model,
+         decoder=decoder_model,
+         joiner=joiner_model,
+         num_threads=2,
+         sample_rate=16000,
+         feature_dim=80,
+         decoding_method=decoding_method,
+     )
+
+     return recognizer
+
+
+ @lru_cache(maxsize=10)
+ def _get_zrjin_cantonese_pre_trained_model(
+     repo_id: str, decoding_method: str, num_active_paths: int
+ ) -> sherpa_onnx.OfflineRecognizer:
+     assert repo_id in ("zrjin/icefall-asr-mdcc-zipformer-2024-03-11",), repo_id
+
+     encoder_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="encoder-epoch-45-avg-35.int8.onnx",
+         subfolder="exp",
+     )
+
+     decoder_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="decoder-epoch-45-avg-35.onnx",
+         subfolder="exp",
+     )
+
+     joiner_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="joiner-epoch-45-avg-35.int8.onnx",
+         subfolder="exp",
+     )
+
+     tokens = _get_token_filename(repo_id=repo_id, subfolder="data/lang_char")
+
+     recognizer = sherpa_onnx.OfflineRecognizer.from_transducer(
+         tokens=tokens,
+         encoder=encoder_model,
+         decoder=decoder_model,
+         joiner=joiner_model,
+         num_threads=2,
+         sample_rate=16000,
+         feature_dim=80,
+         decoding_method=decoding_method,
+     )
+
+     return recognizer
+
+
+ @lru_cache(maxsize=10)
+ def _get_russian_pre_trained_model_ctc(
+     repo_id: str, decoding_method: str, num_active_paths: int
+ ) -> sherpa_onnx.OfflineRecognizer:
+     assert repo_id in (
+         "csukuangfj/sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24",
+         "csukuangfj/sherpa-onnx-nemo-ctc-giga-am-v2-russian-2025-04-19",
+     ), repo_id
+
+     model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="model.int8.onnx",
+         subfolder=".",
+     )
+
+     tokens = _get_token_filename(repo_id=repo_id, subfolder=".")
+
+     recognizer = sherpa_onnx.OfflineRecognizer.from_nemo_ctc(
+         model=model,
+         tokens=tokens,
+         num_threads=2,
+     )
+
+     return recognizer
+
+
+ @lru_cache(maxsize=10)
+ def _get_russian_pre_trained_model(
+     repo_id: str, decoding_method: str, num_active_paths: int
+ ) -> sherpa_onnx.OfflineRecognizer:
+     assert repo_id in (
+         "alphacep/vosk-model-ru",
+         "alphacep/vosk-model-small-ru",
+         "csukuangfj/sherpa-onnx-nemo-transducer-giga-am-russian-2024-10-24",
+         "csukuangfj/sherpa-onnx-nemo-transducer-giga-am-v2-russian-2025-04-19",
+     ), repo_id
+
+     if repo_id == "alphacep/vosk-model-ru":
+         model_dir = "am-onnx"
+         encoder = "encoder.onnx"
+         model_type = "transducer"
+     elif repo_id == "alphacep/vosk-model-small-ru":
+         model_dir = "am"
+         encoder = "encoder.onnx"
+         model_type = "transducer"
+     elif repo_id in (
+         "csukuangfj/sherpa-onnx-nemo-transducer-giga-am-russian-2024-10-24",
+         "csukuangfj/sherpa-onnx-nemo-transducer-giga-am-v2-russian-2025-04-19",
+     ):
+         model_dir = "."
+         encoder = "encoder.int8.onnx"
+         model_type = "nemo_transducer"
+
+     encoder_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename=encoder,
+         subfolder=model_dir,
+     )
+
+     decoder_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="decoder.onnx",
+         subfolder=model_dir,
+     )
+
+     joiner_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="joiner.onnx",
+         subfolder=model_dir,
+     )
+
+     if repo_id in (
+         "csukuangfj/sherpa-onnx-nemo-transducer-giga-am-russian-2024-10-24",
+         "csukuangfj/sherpa-onnx-nemo-transducer-giga-am-v2-russian-2025-04-19",
+     ):
+         tokens = _get_token_filename(repo_id=repo_id, subfolder=".")
+     else:
+         tokens = _get_token_filename(repo_id=repo_id, subfolder="lang")
+
+     recognizer = sherpa_onnx.OfflineRecognizer.from_transducer(
+         tokens=tokens,
+         encoder=encoder_model,
+         decoder=decoder_model,
+         joiner=joiner_model,
+         num_threads=2,
+         sample_rate=16000,
+         feature_dim=80,
+         decoding_method=decoding_method,
+         model_type=model_type,
+     )
+
+     return recognizer
+
+
+ @lru_cache(maxsize=10)
+ def _get_moonshine_model(
+     repo_id: str, decoding_method: str, num_active_paths: int
+ ) -> sherpa_onnx.OfflineRecognizer:
+     assert repo_id in ("moonshine-tiny", "moonshine-base"), repo_id
+
+     if repo_id == "moonshine-tiny":
+         full_repo_id = "csukuangfj/sherpa-onnx-moonshine-tiny-en-int8"
+     elif repo_id == "moonshine-base":
+         full_repo_id = "csukuangfj/sherpa-onnx-moonshine-base-en-int8"
+     else:
+         raise ValueError(f"Unknown repo_id: {repo_id}")
+
+     preprocessor = _get_nn_model_filename(
+         repo_id=full_repo_id,
+         filename="preprocess.onnx",
+         subfolder=".",
+     )
+
+     encoder = _get_nn_model_filename(
+         repo_id=full_repo_id,
+         filename="encode.int8.onnx",
+         subfolder=".",
+     )
+
+     uncached_decoder = _get_nn_model_filename(
+         repo_id=full_repo_id,
+         filename="uncached_decode.int8.onnx",
+         subfolder=".",
+     )
+
+     cached_decoder = _get_nn_model_filename(
+         repo_id=full_repo_id,
+         filename="cached_decode.int8.onnx",
+         subfolder=".",
+     )
+
+     tokens = _get_token_filename(
+         repo_id=full_repo_id,
+         subfolder=".",
+         filename="tokens.txt",
+     )
+
+     recognizer = sherpa_onnx.OfflineRecognizer.from_moonshine(
+         preprocessor=preprocessor,
+         encoder=encoder,
+         uncached_decoder=uncached_decoder,
+         cached_decoder=cached_decoder,
+         tokens=tokens,
+         num_threads=2,
+     )
+
+     return recognizer
+
+
+ @lru_cache(maxsize=10)
+ def _get_whisper_model(
+     repo_id: str, decoding_method: str, num_active_paths: int
+ ) -> sherpa_onnx.OfflineRecognizer:
+     name = repo_id.split("-")[1]
+     assert name in ("tiny.en", "base.en", "small.en", "medium.en"), repo_id
+     full_repo_id = "csukuangfj/sherpa-onnx-whisper-" + name
+     encoder = _get_nn_model_filename(
+         repo_id=full_repo_id,
+         filename=f"{name}-encoder.int8.onnx",
+         subfolder=".",
+     )
+
+     decoder = _get_nn_model_filename(
+         repo_id=full_repo_id,
+         filename=f"{name}-decoder.int8.onnx",
+         subfolder=".",
+     )
+
+     tokens = _get_token_filename(
+         repo_id=full_repo_id, subfolder=".", filename=f"{name}-tokens.txt"
+     )
+
+     recognizer = sherpa_onnx.OfflineRecognizer.from_whisper(
+         encoder=encoder,
+         decoder=decoder,
+         tokens=tokens,
+         num_threads=2,
+     )
+
+     return recognizer
+
+
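
_get_whisper_model relies on a naming convention rather than a full repo id: the selector strings registered in english_models look like "whisper-tiny.en", so repo_id.split("-")[1] recovers the size suffix and the real Hugging Face repo is reassembled from it:

    name = "whisper-tiny.en".split("-")[1]  # -> "tiny.en"
    full_repo_id = "csukuangfj/sherpa-onnx-whisper-" + name
    # -> "csukuangfj/sherpa-onnx-whisper-tiny.en"

The moonshine loader above does the same thing with an explicit if/elif mapping instead of string splitting.
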
+ @lru_cache(maxsize=10)
+ def _get_gigaspeech_pre_trained_model(
+     repo_id: str,
+     decoding_method: str,
+     num_active_paths: int,
+ ) -> sherpa.OfflineRecognizer:
+     assert repo_id in [
+         "wgb14/icefall-asr-gigaspeech-pruned-transducer-stateless2",
+     ], repo_id
+
+     nn_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="cpu_jit-iter-3488000-avg-20.pt",
+     )
+     tokens = "./giga-tokens.txt"
+
+     feat_config = sherpa.FeatureConfig()
+     feat_config.fbank_opts.frame_opts.samp_freq = sample_rate
+     feat_config.fbank_opts.mel_opts.num_bins = 80
+     feat_config.fbank_opts.frame_opts.dither = 0
+
+     config = sherpa.OfflineRecognizerConfig(
+         nn_model=nn_model,
+         tokens=tokens,
+         use_gpu=False,
+         feat_config=feat_config,
+         decoding_method=decoding_method,
+         num_active_paths=num_active_paths,
+     )
+
+     recognizer = sherpa.OfflineRecognizer(config)
+
+     return recognizer
+
+
+ @lru_cache(maxsize=10)
+ def _get_english_model(
+     repo_id: str,
+     decoding_method: str,
+     num_active_paths: int,
+ ) -> sherpa.OfflineRecognizer:
+     assert repo_id in [
+         "WeijiZhuang/icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02",  # noqa
+         "yfyeung/icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04",  # noqa
+         "yfyeung/icefall-asr-finetune-mux-pruned_transducer_stateless7-2023-05-19",  # noqa
+         "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13",  # noqa
+         "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11",  # noqa
+         "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless8-2022-11-14",  # noqa
+         "Zengwei/icefall-asr-librispeech-zipformer-large-2023-05-16",  # noqa
+         "Zengwei/icefall-asr-librispeech-zipformer-2023-05-15",  # noqa
+         "Zengwei/icefall-asr-librispeech-zipformer-small-2023-05-16",  # noqa
+         "videodanchik/icefall-asr-tedlium3-conformer-ctc2",
+         "pkufool/icefall_asr_librispeech_conformer_ctc",
+         "WayneWiser/icefall-asr-librispeech-conformer-ctc2-jit-bpe-500-2022-07-21",
+     ], repo_id
+
+     filename = "cpu_jit.pt"
+     if (
+         repo_id
+         == "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11"
+     ):
+         filename = "cpu_jit-torch-1.10.0.pt"
+
+     if (
+         repo_id
+         == "WeijiZhuang/icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02"
+     ):
+         filename = "cpu_jit-torch-1.10.pt"
+
+     if (
+         repo_id
+         == "yfyeung/icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04"
+     ):
+         filename = "cpu_jit-epoch-30-avg-4.pt"
+
+     if (
+         repo_id
+         == "yfyeung/icefall-asr-finetune-mux-pruned_transducer_stateless7-2023-05-19"
+     ):
+         filename = "cpu_jit-epoch-20-avg-5.pt"
+
+     if repo_id in (
+         "Zengwei/icefall-asr-librispeech-zipformer-large-2023-05-16",
+         "Zengwei/icefall-asr-librispeech-zipformer-2023-05-15",
+         "Zengwei/icefall-asr-librispeech-zipformer-small-2023-05-16",
+     ):
+         filename = "jit_script.pt"
+
+     nn_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename=filename,
+     )
+     subfolder = "data/lang_bpe_500"
+
+     if repo_id in (
+         "videodanchik/icefall-asr-tedlium3-conformer-ctc2",
+         "pkufool/icefall_asr_librispeech_conformer_ctc",
+     ):
+         subfolder = "data/lang_bpe"
+
+     tokens = _get_token_filename(repo_id=repo_id, subfolder=subfolder)
+
+     feat_config = sherpa.FeatureConfig()
+     feat_config.fbank_opts.frame_opts.samp_freq = sample_rate
+     feat_config.fbank_opts.mel_opts.num_bins = 80
+     feat_config.fbank_opts.frame_opts.dither = 0
+
+     config = sherpa.OfflineRecognizerConfig(
+         nn_model=nn_model,
+         tokens=tokens,
+         use_gpu=False,
+         feat_config=feat_config,
+         decoding_method=decoding_method,
+         num_active_paths=num_active_paths,
+     )
+
+     recognizer = sherpa.OfflineRecognizer(config)
+
+     return recognizer
+
+
+ @lru_cache(maxsize=10)
+ def _get_wenetspeech_pre_trained_model(
+     repo_id: str,
+     decoding_method: str,
+     num_active_paths: int,
+ ):
+     assert repo_id in [
+         "luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless2",
+     ], repo_id
+
+     nn_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="cpu_jit_epoch_10_avg_2_torch_1.7.1.pt",
+     )
+     tokens = _get_token_filename(repo_id=repo_id)
+
+     feat_config = sherpa.FeatureConfig()
+     feat_config.fbank_opts.frame_opts.samp_freq = sample_rate
+     feat_config.fbank_opts.mel_opts.num_bins = 80
+     feat_config.fbank_opts.frame_opts.dither = 0
+
+     config = sherpa.OfflineRecognizerConfig(
+         nn_model=nn_model,
+         tokens=tokens,
+         use_gpu=False,
+         feat_config=feat_config,
+         decoding_method=decoding_method,
+         num_active_paths=num_active_paths,
+     )
+
+     recognizer = sherpa.OfflineRecognizer(config)
+
+     return recognizer
+
+
+ @lru_cache(maxsize=1)
+ def _get_fire_red_asr_models(repo_id: str, decoding_method: str, num_active_paths: int):
+     assert repo_id in (
+         "csukuangfj/sherpa-onnx-fire-red-asr-large-zh_en-2025-02-16",
+     ), repo_id
+
+     encoder = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="encoder.int8.onnx",
+         subfolder=".",
+     )
+
+     decoder = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="decoder.int8.onnx",
+         subfolder=".",
+     )
+
+     tokens = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="tokens.txt",
+         subfolder=".",
+     )
+
+     return sherpa_onnx.OfflineRecognizer.from_fire_red_asr(
+         encoder=encoder,
+         decoder=decoder,
+         tokens=tokens,
+         num_threads=2,
+     )
+
+
+ @lru_cache(maxsize=10)
+ def _get_chinese_english_mixed_model_onnx(
+     repo_id: str,
+     decoding_method: str,
+     num_active_paths: int,
+ ) -> sherpa_onnx.OfflineRecognizer:
+     assert repo_id in [
+         "zrjin/icefall-asr-zipformer-multi-zh-en-2023-11-22",
+     ], repo_id
+
+     encoder_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="encoder-epoch-34-avg-19.int8.onnx",
+         subfolder="exp",
+     )
+
+     decoder_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="decoder-epoch-34-avg-19.onnx",
+         subfolder="exp",
+     )
+
+     joiner_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="joiner-epoch-34-avg-19.int8.onnx",
+         subfolder="exp",
+     )
+
+     tokens = _get_token_filename(repo_id=repo_id, subfolder="data/lang_bbpe_2000")
+
+     recognizer = sherpa_onnx.OfflineRecognizer.from_transducer(
+         tokens=tokens,
+         encoder=encoder_model,
+         decoder=decoder_model,
+         joiner=joiner_model,
+         num_threads=2,
+         sample_rate=16000,
+         feature_dim=80,
+         decoding_method=decoding_method,
+         max_active_paths=num_active_paths,
+     )
+
+     return recognizer
+
+
+ @lru_cache(maxsize=10)
+ def _get_chinese_english_mixed_model(
+     repo_id: str,
+     decoding_method: str,
+     num_active_paths: int,
+ ) -> sherpa.OfflineRecognizer:
+     assert repo_id in [
+         "luomingshuang/icefall_asr_tal-csasr_pruned_transducer_stateless5",
+         "ptrnull/icefall-asr-conv-emformer-transducer-stateless2-zh",
+     ], repo_id
+
+     if repo_id == "luomingshuang/icefall_asr_tal-csasr_pruned_transducer_stateless5":
+         filename = "cpu_jit.pt"
+         subfolder = "data/lang_char"
+     elif repo_id == "ptrnull/icefall-asr-conv-emformer-transducer-stateless2-zh":
+         filename = "cpu_jit-epoch-11-avg-1.pt"
+         subfolder = "data/lang_char_bpe"
+
+     nn_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename=filename,
+     )
+     tokens = _get_token_filename(repo_id=repo_id, subfolder=subfolder)
+
+     feat_config = sherpa.FeatureConfig()
+     feat_config.fbank_opts.frame_opts.samp_freq = sample_rate
+     feat_config.fbank_opts.mel_opts.num_bins = 80
+     feat_config.fbank_opts.frame_opts.dither = 0
+
+     config = sherpa.OfflineRecognizerConfig(
+         nn_model=nn_model,
+         tokens=tokens,
+         use_gpu=False,
+         feat_config=feat_config,
+         decoding_method=decoding_method,
+         num_active_paths=num_active_paths,
+     )
+
+     recognizer = sherpa.OfflineRecognizer(config)
+
+     return recognizer
+
+
+ @lru_cache(maxsize=10)
+ def _get_alimeeting_pre_trained_model(
+     repo_id: str,
+     decoding_method: str,
+     num_active_paths: int,
+ ):
+     assert repo_id in [
+         "desh2608/icefall-asr-alimeeting-pruned-transducer-stateless7",
+         "luomingshuang/icefall_asr_alimeeting_pruned_transducer_stateless2",
+     ], repo_id
+
+     if repo_id == "desh2608/icefall-asr-alimeeting-pruned-transducer-stateless7":
+         filename = "cpu_jit.pt"
+     elif repo_id == "luomingshuang/icefall_asr_alimeeting_pruned_transducer_stateless2":
+         filename = "cpu_jit_torch_1.7.1.pt"
+
+     nn_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename=filename,
+     )
+     tokens = _get_token_filename(repo_id=repo_id)
+
+     feat_config = sherpa.FeatureConfig()
+     feat_config.fbank_opts.frame_opts.samp_freq = sample_rate
+     feat_config.fbank_opts.mel_opts.num_bins = 80
+     feat_config.fbank_opts.frame_opts.dither = 0
+
+     config = sherpa.OfflineRecognizerConfig(
+         nn_model=nn_model,
+         tokens=tokens,
+         use_gpu=False,
+         feat_config=feat_config,
+         decoding_method=decoding_method,
+         num_active_paths=num_active_paths,
+     )
+
+     recognizer = sherpa.OfflineRecognizer(config)
+
+     return recognizer
+
+
+ @lru_cache(maxsize=4)
+ def _get_dolphin_ctc_models(repo_id: str, decoding_method: str, num_active_paths: int):
+     assert repo_id in [
+         "csukuangfj/sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02",
+         "csukuangfj/sherpa-onnx-dolphin-small-ctc-multi-lang-int8-2025-04-02",
+         "csukuangfj/sherpa-onnx-dolphin-base-ctc-multi-lang-2025-04-02",
+         "csukuangfj/sherpa-onnx-dolphin-small-ctc-multi-lang-2025-04-02",
+     ], repo_id
+
+     if repo_id in [
+         "csukuangfj/sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02",
+         "csukuangfj/sherpa-onnx-dolphin-small-ctc-multi-lang-int8-2025-04-02",
+     ]:
+         use_int8 = True
+     else:
+         use_int8 = False
+
+     nn_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="model.int8.onnx" if use_int8 else "model.onnx",
+         subfolder=".",
+     )
+     tokens = _get_token_filename(
+         repo_id=repo_id,
+         filename="tokens.txt",
+         subfolder=".",
+     )
+
+     recognizer = sherpa_onnx.OfflineRecognizer.from_dolphin_ctc(
+         tokens=tokens,
+         model=nn_model,
+         num_threads=2,
+     )
+
+     return recognizer
+
+
+ @lru_cache(maxsize=10)
+ def _get_wenet_model(
+     repo_id: str,
+     decoding_method: str,
+     num_active_paths: int,
+ ):
+     assert repo_id in [
+         "csukuangfj/wenet-chinese-model",
+         "csukuangfj/wenet-english-model",
+     ], repo_id
+
+     nn_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="final.zip",
+         subfolder=".",
+     )
+     tokens = _get_token_filename(
+         repo_id=repo_id,
+         filename="units.txt",
+         subfolder=".",
+     )
+
+     feat_config = sherpa.FeatureConfig(normalize_samples=False)
+     feat_config.fbank_opts.frame_opts.samp_freq = sample_rate
+     feat_config.fbank_opts.mel_opts.num_bins = 80
+     feat_config.fbank_opts.frame_opts.dither = 0
+
+     config = sherpa.OfflineRecognizerConfig(
+         nn_model=nn_model,
+         tokens=tokens,
+         use_gpu=False,
+         feat_config=feat_config,
+         decoding_method=decoding_method,
+         num_active_paths=num_active_paths,
+     )
+
+     recognizer = sherpa.OfflineRecognizer(config)
+
+     return recognizer
+
+
+ @lru_cache(maxsize=10)
+ def _get_aidatatang_200zh_pretrained_model(
+     repo_id: str,
+     decoding_method: str,
+     num_active_paths: int,
+ ):
+     assert repo_id in [
+         "luomingshuang/icefall_asr_aidatatang-200zh_pruned_transducer_stateless2",
+     ], repo_id
+
+     nn_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="cpu_jit_torch.1.7.1.pt",
+     )
+     tokens = _get_token_filename(repo_id=repo_id)
+
+     feat_config = sherpa.FeatureConfig()
+     feat_config.fbank_opts.frame_opts.samp_freq = sample_rate
+     feat_config.fbank_opts.mel_opts.num_bins = 80
+     feat_config.fbank_opts.frame_opts.dither = 0
+
+     config = sherpa.OfflineRecognizerConfig(
+         nn_model=nn_model,
+         tokens=tokens,
+         use_gpu=False,
+         feat_config=feat_config,
+         decoding_method=decoding_method,
+         num_active_paths=num_active_paths,
+     )
+
+     recognizer = sherpa.OfflineRecognizer(config)
+
+     return recognizer
+
+
+ @lru_cache(maxsize=10)
+ def _get_tibetan_pre_trained_model(
+     repo_id: str,
+     decoding_method: str,
+     num_active_paths: int,
+ ):
+     assert repo_id in [
+         "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless7-2022-12-02",
+         "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless5-2022-11-29",
+     ], repo_id
+
+     filename = "cpu_jit.pt"
+     if (
+         repo_id
+         == "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless5-2022-11-29"
+     ):
+         filename = "cpu_jit-epoch-28-avg-23-torch-1.10.0.pt"
+
+     nn_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename=filename,
+     )
+
+     tokens = _get_token_filename(repo_id=repo_id, subfolder="data/lang_bpe_500")
+
+     feat_config = sherpa.FeatureConfig()
+     feat_config.fbank_opts.frame_opts.samp_freq = sample_rate
+     feat_config.fbank_opts.mel_opts.num_bins = 80
+     feat_config.fbank_opts.frame_opts.dither = 0
+
+     config = sherpa.OfflineRecognizerConfig(
+         nn_model=nn_model,
+         tokens=tokens,
+         use_gpu=False,
+         feat_config=feat_config,
+         decoding_method=decoding_method,
+         num_active_paths=num_active_paths,
+     )
+
+     recognizer = sherpa.OfflineRecognizer(config)
+
+     return recognizer
+
+
+ @lru_cache(maxsize=10)
+ def _get_arabic_pre_trained_model(
+     repo_id: str,
+     decoding_method: str,
+     num_active_paths: int,
+ ):
+     assert repo_id in [
+         "AmirHussein/icefall-asr-mgb2-conformer_ctc-2022-27-06",
+     ], repo_id
+
+     nn_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="cpu_jit.pt",
+     )
+
+     tokens = _get_token_filename(repo_id=repo_id, subfolder="data/lang_bpe_5000")
+
+     feat_config = sherpa.FeatureConfig()
+     feat_config.fbank_opts.frame_opts.samp_freq = sample_rate
+     feat_config.fbank_opts.mel_opts.num_bins = 80
+     feat_config.fbank_opts.frame_opts.dither = 0
+
+     config = sherpa.OfflineRecognizerConfig(
+         nn_model=nn_model,
+         tokens=tokens,
+         use_gpu=False,
+         feat_config=feat_config,
+         decoding_method=decoding_method,
+         num_active_paths=num_active_paths,
+     )
+
+     recognizer = sherpa.OfflineRecognizer(config)
+
+     return recognizer
+
+
+ @lru_cache(maxsize=10)
+ def _get_german_pre_trained_model(
+     repo_id: str,
+     decoding_method: str,
+     num_active_paths: int,
+ ):
+     assert repo_id in [
+         "csukuangfj/wav2vec2.0-torchaudio",
+     ], repo_id
+
+     nn_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="voxpopuli_asr_base_10k_de.pt",
+         subfolder=".",
+     )
+
+     tokens = _get_token_filename(
+         repo_id=repo_id,
+         filename="tokens-de.txt",
+         subfolder=".",
+     )
+
+     config = sherpa.OfflineRecognizerConfig(
+         nn_model=nn_model,
+         tokens=tokens,
+         use_gpu=False,
+         decoding_method=decoding_method,
+         num_active_paths=num_active_paths,
+     )
+
+     recognizer = sherpa.OfflineRecognizer(config)
+
+     return recognizer
+
+
+ @lru_cache(maxsize=10)
+ def _get_french_pre_trained_model(
+     repo_id: str,
+     decoding_method: str,
+     num_active_paths: int,
+ ) -> sherpa_onnx.OnlineRecognizer:
+     assert repo_id in [
+         "shaojieli/sherpa-onnx-streaming-zipformer-fr-2023-04-14",
+     ], repo_id
+
+     encoder_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="encoder-epoch-29-avg-9-with-averaged-model.onnx",
+         subfolder=".",
+     )
+
+     decoder_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="decoder-epoch-29-avg-9-with-averaged-model.onnx",
+         subfolder=".",
+     )
+
+     joiner_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="joiner-epoch-29-avg-9-with-averaged-model.onnx",
+         subfolder=".",
+     )
+
+     tokens = _get_token_filename(repo_id=repo_id, subfolder=".")
+
+     recognizer = sherpa_onnx.OnlineRecognizer.from_transducer(
+         tokens=tokens,
+         encoder=encoder_model,
+         decoder=decoder_model,
+         joiner=joiner_model,
+         num_threads=2,
+         sample_rate=16000,
+         feature_dim=80,
+         decoding_method=decoding_method,
+         max_active_paths=num_active_paths,
+     )
+
+     return recognizer
+
+
+ @lru_cache(maxsize=10)
+ def _get_sherpa_onnx_nemo_transducer_models(
+     repo_id: str,
+     decoding_method: str,
+     num_active_paths: int,
+ ) -> sherpa_onnx.OfflineRecognizer:
+     assert repo_id in [
+         "csukuangfj/sherpa-onnx-nemo-parakeet_tdt_transducer_110m-en-36000",
+     ], repo_id
+
+     encoder_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="encoder.onnx",
+         subfolder=".",
+     )
+
+     decoder_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="decoder.onnx",
+         subfolder=".",
+     )
+
+     joiner_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="joiner.onnx",
+         subfolder=".",
+     )
+
+     tokens = _get_token_filename(repo_id=repo_id, subfolder=".")
+
+     recognizer = sherpa_onnx.OfflineRecognizer.from_transducer(
+         tokens=tokens,
+         encoder=encoder_model,
+         decoder=decoder_model,
+         joiner=joiner_model,
+         num_threads=2,
+         sample_rate=16000,
+         feature_dim=80,
+         model_type="nemo_transducer",
+         decoding_method=decoding_method,
+         max_active_paths=num_active_paths,
+     )
+
+     return recognizer
+
+
+ @lru_cache(maxsize=10)
+ def _get_sherpa_onnx_nemo_ctc_models(
+     repo_id: str,
+     decoding_method: str,
+     num_active_paths: int,
+ ) -> sherpa_onnx.OfflineRecognizer:
+     assert repo_id in [
+         "csukuangfj/sherpa-onnx-nemo-parakeet_tdt_ctc_110m-en-36000",
+     ], repo_id
+
+     model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="model.onnx",
+         subfolder=".",
+     )
+
+     tokens = _get_token_filename(repo_id=repo_id, subfolder=".")
+
+     recognizer = sherpa_onnx.OfflineRecognizer.from_nemo_ctc(
+         tokens=tokens,
+         model=model,
+         num_threads=2,
+         sample_rate=16000,
+         feature_dim=80,
+     )
+
+     return recognizer
+
+
+ @lru_cache(maxsize=10)
+ def _get_sherpa_onnx_offline_zipformer_pre_trained_model(
+     repo_id: str,
+     decoding_method: str,
+     num_active_paths: int,
+ ) -> sherpa_onnx.OfflineRecognizer:
+     assert repo_id in [
+         "csukuangfj/sherpa-onnx-zipformer-en-libriheavy-20230926-large",
+         "csukuangfj/sherpa-onnx-zipformer-en-libriheavy-20230926-medium",
+         "csukuangfj/sherpa-onnx-zipformer-en-libriheavy-20230926-small",
+         "csukuangfj/sherpa-onnx-zipformer-en-libriheavy-20230830-large-punct-case",
+         "csukuangfj/sherpa-onnx-zipformer-en-libriheavy-20230830-medium-punct-case",
+         "csukuangfj/sherpa-onnx-zipformer-en-libriheavy-20230830-small-punct-case",
+     ], repo_id
+
+     if repo_id == "csukuangfj/sherpa-onnx-zipformer-en-libriheavy-20230926-large":
+         epoch = 16
+         avg = 3
+     elif repo_id == "csukuangfj/sherpa-onnx-zipformer-en-libriheavy-20230926-medium":
+         epoch = 60
+         avg = 20
+     elif repo_id == "csukuangfj/sherpa-onnx-zipformer-en-libriheavy-20230926-small":
+         epoch = 90
+         avg = 20
+     elif (
+         repo_id
+         == "csukuangfj/sherpa-onnx-zipformer-en-libriheavy-20230830-large-punct-case"
+     ):
+         epoch = 16
+         avg = 2
+     elif (
+         repo_id
+         == "csukuangfj/sherpa-onnx-zipformer-en-libriheavy-20230830-medium-punct-case"
+     ):
+         epoch = 50
+         avg = 15
+     elif (
+         repo_id
+         == "csukuangfj/sherpa-onnx-zipformer-en-libriheavy-20230830-small-punct-case"
+     ):
+         epoch = 88
+         avg = 41
+
+     encoder_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename=f"encoder-epoch-{epoch}-avg-{avg}.int8.onnx",
+         subfolder=".",
+     )
+
+     decoder_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename=f"decoder-epoch-{epoch}-avg-{avg}.onnx",
+         subfolder=".",
+     )
+
+     joiner_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename=f"joiner-epoch-{epoch}-avg-{avg}.int8.onnx",
+         subfolder=".",
+     )
+
+     tokens = _get_token_filename(repo_id=repo_id, subfolder=".")
+
+     recognizer = sherpa_onnx.OfflineRecognizer.from_transducer(
+         tokens=tokens,
+         encoder=encoder_model,
+         decoder=decoder_model,
+         joiner=joiner_model,
+         num_threads=2,
+         sample_rate=16000,
+         feature_dim=80,
+         decoding_method=decoding_method,
+         max_active_paths=num_active_paths,
+     )
+
+     return recognizer
+
+
+ @lru_cache(maxsize=10)
+ def _get_streaming_zipformer_pre_trained_model(
+     repo_id: str,
+     decoding_method: str,
+     num_active_paths: int,
+ ) -> sherpa_onnx.OnlineRecognizer:
+     assert repo_id in [
+         "csukuangfj/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20",
+         "k2-fsa/sherpa-onnx-streaming-zipformer-korean-2024-06-16",
+     ], repo_id
+
+     encoder_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="encoder-epoch-99-avg-1.onnx",
+         subfolder=".",
+     )
+
+     decoder_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="decoder-epoch-99-avg-1.onnx",
+         subfolder=".",
+     )
+
+     joiner_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="joiner-epoch-99-avg-1.onnx",
+         subfolder=".",
+     )
+
+     tokens = _get_token_filename(repo_id=repo_id, subfolder=".")
+
+     recognizer = sherpa_onnx.OnlineRecognizer.from_transducer(
+         tokens=tokens,
+         encoder=encoder_model,
+         decoder=decoder_model,
+         joiner=joiner_model,
+         num_threads=2,
+         sample_rate=16000,
+         feature_dim=80,
+         decoding_method=decoding_method,
+         max_active_paths=num_active_paths,
+     )
+
+     return recognizer
+
+
+ @lru_cache(maxsize=10)
+ def _get_japanese_pre_trained_model(
+     repo_id: str,
+     decoding_method: str,
+     num_active_paths: int,
+ ) -> sherpa.OnlineRecognizer:
+     repo_id, kind = repo_id.rsplit("-", maxsplit=1)
+
+     assert repo_id in [
+         "TeoWenShen/icefall-asr-csj-pruned-transducer-stateless7-streaming-230208"
+     ], repo_id
+     assert kind in ("fluent", "disfluent"), kind
+
+     encoder_model = _get_nn_model_filename(
+         repo_id=repo_id, filename="encoder_jit_trace.pt", subfolder=f"exp_{kind}"
+     )
+
+     decoder_model = _get_nn_model_filename(
+         repo_id=repo_id, filename="decoder_jit_trace.pt", subfolder=f"exp_{kind}"
+     )
+
+     joiner_model = _get_nn_model_filename(
+         repo_id=repo_id, filename="joiner_jit_trace.pt", subfolder=f"exp_{kind}"
+     )
+
+     tokens = _get_token_filename(repo_id=repo_id)
+
+     feat_config = sherpa.FeatureConfig()
+     feat_config.fbank_opts.frame_opts.samp_freq = sample_rate
+     feat_config.fbank_opts.mel_opts.num_bins = 80
+     feat_config.fbank_opts.frame_opts.dither = 0
+
+     config = sherpa.OnlineRecognizerConfig(
+         nn_model="",
+         encoder_model=encoder_model,
+         decoder_model=decoder_model,
+         joiner_model=joiner_model,
+         tokens=tokens,
+         use_gpu=False,
+         feat_config=feat_config,
+         decoding_method=decoding_method,
+         num_active_paths=num_active_paths,
+         chunk_size=32,
+     )
+
+     recognizer = sherpa.OnlineRecognizer(config)
+
+     return recognizer
+
+
+ @lru_cache(maxsize=10)
+ def _get_gigaspeech_pre_trained_model_onnx(
+     repo_id: str,
+     decoding_method: str,
+     num_active_paths: int,
+ ) -> sherpa_onnx.OfflineRecognizer:
+     assert repo_id in [
+         "yfyeung/icefall-asr-gigaspeech-zipformer-2023-10-17",
+     ], repo_id
+
+     encoder_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="encoder-epoch-30-avg-9.onnx",
+         subfolder="exp",
+     )
+
+     decoder_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="decoder-epoch-30-avg-9.onnx",
+         subfolder="exp",
+     )
+
+     joiner_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="joiner-epoch-30-avg-9.onnx",
+         subfolder="exp",
+     )
+
+     tokens = _get_token_filename(repo_id=repo_id, subfolder="data/lang_bpe_500")
+
+     recognizer = sherpa_onnx.OfflineRecognizer.from_transducer(
+         tokens=tokens,
+         encoder=encoder_model,
+         decoder=decoder_model,
+         joiner=joiner_model,
+         num_threads=2,
+         sample_rate=16000,
+         feature_dim=80,
+         decoding_method=decoding_method,
+         max_active_paths=num_active_paths,
+     )
+
+     return recognizer
+
+
+ @lru_cache(maxsize=10)
+ def _get_streaming_paraformer_zh_yue_en_pre_trained_model(
+     repo_id: str,
+     decoding_method: str,
+     num_active_paths: int,
+ ) -> sherpa_onnx.OnlineRecognizer:
+     assert repo_id in [
+         "csukuangfj/sherpa-onnx-streaming-paraformer-trilingual-zh-cantonese-en",
+     ], repo_id
+
+     encoder_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="encoder.int8.onnx",
+         subfolder=".",
+     )
+
+     decoder_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="decoder.int8.onnx",
+         subfolder=".",
+     )
+
+     tokens = _get_token_filename(repo_id=repo_id, subfolder=".")
+
+     recognizer = sherpa_onnx.OnlineRecognizer.from_paraformer(
+         tokens=tokens,
+         encoder=encoder_model,
+         decoder=decoder_model,
+         num_threads=2,
+         sample_rate=16000,
+         feature_dim=80,
+         decoding_method=decoding_method,
+     )
+
+     return recognizer
+
+
+ @lru_cache(maxsize=10)
+ def _get_paraformer_en_pre_trained_model(
+     repo_id: str,
+     decoding_method: str,
+     num_active_paths: int,
+ ) -> sherpa_onnx.OfflineRecognizer:
+     assert repo_id in [
+         "yujinqiu/sherpa-onnx-paraformer-en-2023-10-24",
+     ], repo_id
+
+     nn_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="model.int8.onnx",
+         subfolder=".",
+     )
+
+     tokens = _get_token_filename(
+         repo_id=repo_id, filename="new_tokens.txt", subfolder="."
+     )
+
+     recognizer = sherpa_onnx.OfflineRecognizer.from_paraformer(
+         paraformer=nn_model,
+         tokens=tokens,
+         num_threads=2,
+         sample_rate=sample_rate,
+         feature_dim=80,
+         decoding_method="greedy_search",
+         debug=False,
+     )
+
+     return recognizer
+
+
+ @lru_cache(maxsize=5)
+ def _get_chinese_dialect_models(
+     repo_id: str, decoding_method: str, num_active_paths: int
+ ) -> sherpa_onnx.OfflineRecognizer:
+     assert repo_id in [
+         "csukuangfj/sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04",
+     ], repo_id
+
+     nn_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="model.int8.onnx",
+         subfolder=".",
+     )
+
+     tokens = _get_token_filename(repo_id=repo_id, subfolder=".")
+
+     recognizer = sherpa_onnx.OfflineRecognizer.from_telespeech_ctc(
+         model=nn_model,
+         tokens=tokens,
+         num_threads=2,
+     )
+
+     return recognizer
+
+
+ @lru_cache(maxsize=10)
+ def _get_sense_voice_pre_trained_model(
+     repo_id: str,
+     decoding_method: str,
+     num_active_paths: int,
+ ) -> sherpa_onnx.OfflineRecognizer:
+     assert repo_id in [
+         "csukuangfj/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17",
+     ], repo_id
+
+     nn_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="model.int8.onnx",
+         subfolder=".",
+     )
+
+     tokens = _get_token_filename(repo_id=repo_id, subfolder=".")
+
+     recognizer = sherpa_onnx.OfflineRecognizer.from_sense_voice(
+         model=nn_model,
+         tokens=tokens,
+         num_threads=2,
+         sample_rate=sample_rate,
+         feature_dim=80,
+         decoding_method="greedy_search",
+         debug=True,
+         use_itn=True,
+     )
+
+     return recognizer
+
+
+ @lru_cache(maxsize=10)
+ def _get_paraformer_pre_trained_model(
+     repo_id: str,
+     decoding_method: str,
+     num_active_paths: int,
+ ) -> sherpa_onnx.OfflineRecognizer:
+     assert repo_id in [
+         "csukuangfj/sherpa-onnx-paraformer-zh-2023-03-28",
+         "csukuangfj/sherpa-onnx-paraformer-zh-2024-03-09",
+         "csukuangfj/sherpa-onnx-paraformer-zh-small-2024-03-09",
+         "csukuangfj/sherpa-onnx-paraformer-trilingual-zh-cantonese-en",
+         "csukuangfj/sherpa-onnx-paraformer-en-2024-03-09",
+     ], repo_id
+
+     nn_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="model.int8.onnx",
+         subfolder=".",
+     )
+
+     tokens = _get_token_filename(repo_id=repo_id, subfolder=".")
+
+     recognizer = sherpa_onnx.OfflineRecognizer.from_paraformer(
+         paraformer=nn_model,
+         tokens=tokens,
+         num_threads=2,
+         sample_rate=sample_rate,
+         feature_dim=80,
+         decoding_method="greedy_search",
+         debug=False,
+     )
+
+     return recognizer
+
+
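
Note that the paraformer and SenseVoice loaders above hard-code decoding_method="greedy_search"; their decoding_method and num_active_paths parameters exist only so that every loader shares the same signature (and hence the same lru_cache key shape).
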
+ def _get_aishell_pre_trained_model(
+     repo_id: str,
+     decoding_method: str,
+     num_active_paths: int,
+ ) -> sherpa_onnx.OfflineRecognizer:
+     assert repo_id in (
+         "zrjin/icefall-asr-aishell-zipformer-large-2023-10-24",
+         "zrjin/icefall-asr-aishell-zipformer-small-2023-10-24",
+         "zrjin/icefall-asr-aishell-zipformer-2023-10-24",
+     ), repo_id
+     if repo_id == "zrjin/icefall-asr-aishell-zipformer-large-2023-10-24":
+         epoch = 56
+         avg = 23
+     elif repo_id == "zrjin/icefall-asr-aishell-zipformer-small-2023-10-24":
+         epoch = 55
+         avg = 21
+     elif repo_id == "zrjin/icefall-asr-aishell-zipformer-2023-10-24":
+         epoch = 55
+         avg = 17
+
+     encoder_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename=f"encoder-epoch-{epoch}-avg-{avg}.onnx",
+         subfolder="exp",
+     )
+
+     decoder_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename=f"decoder-epoch-{epoch}-avg-{avg}.onnx",
+         subfolder="exp",
+     )
+
+     joiner_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename=f"joiner-epoch-{epoch}-avg-{avg}.onnx",
+         subfolder="exp",
+     )
+
+     tokens = _get_token_filename(repo_id=repo_id, subfolder="data/lang_char")
+
+     recognizer = sherpa_onnx.OfflineRecognizer.from_transducer(
+         tokens=tokens,
+         encoder=encoder_model,
+         decoder=decoder_model,
+         joiner=joiner_model,
+         num_threads=2,
+         sample_rate=16000,
+         feature_dim=80,
+         decoding_method=decoding_method,
+         max_active_paths=num_active_paths,
+     )
+
+     return recognizer
+
+
+ @lru_cache(maxsize=2)
+ def get_punct_model() -> sherpa_onnx.OfflinePunctuation:
+     model = _get_nn_model_filename(
+         repo_id="csukuangfj/sherpa-onnx-punct-ct-transformer-zh-en-vocab272727-2024-04-12",
+         filename="model.onnx",
+         subfolder=".",
+     )
+     config = sherpa_onnx.OfflinePunctuationConfig(
+         model=sherpa_onnx.OfflinePunctuationModelConfig(ct_transformer=model),
+     )
+
+     punct = sherpa_onnx.OfflinePunctuation(config)
+     return punct
+
+
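
get_punct_model restores punctuation in otherwise unpunctuated transcripts. A minimal sketch, assuming the add_punct method of sherpa_onnx.OfflinePunctuation (the API this config targets):

    punct = get_punct_model()
    text_with_punct = punct.add_punct("how are you today i am fine")
    # -> the same text with punctuation marks inserted by the ct-transformer model
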
+ def _get_multi_zh_hans_pre_trained_model(
+     repo_id: str,
+     decoding_method: str,
+     num_active_paths: int,
+ ) -> sherpa_onnx.OfflineRecognizer:
+     assert repo_id in ("zrjin/sherpa-onnx-zipformer-multi-zh-hans-2023-9-2",), repo_id
+
+     encoder_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="encoder-epoch-20-avg-1.onnx",
+         subfolder=".",
+     )
+
+     decoder_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="decoder-epoch-20-avg-1.onnx",
+         subfolder=".",
+     )
+
+     joiner_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="joiner-epoch-20-avg-1.onnx",
+         subfolder=".",
+     )
+
+     tokens = _get_token_filename(repo_id=repo_id, subfolder=".")
+
+     recognizer = sherpa_onnx.OfflineRecognizer.from_transducer(
+         tokens=tokens,
+         encoder=encoder_model,
+         decoder=decoder_model,
+         joiner=joiner_model,
+         num_threads=2,
+         sample_rate=16000,
+         feature_dim=80,
+         decoding_method=decoding_method,
+         max_active_paths=num_active_paths,
+     )
+
+     return recognizer
+
+
+ chinese_dialect_models = {
1778
+ "csukuangfj/sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04": _get_chinese_dialect_models,
1779
+ }
1780
+
1781
+ chinese_models = {
1782
+ "csukuangfj/sherpa-onnx-paraformer-zh-2024-03-09": _get_paraformer_pre_trained_model,
1783
+ "luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless2": _get_wenetspeech_pre_trained_model, # noqa
1784
+ "csukuangfj/sherpa-onnx-paraformer-zh-small-2024-03-09": _get_paraformer_pre_trained_model,
1785
+ "zrjin/sherpa-onnx-zipformer-multi-zh-hans-2023-9-2": _get_multi_zh_hans_pre_trained_model, # noqa
1786
+ "zrjin/icefall-asr-aishell-zipformer-large-2023-10-24": _get_aishell_pre_trained_model, # noqa
1787
+ "zrjin/icefall-asr-aishell-zipformer-small-2023-10-24": _get_aishell_pre_trained_model, # noqa
1788
+ "zrjin/icefall-asr-aishell-zipformer-2023-10-24": _get_aishell_pre_trained_model, # noqa
1789
+ "desh2608/icefall-asr-alimeeting-pruned-transducer-stateless7": _get_alimeeting_pre_trained_model,
1790
+ "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-A-2022-07-12": _get_aishell2_pretrained_model, # noqa
1791
+ "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-B-2022-07-12": _get_aishell2_pretrained_model, # noqa
1792
+ "luomingshuang/icefall_asr_aidatatang-200zh_pruned_transducer_stateless2": _get_aidatatang_200zh_pretrained_mode, # noqa
1793
+ "luomingshuang/icefall_asr_alimeeting_pruned_transducer_stateless2": _get_alimeeting_pre_trained_model, # noqa
1794
+ "csukuangfj/wenet-chinese-model": _get_wenet_model,
1795
+ # "csukuangfj/icefall-asr-wenetspeech-lstm-transducer-stateless-2022-10-14": _get_lstm_transducer_model,
1796
+ }
1797
+
1798
+ english_models = {
1799
+ "whisper-tiny.en": _get_whisper_model,
1800
+ "moonshine-tiny": _get_moonshine_model,
1801
+ "moonshine-base": _get_moonshine_model,
1802
+ "whisper-base.en": _get_whisper_model,
1803
+ "whisper-small.en": _get_whisper_model,
1804
+ "csukuangfj/sherpa-onnx-nemo-parakeet_tdt_ctc_110m-en-36000": _get_sherpa_onnx_nemo_ctc_models,
1805
+ "csukuangfj/sherpa-onnx-nemo-parakeet_tdt_transducer_110m-en-36000": _get_sherpa_onnx_nemo_transducer_models,
1806
+ # "whisper-medium.en": _get_whisper_model,
1807
+ "csukuangfj/sherpa-onnx-zipformer-en-libriheavy-20230926-large": _get_sherpa_onnx_offline_zipformer_pre_trained_model,
1808
+ "csukuangfj/sherpa-onnx-zipformer-en-libriheavy-20230926-medium": _get_sherpa_onnx_offline_zipformer_pre_trained_model,
1809
+ "csukuangfj/sherpa-onnx-zipformer-en-libriheavy-20230926-small": _get_sherpa_onnx_offline_zipformer_pre_trained_model,
1810
+ "csukuangfj/sherpa-onnx-zipformer-en-libriheavy-20230830-large-punct-case": _get_sherpa_onnx_offline_zipformer_pre_trained_model,
1811
+ "csukuangfj/sherpa-onnx-zipformer-en-libriheavy-20230830-medium-punct-case": _get_sherpa_onnx_offline_zipformer_pre_trained_model,
1812
+ "csukuangfj/sherpa-onnx-zipformer-en-libriheavy-20230830-small-punct-case": _get_sherpa_onnx_offline_zipformer_pre_trained_model,
1813
+ "csukuangfj/sherpa-onnx-paraformer-en-2024-03-09": _get_paraformer_pre_trained_model,
1814
+ "yfyeung/icefall-asr-gigaspeech-zipformer-2023-10-17": _get_gigaspeech_pre_trained_model_onnx, # noqa
1815
+ "wgb14/icefall-asr-gigaspeech-pruned-transducer-stateless2": _get_gigaspeech_pre_trained_model, # noqa
1816
+ "yfyeung/icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04": _get_english_model, # noqa
1817
+ "yfyeung/icefall-asr-finetune-mux-pruned_transducer_stateless7-2023-05-19": _get_english_model, # noqa
1818
+ "WeijiZhuang/icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02": _get_english_model, # noqa
1819
+ "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless8-2022-11-14": _get_english_model, # noqa
1820
+ "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11": _get_english_model, # noqa
1821
+ "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13": _get_english_model, # noqa
1822
+ "yujinqiu/sherpa-onnx-paraformer-en-2023-10-24": _get_paraformer_en_pre_trained_model,
1823
+ "Zengwei/icefall-asr-librispeech-zipformer-large-2023-05-16": _get_english_model, # noqa
1824
+ "Zengwei/icefall-asr-librispeech-zipformer-2023-05-15": _get_english_model, # noqa
1825
+ "Zengwei/icefall-asr-librispeech-zipformer-small-2023-05-16": _get_english_model, # noqa
1826
+ "videodanchik/icefall-asr-tedlium3-conformer-ctc2": _get_english_model,
1827
+ "pkufool/icefall_asr_librispeech_conformer_ctc": _get_english_model,
1828
+ "WayneWiser/icefall-asr-librispeech-conformer-ctc2-jit-bpe-500-2022-07-21": _get_english_model,
1829
+ "csukuangfj/wenet-english-model": _get_wenet_model,
1830
+ }
1831
+
1832
+ multi_lingual_models = {
1833
+ "csukuangfj/sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02": _get_dolphin_ctc_models,
1834
+ "csukuangfj/sherpa-onnx-dolphin-small-ctc-multi-lang-int8-2025-04-02": _get_dolphin_ctc_models,
1835
+ "csukuangfj/sherpa-onnx-dolphin-base-ctc-multi-lang-2025-04-02": _get_dolphin_ctc_models,
1836
+ "csukuangfj/sherpa-onnx-dolphin-small-ctc-multi-lang-2025-04-02": _get_dolphin_ctc_models,
1837
+ }
1838
+
1839
+ chinese_english_mixed_models = {
1840
+ "csukuangfj/sherpa-onnx-fire-red-asr-large-zh_en-2025-02-16": _get_fire_red_asr_models,
1841
+ "csukuangfj/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20": _get_streaming_zipformer_pre_trained_model,
1842
+ "zrjin/icefall-asr-zipformer-multi-zh-en-2023-11-22": _get_chinese_english_mixed_model_onnx,
1843
+ "csukuangfj/sherpa-onnx-paraformer-zh-2023-03-28": _get_paraformer_pre_trained_model,
1844
+ "ptrnull/icefall-asr-conv-emformer-transducer-stateless2-zh": _get_chinese_english_mixed_model,
1845
+ "luomingshuang/icefall_asr_tal-csasr_pruned_transducer_stateless5": _get_chinese_english_mixed_model, # noqa
1846
+ }
1847
+
1848
+ tibetan_models = {
1849
+ "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless7-2022-12-02": _get_tibetan_pre_trained_model, # noqa
1850
+ "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless5-2022-11-29": _get_tibetan_pre_trained_model, # noqa
1851
+ }
1852
+
1853
+ arabic_models = {
1854
+ "AmirHussein/icefall-asr-mgb2-conformer_ctc-2022-27-06": _get_arabic_pre_trained_model, # noqa
1855
+ }
1856
+
1857
+ german_models = {
1858
+ "csukuangfj/wav2vec2.0-torchaudio": _get_german_pre_trained_model,
1859
+ }
1860
+
1861
+ french_models = {
1862
+ "shaojieli/sherpa-onnx-streaming-zipformer-fr-2023-04-14": _get_french_pre_trained_model,
1863
+ }
1864
+
1865
+ japanese_models = {
1866
+ "reazon-research/reazonspeech-k2-v2": _get_offline_pre_trained_model,
1867
+ # "TeoWenShen/icefall-asr-csj-pruned-transducer-stateless7-streaming-230208-fluent": _get_japanese_pre_trained_model,
1868
+ # "TeoWenShen/icefall-asr-csj-pruned-transducer-stateless7-streaming-230208-disfluent": _get_japanese_pre_trained_model,
1869
+ }
1870
+
1871
+ russian_models = {
1872
+ "csukuangfj/sherpa-onnx-nemo-transducer-giga-am-v2-russian-2025-04-19": _get_russian_pre_trained_model,
1873
+ "csukuangfj/sherpa-onnx-nemo-ctc-giga-am-v2-russian-2025-04-19": _get_russian_pre_trained_model_ctc,
1874
+ "csukuangfj/sherpa-onnx-nemo-transducer-giga-am-russian-2024-10-24": _get_russian_pre_trained_model,
1875
+ "csukuangfj/sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24": _get_russian_pre_trained_model_ctc,
1876
+ "alphacep/vosk-model-ru": _get_russian_pre_trained_model,
1877
+ "alphacep/vosk-model-small-ru": _get_russian_pre_trained_model,
1878
+ }
1879
+
1880
+ chinese_cantonese_english_models = {
1881
+ "csukuangfj/sherpa-onnx-paraformer-trilingual-zh-cantonese-en": _get_paraformer_pre_trained_model,
1882
+ "csukuangfj/sherpa-onnx-streaming-paraformer-trilingual-zh-cantonese-en": _get_streaming_paraformer_zh_yue_en_pre_trained_model,
1883
+ }
1884
+
1885
+ chinese_cantonese_english_japanese_korean_models = {
1886
+ "csukuangfj/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17": _get_sense_voice_pre_trained_model,
1887
+ }
1888
+
1889
+ cantonese_models = {
1890
+ "zrjin/icefall-asr-mdcc-zipformer-2024-03-11": _get_zrjin_cantonese_pre_trained_model,
1891
+ }
1892
+
1893
+ korean_models = {
1894
+ "k2-fsa/sherpa-onnx-zipformer-korean-2024-06-24": _get_offline_pre_trained_model,
1895
+ "k2-fsa/sherpa-onnx-streaming-zipformer-korean-2024-06-16": _get_streaming_zipformer_pre_trained_model,
1896
+ }
1897
+
1898
+ thai_models = {
1899
+ "yfyeung/icefall-asr-gigaspeech2-th-zipformer-2024-06-20": _get_yifan_thai_pretrained_model,
1900
+ }
1901
+
1902
+
+ all_models = {
+     **multi_lingual_models,
+     **chinese_dialect_models,
+     **chinese_models,
+     **english_models,
+     **chinese_english_mixed_models,
+     **chinese_cantonese_english_models,
+     **chinese_cantonese_english_japanese_korean_models,
+     **cantonese_models,
+     **japanese_models,
+     **tibetan_models,
+     **arabic_models,
+     **german_models,
+     **french_models,
+     **russian_models,
+     **korean_models,
+     **thai_models,
+ }
+
+ language_to_models = {
+     "Multi-lingual (East Asia)": list(multi_lingual_models.keys()),
+     "超多种中文方言": list(chinese_dialect_models.keys()),
+     "Chinese": list(chinese_models.keys()),
+     "English": list(english_models.keys()),
+     "Chinese+English": list(chinese_english_mixed_models.keys()),
+     "Chinese+English+Cantonese": list(chinese_cantonese_english_models.keys()),
+     "Chinese+English+Cantonese+Japanese+Korean": list(
+         chinese_cantonese_english_japanese_korean_models.keys()
+     ),
+     "Cantonese": list(cantonese_models.keys()),
+     "Japanese": list(japanese_models.keys()),
+     "Tibetan": list(tibetan_models.keys()),
+     "Arabic": list(arabic_models.keys()),
+     "German": list(german_models.keys()),
+     "French": list(french_models.keys()),
+     "Russian": list(russian_models.keys()),
+     "Korean": list(korean_models.keys()),
+     "Thai": list(thai_models.keys()),
+ }
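Taken together, these tables form a two-level registry: `language_to_models` populates the language and model dropdowns, while `all_models` maps the chosen repo id to the function that constructs its recognizer, which is why it must merge every per-language table (`chinese_dialect_models` included). Below is a minimal sketch of the intended lookup flow; the `loader(repo_id)` call convention is an assumption for illustration, as the real call sites and signatures live elsewhere in app.py and model.py.

```python
# Sketch only: how the registry tables above are meant to be consumed.
# The loader call convention below is an assumption, not taken from this diff.

language = "Korean"                         # e.g. value of the language dropdown
repo_id = language_to_models[language][0]   # e.g. value of the model dropdown

# Every repo id offered by language_to_models must also be a key of
# all_models, otherwise this lookup raises KeyError at runtime.
loader = all_models[repo_id]
recognizer = loader(repo_id)                # assumed signature
```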
requirements.txt ADDED
@@ -0,0 +1,15 @@
+ https://download.pytorch.org/whl/cpu/torch-1.13.1%2Bcpu-cp310-cp310-linux_x86_64.whl
+ https://download.pytorch.org/whl/cpu/torchaudio-0.13.1%2Bcpu-cp310-cp310-linux_x86_64.whl
+
+ https://huggingface.co/csukuangfj/k2/resolve/main/cpu/1.24.4.dev20250307/linux-x64/k2-1.24.4.dev20250307+cpu.torch1.13.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
+ https://huggingface.co/csukuangfj/sherpa/resolve/main/cpu/1.4.0.dev20250307/linux-x64/k2_sherpa-1.4.0.dev20250307+cpu.torch1.13.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
+ https://huggingface.co/csukuangfj/kaldifeat/resolve/main/cpu/1.25.5.dev20250307/linux-x64/kaldifeat-1.25.5.dev20250307+cpu.torch1.13.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
+
+ sentencepiece>=0.1.96
+ numpy<2
+
+ huggingface_hub
+
+ # https://huggingface.co/csukuangfj/sherpa-onnx-wheels/resolve/main/cpu/1.11.3/sherpa_onnx-1.11.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
+
+ sherpa-onnx>=1.11.3
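Because this file pins CPU wheels built for CPython 3.10 and torch 1.13.1, a mismatched interpreter only fails at import time. Here is a minimal post-install sanity check using just the standard library; the distribution names are read off the wheel filenames above, and `k2_sherpa` in particular is a guess at the installed dist name.

```python
# Sketch: verify that the pinned wheels from requirements.txt resolved.
from importlib.metadata import PackageNotFoundError, version

# Dist names inferred from the wheel filenames; "k2_sherpa" is an assumption.
for dist in ("torch", "torchaudio", "k2", "k2_sherpa", "kaldifeat",
             "sentencepiece", "numpy", "huggingface_hub", "sherpa-onnx"):
    try:
        print(f"{dist}: {version(dist)}")
    except PackageNotFoundError:
        print(f"{dist}: NOT INSTALLED")
```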
test_wavs/aidatatang_200zh/README.md ADDED
@@ -0,0 +1,2 @@
+ Files are downloaded from
+ https://huggingface.co/luomingshuang/icefall_asr_aidatatang-200zh_pruned_transducer_stateless2/tree/main/test_wavs
test_wavs/aidatatang_200zh/T0055G0036S0002.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7c7bf25a97de0819064c05952d40d93047da474d1e927424b3f27fb71bca403e
+ size 67630
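Every audio file under test_wavs/ is stored as a Git LFS pointer like the one above: three text lines giving the pointer spec version, the sha256 of the real payload, and its size in bytes. A small parsing sketch follows (hypothetical helper, not part of this commit):

```python
# Sketch: read a Git LFS pointer file into its key/value fields.
from pathlib import Path

def parse_lfs_pointer(path: str) -> dict:
    fields = {}
    for line in Path(path).read_text().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields

# Without `git lfs pull`, the .wav on disk is the pointer itself, e.g.
# {'version': 'https://git-lfs.github.com/spec/v1',
#  'oid': 'sha256:7c7b...', 'size': '67630'}
print(parse_lfs_pointer("test_wavs/aidatatang_200zh/T0055G0036S0002.wav"))
```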
test_wavs/aidatatang_200zh/T0055G0036S0003.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:88e2e8ef9cc009305e3cb42ddd806c757a7ffc1b85a4402c39e2b59e81ab9ec8
+ size 94174
test_wavs/aidatatang_200zh/T0055G0036S0004.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ea822f7873b89443191e4a3b4b08c62b81de3a0a4a7b806d273da975a0b9e9fc
+ size 70460
test_wavs/aishell2/ID0012W0030.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f042c6cd8cb7fc745f37805565b5ce41b9a4f38a54b267e1a9afd806d5216a38
+ size 112878
test_wavs/aishell2/ID0012W0162.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:aacdc76fc8b37bc2bdd1c05a4bfd42a5ac3333a53c06088abe9814fb1e5e0912
+ size 114124
test_wavs/aishell2/ID0012W0215.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f48eb860503ec691d7d6b99dfc1491a88f30a0930676b3c5dc9170edce041c46
+ size 104368
test_wavs/aishell2/README.md ADDED
@@ -0,0 +1,2 @@
+ Files are downloaded from
+ https://huggingface.co/yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-B-2022-07-12/tree/main/test_wavs
test_wavs/aishell2/trans.txt ADDED
@@ -0,0 +1,3 @@
+ ID0012W0162 立法机关采纳了第二种意见
+ ID0012W0215 大家都愿意牺牲自己的生命
+ ID0012W0030 完全是典型的军事侵略
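The trans.txt files share one layout: an utterance id, a single space, then the reference transcript, which makes them trivial to join against recognition results for scoring. A loader sketch (hypothetical helper, not part of this commit):

```python
# Sketch: map utterance id -> reference text for a trans.txt file.
def load_transcripts(path: str) -> dict:
    transcripts = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            utt_id, _, text = line.strip().partition(" ")
            if utt_id:
                transcripts[utt_id] = text
    return transcripts

refs = load_transcripts("test_wavs/aishell2/trans.txt")
# refs["ID0012W0030"] == "完全是典型的军事侵略"
```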
test_wavs/alimeeting/165.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:48c131d205a0d93acdcdfc0d81e2ee839f4f3261ca7654e3e3ce175a0ec6098d
+ size 262764
test_wavs/alimeeting/209.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a9374efff5517fd624ceee8551cd8cd3680fc3ed8ff964fe5f17c1064f05ebfb
+ size 154604
test_wavs/alimeeting/74.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c371dd14ff73d7128e1508c71dd6eef934f91c082e5946bf4bdd87761ae44a13
+ size 120364
test_wavs/alimeeting/R8003_M8001-8004-165.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1b10ddaddabeb905a7915f670502773328d3321beda436907fb0f36c52b2d04e
+ size 525498
test_wavs/alimeeting/R8008_M8013-8049-74.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9cc97f90e46825e8d6783ea0d41112165c5fffb33d5519fd0d3c6860a43cac70
+ size 240698
test_wavs/alimeeting/R8009_M8020_N_SPK8026-8026-209.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6f825ce6a99b00ec30cb276ee821099b63b1594a6782b88aa5117bd578b61f5a
+ size 309178
test_wavs/alimeeting/trans.txt ADDED
@@ -0,0 +1,3 @@
+ R8009_M8020_N_SPK8026-8026-209 并不是说一天的话就一定要对一个人进行一个了解这样的话
+ R8003_M8001-8004-165 如果他要是不愿意提供地址也不愿意接收礼物那么第二个这个分支可能就省省下了
+ R8008_M8013-8049-74 面试的话五月五号到五月十号吧面试
test_wavs/arabic/a.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:09d4ef01e713b5ea57459dcb8e31631816bc8acdc0833dc41ad3b1ff000a4da5
+ size 252846
test_wavs/arabic/b.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:faecc4e69fb4a1b64b47edada3a6a84c8ff7216027c2490b105b4481bef4b12c
+ size 243244
test_wavs/arabic/c.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:62f08f3c5148e8c69c1607cb067e66034820c4a4322c80e7b396b1bd4360de8b
+ size 149804
test_wavs/arabic/trans.txt ADDED
@@ -0,0 +1,3 @@
+ 94D37D38-B203-4FC0-9F3A-538F5C174920_spk-0001_seg-0053813:0054281 بعد أن عجز وبدأ يصدر مشكلات شعبه ومشكلات مصر
+ 94D37D38-B203-4FC0-9F3A-538F5C174920_spk-0001_seg-0051454:0052244 وهؤلاء أولياء الشيطان ها هو ذا أحدهم الآن ضيفا عليكم على قناة الجزيرة ولا يستحي في ذلك
+ 94D37D38-B203-4FC0-9F3A-538F5C174920_spk-0001_seg-0052244:0053004 عندما استغاث الليبيون بالعالم استغاثوا لرفع الظلم وليس لقهر إرادة الأمة ومصادرة الحياة الدستورية
test_wavs/cantonese/1.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:22568f57d298bea915f263dea7f41d628eea096e80a85b81ce88b7689ef3eee4
+ size 191276
test_wavs/cantonese/2.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3d75fcd99f9693e91ce3303c97d312594a2a95659db5d43bdcefa87e2256e0de
+ size 139052
test_wavs/french/common_voice_fr_19364697.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b057a0b3badb2b5e1352b6b058726dc03a063e74794232ed266d5b3ad573f9ca
+ size 228174
test_wavs/french/common_voice_fr_19738183.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:af7487e23134c3fcc6d74627dcefb5c3c45a2bfa24b4290758efd89139a43884
+ size 122190
test_wavs/french/common_voice_fr_27024649.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:76021a91ebbe9110d8cbd19a091cea4c305c417ba0c25f32d6f995c362b0b9f2
+ size 202830
test_wavs/french/trans.txt ADDED
@@ -0,0 +1,3 @@
+ common_voice_fr_19738183 CE DERNIER A ÉVOLUÉ TOUT AU LONG DE L'HISTOIRE ROMAINE
+ common_voice_fr_27024649 SON ACTIONNAIRE MAJORITAIRE EST LE CONSEIL TERRITORIAL DE SAINT PIERRE ET MIQUELON
+ common_voice_fr_19364697 CE SITE CONTIENT QUATRE TOMBEAUX DE LA DYNASTIE ACHÉMÉNIDE ET SEPT DES SASSANIDES
test_wavs/german/20120315-0900-PLENARY-14-de_20120315.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:edc4f5a2c3e4f6ce99d11490087ef23fa55806a5e32575d3528bf599e0deb711
+ size 381356
test_wavs/german/20170517-0900-PLENARY-16-de_20170517.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1c3b63669e92c6df5bfa3aae0843c64f9eef1be2e85e652b0991a25ebc4e30bb
+ size 282284
test_wavs/gigaspeech/1-minute-audiobook.opus ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:759d82de055d12fdfd6bdc74990ad32943a5a061565c457a7eeef73feba6d47f
+ size 579661
test_wavs/gigaspeech/100-seconds-podcast.opus ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ecb3d5ab9c5eafdc7dc95de7a6e3a0ea6656b524ab0650427cdff829fe3347a0
+ size 954991
test_wavs/gigaspeech/100-seconds-youtube.opus ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b4f0d18ddb1e0b45ef0a3ffdeee1045fa465d39bde77bcc027f5788e72fef646
+ size 947770
test_wavs/japanese/1.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7c8ccaa1878720165a8034763f2f3fa4fc3333472b09b75d71cdf1017db7af32
+ size 429934
test_wavs/japanese/2.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:aea37375438a3d285b7c4b80434d23c2647b5d988c4373933c817308313f14fe
+ size 211996
test_wavs/japanese/3.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8195ae4c0b5e3cad89e5e92aa7e19d681cea73ca8cf193649e423ecb5a19a0c7
+ size 199452
test_wavs/japanese/4.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fed64f1cdd19a72c4ef66053d2a0a66e8b35a46b6d98a359acacd3bd81478cfa
+ size 328468
test_wavs/japanese/5.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:eaa18f4be5e77a340bea3d0bc25f84feaa352b3d5cba541197c2b2740e7f1dd1
+ size 446868
test_wavs/japanese/transcript.txt ADDED
@@ -0,0 +1,5 @@
+ 1.wav 気象庁は、雪や路面の凍結による交通への影響、暴風雪や高波に警戒するとともに、雪崩や屋根からの落雪にも十分注意するよう呼びかけています。
+ 2.wav はやくおじいさんにあのおとこのはなしをきかせたかったのです。
+ 3.wav ヤンバルクイナとの出会いは18歳の時だった。
+ 4.wav H2Aは、打ち上げの成功率は高い一方、1回の打ち上げ費用がおよそ100億円と、高額であることが課題となっていました。
+ 5.wav 持ち主とはぐれた傘が風で舞い看板もなぎ倒されてしまったようです。
test_wavs/korean/0.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0faf0b037efe428e5e561195f4d2aa148b2a0a2a5fc540b2c184b9d5c241e984
+ size 112892
test_wavs/korean/1.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b59bf1209d0d37088335d94f21394f31d794743bc9c849e3a4c9932a985c0bae
+ size 108992
test_wavs/korean/2.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ed95184720061842e8f0f5df7e5826f97b0b26cd3c9bff18709f5be07ff18728
+ size 212142
test_wavs/korean/3.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b1d2de5f90c73dfacddc1d6ab93a41427c89573f261ed2d425a6a37b3ee32931
+ size 85834
test_wavs/korean/trans.txt ADDED
@@ -0,0 +1,4 @@
+ 0.wav 그는 괜찮은 척하려고 애쓰는 것 같았다.
+ 1.wav 지하철에서 다리를 벌리고 앉지 마라.
+ 2.wav 부모가 저지르는 큰 실수 중 하나는 자기 아이를 다른 집 아이와 비교하는 것이다.
+ 3.wav 주민등록증을 보여 주시겠어요?