csukuangfj committed on
Commit
2c7ed87
·
1 Parent(s): 887be19

add more russian models

Browse files
Files changed (2) hide show
  1. examples.py +1 -1
  2. model.py +42 -2
examples.py CHANGED
@@ -82,7 +82,7 @@ examples = [
82
  ],
83
  [
84
  "Russian",
85
- "alphacep/vosk-model-ru",
86
  "greedy_search",
87
  4,
88
  "No",
 
82
  ],
83
  [
84
  "Russian",
85
+ "csukuangfj/sherpa-onnx-nemo-transducer-giga-am-russian-2024-10-24",
86
  "greedy_search",
87
  4,
88
  "No",
model.py CHANGED
@@ -430,6 +430,31 @@ def _get_zrjin_cantonese_pre_trained_model(
430
  return recognizer
431
 
432
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
433
  @lru_cache(maxsize=10)
434
  def _get_russian_pre_trained_model(
435
  repo_id: str, decoding_method: str, num_active_paths: int
@@ -437,16 +462,25 @@ def _get_russian_pre_trained_model(
437
  assert repo_id in (
438
  "alphacep/vosk-model-ru",
439
  "alphacep/vosk-model-small-ru",
 
440
  ), repo_id
441
 
442
  if repo_id == "alphacep/vosk-model-ru":
443
  model_dir = "am-onnx"
 
 
444
  elif repo_id == "alphacep/vosk-model-small-ru":
445
  model_dir = "am"
 
 
 
 
 
 
446
 
447
  encoder_model = _get_nn_model_filename(
448
  repo_id=repo_id,
449
- filename="encoder.onnx",
450
  subfolder=model_dir,
451
  )
452
 
@@ -462,7 +496,10 @@ def _get_russian_pre_trained_model(
462
  subfolder=model_dir,
463
  )
464
 
465
- tokens = _get_token_filename(repo_id=repo_id, subfolder="lang")
 
 
 
466
 
467
  recognizer = sherpa_onnx.OfflineRecognizer.from_transducer(
468
  tokens=tokens,
@@ -473,6 +510,7 @@ def _get_russian_pre_trained_model(
473
  sample_rate=16000,
474
  feature_dim=80,
475
  decoding_method=decoding_method,
 
476
  )
477
 
478
  return recognizer
@@ -1639,6 +1677,8 @@ japanese_models = {
1639
  }
1640
 
1641
  russian_models = {
 
 
1642
  "alphacep/vosk-model-ru": _get_russian_pre_trained_model,
1643
  "alphacep/vosk-model-small-ru": _get_russian_pre_trained_model,
1644
  }
 
430
  return recognizer
431
 
432
 
433
@lru_cache(maxsize=10)
def _get_russian_pre_trained_model_ctc(
    repo_id: str, decoding_method: str, num_active_paths: int
) -> sherpa_onnx.OfflineRecognizer:
    """Build an offline recognizer for the Russian GigaAM NeMo CTC model.

    Args:
      repo_id:
        Model repository ID. Only
        ``csukuangfj/sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24``
        is accepted.
      decoding_method:
        Not used by this function; kept so the signature matches the
        other model getters registered in ``russian_models``.
      num_active_paths:
        Not used by this function; kept for the same reason.

    Returns:
      The recognizer created by
      ``sherpa_onnx.OfflineRecognizer.from_nemo_ctc``.
    """
    assert repo_id in (
        "csukuangfj/sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24",
    ), repo_id

    # The original code passed an undefined ``model_dir`` here, which raised
    # NameError on the first call. The tokens file of this repo is fetched
    # from "." below, so the model file is fetched from the same location.
    # NOTE(review): confirm model.int8.onnx sits at the repository root.
    model = _get_nn_model_filename(
        repo_id=repo_id,
        filename="model.int8.onnx",
        subfolder=".",
    )

    tokens = _get_token_filename(repo_id=repo_id, subfolder=".")

    recognizer = sherpa_onnx.OfflineRecognizer.from_nemo_ctc(
        model=model,
        tokens=tokens,
        num_threads=2,
    )

    return recognizer
456
+
457
+
458
  @lru_cache(maxsize=10)
459
  def _get_russian_pre_trained_model(
460
  repo_id: str, decoding_method: str, num_active_paths: int
 
462
  assert repo_id in (
463
  "alphacep/vosk-model-ru",
464
  "alphacep/vosk-model-small-ru",
465
+ "csukuangfj/sherpa-onnx-nemo-transducer-giga-am-russian-2024-10-24",
466
  ), repo_id
467
 
468
  if repo_id == "alphacep/vosk-model-ru":
469
  model_dir = "am-onnx"
470
+ encoder = "encoder.onnx"
471
+ model_type = "transducer"
472
  elif repo_id == "alphacep/vosk-model-small-ru":
473
  model_dir = "am"
474
+ encoder = "encoder.onnx"
475
+ model_type = "transducer"
476
+ elif repo_id == "csukuangfj/sherpa-onnx-nemo-transducer-giga-am-russian-2024-10-24":
477
+ model_dir = "."
478
+ encoder = "encoder.int8.onnx"
479
+ model_type = "nemo_transducer"
480
 
481
  encoder_model = _get_nn_model_filename(
482
  repo_id=repo_id,
483
+ filename=encoder,
484
  subfolder=model_dir,
485
  )
486
 
 
496
  subfolder=model_dir,
497
  )
498
 
499
+ if repo_id == "csukuangfj/sherpa-onnx-nemo-transducer-giga-am-russian-2024-10-24":
500
+ tokens = _get_token_filename(repo_id=repo_id, subfolder=".")
501
+ else:
502
+ tokens = _get_token_filename(repo_id=repo_id, subfolder="lang")
503
 
504
  recognizer = sherpa_onnx.OfflineRecognizer.from_transducer(
505
  tokens=tokens,
 
510
  sample_rate=16000,
511
  feature_dim=80,
512
  decoding_method=decoding_method,
513
+ model_type=model_type,
514
  )
515
 
516
  return recognizer
 
1677
  }
1678
 
1679
  russian_models = {
1680
+ "csukuangfj/sherpa-onnx-nemo-transducer-giga-am-russian-2024-10-24": _get_russian_pre_trained_model,
1681
+ "csukuangfj/sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24": _get_russian_pre_trained_model_ctc,
1682
  "alphacep/vosk-model-ru": _get_russian_pre_trained_model,
1683
  "alphacep/vosk-model-small-ru": _get_russian_pre_trained_model,
1684
  }