sanchit-gandhi committed
Commit 4ee7109
1 Parent(s): 62bd796

Push to Hub

This view is limited to 50 files because it contains too many changes. See the raw diff for the complete change set.
Files changed (50)
  1. .gitattributes +1 -0
  2. __pycache__/process_asr_text_tokenizer.cpython-39.pyc +0 -0
  3. all_results.json +21 -0
  4. check_bnb_install.py +19 -0
  5. checkpoint-100000/optimizer.pt +3 -0
  6. checkpoint-100000/rng_state.pth +3 -0
  7. checkpoint-100000/scheduler.pt +3 -0
  8. checkpoint-100000/stt_en_conformer_transducer_xlarge.nemo +3 -0
  9. checkpoint-100000/trainer_state.json +0 -0
  10. checkpoint-100000/training_args.bin +3 -0
  11. checkpoint-20000/optimizer.pt +3 -0
  12. checkpoint-20000/rng_state.pth +3 -0
  13. checkpoint-20000/scheduler.pt +3 -0
  14. checkpoint-20000/stt_en_conformer_transducer_xlarge.nemo +3 -0
  15. checkpoint-20000/trainer_state.json +2425 -0
  16. checkpoint-20000/training_args.bin +3 -0
  17. checkpoint-40000/optimizer.pt +3 -0
  18. checkpoint-40000/rng_state.pth +3 -0
  19. checkpoint-40000/scheduler.pt +3 -0
  20. checkpoint-40000/stt_en_conformer_transducer_xlarge.nemo +3 -0
  21. checkpoint-40000/trainer_state.json +0 -0
  22. checkpoint-40000/training_args.bin +3 -0
  23. checkpoint-60000/optimizer.pt +3 -0
  24. checkpoint-60000/rng_state.pth +3 -0
  25. checkpoint-60000/scheduler.pt +3 -0
  26. checkpoint-60000/stt_en_conformer_transducer_xlarge.nemo +3 -0
  27. checkpoint-60000/trainer_state.json +0 -0
  28. checkpoint-60000/training_args.bin +3 -0
  29. checkpoint-80000/optimizer.pt +3 -0
  30. checkpoint-80000/rng_state.pth +3 -0
  31. checkpoint-80000/scheduler.pt +3 -0
  32. checkpoint-80000/stt_en_conformer_transducer_xlarge.nemo +3 -0
  33. checkpoint-80000/trainer_state.json +0 -0
  34. checkpoint-80000/training_args.bin +3 -0
  35. conf/conformer_transducer_bpe_dummy.yaml +192 -0
  36. conf/conformer_transducer_bpe_large.yaml +212 -0
  37. conf/conformer_transducer_bpe_xlarge.yaml +196 -0
  38. conf/contextnet_rnnt.yaml +472 -0
  39. conf/contextnet_rnnt_dummy.yaml +197 -0
  40. eval_results.json +9 -0
  41. models/__init__.py +1 -0
  42. models/__pycache__/__init__.cpython-39.pyc +0 -0
  43. models/__pycache__/modeling_rnnt.cpython-39.pyc +0 -0
  44. models/modeling_rnnt.py +115 -0
  45. process_asr_text_tokenizer.py +221 -0
  46. requirements.txt +7 -0
  47. run_ami.sh +38 -0
  48. run_speech_recognition_rnnt.py +935 -0
  49. scripts/run_batch_size_sweep.yaml +61 -0
  50. scripts/run_common_voice_9.sh +38 -0
.gitattributes CHANGED
@@ -30,3 +30,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.nemo filter=lfs diff=lfs merge=lfs -text
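The added attribute routes *.nemo checkpoint archives through Git LFS, which is why the .nemo entries further down appear as pointer stubs rather than raw binaries. Such a rule is usually created with git-lfs rather than edited by hand; a minimal sketch from Python, assuming git and git-lfs are installed and the current directory is the repository root (the exact invocation is an assumption, not something recorded in this commit):

import subprocess

# Register *.nemo with Git LFS; this appends the filter/diff/merge line
# seen above to .gitattributes, then stages the file.
def track_nemo_files(repo_dir: str = ".") -> None:
    subprocess.run(["git", "lfs", "install"], cwd=repo_dir, check=True)
    subprocess.run(["git", "lfs", "track", "*.nemo"], cwd=repo_dir, check=True)
    subprocess.run(["git", "add", ".gitattributes"], cwd=repo_dir, check=True)

if __name__ == "__main__":
    track_nemo_files()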
__pycache__/process_asr_text_tokenizer.cpython-39.pyc ADDED
Binary file (3.95 kB).
 
all_results.json ADDED
@@ -0,0 +1,21 @@
+ {
+   "epoch": 7.38,
+   "eval_loss": 8.706663131713867,
+   "eval_runtime": 970.2156,
+   "eval_samples": 13098,
+   "eval_samples_per_second": 13.5,
+   "eval_steps_per_second": 3.376,
+   "eval_wer": 0.20430683297635546,
+   "test_cer": 0.08093431359873023,
+   "test_loss": 5.917323112487793,
+   "test_runtime": 946.7263,
+   "test_samples": 12643,
+   "test_samples_per_second": 13.354,
+   "test_steps_per_second": 3.339,
+   "test_wer": 0.17709850666607363,
+   "train_loss": 10.025987887954182,
+   "train_runtime": 56856.134,
+   "train_samples": 108449,
+   "train_samples_per_second": 14.077,
+   "train_steps_per_second": 1.76
+ }
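all_results.json aggregates the final numbers for this run: training throughput, evaluation loss, and error rates on the eval and test splits (eval_wer, test_wer, test_cer). As a reminder of what those metrics measure, here is a small, self-contained sketch that computes word and character error rate as normalised Levenshtein distances; it is purely illustrative and is not the evaluation code used by run_speech_recognition_rnnt.py.

def edit_distance(ref, hyp):
    # Single-row dynamic-programming Levenshtein distance between two sequences.
    dp = list(range(len(hyp) + 1))
    for i, r in enumerate(ref, start=1):
        prev, dp[0] = dp[0], i
        for j, h in enumerate(hyp, start=1):
            cur = dp[j]
            dp[j] = min(dp[j] + 1,         # deletion
                        dp[j - 1] + 1,     # insertion
                        prev + (r != h))   # substitution
            prev = cur
    return dp[-1]

def wer(reference: str, hypothesis: str) -> float:
    ref, hyp = reference.split(), hypothesis.split()
    return edit_distance(ref, hyp) / max(len(ref), 1)

def cer(reference: str, hypothesis: str) -> float:
    return edit_distance(list(reference), list(hypothesis)) / max(len(reference), 1)

print(wer("the cat sat on the mat", "the cat sat on mat"))  # ~0.167
print(cer("the cat sat on the mat", "the cat sat on mat"))  # ~0.182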
check_bnb_install.py ADDED
@@ -0,0 +1,19 @@
+ import bitsandbytes as bnb
+ import torch
+
+ p = torch.nn.Parameter(torch.rand(10, 10).cuda())
+ a = torch.rand(10, 10).cuda()
+
+ p1 = p.data.sum().item()
+
+ adam = bnb.optim.Adam([p])
+
+ out = a * p
+ loss = out.sum()
+ loss.backward()
+ adam.step()
+
+ p2 = p.data.sum().item()
+
+ assert p1 != p2
+ print('bnb: installed successfully!')
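The check is deliberately simple: it sums the parameter tensor before and after a single bitsandbytes Adam step on a toy problem, and the assertion only passes if the optimiser actually updated the weights, which requires a working CUDA device and a correctly loaded bitsandbytes native library. On a GPU machine it can be run directly with python check_bnb_install.py; swapping bnb.optim.Adam for an explicitly 8-bit variant such as bnb.optim.Adam8bit would exercise the quantised optimiser path, though that substitution is not part of this script.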
checkpoint-100000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:75ca838bfd7e8d7e8ebc431190243148d186a5f1ed5cd674b751f6079710ab95
+ size 5154565443
checkpoint-100000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0714299d2503f04c887174fcb2c5995d31c2a8dd3d887f5907696d7a91cbcb1a
+ size 14503
checkpoint-100000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:591846b441d543caac3afc7202fecfc43bf20ba0c611a291457e9c81cc395399
+ size 623
checkpoint-100000/stt_en_conformer_transducer_xlarge.nemo ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d4ddd41c1adabfce64125bbf639cadda2f044651386a1060440b2e49caea9f52
+ size 2577971200
checkpoint-100000/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-100000/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3b64c669f66dd7a2e54d3001ce7e31c26cc60dd58136e8ce90e6055bd0ae15eb
+ size 3503
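Every checkpoint artefact in this commit is stored as a Git LFS pointer: a three-line text stub giving the LFS spec version, the SHA-256 of the real blob, and its size in bytes, while the binary itself lives in LFS storage. A minimal sketch of parsing such a pointer in Python, assuming the repository was cloned without resolving LFS objects (for example with GIT_LFS_SKIP_SMUDGE=1) so the path still holds the stub rather than the multi-gigabyte file:

def read_lfs_pointer(path):
    # Parse a Git LFS pointer file into a dict such as
    # {"version": "https://git-lfs.github.com/spec/v1",
    #  "oid": "sha256:75ca...", "size": 5154565443}.
    fields = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            if key:
                fields[key] = value
    if "size" in fields:
        fields["size"] = int(fields["size"])
    return fields

print(read_lfs_pointer("checkpoint-100000/optimizer.pt"))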
checkpoint-20000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b3f20cc328e6cf018f92f3b71e11bf4a9364f5a247ee5d99d4a62354ede6a516
+ size 5154563651
checkpoint-20000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9fb3410dde03074fae133541463bfebd7d0708693d5ffa17edc4fe4974c0f7eb
+ size 14503
checkpoint-20000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:caeda3b27b783dbb84d9e4d82bc20bd764fb8fbed5023345d4c45d753ffa45b0
+ size 623
checkpoint-20000/stt_en_conformer_transducer_xlarge.nemo ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:06c6f31b89b77d8eaf30394215a6001e812460139f4276d335e97c10cc0b632e
+ size 2577971200
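Each stt_en_conformer_transducer_xlarge.nemo file is a complete NeMo checkpoint archive for the Conformer-Transducer XLarge model saved at that training step. A hedged sketch of loading one with the NeMo toolkit, assuming the LFS object has been pulled so the path points at the real archive (the class and method names come from the public NeMo ASR API, not from code in this repository, and the audio paths are placeholders):

import nemo.collections.asr as nemo_asr

# Restore the transducer model from the pushed .nemo archive.
model = nemo_asr.models.EncDecRNNTBPEModel.restore_from(
    restore_path="checkpoint-20000/stt_en_conformer_transducer_xlarge.nemo"
)

# Transcribe a couple of audio files.
transcripts = model.transcribe(["sample1.wav", "sample2.wav"])
print(transcripts)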
checkpoint-20000/trainer_state.json ADDED
@@ -0,0 +1,2425 @@
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.4752526370140886,
5
+ "global_step": 20000,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.0,
12
+ "learning_rate": 1e-05,
13
+ "loss": 178.9465,
14
+ "step": 50
15
+ },
16
+ {
17
+ "epoch": 0.01,
18
+ "learning_rate": 2e-05,
19
+ "loss": 164.9707,
20
+ "step": 100
21
+ },
22
+ {
23
+ "epoch": 0.01,
24
+ "learning_rate": 3e-05,
25
+ "loss": 142.2782,
26
+ "step": 150
27
+ },
28
+ {
29
+ "epoch": 0.01,
30
+ "learning_rate": 4e-05,
31
+ "loss": 121.5122,
32
+ "step": 200
33
+ },
34
+ {
35
+ "epoch": 0.02,
36
+ "learning_rate": 5e-05,
37
+ "loss": 91.8622,
38
+ "step": 250
39
+ },
40
+ {
41
+ "epoch": 0.02,
42
+ "learning_rate": 6e-05,
43
+ "loss": 82.2062,
44
+ "step": 300
45
+ },
46
+ {
47
+ "epoch": 0.03,
48
+ "learning_rate": 7e-05,
49
+ "loss": 72.6893,
50
+ "step": 350
51
+ },
52
+ {
53
+ "epoch": 0.03,
54
+ "learning_rate": 8e-05,
55
+ "loss": 71.8709,
56
+ "step": 400
57
+ },
58
+ {
59
+ "epoch": 0.03,
60
+ "learning_rate": 9e-05,
61
+ "loss": 69.9995,
62
+ "step": 450
63
+ },
64
+ {
65
+ "epoch": 0.04,
66
+ "learning_rate": 0.0001,
67
+ "loss": 70.6458,
68
+ "step": 500
69
+ },
70
+ {
71
+ "epoch": 0.04,
72
+ "learning_rate": 9.994977448744865e-05,
73
+ "loss": 73.9929,
74
+ "step": 550
75
+ },
76
+ {
77
+ "epoch": 0.04,
78
+ "learning_rate": 9.989954897489729e-05,
79
+ "loss": 66.52,
80
+ "step": 600
81
+ },
82
+ {
83
+ "epoch": 0.05,
84
+ "learning_rate": 9.984932346234594e-05,
85
+ "loss": 65.8947,
86
+ "step": 650
87
+ },
88
+ {
89
+ "epoch": 0.05,
90
+ "learning_rate": 9.979909794979458e-05,
91
+ "loss": 62.5809,
92
+ "step": 700
93
+ },
94
+ {
95
+ "epoch": 0.06,
96
+ "learning_rate": 9.974887243724323e-05,
97
+ "loss": 61.212,
98
+ "step": 750
99
+ },
100
+ {
101
+ "epoch": 0.06,
102
+ "learning_rate": 9.969864692469187e-05,
103
+ "loss": 68.2408,
104
+ "step": 800
105
+ },
106
+ {
107
+ "epoch": 0.06,
108
+ "learning_rate": 9.964842141214051e-05,
109
+ "loss": 61.5308,
110
+ "step": 850
111
+ },
112
+ {
113
+ "epoch": 0.07,
114
+ "learning_rate": 9.959819589958916e-05,
115
+ "loss": 58.9116,
116
+ "step": 900
117
+ },
118
+ {
119
+ "epoch": 0.07,
120
+ "learning_rate": 9.95479703870378e-05,
121
+ "loss": 60.0702,
122
+ "step": 950
123
+ },
124
+ {
125
+ "epoch": 0.07,
126
+ "learning_rate": 9.949774487448646e-05,
127
+ "loss": 57.6135,
128
+ "step": 1000
129
+ },
130
+ {
131
+ "epoch": 0.08,
132
+ "learning_rate": 9.944751936193509e-05,
133
+ "loss": 50.9231,
134
+ "step": 1050
135
+ },
136
+ {
137
+ "epoch": 0.08,
138
+ "learning_rate": 9.939729384938373e-05,
139
+ "loss": 51.187,
140
+ "step": 1100
141
+ },
142
+ {
143
+ "epoch": 0.08,
144
+ "learning_rate": 9.934706833683238e-05,
145
+ "loss": 52.1127,
146
+ "step": 1150
147
+ },
148
+ {
149
+ "epoch": 0.09,
150
+ "learning_rate": 9.929684282428102e-05,
151
+ "loss": 47.4608,
152
+ "step": 1200
153
+ },
154
+ {
155
+ "epoch": 0.09,
156
+ "learning_rate": 9.924661731172968e-05,
157
+ "loss": 51.6108,
158
+ "step": 1250
159
+ },
160
+ {
161
+ "epoch": 0.1,
162
+ "learning_rate": 9.919639179917831e-05,
163
+ "loss": 46.5874,
164
+ "step": 1300
165
+ },
166
+ {
167
+ "epoch": 0.1,
168
+ "learning_rate": 9.914616628662697e-05,
169
+ "loss": 41.4706,
170
+ "step": 1350
171
+ },
172
+ {
173
+ "epoch": 0.1,
174
+ "learning_rate": 9.90959407740756e-05,
175
+ "loss": 43.7544,
176
+ "step": 1400
177
+ },
178
+ {
179
+ "epoch": 0.11,
180
+ "learning_rate": 9.904571526152426e-05,
181
+ "loss": 44.6039,
182
+ "step": 1450
183
+ },
184
+ {
185
+ "epoch": 0.11,
186
+ "learning_rate": 9.899548974897289e-05,
187
+ "loss": 41.4384,
188
+ "step": 1500
189
+ },
190
+ {
191
+ "epoch": 0.11,
192
+ "learning_rate": 9.894526423642154e-05,
193
+ "loss": 42.8289,
194
+ "step": 1550
195
+ },
196
+ {
197
+ "epoch": 0.12,
198
+ "learning_rate": 9.889503872387019e-05,
199
+ "loss": 39.9726,
200
+ "step": 1600
201
+ },
202
+ {
203
+ "epoch": 0.12,
204
+ "learning_rate": 9.884481321131882e-05,
205
+ "loss": 43.9533,
206
+ "step": 1650
207
+ },
208
+ {
209
+ "epoch": 0.13,
210
+ "learning_rate": 9.879458769876748e-05,
211
+ "loss": 38.7605,
212
+ "step": 1700
213
+ },
214
+ {
215
+ "epoch": 0.13,
216
+ "learning_rate": 9.87443621862161e-05,
217
+ "loss": 39.5425,
218
+ "step": 1750
219
+ },
220
+ {
221
+ "epoch": 0.13,
222
+ "learning_rate": 9.869413667366476e-05,
223
+ "loss": 37.588,
224
+ "step": 1800
225
+ },
226
+ {
227
+ "epoch": 0.14,
228
+ "learning_rate": 9.86439111611134e-05,
229
+ "loss": 39.7744,
230
+ "step": 1850
231
+ },
232
+ {
233
+ "epoch": 0.14,
234
+ "learning_rate": 9.859368564856205e-05,
235
+ "loss": 38.2154,
236
+ "step": 1900
237
+ },
238
+ {
239
+ "epoch": 0.14,
240
+ "learning_rate": 9.85434601360107e-05,
241
+ "loss": 35.0806,
242
+ "step": 1950
243
+ },
244
+ {
245
+ "epoch": 0.15,
246
+ "learning_rate": 9.849323462345934e-05,
247
+ "loss": 39.061,
248
+ "step": 2000
249
+ },
250
+ {
251
+ "epoch": 0.15,
252
+ "learning_rate": 9.844300911090798e-05,
253
+ "loss": 35.1544,
254
+ "step": 2050
255
+ },
256
+ {
257
+ "epoch": 0.15,
258
+ "learning_rate": 9.839278359835663e-05,
259
+ "loss": 38.123,
260
+ "step": 2100
261
+ },
262
+ {
263
+ "epoch": 0.16,
264
+ "learning_rate": 9.834255808580527e-05,
265
+ "loss": 33.1144,
266
+ "step": 2150
267
+ },
268
+ {
269
+ "epoch": 0.16,
270
+ "learning_rate": 9.829233257325392e-05,
271
+ "loss": 34.3476,
272
+ "step": 2200
273
+ },
274
+ {
275
+ "epoch": 0.17,
276
+ "learning_rate": 9.824210706070256e-05,
277
+ "loss": 29.5665,
278
+ "step": 2250
279
+ },
280
+ {
281
+ "epoch": 0.17,
282
+ "learning_rate": 9.81918815481512e-05,
283
+ "loss": 35.8756,
284
+ "step": 2300
285
+ },
286
+ {
287
+ "epoch": 0.17,
288
+ "learning_rate": 9.814165603559985e-05,
289
+ "loss": 37.2579,
290
+ "step": 2350
291
+ },
292
+ {
293
+ "epoch": 0.18,
294
+ "learning_rate": 9.809143052304849e-05,
295
+ "loss": 33.6245,
296
+ "step": 2400
297
+ },
298
+ {
299
+ "epoch": 0.18,
300
+ "learning_rate": 9.804120501049714e-05,
301
+ "loss": 35.6543,
302
+ "step": 2450
303
+ },
304
+ {
305
+ "epoch": 0.18,
306
+ "learning_rate": 9.799097949794578e-05,
307
+ "loss": 36.7847,
308
+ "step": 2500
309
+ },
310
+ {
311
+ "epoch": 0.19,
312
+ "learning_rate": 9.794075398539442e-05,
313
+ "loss": 33.463,
314
+ "step": 2550
315
+ },
316
+ {
317
+ "epoch": 0.19,
318
+ "learning_rate": 9.789052847284307e-05,
319
+ "loss": 32.2215,
320
+ "step": 2600
321
+ },
322
+ {
323
+ "epoch": 0.2,
324
+ "learning_rate": 9.784030296029171e-05,
325
+ "loss": 33.4301,
326
+ "step": 2650
327
+ },
328
+ {
329
+ "epoch": 0.2,
330
+ "learning_rate": 9.779007744774036e-05,
331
+ "loss": 29.9579,
332
+ "step": 2700
333
+ },
334
+ {
335
+ "epoch": 0.2,
336
+ "learning_rate": 9.773985193518901e-05,
337
+ "loss": 31.9141,
338
+ "step": 2750
339
+ },
340
+ {
341
+ "epoch": 0.21,
342
+ "learning_rate": 9.768962642263764e-05,
343
+ "loss": 33.2049,
344
+ "step": 2800
345
+ },
346
+ {
347
+ "epoch": 0.21,
348
+ "learning_rate": 9.763940091008629e-05,
349
+ "loss": 32.8774,
350
+ "step": 2850
351
+ },
352
+ {
353
+ "epoch": 0.21,
354
+ "learning_rate": 9.758917539753493e-05,
355
+ "loss": 29.0858,
356
+ "step": 2900
357
+ },
358
+ {
359
+ "epoch": 0.22,
360
+ "learning_rate": 9.753894988498358e-05,
361
+ "loss": 30.1145,
362
+ "step": 2950
363
+ },
364
+ {
365
+ "epoch": 0.22,
366
+ "learning_rate": 9.748872437243222e-05,
367
+ "loss": 27.6986,
368
+ "step": 3000
369
+ },
370
+ {
371
+ "epoch": 0.22,
372
+ "learning_rate": 9.743849885988087e-05,
373
+ "loss": 31.7807,
374
+ "step": 3050
375
+ },
376
+ {
377
+ "epoch": 0.23,
378
+ "learning_rate": 9.738827334732952e-05,
379
+ "loss": 30.5108,
380
+ "step": 3100
381
+ },
382
+ {
383
+ "epoch": 0.23,
384
+ "learning_rate": 9.733804783477815e-05,
385
+ "loss": 31.0909,
386
+ "step": 3150
387
+ },
388
+ {
389
+ "epoch": 0.24,
390
+ "learning_rate": 9.728782232222681e-05,
391
+ "loss": 27.9057,
392
+ "step": 3200
393
+ },
394
+ {
395
+ "epoch": 0.24,
396
+ "learning_rate": 9.723759680967544e-05,
397
+ "loss": 29.7323,
398
+ "step": 3250
399
+ },
400
+ {
401
+ "epoch": 0.24,
402
+ "learning_rate": 9.71873712971241e-05,
403
+ "loss": 29.7527,
404
+ "step": 3300
405
+ },
406
+ {
407
+ "epoch": 0.25,
408
+ "learning_rate": 9.713714578457273e-05,
409
+ "loss": 29.1442,
410
+ "step": 3350
411
+ },
412
+ {
413
+ "epoch": 0.25,
414
+ "learning_rate": 9.708692027202137e-05,
415
+ "loss": 30.8906,
416
+ "step": 3400
417
+ },
418
+ {
419
+ "epoch": 0.25,
420
+ "learning_rate": 9.703669475947003e-05,
421
+ "loss": 26.8419,
422
+ "step": 3450
423
+ },
424
+ {
425
+ "epoch": 0.26,
426
+ "learning_rate": 9.698646924691866e-05,
427
+ "loss": 29.2181,
428
+ "step": 3500
429
+ },
430
+ {
431
+ "epoch": 0.26,
432
+ "learning_rate": 9.693624373436732e-05,
433
+ "loss": 27.6549,
434
+ "step": 3550
435
+ },
436
+ {
437
+ "epoch": 0.27,
438
+ "learning_rate": 9.688601822181595e-05,
439
+ "loss": 34.0701,
440
+ "step": 3600
441
+ },
442
+ {
443
+ "epoch": 0.27,
444
+ "learning_rate": 9.683579270926461e-05,
445
+ "loss": 24.7487,
446
+ "step": 3650
447
+ },
448
+ {
449
+ "epoch": 0.27,
450
+ "learning_rate": 9.678556719671325e-05,
451
+ "loss": 30.0266,
452
+ "step": 3700
453
+ },
454
+ {
455
+ "epoch": 0.28,
456
+ "learning_rate": 9.67353416841619e-05,
457
+ "loss": 25.5011,
458
+ "step": 3750
459
+ },
460
+ {
461
+ "epoch": 0.28,
462
+ "learning_rate": 9.668511617161054e-05,
463
+ "loss": 26.1437,
464
+ "step": 3800
465
+ },
466
+ {
467
+ "epoch": 0.28,
468
+ "learning_rate": 9.663489065905918e-05,
469
+ "loss": 23.2303,
470
+ "step": 3850
471
+ },
472
+ {
473
+ "epoch": 0.29,
474
+ "learning_rate": 9.658466514650783e-05,
475
+ "loss": 26.357,
476
+ "step": 3900
477
+ },
478
+ {
479
+ "epoch": 0.29,
480
+ "learning_rate": 9.653443963395646e-05,
481
+ "loss": 27.2201,
482
+ "step": 3950
483
+ },
484
+ {
485
+ "epoch": 0.3,
486
+ "learning_rate": 9.648421412140512e-05,
487
+ "loss": 25.5695,
488
+ "step": 4000
489
+ },
490
+ {
491
+ "epoch": 0.3,
492
+ "learning_rate": 9.643398860885376e-05,
493
+ "loss": 24.8346,
494
+ "step": 4050
495
+ },
496
+ {
497
+ "epoch": 0.3,
498
+ "learning_rate": 9.63837630963024e-05,
499
+ "loss": 22.3957,
500
+ "step": 4100
501
+ },
502
+ {
503
+ "epoch": 0.31,
504
+ "learning_rate": 9.633353758375105e-05,
505
+ "loss": 24.9532,
506
+ "step": 4150
507
+ },
508
+ {
509
+ "epoch": 0.31,
510
+ "learning_rate": 9.628331207119969e-05,
511
+ "loss": 23.1574,
512
+ "step": 4200
513
+ },
514
+ {
515
+ "epoch": 0.31,
516
+ "learning_rate": 9.623308655864834e-05,
517
+ "loss": 23.7018,
518
+ "step": 4250
519
+ },
520
+ {
521
+ "epoch": 0.32,
522
+ "learning_rate": 9.618286104609698e-05,
523
+ "loss": 25.1433,
524
+ "step": 4300
525
+ },
526
+ {
527
+ "epoch": 0.32,
528
+ "learning_rate": 9.613263553354562e-05,
529
+ "loss": 25.0571,
530
+ "step": 4350
531
+ },
532
+ {
533
+ "epoch": 0.32,
534
+ "learning_rate": 9.608241002099427e-05,
535
+ "loss": 24.2231,
536
+ "step": 4400
537
+ },
538
+ {
539
+ "epoch": 0.33,
540
+ "learning_rate": 9.603218450844291e-05,
541
+ "loss": 23.0983,
542
+ "step": 4450
543
+ },
544
+ {
545
+ "epoch": 0.33,
546
+ "learning_rate": 9.598195899589156e-05,
547
+ "loss": 25.0078,
548
+ "step": 4500
549
+ },
550
+ {
551
+ "epoch": 0.34,
552
+ "learning_rate": 9.59317334833402e-05,
553
+ "loss": 20.6933,
554
+ "step": 4550
555
+ },
556
+ {
557
+ "epoch": 0.34,
558
+ "learning_rate": 9.588150797078884e-05,
559
+ "loss": 23.6196,
560
+ "step": 4600
561
+ },
562
+ {
563
+ "epoch": 0.34,
564
+ "learning_rate": 9.583128245823749e-05,
565
+ "loss": 25.2331,
566
+ "step": 4650
567
+ },
568
+ {
569
+ "epoch": 0.35,
570
+ "learning_rate": 9.578105694568613e-05,
571
+ "loss": 24.7932,
572
+ "step": 4700
573
+ },
574
+ {
575
+ "epoch": 0.35,
576
+ "learning_rate": 9.573083143313478e-05,
577
+ "loss": 24.3586,
578
+ "step": 4750
579
+ },
580
+ {
581
+ "epoch": 0.35,
582
+ "learning_rate": 9.568060592058342e-05,
583
+ "loss": 22.7161,
584
+ "step": 4800
585
+ },
586
+ {
587
+ "epoch": 0.36,
588
+ "learning_rate": 9.563038040803208e-05,
589
+ "loss": 22.4188,
590
+ "step": 4850
591
+ },
592
+ {
593
+ "epoch": 0.36,
594
+ "learning_rate": 9.558015489548071e-05,
595
+ "loss": 21.6516,
596
+ "step": 4900
597
+ },
598
+ {
599
+ "epoch": 0.37,
600
+ "learning_rate": 9.552992938292937e-05,
601
+ "loss": 21.78,
602
+ "step": 4950
603
+ },
604
+ {
605
+ "epoch": 0.37,
606
+ "learning_rate": 9.5479703870378e-05,
607
+ "loss": 21.0172,
608
+ "step": 5000
609
+ },
610
+ {
611
+ "epoch": 0.37,
612
+ "learning_rate": 9.542947835782665e-05,
613
+ "loss": 22.4624,
614
+ "step": 5050
615
+ },
616
+ {
617
+ "epoch": 0.38,
618
+ "learning_rate": 9.537925284527528e-05,
619
+ "loss": 23.6615,
620
+ "step": 5100
621
+ },
622
+ {
623
+ "epoch": 0.38,
624
+ "learning_rate": 9.532902733272393e-05,
625
+ "loss": 21.8091,
626
+ "step": 5150
627
+ },
628
+ {
629
+ "epoch": 0.38,
630
+ "learning_rate": 9.527880182017259e-05,
631
+ "loss": 21.4173,
632
+ "step": 5200
633
+ },
634
+ {
635
+ "epoch": 0.39,
636
+ "learning_rate": 9.522857630762122e-05,
637
+ "loss": 20.5415,
638
+ "step": 5250
639
+ },
640
+ {
641
+ "epoch": 0.39,
642
+ "learning_rate": 9.517835079506987e-05,
643
+ "loss": 21.0639,
644
+ "step": 5300
645
+ },
646
+ {
647
+ "epoch": 0.39,
648
+ "learning_rate": 9.51281252825185e-05,
649
+ "loss": 21.6078,
650
+ "step": 5350
651
+ },
652
+ {
653
+ "epoch": 0.4,
654
+ "learning_rate": 9.507789976996716e-05,
655
+ "loss": 19.4142,
656
+ "step": 5400
657
+ },
658
+ {
659
+ "epoch": 0.4,
660
+ "learning_rate": 9.50276742574158e-05,
661
+ "loss": 20.2504,
662
+ "step": 5450
663
+ },
664
+ {
665
+ "epoch": 0.41,
666
+ "learning_rate": 9.497744874486445e-05,
667
+ "loss": 23.8683,
668
+ "step": 5500
669
+ },
670
+ {
671
+ "epoch": 0.41,
672
+ "learning_rate": 9.49272232323131e-05,
673
+ "loss": 19.7559,
674
+ "step": 5550
675
+ },
676
+ {
677
+ "epoch": 0.41,
678
+ "learning_rate": 9.487699771976174e-05,
679
+ "loss": 21.1743,
680
+ "step": 5600
681
+ },
682
+ {
683
+ "epoch": 0.42,
684
+ "learning_rate": 9.482677220721038e-05,
685
+ "loss": 21.1908,
686
+ "step": 5650
687
+ },
688
+ {
689
+ "epoch": 0.42,
690
+ "learning_rate": 9.477654669465901e-05,
691
+ "loss": 20.9591,
692
+ "step": 5700
693
+ },
694
+ {
695
+ "epoch": 0.42,
696
+ "learning_rate": 9.472632118210767e-05,
697
+ "loss": 20.9036,
698
+ "step": 5750
699
+ },
700
+ {
701
+ "epoch": 0.43,
702
+ "learning_rate": 9.46760956695563e-05,
703
+ "loss": 22.249,
704
+ "step": 5800
705
+ },
706
+ {
707
+ "epoch": 0.43,
708
+ "learning_rate": 9.462587015700496e-05,
709
+ "loss": 19.1093,
710
+ "step": 5850
711
+ },
712
+ {
713
+ "epoch": 0.44,
714
+ "learning_rate": 9.45756446444536e-05,
715
+ "loss": 21.2714,
716
+ "step": 5900
717
+ },
718
+ {
719
+ "epoch": 0.44,
720
+ "learning_rate": 9.452541913190225e-05,
721
+ "loss": 21.3794,
722
+ "step": 5950
723
+ },
724
+ {
725
+ "epoch": 0.44,
726
+ "learning_rate": 9.447519361935089e-05,
727
+ "loss": 20.0326,
728
+ "step": 6000
729
+ },
730
+ {
731
+ "epoch": 0.45,
732
+ "learning_rate": 9.442496810679954e-05,
733
+ "loss": 19.8004,
734
+ "step": 6050
735
+ },
736
+ {
737
+ "epoch": 0.45,
738
+ "learning_rate": 9.437474259424818e-05,
739
+ "loss": 19.0229,
740
+ "step": 6100
741
+ },
742
+ {
743
+ "epoch": 0.45,
744
+ "learning_rate": 9.432451708169682e-05,
745
+ "loss": 17.6587,
746
+ "step": 6150
747
+ },
748
+ {
749
+ "epoch": 0.46,
750
+ "learning_rate": 9.427429156914547e-05,
751
+ "loss": 21.9247,
752
+ "step": 6200
753
+ },
754
+ {
755
+ "epoch": 0.46,
756
+ "learning_rate": 9.422406605659411e-05,
757
+ "loss": 19.743,
758
+ "step": 6250
759
+ },
760
+ {
761
+ "epoch": 0.46,
762
+ "learning_rate": 9.417384054404276e-05,
763
+ "loss": 22.9746,
764
+ "step": 6300
765
+ },
766
+ {
767
+ "epoch": 0.47,
768
+ "learning_rate": 9.41236150314914e-05,
769
+ "loss": 19.6693,
770
+ "step": 6350
771
+ },
772
+ {
773
+ "epoch": 0.47,
774
+ "learning_rate": 9.407338951894004e-05,
775
+ "loss": 19.1141,
776
+ "step": 6400
777
+ },
778
+ {
779
+ "epoch": 0.48,
780
+ "learning_rate": 9.402316400638869e-05,
781
+ "loss": 18.3847,
782
+ "step": 6450
783
+ },
784
+ {
785
+ "epoch": 0.48,
786
+ "learning_rate": 9.397293849383733e-05,
787
+ "loss": 18.9357,
788
+ "step": 6500
789
+ },
790
+ {
791
+ "epoch": 0.48,
792
+ "learning_rate": 9.392271298128598e-05,
793
+ "loss": 18.9316,
794
+ "step": 6550
795
+ },
796
+ {
797
+ "epoch": 0.49,
798
+ "learning_rate": 9.387248746873462e-05,
799
+ "loss": 20.9141,
800
+ "step": 6600
801
+ },
802
+ {
803
+ "epoch": 0.49,
804
+ "learning_rate": 9.382226195618326e-05,
805
+ "loss": 18.7472,
806
+ "step": 6650
807
+ },
808
+ {
809
+ "epoch": 0.49,
810
+ "learning_rate": 9.377203644363192e-05,
811
+ "loss": 18.8577,
812
+ "step": 6700
813
+ },
814
+ {
815
+ "epoch": 0.5,
816
+ "learning_rate": 9.372181093108055e-05,
817
+ "loss": 17.8061,
818
+ "step": 6750
819
+ },
820
+ {
821
+ "epoch": 0.5,
822
+ "learning_rate": 9.36715854185292e-05,
823
+ "loss": 19.4687,
824
+ "step": 6800
825
+ },
826
+ {
827
+ "epoch": 0.51,
828
+ "learning_rate": 9.362135990597784e-05,
829
+ "loss": 19.5103,
830
+ "step": 6850
831
+ },
832
+ {
833
+ "epoch": 0.51,
834
+ "learning_rate": 9.357113439342648e-05,
835
+ "loss": 18.5319,
836
+ "step": 6900
837
+ },
838
+ {
839
+ "epoch": 0.51,
840
+ "learning_rate": 9.352090888087514e-05,
841
+ "loss": 20.16,
842
+ "step": 6950
843
+ },
844
+ {
845
+ "epoch": 0.52,
846
+ "learning_rate": 9.347068336832377e-05,
847
+ "loss": 18.1913,
848
+ "step": 7000
849
+ },
850
+ {
851
+ "epoch": 0.52,
852
+ "learning_rate": 9.342045785577243e-05,
853
+ "loss": 21.341,
854
+ "step": 7050
855
+ },
856
+ {
857
+ "epoch": 0.52,
858
+ "learning_rate": 9.337023234322106e-05,
859
+ "loss": 16.7701,
860
+ "step": 7100
861
+ },
862
+ {
863
+ "epoch": 0.53,
864
+ "learning_rate": 9.332000683066972e-05,
865
+ "loss": 18.045,
866
+ "step": 7150
867
+ },
868
+ {
869
+ "epoch": 0.53,
870
+ "learning_rate": 9.326978131811835e-05,
871
+ "loss": 16.0393,
872
+ "step": 7200
873
+ },
874
+ {
875
+ "epoch": 0.53,
876
+ "learning_rate": 9.3219555805567e-05,
877
+ "loss": 17.4833,
878
+ "step": 7250
879
+ },
880
+ {
881
+ "epoch": 0.54,
882
+ "learning_rate": 9.316933029301565e-05,
883
+ "loss": 17.3978,
884
+ "step": 7300
885
+ },
886
+ {
887
+ "epoch": 0.54,
888
+ "learning_rate": 9.31191047804643e-05,
889
+ "loss": 18.2649,
890
+ "step": 7350
891
+ },
892
+ {
893
+ "epoch": 0.55,
894
+ "learning_rate": 9.306887926791294e-05,
895
+ "loss": 16.3891,
896
+ "step": 7400
897
+ },
898
+ {
899
+ "epoch": 0.55,
900
+ "learning_rate": 9.301865375536157e-05,
901
+ "loss": 21.4399,
902
+ "step": 7450
903
+ },
904
+ {
905
+ "epoch": 0.55,
906
+ "learning_rate": 9.296842824281023e-05,
907
+ "loss": 16.3082,
908
+ "step": 7500
909
+ },
910
+ {
911
+ "epoch": 0.56,
912
+ "learning_rate": 9.291820273025886e-05,
913
+ "loss": 14.8713,
914
+ "step": 7550
915
+ },
916
+ {
917
+ "epoch": 0.56,
918
+ "learning_rate": 9.286797721770751e-05,
919
+ "loss": 16.3099,
920
+ "step": 7600
921
+ },
922
+ {
923
+ "epoch": 0.56,
924
+ "learning_rate": 9.281775170515616e-05,
925
+ "loss": 17.8771,
926
+ "step": 7650
927
+ },
928
+ {
929
+ "epoch": 0.57,
930
+ "learning_rate": 9.27675261926048e-05,
931
+ "loss": 17.1421,
932
+ "step": 7700
933
+ },
934
+ {
935
+ "epoch": 0.57,
936
+ "learning_rate": 9.271730068005345e-05,
937
+ "loss": 16.6478,
938
+ "step": 7750
939
+ },
940
+ {
941
+ "epoch": 0.58,
942
+ "learning_rate": 9.266707516750209e-05,
943
+ "loss": 15.3247,
944
+ "step": 7800
945
+ },
946
+ {
947
+ "epoch": 0.58,
948
+ "learning_rate": 9.261684965495073e-05,
949
+ "loss": 17.6577,
950
+ "step": 7850
951
+ },
952
+ {
953
+ "epoch": 0.58,
954
+ "learning_rate": 9.256662414239938e-05,
955
+ "loss": 18.8549,
956
+ "step": 7900
957
+ },
958
+ {
959
+ "epoch": 0.59,
960
+ "learning_rate": 9.251639862984802e-05,
961
+ "loss": 17.4187,
962
+ "step": 7950
963
+ },
964
+ {
965
+ "epoch": 0.59,
966
+ "learning_rate": 9.246617311729667e-05,
967
+ "loss": 15.6643,
968
+ "step": 8000
969
+ },
970
+ {
971
+ "epoch": 0.59,
972
+ "learning_rate": 9.241594760474531e-05,
973
+ "loss": 17.1987,
974
+ "step": 8050
975
+ },
976
+ {
977
+ "epoch": 0.6,
978
+ "learning_rate": 9.236572209219396e-05,
979
+ "loss": 18.1712,
980
+ "step": 8100
981
+ },
982
+ {
983
+ "epoch": 0.6,
984
+ "learning_rate": 9.23154965796426e-05,
985
+ "loss": 15.8015,
986
+ "step": 8150
987
+ },
988
+ {
989
+ "epoch": 0.6,
990
+ "learning_rate": 9.226527106709124e-05,
991
+ "loss": 19.064,
992
+ "step": 8200
993
+ },
994
+ {
995
+ "epoch": 0.61,
996
+ "learning_rate": 9.221504555453989e-05,
997
+ "loss": 18.2748,
998
+ "step": 8250
999
+ },
1000
+ {
1001
+ "epoch": 0.61,
1002
+ "learning_rate": 9.216482004198853e-05,
1003
+ "loss": 15.0679,
1004
+ "step": 8300
1005
+ },
1006
+ {
1007
+ "epoch": 0.62,
1008
+ "learning_rate": 9.211459452943718e-05,
1009
+ "loss": 17.995,
1010
+ "step": 8350
1011
+ },
1012
+ {
1013
+ "epoch": 0.62,
1014
+ "learning_rate": 9.206436901688582e-05,
1015
+ "loss": 17.467,
1016
+ "step": 8400
1017
+ },
1018
+ {
1019
+ "epoch": 0.62,
1020
+ "learning_rate": 9.201414350433448e-05,
1021
+ "loss": 18.6665,
1022
+ "step": 8450
1023
+ },
1024
+ {
1025
+ "epoch": 0.63,
1026
+ "learning_rate": 9.196391799178311e-05,
1027
+ "loss": 17.2848,
1028
+ "step": 8500
1029
+ },
1030
+ {
1031
+ "epoch": 0.63,
1032
+ "learning_rate": 9.191369247923175e-05,
1033
+ "loss": 14.4767,
1034
+ "step": 8550
1035
+ },
1036
+ {
1037
+ "epoch": 0.63,
1038
+ "learning_rate": 9.18634669666804e-05,
1039
+ "loss": 17.5444,
1040
+ "step": 8600
1041
+ },
1042
+ {
1043
+ "epoch": 0.64,
1044
+ "learning_rate": 9.181324145412904e-05,
1045
+ "loss": 14.4661,
1046
+ "step": 8650
1047
+ },
1048
+ {
1049
+ "epoch": 0.64,
1050
+ "learning_rate": 9.176301594157768e-05,
1051
+ "loss": 16.3339,
1052
+ "step": 8700
1053
+ },
1054
+ {
1055
+ "epoch": 0.65,
1056
+ "learning_rate": 9.171279042902633e-05,
1057
+ "loss": 17.5122,
1058
+ "step": 8750
1059
+ },
1060
+ {
1061
+ "epoch": 0.65,
1062
+ "learning_rate": 9.166256491647499e-05,
1063
+ "loss": 16.7631,
1064
+ "step": 8800
1065
+ },
1066
+ {
1067
+ "epoch": 0.65,
1068
+ "learning_rate": 9.161233940392362e-05,
1069
+ "loss": 16.5193,
1070
+ "step": 8850
1071
+ },
1072
+ {
1073
+ "epoch": 0.66,
1074
+ "learning_rate": 9.156211389137227e-05,
1075
+ "loss": 17.8364,
1076
+ "step": 8900
1077
+ },
1078
+ {
1079
+ "epoch": 0.66,
1080
+ "learning_rate": 9.15118883788209e-05,
1081
+ "loss": 16.2916,
1082
+ "step": 8950
1083
+ },
1084
+ {
1085
+ "epoch": 0.66,
1086
+ "learning_rate": 9.146166286626956e-05,
1087
+ "loss": 14.1719,
1088
+ "step": 9000
1089
+ },
1090
+ {
1091
+ "epoch": 0.67,
1092
+ "learning_rate": 9.141143735371819e-05,
1093
+ "loss": 18.2987,
1094
+ "step": 9050
1095
+ },
1096
+ {
1097
+ "epoch": 0.67,
1098
+ "learning_rate": 9.136121184116684e-05,
1099
+ "loss": 17.4248,
1100
+ "step": 9100
1101
+ },
1102
+ {
1103
+ "epoch": 0.67,
1104
+ "learning_rate": 9.13109863286155e-05,
1105
+ "loss": 16.1862,
1106
+ "step": 9150
1107
+ },
1108
+ {
1109
+ "epoch": 0.68,
1110
+ "learning_rate": 9.126076081606412e-05,
1111
+ "loss": 16.3134,
1112
+ "step": 9200
1113
+ },
1114
+ {
1115
+ "epoch": 0.68,
1116
+ "learning_rate": 9.121053530351278e-05,
1117
+ "loss": 14.9158,
1118
+ "step": 9250
1119
+ },
1120
+ {
1121
+ "epoch": 0.69,
1122
+ "learning_rate": 9.116030979096141e-05,
1123
+ "loss": 15.2504,
1124
+ "step": 9300
1125
+ },
1126
+ {
1127
+ "epoch": 0.69,
1128
+ "learning_rate": 9.111008427841007e-05,
1129
+ "loss": 14.1967,
1130
+ "step": 9350
1131
+ },
1132
+ {
1133
+ "epoch": 0.69,
1134
+ "learning_rate": 9.105985876585871e-05,
1135
+ "loss": 17.3165,
1136
+ "step": 9400
1137
+ },
1138
+ {
1139
+ "epoch": 0.7,
1140
+ "learning_rate": 9.100963325330736e-05,
1141
+ "loss": 14.5912,
1142
+ "step": 9450
1143
+ },
1144
+ {
1145
+ "epoch": 0.7,
1146
+ "learning_rate": 9.0959407740756e-05,
1147
+ "loss": 17.5593,
1148
+ "step": 9500
1149
+ },
1150
+ {
1151
+ "epoch": 0.7,
1152
+ "learning_rate": 9.090918222820465e-05,
1153
+ "loss": 16.3421,
1154
+ "step": 9550
1155
+ },
1156
+ {
1157
+ "epoch": 0.71,
1158
+ "learning_rate": 9.085895671565329e-05,
1159
+ "loss": 16.2821,
1160
+ "step": 9600
1161
+ },
1162
+ {
1163
+ "epoch": 0.71,
1164
+ "learning_rate": 9.080873120310192e-05,
1165
+ "loss": 16.4985,
1166
+ "step": 9650
1167
+ },
1168
+ {
1169
+ "epoch": 0.72,
1170
+ "learning_rate": 9.075850569055058e-05,
1171
+ "loss": 16.1138,
1172
+ "step": 9700
1173
+ },
1174
+ {
1175
+ "epoch": 0.72,
1176
+ "learning_rate": 9.070828017799922e-05,
1177
+ "loss": 16.3997,
1178
+ "step": 9750
1179
+ },
1180
+ {
1181
+ "epoch": 0.72,
1182
+ "learning_rate": 9.065805466544787e-05,
1183
+ "loss": 15.518,
1184
+ "step": 9800
1185
+ },
1186
+ {
1187
+ "epoch": 0.73,
1188
+ "learning_rate": 9.060782915289651e-05,
1189
+ "loss": 13.8424,
1190
+ "step": 9850
1191
+ },
1192
+ {
1193
+ "epoch": 0.73,
1194
+ "learning_rate": 9.055760364034515e-05,
1195
+ "loss": 15.0784,
1196
+ "step": 9900
1197
+ },
1198
+ {
1199
+ "epoch": 0.73,
1200
+ "learning_rate": 9.05073781277938e-05,
1201
+ "loss": 14.0163,
1202
+ "step": 9950
1203
+ },
1204
+ {
1205
+ "epoch": 0.74,
1206
+ "learning_rate": 9.045715261524244e-05,
1207
+ "loss": 16.7863,
1208
+ "step": 10000
1209
+ },
1210
+ {
1211
+ "epoch": 0.74,
1212
+ "learning_rate": 9.040692710269109e-05,
1213
+ "loss": 13.6715,
1214
+ "step": 10050
1215
+ },
1216
+ {
1217
+ "epoch": 0.75,
1218
+ "learning_rate": 9.035670159013973e-05,
1219
+ "loss": 15.1071,
1220
+ "step": 10100
1221
+ },
1222
+ {
1223
+ "epoch": 0.75,
1224
+ "learning_rate": 9.030647607758837e-05,
1225
+ "loss": 14.2658,
1226
+ "step": 10150
1227
+ },
1228
+ {
1229
+ "epoch": 0.75,
1230
+ "learning_rate": 9.025625056503703e-05,
1231
+ "loss": 15.1115,
1232
+ "step": 10200
1233
+ },
1234
+ {
1235
+ "epoch": 0.76,
1236
+ "learning_rate": 9.020602505248566e-05,
1237
+ "loss": 14.028,
1238
+ "step": 10250
1239
+ },
1240
+ {
1241
+ "epoch": 0.76,
1242
+ "learning_rate": 9.015579953993431e-05,
1243
+ "loss": 13.3066,
1244
+ "step": 10300
1245
+ },
1246
+ {
1247
+ "epoch": 0.76,
1248
+ "learning_rate": 9.010557402738295e-05,
1249
+ "loss": 14.1185,
1250
+ "step": 10350
1251
+ },
1252
+ {
1253
+ "epoch": 0.77,
1254
+ "learning_rate": 9.00553485148316e-05,
1255
+ "loss": 14.061,
1256
+ "step": 10400
1257
+ },
1258
+ {
1259
+ "epoch": 0.77,
1260
+ "learning_rate": 9.000512300228024e-05,
1261
+ "loss": 15.2439,
1262
+ "step": 10450
1263
+ },
1264
+ {
1265
+ "epoch": 0.77,
1266
+ "learning_rate": 8.995489748972888e-05,
1267
+ "loss": 13.3617,
1268
+ "step": 10500
1269
+ },
1270
+ {
1271
+ "epoch": 0.78,
1272
+ "learning_rate": 8.990467197717754e-05,
1273
+ "loss": 14.5514,
1274
+ "step": 10550
1275
+ },
1276
+ {
1277
+ "epoch": 0.78,
1278
+ "learning_rate": 8.985444646462617e-05,
1279
+ "loss": 15.2426,
1280
+ "step": 10600
1281
+ },
1282
+ {
1283
+ "epoch": 0.79,
1284
+ "learning_rate": 8.980422095207483e-05,
1285
+ "loss": 16.6418,
1286
+ "step": 10650
1287
+ },
1288
+ {
1289
+ "epoch": 0.79,
1290
+ "learning_rate": 8.975399543952346e-05,
1291
+ "loss": 13.3146,
1292
+ "step": 10700
1293
+ },
1294
+ {
1295
+ "epoch": 0.79,
1296
+ "learning_rate": 8.970376992697212e-05,
1297
+ "loss": 14.9333,
1298
+ "step": 10750
1299
+ },
1300
+ {
1301
+ "epoch": 0.8,
1302
+ "learning_rate": 8.965354441442075e-05,
1303
+ "loss": 14.4502,
1304
+ "step": 10800
1305
+ },
1306
+ {
1307
+ "epoch": 0.8,
1308
+ "learning_rate": 8.960331890186939e-05,
1309
+ "loss": 14.7886,
1310
+ "step": 10850
1311
+ },
1312
+ {
1313
+ "epoch": 0.8,
1314
+ "learning_rate": 8.955309338931805e-05,
1315
+ "loss": 15.0266,
1316
+ "step": 10900
1317
+ },
1318
+ {
1319
+ "epoch": 0.81,
1320
+ "learning_rate": 8.950286787676668e-05,
1321
+ "loss": 14.543,
1322
+ "step": 10950
1323
+ },
1324
+ {
1325
+ "epoch": 0.81,
1326
+ "learning_rate": 8.945264236421534e-05,
1327
+ "loss": 15.8078,
1328
+ "step": 11000
1329
+ },
1330
+ {
1331
+ "epoch": 0.82,
1332
+ "learning_rate": 8.940241685166397e-05,
1333
+ "loss": 13.6052,
1334
+ "step": 11050
1335
+ },
1336
+ {
1337
+ "epoch": 0.82,
1338
+ "learning_rate": 8.935219133911263e-05,
1339
+ "loss": 14.2995,
1340
+ "step": 11100
1341
+ },
1342
+ {
1343
+ "epoch": 0.82,
1344
+ "learning_rate": 8.930196582656126e-05,
1345
+ "loss": 15.732,
1346
+ "step": 11150
1347
+ },
1348
+ {
1349
+ "epoch": 0.83,
1350
+ "learning_rate": 8.925174031400991e-05,
1351
+ "loss": 14.0573,
1352
+ "step": 11200
1353
+ },
1354
+ {
1355
+ "epoch": 0.83,
1356
+ "learning_rate": 8.920151480145856e-05,
1357
+ "loss": 17.5941,
1358
+ "step": 11250
1359
+ },
1360
+ {
1361
+ "epoch": 0.83,
1362
+ "learning_rate": 8.91512892889072e-05,
1363
+ "loss": 14.7829,
1364
+ "step": 11300
1365
+ },
1366
+ {
1367
+ "epoch": 0.84,
1368
+ "learning_rate": 8.910106377635585e-05,
1369
+ "loss": 14.6669,
1370
+ "step": 11350
1371
+ },
1372
+ {
1373
+ "epoch": 0.84,
1374
+ "learning_rate": 8.905083826380448e-05,
1375
+ "loss": 14.3315,
1376
+ "step": 11400
1377
+ },
1378
+ {
1379
+ "epoch": 0.84,
1380
+ "learning_rate": 8.900061275125313e-05,
1381
+ "loss": 14.2639,
1382
+ "step": 11450
1383
+ },
1384
+ {
1385
+ "epoch": 0.85,
1386
+ "learning_rate": 8.895038723870176e-05,
1387
+ "loss": 14.3226,
1388
+ "step": 11500
1389
+ },
1390
+ {
1391
+ "epoch": 0.85,
1392
+ "learning_rate": 8.890016172615042e-05,
1393
+ "loss": 14.4975,
1394
+ "step": 11550
1395
+ },
1396
+ {
1397
+ "epoch": 0.86,
1398
+ "learning_rate": 8.884993621359907e-05,
1399
+ "loss": 14.8436,
1400
+ "step": 11600
1401
+ },
1402
+ {
1403
+ "epoch": 0.86,
1404
+ "learning_rate": 8.879971070104771e-05,
1405
+ "loss": 13.8481,
1406
+ "step": 11650
1407
+ },
1408
+ {
1409
+ "epoch": 0.86,
1410
+ "learning_rate": 8.874948518849635e-05,
1411
+ "loss": 12.8151,
1412
+ "step": 11700
1413
+ },
1414
+ {
1415
+ "epoch": 0.87,
1416
+ "learning_rate": 8.8699259675945e-05,
1417
+ "loss": 13.1659,
1418
+ "step": 11750
1419
+ },
1420
+ {
1421
+ "epoch": 0.87,
1422
+ "learning_rate": 8.864903416339364e-05,
1423
+ "loss": 15.0919,
1424
+ "step": 11800
1425
+ },
1426
+ {
1427
+ "epoch": 0.87,
1428
+ "learning_rate": 8.859880865084229e-05,
1429
+ "loss": 14.4382,
1430
+ "step": 11850
1431
+ },
1432
+ {
1433
+ "epoch": 0.88,
1434
+ "learning_rate": 8.854858313829093e-05,
1435
+ "loss": 14.0989,
1436
+ "step": 11900
1437
+ },
1438
+ {
1439
+ "epoch": 0.88,
1440
+ "learning_rate": 8.849835762573957e-05,
1441
+ "loss": 14.5763,
1442
+ "step": 11950
1443
+ },
1444
+ {
1445
+ "epoch": 0.89,
1446
+ "learning_rate": 8.844813211318822e-05,
1447
+ "loss": 13.4144,
1448
+ "step": 12000
1449
+ },
1450
+ {
1451
+ "epoch": 0.89,
1452
+ "learning_rate": 8.839790660063686e-05,
1453
+ "loss": 15.6018,
1454
+ "step": 12050
1455
+ },
1456
+ {
1457
+ "epoch": 0.89,
1458
+ "learning_rate": 8.83476810880855e-05,
1459
+ "loss": 14.7849,
1460
+ "step": 12100
1461
+ },
1462
+ {
1463
+ "epoch": 0.9,
1464
+ "learning_rate": 8.829745557553415e-05,
1465
+ "loss": 14.441,
1466
+ "step": 12150
1467
+ },
1468
+ {
1469
+ "epoch": 0.9,
1470
+ "learning_rate": 8.82472300629828e-05,
1471
+ "loss": 14.2135,
1472
+ "step": 12200
1473
+ },
1474
+ {
1475
+ "epoch": 0.9,
1476
+ "learning_rate": 8.819700455043144e-05,
1477
+ "loss": 17.1245,
1478
+ "step": 12250
1479
+ },
1480
+ {
1481
+ "epoch": 0.91,
1482
+ "learning_rate": 8.814677903788008e-05,
1483
+ "loss": 14.6629,
1484
+ "step": 12300
1485
+ },
1486
+ {
1487
+ "epoch": 0.91,
1488
+ "learning_rate": 8.809655352532873e-05,
1489
+ "loss": 16.6715,
1490
+ "step": 12350
1491
+ },
1492
+ {
1493
+ "epoch": 0.91,
1494
+ "learning_rate": 8.804632801277738e-05,
1495
+ "loss": 13.0133,
1496
+ "step": 12400
1497
+ },
1498
+ {
1499
+ "epoch": 0.92,
1500
+ "learning_rate": 8.799610250022601e-05,
1501
+ "loss": 14.1551,
1502
+ "step": 12450
1503
+ },
1504
+ {
1505
+ "epoch": 0.92,
1506
+ "learning_rate": 8.794587698767466e-05,
1507
+ "loss": 14.019,
1508
+ "step": 12500
1509
+ },
1510
+ {
1511
+ "epoch": 0.93,
1512
+ "learning_rate": 8.78956514751233e-05,
1513
+ "loss": 14.4279,
1514
+ "step": 12550
1515
+ },
1516
+ {
1517
+ "epoch": 0.93,
1518
+ "learning_rate": 8.784542596257195e-05,
1519
+ "loss": 12.5293,
1520
+ "step": 12600
1521
+ },
1522
+ {
1523
+ "epoch": 0.93,
1524
+ "learning_rate": 8.77952004500206e-05,
1525
+ "loss": 15.0403,
1526
+ "step": 12650
1527
+ },
1528
+ {
1529
+ "epoch": 0.94,
1530
+ "learning_rate": 8.774497493746924e-05,
1531
+ "loss": 13.8193,
1532
+ "step": 12700
1533
+ },
1534
+ {
1535
+ "epoch": 0.94,
1536
+ "learning_rate": 8.769474942491789e-05,
1537
+ "loss": 13.1564,
1538
+ "step": 12750
1539
+ },
1540
+ {
1541
+ "epoch": 0.94,
1542
+ "learning_rate": 8.764452391236652e-05,
1543
+ "loss": 14.6415,
1544
+ "step": 12800
1545
+ },
1546
+ {
1547
+ "epoch": 0.95,
1548
+ "learning_rate": 8.759429839981518e-05,
1549
+ "loss": 12.2339,
1550
+ "step": 12850
1551
+ },
1552
+ {
1553
+ "epoch": 0.95,
1554
+ "learning_rate": 8.754407288726381e-05,
1555
+ "loss": 12.1604,
1556
+ "step": 12900
1557
+ },
1558
+ {
1559
+ "epoch": 0.96,
1560
+ "learning_rate": 8.749384737471247e-05,
1561
+ "loss": 15.4939,
1562
+ "step": 12950
1563
+ },
1564
+ {
1565
+ "epoch": 0.96,
1566
+ "learning_rate": 8.744362186216111e-05,
1567
+ "loss": 13.9713,
1568
+ "step": 13000
1569
+ },
1570
+ {
1571
+ "epoch": 0.96,
1572
+ "learning_rate": 8.739339634960976e-05,
1573
+ "loss": 14.0986,
1574
+ "step": 13050
1575
+ },
1576
+ {
1577
+ "epoch": 0.97,
1578
+ "learning_rate": 8.73431708370584e-05,
1579
+ "loss": 13.6334,
1580
+ "step": 13100
1581
+ },
1582
+ {
1583
+ "epoch": 0.97,
1584
+ "learning_rate": 8.729294532450703e-05,
1585
+ "loss": 13.5201,
1586
+ "step": 13150
1587
+ },
1588
+ {
1589
+ "epoch": 0.97,
1590
+ "learning_rate": 8.724271981195569e-05,
1591
+ "loss": 14.3793,
1592
+ "step": 13200
1593
+ },
1594
+ {
1595
+ "epoch": 0.98,
1596
+ "learning_rate": 8.719249429940432e-05,
1597
+ "loss": 13.1741,
1598
+ "step": 13250
1599
+ },
1600
+ {
1601
+ "epoch": 0.98,
1602
+ "learning_rate": 8.714226878685298e-05,
1603
+ "loss": 11.7782,
1604
+ "step": 13300
1605
+ },
1606
+ {
1607
+ "epoch": 0.98,
1608
+ "learning_rate": 8.709204327430162e-05,
1609
+ "loss": 12.2758,
1610
+ "step": 13350
1611
+ },
1612
+ {
1613
+ "epoch": 0.99,
1614
+ "learning_rate": 8.704181776175027e-05,
1615
+ "loss": 13.1723,
1616
+ "step": 13400
1617
+ },
1618
+ {
1619
+ "epoch": 0.99,
1620
+ "learning_rate": 8.699159224919891e-05,
1621
+ "loss": 14.0858,
1622
+ "step": 13450
1623
+ },
1624
+ {
1625
+ "epoch": 1.0,
1626
+ "learning_rate": 8.694136673664755e-05,
1627
+ "loss": 11.2836,
1628
+ "step": 13500
1629
+ },
1630
+ {
1631
+ "epoch": 1.0,
1632
+ "learning_rate": 8.68911412240962e-05,
1633
+ "loss": 15.7226,
1634
+ "step": 13550
1635
+ },
1636
+ {
1637
+ "epoch": 1.0,
1638
+ "learning_rate": 8.684091571154484e-05,
1639
+ "loss": 15.8889,
1640
+ "step": 13600
1641
+ },
1642
+ {
1643
+ "epoch": 1.01,
1644
+ "learning_rate": 8.679069019899349e-05,
1645
+ "loss": 12.2185,
1646
+ "step": 13650
1647
+ },
1648
+ {
1649
+ "epoch": 1.01,
1650
+ "learning_rate": 8.674046468644213e-05,
1651
+ "loss": 11.4647,
1652
+ "step": 13700
1653
+ },
1654
+ {
1655
+ "epoch": 1.01,
1656
+ "learning_rate": 8.669023917389077e-05,
1657
+ "loss": 13.1238,
1658
+ "step": 13750
1659
+ },
1660
+ {
1661
+ "epoch": 1.02,
1662
+ "learning_rate": 8.664001366133942e-05,
1663
+ "loss": 11.909,
1664
+ "step": 13800
1665
+ },
1666
+ {
1667
+ "epoch": 1.02,
1668
+ "learning_rate": 8.658978814878806e-05,
1669
+ "loss": 12.5478,
1670
+ "step": 13850
1671
+ },
1672
+ {
1673
+ "epoch": 1.03,
1674
+ "learning_rate": 8.65395626362367e-05,
1675
+ "loss": 13.017,
1676
+ "step": 13900
1677
+ },
1678
+ {
1679
+ "epoch": 1.03,
1680
+ "learning_rate": 8.648933712368535e-05,
1681
+ "loss": 12.9134,
1682
+ "step": 13950
1683
+ },
1684
+ {
1685
+ "epoch": 1.03,
1686
+ "learning_rate": 8.6439111611134e-05,
1687
+ "loss": 13.3485,
1688
+ "step": 14000
1689
+ },
1690
+ {
1691
+ "epoch": 1.04,
1692
+ "learning_rate": 8.638888609858264e-05,
1693
+ "loss": 11.4706,
1694
+ "step": 14050
1695
+ },
1696
+ {
1697
+ "epoch": 1.04,
1698
+ "learning_rate": 8.633866058603128e-05,
1699
+ "loss": 11.1063,
1700
+ "step": 14100
1701
+ },
1702
+ {
1703
+ "epoch": 1.04,
1704
+ "learning_rate": 8.628843507347994e-05,
1705
+ "loss": 12.7408,
1706
+ "step": 14150
1707
+ },
1708
+ {
1709
+ "epoch": 1.05,
1710
+ "learning_rate": 8.623820956092857e-05,
1711
+ "loss": 12.0689,
1712
+ "step": 14200
1713
+ },
1714
+ {
1715
+ "epoch": 1.05,
1716
+ "learning_rate": 8.618798404837721e-05,
1717
+ "loss": 11.0724,
1718
+ "step": 14250
1719
+ },
1720
+ {
1721
+ "epoch": 1.05,
1722
+ "learning_rate": 8.613775853582586e-05,
1723
+ "loss": 12.5685,
1724
+ "step": 14300
1725
+ },
1726
+ {
1727
+ "epoch": 1.06,
1728
+ "learning_rate": 8.60875330232745e-05,
1729
+ "loss": 12.7776,
1730
+ "step": 14350
1731
+ },
1732
+ {
1733
+ "epoch": 1.06,
1734
+ "learning_rate": 8.603730751072315e-05,
1735
+ "loss": 11.3066,
1736
+ "step": 14400
1737
+ },
1738
+ {
1739
+ "epoch": 1.07,
1740
+ "learning_rate": 8.598708199817179e-05,
1741
+ "loss": 13.06,
1742
+ "step": 14450
1743
+ },
1744
+ {
1745
+ "epoch": 1.07,
1746
+ "learning_rate": 8.593685648562045e-05,
1747
+ "loss": 15.6523,
1748
+ "step": 14500
1749
+ },
1750
+ {
1751
+ "epoch": 1.07,
1752
+ "learning_rate": 8.588663097306908e-05,
1753
+ "loss": 12.019,
1754
+ "step": 14550
1755
+ },
1756
+ {
1757
+ "epoch": 1.08,
1758
+ "learning_rate": 8.583640546051774e-05,
1759
+ "loss": 11.0941,
1760
+ "step": 14600
1761
+ },
1762
+ {
1763
+ "epoch": 1.08,
1764
+ "learning_rate": 8.578617994796637e-05,
1765
+ "loss": 12.4755,
1766
+ "step": 14650
1767
+ },
1768
+ {
1769
+ "epoch": 1.08,
1770
+ "learning_rate": 8.573595443541502e-05,
1771
+ "loss": 13.7012,
1772
+ "step": 14700
1773
+ },
1774
+ {
1775
+ "epoch": 1.09,
1776
+ "learning_rate": 8.568572892286366e-05,
1777
+ "loss": 12.2024,
1778
+ "step": 14750
1779
+ },
1780
+ {
1781
+ "epoch": 1.09,
1782
+ "learning_rate": 8.56355034103123e-05,
1783
+ "loss": 12.4744,
1784
+ "step": 14800
1785
+ },
1786
+ {
1787
+ "epoch": 1.1,
1788
+ "learning_rate": 8.558527789776096e-05,
1789
+ "loss": 12.3234,
1790
+ "step": 14850
1791
+ },
1792
+ {
1793
+ "epoch": 1.1,
1794
+ "learning_rate": 8.553505238520959e-05,
1795
+ "loss": 12.5616,
1796
+ "step": 14900
1797
+ },
1798
+ {
1799
+ "epoch": 1.1,
1800
+ "learning_rate": 8.548482687265824e-05,
1801
+ "loss": 11.9559,
1802
+ "step": 14950
1803
+ },
1804
+ {
1805
+ "epoch": 1.11,
1806
+ "learning_rate": 8.543460136010688e-05,
1807
+ "loss": 12.0734,
1808
+ "step": 15000
1809
+ },
1810
+ {
1811
+ "epoch": 1.11,
1812
+ "learning_rate": 8.538437584755553e-05,
1813
+ "loss": 13.0341,
1814
+ "step": 15050
1815
+ },
1816
+ {
1817
+ "epoch": 1.11,
1818
+ "learning_rate": 8.533415033500418e-05,
1819
+ "loss": 12.7406,
1820
+ "step": 15100
1821
+ },
1822
+ {
1823
+ "epoch": 1.12,
1824
+ "learning_rate": 8.528392482245282e-05,
1825
+ "loss": 11.7258,
1826
+ "step": 15150
1827
+ },
1828
+ {
1829
+ "epoch": 1.12,
1830
+ "learning_rate": 8.523369930990147e-05,
1831
+ "loss": 11.8709,
1832
+ "step": 15200
1833
+ },
1834
+ {
1835
+ "epoch": 1.12,
1836
+ "learning_rate": 8.518347379735011e-05,
1837
+ "loss": 11.7021,
1838
+ "step": 15250
1839
+ },
1840
+ {
1841
+ "epoch": 1.13,
1842
+ "learning_rate": 8.513324828479875e-05,
1843
+ "loss": 13.2674,
1844
+ "step": 15300
1845
+ },
1846
+ {
1847
+ "epoch": 1.13,
1848
+ "learning_rate": 8.508302277224738e-05,
1849
+ "loss": 11.9099,
1850
+ "step": 15350
1851
+ },
1852
+ {
1853
+ "epoch": 1.14,
1854
+ "learning_rate": 8.503279725969604e-05,
1855
+ "loss": 11.7841,
1856
+ "step": 15400
1857
+ },
1858
+ {
1859
+ "epoch": 1.14,
1860
+ "learning_rate": 8.498257174714469e-05,
1861
+ "loss": 11.9573,
1862
+ "step": 15450
1863
+ },
1864
+ {
1865
+ "epoch": 1.14,
1866
+ "learning_rate": 8.493234623459333e-05,
1867
+ "loss": 11.7211,
1868
+ "step": 15500
1869
+ },
1870
+ {
1871
+ "epoch": 1.15,
1872
+ "learning_rate": 8.488212072204197e-05,
1873
+ "loss": 12.3513,
1874
+ "step": 15550
1875
+ },
1876
+ {
1877
+ "epoch": 1.15,
1878
+ "learning_rate": 8.483189520949062e-05,
1879
+ "loss": 11.0709,
1880
+ "step": 15600
1881
+ },
1882
+ {
1883
+ "epoch": 1.15,
1884
+ "learning_rate": 8.478166969693926e-05,
1885
+ "loss": 11.6544,
1886
+ "step": 15650
1887
+ },
1888
+ {
1889
+ "epoch": 1.16,
1890
+ "learning_rate": 8.47314441843879e-05,
1891
+ "loss": 11.8285,
1892
+ "step": 15700
1893
+ },
1894
+ {
1895
+ "epoch": 1.16,
1896
+ "learning_rate": 8.468121867183655e-05,
1897
+ "loss": 10.4208,
1898
+ "step": 15750
1899
+ },
1900
+ {
1901
+ "epoch": 1.17,
1902
+ "learning_rate": 8.46309931592852e-05,
1903
+ "loss": 10.7821,
1904
+ "step": 15800
1905
+ },
1906
+ {
1907
+ "epoch": 1.17,
1908
+ "learning_rate": 8.458076764673384e-05,
1909
+ "loss": 13.2724,
1910
+ "step": 15850
1911
+ },
1912
+ {
1913
+ "epoch": 1.17,
1914
+ "learning_rate": 8.45305421341825e-05,
1915
+ "loss": 10.9219,
1916
+ "step": 15900
1917
+ },
1918
+ {
1919
+ "epoch": 1.18,
1920
+ "learning_rate": 8.448031662163113e-05,
1921
+ "loss": 12.2532,
1922
+ "step": 15950
1923
+ },
1924
+ {
1925
+ "epoch": 1.18,
1926
+ "learning_rate": 8.443009110907977e-05,
1927
+ "loss": 11.0132,
1928
+ "step": 16000
1929
+ },
1930
+ {
1931
+ "epoch": 1.18,
1932
+ "learning_rate": 8.437986559652841e-05,
1933
+ "loss": 12.319,
1934
+ "step": 16050
1935
+ },
1936
+ {
1937
+ "epoch": 1.19,
1938
+ "learning_rate": 8.432964008397706e-05,
1939
+ "loss": 12.9871,
1940
+ "step": 16100
1941
+ },
1942
+ {
1943
+ "epoch": 1.19,
1944
+ "learning_rate": 8.42794145714257e-05,
1945
+ "loss": 12.0625,
1946
+ "step": 16150
1947
+ },
1948
+ {
1949
+ "epoch": 1.19,
1950
+ "learning_rate": 8.422918905887435e-05,
1951
+ "loss": 13.4629,
1952
+ "step": 16200
1953
+ },
1954
+ {
1955
+ "epoch": 1.2,
1956
+ "learning_rate": 8.4178963546323e-05,
1957
+ "loss": 10.9291,
1958
+ "step": 16250
1959
+ },
1960
+ {
1961
+ "epoch": 1.2,
1962
+ "learning_rate": 8.412873803377163e-05,
1963
+ "loss": 13.7719,
1964
+ "step": 16300
1965
+ },
1966
+ {
1967
+ "epoch": 1.21,
1968
+ "learning_rate": 8.407851252122029e-05,
1969
+ "loss": 11.3634,
1970
+ "step": 16350
1971
+ },
1972
+ {
1973
+ "epoch": 1.21,
1974
+ "learning_rate": 8.402828700866892e-05,
1975
+ "loss": 12.7941,
1976
+ "step": 16400
1977
+ },
1978
+ {
1979
+ "epoch": 1.21,
1980
+ "learning_rate": 8.397806149611758e-05,
1981
+ "loss": 11.8863,
1982
+ "step": 16450
1983
+ },
1984
+ {
1985
+ "epoch": 1.22,
1986
+ "learning_rate": 8.392783598356621e-05,
1987
+ "loss": 9.5225,
1988
+ "step": 16500
1989
+ },
1990
+ {
1991
+ "epoch": 1.22,
1992
+ "learning_rate": 8.387761047101485e-05,
1993
+ "loss": 12.983,
1994
+ "step": 16550
1995
+ },
1996
+ {
1997
+ "epoch": 1.22,
1998
+ "learning_rate": 8.382738495846351e-05,
1999
+ "loss": 11.8489,
2000
+ "step": 16600
2001
+ },
2002
+ {
2003
+ "epoch": 1.23,
2004
+ "learning_rate": 8.377715944591214e-05,
2005
+ "loss": 11.8122,
2006
+ "step": 16650
2007
+ },
2008
+ {
2009
+ "epoch": 1.23,
2010
+ "learning_rate": 8.37269339333608e-05,
2011
+ "loss": 12.3387,
2012
+ "step": 16700
2013
+ },
2014
+ {
2015
+ "epoch": 1.24,
2016
+ "learning_rate": 8.367670842080943e-05,
2017
+ "loss": 13.4648,
2018
+ "step": 16750
2019
+ },
2020
+ {
2021
+ "epoch": 1.24,
2022
+ "learning_rate": 8.362648290825809e-05,
2023
+ "loss": 10.2301,
2024
+ "step": 16800
2025
+ },
2026
+ {
2027
+ "epoch": 1.24,
2028
+ "learning_rate": 8.357625739570672e-05,
2029
+ "loss": 11.492,
2030
+ "step": 16850
2031
+ },
2032
+ {
2033
+ "epoch": 1.25,
2034
+ "learning_rate": 8.352603188315538e-05,
2035
+ "loss": 12.5997,
2036
+ "step": 16900
2037
+ },
2038
+ {
2039
+ "epoch": 1.25,
2040
+ "learning_rate": 8.347580637060402e-05,
2041
+ "loss": 11.5588,
2042
+ "step": 16950
2043
+ },
2044
+ {
2045
+ "epoch": 1.25,
2046
+ "learning_rate": 8.342558085805266e-05,
2047
+ "loss": 11.8627,
2048
+ "step": 17000
2049
+ },
2050
+ {
2051
+ "epoch": 1.26,
2052
+ "learning_rate": 8.337535534550131e-05,
2053
+ "loss": 13.2469,
2054
+ "step": 17050
2055
+ },
2056
+ {
2057
+ "epoch": 1.26,
2058
+ "learning_rate": 8.332512983294994e-05,
2059
+ "loss": 10.4327,
2060
+ "step": 17100
2061
+ },
2062
+ {
2063
+ "epoch": 1.27,
2064
+ "learning_rate": 8.32749043203986e-05,
2065
+ "loss": 12.7566,
2066
+ "step": 17150
2067
+ },
2068
+ {
2069
+ "epoch": 1.27,
2070
+ "learning_rate": 8.322467880784723e-05,
2071
+ "loss": 11.0729,
2072
+ "step": 17200
2073
+ },
2074
+ {
2075
+ "epoch": 1.27,
2076
+ "learning_rate": 8.317445329529588e-05,
2077
+ "loss": 12.3484,
2078
+ "step": 17250
2079
+ },
2080
+ {
2081
+ "epoch": 1.28,
2082
+ "learning_rate": 8.312422778274453e-05,
2083
+ "loss": 10.5193,
2084
+ "step": 17300
2085
+ },
2086
+ {
2087
+ "epoch": 1.28,
2088
+ "learning_rate": 8.307400227019317e-05,
2089
+ "loss": 12.2369,
2090
+ "step": 17350
2091
+ },
2092
+ {
2093
+ "epoch": 1.28,
2094
+ "learning_rate": 8.302377675764182e-05,
2095
+ "loss": 12.2976,
2096
+ "step": 17400
2097
+ },
2098
+ {
2099
+ "epoch": 1.29,
2100
+ "learning_rate": 8.297355124509046e-05,
2101
+ "loss": 12.3852,
2102
+ "step": 17450
2103
+ },
2104
+ {
2105
+ "epoch": 1.29,
2106
+ "learning_rate": 8.29233257325391e-05,
2107
+ "loss": 11.2137,
2108
+ "step": 17500
2109
+ },
2110
+ {
2111
+ "epoch": 1.29,
2112
+ "learning_rate": 8.287310021998775e-05,
2113
+ "loss": 11.609,
2114
+ "step": 17550
2115
+ },
2116
+ {
2117
+ "epoch": 1.3,
2118
+ "learning_rate": 8.282287470743639e-05,
2119
+ "loss": 13.3339,
2120
+ "step": 17600
2121
+ },
2122
+ {
2123
+ "epoch": 1.3,
2124
+ "learning_rate": 8.277264919488504e-05,
2125
+ "loss": 11.4263,
2126
+ "step": 17650
2127
+ },
2128
+ {
2129
+ "epoch": 1.31,
2130
+ "learning_rate": 8.272242368233368e-05,
2131
+ "loss": 12.6949,
2132
+ "step": 17700
2133
+ },
2134
+ {
2135
+ "epoch": 1.31,
2136
+ "learning_rate": 8.267219816978233e-05,
2137
+ "loss": 11.4767,
2138
+ "step": 17750
2139
+ },
2140
+ {
2141
+ "epoch": 1.31,
2142
+ "learning_rate": 8.262197265723097e-05,
2143
+ "loss": 12.2225,
2144
+ "step": 17800
2145
+ },
2146
+ {
2147
+ "epoch": 1.32,
2148
+ "learning_rate": 8.257174714467961e-05,
2149
+ "loss": 11.0755,
2150
+ "step": 17850
2151
+ },
2152
+ {
2153
+ "epoch": 1.32,
2154
+ "learning_rate": 8.252152163212826e-05,
2155
+ "loss": 11.9677,
2156
+ "step": 17900
2157
+ },
2158
+ {
2159
+ "epoch": 1.32,
2160
+ "learning_rate": 8.24712961195769e-05,
2161
+ "loss": 11.098,
2162
+ "step": 17950
2163
+ },
2164
+ {
2165
+ "epoch": 1.33,
2166
+ "learning_rate": 8.242107060702555e-05,
2167
+ "loss": 11.1102,
2168
+ "step": 18000
2169
+ },
2170
+ {
2171
+ "epoch": 1.33,
2172
+ "learning_rate": 8.237084509447419e-05,
2173
+ "loss": 11.4985,
2174
+ "step": 18050
2175
+ },
2176
+ {
2177
+ "epoch": 1.34,
2178
+ "learning_rate": 8.232061958192285e-05,
2179
+ "loss": 11.7356,
2180
+ "step": 18100
2181
+ },
2182
+ {
2183
+ "epoch": 1.34,
2184
+ "learning_rate": 8.227039406937148e-05,
2185
+ "loss": 11.3336,
2186
+ "step": 18150
2187
+ },
2188
+ {
2189
+ "epoch": 1.34,
2190
+ "learning_rate": 8.222016855682012e-05,
2191
+ "loss": 11.0448,
2192
+ "step": 18200
2193
+ },
2194
+ {
2195
+ "epoch": 1.35,
2196
+ "learning_rate": 8.216994304426877e-05,
2197
+ "loss": 10.9986,
2198
+ "step": 18250
2199
+ },
2200
+ {
2201
+ "epoch": 1.35,
2202
+ "learning_rate": 8.211971753171741e-05,
2203
+ "loss": 10.768,
2204
+ "step": 18300
2205
+ },
2206
+ {
2207
+ "epoch": 1.35,
2208
+ "learning_rate": 8.206949201916607e-05,
2209
+ "loss": 11.6844,
2210
+ "step": 18350
2211
+ },
2212
+ {
2213
+ "epoch": 1.36,
2214
+ "learning_rate": 8.20192665066147e-05,
2215
+ "loss": 11.5615,
2216
+ "step": 18400
2217
+ },
2218
+ {
2219
+ "epoch": 1.36,
2220
+ "learning_rate": 8.196904099406336e-05,
2221
+ "loss": 11.4019,
2222
+ "step": 18450
2223
+ },
2224
+ {
2225
+ "epoch": 1.36,
2226
+ "learning_rate": 8.191881548151199e-05,
2227
+ "loss": 12.1784,
2228
+ "step": 18500
2229
+ },
2230
+ {
2231
+ "epoch": 1.37,
2232
+ "learning_rate": 8.186858996896064e-05,
2233
+ "loss": 12.4565,
2234
+ "step": 18550
2235
+ },
2236
+ {
2237
+ "epoch": 1.37,
2238
+ "learning_rate": 8.181836445640927e-05,
2239
+ "loss": 11.0557,
2240
+ "step": 18600
2241
+ },
2242
+ {
2243
+ "epoch": 1.38,
2244
+ "learning_rate": 8.176813894385793e-05,
2245
+ "loss": 12.1892,
2246
+ "step": 18650
2247
+ },
2248
+ {
2249
+ "epoch": 1.38,
2250
+ "learning_rate": 8.171791343130658e-05,
2251
+ "loss": 12.0531,
2252
+ "step": 18700
2253
+ },
2254
+ {
2255
+ "epoch": 1.38,
2256
+ "learning_rate": 8.166768791875522e-05,
2257
+ "loss": 10.1791,
2258
+ "step": 18750
2259
+ },
2260
+ {
2261
+ "epoch": 1.39,
2262
+ "learning_rate": 8.161746240620386e-05,
2263
+ "loss": 11.2501,
2264
+ "step": 18800
2265
+ },
2266
+ {
2267
+ "epoch": 1.39,
2268
+ "learning_rate": 8.15672368936525e-05,
2269
+ "loss": 9.92,
2270
+ "step": 18850
2271
+ },
2272
+ {
2273
+ "epoch": 1.39,
2274
+ "learning_rate": 8.151701138110115e-05,
2275
+ "loss": 10.0603,
2276
+ "step": 18900
2277
+ },
2278
+ {
2279
+ "epoch": 1.4,
2280
+ "learning_rate": 8.146678586854978e-05,
2281
+ "loss": 10.9477,
2282
+ "step": 18950
2283
+ },
2284
+ {
2285
+ "epoch": 1.4,
2286
+ "learning_rate": 8.141656035599844e-05,
2287
+ "loss": 9.7579,
2288
+ "step": 19000
2289
+ },
2290
+ {
2291
+ "epoch": 1.41,
2292
+ "learning_rate": 8.136633484344708e-05,
2293
+ "loss": 11.243,
2294
+ "step": 19050
2295
+ },
2296
+ {
2297
+ "epoch": 1.41,
2298
+ "learning_rate": 8.131610933089573e-05,
2299
+ "loss": 11.0069,
2300
+ "step": 19100
2301
+ },
2302
+ {
2303
+ "epoch": 1.41,
2304
+ "learning_rate": 8.126588381834437e-05,
2305
+ "loss": 9.7387,
2306
+ "step": 19150
2307
+ },
2308
+ {
2309
+ "epoch": 1.42,
2310
+ "learning_rate": 8.121565830579302e-05,
2311
+ "loss": 11.4624,
2312
+ "step": 19200
2313
+ },
2314
+ {
2315
+ "epoch": 1.42,
2316
+ "learning_rate": 8.116543279324166e-05,
2317
+ "loss": 12.1299,
2318
+ "step": 19250
2319
+ },
2320
+ {
2321
+ "epoch": 1.42,
2322
+ "learning_rate": 8.11152072806903e-05,
2323
+ "loss": 12.2796,
2324
+ "step": 19300
2325
+ },
2326
+ {
2327
+ "epoch": 1.43,
2328
+ "learning_rate": 8.106498176813895e-05,
2329
+ "loss": 10.3295,
2330
+ "step": 19350
2331
+ },
2332
+ {
2333
+ "epoch": 1.43,
2334
+ "learning_rate": 8.101475625558759e-05,
2335
+ "loss": 10.0709,
2336
+ "step": 19400
2337
+ },
2338
+ {
2339
+ "epoch": 1.43,
2340
+ "learning_rate": 8.096453074303624e-05,
2341
+ "loss": 11.0725,
2342
+ "step": 19450
2343
+ },
2344
+ {
2345
+ "epoch": 1.44,
2346
+ "learning_rate": 8.091430523048488e-05,
2347
+ "loss": 10.7882,
2348
+ "step": 19500
2349
+ },
2350
+ {
2351
+ "epoch": 1.44,
2352
+ "learning_rate": 8.086407971793352e-05,
2353
+ "loss": 11.4124,
2354
+ "step": 19550
2355
+ },
2356
+ {
2357
+ "epoch": 1.45,
2358
+ "learning_rate": 8.081385420538217e-05,
2359
+ "loss": 10.4941,
2360
+ "step": 19600
2361
+ },
2362
+ {
2363
+ "epoch": 1.45,
2364
+ "learning_rate": 8.076362869283081e-05,
2365
+ "loss": 11.8687,
2366
+ "step": 19650
2367
+ },
2368
+ {
2369
+ "epoch": 1.45,
2370
+ "learning_rate": 8.071340318027946e-05,
2371
+ "loss": 11.3221,
2372
+ "step": 19700
2373
+ },
2374
+ {
2375
+ "epoch": 1.46,
2376
+ "learning_rate": 8.06631776677281e-05,
2377
+ "loss": 10.2167,
2378
+ "step": 19750
2379
+ },
2380
+ {
2381
+ "epoch": 1.46,
2382
+ "learning_rate": 8.061295215517675e-05,
2383
+ "loss": 10.5425,
2384
+ "step": 19800
2385
+ },
2386
+ {
2387
+ "epoch": 1.46,
2388
+ "learning_rate": 8.05627266426254e-05,
2389
+ "loss": 11.2982,
2390
+ "step": 19850
2391
+ },
2392
+ {
2393
+ "epoch": 1.47,
2394
+ "learning_rate": 8.051250113007403e-05,
2395
+ "loss": 12.0685,
2396
+ "step": 19900
2397
+ },
2398
+ {
2399
+ "epoch": 1.47,
2400
+ "learning_rate": 8.046227561752268e-05,
2401
+ "loss": 10.6613,
2402
+ "step": 19950
2403
+ },
2404
+ {
2405
+ "epoch": 1.48,
2406
+ "learning_rate": 8.041205010497132e-05,
2407
+ "loss": 10.8245,
2408
+ "step": 20000
2409
+ },
2410
+ {
2411
+ "epoch": 1.48,
2412
+ "eval_loss": 10.409339904785156,
2413
+ "eval_runtime": 890.9956,
2414
+ "eval_samples_per_second": 14.7,
2415
+ "eval_steps_per_second": 3.676,
2416
+ "eval_wer": 0.2624627273109067,
2417
+ "step": 20000
2418
+ }
2419
+ ],
2420
+ "max_steps": 100051,
2421
+ "num_train_epochs": 8,
2422
+ "total_flos": 0.0,
2423
+ "trial_name": null,
2424
+ "trial_params": null
2425
+ }
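
The trainer state above ends with the step-20000 evaluation (eval_loss ≈ 10.41, eval_wer ≈ 0.262). To pull those evaluation points out of a checkpoint programmatically, a minimal sketch (assuming the usual Hugging Face Trainer layout, where the per-step records sit under a "log_history" key) is:

import json

# Minimal sketch: list the evaluation entries recorded in a trainer_state.json.
# Assumes the standard Hugging Face Trainer layout with a "log_history" list.
with open("checkpoint-20000/trainer_state.json") as f:
    state = json.load(f)

for entry in state["log_history"]:
    if "eval_wer" in entry:
        print(entry["step"], entry["eval_loss"], entry["eval_wer"])
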
checkpoint-20000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b64c669f66dd7a2e54d3001ce7e31c26cc60dd58136e8ce90e6055bd0ae15eb
3
+ size 3503
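
The binary checkpoint files in this commit (optimizer.pt, rng_state.pth, scheduler.pt, the .nemo archives, training_args.bin) are tracked with Git LFS, so the diff only shows the three-line pointer: the spec version, the sha256 object id, and the size in bytes. Until the objects are fetched with git-lfs, the file on disk is just that pointer; a small sketch for reading one:

# Minimal sketch: parse a Git LFS pointer file such as the one above.
def read_lfs_pointer(path):
    fields = {}
    with open(path) as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields

ptr = read_lfs_pointer("checkpoint-20000/training_args.bin")
print(ptr["oid"], int(ptr["size"]))  # e.g. sha256:3b64c6... 3503
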
checkpoint-40000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c1c09a7ddf632fa2b5485de6d094cf8a763affbefcb8dc5c93001a0539bad686
3
+ size 5154563651
checkpoint-40000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:31b1895952e1807b396d4e924fa1fb61ed026336fa2d9b568b14c899ec1ae878
3
+ size 14503
checkpoint-40000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4b81037f0665e42c49d437ecf24e1e38406f2a8f8a1c463379f77ea33597052a
3
+ size 623
checkpoint-40000/stt_en_conformer_transducer_xlarge.nemo ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c822e20c23a0eb709dc03222743ce215a42db9863af172c34297cd8c402f9e4
3
+ size 2577971200
checkpoint-40000/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-40000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b64c669f66dd7a2e54d3001ce7e31c26cc60dd58136e8ce90e6055bd0ae15eb
3
+ size 3503
checkpoint-60000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:406e36deb47741922cd59f748cd1876112106ea059c820e699c269fe0d635c2b
3
+ size 5154563651
checkpoint-60000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3707b4b4d63eda9f45abb91e6157a5777abe5bcccebdf82df707bae7df65cf9e
3
+ size 14503
checkpoint-60000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4edddf9241e66e2708bca7527dec737063f80262825a1b055e50529066c54390
3
+ size 623
checkpoint-60000/stt_en_conformer_transducer_xlarge.nemo ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be893728d43d533cf97573378f9587552441031cf01aa9fdc25c779e733140f1
3
+ size 2577971200
checkpoint-60000/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-60000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b64c669f66dd7a2e54d3001ce7e31c26cc60dd58136e8ce90e6055bd0ae15eb
3
+ size 3503
checkpoint-80000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:56861ad8a03582034a89047c1e6397a79297e194daab37dae36192eb72f16c4a
3
+ size 5154565443
checkpoint-80000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b36e92749442e712801d00e24ed95ea736e78f8ef065b6af0b801ae709dfb48d
3
+ size 14503
checkpoint-80000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:827a7ad0b8599273336e50134d47c6b281fcbf26c0ef32fd1bca5bf3db63fe69
3
+ size 623
checkpoint-80000/stt_en_conformer_transducer_xlarge.nemo ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d9af5c4d6859c9af2c18bca5723158554500ba93753fb4ffd4923e3e72011340
3
+ size 2577971200
checkpoint-80000/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-80000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b64c669f66dd7a2e54d3001ce7e31c26cc60dd58136e8ce90e6055bd0ae15eb
3
+ size 3503
conf/conformer_transducer_bpe_dummy.yaml ADDED
@@ -0,0 +1,192 @@
1
+ # This config contains the default values for training a Conformer-Transducer ASR model, dummy size, with Transducer loss and sub-word encoding.
2
+
3
+ name: "Conformer-Transducer-BPE"
4
+
5
+ model:
6
+ sample_rate: 16000
7
+ compute_eval_loss: false # eval samples can be very long and exhaust memory. Disable computation of transducer loss during validation/testing with this flag.
8
+ log_prediction: true # enables logging sample predictions in the output during training
9
+ skip_nan_grad: false
10
+
11
+ model_defaults:
12
+ enc_hidden: ${model.encoder.d_model}
13
+ pred_hidden: 64
14
+ joint_hidden: 64
15
+
16
+ train_ds:
17
+ manifest_filepath: ???
18
+ sample_rate: ${model.sample_rate}
19
+ batch_size: 16 # you may increase batch_size if your memory allows
20
+ shuffle: true
21
+ num_workers: 8
22
+ pin_memory: true
23
+ use_start_end_token: false
24
+ trim_silence: false
25
+ max_duration: 16.7 # it is set for LibriSpeech, you may need to update it for your dataset
26
+ min_duration: 0.1
27
+ # tarred datasets
28
+ is_tarred: false
29
+ tarred_audio_filepaths: null
30
+ shuffle_n: 2048
31
+ # bucketing params
32
+ bucketing_strategy: "synced_randomized"
33
+ bucketing_batch_size: null
34
+
35
+ validation_ds:
36
+ manifest_filepath: ???
37
+ sample_rate: ${model.sample_rate}
38
+ batch_size: 16
39
+ shuffle: false
40
+ num_workers: 8
41
+ pin_memory: true
42
+ use_start_end_token: false
43
+
44
+ test_ds:
45
+ manifest_filepath: null
46
+ sample_rate: ${model.sample_rate}
47
+ batch_size: 16
48
+ shuffle: false
49
+ num_workers: 8
50
+ pin_memory: true
51
+ use_start_end_token: false
52
+
53
+ # You may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py
54
+ tokenizer:
55
+ dir: ??? # path to directory which contains either tokenizer.model (bpe) or vocab.txt (for wpe)
56
+ type: bpe # Can be either bpe (SentencePiece tokenizer) or wpe (WordPiece tokenizer)
57
+
58
+ preprocessor:
59
+ _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
60
+ sample_rate: ${model.sample_rate}
61
+ normalize: "per_feature"
62
+ window_size: 0.025
63
+ window_stride: 0.01
64
+ window: "hann"
65
+ features: 80
66
+ n_fft: 512
67
+ frame_splicing: 1
68
+ dither: 0.00001
69
+ pad_to: 0
70
+
71
+ spec_augment:
72
+ _target_: nemo.collections.asr.modules.SpectrogramAugmentation
73
+ freq_masks: 2 # set to zero to disable it
74
+ time_masks: 10 # set to zero to disable it
75
+ freq_width: 27
76
+ time_width: 0.05
77
+
78
+ encoder:
79
+ _target_: nemo.collections.asr.modules.ConformerEncoder
80
+ feat_in: ${model.preprocessor.features}
81
+ feat_out: -1 # you may set it if you need an output size different from the default d_model
82
+ n_layers: 2
83
+ d_model: 64
84
+
85
+ # Sub-sampling params
86
+ subsampling: striding # vggnet, striding, stacking or stacking_norm, dw_striding
87
+ subsampling_factor: 4 # must be power of 2 for striding and vggnet
88
+ subsampling_conv_channels: -1 # set to -1 to make it equal to the d_model
89
+ causal_downsampling: false
90
+
91
+ # Feed forward module's params
92
+ ff_expansion_factor: 4
93
+
94
+ # Multi-headed Attention Module's params
95
+ self_attention_model: rel_pos # rel_pos or abs_pos
96
+ n_heads: 8 # may need to be lower for smaller d_models
97
+ # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention
98
+ att_context_size: [-1, -1] # -1 means unlimited context
99
+ att_context_style: regular # regular or chunked_limited
100
+ xscaling: true # scales up the input embeddings by sqrt(d_model)
101
+ untie_biases: true # unties the biases of the TransformerXL layers
102
+ pos_emb_max_len: 5000
103
+
104
+ # Convolution module's params
105
+ conv_kernel_size: 5
106
+ conv_norm_type: 'batch_norm' # batch_norm or layer_norm or groupnormN (N specifies the number of groups)
107
+ # conv_context_size can be "causal" or a list of two integers such that conv_context_size[0]+conv_context_size[1]+1==conv_kernel_size
108
+ # null means [(kernel_size-1)//2, (kernel_size-1)//2], and 'causal' means [(kernel_size-1), 0]
109
+ conv_context_size: null
110
+
111
+ ### regularization
112
+ dropout: 0.1 # The dropout used in most of the Conformer Modules
113
+ dropout_emb: 0.0 # The dropout used for embeddings
114
+ dropout_att: 0.1 # The dropout for multi-headed attention modules
115
+
116
+ decoder:
117
+ _target_: nemo.collections.asr.modules.RNNTDecoder
118
+ normalization_mode: null # Currently only null is supported for export.
119
+ random_state_sampling: false # Random state sampling: https://arxiv.org/pdf/1910.11455.pdf
120
+ blank_as_pad: true # This flag must be set in order to support exporting of RNNT models + efficient inference.
121
+
122
+ prednet:
123
+ pred_hidden: ${model.model_defaults.pred_hidden}
124
+ pred_rnn_layers: 1
125
+ t_max: null
126
+ dropout: 0.2
127
+
128
+ joint:
129
+ _target_: nemo.collections.asr.modules.RNNTJoint
130
+ log_softmax: null # 'null' would set it automatically according to CPU/GPU device
131
+ preserve_memory: false # dramatically slows down training, but might preserve some memory
132
+
133
+ # Fuses the computation of prediction net + joint net + loss + WER calculation
134
+ # to be run on sub-batches of size `fused_batch_size`.
135
+ # When this flag is set to true, consider the `batch_size` of *_ds to be just `encoder` batch size.
136
+ # `fused_batch_size` is the actual batch size of the prediction net, joint net and transducer loss.
137
+ # Using small values here will preserve a lot of memory during training, but will make training slower as well.
138
+ # An optimal ratio of fused_batch_size : *_ds.batch_size is 1:1.
139
+ # However, to preserve memory, this ratio can be 1:8 or even 1:16.
140
+ # Extreme case of 1:B (i.e. fused_batch_size=1) should be avoided as training speed would be very slow.
141
+ fuse_loss_wer: true
142
+ fused_batch_size: 16
143
+
144
+ jointnet:
145
+ joint_hidden: ${model.model_defaults.joint_hidden}
146
+ activation: "relu"
147
+ dropout: 0.2
148
+
149
+ decoding:
150
+ strategy: "greedy_batch" # can be greedy, greedy_batch, beam, tsd, alsd.
151
+
152
+ # greedy strategy config
153
+ greedy:
154
+ max_symbols: 10
155
+
156
+ # beam strategy config
157
+ beam:
158
+ beam_size: 2
159
+ return_best_hypothesis: False
160
+ score_norm: true
161
+ tsd_max_sym_exp: 50 # for Time Synchronous Decoding
162
+ alsd_max_target_len: 2.0 # for Alignment-Length Synchronous Decoding
163
+
164
+ loss:
165
+ loss_name: "default"
166
+
167
+ warprnnt_numba_kwargs:
168
+ # FastEmit regularization: https://arxiv.org/abs/2010.11148
169
+ # You may enable FastEmit to reduce the latency of the model for streaming
170
+ fastemit_lambda: 0.0 # Recommended values are in the range [1e-4, 1e-2]; 0.001 is a good start.
171
+ clamp: -1.0 # if > 0, applies gradient clamping in range [-clamp, clamp] for the joint tensor only.
172
+
173
+ # Adds Gaussian noise to the gradients of the decoder to avoid overfitting
174
+ variational_noise:
175
+ start_step: 0
176
+ std: 0.0
177
+
178
+ optim:
179
+ name: adamw
180
+ lr: 5.0
181
+ # optimizer arguments
182
+ betas: [0.9, 0.98]
183
+ weight_decay: 1e-3
184
+
185
+ # scheduler setup
186
+ sched:
187
+ name: NoamAnnealing
188
+ d_model: ${model.encoder.d_model}
189
+ # scheduler config override
190
+ warmup_steps: 10000
191
+ warmup_ratio: null
192
+ min_lr: 1e-6
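
The dummy config above is a scaled-down variant (2 encoder layers, d_model 64) of the large/xlarge recipes that follow, useful for smoke tests. The ${...} entries are OmegaConf interpolations and the ??? entries are mandatory fields that must be supplied before training. A minimal sketch of filling them in (the manifest and tokenizer paths below are hypothetical placeholders, not files from this repo):

from omegaconf import OmegaConf

# Minimal sketch: load the dummy config and fill in the mandatory "???" fields.
cfg = OmegaConf.load("conf/conformer_transducer_bpe_dummy.yaml")
cfg.model.train_ds.manifest_filepath = "manifests/train.json"        # placeholder
cfg.model.validation_ds.manifest_filepath = "manifests/dev.json"     # placeholder
cfg.model.tokenizer.dir = "tokenizer_dir"                            # placeholder

# Interpolations such as ${model.encoder.d_model} resolve on access:
print(cfg.model.model_defaults.enc_hidden)  # -> 64 for this dummy config
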
conf/conformer_transducer_bpe_large.yaml ADDED
@@ -0,0 +1,212 @@
1
+ # This config contains the default values for training a Conformer-Transducer ASR model, large size (~120M), with Transducer loss and sub-word encoding.
2
+
3
+ # Architecture and training config:
4
+ # Default learning parameters in this config are set for effective batch size of 2K. To train it with smaller effective
5
+ # batch sizes, you may need to re-tune the learning parameters or use higher accumulate_grad_batches.
6
+ # Here are the recommended configs for different variants of Conformer-Transducer, other parameters are the same as in this config file.
7
+ #
8
+ # +-------------+---------+---------+----------+--------------+--------------------------+
9
+ # | Model | d_model | n_heads | n_layers | weight_decay | pred_hidden/joint_hidden |
10
+ # +=============+=========+========+===========+==============+==========================+
11
+ # | Small (14M)| 176 | 4 | 16 | 0.0 | 320 |
12
+ # +-------------+---------+--------+-----------+--------------+--------------------------+
13
+ # | Medium (32M)| 256 | 4 | 16 | 1e-3 | 640 |
14
+ # +-------------+---------+--------+-----------+--------------+--------------------------+
15
+ # | Large (120M)| 512 | 8 | 17 | 1e-3 | 640 |
16
+ # +-----------------------------------------------------------+--------------------------+
17
+ #
18
+
19
+ # You may find more info about Conformer-Transducer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#conformer-transducer
20
+ # Pre-trained models of Conformer-Transducer can be found here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/results.html
21
+ # The checkpoint of the large model trained on NeMo ASRSET with this recipe can be found here: https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_transducer_large
22
+
23
+ name: "Conformer-Transducer-BPE"
24
+
25
+ model:
26
+ sample_rate: 16000
27
+ compute_eval_loss: false # eval samples can be very long and exhaust memory. Disable computation of transducer loss during validation/testing with this flag.
28
+ log_prediction: true # enables logging sample predictions in the output during training
29
+ skip_nan_grad: false
30
+
31
+ model_defaults:
32
+ enc_hidden: ${model.encoder.d_model}
33
+ pred_hidden: 640
34
+ joint_hidden: 640
35
+
36
+ train_ds:
37
+ manifest_filepath: ???
38
+ sample_rate: ${model.sample_rate}
39
+ batch_size: 16 # you may increase batch_size if your memory allows
40
+ shuffle: true
41
+ num_workers: 8
42
+ pin_memory: true
43
+ use_start_end_token: false
44
+ trim_silence: false
45
+ max_duration: 16.7 # it is set for LibriSpeech, you may need to update it for your dataset
46
+ min_duration: 0.1
47
+ # tarred datasets
48
+ is_tarred: false
49
+ tarred_audio_filepaths: null
50
+ shuffle_n: 2048
51
+ # bucketing params
52
+ bucketing_strategy: "synced_randomized"
53
+ bucketing_batch_size: null
54
+
55
+ validation_ds:
56
+ manifest_filepath: ???
57
+ sample_rate: ${model.sample_rate}
58
+ batch_size: 16
59
+ shuffle: false
60
+ num_workers: 8
61
+ pin_memory: true
62
+ use_start_end_token: false
63
+
64
+ test_ds:
65
+ manifest_filepath: null
66
+ sample_rate: ${model.sample_rate}
67
+ batch_size: 16
68
+ shuffle: false
69
+ num_workers: 8
70
+ pin_memory: true
71
+ use_start_end_token: false
72
+
73
+ # You may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py
74
+ tokenizer:
75
+ dir: ??? # path to directory which contains either tokenizer.model (bpe) or vocab.txt (for wpe)
76
+ type: bpe # Can be either bpe (SentencePiece tokenizer) or wpe (WordPiece tokenizer)
77
+
78
+ preprocessor:
79
+ _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
80
+ sample_rate: ${model.sample_rate}
81
+ normalize: "per_feature"
82
+ window_size: 0.025
83
+ window_stride: 0.01
84
+ window: "hann"
85
+ features: 80
86
+ n_fft: 512
87
+ frame_splicing: 1
88
+ dither: 0.00001
89
+ pad_to: 0
90
+
91
+ spec_augment:
92
+ _target_: nemo.collections.asr.modules.SpectrogramAugmentation
93
+ freq_masks: 2 # set to zero to disable it
94
+ time_masks: 10 # set to zero to disable it
95
+ freq_width: 27
96
+ time_width: 0.05
97
+
98
+ encoder:
99
+ _target_: nemo.collections.asr.modules.ConformerEncoder
100
+ feat_in: ${model.preprocessor.features}
101
+ feat_out: -1 # you may set it if you need an output size different from the default d_model
102
+ n_layers: 17
103
+ d_model: 512
104
+
105
+ # Sub-sampling params
106
+ subsampling: striding # vggnet, striding, stacking or stacking_norm, dw_striding
107
+ subsampling_factor: 4 # must be power of 2 for striding and vggnet
108
+ subsampling_conv_channels: -1 # set to -1 to make it equal to the d_model
109
+ causal_downsampling: false
110
+
111
+ # Feed forward module's params
112
+ ff_expansion_factor: 4
113
+
114
+ # Multi-headed Attention Module's params
115
+ self_attention_model: rel_pos # rel_pos or abs_pos
116
+ n_heads: 8 # may need to be lower for smaller d_models
117
+ # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention
118
+ att_context_size: [-1, -1] # -1 means unlimited context
119
+ att_context_style: regular # regular or chunked_limited
120
+ xscaling: true # scales up the input embeddings by sqrt(d_model)
121
+ untie_biases: true # unties the biases of the TransformerXL layers
122
+ pos_emb_max_len: 5000
123
+
124
+ # Convolution module's params
125
+ conv_kernel_size: 31
126
+ conv_norm_type: 'batch_norm' # batch_norm or layer_norm or groupnormN (N specifies the number of groups)
127
+ # conv_context_size can be "causal" or a list of two integers such that conv_context_size[0]+conv_context_size[1]+1==conv_kernel_size
128
+ # null means [(kernel_size-1)//2, (kernel_size-1)//2], and 'causal' means [(kernel_size-1), 0]
129
+ conv_context_size: null
130
+
131
+ ### regularization
132
+ dropout: 0.1 # The dropout used in most of the Conformer Modules
133
+ dropout_emb: 0.0 # The dropout used for embeddings
134
+ dropout_att: 0.1 # The dropout for multi-headed attention modules
135
+
136
+ decoder:
137
+ _target_: nemo.collections.asr.modules.RNNTDecoder
138
+ normalization_mode: null # Currently only null is supported for export.
139
+ random_state_sampling: false # Random state sampling: https://arxiv.org/pdf/1910.11455.pdf
140
+ blank_as_pad: true # This flag must be set in order to support exporting of RNNT models + efficient inference.
141
+
142
+ prednet:
143
+ pred_hidden: ${model.model_defaults.pred_hidden}
144
+ pred_rnn_layers: 1
145
+ t_max: null
146
+ dropout: 0.2
147
+
148
+ joint:
149
+ _target_: nemo.collections.asr.modules.RNNTJoint
150
+ log_softmax: null # 'null' would set it automatically according to CPU/GPU device
151
+ preserve_memory: false # dramatically slows down training, but might preserve some memory
152
+
153
+ # Fuses the computation of prediction net + joint net + loss + WER calculation
154
+ # to be run on sub-batches of size `fused_batch_size`.
155
+ # When this flag is set to true, consider the `batch_size` of *_ds to be just `encoder` batch size.
156
+ # `fused_batch_size` is the actual batch size of the prediction net, joint net and transducer loss.
157
+ # Using small values here will preserve a lot of memory during training, but will make training slower as well.
158
+ # An optimal ratio of fused_batch_size : *_ds.batch_size is 1:1.
159
+ # However, to preserve memory, this ratio can be 1:8 or even 1:16.
160
+ # Extreme case of 1:B (i.e. fused_batch_size=1) should be avoided as training speed would be very slow.
161
+ fuse_loss_wer: true
162
+ fused_batch_size: 16
163
+
164
+ jointnet:
165
+ joint_hidden: ${model.model_defaults.joint_hidden}
166
+ activation: "relu"
167
+ dropout: 0.2
168
+
169
+ decoding:
170
+ strategy: "greedy_batch" # can be greedy, greedy_batch, beam, tsd, alsd.
171
+
172
+ # greedy strategy config
173
+ greedy:
174
+ max_symbols: 10
175
+
176
+ # beam strategy config
177
+ beam:
178
+ beam_size: 2
179
+ return_best_hypothesis: False
180
+ score_norm: true
181
+ tsd_max_sym_exp: 50 # for Time Synchronous Decoding
182
+ alsd_max_target_len: 2.0 # for Alignment-Length Synchronous Decoding
183
+
184
+ loss:
185
+ loss_name: "default"
186
+
187
+ warprnnt_numba_kwargs:
188
+ # FastEmit regularization: https://arxiv.org/abs/2010.11148
189
+ # You may enable FastEmit to reduce the latency of the model for streaming
190
+ fastemit_lambda: 0.0 # Recommended values are in the range [1e-4, 1e-2]; 0.001 is a good start.
191
+ clamp: -1.0 # if > 0, applies gradient clamping in range [-clamp, clamp] for the joint tensor only.
192
+
193
+ # Adds Gaussian noise to the gradients of the decoder to avoid overfitting
194
+ variational_noise:
195
+ start_step: 0
196
+ std: 0.0
197
+
198
+ optim:
199
+ name: adamw
200
+ lr: 5.0
201
+ # optimizer arguments
202
+ betas: [0.9, 0.98]
203
+ weight_decay: 1e-3
204
+
205
+ # scheduler setup
206
+ sched:
207
+ name: NoamAnnealing
208
+ d_model: ${model.encoder.d_model}
209
+ # scheduler config override
210
+ warmup_steps: 10000
211
+ warmup_ratio: null
212
+ min_lr: 1e-6
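
In these recipes the lr: 5.0 under optim is not a literal learning rate: with the NoamAnnealing scheduler it acts as a multiplier on the usual Noam curve, which warms up over warmup_steps and then decays with the inverse square root of the step. A rough sketch of the shape for the large config above (d_model=512, warmup_steps=10000); NeMo's exact implementation may differ in detail, e.g. in how min_lr is applied:

# Rough sketch of the Noam learning-rate curve behind the NoamAnnealing entry above.
def noam_lr(step, base_lr=5.0, d_model=512, warmup_steps=10000, min_lr=1e-6):
    step = max(step, 1)
    scale = d_model ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)
    return max(base_lr * scale, min_lr)

for s in (100, 10_000, 100_000):
    print(s, noam_lr(s))  # peaks around step 10k at roughly 2.2e-3
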
conf/conformer_transducer_bpe_xlarge.yaml ADDED
@@ -0,0 +1,196 @@
1
+ # This config contains the default values for training a Conformer-Transducer ASR model, XL size (~0.6B), with Transducer loss and sub-word encoding.
2
+
3
+ # You may find more info about Conformer-Transducer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#conformer-transducer
4
+ # Pre-trained models of Conformer-Transducer can be found here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/results.html
5
+ # The checkpoint of the xlarge model trained on NeMo ASRSET with this recipe can be found here: https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_transducer_xlarge
6
+
7
+ name: "Conformer-Transducer-BPE"
8
+
9
+ model:
10
+ sample_rate: 16000
11
+ compute_eval_loss: false # eval samples can be very long and exhaust memory. Disable computation of transducer loss during validation/testing with this flag.
12
+ log_prediction: true # enables logging sample predictions in the output during training
13
+ skip_nan_grad: false
14
+
15
+ model_defaults:
16
+ enc_hidden: ${model.encoder.d_model}
17
+ pred_hidden: 640
18
+ joint_hidden: 640
19
+
20
+ train_ds:
21
+ manifest_filepath: ???
22
+ sample_rate: ${model.sample_rate}
23
+ batch_size: 16 # you may increase batch_size if your memory allows
24
+ shuffle: true
25
+ num_workers: 8
26
+ pin_memory: true
27
+ use_start_end_token: false
28
+ trim_silence: false
29
+ max_duration: 16.7 # it is set for LibriSpeech, you may need to update it for your dataset
30
+ min_duration: 0.1
31
+ # tarred datasets
32
+ is_tarred: false
33
+ tarred_audio_filepaths: null
34
+ shuffle_n: 2048
35
+ # bucketing params
36
+ bucketing_strategy: "synced_randomized"
37
+ bucketing_batch_size: null
38
+
39
+ validation_ds:
40
+ manifest_filepath: ???
41
+ sample_rate: ${model.sample_rate}
42
+ batch_size: 16
43
+ shuffle: false
44
+ num_workers: 8
45
+ pin_memory: true
46
+ use_start_end_token: false
47
+
48
+ test_ds:
49
+ manifest_filepath: null
50
+ sample_rate: ${model.sample_rate}
51
+ batch_size: 16
52
+ shuffle: false
53
+ num_workers: 8
54
+ pin_memory: true
55
+ use_start_end_token: false
56
+
57
+ # You may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py
58
+ tokenizer:
59
+ dir: ??? # path to directory which contains either tokenizer.model (bpe) or vocab.txt (for wpe)
60
+ type: bpe # Can be either bpe (SentencePiece tokenizer) or wpe (WordPiece tokenizer)
61
+
62
+ preprocessor:
63
+ _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
64
+ sample_rate: ${model.sample_rate}
65
+ normalize: "per_feature"
66
+ window_size: 0.025
67
+ window_stride: 0.01
68
+ window: "hann"
69
+ features: 80
70
+ n_fft: 512
71
+ frame_splicing: 1
72
+ dither: 0.00001
73
+ pad_to: 0
74
+
75
+ spec_augment:
76
+ _target_: nemo.collections.asr.modules.SpectrogramAugmentation
77
+ freq_masks: 2 # set to zero to disable it
78
+ time_masks: 10 # set to zero to disable it
79
+ freq_width: 27
80
+ time_width: 0.05
81
+
82
+ encoder:
83
+ _target_: nemo.collections.asr.modules.ConformerEncoder
84
+ feat_in: ${model.preprocessor.features}
85
+ feat_out: -1 # you may set it if you need an output size different from the default d_model
86
+ n_layers: 24
87
+ d_model: 1024
88
+
89
+ # Sub-sampling params
90
+ subsampling: striding # vggnet, striding, stacking or stacking_norm, dw_striding
91
+ subsampling_factor: 4 # must be power of 2 for striding and vggnet
92
+ subsampling_conv_channels: -1 # set to -1 to make it equal to the d_model
93
+ causal_downsampling: false
94
+
95
+ # Feed forward module's params
96
+ ff_expansion_factor: 4
97
+
98
+ # Multi-headed Attention Module's params
99
+ self_attention_model: rel_pos # rel_pos or abs_pos
100
+ n_heads: 8 # may need to be lower for smaller d_models
101
+ # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention
102
+ att_context_size: [-1, -1] # -1 means unlimited context
103
+ att_context_style: regular # regular or chunked_limited
104
+ xscaling: true # scales up the input embeddings by sqrt(d_model)
105
+ untie_biases: true # unties the biases of the TransformerXL layers
106
+ pos_emb_max_len: 5000
107
+
108
+ # Convolution module's params
109
+ conv_kernel_size: 5
110
+ conv_norm_type: 'batch_norm' # batch_norm or layer_norm or groupnormN (N specifies the number of groups)
111
+ # conv_context_size can be "causal" or a list of two integers such that conv_context_size[0]+conv_context_size[1]+1==conv_kernel_size
112
+ # null means [(kernel_size-1)//2, (kernel_size-1)//2], and 'causal' means [(kernel_size-1), 0]
113
+ conv_context_size: null
114
+
115
+ ### regularization
116
+ dropout: 0.1 # The dropout used in most of the Conformer Modules
117
+ dropout_emb: 0.0 # The dropout used for embeddings
118
+ dropout_att: 0.1 # The dropout for multi-headed attention modules
119
+
120
+ decoder:
121
+ _target_: nemo.collections.asr.modules.RNNTDecoder
122
+ normalization_mode: null # Currently only null is supported for export.
123
+ random_state_sampling: false # Random state sampling: https://arxiv.org/pdf/1910.11455.pdf
124
+ blank_as_pad: true # This flag must be set in order to support exporting of RNNT models + efficient inference.
125
+
126
+ prednet:
127
+ pred_hidden: ${model.model_defaults.pred_hidden}
128
+ pred_rnn_layers: 2
129
+ t_max: null
130
+ dropout: 0.1
131
+
132
+ joint:
133
+ _target_: nemo.collections.asr.modules.RNNTJoint
134
+ log_softmax: null # 'null' would set it automatically according to CPU/GPU device
135
+ preserve_memory: false # dramatically slows down training, but might preserve some memory
136
+
137
+ # Fuses the computation of prediction net + joint net + loss + WER calculation
138
+ # to be run on sub-batches of size `fused_batch_size`.
139
+ # When this flag is set to true, consider the `batch_size` of *_ds to be just `encoder` batch size.
140
+ # `fused_batch_size` is the actual batch size of the prediction net, joint net and transducer loss.
141
+ # Using small values here will preserve a lot of memory during training, but will make training slower as well.
142
+ # An optimal ratio of fused_batch_size : *_ds.batch_size is 1:1.
143
+ # However, to preserve memory, this ratio can be 1:8 or even 1:16.
144
+ # Extreme case of 1:B (i.e. fused_batch_size=1) should be avoided as training speed would be very slow.
145
+ fuse_loss_wer: true
146
+ fused_batch_size: 16
147
+
148
+ jointnet:
149
+ joint_hidden: ${model.model_defaults.joint_hidden}
150
+ activation: "relu"
151
+ dropout: 0.1
152
+
153
+ decoding:
154
+ strategy: "greedy_batch" # can be greedy, greedy_batch, beam, tsd, alsd.
155
+
156
+ # greedy strategy config
157
+ greedy:
158
+ max_symbols: 10
159
+
160
+ # beam strategy config
161
+ beam:
162
+ beam_size: 2
163
+ return_best_hypothesis: False
164
+ score_norm: true
165
+ tsd_max_sym_exp: 50 # for Time Synchronous Decoding
166
+ alsd_max_target_len: 2.0 # for Alignment-Length Synchronous Decoding
167
+
168
+ loss:
169
+ loss_name: "default"
170
+
171
+ warprnnt_numba_kwargs:
172
+ # FastEmit regularization: https://arxiv.org/abs/2010.11148
173
+ # You may enable FastEmit to reduce the latency of the model for streaming
174
+ fastemit_lambda: 0.0 # Recommended values are in the range [1e-4, 1e-2]; 0.001 is a good start.
175
+ clamp: -1.0 # if > 0, applies gradient clamping in range [-clamp, clamp] for the joint tensor only.
176
+
177
+ # Adds Gaussian noise to the gradients of the decoder to avoid overfitting
178
+ variational_noise:
179
+ start_step: 0
180
+ std: 0.0
181
+
182
+ optim:
183
+ name: adamw
184
+ lr: 5.0
185
+ # optimizer arguments
186
+ betas: [0.9, 0.98]
187
+ weight_decay: 1e-3
188
+
189
+ # scheduler setup
190
+ sched:
191
+ name: NoamAnnealing
192
+ d_model: ${model.encoder.d_model}
193
+ # scheduler config override
194
+ warmup_steps: 10000
195
+ warmup_ratio: null
196
+ min_lr: 1e-6
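
A note on the fuse_loss_wer / fused_batch_size comments that appear in all three Conformer configs above: with fusion enabled, the encoder still runs on the full *_ds.batch_size, while the prediction net, joint net and transducer loss are run over sub-batches of fused_batch_size to keep the joint tensor small. A small sketch of the splitting that the 1:1 vs 1:8 ratios in the comments refer to (an illustration of the idea, not NeMo's code):

# Minimal sketch of the sub-batching described by fuse_loss_wer / fused_batch_size:
# the joint/loss computation walks over one encoder batch in chunks.
def sub_batches(batch_size, fused_batch_size):
    for start in range(0, batch_size, fused_batch_size):
        yield start, min(start + fused_batch_size, batch_size)

print(list(sub_batches(16, 16)))  # 1:1 ratio -> a single pass
print(list(sub_batches(16, 2)))   # 1:8 ratio -> eight smaller joint tensors
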
conf/contextnet_rnnt.yaml ADDED
@@ -0,0 +1,472 @@
1
+ # This config contains the default values for training a modified ContextNet model with Transducer loss and BPE-based vocabulary.
2
+ # In contrast to the original ContextNet, the same number of filters is used throughout the model.
3
+ # Default learning parameters in this config are set for effective batch size of 1k on 32 GPUs.
4
+ # To train it with smaller batch sizes, you may need to re-tune the learning parameters or use higher accumulate_grad_batches.
5
+
6
+ # It contains the default values for training a ContextNet ASR model, large size (~144M) with Transducer loss and sub-word encoding.
7
+
8
+ # Architecture and training config:
9
+ # Default learning parameters in this config are set for effective batch size of 1K. To train it with smaller effective
10
+ # batch sizes, you may need to re-tune the learning parameters or use higher accumulate_grad_batches.
11
+ # Here are the recommended configs for different variants of ContextNet, other parameters are the same as in this config file.
12
+ #
13
+ # +-------------+---------+------------+
14
+ # | Model | filters | time_masks |
15
+ # +=============+=========+============+
16
+ # | Small (14M)| 256 | 2 |
17
+ # +-------------+---------+------------+
18
+ # | Medium (40M)| 512 | 5 |
19
+ # +-------------+---------+------------+
20
+ # | Large (145M)| 1024 | 10 |
21
+ # +-------------------------------------
22
+
23
+ name: &name "ContextNet-8x-Stride-RNNT"
24
+
25
+ model:
26
+ sample_rate: 16000
27
+ compute_eval_loss: false # eval samples can be very long and exhaust memory. Disable computation of transducer loss during validation/testing with this flag.
28
+
29
+ train_ds:
30
+ manifest_filepath: ???
31
+ sample_rate: ${model.sample_rate}
32
+ batch_size: 16 # Can be increased if memory allows or when using smaller model
33
+ trim_silence: false
34
+ max_duration: 16.7
35
+ shuffle: true
36
+ use_start_end_token: false
37
+ num_workers: 16
38
+ pin_memory: true
39
+ # tarred datasets
40
+ is_tarred: false
41
+ tarred_audio_filepaths: null
42
+ tarred_shard_strategy: "scatter"
43
+ shuffle_n: 2048
44
+ # bucketing params
45
+ bucketing_strategy: "synced_randomized"
46
+ bucketing_batch_size: null
47
+ validation_ds:
48
+ manifest_filepath: ???
49
+ sample_rate: ${model.sample_rate}
50
+ batch_size: 8
51
+ shuffle: false
52
+ use_start_end_token: false
53
+ num_workers: 16
54
+ pin_memory: true
55
+
56
+ test_ds:
57
+ manifest_filepath: null
58
+ sample_rate: ${model.sample_rate}
59
+ batch_size: 8
60
+ shuffle: false
61
+ use_start_end_token: false
62
+ num_workers: 16
63
+ pin_memory: true
64
+
65
+ model_defaults:
66
+ filters: 1024
67
+ repeat: 5
68
+ dropout: 0.1
69
+ separable: true
70
+ se: true
71
+ se_context_size: -1
72
+ kernel_size_factor: 1.0
73
+ # encoder / decoder / joint values
74
+ enc_hidden: 640
75
+ pred_hidden: 640
76
+ joint_hidden: 640
77
+
78
+ tokenizer:
79
+ dir: ??? # path to directory which contains either tokenizer.model (bpe) or vocab.txt (for wpe)
80
+ type: ??? # Can be either bpe or wpe
81
+
82
+ preprocessor:
83
+ _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
84
+ sample_rate: ${model.sample_rate}
85
+ normalize: "per_feature"
86
+ window_size: 0.025
87
+ window_stride: 0.01
88
+ window: "hann"
89
+ features: &n_mels 80
90
+ n_fft: 512
91
+ frame_splicing: 1
92
+ dither: 0.00001
93
+ pad_to: 16
94
+ stft_conv: false
95
+
96
+ spec_augment:
97
+ _target_: nemo.collections.asr.modules.SpectrogramAugmentation
98
+ freq_masks: 2 # should be kept at 2
99
+ time_masks: 10 # can be 5 for small-med models, 10 for larger models.
100
+ freq_width: 27
101
+ time_width: 0.05
102
+
103
+ encoder:
104
+ _target_: nemo.collections.asr.modules.ConvASREncoder
105
+ feat_in: *n_mels
106
+ activation: swish
107
+ conv_mask: true
108
+ init_mode: "tds_uniform"
109
+
110
+ jasper:
111
+ - filters: ${model.model_defaults.filters}
112
+ repeat: 1
113
+ kernel: [5]
114
+ stride: [1]
115
+ dilation: [1]
116
+ dropout: 0.0
117
+ residual: false
118
+ separable: ${model.model_defaults.separable}
119
+ se: ${model.model_defaults.se}
120
+ se_context_size: ${model.model_defaults.se_context_size}
121
+
122
+ - filters: ${model.model_defaults.filters}
123
+ repeat: ${model.model_defaults.repeat}
124
+ kernel: [5]
125
+ stride: [1]
126
+ dilation: [1]
127
+ dropout: ${model.model_defaults.dropout}
128
+ residual: true
129
+ separable: ${model.model_defaults.separable}
130
+ se: ${model.model_defaults.se}
131
+ se_context_size: ${model.model_defaults.se_context_size}
132
+ kernel_size_factor: ${model.model_defaults.kernel_size_factor}
133
+
134
+ - filters: ${model.model_defaults.filters}
135
+ repeat: ${model.model_defaults.repeat}
136
+ kernel: [5]
137
+ stride: [1]
138
+ dilation: [1]
139
+ dropout: ${model.model_defaults.dropout}
140
+ residual: true
141
+ separable: ${model.model_defaults.separable}
142
+ se: ${model.model_defaults.se}
143
+ se_context_size: ${model.model_defaults.se_context_size}
144
+ kernel_size_factor: ${model.model_defaults.kernel_size_factor}
145
+
146
+ - filters: ${model.model_defaults.filters}
147
+ repeat: ${model.model_defaults.repeat}
148
+ kernel: [5]
149
+ stride: [2]
150
+ dilation: [1]
151
+ dropout: ${model.model_defaults.dropout}
152
+ residual: true
153
+ separable: ${model.model_defaults.separable}
154
+ se: ${model.model_defaults.se}
155
+ se_context_size: ${model.model_defaults.se_context_size}
156
+ stride_last: true
157
+ residual_mode: "stride_add"
158
+ kernel_size_factor: ${model.model_defaults.kernel_size_factor}
159
+
160
+ - filters: ${model.model_defaults.filters}
161
+ repeat: ${model.model_defaults.repeat}
162
+ kernel: [5]
163
+ stride: [1]
164
+ dilation: [1]
165
+ dropout: ${model.model_defaults.dropout}
166
+ residual: true
167
+ separable: ${model.model_defaults.separable}
168
+ se: ${model.model_defaults.se}
169
+ se_context_size: ${model.model_defaults.se_context_size}
170
+ kernel_size_factor: ${model.model_defaults.kernel_size_factor}
171
+
172
+ - filters: ${model.model_defaults.filters}
173
+ repeat: ${model.model_defaults.repeat}
174
+ kernel: [5]
175
+ stride: [1]
176
+ dilation: [1]
177
+ dropout: ${model.model_defaults.dropout}
178
+ residual: true
179
+ separable: ${model.model_defaults.separable}
180
+ se: ${model.model_defaults.se}
181
+ se_context_size: ${model.model_defaults.se_context_size}
182
+ kernel_size_factor: ${model.model_defaults.kernel_size_factor}
183
+
184
+ - filters: ${model.model_defaults.filters}
185
+ repeat: ${model.model_defaults.repeat}
186
+ kernel: [5]
187
+ stride: [1]
188
+ dilation: [1]
189
+ dropout: ${model.model_defaults.dropout}
190
+ residual: true
191
+ separable: ${model.model_defaults.separable}
192
+ se: ${model.model_defaults.se}
193
+ se_context_size: ${model.model_defaults.se_context_size}
194
+ kernel_size_factor: ${model.model_defaults.kernel_size_factor}
195
+
196
+ - filters: ${model.model_defaults.filters}
197
+ repeat: ${model.model_defaults.repeat}
198
+ kernel: [5]
199
+ stride: [2] # *stride
200
+ dilation: [1]
201
+ dropout: ${model.model_defaults.dropout}
202
+ residual: true
203
+ separable: ${model.model_defaults.separable}
204
+ se: ${model.model_defaults.se}
205
+ se_context_size: ${model.model_defaults.se_context_size}
206
+ stride_last: true
207
+ residual_mode: "stride_add"
208
+ kernel_size_factor: ${model.model_defaults.kernel_size_factor}
209
+
210
+ - filters: ${model.model_defaults.filters}
211
+ repeat: ${model.model_defaults.repeat}
212
+ kernel: [5]
213
+ stride: [1]
214
+ dilation: [1]
215
+ dropout: ${model.model_defaults.dropout}
216
+ residual: true
217
+ separable: ${model.model_defaults.separable}
218
+ se: ${model.model_defaults.se}
219
+ se_context_size: ${model.model_defaults.se_context_size}
220
+ kernel_size_factor: ${model.model_defaults.kernel_size_factor}
221
+
222
+ - filters: ${model.model_defaults.filters}
223
+ repeat: ${model.model_defaults.repeat}
224
+ kernel: [5]
225
+ stride: [1]
226
+ dilation: [1]
227
+ dropout: ${model.model_defaults.dropout}
228
+ residual: true
229
+ separable: ${model.model_defaults.separable}
230
+ se: ${model.model_defaults.se}
231
+ se_context_size: ${model.model_defaults.se_context_size}
232
+ kernel_size_factor: ${model.model_defaults.kernel_size_factor}
233
+
234
+ - filters: ${model.model_defaults.filters}
235
+ repeat: ${model.model_defaults.repeat}
236
+ kernel: [5]
237
+ stride: [1]
238
+ dilation: [1]
239
+ dropout: ${model.model_defaults.dropout}
240
+ residual: true
241
+ separable: ${model.model_defaults.separable}
242
+ se: ${model.model_defaults.se}
243
+ se_context_size: ${model.model_defaults.se_context_size}
244
+ kernel_size_factor: ${model.model_defaults.kernel_size_factor}
245
+
246
+ - filters: ${model.model_defaults.filters}
247
+ repeat: ${model.model_defaults.repeat}
248
+ kernel: [5]
249
+ stride: [1]
250
+ dilation: [1]
251
+ dropout: ${model.model_defaults.dropout}
252
+ residual: true
253
+ separable: ${model.model_defaults.separable}
254
+ se: ${model.model_defaults.se}
255
+ se_context_size: ${model.model_defaults.se_context_size}
256
+ kernel_size_factor: ${model.model_defaults.kernel_size_factor}
257
+
258
+ - filters: ${model.model_defaults.filters}
259
+ repeat: ${model.model_defaults.repeat}
260
+ kernel: [5]
261
+ stride: [1]
262
+ dilation: [1]
263
+ dropout: ${model.model_defaults.dropout}
264
+ residual: true
265
+ separable: ${model.model_defaults.separable}
266
+ se: ${model.model_defaults.se}
267
+ se_context_size: ${model.model_defaults.se_context_size}
268
+ kernel_size_factor: ${model.model_defaults.kernel_size_factor}
269
+
270
+ - filters: ${model.model_defaults.filters}
271
+ repeat: ${model.model_defaults.repeat}
272
+ kernel: [5]
273
+ stride: [1]
274
+ dilation: [1]
275
+ dropout: ${model.model_defaults.dropout}
276
+ residual: true
277
+ separable: ${model.model_defaults.separable}
278
+ se: ${model.model_defaults.se}
279
+ se_context_size: ${model.model_defaults.se_context_size}
280
+ kernel_size_factor: ${model.model_defaults.kernel_size_factor}
281
+
282
+ - filters: ${model.model_defaults.filters}
283
+ repeat: ${model.model_defaults.repeat}
284
+ kernel: [5]
285
+ stride: [2] # stride
286
+ dilation: [1]
287
+ dropout: ${model.model_defaults.dropout}
288
+ residual: true
289
+ separable: ${model.model_defaults.separable}
290
+ se: ${model.model_defaults.se}
291
+ se_context_size: ${model.model_defaults.se_context_size}
292
+ stride_last: true
293
+ residual_mode: "stride_add"
294
+ kernel_size_factor: ${model.model_defaults.kernel_size_factor}
295
+
296
+ - filters: ${model.model_defaults.filters}
297
+ repeat: ${model.model_defaults.repeat}
298
+ kernel: [5]
299
+ stride: [1]
300
+ dilation: [1]
301
+ dropout: ${model.model_defaults.dropout}
302
+ residual: true
303
+ separable: ${model.model_defaults.separable}
304
+ se: ${model.model_defaults.se}
305
+ se_context_size: ${model.model_defaults.se_context_size}
306
+ kernel_size_factor: ${model.model_defaults.kernel_size_factor}
307
+
308
+ - filters: ${model.model_defaults.filters}
309
+ repeat: ${model.model_defaults.repeat}
310
+ kernel: [5]
311
+ stride: [1]
312
+ dilation: [1]
313
+ dropout: ${model.model_defaults.dropout}
314
+ residual: true
315
+ separable: ${model.model_defaults.separable}
316
+ se: ${model.model_defaults.se}
317
+ se_context_size: ${model.model_defaults.se_context_size}
318
+ kernel_size_factor: ${model.model_defaults.kernel_size_factor}
319
+
320
+ - filters: ${model.model_defaults.filters}
321
+ repeat: ${model.model_defaults.repeat}
322
+ kernel: [5]
323
+ stride: [1]
324
+ dilation: [1]
325
+ dropout: ${model.model_defaults.dropout}
326
+ residual: true
327
+ separable: ${model.model_defaults.separable}
328
+ se: ${model.model_defaults.se}
329
+ se_context_size: ${model.model_defaults.se_context_size}
330
+ kernel_size_factor: ${model.model_defaults.kernel_size_factor}
331
+
332
+ - filters: ${model.model_defaults.filters}
333
+ repeat: ${model.model_defaults.repeat}
334
+ kernel: [5]
335
+ stride: [1]
336
+ dilation: [1]
337
+ dropout: ${model.model_defaults.dropout}
338
+ residual: true
339
+ separable: ${model.model_defaults.separable}
340
+ se: ${model.model_defaults.se}
341
+ se_context_size: ${model.model_defaults.se_context_size}
342
+ kernel_size_factor: ${model.model_defaults.kernel_size_factor}
343
+
344
+ - filters: ${model.model_defaults.filters}
345
+ repeat: ${model.model_defaults.repeat}
346
+ kernel: [5]
347
+ stride: [1]
348
+ dilation: [1]
349
+ dropout: ${model.model_defaults.dropout}
350
+ residual: true
351
+ separable: ${model.model_defaults.separable}
352
+ se: ${model.model_defaults.se}
353
+ se_context_size: ${model.model_defaults.se_context_size}
354
+ kernel_size_factor: ${model.model_defaults.kernel_size_factor}
355
+
356
+ - filters: ${model.model_defaults.filters}
357
+ repeat: ${model.model_defaults.repeat}
358
+ kernel: [5]
359
+ stride: [1]
360
+ dilation: [1]
361
+ dropout: ${model.model_defaults.dropout}
362
+ residual: true
363
+ separable: ${model.model_defaults.separable}
364
+ se: ${model.model_defaults.se}
365
+ se_context_size: ${model.model_defaults.se_context_size}
366
+ kernel_size_factor: ${model.model_defaults.kernel_size_factor}
367
+
368
+ - filters: ${model.model_defaults.filters}
369
+ repeat: ${model.model_defaults.repeat}
370
+ kernel: [5]
371
+ stride: [1]
372
+ dilation: [1]
373
+ dropout: ${model.model_defaults.dropout}
374
+ residual: true
375
+ separable: ${model.model_defaults.separable}
376
+ se: ${model.model_defaults.se}
377
+ se_context_size: ${model.model_defaults.se_context_size}
378
+ kernel_size_factor: ${model.model_defaults.kernel_size_factor}
379
+
380
+ - filters: ${model.model_defaults.enc_hidden}
381
+ repeat: 1
382
+ kernel: [5]
383
+ stride: [1]
384
+ dilation: [1]
385
+ dropout: 0.0
386
+ residual: false
387
+ separable: ${model.model_defaults.separable}
388
+ se: ${model.model_defaults.se}
389
+ se_context_size: ${model.model_defaults.se_context_size}
390
+ kernel_size_factor: ${model.model_defaults.kernel_size_factor}
391
+
392
+
393
+ decoder:
394
+ _target_: nemo.collections.asr.modules.RNNTDecoder
395
+ normalization_mode: null # Currently only null is supported for export.
396
+ random_state_sampling: false # Random state sampling: https://arxiv.org/pdf/1910.11455.pdf
397
+ blank_as_pad: true # This flag must be set in order to support exporting of RNNT models + efficient inference.
398
+
399
+ prednet:
400
+ pred_hidden: ${model.model_defaults.pred_hidden}
401
+ pred_rnn_layers: 1 # only 1 layer LSTM networks are exportable.
402
+ t_max: null # Maximum possible target seq length used for Chrono Initialization - https://arxiv.org/abs/1804.11188. Disabled by default.
403
+ dropout: 0.1
404
+
405
+ joint:
406
+ _target_: nemo.collections.asr.modules.RNNTJoint
407
+ log_softmax: null # sets it according to cpu/gpu device
408
+ preserve_memory: false # dramatically slows down training, but might preserve some memory
409
+
410
+ # Fuses the computation of prediction net + joint net + loss + WER calculation
411
+ # to be run on sub-batches of size `fused_batch_size`.
412
+ # When this flag is set to true, consider the `batch_size` of *_ds to be just `encoder` batch size.
413
+ # `fused_batch_size` is the actual batch size of the prediction net, joint net and transducer loss.
414
+ # Using small values here will preserve a lot of memory during training, but will make training slower as well.
415
+ # An optimal ratio of fused_batch_size : *_ds.batch_size is 1:1.
416
+ # However, to preserve memory, this ratio can be 1:8 or even 1:16.
417
+ # Extreme case of 1:B (i.e. fused_batch_size=1) should be avoided as training speed would be very slow.
418
+ fuse_loss_wer: true
419
+ fused_batch_size: 16
420
+
421
+ jointnet:
422
+ joint_hidden: ${model.model_defaults.joint_hidden}
423
+ activation: "relu"
424
+ dropout: 0.1
425
+
426
+ # RNNT decoding strategy
427
+ decoding:
428
+ strategy: "greedy_batch" # can be greedy, greedy_batch, beam, tsd, alsd.
429
+
430
+ # greedy strategy config
431
+ greedy:
432
+ max_symbols: 10
433
+
434
+ # beam strategy config
435
+ beam:
436
+ beam_size: 4
437
+ score_norm: true
438
+ return_best_hypothesis: False
439
+ softmax_temperature: 1.0 # scale the logits by some temperature prior to softmax
440
+ tsd_max_sym_exp: 10 # for Time Synchronous Decoding, int > 0
441
+ alsd_max_target_len: 5.0 # for Alignment-Length Synchronous Decoding, float > 1.0
442
+ maes_num_steps: 2 # for modified Adaptive Expansion Search, int > 0
443
+ maes_prefix_alpha: 1 # for modified Adaptive Expansion Search, int > 0
444
+ maes_expansion_beta: 2 # for modified Adaptive Expansion Search, int >= 0
445
+ maes_expansion_gamma: 2.3 # for modified Adaptive Expansion Search, float >= 0
446
+
447
+ # RNNT loss config
448
+ loss:
449
+ loss_name: "default"
450
+
451
+ warprnnt_numba_kwargs:
452
+ # FastEmit regularization: https://arxiv.org/abs/2010.11148
453
+ fastemit_lambda: 0.001 # Values can be in the range [1e-4, 1e-2]. Generally, 0.001 is a good start.
454
+ clamp: -1.0 # if > 0, applies gradient clamping in range [-clamp, clamp] for the joint tensor only.
455
+
456
+ optim:
457
+ name: novograd
458
+ lr: 0.05
459
+
460
+ # optimizer arguments
461
+ betas: [0.9, 0.0]
462
+ weight_decay: 0.001
463
+
464
+ # scheduler setup
465
+ sched:
466
+ name: CosineAnnealing
467
+
468
+ # scheduler config override
469
+ warmup_steps: 5000
470
+ warmup_ratio: null
471
+ min_lr: 1e-6
472
+ last_epoch: -1
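
The "8x-Stride" in the model name matches the jasper block list above: three of the blocks use stride: [2] (with stride_last and residual_mode: "stride_add"), every other block uses stride 1, so the encoder reduces the frame rate by 2^3 = 8. With the preprocessor's 10 ms window_stride that is one encoder frame every 80 ms:

# Sanity-check sketch for the overall stride of the ContextNet encoder above.
window_stride_s = 0.01    # preprocessor window_stride
stride_two_blocks = 3     # blocks in the jasper list with stride: [2]
downsampling = 2 ** stride_two_blocks
print(downsampling)                                                    # -> 8
print(window_stride_s * downsampling * 1000, "ms per encoder frame")   # -> 80.0
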
conf/contextnet_rnnt_dummy.yaml ADDED
@@ -0,0 +1,197 @@
1
+ # This config contains the values for training a dummy ContextNet model with Transducer loss and BPE-based vocabulary.
2
+ # In contrast to the original ContextNet, the same number of filters is used throughout the model.
3
+ # To train it with smaller batch sizes, you may need to re-tune the learning parameters or use higher accumulate_grad_batches.
4
+
5
+ # It contains the default values for training a ContextNet ASR model, dummy size, with Transducer loss and sub-word encoding.
6
+
7
+ name: &name "ContextNet-8x-Stride-RNNT"
8
+
9
+ model:
10
+ sample_rate: 16000
11
+ compute_eval_loss: false # eval samples can be very long and exhaust memory. Disable computation of transducer loss during validation/testing with this flag.
12
+
13
+ train_ds:
14
+ manifest_filepath: ???
15
+ sample_rate: ${model.sample_rate}
16
+ batch_size: 4 # Can be increased if memory allows or when using smaller model
17
+ trim_silence: false
18
+ max_duration: 16.7
19
+ shuffle: true
20
+ use_start_end_token: false
21
+ num_workers: 16
22
+ pin_memory: true
23
+ # tarred datasets
24
+ is_tarred: false
25
+ tarred_audio_filepaths: null
26
+ tarred_shard_strategy: "scatter"
27
+ shuffle_n: 2048
28
+ # bucketing params
29
+ bucketing_strategy: "synced_randomized"
30
+ bucketing_batch_size: null
31
+ validation_ds:
32
+ manifest_filepath: ???
33
+ sample_rate: ${model.sample_rate}
34
+ batch_size: 8
35
+ shuffle: false
36
+ use_start_end_token: false
37
+ num_workers: 16
38
+ pin_memory: true
39
+
40
+ test_ds:
41
+ manifest_filepath: null
42
+ sample_rate: ${model.sample_rate}
43
+ batch_size: 8
44
+ shuffle: false
45
+ use_start_end_token: false
46
+ num_workers: 16
47
+ pin_memory: true
48
+
49
+ model_defaults:
50
+ filters: 64
51
+ repeat: 1
52
+ dropout: 0.1
53
+ separable: true
54
+ se: true
55
+ se_context_size: -1
56
+ kernel_size_factor: 1.0
57
+ # encoder / decoder / joint values
58
+ enc_hidden: 64
59
+ pred_hidden: 64
60
+ joint_hidden: 64
61
+
62
+ tokenizer:
63
+ dir: ??? # path to directory which contains either tokenizer.model (bpe) or vocab.txt (for wpe)
64
+ type: ??? # Can be either bpe or wpe
65
+
66
+ preprocessor:
67
+ _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
68
+ sample_rate: ${model.sample_rate}
69
+ normalize: "per_feature"
70
+ window_size: 0.025
71
+ window_stride: 0.01
72
+ window: "hann"
73
+ features: &n_mels 80
74
+ n_fft: 512
75
+ frame_splicing: 1
76
+ dither: 0.00001
77
+ pad_to: 16
78
+ stft_conv: false
79
+
80
+ spec_augment:
81
+ _target_: nemo.collections.asr.modules.SpectrogramAugmentation
82
+ freq_masks: 2 # should be kept at 2
83
+ time_masks: 10 # can be 5 for small-med models, 10 for larger models.
84
+ freq_width: 27
85
+ time_width: 0.05
86
+
87
+ encoder:
88
+ _target_: nemo.collections.asr.modules.ConvASREncoder
89
+ feat_in: *n_mels
90
+ activation: swish
91
+ conv_mask: true
92
+ init_mode: "tds_uniform"
93
+
94
+ jasper:
95
+ - filters: ${model.model_defaults.filters}
96
+ repeat: 1
97
+ kernel: [5]
98
+ stride: [1]
99
+ dilation: [1]
100
+ dropout: 0.0
101
+ residual: false
102
+ separable: ${model.model_defaults.separable}
103
+ se: ${model.model_defaults.se}
104
+ se_context_size: ${model.model_defaults.se_context_size}
105
+
106
+ - filters: ${model.model_defaults.filters}
107
+ repeat: ${model.model_defaults.repeat}
108
+ kernel: [5]
109
+ stride: [1]
110
+ dilation: [1]
111
+ dropout: 0.0
112
+ residual: true
113
+ separable: ${model.model_defaults.separable}
114
+ se: ${model.model_defaults.se}
115
+ se_context_size: ${model.model_defaults.se_context_size}
116
+ kernel_size_factor: ${model.model_defaults.kernel_size_factor}
117
+
118
+ decoder:
119
+ _target_: nemo.collections.asr.modules.RNNTDecoder
120
+ normalization_mode: null # Currently only null is supported for export.
121
+ random_state_sampling: false # Random state sampling: https://arxiv.org/pdf/1910.11455.pdf
122
+ blank_as_pad: true # This flag must be set in order to support exporting of RNNT models + efficient inference.
123
+
124
+ prednet:
125
+ pred_hidden: ${model.model_defaults.pred_hidden}
126
+ pred_rnn_layers: 1 # only 1 layer LSTM networks are exportable.
127
+ t_max: null # Maximum possible target seq length used for Chrono Initialization - https://arxiv.org/abs/1804.11188. Disabled by default.
128
+ dropout: 0.1
129
+
130
+ joint:
131
+ _target_: nemo.collections.asr.modules.RNNTJoint
132
+ log_softmax: null # sets it according to cpu/gpu device
133
+ preserve_memory: false # dramatically slows down training, but might preserve some memory
134
+
135
+ # Fuses the computation of prediction net + joint net + loss + WER calculation
136
+ # to be run on sub-batches of size `fused_batch_size`.
137
+ # When this flag is set to true, consider the `batch_size` of *_ds to be just `encoder` batch size.
138
+ # `fused_batch_size` is the actual batch size of the prediction net, joint net and transducer loss.
139
+ # Using small values here will preserve a lot of memory during training, but will make training slower as well.
140
+ # An optimal ratio of fused_batch_size : *_ds.batch_size is 1:1.
141
+ # However, to preserve memory, this ratio can be 1:8 or even 1:16.
142
+ # Extreme case of 1:B (i.e. fused_batch_size=1) should be avoided as training speed would be very slow.
143
+ fuse_loss_wer: true
144
+ fused_batch_size: 16
145
+
146
+ jointnet:
147
+ joint_hidden: ${model.model_defaults.joint_hidden}
148
+ activation: "relu"
149
+ dropout: 0.1
150
+
151
+ # RNNT decoding strategy
152
+ decoding:
153
+ strategy: "greedy_batch" # can be greedy, greedy_batch, beam, tsd, alsd.
154
+
155
+ # greedy strategy config
156
+ greedy:
157
+ max_symbols: 10
158
+
159
+ # beam strategy config
160
+ beam:
161
+ beam_size: 4
162
+ score_norm: true
163
+ return_best_hypothesis: False
164
+ softmax_temperature: 1.0 # scale the logits by some temperature prior to softmax
165
+ tsd_max_sym_exp: 10 # for Time Synchronous Decoding, int > 0
166
+ alsd_max_target_len: 5.0 # for Alignment-Length Synchronous Decoding, float > 1.0
167
+ maes_num_steps: 2 # for modified Adaptive Expansion Search, int > 0
168
+ maes_prefix_alpha: 1 # for modified Adaptive Expansion Search, int > 0
169
+ maes_expansion_beta: 2 # for modified Adaptive Expansion Search, int >= 0
170
+ maes_expansion_gamma: 2.3 # for modified Adaptive Expansion Search, float >= 0
171
+
172
+ # RNNT loss config
173
+ loss:
174
+ loss_name: "default"
175
+
176
+ warprnnt_numba_kwargs:
177
+ # FastEmit regularization: https://arxiv.org/abs/2010.11148
178
+ fastemit_lambda: 0.001 # Values can be in range [1e-4, 1e-2]. Generally, 0.001 is a good start.
179
+ clamp: -1.0 # if > 0, applies gradient clamping in range [-clamp, clamp] for the joint tensor only.
180
+
181
+ optim:
182
+ name: novograd
183
+ lr: 0.05
184
+
185
+ # optimizer arguments
186
+ betas: [0.9, 0.0]
187
+ weight_decay: 0.001
188
+
189
+ # scheduler setup
190
+ sched:
191
+ name: CosineAnnealing
192
+
193
+ # scheduler config override
194
+ warmup_steps: 5000
195
+ warmup_ratio: null
196
+ min_lr: 1e-6
197
+ last_epoch: -1
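+
+ # Note: when this config is consumed by run_speech_recognition_rnnt.py in this repo, only the
+ # `model` section above is loaded and the `???` fields are filled in programmatically, roughly
+ # (a sketch; the tokenizer path below is a placeholder):
+ #   config = OmegaConf.load("conf/contextnet_rnnt_dummy.yaml").model
+ #   config.train_ds = config.validation_ds = config.test_ds = None  # HF `datasets` replaces the NeMo loaders
+ #   config.tokenizer.dir = "tokenizer/tokenizer_spe_bpe_v1024"      # placeholder path
+ #   config.tokenizer.type = "bpe"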
eval_results.json ADDED
@@ -0,0 +1,9 @@
1
+ {
2
+ "epoch": 7.38,
3
+ "eval_loss": 8.706663131713867,
4
+ "eval_runtime": 970.2156,
5
+ "eval_samples": 13098,
6
+ "eval_samples_per_second": 13.5,
7
+ "eval_steps_per_second": 3.376,
8
+ "eval_wer": 0.20430683297635546
9
+ }
models/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .modeling_rnnt import RNNTBPEModel
models/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (197 Bytes).
 
models/__pycache__/modeling_rnnt.cpython-39.pyc ADDED
Binary file (4.46 kB).
 
models/modeling_rnnt.py ADDED
@@ -0,0 +1,115 @@
1
+ from dataclasses import dataclass
2
+ from typing import Optional
3
+
4
+ import torch
5
+ from nemo.collections.asr.models import EncDecRNNTBPEModel
6
+ from omegaconf import DictConfig
7
+ from transformers.utils import ModelOutput
8
+
9
+
10
+ @dataclass
11
+ class RNNTOutput(ModelOutput):
12
+ """
13
+ Base class for RNNT outputs.
14
+ """
15
+
16
+ loss: Optional[torch.FloatTensor] = None
17
+ wer: Optional[float] = None
18
+ wer_num: Optional[float] = None
19
+ wer_denom: Optional[float] = None
20
+
21
+
22
+ # Adapted from https://github.com/NVIDIA/NeMo/blob/66c7677cd4a68d78965d4905dd1febbf5385dff3/nemo/collections/asr/models/rnnt_bpe_models.py#L33
23
+ class RNNTBPEModel(EncDecRNNTBPEModel):
24
+ def __init__(self, cfg: DictConfig):
25
+ super().__init__(cfg=cfg, trainer=None)
26
+
27
+ def encoding(
28
+ self, input_signal=None, input_signal_length=None, processed_signal=None, processed_signal_length=None
29
+ ):
30
+ """
31
+ Forward pass of the acoustic model. Note that for RNNT Models, the forward pass of the model is a 3 step process,
32
+ and this method only performs the first step - forward of the acoustic model.
33
+
34
+ Please refer to the `forward` in order to see the full `forward` step for training - which
35
+ performs the forward of the acoustic model, the prediction network and then the joint network.
36
+ Finally, it computes the loss and possibly computes the detokenized text via the `decoding` step.
37
+
38
+ Please refer to the `validation_step` in order to see the full `forward` step for inference - which
39
+ performs the forward of the acoustic model, the prediction network and then the joint network.
40
+ Finally, it computes the decoded tokens via the `decoding` step and possibly computes the batch metrics.
41
+
42
+ Args:
43
+ input_signal: Tensor that represents a batch of raw audio signals,
44
+ of shape [B, T]. T here represents timesteps, with 1 second of audio represented as
45
+ `self.sample_rate` number of floating point values.
46
+ input_signal_length: Vector of length B, that contains the individual lengths of the audio
47
+ sequences.
48
+ processed_signal: Tensor that represents a batch of processed audio signals,
49
+ of shape (B, D, T) that has undergone processing via some DALI preprocessor.
50
+ processed_signal_length: Vector of length B, that contains the individual lengths of the
51
+ processed audio sequences.
52
+
53
+ Returns:
54
+ A tuple of 2 elements -
55
+ 1) The log probabilities tensor of shape [B, T, D].
56
+ 2) The lengths of the acoustic sequence after propagation through the encoder, of shape [B].
57
+ """
58
+ has_input_signal = input_signal is not None and input_signal_length is not None
59
+ has_processed_signal = processed_signal is not None and processed_signal_length is not None
60
+ if (has_input_signal ^ has_processed_signal) is False:
61
+ raise ValueError(
62
+ f"{self} Arguments ``input_signal`` and ``input_signal_length`` are mutually exclusive "
63
+ " with ``processed_signal`` and ``processed_signal_length`` arguments."
64
+ )
65
+
66
+ if not has_processed_signal:
67
+ processed_signal, processed_signal_length = self.preprocessor(
68
+ input_signal=input_signal, length=input_signal_length,
69
+ )
70
+
71
+ # Spec augment is not applied during evaluation/testing
72
+ if self.spec_augmentation is not None and self.training:
73
+ processed_signal = self.spec_augmentation(input_spec=processed_signal, length=processed_signal_length)
74
+
75
+ encoded, encoded_len = self.encoder(audio_signal=processed_signal, length=processed_signal_length)
76
+ return encoded, encoded_len
77
+
78
+ def forward(self, input_ids, input_lengths=None, labels=None, label_lengths=None):
79
+ # encoding() only performs encoder forward
80
+ encoded, encoded_len = self.encoding(input_signal=input_ids, input_signal_length=input_lengths)
81
+ del input_ids
82
+
83
+ # During training, loss must be computed, so decoder forward is necessary
84
+ decoder, target_length, states = self.decoder(targets=labels, target_length=label_lengths)
85
+
86
+ # If experimental fused Joint-Loss-WER is not used
87
+ if not self.joint.fuse_loss_wer:
88
+ # Compute full joint and loss
89
+ joint = self.joint(encoder_outputs=encoded, decoder_outputs=decoder)
90
+ loss_value = self.loss(
91
+ log_probs=joint, targets=labels, input_lengths=encoded_len, target_lengths=target_length
92
+ )
93
+ # Add auxiliary losses, if registered
94
+ loss_value = self.add_auxiliary_losses(loss_value)
95
+ wer = wer_num = wer_denom = None
96
+ if not self.training:
97
+ self.wer.update(encoded, encoded_len, labels, target_length)
98
+ wer, wer_num, wer_denom = self.wer.compute()
99
+ self.wer.reset()
100
+
101
+ else:
102
+ # If experimental fused Joint-Loss-WER is used
103
+ # Fused joint step
104
+ loss_value, wer, wer_num, wer_denom = self.joint(
105
+ encoder_outputs=encoded,
106
+ decoder_outputs=decoder,
107
+ encoder_lengths=encoded_len,
108
+ transcripts=labels,
109
+ transcript_lengths=label_lengths,
110
+ compute_wer=not self.training,
111
+ )
112
+ # Add auxiliary losses, if registered
113
+ loss_value = self.add_auxiliary_losses(loss_value)
114
+
115
+ return RNNTOutput(loss=loss_value, wer=wer, wer_num=wer_num, wer_denom=wer_denom)
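+
+
+ # Illustrative usage (a sketch, not invoked anywhere in this repo): `forward` expects exactly the
+ # keys produced by `NeMoDataCollator` in run_speech_recognition_rnnt.py. Assuming `cfg` is the
+ # `model` section of one of the YAML files in conf/ with the tokenizer fields filled in:
+ #
+ #   model = RNNTBPEModel(cfg=cfg)
+ #   outputs = model(
+ #       input_ids=audio,           # float tensor [B, T] of raw waveform samples
+ #       input_lengths=audio_lens,  # int tensor [B]
+ #       labels=token_ids,          # int tensor [B, U]
+ #       label_lengths=token_lens,  # int tensor [B]
+ #   )
+ #   outputs.loss, outputs.wer, outputs.wer_num, outputs.wer_denom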
process_asr_text_tokenizer.py ADDED
@@ -0,0 +1,221 @@
1
+ # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # USAGE: python process_asr_text_tokenizer.py --manifest=<path to train manifest files, separated by commas> \
16
+ # --data_root="<output directory>" \
17
+ # --vocab_size=<number of tokens in vocabulary> \
18
+ # --tokenizer=<"spe" or "wpe"> \
19
+ # --log
20
+ # where <manifest> can be: train_clean_100, train_clean_360, train_other_500
21
+ # You can also put more than one data_set comma-separated:
22
+ # --manifest="train_clean_100,train_clean_360,train_other_500"
23
+ # or
24
+ # python process_asr_text_tokenizer.py --data_file=<path to train text file> \
25
+ # --data_root="<output directory>" \
26
+ # --vocab_size=<number of tokens in vocabulary> \
27
+ # --tokenizer=<"bpe" or "wpe"> \
28
+ # --log
29
+ # where <manifest> can be: train_clean_100, train_clean_360, train_other_500
30
+ # You can also put more than one data_set comma-separated:
31
+ # --manifest="train_clean_100,train_clean_360,train_other_500"
32
+ #
33
+ # Args:
34
+ # --manifest or --data_file: If your text data lies inside of an ASR manifest file,
35
+ # then use the --manifest path. If instead the text data is inside a file with separate lines
36
+ # corresponding to different text lines, then use --data_file.
37
+ # In either case, you can add commas to concatenate different manifests or different data files.
38
+ #
39
+ # --data_root: The output directory (whose subdirectories will be created if not present) where
40
+ # the tokenizers will be placed.
41
+ #
42
+ # --vocab_size: The size of the tokenizer vocabulary. Larger vocabularies can accommodate almost entire
43
+ # words, but the decoder size of any model will grow proportionally.
44
+ #
45
+ # --tokenizer: Can be either spe or wpe . spe refers to the Google sentencepiece library tokenizer.
46
+ # wpe refers to the HuggingFace BERT Word Piece tokenizer.
47
+ #
48
+ # --no_lower_case: When this flag is passed, it will force the tokenizer to create separate tokens for
49
+ # upper and lower case characters. By default, the script will turn all the text to lower case
50
+ # before tokenization (and if upper case characters are passed during training/inference, the
51
+ # tokenizer will emit a token equivalent to Out-Of-Vocabulary). Used primarily for the
52
+ # English language.
53
+ #
54
+ # --spe_type: The sentencepiece library has a few implementations of the tokenization technique, and
55
+ # spe_type refers to these implementations. Currently supported types are unigram, bpe, char, word.
56
+ # Defaults to bpe.
57
+ #
58
+ # --spe_character_coverage: The sentencepiece library considers how much of the original vocabulary it
59
+ # should cover in its "base set" of tokens (akin to the lower and upper case characters of the
60
+ # English language). For almost all languages with small base token sets (<1000 tokens), this
61
+ # should be kept at its default of 1.0. For languages with larger vocabularies (say Japanese,
62
+ # Mandarin, Korean etc), the suggested value is 0.9995.
63
+ #
64
+ # --spe_sample_size: If the dataset is too large, consider using a sampled dataset indicated by a
65
+ # positive integer. By default, any negative value (default = -1) will use the entire dataset.
66
+ #
67
+ # --spe_train_extremely_large_corpus: When training a sentencepiece tokenizer on very large amounts of text,
68
+ # sometimes the tokenizer will run out of memory or won't be able to process so much data in RAM.
69
+ # At some point you might receive the following error - "Input corpus too large, try with
70
+ # train_extremely_large_corpus=true". If your machine has large amounts of RAM, it might still be possible
71
+ # to build the tokenizer using the above flag. Will silently fail if it runs out of RAM.
72
+ #
73
+ # --spe_max_sentencepiece_length: Limits the maximum length that any SentencePiece subword can be.
74
+ # Using this will change the subword tokens generated.
75
+ #
76
+ # --spe_pad: Adds <pad> as special token.
77
+ #
78
+ # --spe_bos: Adds <s> as Beginning-of-Sentence special token.
79
+ #
80
+ # --spe_eos: Adds </s> as End-of-Sentence special token.
81
+ #
82
+ # --log: Whether the script should display log messages
83
+
84
+ import json
85
+ import logging
86
+ import os
87
+
88
+ import tokenizers
89
+
90
+ from nemo.collections.common.tokenizers.sentencepiece_tokenizer import create_spt_model
91
+
92
+
93
+ def __build_document_from_manifests(
94
+ data_root: str, manifests: str,
95
+ ):
96
+ if ',' in manifests:
97
+ manifests = manifests.split(',')
98
+ else:
99
+ manifests = [manifests]
100
+
101
+ document_dir = os.path.join(data_root, 'text_corpus')
102
+ if not os.path.exists(document_dir):
103
+ os.makedirs(document_dir)
104
+
105
+ document_path = os.path.join(document_dir, 'document.txt')
106
+
107
+ if os.path.exists(document_path):
108
+ logging.info('Corpus already exists at path : %s', document_path)
109
+ return document_path
110
+
111
+ num_lines = 0
112
+ with open(document_path, 'w') as out_writer:
113
+ for manifest in manifests:
114
+ with open(manifest, 'r') as in_reader:
115
+ for line in in_reader:
116
+ item = json.loads(line)
117
+ text = item['text']
118
+
119
+ out_writer.write(text + '\n')
120
+ out_writer.flush()
121
+
122
+ num_lines += 1
123
+
124
+ logging.info(f"Finished extracting manifest : {manifest}")
125
+
126
+ logging.info("Finished extracting all manifests ! Number of sentences : {}".format(num_lines))
127
+ return document_path
128
+
129
+
130
+ def __process_data(
131
+ text_path: str,
132
+ dst_folder: str,
133
+ vocab_size: int,
134
+ tokenizer_type: str,
135
+ spe_type: str,
136
+ spe_character_coverage: float,
137
+ spe_train_extremely_large_corpus: bool,
138
+ spe_sample_size: int,
139
+ spe_max_sentencepiece_length: int,
140
+ spe_bos: bool,
141
+ spe_eos: bool,
142
+ spe_pad: bool,
143
+ lower_case: bool,
144
+ ):
145
+ """
146
+ Builds a tokenizer (spe or wpe) from the given text corpus and saves it to dst_folder
147
+ Args:
148
+ text_path: source with text lines
149
+ dst_folder: where the tokenizer files will be stored
150
+ vocab_size: vocabulary size used in encoding the text
151
+ tokenizer_type: type of tokenization to perform - wpe or spe
152
+ spe_type: type of tokenization model used for spe.
153
+ spe_character_coverage: float value between 0 and 1 (as a percentage). For languages with a vast charset,
154
+ can be < 1.0, but for all other languages, it should be set as 1.0
155
+ spe_sample_size: int, default of -1. If positive integer is used, samples the dataset
156
+ by given sample size.
157
+ spe_train_extremely_large_corpus: bool. If dataset is too large, and user has sufficient RAM,
158
+ this flag can be set to try to train the tokenizer. Will silently fail if it runs out of RAM.
159
+ spe_max_sentencepiece_length: Limits the maximum length of the SentencePiece subword that can be constructed.
160
+ By default, no limit is placed.
161
+ spe_bos: Bool flag, whether to add <s> to SentencePiece tokenizer vocabulary.
162
+ spe_eos: Bool flag, whether to add </s> to SentencePiece tokenizer vocabulary.
163
+ spe_pad: Bool flag, whether to add <pad> to SentencePiece tokenizer vocabulary.
164
+ lower_case: whether to tokenize with lower case character set only (for english)
165
+
166
+ Returns:
167
+ """
168
+ if tokenizer_type == 'spe':
169
+
170
+ # Prepare directory of tokenizer
171
+ if spe_max_sentencepiece_length > 0:
172
+ tokenizer_dir = os.path.join(dst_folder, 'tokenizer_{}_{}_v{}_max_{}').format(
173
+ tokenizer_type, spe_type, vocab_size, spe_max_sentencepiece_length
174
+ )
175
+ else:
176
+ tokenizer_dir = os.path.join(dst_folder, 'tokenizer_{}_{}_v{}').format(
177
+ tokenizer_type, spe_type, vocab_size
178
+ )
179
+
180
+ if spe_pad:
181
+ tokenizer_dir = f'{tokenizer_dir}_pad'
182
+ if spe_bos:
183
+ tokenizer_dir = f'{tokenizer_dir}_bos'
184
+ if spe_eos:
185
+ tokenizer_dir = f'{tokenizer_dir}_eos'
186
+
187
+ if not os.path.exists(tokenizer_dir):
188
+ os.makedirs(tokenizer_dir)
189
+
190
+ if os.path.exists(os.path.join(tokenizer_dir, 'tokenizer.model')):
191
+ logging.warning("Model file already exists, overriding old model file !")
192
+ os.remove(os.path.join(tokenizer_dir, 'tokenizer.model'))
193
+
194
+ # Build tokenizer
195
+ tokenizer_path, vocab_path = create_spt_model(
196
+ data_file=text_path,
197
+ vocab_size=vocab_size,
198
+ sample_size=spe_sample_size,
199
+ do_lower_case=lower_case,
200
+ output_dir=tokenizer_dir,
201
+ tokenizer_type=spe_type,
202
+ character_coverage=spe_character_coverage,
203
+ train_extremely_large_corpus=spe_train_extremely_large_corpus,
204
+ max_sentencepiece_length=spe_max_sentencepiece_length,
205
+ bos=spe_bos,
206
+ eos=spe_eos,
207
+ pad=spe_pad,
208
+ )
209
+
210
+ else:
211
+ tokenizer_dir = os.path.join(dst_folder, 'tokenizer_{}_v{}').format(tokenizer_type, vocab_size)
212
+
213
+ if not os.path.exists(tokenizer_dir):
214
+ os.makedirs(tokenizer_dir)
215
+
216
+ tokenizer = tokenizers.BertWordPieceTokenizer(lowercase=lower_case)
217
+
218
+ tokenizer.train(text_path, vocab_size=vocab_size)
219
+ tokenizer.save_model(tokenizer_dir)
220
+
221
+ return tokenizer_dir
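+
+
+ # Illustrative programmatic use (a sketch; paths and values are placeholders): these two helpers are
+ # what run_speech_recognition_rnnt.py imports to build its tokenizer from a manifest of transcripts.
+ #
+ #   corpus = __build_document_from_manifests("tokenizer", "data/train.json")
+ #   tokenizer_dir = __process_data(corpus, "tokenizer", vocab_size=1024, tokenizer_type="spe",
+ #                                  spe_type="bpe", spe_character_coverage=1.0,
+ #                                  spe_train_extremely_large_corpus=False, spe_sample_size=-1,
+ #                                  spe_max_sentencepiece_length=-1, spe_bos=False, spe_eos=False,
+ #                                  spe_pad=False, lower_case=True)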
requirements.txt ADDED
@@ -0,0 +1,7 @@
1
+ transformers
2
+ datasets
3
+ jiwer
4
+ wandb
5
+ soundfile
6
+ librosa
7
+ bitsandbytes
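+ # Note: the training scripts also import nemo_toolkit (e.g. nemo_toolkit[asr]), torch, numpy and
+ # omegaconf; these are assumed to be installed separately and are not pinned here.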
run_ami.sh ADDED
@@ -0,0 +1,38 @@
1
+ #!/usr/bin/env bash
2
+ CUDA_VISIBLE_DEVICES=0 python run_speech_recognition_rnnt.py \
3
+ --config_path="conf/conformer_transducer_bpe_xlarge.yaml" \
4
+ --model_name_or_path="stt_en_conformer_transducer_xlarge" \
5
+ --dataset_name="speech-seq2seq/ami" \
6
+ --tokenizer_path="tokenizer" \
7
+ --vocab_size="1024" \
8
+ --num_train_epochs="7.38" \
9
+ --dataset_config_name="ihm" \
10
+ --train_split_name="train" \
11
+ --eval_split_name="validation" \
12
+ --test_split_name="test" \
13
+ --text_column_name="text" \
14
+ --output_dir="./" \
15
+ --run_name="rnnt-ami-baseline" \
16
+ --wandb_project="rnnt" \
17
+ --per_device_train_batch_size="8" \
18
+ --per_device_eval_batch_size="4" \
19
+ --logging_steps="50" \
20
+ --learning_rate="1e-4" \
21
+ --warmup_steps="500" \
22
+ --save_strategy="steps" \
23
+ --save_steps="20000" \
24
+ --evaluation_strategy="steps" \
25
+ --eval_steps="20000" \
26
+ --report_to="wandb" \
27
+ --preprocessing_num_workers="4" \
28
+ --fused_batch_size="8" \
29
+ --length_column_name="input_lengths" \
30
+ --do_lower_case="False" \
31
+ --fuse_loss_wer \
32
+ --group_by_length \
33
+ --overwrite_output_dir \
34
+ --do_train \
35
+ --do_eval \
36
+ --do_predict \
37
+ --push_to_hub \
38
+ --use_auth_token
run_speech_recognition_rnnt.py ADDED
@@ -0,0 +1,935 @@
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ # Copyright 2022 The HuggingFace Team All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """
17
+ Fine-tuning NVIDIA RNN-T models for speech recognition.
18
+ """
19
+ # You can also adapt this script on your own sequence to sequence task. Pointers for this are left as comments.
20
+ import copy
21
+ import logging
22
+ import os
23
+ import re
24
+ import sys
25
+ from dataclasses import dataclass, field
26
+
27
+ from tqdm import tqdm
28
+ import json
29
+ from typing import Optional, Dict, Union, List
30
+
31
+ import numpy as np
32
+ import torch
33
+ import torch.nn as nn
34
+
35
+ from omegaconf import OmegaConf, open_dict
36
+ from models import RNNTBPEModel
37
+ from nemo.core import adapter_mixins
38
+ from nemo.collections.common.parts.adapter_modules import LinearAdapterConfig
39
+
40
+ import datasets
41
+ from datasets import DatasetDict, load_dataset
42
+ import transformers
43
+ from transformers import (
44
+ HfArgumentParser,
45
+ Seq2SeqTrainingArguments,
46
+ set_seed,
47
+ Trainer,
48
+ TrainerCallback,
49
+ TrainingArguments,
50
+ TrainerState,
51
+ TrainerControl,
52
+ )
53
+ from transformers.trainer_pt_utils import get_parameter_names
54
+ from transformers.trainer_utils import get_last_checkpoint, is_main_process
55
+ from transformers.utils import check_min_version
56
+ from transformers.utils.versions import require_version
57
+
58
+ from process_asr_text_tokenizer import __process_data as nemo_process_data, \
59
+ __build_document_from_manifests as nemo_build_document_from_manifests
60
+
61
+ import bitsandbytes as bnb
62
+
63
+ # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
64
+ check_min_version("4.17.0.dev0")
65
+
66
+ require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")
67
+
68
+ logger = logging.getLogger(__name__)
69
+
70
+
71
+ @dataclass
72
+ class ModelArguments:
73
+ """
74
+ Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
75
+ """
76
+
77
+ config_path: str = field(
78
+ metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models."},
79
+ )
80
+ model_name_or_path: Optional[str] = field(
81
+ default=None,
82
+ metadata={"help": "Path to pretrained model or model identifier from NVIDIA NeMo NGC."}
83
+ )
84
+ cache_dir: Optional[str] = field(
85
+ default=None,
86
+ metadata={"help": "Where to store the pretrained models downloaded from huggingface.co or NVIDIA NeMo NGC."},
87
+ )
88
+ use_auth_token: bool = field(
89
+ default=False,
90
+ metadata={
91
+ "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script "
92
+ "with private models)."
93
+ },
94
+ )
95
+ manifest_path: str = field(
96
+ default="data",
97
+ metadata={
98
+ "help": "Manifest path."
99
+ },
100
+ )
101
+ tokenizer_path: str = field(
102
+ default="tokenizers",
103
+ metadata={
104
+ "help": "Tokenizer path."
105
+ },
106
+ )
107
+ vocab_size: int = field(
108
+ default=1024,
109
+ metadata={"help": "Tokenizer vocab size."}
110
+ )
111
+ tokenizer_type: str = field(
112
+ default="spe",
113
+ metadata={
114
+ "help": "Can be either spe or wpe. spe refers to the Google sentencepiece library tokenizer."
115
+ "wpe refers to the HuggingFace BERT Word Piece tokenizer."
116
+ },
117
+ )
118
+ spe_type: str = field(
119
+ default="bpe",
120
+ metadata={
121
+ "help": "Type of the SentencePiece model. Can be `bpe`, `unigram`, `char` or `word`."
122
+ "Used only if `tokenizer_type` == `spe`"
123
+ },
124
+ )
125
+ cutoff_freq: float = field(
126
+ default=0.001,
127
+ metadata={"help": "Drop the least frequent chars from the train set when building the tokenizer."}
128
+ )
129
+ fuse_loss_wer: bool = field(
130
+ default=True,
131
+ metadata={
132
+ "help": "Whether to fuse the computation of prediction net + joint net + loss + WER calculation to be run "
133
+ "on sub-batches of size `fused_batch_size`"
134
+ }
135
+ )
136
+ fused_batch_size: int = field(
137
+ default=8,
138
+ metadata={
139
+ "help": "`fused_batch_size` is the actual batch size of the prediction net, joint net and transducer loss."
140
+ "Using small values here will preserve a lot of memory during training, but will make training slower as well."
141
+ "An optimal ratio of fused_batch_size : per_device_train_batch_size is 1:1."
142
+ "However, to preserve memory, this ratio can be 1:8 or even 1:16."
143
+ }
144
+ )
145
+ final_decoding_strategy: str = field(
146
+ default="greedy_batch",
147
+ metadata={
148
+ "help": "Decoding strategy for final eval/prediction steps. One of: [`greedy`, `greedy_batch`, `beam`, "
149
+ "`tsd`, `alsd`]."
150
+ }
151
+ )
152
+ final_num_beams: int = field(
153
+ default=1,
154
+ metadata={
155
+ "help": "Number of beams for final eval/prediction steps. Increase beam size for better scores, "
156
+ "but it will take much longer for transcription!"
157
+ }
158
+ )
159
+ freeze_encoder: bool = field(
160
+ default=False,
161
+ metadata={"help": "Freeze the acoustic encoder of the model. Recommended when fine-tuning on small datasets."}
162
+ )
163
+ unfreeze_encoder: bool = field(
164
+ default=False,
165
+ metadata={"help": "Unfreeze the acoustic encoder of the model after first evaluation step."}
166
+ )
167
+ add_adapter: bool = field(
168
+ default=False,
169
+ metadata={"help": "Add an adapter layer to the encoder of the model."}
170
+ )
171
+ use_adam8bit: bool = field(
172
+ default=False,
173
+ metadata={"help": "Whether to use bitsandbytes 8bit AdamW optimiser."}
174
+ )
175
+
176
+
177
+ @dataclass
178
+ class DataTrainingArguments:
179
+ """
180
+ Arguments pertaining to what data we are going to input our model for training and eval.
181
+ """
182
+
183
+ dataset_name: str = field(
184
+ default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
185
+ )
186
+ dataset_config_name: Optional[str] = field(
187
+ default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
188
+ )
189
+ text_column: Optional[str] = field(
190
+ default=None,
191
+ metadata={"help": "The name of the column in the datasets containing the full texts (for summarization)."},
192
+ )
193
+ dataset_cache_dir: Optional[str] = field(
194
+ default=None, metadata={"help": "Path to cache directory for saving and loading datasets"}
195
+ )
196
+ overwrite_cache: bool = field(
197
+ default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
198
+ )
199
+ preprocessing_num_workers: Optional[int] = field(
200
+ default=None,
201
+ metadata={"help": "The number of processes to use for the preprocessing."},
202
+ )
203
+ max_train_samples: Optional[int] = field(
204
+ default=None,
205
+ metadata={
206
+ "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
207
+ "value if set."
208
+ },
209
+ )
210
+ max_eval_samples: Optional[int] = field(
211
+ default=None,
212
+ metadata={
213
+ "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
214
+ "value if set."
215
+ },
216
+ )
217
+ max_predict_samples: Optional[int] = field(
218
+ default=None,
219
+ metadata={
220
+ "help": "For debugging purposes or quicker training, truncate the number of test examples to this "
221
+ "value if set."
222
+ },
223
+ )
224
+ audio_column_name: str = field(
225
+ default="audio",
226
+ metadata={"help": "The name of the dataset column containing the audio data. Defaults to 'audio'"},
227
+ )
228
+ text_column_name: str = field(
229
+ default="text",
230
+ metadata={"help": "The name of the dataset column containing the text data. Defaults to 'text'"},
231
+ )
232
+ max_duration_in_seconds: float = field(
233
+ default=20.0,
234
+ metadata={
235
+ "help": "Truncate training audio files that are longer than `max_duration_in_seconds` seconds to `max_duration_in_seconds`"
236
+ },
237
+ )
238
+ min_duration_in_seconds: float = field(
239
+ default=0.0, metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"}
240
+ )
241
+ max_eval_duration_in_seconds: float = field(
242
+ default=None,
243
+ metadata={
244
+ "help": "Truncate eval/test audio files that are longer than `max_eval_duration_in_seconds` seconds to `max_eval_duration_in_seconds`"
245
+ },
246
+ )
247
+ max_target_length: Optional[int] = field(
248
+ default=128,
249
+ metadata={
250
+ "help": "The maximum total sequence length for target text after tokenization. Sequences longer "
251
+ "than this will be truncated, sequences shorter will be padded."
252
+ },
253
+ )
254
+ min_target_length: Optional[int] = field(
255
+ default=2,
256
+ metadata={
257
+ "help": "The minimum total sequence length for target text after tokenization. Sequences shorter "
258
+ "than this will be filtered."
259
+ },
260
+ )
261
+ preprocessing_only: bool = field(
262
+ default=False,
263
+ metadata={
264
+ "help": "Whether to only do data preprocessing and skip training. "
265
+ "This is especially useful when data preprocessing errors out in distributed training due to timeout. "
266
+ "In this case, one should run the preprocessing in a non-distributed setup with `preprocessing_only=True` "
267
+ "so that the cached datasets can consequently be loaded in distributed training"
268
+ },
269
+ )
270
+ train_split_name: str = field(
271
+ default="train",
272
+ metadata={
273
+ "help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'"
274
+ },
275
+ )
276
+ eval_split_name: str = field(
277
+ default="validation",
278
+ metadata={
279
+ "help": "The name of the evaluation data set split to use (via the datasets library). Defaults to 'validation'"
280
+ },
281
+ )
282
+ test_split_name: str = field(
283
+ default="test",
284
+ metadata={"help": "The name of the test data set split to use (via the datasets library). Defaults to 'test'"},
285
+ )
286
+ do_lower_case: bool = field(
287
+ default=True,
288
+ metadata={"help": "Whether the target text should be lower cased."},
289
+ )
290
+ wandb_project: str = field(
291
+ default="speech-recognition-rnnt",
292
+ metadata={"help": "The name of the wandb project."},
293
+ )
294
+
295
+
296
+ def build_tokenizer(model_args, data_args, manifests):
297
+ """
298
+ Function to build a NeMo tokenizer from manifest file(s).
299
+ Copied from https://github.com/NVIDIA/NeMo/blob/66c7677cd4a68d78965d4905dd1febbf5385dff3/scripts/tokenizers/process_asr_text_tokenizer.py#L268
300
+ """
301
+ data_root = model_args.tokenizer_path
302
+ if isinstance(manifests, list):
303
+ joint_manifests = ",".join(manifests)
304
+ else:
305
+ joint_manifests = manifests
306
+ vocab_size = model_args.vocab_size
307
+ tokenizer = model_args.tokenizer_type
308
+ spe_type = model_args.spe_type
309
+ if not 0 <= model_args.cutoff_freq < 1:
310
+ raise ValueError(f"`cutoff_freq` must be between zero and one, got {model_args.cutoff_freq}")
311
+ spe_character_coverage = 1 - model_args.cutoff_freq
312
+
313
+ logger.info("Building tokenizer...")
314
+ if not os.path.exists(data_root):
315
+ os.makedirs(data_root)
316
+
317
+ text_corpus_path = nemo_build_document_from_manifests(data_root, joint_manifests)
318
+
319
+ tokenizer_path = nemo_process_data(
320
+ text_corpus_path,
321
+ data_root,
322
+ vocab_size,
323
+ tokenizer,
324
+ spe_type,
325
+ lower_case=data_args.do_lower_case,
326
+ spe_character_coverage=spe_character_coverage,
327
+ spe_sample_size=-1,
328
+ spe_train_extremely_large_corpus=False,
329
+ spe_max_sentencepiece_length=-1,
330
+ spe_bos=False,
331
+ spe_eos=False,
332
+ spe_pad=False,
333
+ )
334
+
335
+ print("Serialized tokenizer at location :", tokenizer_path)
336
+ logger.info('Done!')
337
+
338
+ # Tokenizer path
339
+ if tokenizer == 'spe':
340
+ tokenizer_dir = os.path.join(data_root, f"tokenizer_spe_{spe_type}_v{vocab_size}")
341
+ tokenizer_type_cfg = "bpe"
342
+ else:
343
+ tokenizer_dir = os.path.join(data_root, f"tokenizer_wpe_v{vocab_size}")
344
+ tokenizer_type_cfg = "wpe"
345
+
346
+ return tokenizer_dir, tokenizer_type_cfg
347
+
348
+
349
+ def NeMoDataCollator(features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
350
+ """
351
+ Data collator that will dynamically pad the inputs received.
352
+ Since NeMo models don't have a HF processor defined (feature extractor + tokenizer), we'll pad by hand...
353
+ The padding idx is arbitrary: we provide the model with the input lengths and label lengths, from which
354
+ all the relevant padding information is inferred. Thus, we'll use the default np.pad padding idx (0).
355
+ """
356
+ # split inputs and labels since they have to be of different lengths
357
+ # and need different padding methods
358
+ input_ids = [feature["input_ids"] for feature in features]
359
+ labels = [feature["labels"] for feature in features]
360
+
361
+ # first, pad the audio inputs to max_len
362
+ input_lengths = [feature["input_lengths"] for feature in features]
363
+ max_input_len = max(input_lengths)
364
+ input_ids = [np.pad(input_val, (0, max_input_len - input_len), 'constant') for input_val, input_len in
365
+ zip(input_ids, input_lengths)]
366
+
367
+ # next, pad the target labels to max_len
368
+ label_lengths = [len(lab) for lab in labels]
369
+ max_label_len = max(label_lengths)
370
+ labels = [np.pad(lab, (0, max_label_len - lab_len), 'constant') for lab, lab_len in zip(labels, label_lengths)]
371
+
372
+ batch = {"input_lengths": input_lengths, "labels": labels, "label_lengths": label_lengths}
373
+
374
+ # return batch as a pt tensor (list -> np.array -> torch.tensor)
375
+ batch = {k: torch.tensor(np.array(v), requires_grad=False) for k, v in batch.items()}
376
+
377
+ # leave all ints as are, convert float64 to pt float
378
+ batch["input_ids"] = torch.tensor(np.array(input_ids, dtype=np.float32), requires_grad=False)
379
+
380
+ return batch
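+
+ # Example (illustrative only): two features of different lengths are padded to a common shape;
+ # the audio comes back as a float32 tensor and the lengths/labels as integer tensors.
+ #
+ #   feats = [
+ #       {"input_ids": np.zeros(16000), "input_lengths": 16000, "labels": [5, 17, 3]},
+ #       {"input_ids": np.zeros(8000), "input_lengths": 8000, "labels": [9, 2]},
+ #   ]
+ #   batch = NeMoDataCollator(feats)
+ #   # batch["input_ids"].shape == (2, 16000); batch["labels"].shape == (2, 3)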
381
+
382
+
383
+ def main():
384
+ # See all possible arguments in src/transformers/training_args.py
385
+ # or by passing the --help flag to this script.
386
+ # We now keep distinct sets of args, for a cleaner separation of concerns.
387
+
388
+ parser = HfArgumentParser((ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments))
389
+ if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
390
+ # If we pass only one argument to the script and it's the path to a json file,
391
+ # let's parse it to get our arguments.
392
+ model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
393
+ else:
394
+ model_args, data_args, training_args = parser.parse_args_into_dataclasses()
395
+
396
+ # Set wandb project ID before instantiating the Trainer
397
+ os.environ["WANDB_PROJECT"] = data_args.wandb_project
398
+
399
+ # Detecting last checkpoint.
400
+ last_checkpoint = None
401
+ if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
402
+ last_checkpoint = get_last_checkpoint(training_args.output_dir)
403
+ if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
404
+ raise ValueError(
405
+ f"Output directory ({training_args.output_dir}) already exists and is not empty. "
406
+ "Use --overwrite_output_dir to overcome."
407
+ )
408
+ elif last_checkpoint is not None:
409
+ logger.info(
410
+ f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
411
+ "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
412
+ )
413
+
414
+ # Setup logging
415
+ logging.basicConfig(
416
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
417
+ datefmt="%m/%d/%Y %H:%M:%S",
418
+ handlers=[logging.StreamHandler(sys.stdout)],
419
+ )
420
+ logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)
421
+
422
+ # Log on each process the small summary:
423
+ logger.warning(
424
+ f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
425
+ f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
426
+ )
427
+ # Set the verbosity to info of the Transformers logger (on main process only):
428
+ if is_main_process(training_args.local_rank):
429
+ transformers.utils.logging.set_verbosity_info()
430
+ logger.info("Training/evaluation parameters %s", training_args)
431
+
432
+ # Set seed before initializing model.
433
+ set_seed(training_args.seed)
434
+
435
+ # load the model config (discarding optimiser and trainer attributes)
436
+ config = OmegaConf.load(model_args.config_path).model
437
+
438
+ # 4. Load dataset
439
+ raw_datasets = DatasetDict()
440
+
441
+ if training_args.do_train:
442
+ raw_datasets["train"] = load_dataset(
443
+ data_args.dataset_name,
444
+ data_args.dataset_config_name,
445
+ split=data_args.train_split_name,
446
+ cache_dir=data_args.dataset_cache_dir,
447
+ use_auth_token=True if model_args.use_auth_token else None,
448
+ )
449
+
450
+ if training_args.do_eval:
451
+ raw_datasets["eval"] = load_dataset(
452
+ data_args.dataset_name,
453
+ data_args.dataset_config_name,
454
+ split=data_args.eval_split_name,
455
+ cache_dir=data_args.dataset_cache_dir,
456
+ use_auth_token=True if model_args.use_auth_token else None,
457
+ )
458
+
459
+ if training_args.do_predict:
460
+ test_split = data_args.test_split_name.split("+")
461
+ for split in test_split:
462
+ raw_datasets[split] = load_dataset(
463
+ data_args.dataset_name,
464
+ data_args.dataset_config_name,
465
+ split=split,
466
+ cache_dir=data_args.dataset_cache_dir,
467
+ use_auth_token=True if model_args.use_auth_token else None,
468
+ )
469
+
470
+ if not training_args.do_train and not training_args.do_eval and not training_args.do_predict:
471
+ raise ValueError(
472
+ "Cannot skip training, evaluation and prediction all at once. At least one of "
473
+ "training, evaluation or prediction has to be done."
474
+ )
475
+
476
+ # if not training, there is no need to run multiple epochs
477
+ if not training_args.do_train:
478
+ training_args.num_train_epochs = 1
479
+
480
+ if data_args.audio_column_name not in next(iter(raw_datasets.values())).column_names:
481
+ raise ValueError(
482
+ f"--audio_column_name '{data_args.audio_column_name}' not found in dataset '{data_args.dataset_name}'. "
483
+ "Make sure to set `--audio_column_name` to the correct audio column - one of "
484
+ f"{', '.join(next(iter(raw_datasets.values())).column_names)}."
485
+ )
486
+
487
+ if data_args.text_column_name not in next(iter(raw_datasets.values())).column_names:
488
+ raise ValueError(
489
+ f"--text_column_name {data_args.text_column_name} not found in dataset '{data_args.dataset_name}'. "
490
+ "Make sure to set `--text_column_name` to the correct text column - one of "
491
+ f"{', '.join(next(iter(raw_datasets.values())).column_names)}."
492
+ )
493
+
494
+ # 6. Resample speech dataset ALWAYS
495
+ raw_datasets = raw_datasets.cast_column(
496
+ data_args.audio_column_name, datasets.features.Audio(sampling_rate=config.sample_rate)
497
+ )
498
+
499
+ # 7. Preprocessing the datasets.
500
+ # We need to read the audio files as arrays and tokenize the targets.
501
+ max_input_length = int(data_args.max_duration_in_seconds * config.sample_rate)
502
+ min_input_length = max(int(data_args.min_duration_in_seconds * config.sample_rate), 1)  # at least 1 sample, so the zero-length placeholder audio set in prepare_dataset is filtered out
503
+ max_eval_input_length = int(data_args.max_eval_duration_in_seconds * config.sample_rate) if data_args.max_eval_duration_in_seconds else None
504
+ max_target_length = data_args.max_target_length
505
+ min_target_length = data_args.min_target_length
506
+ audio_column_name = data_args.audio_column_name
507
+ num_workers = data_args.preprocessing_num_workers
508
+ text_column_name = data_args.text_column_name
509
+ do_lower_case = data_args.do_lower_case
510
+ dataset_name = data_args.dataset_name
511
+
512
+ # Define tokens to ignore/replace
513
+ tedlium_contractions = [" 's", " 't", " 're", " 've", " 'm", " 'll", " 'd", " 'clock", " 'all"]
514
+ gigaspeech_punctuation = {" <comma>": ",", " <period>": ".", " <questionmark>": "?", " <exclamationpoint>": "!"}
515
+ gigaspeech_disfluencies = ["<other>", "<sil>"]
516
+ swb_disfluencies = ["[noise]", "[laughter]", "[silence]", "<a_aside>", "<b_aside>", "<e_aside>", "[laughter-",
517
+ "[vocalized-noise]", "_1"]
518
+ swb_punctuations = ["{", "}", "[", "]-", "]"]
519
+ earnings_disfluencies = ["<crosstalk>", "<affirmative>", "<inaudible>", "inaudible", "<laugh>"]
520
+ ignore_segments = ["ignore_time_segment_in_scoring", "<noise>", "<music>", "[noise]", "[laughter]", "[silence]",
521
+ "[vocalized-noise]", "<crosstalk>", "<affirmative>", "<inaudible>", "<laugh>", "<other>",
522
+ "<sil>", ""]
523
+
524
+ if training_args.do_train and data_args.max_train_samples is not None:
525
+ raw_datasets["train"] = raw_datasets["train"].select(range(data_args.max_train_samples))
526
+
527
+ if training_args.do_eval and data_args.max_eval_samples is not None:
528
+ raw_datasets["eval"] = raw_datasets["eval"].select(range(data_args.max_eval_samples))
529
+
530
+ if training_args.do_predict and data_args.max_predict_samples is not None:
531
+ for split in test_split:
532
+ raw_datasets[split] = raw_datasets[split].select(range(data_args.max_predict_samples))
533
+
534
+ # filter data where the targets are ignored in scoring
535
+ def is_target_labels(input_str):
536
+ return input_str.lower() not in ignore_segments
537
+
538
+ raw_datasets = raw_datasets.filter(
539
+ is_target_labels,
540
+ num_proc=num_workers,
541
+ input_columns=[text_column_name],
542
+ desc="filtering data where the targets are ignored in scoring",
543
+ )
544
+
545
+ def prepare_dataset(batch):
546
+ # pre-process audio
547
+ try:
548
+ sample = batch[audio_column_name]
549
+ except ValueError:
550
+ # E22: some samples are empty (no audio). Reading the empty audio array will trigger
551
+ # a soundfile ValueError. For now, we'll manually set these arrays to a zero array.
552
+ # They will be filtered in the subsequent filtering stage and so are
553
+ # explicitly ignored during training.
554
+ sample = {"array": np.array([0.]), "sampling_rate": config.sample_rate}
555
+
556
+ # NeMo RNNT model performs the audio preprocessing in the `.forward()` call
557
+ # => we only need to supply it with the raw audio values
558
+ batch["input_ids"] = sample["array"]
559
+ batch["input_lengths"] = len(sample["array"])
560
+
561
+ # 'Error correction' of targets
562
+ input_str = batch[text_column_name].lower() if do_lower_case else batch[text_column_name]
563
+
564
+ # LibriSpeech ASR
565
+ if dataset_name == "librispeech_asr":
566
+ pass # no error correction necessary
567
+
568
+ # VoxPopuli
569
+ if dataset_name == "google/xtreme_s":
570
+ pass # no error correction necessary
571
+
572
+ # Common Voice 9
573
+ if dataset_name == "mozilla-foundation/common_voice_9_0":
574
+ if input_str.startswith('"') and input_str.endswith('"'):
575
+ # we can remove trailing quotation marks as they do not affect the transcription
576
+ input_str = input_str[1:-1]
577
+ # replace double quotation marks with single
578
+ input_str = input_str.replace('""', '"')
579
+
580
+ # TED-LIUM (Release 3)
581
+ if dataset_name == "LIUM/tedlium":
582
+ # delete the <unk> token from the text
583
+ input_str = input_str.replace("<unk>", "")
584
+ # replace spaced apostrophes with un-spaced (it 's -> it's)
585
+ for contraction in tedlium_contractions:
586
+ input_str = input_str.replace(contraction, contraction[1:])
587
+
588
+ # GigaSpeech
589
+ if dataset_name == "speechcolab/gigaspeech":
590
+ for disfluency in gigaspeech_disfluencies:
591
+ input_str = input_str.replace(disfluency, "")
592
+ # convert spelled out punctuation to symbolic form
593
+ for punctuation, replacement in gigaspeech_punctuation.items():
594
+ input_str = input_str.replace(punctuation, replacement)
595
+
596
+ # SWB: hide the path to the private HF dataset
597
+ if "switchboard" in dataset_name:
598
+ for disfluency in swb_disfluencies:
599
+ input_str = input_str.replace(disfluency, "")
600
+ # remove parenthesised text (test data only)
601
+ input_str = re.sub("[\(].*?[\)]", "", input_str)
602
+ for punctuation in swb_punctuations:
603
+ input_str = input_str.replace(punctuation, "")
604
+ # replace anomalous words with their correct transcriptions
605
+ split_str = input_str.split("/")
606
+ if len(split_str) > 1:
607
+ input_str = " ".join(
608
+ [" ".join([" ".join(i.split(" ")[:-1]) for i in split_str])] + [split_str[-1].split(" ")[-1]])
609
+
610
+ # Earnings 22: still figuring out best segmenting method. Thus, dataset name subject to change
611
+ if "earnings22" in dataset_name:
612
+ for disfluency in earnings_disfluencies:
613
+ input_str = input_str.replace(disfluency, "")
614
+
615
+ # SPGISpeech
616
+ if dataset_name == "kensho/spgispeech":
617
+ pass # no error correction necessary
618
+
619
+ # JIWER compliance (for WER/CER calc.)
620
+ # remove multiple spaces
621
+ input_str = re.sub(r"\s\s+", " ", input_str)
622
+ # strip trailing spaces
623
+ input_str = input_str.strip()
624
+
625
+ # We can't currently tokenize the dataset... we need the pre-processed text data in order to
626
+ # build our SPE tokenizer. Once we've defined our tokenizer, we can come back and
627
+ # tokenize the text. For now, just return the pre-processed text data
628
+ batch[text_column_name] = input_str
629
+ return batch
630
+
631
+ vectorized_datasets = raw_datasets.map(
632
+ prepare_dataset,
633
+ num_proc=num_workers,
634
+ desc="preprocess datasets",
635
+ )
636
+
637
+ # filter training data with inputs shorter than min_input_length or longer than max_input_length
638
+ def is_audio_in_length_range(length):
639
+ return length > min_input_length and length < max_input_length
640
+
641
+ if training_args.do_train:
642
+ vectorized_datasets["train"] = vectorized_datasets["train"].filter(
643
+ is_audio_in_length_range,
644
+ num_proc=num_workers,
645
+ input_columns=["input_lengths"],
646
+ )
647
+
648
+ if max_eval_input_length is not None:
649
+ # filter data with inputs longer than max_eval_input_length
650
+ def is_eval_audio_in_length_range(length):
651
+ return min_input_length < length < max_eval_input_length
652
+
653
+ vectorized_datasets = vectorized_datasets.filter(
654
+ is_eval_audio_in_length_range,
655
+ num_proc=num_workers,
656
+ input_columns=["input_lengths"],
657
+ )
658
+
659
+ def is_labels_non_zero(transcription):
660
+ return len(transcription) > 0
661
+
662
+ vectorized_datasets = vectorized_datasets.filter(
663
+ is_labels_non_zero,
664
+ num_proc=num_workers,
665
+ input_columns=[text_column_name],
666
+ )
667
+
668
+ # for large datasets it is advised to run the preprocessing on a
669
+ # single machine first with `args.preprocessing_only` since there will most likely
670
+ # be a timeout when running the script in distributed mode.
671
+ # In a second step `args.preprocessing_only` can then be set to `False` to load the
672
+ # cached dataset
673
+ if data_args.preprocessing_only:
674
+ cache = {k: v.cache_files for k, v in vectorized_datasets.items()}
675
+ logger.info(f"Data preprocessing finished. Files cached at {cache}.")
676
+ return
677
+
678
+ # Function to build a NeMo tokenizer manifest from a HF dataset
679
+ # TODO: with a bit of hacking around we can probably bypass this step entirely
680
+ def build_manifest(ds, manifest_path):
681
+ with open(manifest_path, 'w') as fout:
682
+ for sample in tqdm(ds[text_column_name]):
683
+ # Write the metadata to the manifest
684
+ metadata = {
685
+ "text": sample
686
+ }
687
+ json.dump(metadata, fout)
688
+ fout.write('\n')
689
+
690
+ config.train_ds = config.validation_ds = config.test_ds = None
691
+
692
+ if not os.path.exists(model_args.manifest_path) and training_args.do_train:
693
+ os.makedirs(model_args.manifest_path)
694
+ manifest = os.path.join(model_args.manifest_path, "train.json")
695
+ logger.info(f"Building training manifest at {manifest}")
696
+ build_manifest(vectorized_datasets["train"], manifest)
697
+ else:
698
+ manifest = os.path.join(model_args.manifest_path, "train.json")
699
+ logger.info(f"Re-using training manifest at {manifest}")
700
+
701
+ tokenizer_dir, tokenizer_type_cfg = build_tokenizer(model_args, data_args, manifest)
702
+
703
+ # generalise the script later to load a pre-built tokenizer for eval only
704
+ config.tokenizer.dir = tokenizer_dir
705
+ config.tokenizer.type = tokenizer_type_cfg
706
+
707
+ if model_args.add_adapter:
708
+ # Utility method to check and update the model config
709
+ def update_model_config_to_support_adapter(model_cfg):
710
+ with open_dict(model_cfg):
711
+ adapter_metadata = adapter_mixins.get_registered_adapter(model_cfg.encoder._target_)
712
+ if adapter_metadata is not None:
713
+ model_cfg.encoder._target_ = adapter_metadata.adapter_class_path
714
+
715
+ logging.info("Updated encoder _target_ model: %s", model_cfg.encoder._target_)
716
+ return model_cfg
717
+
718
+ config = update_model_config_to_support_adapter(config)
719
+
720
+ # possibly fused-computation of prediction net + joint net + loss + WER calculation
721
+ config.joint.fuse_loss_wer = model_args.fuse_loss_wer
722
+ if model_args.fuse_loss_wer:
723
+ config.joint.fused_batch_size = model_args.fused_batch_size
724
+
725
+ if model_args.model_name_or_path is not None:
726
+ # load pre-trained model weights
727
+ model = RNNTBPEModel.from_pretrained(model_args.model_name_or_path, override_config_path=config, map_location="cpu")
728
+ model.save_name = model_args.model_name_or_path
729
+
730
+ pretrained_decoder = model.decoder.state_dict()
731
+ pretrained_joint = model.joint.state_dict()
732
+ model.change_vocabulary(new_tokenizer_dir=tokenizer_dir, new_tokenizer_type=tokenizer_type_cfg)
733
+
734
+ # TODO: add checks for loading decoder/joint state dict
735
+ model.decoder.load_state_dict(pretrained_decoder)
736
+ model.joint.load_state_dict(pretrained_joint)
737
+
738
+ else:
739
+ model = RNNTBPEModel(cfg=config)
740
+ model.save_name = model_args.config_path.split("/")[-1].split(".")[0]
741
+ model.change_vocabulary(new_tokenizer_dir=tokenizer_dir, new_tokenizer_type=tokenizer_type_cfg)
742
+
743
+ if model_args.add_adapter:
744
+ adapter_name = model_args.config_path.split("/")[-1].split(".")[0]
745
+ adapter_dim = model.cfg.encoder.d_model
746
+ adapter_activation = "swish"
747
+ adapter_norm_position = "post"
748
+ adapter_cfg = LinearAdapterConfig(
749
+ in_features=model.cfg.encoder.d_model,
750
+ # conformer specific model dim. Every layer emits this dim at its output.
751
+ dim=adapter_dim, # the bottleneck dimension of the adapter
752
+ activation=adapter_activation, # activation used in bottleneck block
753
+ norm_position=adapter_norm_position, # whether to use LayerNorm at the beginning or the end of the adapter
754
+ )
755
+ logger.info("Adapter config: %s", adapter_cfg)
756
+ model.add_adapter(name=adapter_name, cfg=adapter_cfg)
757
+ model.set_enabled_adapters(enabled=False) # disable all adapters
758
+ model.set_enabled_adapters(name=adapter_name, enabled=True) # enable only the current adapter we want to train
759
+
760
+ def enable_bn(m):
761
+ if type(m) == nn.BatchNorm1d:
762
+ m.train()
763
+ for param in m.parameters():
764
+ param.requires_grad_(True)
765
+
766
+ if model_args.freeze_encoder:
767
+ model.encoder.freeze()
768
+ model.encoder.apply(enable_bn)
769
+ logging.info("Model encoder has been frozen, and batch normalization has been unfrozen")
770
+
771
+ if model_args.add_adapter:
772
+ model.unfreeze_enabled_adapters()
773
+ logging.info("Model adapter has been unfrozen")
774
+
775
+ # now that we have our model and tokenizer defined, we can tokenize the text data
+ tokenizer = model.tokenizer.tokenizer.encode_as_ids
+
+ def tokenize_transcripts(batch):
+ batch["labels"] = tokenizer(batch[text_column_name])
+ return batch
+
+ vectorized_datasets = vectorized_datasets.map(tokenize_transcripts, num_proc=num_workers,
+ desc="Tokenizing datasets...",
+ remove_columns=next(iter(raw_datasets.values())).column_names)
+
+ def compute_metrics(pred):
+ # Tuple of WERs returned by the model during eval: (wer, wer_num, wer_denom)
+ wer_num = pred.predictions[1]
+ wer_denom = pred.predictions[2]
+ # compute the aggregate WER over the concatenated batches
+ wer = sum(wer_num) / sum(wer_denom)
+ return {"wer": wer}
+
+ class UnfreezeEncoderCallback(TrainerCallback):
+ def on_evaluate(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
+ model.encoder.unfreeze()
+ print("Model encoder has been unfrozen")
+
+ class NeMoTrainer(Trainer):
+ def _save(self, output_dir: Optional[str] = None, state_dict=None):
+ # If we are executing this function, we are process zero, so we don't check for that.
+ output_dir = output_dir if output_dir is not None else self.args.output_dir
+ os.makedirs(output_dir, exist_ok=True)
+ logger.info(f"Saving model checkpoint to {output_dir}")
+ # Save the trained model and its configuration as a NeMo archive using `save_to()`.
+ # It can then be reloaded using `restore_from()`
+ self.model.save_to(save_path=os.path.join(output_dir, model.save_name + ".nemo"))
+ # Good practice: save your training arguments together with the trained model
+ torch.save(self.args, os.path.join(output_dir, "training_args.bin"))
+
+ # Initialize Trainer
+ trainer = NeMoTrainer(
+ model=model,
+ args=training_args,
+ compute_metrics=compute_metrics,
+ train_dataset=vectorized_datasets['train'] if training_args.do_train else None,
+ eval_dataset=vectorized_datasets['eval'] if training_args.do_eval else None,
+ data_collator=NeMoDataCollator,
+ callbacks=[UnfreezeEncoderCallback] if model_args.unfreeze_encoder else None,
+ )
+
+ # 8. Finally, we can start training
+
+ # Training
+ if training_args.do_train:
+
+ # use the last checkpoint if one exists
+ if last_checkpoint is not None:
+ checkpoint = last_checkpoint
+ elif model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path):
+ checkpoint = model_args.model_name_or_path
+ else:
+ checkpoint = None
+
+ train_result = trainer.train(resume_from_checkpoint=checkpoint)
+ trainer.save_model()
+
+ metrics = train_result.metrics
+ max_train_samples = (
+ data_args.max_train_samples
+ if data_args.max_train_samples is not None
+ else len(vectorized_datasets["train"])
+ )
+ metrics["train_samples"] = min(max_train_samples, len(vectorized_datasets["train"]))
+
+ trainer.log_metrics("train", metrics)
+ trainer.save_metrics("train", metrics)
+ trainer.save_state()
+
+ # Change decoding strategy for final eval/predict
+ if training_args.do_eval or training_args.do_predict:
+ # set beam search decoding config
+ beam_decoding_config = copy.deepcopy(trainer.model.cfg.decoding)
+ beam_decoding_config.strategy = model_args.final_decoding_strategy
+ beam_decoding_config.beam.beam_size = model_args.final_num_beams
+
+ trainer.model.change_decoding_strategy(beam_decoding_config)
+
+ results = {}
+ if training_args.do_eval:
+ logger.info(f"*** Running Final Evaluation ({model_args.final_decoding_strategy}) ***")
+
+ metrics = trainer.evaluate()
+ max_eval_samples = (
+ data_args.max_eval_samples if data_args.max_eval_samples is not None else len(vectorized_datasets["eval"])
+ )
+ metrics["eval_samples"] = min(max_eval_samples, len(vectorized_datasets["eval"]))
+
+ trainer.log_metrics("eval", metrics)
+ trainer.save_metrics("eval", metrics)
+
+ if training_args.do_predict:
+ logger.info(f"*** Running Final Prediction ({model_args.final_decoding_strategy}) ***")
+
+ for split in test_split:
+ predict_results = trainer.predict(
+ vectorized_datasets[split], metric_key_prefix=split, )
+ metrics = predict_results.metrics
+ max_predict_samples = (
+ data_args.max_predict_samples if data_args.max_predict_samples is not None else len(vectorized_datasets[split])
+ )
+ metrics[f"{split}_samples"] = min(max_predict_samples, len(vectorized_datasets[split]))
+
+ trainer.log_metrics(split, metrics)
+ trainer.save_metrics(split, metrics)
+
+ if "wandb" in training_args.report_to:
+ import wandb
+ metrics = {os.path.join(split, k[len(split)+1:]): v for k, v in metrics.items()}
+ wandb.log(metrics)
+
+ # re-evaluate on the test set, this time computing the CER
+ # running evaluation twice is wasteful, but far simpler than computing both metrics in a single pass
+ trainer.model.wer.use_cer = True
+ trainer.model.change_decoding_strategy(trainer.model.cfg.decoding)
+
+ for split in test_split:
+ predict_results = trainer.predict(
+ vectorized_datasets[split], metric_key_prefix=split, )
+ metrics = predict_results.metrics
+ # the returned metric is now the CER, but it is still returned under the "{split}_wer" key; rename it here
+ metrics = {f"{split}_cer": metrics[f"{split}_wer"]}
+
+ trainer.log_metrics(split, metrics)
+ trainer.save_metrics(split, metrics)
+
+ if "wandb" in training_args.report_to:
+ metrics = {os.path.join(split, k[len(split) + 1:]): v for k, v in metrics.items()}
+ wandb.log(metrics)
+
+ # Write model card and (optionally) push to hub
+ config_name = data_args.dataset_config_name if data_args.dataset_config_name is not None else "na"
+ kwargs = {
+ "finetuned_from": model_args.model_name_or_path,
+ "tasks": "speech-recognition",
+ "tags": ["automatic-speech-recognition", data_args.dataset_name],
+ "dataset_args": (
+ f"Config: {config_name}, Training split: {data_args.train_split_name}, Eval split:"
+ f" {data_args.eval_split_name}"
+ ),
+ "dataset": f"{data_args.dataset_name.upper()} - {config_name.upper()}",
+ }
+ if "common_voice" in data_args.dataset_name:
+ kwargs["language"] = config_name
+
+ if training_args.push_to_hub:
+ trainer.push_to_hub(**kwargs)
+ #else:
+ #trainer.create_model_card(**kwargs)
+
+ return results
+
+
+ if __name__ == "__main__":
+ main()
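For reference, a checkpoint written by `NeMoTrainer._save` above can be reloaded directly with NeMo. The snippet below is an illustrative sketch and not part of the committed script: it assumes the saved `.nemo` archive restores as a standard `EncDecRNNTBPEModel`, the checkpoint and audio paths are placeholders, and the exact `transcribe()` signature depends on the installed NeMo version.

import nemo.collections.asr as nemo_asr

# _save stores the model as "<output_dir>/<save_name>.nemo"; this concrete path is a placeholder
checkpoint_path = "./conformer-transducer-xl-cv9/checkpoint-20000/stt_en_conformer_transducer_xlarge.nemo"
model = nemo_asr.models.EncDecRNNTBPEModel.restore_from(restore_path=checkpoint_path, map_location="cpu")

# greedy transcription of a local audio file (placeholder path)
hypotheses = model.transcribe(paths2audio_files=["example.wav"], batch_size=1)
print(hypotheses)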
scripts/run_batch_size_sweep.yaml ADDED
@@ -0,0 +1,61 @@
+ command:
+ - python3
+ - ${program}
+ - --use_auth_token
+ - --do_eval
+ - --group_by_length
+ - --overwrite_output_dir
+ - --fp16
+ - --do_lower_case
+ - --do_eval
+ - --do_train
+ - --fuse_loss_wer
+ - ${args}
+ method: grid
+ metric:
+ goal: minimize
+ name: train/train_loss
+ parameters:
+ config_path:
+ value: conf/conformer_transducer_bpe_xlarge.yaml
+ dataset_config_name:
+ value: clean
+ dataset_name:
+ value: librispeech_asr
+ max_steps:
+ value: 50
+ model_name_or_path:
+ value: stt_en_conformer_transducer_xlarge
+ output_dir:
+ value: ./sweep_output_dir
+ gradient_accumulation_steps:
+ values:
+ - 1
+ - 2
+ per_device_train_batch_size:
+ values:
+ - 8
+ - 16
+ fused_batch_size:
+ values:
+ - 4
+ - 8
+ - 16
+ per_device_eval_batch_size:
+ value: 4
+ preprocessing_num_workers:
+ value: 1
+ train_split_name:
+ value: train.100[:500]
+ eval_split_name:
+ value: validation[:100]
+ tokenizer_path:
+ value: tokenizer
+ vocab_size:
+ value: 1024
+ wandb_project:
+ value: rnnt-debug
+ logging_steps:
+ value: 5
+ program: run_speech_recognition_rnnt.py
+ project: rnnt-debug
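The file above is a Weights & Biases sweep configuration for the batch-size grid search. A minimal sketch of registering and running it from Python is shown below; it is not part of the commit, assumes `wandb` and `pyyaml` are installed and the user is logged in, and my understanding is that an agent started without a function runs the `program`/`command` defined in the config (the CLI equivalent would be `wandb sweep` followed by `wandb agent`).

import yaml
import wandb

# load the sweep definition committed above
with open("scripts/run_batch_size_sweep.yaml") as f:
    sweep_config = yaml.safe_load(f)

# register the sweep, then start an agent that launches run_speech_recognition_rnnt.py
# with the sampled hyperparameters (assumption: no function is needed for command-based sweeps)
sweep_id = wandb.sweep(sweep=sweep_config, project=sweep_config.get("project", "rnnt-debug"))
wandb.agent(sweep_id)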
scripts/run_common_voice_9.sh ADDED
@@ -0,0 +1,38 @@
+ #!/usr/bin/env bash
+ CUDA_VISIBLE_DEVICES=1 python run_speech_recognition_rnnt.py \
+ --config_path="conf/conformer_transducer_bpe_xlarge.yaml" \
+ --model_name_or_path="stt_en_conformer_transducer_xlarge" \
+ --dataset_name="mozilla-foundation/common_voice_9_0" \
+ --tokenizer_path="tokenizer" \
+ --vocab_size="1024" \
+ --num_train_epochs="0.90" \
+ --dataset_config_name="en" \
+ --train_split_name="train" \
+ --eval_split_name="validation" \
+ --test_split_name="test" \
+ --text_column_name="sentence" \
+ --output_dir="./conformer-transducer-xl-cv9" \
+ --run_name="rnnt-cv9-baseline" \
+ --wandb_project="rnnt" \
+ --per_device_train_batch_size="8" \
+ --per_device_eval_batch_size="4" \
+ --logging_steps="50" \
+ --learning_rate="1e-4" \
+ --warmup_steps="500" \
+ --save_strategy="steps" \
+ --save_steps="20000" \
+ --evaluation_strategy="steps" \
+ --eval_steps="20000" \
+ --report_to="wandb" \
+ --preprocessing_num_workers="4" \
+ --fused_batch_size="8" \
+ --length_column_name="input_lengths" \
+ --do_lower_case="False" \
+ --fuse_loss_wer \
+ --group_by_length \
+ --overwrite_output_dir \
+ --do_train \
+ --do_eval \
+ --do_predict \
+ --push_to_hub \
+ --use_auth_token