diff --git "a/log/log-train-2022-11-15-13-11-38-2" "b/log/log-train-2022-11-15-13-11-38-2" new file mode 100644--- /dev/null +++ "b/log/log-train-2022-11-15-13-11-38-2" @@ -0,0 +1,14017 @@ +2022-11-15 13:11:38,147 INFO [train.py:944] (2/4) Training started +2022-11-15 13:11:38,147 INFO [train.py:954] (2/4) Device: cuda:2 +2022-11-15 13:11:38,150 INFO [train.py:963] (2/4) {'best_train_loss': inf, 'best_valid_loss': inf, 'best_train_epoch': -1, 'best_valid_epoch': -1, 'batch_idx_train': 0, 'log_interval': 100, 'reset_interval': 200, 'valid_interval': 3000, 'feature_dim': 80, 'subsampling_factor': 4, 'warm_step': 2000, 'env_info': {'k2-version': '1.21', 'k2-build-type': 'Debug', 'k2-with-cuda': True, 'k2-git-sha1': 'f271e82ef30f75fecbae44b163e1244e53def116', 'k2-git-date': 'Fri Oct 28 05:02:16 2022', 'lhotse-version': '1.9.0.dev+git.97bf4b0.dirty', 'torch-version': '1.10.0+cu111', 'torch-cuda-available': True, 'torch-cuda-version': '11.1', 'python-version': '3.8', 'icefall-git-branch': 'ami', 'icefall-git-sha1': '65f14ba-dirty', 'icefall-git-date': 'Mon Nov 14 18:45:09 2022', 'icefall-path': '/exp/draj/mini_scale_2022/icefall', 'k2-path': '/exp/draj/mini_scale_2022/k2/k2/python/k2/__init__.py', 'lhotse-path': '/exp/draj/mini_scale_2022/lhotse/lhotse/__init__.py', 'hostname': 'r8n04', 'IP address': '10.1.8.4'}, 'world_size': 4, 'master_port': 12354, 'tensorboard': True, 'num_epochs': 15, 'start_epoch': 1, 'start_batch': 0, 'exp_dir': PosixPath('pruned_transducer_stateless7/exp/v2'), 'bpe_model': 'data/lang_bpe_500/bpe.model', 'base_lr': 0.05, 'lr_batches': 5000, 'lr_epochs': 3.5, 'context_size': 2, 'prune_range': 5, 'lm_scale': 0.25, 'am_scale': 0.0, 'simple_loss_scale': 0.5, 'seed': 42, 'print_diagnostics': False, 'inf_check': False, 'save_every_n': 5000, 'keep_last_k': 10, 'average_period': 200, 'use_fp16': True, 'num_encoder_layers': '2,4,3,2,4', 'feedforward_dims': '1024,1024,2048,2048,1024', 'nhead': '8,8,8,8,8', 'encoder_dims': '384,384,384,384,384', 'attention_dims': '192,192,192,192,192', 'encoder_unmasked_dims': '256,256,256,256,256', 'zipformer_downsampling_factors': '1,2,4,8,2', 'cnn_module_kernels': '31,31,31,31,31', 'decoder_dim': 512, 'joiner_dim': 512, 'manifest_dir': PosixPath('data/manifests'), 'enable_musan': True, 'concatenate_cuts': False, 'duration_factor': 1.0, 'gap': 1.0, 'max_duration': 120, 'num_buckets': 50, 'on_the_fly_feats': False, 'shuffle': True, 'num_workers': 8, 'enable_spec_aug': True, 'spec_aug_time_warp_factor': 80, 'blank_id': 0, 'vocab_size': 500} +2022-11-15 13:11:38,150 INFO [train.py:965] (2/4) About to create model +2022-11-15 13:11:38,530 INFO [zipformer.py:176] (2/4) At encoder stack 4, which has downsampling_factor=2, we will combine the outputs of layers 1 and 3, with downsampling_factors=2 and 8. +2022-11-15 13:11:38,540 INFO [train.py:969] (2/4) Number of model parameters: 70369391 +2022-11-15 13:11:42,924 INFO [train.py:984] (2/4) Using DDP +2022-11-15 13:11:43,611 INFO [asr_datamodule.py:353] (2/4) About to get AMI train cuts +2022-11-15 13:11:43,615 INFO [asr_datamodule.py:201] (2/4) About to get Musan cuts +2022-11-15 13:11:45,156 INFO [asr_datamodule.py:206] (2/4) Enable MUSAN +2022-11-15 13:11:45,156 INFO [asr_datamodule.py:229] (2/4) Enable SpecAugment +2022-11-15 13:11:45,156 INFO [asr_datamodule.py:230] (2/4) Time warp factor: 80 +2022-11-15 13:11:45,156 INFO [asr_datamodule.py:243] (2/4) About to create train dataset +2022-11-15 13:11:45,156 INFO [asr_datamodule.py:256] (2/4) Using DynamicBucketingSampler. +2022-11-15 13:11:45,503 INFO [asr_datamodule.py:264] (2/4) About to create train dataloader +2022-11-15 13:11:45,504 INFO [asr_datamodule.py:385] (2/4) About to get AMI IHM dev cuts +2022-11-15 13:11:45,505 INFO [asr_datamodule.py:296] (2/4) About to create dev dataset +2022-11-15 13:11:45,835 INFO [asr_datamodule.py:311] (2/4) About to create dev dataloader +2022-11-15 13:12:20,648 INFO [train.py:876] (2/4) Epoch 1, batch 0, loss[loss=3.669, simple_loss=3.313, pruned_loss=3.553, over 5550.00 frames. ], tot_loss[loss=3.669, simple_loss=3.313, pruned_loss=3.553, over 5550.00 frames. ], batch size: 21, lr: 2.50e-02, grad_scale: 2.0 +2022-11-15 13:12:20,648 INFO [train.py:899] (2/4) Computing validation loss +2022-11-15 13:12:37,307 INFO [train.py:908] (2/4) Epoch 1, validation: loss=3.424, simple_loss=3.08, pruned_loss=3.435, over 1530663.00 frames. +2022-11-15 13:12:37,341 INFO [train.py:909] (2/4) Maximum memory allocated so far is 2701MB +2022-11-15 13:12:39,701 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=5.0, num_to_drop=2, layers_to_drop={0, 3} +2022-11-15 13:12:50,433 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=23.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 13:12:54,119 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=8.32 vs. limit=2.0 +2022-11-15 13:12:56,899 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=3.74 vs. limit=2.0 +2022-11-15 13:13:09,689 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=114.58 vs. limit=5.0 +2022-11-15 13:13:21,337 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=98.22 vs. limit=5.0 +2022-11-15 13:13:23,359 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=83.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 13:13:32,522 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 2.298e+01 5.514e+01 1.134e+02 1.922e+02 2.006e+03, threshold=2.268e+02, percent-clipped=0.0 +2022-11-15 13:13:32,569 INFO [train.py:876] (2/4) Epoch 1, batch 100, loss[loss=0.4438, simple_loss=0.3864, pruned_loss=0.4616, over 5567.00 frames. ], tot_loss[loss=0.7457, simple_loss=0.6701, pruned_loss=0.688, over 431469.55 frames. ], batch size: 25, lr: 3.00e-02, grad_scale: 2.0 +2022-11-15 13:13:57,874 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=144.0, num_to_drop=2, layers_to_drop={2, 3} +2022-11-15 13:14:10,437 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.3041, 4.3040, 4.3041, 4.3040, 4.3040, 4.3041, 4.3041, 4.3041], + device='cuda:2'), covar=tensor([3.5385e-05, 6.4934e-05, 4.0604e-05, 2.0630e-05, 2.3088e-05, 4.9322e-05, + 3.8256e-05, 3.5155e-05], device='cuda:2'), in_proj_covar=tensor([0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009], + device='cuda:2'), out_proj_covar=tensor([8.6628e-06, 8.7895e-06, 8.7104e-06, 8.7401e-06, 9.0487e-06, 8.7994e-06, + 8.7183e-06, 8.8455e-06], device='cuda:2') +2022-11-15 13:14:22,121 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=3.63 vs. limit=2.0 +2022-11-15 13:14:26,994 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=3.88 vs. limit=2.0 +2022-11-15 13:14:29,288 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.46 vs. limit=2.0 +2022-11-15 13:14:31,596 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.718e+01 2.547e+01 3.263e+01 4.120e+01 1.011e+02, threshold=6.525e+01, percent-clipped=0.0 +2022-11-15 13:14:31,638 INFO [train.py:876] (2/4) Epoch 1, batch 200, loss[loss=0.4305, simple_loss=0.3686, pruned_loss=0.4223, over 5752.00 frames. ], tot_loss[loss=0.5542, simple_loss=0.4886, pruned_loss=0.5324, over 689584.11 frames. ], batch size: 20, lr: 3.50e-02, grad_scale: 2.0 +2022-11-15 13:14:34,506 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=3.77 vs. limit=2.0 +2022-11-15 13:14:59,977 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=8.15 vs. limit=2.0 +2022-11-15 13:15:14,485 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.1306, 3.1334, 3.1336, 3.1340, 3.1335, 3.1310, 3.1352, 3.1348], + device='cuda:2'), covar=tensor([0.0016, 0.0017, 0.0013, 0.0010, 0.0018, 0.0011, 0.0011, 0.0016], + device='cuda:2'), in_proj_covar=tensor([0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009], + device='cuda:2'), out_proj_covar=tensor([8.8730e-06, 8.8142e-06, 9.1153e-06, 8.9293e-06, 9.0572e-06, 9.1478e-06, + 8.9336e-06, 8.9902e-06], device='cuda:2') +2022-11-15 13:15:24,757 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=296.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 13:15:27,875 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=300.0, num_to_drop=2, layers_to_drop={2, 3} +2022-11-15 13:15:28,297 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 2.173e+01 3.095e+01 4.109e+01 5.560e+01 3.461e+02, threshold=8.218e+01, percent-clipped=17.0 +2022-11-15 13:15:28,338 INFO [train.py:876] (2/4) Epoch 1, batch 300, loss[loss=0.4127, simple_loss=0.3481, pruned_loss=0.3856, over 5744.00 frames. ], tot_loss[loss=0.4865, simple_loss=0.4231, pruned_loss=0.4616, over 842225.52 frames. ], batch size: 16, lr: 4.00e-02, grad_scale: 2.0 +2022-11-15 13:15:41,991 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.85 vs. limit=2.0 +2022-11-15 13:16:00,409 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=357.0, num_to_drop=2, layers_to_drop={0, 3} +2022-11-15 13:16:16,647 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=387.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 13:16:24,233 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=14.86 vs. limit=5.0 +2022-11-15 13:16:24,454 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 2.372e+01 3.559e+01 4.600e+01 6.651e+01 2.649e+02, threshold=9.199e+01, percent-clipped=13.0 +2022-11-15 13:16:24,499 INFO [train.py:876] (2/4) Epoch 1, batch 400, loss[loss=0.295, simple_loss=0.2399, pruned_loss=0.2821, over 4504.00 frames. ], tot_loss[loss=0.4493, simple_loss=0.3854, pruned_loss=0.4179, over 932707.00 frames. ], batch size: 5, lr: 4.50e-02, grad_scale: 4.0 +2022-11-15 13:16:46,789 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=439.0, num_to_drop=2, layers_to_drop={0, 3} +2022-11-15 13:16:51,924 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=448.0, num_to_drop=2, layers_to_drop={1, 3} +2022-11-15 13:17:03,822 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.8103, 2.8364, 2.7963, 2.8444, 2.8242, 2.8057, 2.8517, 2.8483], + device='cuda:2'), covar=tensor([0.0139, 0.0123, 0.0178, 0.0139, 0.0138, 0.0140, 0.0128, 0.0125], + device='cuda:2'), in_proj_covar=tensor([0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0008, 0.0009], + device='cuda:2'), out_proj_covar=tensor([8.7213e-06, 8.5754e-06, 8.8944e-06, 8.6857e-06, 8.8952e-06, 9.1161e-06, + 8.7809e-06, 8.9237e-06], device='cuda:2') +2022-11-15 13:17:22,363 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 2.467e+01 3.366e+01 4.397e+01 5.953e+01 8.256e+02, threshold=8.794e+01, percent-clipped=9.0 +2022-11-15 13:17:22,406 INFO [train.py:876] (2/4) Epoch 1, batch 500, loss[loss=0.3699, simple_loss=0.3125, pruned_loss=0.2966, over 5045.00 frames. ], tot_loss[loss=0.433, simple_loss=0.3661, pruned_loss=0.3929, over 989714.50 frames. ], batch size: 109, lr: 4.99e-02, grad_scale: 4.0 +2022-11-15 13:17:28,644 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.9372, 3.1426, 3.1537, 3.0829, 3.0825, 3.1426, 3.1746, 3.1476], + device='cuda:2'), covar=tensor([0.0817, 0.0194, 0.0201, 0.0340, 0.0500, 0.0308, 0.0182, 0.0226], + device='cuda:2'), in_proj_covar=tensor([0.0010, 0.0009, 0.0009, 0.0010, 0.0009, 0.0009, 0.0009, 0.0009], + device='cuda:2'), out_proj_covar=tensor([9.3641e-06, 8.6780e-06, 8.7212e-06, 9.1344e-06, 8.9515e-06, 9.1615e-06, + 8.5495e-06, 8.7345e-06], device='cuda:2') +2022-11-15 13:17:47,461 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=10.26 vs. limit=5.0 +2022-11-15 13:17:57,327 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=562.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 13:18:00,627 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=568.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 13:18:14,150 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=590.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 13:18:20,264 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=600.0, num_to_drop=2, layers_to_drop={0, 2} +2022-11-15 13:18:20,635 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 2.558e+01 3.991e+01 4.815e+01 6.242e+01 3.465e+02, threshold=9.630e+01, percent-clipped=11.0 +2022-11-15 13:18:20,677 INFO [train.py:876] (2/4) Epoch 1, batch 600, loss[loss=0.4093, simple_loss=0.3285, pruned_loss=0.3512, over 5555.00 frames. ], tot_loss[loss=0.4246, simple_loss=0.3536, pruned_loss=0.3762, over 1025082.07 frames. ], batch size: 13, lr: 4.98e-02, grad_scale: 4.0 +2022-11-15 13:18:32,994 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=623.0, num_to_drop=2, layers_to_drop={0, 3} +2022-11-15 13:18:36,256 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=629.0, num_to_drop=2, layers_to_drop={2, 3} +2022-11-15 13:18:47,938 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=648.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 13:18:49,196 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=7.82 vs. limit=5.0 +2022-11-15 13:18:49,670 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=651.0, num_to_drop=2, layers_to_drop={1, 2} +2022-11-15 13:18:50,154 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=652.0, num_to_drop=2, layers_to_drop={2, 3} +2022-11-15 13:18:58,908 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=7.18 vs. limit=5.0 +2022-11-15 13:19:18,387 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 2.943e+01 4.768e+01 6.411e+01 9.551e+01 4.417e+02, threshold=1.282e+02, percent-clipped=24.0 +2022-11-15 13:19:18,429 INFO [train.py:876] (2/4) Epoch 1, batch 700, loss[loss=0.3836, simple_loss=0.3071, pruned_loss=0.3137, over 5434.00 frames. ], tot_loss[loss=0.4163, simple_loss=0.342, pruned_loss=0.3597, over 1044483.30 frames. ], batch size: 64, lr: 4.98e-02, grad_scale: 4.0 +2022-11-15 13:19:39,849 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=739.0, num_to_drop=2, layers_to_drop={0, 1} +2022-11-15 13:19:42,022 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=743.0, num_to_drop=2, layers_to_drop={1, 3} +2022-11-15 13:20:07,234 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=787.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 13:20:15,193 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 3.610e+01 5.331e+01 6.709e+01 8.250e+01 2.326e+02, threshold=1.342e+02, percent-clipped=8.0 +2022-11-15 13:20:15,237 INFO [train.py:876] (2/4) Epoch 1, batch 800, loss[loss=0.3635, simple_loss=0.2859, pruned_loss=0.293, over 5463.00 frames. ], tot_loss[loss=0.4095, simple_loss=0.3322, pruned_loss=0.3445, over 1062978.29 frames. ], batch size: 10, lr: 4.97e-02, grad_scale: 8.0 +2022-11-15 13:20:21,195 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.32 vs. limit=2.0 +2022-11-15 13:20:42,187 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=847.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 13:21:13,727 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 3.878e+01 6.865e+01 9.860e+01 1.395e+02 3.183e+02, threshold=1.972e+02, percent-clipped=28.0 +2022-11-15 13:21:13,772 INFO [train.py:876] (2/4) Epoch 1, batch 900, loss[loss=0.4097, simple_loss=0.3282, pruned_loss=0.3075, over 5321.00 frames. ], tot_loss[loss=0.4027, simple_loss=0.3239, pruned_loss=0.3287, over 1060063.48 frames. ], batch size: 79, lr: 4.96e-02, grad_scale: 8.0 +2022-11-15 13:21:17,934 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=908.0, num_to_drop=2, layers_to_drop={0, 3} +2022-11-15 13:21:22,938 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.5579, 3.6179, 3.7147, 3.6287, 3.5580, 3.4685, 3.4315, 3.3934], + device='cuda:2'), covar=tensor([0.3376, 0.3584, 0.2030, 0.2828, 0.3237, 0.3035, 0.2940, 0.2973], + device='cuda:2'), in_proj_covar=tensor([0.0017, 0.0018, 0.0015, 0.0016, 0.0017, 0.0017, 0.0016, 0.0017], + device='cuda:2'), out_proj_covar=tensor([1.5634e-05, 1.6281e-05, 1.3724e-05, 1.4676e-05, 1.4347e-05, 1.4563e-05, + 1.4064e-05, 1.4754e-05], device='cuda:2') +2022-11-15 13:21:24,325 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=918.0, num_to_drop=2, layers_to_drop={1, 2} +2022-11-15 13:21:28,509 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=924.0, num_to_drop=2, layers_to_drop={0, 2} +2022-11-15 13:21:40,751 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=946.0, num_to_drop=2, layers_to_drop={1, 2} +2022-11-15 13:21:44,152 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=952.0, num_to_drop=2, layers_to_drop={0, 1} +2022-11-15 13:22:12,363 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=1000.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 13:22:12,819 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 5.013e+01 8.825e+01 1.129e+02 1.516e+02 3.115e+02, threshold=2.258e+02, percent-clipped=11.0 +2022-11-15 13:22:12,863 INFO [train.py:876] (2/4) Epoch 1, batch 1000, loss[loss=0.389, simple_loss=0.3063, pruned_loss=0.2895, over 5787.00 frames. ], tot_loss[loss=0.3965, simple_loss=0.3171, pruned_loss=0.3133, over 1063257.80 frames. ], batch size: 20, lr: 4.95e-02, grad_scale: 8.0 +2022-11-15 13:22:16,604 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.22 vs. limit=2.0 +2022-11-15 13:22:37,953 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=1043.0, num_to_drop=2, layers_to_drop={0, 1} +2022-11-15 13:22:37,975 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.9634, 1.8324, 2.0293, 2.0224, 1.9472, 1.8960, 1.8537, 2.1597], + device='cuda:2'), covar=tensor([0.3032, 0.2555, 0.2345, 0.1935, 0.2618, 0.3168, 0.2621, 0.2879], + device='cuda:2'), in_proj_covar=tensor([0.0021, 0.0019, 0.0021, 0.0019, 0.0021, 0.0021, 0.0020, 0.0020], + device='cuda:2'), out_proj_covar=tensor([1.8439e-05, 1.6555e-05, 1.9130e-05, 1.7508e-05, 2.0165e-05, 2.0807e-05, + 2.0487e-05, 1.7877e-05], device='cuda:2') +2022-11-15 13:22:40,944 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.62 vs. limit=2.0 +2022-11-15 13:22:42,772 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=6.12 vs. limit=5.0 +2022-11-15 13:23:05,512 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=1091.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 13:23:10,799 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.12 vs. limit=2.0 +2022-11-15 13:23:11,404 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.97 vs. limit=2.0 +2022-11-15 13:23:11,681 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 4.943e+01 8.541e+01 1.080e+02 1.456e+02 2.899e+02, threshold=2.160e+02, percent-clipped=5.0 +2022-11-15 13:23:11,724 INFO [train.py:876] (2/4) Epoch 1, batch 1100, loss[loss=0.376, simple_loss=0.298, pruned_loss=0.2687, over 5715.00 frames. ], tot_loss[loss=0.3942, simple_loss=0.3139, pruned_loss=0.3016, over 1070465.25 frames. ], batch size: 17, lr: 4.94e-02, grad_scale: 8.0 +2022-11-15 13:23:17,346 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.8832, 2.9567, 3.2050, 2.9240, 2.9115, 2.8161, 2.3787, 3.0286], + device='cuda:2'), covar=tensor([0.7752, 0.7365, 0.5489, 0.6264, 0.7624, 0.7557, 0.8704, 0.6540], + device='cuda:2'), in_proj_covar=tensor([0.0029, 0.0028, 0.0029, 0.0030, 0.0032, 0.0032, 0.0027, 0.0027], + device='cuda:2'), out_proj_covar=tensor([2.9778e-05, 2.8350e-05, 2.6948e-05, 2.6910e-05, 3.0352e-05, 3.2507e-05, + 2.6595e-05, 2.9129e-05], device='cuda:2') +2022-11-15 13:23:42,404 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=1153.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 13:23:45,640 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.07 vs. limit=2.0 +2022-11-15 13:23:50,362 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.6819, 2.4157, 2.1184, 2.4483, 2.7628, 2.7163, 2.2751, 2.6321], + device='cuda:2'), covar=tensor([0.2654, 0.3254, 0.2923, 0.2526, 0.2283, 0.1767, 0.3027, 0.2349], + device='cuda:2'), in_proj_covar=tensor([0.0013, 0.0014, 0.0012, 0.0012, 0.0012, 0.0011, 0.0014, 0.0013], + device='cuda:2'), out_proj_covar=tensor([1.1722e-05, 1.1999e-05, 1.0266e-05, 1.0795e-05, 9.2933e-06, 1.0070e-05, + 1.0980e-05, 1.0527e-05], device='cuda:2') +2022-11-15 13:24:09,645 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 5.854e+01 1.222e+02 1.602e+02 1.992e+02 6.604e+02, threshold=3.204e+02, percent-clipped=19.0 +2022-11-15 13:24:09,690 INFO [train.py:876] (2/4) Epoch 1, batch 1200, loss[loss=0.3965, simple_loss=0.3141, pruned_loss=0.2759, over 5558.00 frames. ], tot_loss[loss=0.3895, simple_loss=0.3091, pruned_loss=0.2893, over 1077323.60 frames. ], batch size: 15, lr: 4.93e-02, grad_scale: 8.0 +2022-11-15 13:24:10,867 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=1203.0, num_to_drop=2, layers_to_drop={1, 2} +2022-11-15 13:24:17,635 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=1214.0, num_to_drop=2, layers_to_drop={0, 2} +2022-11-15 13:24:20,298 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=1218.0, num_to_drop=2, layers_to_drop={0, 1} +2022-11-15 13:24:23,565 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=1224.0, num_to_drop=1, layers_to_drop={2} +2022-11-15 13:24:36,258 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=1246.0, num_to_drop=1, layers_to_drop={2} +2022-11-15 13:24:41,556 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=6.36 vs. limit=5.0 +2022-11-15 13:24:46,008 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.21 vs. limit=5.0 +2022-11-15 13:24:48,468 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=1266.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 13:24:50,683 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.5638, 2.7349, 2.0237, 2.2377, 2.6324, 2.7615, 2.4034, 2.7195], + device='cuda:2'), covar=tensor([0.3014, 0.2480, 0.3315, 0.4369, 0.2434, 0.2053, 0.3126, 0.2371], + device='cuda:2'), in_proj_covar=tensor([0.0013, 0.0014, 0.0013, 0.0013, 0.0012, 0.0012, 0.0014, 0.0013], + device='cuda:2'), out_proj_covar=tensor([1.1134e-05, 1.1781e-05, 1.0890e-05, 1.1740e-05, 9.8665e-06, 1.0099e-05, + 1.1657e-05, 1.1211e-05], device='cuda:2') +2022-11-15 13:24:51,634 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=1272.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 13:24:56,904 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=6.69 vs. limit=5.0 +2022-11-15 13:25:04,718 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=1294.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 13:25:06,557 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.23 vs. limit=2.0 +2022-11-15 13:25:08,461 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.439e+01 1.113e+02 1.512e+02 2.067e+02 6.699e+02, threshold=3.023e+02, percent-clipped=4.0 +2022-11-15 13:25:08,505 INFO [train.py:876] (2/4) Epoch 1, batch 1300, loss[loss=0.3279, simple_loss=0.2606, pruned_loss=0.2219, over 5454.00 frames. ], tot_loss[loss=0.383, simple_loss=0.3032, pruned_loss=0.2768, over 1077186.71 frames. ], batch size: 11, lr: 4.92e-02, grad_scale: 8.0 +2022-11-15 13:26:09,309 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.007e+01 1.313e+02 1.838e+02 2.468e+02 4.304e+02, threshold=3.675e+02, percent-clipped=9.0 +2022-11-15 13:26:09,350 INFO [train.py:876] (2/4) Epoch 1, batch 1400, loss[loss=0.4148, simple_loss=0.3245, pruned_loss=0.2792, over 5696.00 frames. ], tot_loss[loss=0.3754, simple_loss=0.2965, pruned_loss=0.2644, over 1087791.47 frames. ], batch size: 28, lr: 4.91e-02, grad_scale: 8.0 +2022-11-15 13:26:16,362 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.8418, 4.5644, 5.2821, 4.6778, 4.3884, 4.4662, 4.8779, 4.6288], + device='cuda:2'), covar=tensor([0.0887, 0.1129, 0.0483, 0.0921, 0.0854, 0.1529, 0.0672, 0.0749], + device='cuda:2'), in_proj_covar=tensor([0.0021, 0.0021, 0.0019, 0.0019, 0.0015, 0.0021, 0.0019, 0.0021], + device='cuda:2'), out_proj_covar=tensor([2.0672e-05, 2.0302e-05, 1.8298e-05, 1.9085e-05, 1.5590e-05, 1.9782e-05, + 1.7941e-05, 2.0606e-05], device='cuda:2') +2022-11-15 13:26:28,879 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=1434.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 13:26:38,693 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=6.37 vs. limit=5.0 +2022-11-15 13:27:05,768 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=1495.0, num_to_drop=2, layers_to_drop={0, 3} +2022-11-15 13:27:09,431 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.161e+01 1.360e+02 1.948e+02 2.853e+02 7.338e+02, threshold=3.896e+02, percent-clipped=10.0 +2022-11-15 13:27:09,475 INFO [train.py:876] (2/4) Epoch 1, batch 1500, loss[loss=0.4312, simple_loss=0.335, pruned_loss=0.2861, over 5425.00 frames. ], tot_loss[loss=0.373, simple_loss=0.2933, pruned_loss=0.2573, over 1080660.59 frames. ], batch size: 70, lr: 4.89e-02, grad_scale: 8.0 +2022-11-15 13:27:10,780 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=1503.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 13:27:14,374 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=1509.0, num_to_drop=2, layers_to_drop={2, 3} +2022-11-15 13:27:40,010 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=1551.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 13:27:49,366 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=1566.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 13:28:10,525 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.692e+01 1.557e+02 1.959e+02 2.401e+02 7.122e+02, threshold=3.919e+02, percent-clipped=3.0 +2022-11-15 13:28:10,571 INFO [train.py:876] (2/4) Epoch 1, batch 1600, loss[loss=0.3664, simple_loss=0.2825, pruned_loss=0.24, over 5542.00 frames. ], tot_loss[loss=0.3661, simple_loss=0.2879, pruned_loss=0.2468, over 1081748.77 frames. ], batch size: 16, lr: 4.88e-02, grad_scale: 8.0 +2022-11-15 13:28:17,275 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=1611.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 13:28:26,869 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=1627.0, num_to_drop=2, layers_to_drop={0, 2} +2022-11-15 13:28:54,811 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=1672.0, num_to_drop=2, layers_to_drop={1, 2} +2022-11-15 13:29:04,405 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.1759, 1.1917, 1.1140, 1.1336, 1.1565, 1.1262, 0.9906, 1.0922], + device='cuda:2'), covar=tensor([0.0358, 0.0337, 0.0426, 0.0388, 0.0446, 0.0452, 0.0549, 0.0492], + device='cuda:2'), in_proj_covar=tensor([0.0027, 0.0023, 0.0024, 0.0025, 0.0026, 0.0026, 0.0027, 0.0027], + device='cuda:2'), out_proj_covar=tensor([2.3447e-05, 2.0978e-05, 2.2317e-05, 2.3222e-05, 2.5002e-05, 2.5550e-05, + 2.6182e-05, 2.4086e-05], device='cuda:2') +2022-11-15 13:29:05,832 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.98 vs. limit=2.0 +2022-11-15 13:29:10,265 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.66 vs. limit=5.0 +2022-11-15 13:29:11,721 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.804e+01 1.584e+02 2.065e+02 2.557e+02 6.153e+02, threshold=4.130e+02, percent-clipped=6.0 +2022-11-15 13:29:11,768 INFO [train.py:876] (2/4) Epoch 1, batch 1700, loss[loss=0.356, simple_loss=0.2775, pruned_loss=0.2271, over 5651.00 frames. ], tot_loss[loss=0.3629, simple_loss=0.2843, pruned_loss=0.2403, over 1080218.11 frames. ], batch size: 32, lr: 4.86e-02, grad_scale: 8.0 +2022-11-15 13:29:45,639 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.20 vs. limit=2.0 +2022-11-15 13:30:07,143 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=1790.0, num_to_drop=1, layers_to_drop={2} +2022-11-15 13:30:14,463 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.349e+01 1.628e+02 2.392e+02 3.072e+02 5.496e+02, threshold=4.784e+02, percent-clipped=8.0 +2022-11-15 13:30:14,505 INFO [train.py:876] (2/4) Epoch 1, batch 1800, loss[loss=0.3035, simple_loss=0.2428, pruned_loss=0.1867, over 5503.00 frames. ], tot_loss[loss=0.3579, simple_loss=0.28, pruned_loss=0.2328, over 1080165.20 frames. ], batch size: 10, lr: 4.85e-02, grad_scale: 8.0 +2022-11-15 13:30:19,421 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=1809.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 13:30:37,488 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=1838.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 13:30:37,997 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.6907, 3.4662, 3.5778, 3.5908, 3.5966, 3.4338, 3.6653, 3.4701], + device='cuda:2'), covar=tensor([0.0674, 0.0730, 0.0822, 0.0659, 0.0630, 0.0639, 0.0552, 0.0650], + device='cuda:2'), in_proj_covar=tensor([0.0026, 0.0028, 0.0028, 0.0027, 0.0021, 0.0026, 0.0026, 0.0027], + device='cuda:2'), out_proj_covar=tensor([2.7947e-05, 2.8953e-05, 2.8529e-05, 2.7769e-05, 2.1899e-05, 2.6218e-05, + 2.6350e-05, 2.7194e-05], device='cuda:2') +2022-11-15 13:30:49,385 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=1857.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 13:30:49,470 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=1857.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 13:30:54,467 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.23 vs. limit=5.0 +2022-11-15 13:30:57,325 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=1870.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 13:31:16,028 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=1899.0, num_to_drop=2, layers_to_drop={1, 3} +2022-11-15 13:31:17,085 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.491e+01 1.765e+02 2.160e+02 2.889e+02 5.463e+02, threshold=4.319e+02, percent-clipped=1.0 +2022-11-15 13:31:17,127 INFO [train.py:876] (2/4) Epoch 1, batch 1900, loss[loss=0.4021, simple_loss=0.3122, pruned_loss=0.2493, over 5796.00 frames. ], tot_loss[loss=0.3583, simple_loss=0.2796, pruned_loss=0.2293, over 1082078.91 frames. ], batch size: 22, lr: 4.83e-02, grad_scale: 8.0 +2022-11-15 13:31:17,577 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.72 vs. limit=5.0 +2022-11-15 13:31:25,788 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.06 vs. limit=5.0 +2022-11-15 13:31:27,808 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=1918.0, num_to_drop=2, layers_to_drop={0, 3} +2022-11-15 13:31:30,124 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=1922.0, num_to_drop=1, layers_to_drop={3} +2022-11-15 13:31:35,536 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=1931.0, num_to_drop=2, layers_to_drop={0, 3} +2022-11-15 13:31:56,152 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.06 vs. limit=2.0 +2022-11-15 13:31:58,479 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=1967.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 13:32:08,968 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.87 vs. limit=5.0 +2022-11-15 13:32:13,848 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.33 vs. limit=2.0 +2022-11-15 13:32:19,572 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.112e+02 1.971e+02 2.700e+02 3.319e+02 5.947e+02, threshold=5.400e+02, percent-clipped=8.0 +2022-11-15 13:32:19,616 INFO [train.py:876] (2/4) Epoch 1, batch 2000, loss[loss=0.3341, simple_loss=0.268, pruned_loss=0.2, over 5732.00 frames. ], tot_loss[loss=0.3577, simple_loss=0.2782, pruned_loss=0.2257, over 1077068.01 frames. ], batch size: 15, lr: 4.82e-02, grad_scale: 16.0 +2022-11-15 13:32:47,801 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.03 vs. limit=2.0 +2022-11-15 13:33:20,331 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=2090.0, num_to_drop=2, layers_to_drop={0, 1} +2022-11-15 13:33:27,169 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.484e+01 1.719e+02 2.410e+02 2.808e+02 6.250e+02, threshold=4.821e+02, percent-clipped=3.0 +2022-11-15 13:33:27,212 INFO [train.py:876] (2/4) Epoch 1, batch 2100, loss[loss=0.303, simple_loss=0.2465, pruned_loss=0.1797, over 5437.00 frames. ], tot_loss[loss=0.3538, simple_loss=0.2759, pruned_loss=0.2201, over 1083324.41 frames. ], batch size: 11, lr: 4.80e-02, grad_scale: 16.0 +2022-11-15 13:33:52,292 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=2138.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 13:33:57,660 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.30 vs. limit=2.0 +2022-11-15 13:34:01,503 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.04 vs. limit=2.0 +2022-11-15 13:34:29,775 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=2194.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 13:34:34,681 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.077e+02 1.857e+02 2.320e+02 3.014e+02 5.745e+02, threshold=4.640e+02, percent-clipped=3.0 +2022-11-15 13:34:34,724 INFO [train.py:876] (2/4) Epoch 1, batch 2200, loss[loss=0.2717, simple_loss=0.2223, pruned_loss=0.1606, over 5668.00 frames. ], tot_loss[loss=0.348, simple_loss=0.2723, pruned_loss=0.2144, over 1083134.71 frames. ], batch size: 11, lr: 4.78e-02, grad_scale: 16.0 +2022-11-15 13:34:43,176 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=2213.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 13:34:49,322 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=2222.0, num_to_drop=2, layers_to_drop={0, 1} +2022-11-15 13:34:51,872 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=2226.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 13:34:59,480 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.0303, 3.9972, 4.2724, 4.1035, 4.0317, 3.6346, 3.8124, 3.7873], + device='cuda:2'), covar=tensor([0.0364, 0.0334, 0.0324, 0.0300, 0.0451, 0.0623, 0.0391, 0.0510], + device='cuda:2'), in_proj_covar=tensor([0.0024, 0.0024, 0.0022, 0.0022, 0.0022, 0.0020, 0.0020, 0.0022], + device='cuda:2'), out_proj_covar=tensor([2.6495e-05, 2.5076e-05, 2.1422e-05, 2.1502e-05, 2.2333e-05, 2.3123e-05, + 2.0626e-05, 2.2638e-05], device='cuda:2') +2022-11-15 13:35:08,156 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.95 vs. limit=2.0 +2022-11-15 13:35:19,982 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=2267.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 13:35:21,885 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=2270.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 13:35:39,172 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.0777, 3.6646, 3.8578, 3.8693, 3.7746, 3.7940, 3.8684, 3.6561], + device='cuda:2'), covar=tensor([0.0462, 0.0509, 0.0641, 0.0425, 0.0455, 0.0457, 0.0413, 0.0462], + device='cuda:2'), in_proj_covar=tensor([0.0031, 0.0034, 0.0035, 0.0033, 0.0027, 0.0029, 0.0030, 0.0033], + device='cuda:2'), out_proj_covar=tensor([3.4051e-05, 3.6868e-05, 3.7917e-05, 3.4515e-05, 2.8413e-05, 3.1386e-05, + 3.1487e-05, 3.4283e-05], device='cuda:2') +2022-11-15 13:35:42,978 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.350e+01 1.693e+02 2.305e+02 3.135e+02 7.690e+02, threshold=4.610e+02, percent-clipped=6.0 +2022-11-15 13:35:43,020 INFO [train.py:876] (2/4) Epoch 1, batch 2300, loss[loss=0.3289, simple_loss=0.2669, pruned_loss=0.1955, over 5712.00 frames. ], tot_loss[loss=0.3412, simple_loss=0.2682, pruned_loss=0.2086, over 1084901.92 frames. ], batch size: 15, lr: 4.77e-02, grad_scale: 16.0 +2022-11-15 13:35:52,627 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=2315.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 13:36:09,637 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=2340.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 13:36:42,625 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.99 vs. limit=2.0 +2022-11-15 13:36:51,176 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.270e+01 1.877e+02 2.321e+02 3.011e+02 5.507e+02, threshold=4.642e+02, percent-clipped=4.0 +2022-11-15 13:36:51,221 INFO [train.py:876] (2/4) Epoch 1, batch 2400, loss[loss=0.3055, simple_loss=0.2556, pruned_loss=0.1777, over 5709.00 frames. ], tot_loss[loss=0.3357, simple_loss=0.2655, pruned_loss=0.2039, over 1080534.33 frames. ], batch size: 17, lr: 4.75e-02, grad_scale: 16.0 +2022-11-15 13:36:51,431 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=2401.0, num_to_drop=2, layers_to_drop={2, 3} +2022-11-15 13:36:55,839 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.6783, 3.5057, 3.4759, 3.1975, 3.1513, 3.1579, 2.8675, 3.3897], + device='cuda:2'), covar=tensor([0.0429, 0.0367, 0.0322, 0.0439, 0.0532, 0.0522, 0.0650, 0.0411], + device='cuda:2'), in_proj_covar=tensor([0.0027, 0.0028, 0.0027, 0.0028, 0.0031, 0.0031, 0.0033, 0.0030], + device='cuda:2'), out_proj_covar=tensor([2.1627e-05, 2.3243e-05, 2.3970e-05, 2.3211e-05, 2.6213e-05, 2.6507e-05, + 2.8521e-05, 2.4871e-05], device='cuda:2') +2022-11-15 13:37:03,687 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=6.00 vs. limit=5.0 +2022-11-15 13:37:08,391 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.2142, 2.7924, 2.9111, 2.6059, 3.2807, 2.6483, 3.0035, 3.1713], + device='cuda:2'), covar=tensor([0.0096, 0.0373, 0.0168, 0.0283, 0.0125, 0.0211, 0.0173, 0.0157], + device='cuda:2'), in_proj_covar=tensor([0.0011, 0.0016, 0.0012, 0.0015, 0.0011, 0.0012, 0.0012, 0.0011], + device='cuda:2'), out_proj_covar=tensor([8.6909e-06, 1.5266e-05, 9.5530e-06, 1.3051e-05, 8.4327e-06, 9.2739e-06, + 1.0570e-05, 8.6002e-06], device='cuda:2') +2022-11-15 13:37:25,034 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.4803, 3.5721, 3.3368, 3.8329, 3.8978, 3.4153, 2.7128, 3.7219], + device='cuda:2'), covar=tensor([0.0543, 0.0436, 0.0570, 0.0287, 0.0223, 0.0514, 0.1814, 0.0358], + device='cuda:2'), in_proj_covar=tensor([0.0018, 0.0017, 0.0017, 0.0016, 0.0014, 0.0016, 0.0025, 0.0016], + device='cuda:2'), out_proj_covar=tensor([1.6246e-05, 1.6424e-05, 1.5315e-05, 1.3418e-05, 1.1197e-05, 1.3867e-05, + 2.6390e-05, 1.3643e-05], device='cuda:2') +2022-11-15 13:37:36,689 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.5625, 4.3330, 3.8207, 3.7602, 3.6178, 3.9437, 2.3505, 4.4415], + device='cuda:2'), covar=tensor([0.0373, 0.0261, 0.0217, 0.0470, 0.0432, 0.0383, 0.0902, 0.0142], + device='cuda:2'), in_proj_covar=tensor([0.0020, 0.0019, 0.0017, 0.0024, 0.0021, 0.0019, 0.0018, 0.0016], + device='cuda:2'), out_proj_covar=tensor([1.8982e-05, 2.0307e-05, 1.6038e-05, 2.5131e-05, 2.0364e-05, 1.9054e-05, + 1.9804e-05, 1.4315e-05], device='cuda:2') +2022-11-15 13:37:54,122 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=2494.0, num_to_drop=1, layers_to_drop={2} +2022-11-15 13:37:58,564 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.057e+02 2.130e+02 2.557e+02 3.291e+02 7.818e+02, threshold=5.113e+02, percent-clipped=6.0 +2022-11-15 13:37:58,605 INFO [train.py:876] (2/4) Epoch 1, batch 2500, loss[loss=0.4302, simple_loss=0.33, pruned_loss=0.2652, over 5453.00 frames. ], tot_loss[loss=0.3358, simple_loss=0.2662, pruned_loss=0.2032, over 1081380.35 frames. ], batch size: 58, lr: 4.73e-02, grad_scale: 16.0 +2022-11-15 13:38:06,774 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=2513.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 13:38:15,169 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=2526.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 13:38:25,944 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=2542.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 13:38:29,206 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.98 vs. limit=2.0 +2022-11-15 13:38:34,660 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.31 vs. limit=5.0 +2022-11-15 13:38:38,840 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=2561.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 13:38:47,655 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=2574.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 13:38:52,080 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.11 vs. limit=5.0 +2022-11-15 13:38:58,708 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.5703, 4.1219, 3.2028, 3.6786, 3.4017, 3.1459, 2.6524, 3.4582], + device='cuda:2'), covar=tensor([0.0342, 0.0237, 0.0398, 0.0383, 0.0285, 0.0333, 0.0464, 0.0276], + device='cuda:2'), in_proj_covar=tensor([0.0018, 0.0019, 0.0018, 0.0017, 0.0017, 0.0017, 0.0017, 0.0017], + device='cuda:2'), out_proj_covar=tensor([1.4918e-05, 1.6484e-05, 1.6004e-05, 1.3877e-05, 1.4910e-05, 1.3610e-05, + 1.3586e-05, 1.2983e-05], device='cuda:2') +2022-11-15 13:39:06,572 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.050e+02 1.872e+02 2.338e+02 3.221e+02 9.126e+02, threshold=4.676e+02, percent-clipped=6.0 +2022-11-15 13:39:06,616 INFO [train.py:876] (2/4) Epoch 1, batch 2600, loss[loss=0.3389, simple_loss=0.2493, pruned_loss=0.2142, over 4161.00 frames. ], tot_loss[loss=0.3319, simple_loss=0.2646, pruned_loss=0.1999, over 1077879.37 frames. ], batch size: 183, lr: 4.71e-02, grad_scale: 16.0 +2022-11-15 13:39:29,513 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.05 vs. limit=5.0 +2022-11-15 13:39:43,158 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.08 vs. limit=2.0 +2022-11-15 13:40:04,058 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=2685.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 13:40:11,218 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=2696.0, num_to_drop=1, layers_to_drop={2} +2022-11-15 13:40:14,323 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.241e+02 1.859e+02 2.538e+02 3.417e+02 6.213e+02, threshold=5.075e+02, percent-clipped=6.0 +2022-11-15 13:40:14,365 INFO [train.py:876] (2/4) Epoch 1, batch 2700, loss[loss=0.3863, simple_loss=0.3007, pruned_loss=0.2359, over 5633.00 frames. ], tot_loss[loss=0.329, simple_loss=0.264, pruned_loss=0.1972, over 1090928.37 frames. ], batch size: 23, lr: 4.69e-02, grad_scale: 16.0 +2022-11-15 13:40:31,501 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.96 vs. limit=2.0 +2022-11-15 13:40:45,515 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=2746.0, num_to_drop=2, layers_to_drop={0, 2} +2022-11-15 13:41:23,155 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.193e+02 2.037e+02 2.481e+02 3.360e+02 6.352e+02, threshold=4.961e+02, percent-clipped=3.0 +2022-11-15 13:41:23,202 INFO [train.py:876] (2/4) Epoch 1, batch 2800, loss[loss=0.3401, simple_loss=0.2889, pruned_loss=0.1957, over 5760.00 frames. ], tot_loss[loss=0.3275, simple_loss=0.2637, pruned_loss=0.1958, over 1092081.44 frames. ], batch size: 20, lr: 4.67e-02, grad_scale: 16.0 +2022-11-15 13:41:39,166 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=2824.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 13:41:44,699 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.5077, 2.5189, 2.5521, 2.8238, 2.7046, 2.5768, 1.3372, 2.4937], + device='cuda:2'), covar=tensor([0.0688, 0.0627, 0.0466, 0.0356, 0.0426, 0.0645, 0.2992, 0.0518], + device='cuda:2'), in_proj_covar=tensor([0.0032, 0.0031, 0.0028, 0.0028, 0.0026, 0.0031, 0.0050, 0.0027], + device='cuda:2'), out_proj_covar=tensor([3.2106e-05, 2.8789e-05, 2.6094e-05, 2.3802e-05, 2.1249e-05, 2.6864e-05, + 6.0497e-05, 2.3160e-05], device='cuda:2') +2022-11-15 13:42:10,516 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.7492, 2.8954, 2.7238, 3.0682, 2.9721, 2.6394, 1.9243, 2.7944], + device='cuda:2'), covar=tensor([0.0646, 0.0605, 0.0605, 0.0388, 0.0434, 0.0870, 0.2946, 0.0588], + device='cuda:2'), in_proj_covar=tensor([0.0035, 0.0034, 0.0031, 0.0030, 0.0029, 0.0033, 0.0056, 0.0029], + device='cuda:2'), out_proj_covar=tensor([3.4861e-05, 3.1830e-05, 2.8739e-05, 2.5830e-05, 2.2805e-05, 2.9229e-05, + 6.7324e-05, 2.4799e-05], device='cuda:2') +2022-11-15 13:42:20,509 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=2885.0, num_to_drop=2, layers_to_drop={0, 1} +2022-11-15 13:42:30,824 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.149e+02 2.042e+02 2.577e+02 3.345e+02 6.665e+02, threshold=5.155e+02, percent-clipped=6.0 +2022-11-15 13:42:30,866 INFO [train.py:876] (2/4) Epoch 1, batch 2900, loss[loss=0.28, simple_loss=0.2352, pruned_loss=0.1624, over 5703.00 frames. ], tot_loss[loss=0.3254, simple_loss=0.2619, pruned_loss=0.1945, over 1089042.42 frames. ], batch size: 11, lr: 4.65e-02, grad_scale: 16.0 +2022-11-15 13:42:35,480 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1060, 2.1406, 1.9445, 2.2606, 2.0513, 2.5078, 2.3334, 2.3278], + device='cuda:2'), covar=tensor([0.0366, 0.0361, 0.0382, 0.0311, 0.0329, 0.0275, 0.0378, 0.0260], + device='cuda:2'), in_proj_covar=tensor([0.0025, 0.0023, 0.0023, 0.0023, 0.0026, 0.0023, 0.0022, 0.0022], + device='cuda:2'), out_proj_covar=tensor([2.4636e-05, 2.1243e-05, 2.0270e-05, 2.1419e-05, 2.4461e-05, 2.0916e-05, + 2.2773e-05, 2.0227e-05], device='cuda:2') +2022-11-15 13:42:55,173 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=2937.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 13:43:16,987 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.71 vs. limit=2.0 +2022-11-15 13:43:29,904 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.63 vs. limit=5.0 +2022-11-15 13:43:32,209 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.44 vs. limit=5.0 +2022-11-15 13:43:35,972 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=2996.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 13:43:37,245 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=2998.0, num_to_drop=2, layers_to_drop={0, 3} +2022-11-15 13:43:39,749 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.318e+01 2.017e+02 2.613e+02 3.220e+02 7.122e+02, threshold=5.226e+02, percent-clipped=3.0 +2022-11-15 13:43:39,792 INFO [train.py:876] (2/4) Epoch 1, batch 3000, loss[loss=0.282, simple_loss=0.2458, pruned_loss=0.1591, over 5742.00 frames. ], tot_loss[loss=0.3203, simple_loss=0.2594, pruned_loss=0.1906, over 1087730.43 frames. ], batch size: 13, lr: 4.63e-02, grad_scale: 16.0 +2022-11-15 13:43:39,792 INFO [train.py:899] (2/4) Computing validation loss +2022-11-15 13:43:58,880 INFO [train.py:908] (2/4) Epoch 1, validation: loss=0.2736, simple_loss=0.2548, pruned_loss=0.1462, over 1530663.00 frames. +2022-11-15 13:43:58,881 INFO [train.py:909] (2/4) Maximum memory allocated so far is 4165MB +2022-11-15 13:44:08,396 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.21 vs. limit=2.0 +2022-11-15 13:44:11,414 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=3019.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 13:44:26,585 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=3041.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 13:44:28,578 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=3044.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 13:44:29,677 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.4585, 3.4641, 3.3802, 3.5142, 3.2174, 3.4484, 3.0343, 3.3858], + device='cuda:2'), covar=tensor([0.0284, 0.0173, 0.0284, 0.0188, 0.0282, 0.0225, 0.0409, 0.0235], + device='cuda:2'), in_proj_covar=tensor([0.0025, 0.0024, 0.0026, 0.0023, 0.0026, 0.0023, 0.0029, 0.0026], + device='cuda:2'), out_proj_covar=tensor([2.6511e-05, 2.3590e-05, 2.8436e-05, 2.3699e-05, 2.8499e-05, 2.3405e-05, + 2.9396e-05, 2.6594e-05], device='cuda:2') +2022-11-15 13:44:46,078 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.04 vs. limit=2.0 +2022-11-15 13:44:53,892 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=3080.0, num_to_drop=2, layers_to_drop={1, 2} +2022-11-15 13:45:03,181 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.86 vs. limit=5.0 +2022-11-15 13:45:08,267 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.114e+02 2.068e+02 2.442e+02 3.389e+02 5.023e+02, threshold=4.884e+02, percent-clipped=1.0 +2022-11-15 13:45:08,308 INFO [train.py:876] (2/4) Epoch 1, batch 3100, loss[loss=0.3403, simple_loss=0.2693, pruned_loss=0.2056, over 5699.00 frames. ], tot_loss[loss=0.3187, simple_loss=0.2589, pruned_loss=0.1893, over 1090762.66 frames. ], batch size: 19, lr: 4.61e-02, grad_scale: 16.0 +2022-11-15 13:45:20,245 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.57 vs. limit=5.0 +2022-11-15 13:45:34,780 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.5209, 1.9206, 1.7732, 1.7540, 1.8811, 1.8553, 1.7846, 1.8587], + device='cuda:2'), covar=tensor([0.0302, 0.0186, 0.0236, 0.0223, 0.0247, 0.0281, 0.0333, 0.0183], + device='cuda:2'), in_proj_covar=tensor([0.0021, 0.0018, 0.0019, 0.0019, 0.0021, 0.0020, 0.0020, 0.0018], + device='cuda:2'), out_proj_covar=tensor([2.1223e-05, 1.7283e-05, 1.7271e-05, 1.7358e-05, 2.0275e-05, 1.9452e-05, + 2.1495e-05, 1.6280e-05], device='cuda:2') +2022-11-15 13:45:51,592 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.06 vs. limit=2.0 +2022-11-15 13:45:57,964 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.35 vs. limit=2.0 +2022-11-15 13:46:02,909 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=3180.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 13:46:17,616 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.090e+02 2.233e+02 2.604e+02 3.182e+02 6.551e+02, threshold=5.207e+02, percent-clipped=6.0 +2022-11-15 13:46:17,660 INFO [train.py:876] (2/4) Epoch 1, batch 3200, loss[loss=0.2506, simple_loss=0.2123, pruned_loss=0.1444, over 5691.00 frames. ], tot_loss[loss=0.3215, simple_loss=0.2607, pruned_loss=0.1912, over 1085079.37 frames. ], batch size: 12, lr: 4.59e-02, grad_scale: 16.0 +2022-11-15 13:46:30,876 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.7889, 5.2811, 5.4015, 4.8230, 5.4845, 5.1925, 4.6117, 4.6881], + device='cuda:2'), covar=tensor([0.0257, 0.0236, 0.0206, 0.0274, 0.0292, 0.0149, 0.0283, 0.0238], + device='cuda:2'), in_proj_covar=tensor([0.0036, 0.0037, 0.0033, 0.0035, 0.0035, 0.0033, 0.0032, 0.0031], + device='cuda:2'), out_proj_covar=tensor([4.8135e-05, 4.4444e-05, 3.9040e-05, 4.0333e-05, 4.6957e-05, 4.4021e-05, + 3.8355e-05, 3.9486e-05], device='cuda:2') +2022-11-15 13:46:42,477 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.7208, 4.1396, 3.4322, 3.3215, 3.6695, 4.1172, 3.3940, 4.3410], + device='cuda:2'), covar=tensor([0.0180, 0.0193, 0.0160, 0.0387, 0.0178, 0.0117, 0.0188, 0.0055], + device='cuda:2'), in_proj_covar=tensor([0.0020, 0.0019, 0.0018, 0.0024, 0.0019, 0.0017, 0.0015, 0.0015], + device='cuda:2'), out_proj_covar=tensor([2.2713e-05, 2.2910e-05, 2.0495e-05, 2.9044e-05, 2.1617e-05, 1.9091e-05, + 1.8244e-05, 1.5102e-05], device='cuda:2') +2022-11-15 13:46:42,883 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.97 vs. limit=2.0 +2022-11-15 13:46:56,827 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.6981, 3.7525, 3.7719, 3.6837, 3.3255, 3.4491, 3.8456, 3.5378], + device='cuda:2'), covar=tensor([0.0443, 0.0603, 0.0562, 0.0469, 0.0565, 0.0381, 0.0506, 0.0536], + device='cuda:2'), in_proj_covar=tensor([0.0041, 0.0056, 0.0047, 0.0048, 0.0037, 0.0039, 0.0044, 0.0043], + device='cuda:2'), out_proj_covar=tensor([5.0791e-05, 6.9652e-05, 5.9399e-05, 5.7625e-05, 4.4222e-05, 4.6939e-05, + 5.4009e-05, 5.2294e-05], device='cuda:2') +2022-11-15 13:47:17,412 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=3288.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 13:47:20,614 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=3293.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 13:47:25,674 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.790e+01 2.210e+02 2.954e+02 4.251e+02 1.287e+03, threshold=5.908e+02, percent-clipped=13.0 +2022-11-15 13:47:25,716 INFO [train.py:876] (2/4) Epoch 1, batch 3300, loss[loss=0.3258, simple_loss=0.2523, pruned_loss=0.1996, over 5011.00 frames. ], tot_loss[loss=0.3183, simple_loss=0.259, pruned_loss=0.1888, over 1081434.18 frames. ], batch size: 110, lr: 4.57e-02, grad_scale: 16.0 +2022-11-15 13:47:51,499 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=6.83 vs. limit=5.0 +2022-11-15 13:47:53,295 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=3341.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 13:47:59,057 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=3349.0, num_to_drop=2, layers_to_drop={2, 3} +2022-11-15 13:48:17,243 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=3375.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 13:48:23,755 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1711, 2.3924, 2.0050, 2.4762, 2.2343, 2.2936, 1.4437, 2.2707], + device='cuda:2'), covar=tensor([0.0492, 0.0209, 0.0468, 0.0208, 0.0322, 0.0418, 0.1930, 0.0312], + device='cuda:2'), in_proj_covar=tensor([0.0054, 0.0043, 0.0045, 0.0042, 0.0037, 0.0054, 0.0091, 0.0043], + device='cuda:2'), out_proj_covar=tensor([5.6556e-05, 3.8444e-05, 4.0945e-05, 3.5379e-05, 2.9270e-05, 5.0348e-05, + 1.1188e-04, 3.7260e-05], device='cuda:2') +2022-11-15 13:48:26,983 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=3389.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 13:48:35,851 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.826e+01 1.721e+02 2.073e+02 2.750e+02 6.330e+02, threshold=4.146e+02, percent-clipped=3.0 +2022-11-15 13:48:35,892 INFO [train.py:876] (2/4) Epoch 1, batch 3400, loss[loss=0.3083, simple_loss=0.2556, pruned_loss=0.1805, over 5687.00 frames. ], tot_loss[loss=0.314, simple_loss=0.2564, pruned_loss=0.1858, over 1080411.56 frames. ], batch size: 36, lr: 4.55e-02, grad_scale: 16.0 +2022-11-15 13:48:46,482 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.93 vs. limit=5.0 +2022-11-15 13:48:50,940 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.7104, 3.8149, 3.0836, 2.9493, 3.4069, 3.4648, 2.8676, 3.5006], + device='cuda:2'), covar=tensor([0.0109, 0.0131, 0.0177, 0.0363, 0.0160, 0.0151, 0.0168, 0.0133], + device='cuda:2'), in_proj_covar=tensor([0.0018, 0.0017, 0.0017, 0.0023, 0.0017, 0.0015, 0.0014, 0.0014], + device='cuda:2'), out_proj_covar=tensor([2.1359e-05, 2.1201e-05, 1.9351e-05, 2.8013e-05, 2.0242e-05, 1.8022e-05, + 1.7369e-05, 1.5841e-05], device='cuda:2') +2022-11-15 13:49:30,564 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=3480.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 13:49:41,460 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.4863, 3.7890, 3.5642, 3.6669, 3.4370, 3.6232, 3.1613, 3.6700], + device='cuda:2'), covar=tensor([0.0176, 0.0081, 0.0141, 0.0104, 0.0153, 0.0108, 0.0288, 0.0114], + device='cuda:2'), in_proj_covar=tensor([0.0027, 0.0023, 0.0025, 0.0021, 0.0025, 0.0022, 0.0031, 0.0024], + device='cuda:2'), out_proj_covar=tensor([2.9359e-05, 2.4060e-05, 2.8913e-05, 2.3599e-05, 2.8927e-05, 2.4184e-05, + 3.4286e-05, 2.7184e-05], device='cuda:2') +2022-11-15 13:49:45,036 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.127e+02 1.861e+02 2.622e+02 3.661e+02 7.520e+02, threshold=5.245e+02, percent-clipped=13.0 +2022-11-15 13:49:45,087 INFO [train.py:876] (2/4) Epoch 1, batch 3500, loss[loss=0.316, simple_loss=0.2629, pruned_loss=0.1846, over 5705.00 frames. ], tot_loss[loss=0.3106, simple_loss=0.2546, pruned_loss=0.1833, over 1085300.75 frames. ], batch size: 15, lr: 4.53e-02, grad_scale: 16.0 +2022-11-15 13:49:50,318 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.02 vs. limit=2.0 +2022-11-15 13:49:56,555 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.32 vs. limit=5.0 +2022-11-15 13:50:04,182 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=3528.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 13:50:11,203 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.72 vs. limit=5.0 +2022-11-15 13:50:17,073 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=3546.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 13:50:49,952 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=3593.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 13:50:56,148 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.435e+01 2.156e+02 2.680e+02 3.527e+02 6.618e+02, threshold=5.360e+02, percent-clipped=3.0 +2022-11-15 13:50:56,192 INFO [train.py:876] (2/4) Epoch 1, batch 3600, loss[loss=0.3099, simple_loss=0.2532, pruned_loss=0.1833, over 5595.00 frames. ], tot_loss[loss=0.3116, simple_loss=0.2554, pruned_loss=0.1839, over 1083867.15 frames. ], batch size: 18, lr: 4.50e-02, grad_scale: 16.0 +2022-11-15 13:51:00,498 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=3607.0, num_to_drop=2, layers_to_drop={1, 3} +2022-11-15 13:51:19,618 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.4623, 3.8740, 3.5941, 3.9590, 3.5352, 3.7160, 3.2674, 3.5688], + device='cuda:2'), covar=tensor([0.0370, 0.0156, 0.0357, 0.0167, 0.0293, 0.0255, 0.0476, 0.0268], + device='cuda:2'), in_proj_covar=tensor([0.0029, 0.0025, 0.0027, 0.0022, 0.0029, 0.0025, 0.0034, 0.0027], + device='cuda:2'), out_proj_covar=tensor([3.3078e-05, 2.7063e-05, 3.2577e-05, 2.4555e-05, 3.3343e-05, 2.8760e-05, + 3.8437e-05, 3.1597e-05], device='cuda:2') +2022-11-15 13:51:24,661 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=3641.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 13:51:27,131 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=3644.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 13:51:49,272 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=3675.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 13:51:49,957 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=3676.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 13:51:53,186 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.03 vs. limit=2.0 +2022-11-15 13:52:02,781 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.98 vs. limit=2.0 +2022-11-15 13:52:07,940 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.388e+02 2.330e+02 3.139e+02 3.973e+02 9.859e+02, threshold=6.278e+02, percent-clipped=9.0 +2022-11-15 13:52:07,980 INFO [train.py:876] (2/4) Epoch 1, batch 3700, loss[loss=0.1995, simple_loss=0.1796, pruned_loss=0.1097, over 5093.00 frames. ], tot_loss[loss=0.3099, simple_loss=0.2544, pruned_loss=0.1827, over 1084673.58 frames. ], batch size: 7, lr: 4.48e-02, grad_scale: 16.0 +2022-11-15 13:52:24,113 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=3723.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 13:52:33,961 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=3737.0, num_to_drop=2, layers_to_drop={2, 3} +2022-11-15 13:52:37,043 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.5162, 1.8121, 2.2644, 2.8458, 2.9307, 2.3282, 2.4294, 2.7698], + device='cuda:2'), covar=tensor([0.0220, 0.0675, 0.0494, 0.0192, 0.0237, 0.0462, 0.0335, 0.0236], + device='cuda:2'), in_proj_covar=tensor([0.0021, 0.0033, 0.0031, 0.0020, 0.0024, 0.0026, 0.0025, 0.0024], + device='cuda:2'), out_proj_covar=tensor([1.8493e-05, 3.3093e-05, 2.7843e-05, 1.6665e-05, 2.0387e-05, 2.3354e-05, + 2.1433e-05, 2.1687e-05], device='cuda:2') +2022-11-15 13:53:14,589 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.2727, 3.5558, 3.0758, 3.4720, 3.1760, 3.3709, 2.8329, 3.2544], + device='cuda:2'), covar=tensor([0.0236, 0.0163, 0.0281, 0.0159, 0.0209, 0.0265, 0.0414, 0.0179], + device='cuda:2'), in_proj_covar=tensor([0.0029, 0.0025, 0.0026, 0.0022, 0.0027, 0.0025, 0.0033, 0.0026], + device='cuda:2'), out_proj_covar=tensor([3.3282e-05, 2.7337e-05, 3.1586e-05, 2.5839e-05, 3.1679e-05, 2.8689e-05, + 3.8473e-05, 3.0689e-05], device='cuda:2') +2022-11-15 13:53:20,084 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.212e+02 2.129e+02 2.584e+02 3.254e+02 9.208e+02, threshold=5.168e+02, percent-clipped=2.0 +2022-11-15 13:53:20,127 INFO [train.py:876] (2/4) Epoch 1, batch 3800, loss[loss=0.2996, simple_loss=0.2424, pruned_loss=0.1784, over 5712.00 frames. ], tot_loss[loss=0.3062, simple_loss=0.2517, pruned_loss=0.1803, over 1083766.22 frames. ], batch size: 34, lr: 4.46e-02, grad_scale: 16.0 +2022-11-15 13:53:37,568 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.08 vs. limit=2.0 +2022-11-15 13:53:47,016 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.06 vs. limit=2.0 +2022-11-15 13:53:47,070 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.93 vs. limit=5.0 +2022-11-15 13:54:23,122 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.6030, 2.0712, 1.0662, 1.7846, 2.0447, 1.8963, 0.9822, 1.0897], + device='cuda:2'), covar=tensor([0.0300, 0.0327, 0.0422, 0.0180, 0.0201, 0.0191, 0.0579, 0.0591], + device='cuda:2'), in_proj_covar=tensor([0.0030, 0.0028, 0.0032, 0.0029, 0.0029, 0.0027, 0.0029, 0.0029], + device='cuda:2'), out_proj_covar=tensor([2.9362e-05, 2.8521e-05, 3.1600e-05, 2.7678e-05, 2.9772e-05, 2.7200e-05, + 3.3240e-05, 2.9761e-05], device='cuda:2') +2022-11-15 13:54:31,926 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.183e+02 2.219e+02 2.583e+02 3.487e+02 8.673e+02, threshold=5.166e+02, percent-clipped=10.0 +2022-11-15 13:54:31,968 INFO [train.py:876] (2/4) Epoch 1, batch 3900, loss[loss=0.3089, simple_loss=0.2409, pruned_loss=0.1884, over 4034.00 frames. ], tot_loss[loss=0.3054, simple_loss=0.2516, pruned_loss=0.1796, over 1082868.42 frames. ], batch size: 181, lr: 4.44e-02, grad_scale: 16.0 +2022-11-15 13:54:32,297 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.58 vs. limit=5.0 +2022-11-15 13:54:32,682 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=3902.0, num_to_drop=1, layers_to_drop={3} +2022-11-15 13:54:50,512 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=3926.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 13:54:54,710 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.98 vs. limit=2.0 +2022-11-15 13:55:03,824 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=3944.0, num_to_drop=1, layers_to_drop={2} +2022-11-15 13:55:34,766 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=3987.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 13:55:36,243 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.21 vs. limit=2.0 +2022-11-15 13:55:37,794 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.31 vs. limit=5.0 +2022-11-15 13:55:38,177 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=3992.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 13:55:44,802 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.110e+02 2.234e+02 2.679e+02 3.538e+02 6.488e+02, threshold=5.359e+02, percent-clipped=3.0 +2022-11-15 13:55:44,844 INFO [train.py:876] (2/4) Epoch 1, batch 4000, loss[loss=0.2226, simple_loss=0.1974, pruned_loss=0.1239, over 5090.00 frames. ], tot_loss[loss=0.3024, simple_loss=0.2506, pruned_loss=0.1771, over 1088324.69 frames. ], batch size: 6, lr: 4.42e-02, grad_scale: 32.0 +2022-11-15 13:55:57,426 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.6868, 1.8322, 2.1025, 2.6848, 2.8201, 2.3967, 1.8137, 1.7958], + device='cuda:2'), covar=tensor([0.0179, 0.0620, 0.0498, 0.0163, 0.0211, 0.0279, 0.0416, 0.0364], + device='cuda:2'), in_proj_covar=tensor([0.0026, 0.0045, 0.0043, 0.0026, 0.0031, 0.0032, 0.0034, 0.0032], + device='cuda:2'), out_proj_covar=tensor([2.3007e-05, 4.5750e-05, 4.0869e-05, 2.1565e-05, 2.5743e-05, 3.0183e-05, + 3.1639e-05, 2.9709e-05], device='cuda:2') +2022-11-15 13:56:06,484 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.1755, 5.5904, 5.7749, 4.9921, 5.6780, 5.9080, 4.8273, 4.7494], + device='cuda:2'), covar=tensor([0.0301, 0.0183, 0.0305, 0.0251, 0.0358, 0.0055, 0.0247, 0.0252], + device='cuda:2'), in_proj_covar=tensor([0.0039, 0.0042, 0.0036, 0.0040, 0.0044, 0.0031, 0.0036, 0.0032], + device='cuda:2'), out_proj_covar=tensor([5.6344e-05, 5.5413e-05, 4.9305e-05, 5.1652e-05, 7.1125e-05, 4.3672e-05, + 4.9224e-05, 4.4261e-05], device='cuda:2') +2022-11-15 13:56:07,923 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=4032.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 13:56:23,785 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.07 vs. limit=5.0 +2022-11-15 13:56:58,557 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.016e+02 2.074e+02 2.727e+02 3.371e+02 7.611e+02, threshold=5.454e+02, percent-clipped=4.0 +2022-11-15 13:56:58,599 INFO [train.py:876] (2/4) Epoch 1, batch 4100, loss[loss=0.3157, simple_loss=0.2664, pruned_loss=0.1825, over 5587.00 frames. ], tot_loss[loss=0.2998, simple_loss=0.2492, pruned_loss=0.1751, over 1088357.66 frames. ], batch size: 22, lr: 4.40e-02, grad_scale: 32.0 +2022-11-15 13:57:02,260 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.8407, 2.9139, 2.9037, 2.9789, 2.7608, 2.9726, 2.7036, 2.6998], + device='cuda:2'), covar=tensor([0.0204, 0.0209, 0.0164, 0.0156, 0.0231, 0.0150, 0.0326, 0.0254], + device='cuda:2'), in_proj_covar=tensor([0.0030, 0.0027, 0.0027, 0.0024, 0.0029, 0.0026, 0.0035, 0.0028], + device='cuda:2'), out_proj_covar=tensor([3.7208e-05, 3.1989e-05, 3.3657e-05, 2.8993e-05, 3.4786e-05, 3.1553e-05, + 4.3529e-05, 3.4847e-05], device='cuda:2') +2022-11-15 13:57:07,686 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.4520, 1.2444, 1.6783, 2.0266, 1.4057, 1.8151, 1.8304, 1.4056], + device='cuda:2'), covar=tensor([0.0112, 0.0134, 0.0106, 0.0048, 0.0196, 0.0081, 0.0074, 0.0121], + device='cuda:2'), in_proj_covar=tensor([0.0028, 0.0021, 0.0019, 0.0017, 0.0024, 0.0019, 0.0020, 0.0018], + device='cuda:2'), out_proj_covar=tensor([2.3355e-05, 1.5488e-05, 1.5307e-05, 1.1805e-05, 1.8866e-05, 1.3204e-05, + 1.4552e-05, 1.2426e-05], device='cuda:2') +2022-11-15 13:57:11,433 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.69 vs. limit=5.0 +2022-11-15 13:57:13,101 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.99 vs. limit=2.0 +2022-11-15 13:57:24,947 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.8915, 3.4557, 2.9018, 2.2306, 3.1597, 2.8418, 2.5723, 3.8065], + device='cuda:2'), covar=tensor([0.0292, 0.0195, 0.0178, 0.0687, 0.0147, 0.0228, 0.0283, 0.0089], + device='cuda:2'), in_proj_covar=tensor([0.0020, 0.0018, 0.0017, 0.0028, 0.0017, 0.0018, 0.0014, 0.0016], + device='cuda:2'), out_proj_covar=tensor([2.6414e-05, 2.3973e-05, 2.1907e-05, 3.8349e-05, 2.1259e-05, 2.3122e-05, + 2.0013e-05, 2.0084e-05], device='cuda:2') +2022-11-15 13:57:28,080 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.08 vs. limit=2.0 +2022-11-15 13:57:28,544 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=4141.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 13:57:48,900 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.25 vs. limit=5.0 +2022-11-15 13:58:13,175 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.362e+02 1.959e+02 2.484e+02 3.266e+02 7.504e+02, threshold=4.967e+02, percent-clipped=2.0 +2022-11-15 13:58:13,220 INFO [train.py:876] (2/4) Epoch 1, batch 4200, loss[loss=0.2963, simple_loss=0.2466, pruned_loss=0.173, over 5113.00 frames. ], tot_loss[loss=0.3024, simple_loss=0.251, pruned_loss=0.1769, over 1086290.87 frames. ], batch size: 7, lr: 4.38e-02, grad_scale: 32.0 +2022-11-15 13:58:14,064 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=4202.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 13:58:14,112 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=4202.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 13:58:27,595 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.6509, 2.2398, 2.2535, 1.7367, 1.9966, 1.6661, 1.8603, 2.1120], + device='cuda:2'), covar=tensor([0.0253, 0.0311, 0.0138, 0.0316, 0.0272, 0.0703, 0.0335, 0.0201], + device='cuda:2'), in_proj_covar=tensor([0.0018, 0.0017, 0.0017, 0.0019, 0.0019, 0.0017, 0.0018, 0.0017], + device='cuda:2'), out_proj_covar=tensor([2.0216e-05, 1.8519e-05, 1.7358e-05, 2.1178e-05, 2.1417e-05, 1.9553e-05, + 2.1038e-05, 1.9201e-05], device='cuda:2') +2022-11-15 13:58:49,669 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=4250.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 13:59:13,089 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=4282.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 13:59:17,708 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=4288.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 13:59:26,184 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=7.22 vs. limit=5.0 +2022-11-15 13:59:27,276 INFO [train.py:876] (2/4) Epoch 1, batch 4300, loss[loss=0.3217, simple_loss=0.2497, pruned_loss=0.1969, over 3149.00 frames. ], tot_loss[loss=0.3036, simple_loss=0.2521, pruned_loss=0.1776, over 1087361.04 frames. ], batch size: 284, lr: 4.35e-02, grad_scale: 16.0 +2022-11-15 13:59:27,970 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.171e+02 2.229e+02 3.130e+02 3.930e+02 1.663e+03, threshold=6.259e+02, percent-clipped=10.0 +2022-11-15 13:59:50,083 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=4332.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 14:00:02,440 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=4349.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:00:03,519 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.3822, 2.2813, 2.1987, 2.7172, 3.1490, 2.9024, 3.2733, 2.7675], + device='cuda:2'), covar=tensor([0.0427, 0.1295, 0.1380, 0.0449, 0.0407, 0.0663, 0.0617, 0.0419], + device='cuda:2'), in_proj_covar=tensor([0.0033, 0.0063, 0.0058, 0.0036, 0.0038, 0.0046, 0.0046, 0.0039], + device='cuda:2'), out_proj_covar=tensor([2.9955e-05, 6.5082e-05, 5.7581e-05, 3.2082e-05, 3.3975e-05, 4.4911e-05, + 4.5505e-05, 3.7034e-05], device='cuda:2') +2022-11-15 14:00:25,184 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=4380.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 14:00:28,124 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.7567, 1.8990, 1.6900, 1.7225, 1.7225, 1.5441, 1.8752, 1.5942], + device='cuda:2'), covar=tensor([0.0154, 0.0147, 0.0134, 0.0170, 0.0236, 0.0317, 0.0168, 0.0253], + device='cuda:2'), in_proj_covar=tensor([0.0017, 0.0016, 0.0018, 0.0018, 0.0019, 0.0016, 0.0017, 0.0017], + device='cuda:2'), out_proj_covar=tensor([1.9241e-05, 1.7630e-05, 1.8175e-05, 2.0578e-05, 2.1760e-05, 1.8163e-05, + 1.9713e-05, 2.0015e-05], device='cuda:2') +2022-11-15 14:00:33,079 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=6.04 vs. limit=5.0 +2022-11-15 14:00:39,099 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.08 vs. limit=2.0 +2022-11-15 14:00:41,283 INFO [train.py:876] (2/4) Epoch 1, batch 4400, loss[loss=0.223, simple_loss=0.198, pruned_loss=0.124, over 4956.00 frames. ], tot_loss[loss=0.3008, simple_loss=0.2507, pruned_loss=0.1754, over 1088241.33 frames. ], batch size: 5, lr: 4.33e-02, grad_scale: 16.0 +2022-11-15 14:00:41,969 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.193e+02 1.948e+02 2.508e+02 3.167e+02 7.237e+02, threshold=5.016e+02, percent-clipped=3.0 +2022-11-15 14:00:57,310 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.8984, 2.9613, 2.8031, 2.9682, 2.4927, 2.5731, 1.7696, 2.7980], + device='cuda:2'), covar=tensor([0.1215, 0.0248, 0.0347, 0.0165, 0.0386, 0.0593, 0.2598, 0.0296], + device='cuda:2'), in_proj_covar=tensor([0.0100, 0.0065, 0.0072, 0.0059, 0.0061, 0.0086, 0.0138, 0.0065], + device='cuda:2'), out_proj_covar=tensor([1.0618e-04, 5.9439e-05, 6.7669e-05, 5.2131e-05, 5.5334e-05, 8.7362e-05, + 1.6315e-04, 5.8449e-05], device='cuda:2') +2022-11-15 14:01:20,043 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.23 vs. limit=5.0 +2022-11-15 14:01:40,475 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.17 vs. limit=2.0 +2022-11-15 14:01:45,887 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.94 vs. limit=2.0 +2022-11-15 14:01:51,588 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=4497.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:01:54,773 INFO [train.py:876] (2/4) Epoch 1, batch 4500, loss[loss=0.3157, simple_loss=0.2606, pruned_loss=0.1855, over 5700.00 frames. ], tot_loss[loss=0.2987, simple_loss=0.2491, pruned_loss=0.1741, over 1084385.14 frames. ], batch size: 36, lr: 4.31e-02, grad_scale: 16.0 +2022-11-15 14:01:55,417 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.161e+02 2.194e+02 3.031e+02 3.828e+02 9.010e+02, threshold=6.062e+02, percent-clipped=8.0 +2022-11-15 14:02:22,208 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.62 vs. limit=5.0 +2022-11-15 14:02:35,365 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.93 vs. limit=2.0 +2022-11-15 14:02:36,296 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.04 vs. limit=2.0 +2022-11-15 14:02:54,224 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=4582.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:02:59,641 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.11 vs. limit=2.0 +2022-11-15 14:03:05,174 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.7144, 1.5182, 1.6647, 1.6383, 1.7710, 1.6372, 1.7910, 1.7580], + device='cuda:2'), covar=tensor([0.0783, 0.1360, 0.0801, 0.1013, 0.0589, 0.0623, 0.1091, 0.0721], + device='cuda:2'), in_proj_covar=tensor([0.0041, 0.0061, 0.0049, 0.0052, 0.0038, 0.0037, 0.0055, 0.0045], + device='cuda:2'), out_proj_covar=tensor([5.6813e-05, 8.8463e-05, 6.8972e-05, 7.1159e-05, 5.3906e-05, 5.1091e-05, + 8.3896e-05, 6.3276e-05], device='cuda:2') +2022-11-15 14:03:06,274 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=6.25 vs. limit=5.0 +2022-11-15 14:03:08,725 INFO [train.py:876] (2/4) Epoch 1, batch 4600, loss[loss=0.3229, simple_loss=0.2746, pruned_loss=0.1856, over 5576.00 frames. ], tot_loss[loss=0.299, simple_loss=0.2496, pruned_loss=0.1742, over 1085158.04 frames. ], batch size: 25, lr: 4.29e-02, grad_scale: 16.0 +2022-11-15 14:03:09,369 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.162e+01 1.839e+02 2.747e+02 3.849e+02 7.443e+02, threshold=5.493e+02, percent-clipped=4.0 +2022-11-15 14:03:29,434 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.06 vs. limit=2.0 +2022-11-15 14:03:29,921 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=4630.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:03:33,496 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.4667, 4.8151, 4.9337, 4.3605, 5.0774, 4.8367, 4.1724, 3.9708], + device='cuda:2'), covar=tensor([0.0452, 0.0233, 0.0340, 0.0283, 0.0342, 0.0173, 0.0273, 0.0386], + device='cuda:2'), in_proj_covar=tensor([0.0044, 0.0046, 0.0037, 0.0044, 0.0049, 0.0035, 0.0037, 0.0038], + device='cuda:2'), out_proj_covar=tensor([6.9526e-05, 6.4751e-05, 5.5190e-05, 5.9811e-05, 8.7869e-05, 5.1273e-05, + 5.2817e-05, 5.6169e-05], device='cuda:2') +2022-11-15 14:03:34,790 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=6.19 vs. limit=5.0 +2022-11-15 14:03:35,227 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.37 vs. limit=5.0 +2022-11-15 14:03:40,560 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=4644.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:04:04,324 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.0564, 4.3648, 4.2854, 4.3975, 4.2139, 3.5342, 3.1092, 4.4639], + device='cuda:2'), covar=tensor([0.1446, 0.0202, 0.0260, 0.0188, 0.0170, 0.0685, 0.2853, 0.0142], + device='cuda:2'), in_proj_covar=tensor([0.0106, 0.0068, 0.0073, 0.0061, 0.0062, 0.0093, 0.0144, 0.0066], + device='cuda:2'), out_proj_covar=tensor([1.1219e-04, 6.4117e-05, 6.9036e-05, 5.5247e-05, 5.8155e-05, 9.4764e-05, + 1.6671e-04, 6.0055e-05], device='cuda:2') +2022-11-15 14:04:21,453 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.7657, 2.7568, 2.2525, 2.6069, 1.9032, 3.0392, 2.4045, 2.5001], + device='cuda:2'), covar=tensor([0.0677, 0.0244, 0.0349, 0.0206, 0.0521, 0.0168, 0.0346, 0.0205], + device='cuda:2'), in_proj_covar=tensor([0.0038, 0.0025, 0.0024, 0.0020, 0.0030, 0.0023, 0.0026, 0.0023], + device='cuda:2'), out_proj_covar=tensor([3.4095e-05, 2.0695e-05, 2.0667e-05, 1.6622e-05, 2.6093e-05, 1.8863e-05, + 2.1687e-05, 1.8070e-05], device='cuda:2') +2022-11-15 14:04:24,340 INFO [train.py:876] (2/4) Epoch 1, batch 4700, loss[loss=0.2843, simple_loss=0.2403, pruned_loss=0.1641, over 5716.00 frames. ], tot_loss[loss=0.2983, simple_loss=0.2491, pruned_loss=0.1738, over 1087950.83 frames. ], batch size: 19, lr: 4.27e-02, grad_scale: 16.0 +2022-11-15 14:04:24,958 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.032e+02 2.250e+02 2.748e+02 3.964e+02 7.433e+02, threshold=5.495e+02, percent-clipped=7.0 +2022-11-15 14:04:55,390 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.94 vs. limit=2.0 +2022-11-15 14:05:34,556 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=4797.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:05:37,702 INFO [train.py:876] (2/4) Epoch 1, batch 4800, loss[loss=0.258, simple_loss=0.2209, pruned_loss=0.1475, over 5690.00 frames. ], tot_loss[loss=0.2936, simple_loss=0.2463, pruned_loss=0.1704, over 1090451.77 frames. ], batch size: 11, lr: 4.25e-02, grad_scale: 16.0 +2022-11-15 14:05:38,055 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.14 vs. limit=2.0 +2022-11-15 14:05:38,343 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.248e+02 1.870e+02 2.529e+02 3.283e+02 6.481e+02, threshold=5.059e+02, percent-clipped=1.0 +2022-11-15 14:05:51,227 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.3261, 0.7366, 1.0499, 1.0708, 1.0926, 1.2483, 0.9175, 1.3981], + device='cuda:2'), covar=tensor([0.0275, 0.0344, 0.0327, 0.0632, 0.1742, 0.0329, 0.0516, 0.0342], + device='cuda:2'), in_proj_covar=tensor([0.0023, 0.0022, 0.0024, 0.0026, 0.0024, 0.0024, 0.0025, 0.0022], + device='cuda:2'), out_proj_covar=tensor([2.5885e-05, 2.6311e-05, 2.9014e-05, 3.5513e-05, 2.7178e-05, 2.6802e-05, + 2.6224e-05, 2.5669e-05], device='cuda:2') +2022-11-15 14:06:09,937 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=4845.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:06:15,739 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.6720, 1.2856, 1.4597, 1.6370, 1.3677, 0.7240, 1.8200, 1.6000], + device='cuda:2'), covar=tensor([0.0175, 0.0243, 0.0225, 0.0201, 0.0365, 0.0458, 0.0220, 0.0241], + device='cuda:2'), in_proj_covar=tensor([0.0020, 0.0022, 0.0024, 0.0026, 0.0028, 0.0025, 0.0030, 0.0026], + device='cuda:2'), out_proj_covar=tensor([2.1585e-05, 2.2522e-05, 2.7365e-05, 2.9303e-05, 3.2649e-05, 3.0224e-05, + 3.3987e-05, 2.8795e-05], device='cuda:2') +2022-11-15 14:06:18,563 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.6728, 3.7625, 3.8666, 3.8650, 3.5654, 2.9735, 4.2995, 3.6345], + device='cuda:2'), covar=tensor([0.0600, 0.0879, 0.0563, 0.0640, 0.0651, 0.0965, 0.0788, 0.0581], + device='cuda:2'), in_proj_covar=tensor([0.0042, 0.0062, 0.0052, 0.0055, 0.0039, 0.0039, 0.0056, 0.0045], + device='cuda:2'), out_proj_covar=tensor([5.9033e-05, 9.2297e-05, 7.4980e-05, 7.6339e-05, 5.6751e-05, 5.5442e-05, + 9.0320e-05, 6.3571e-05], device='cuda:2') +2022-11-15 14:06:19,298 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.7044, 5.1908, 5.3433, 5.1433, 4.8432, 4.2954, 5.5918, 4.8658], + device='cuda:2'), covar=tensor([0.0471, 0.0541, 0.0341, 0.0444, 0.0331, 0.0546, 0.0655, 0.0485], + device='cuda:2'), in_proj_covar=tensor([0.0042, 0.0062, 0.0052, 0.0055, 0.0039, 0.0039, 0.0056, 0.0045], + device='cuda:2'), out_proj_covar=tensor([5.8971e-05, 9.2183e-05, 7.4926e-05, 7.6263e-05, 5.6701e-05, 5.5402e-05, + 9.0212e-05, 6.3571e-05], device='cuda:2') +2022-11-15 14:06:22,248 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.8152, 3.4154, 2.0131, 2.7200, 3.3934, 3.2488, 2.5179, 3.1999], + device='cuda:2'), covar=tensor([0.0341, 0.0221, 0.0528, 0.0286, 0.0165, 0.0122, 0.0219, 0.0228], + device='cuda:2'), in_proj_covar=tensor([0.0041, 0.0036, 0.0036, 0.0035, 0.0031, 0.0028, 0.0029, 0.0032], + device='cuda:2'), out_proj_covar=tensor([4.3548e-05, 3.7713e-05, 4.2863e-05, 3.7240e-05, 3.4782e-05, 2.9792e-05, + 3.2820e-05, 3.3897e-05], device='cuda:2') +2022-11-15 14:06:25,627 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.05 vs. limit=2.0 +2022-11-15 14:06:50,936 INFO [train.py:876] (2/4) Epoch 1, batch 4900, loss[loss=0.2157, simple_loss=0.1955, pruned_loss=0.118, over 5313.00 frames. ], tot_loss[loss=0.2901, simple_loss=0.2439, pruned_loss=0.1681, over 1081673.95 frames. ], batch size: 9, lr: 4.23e-02, grad_scale: 16.0 +2022-11-15 14:06:51,624 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.056e+02 2.074e+02 2.835e+02 3.865e+02 7.498e+02, threshold=5.670e+02, percent-clipped=5.0 +2022-11-15 14:07:02,725 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.14 vs. limit=2.0 +2022-11-15 14:07:22,594 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=4944.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:07:40,999 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.09 vs. limit=2.0 +2022-11-15 14:07:57,423 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=4992.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:08:08,416 INFO [train.py:876] (2/4) Epoch 1, batch 5000, loss[loss=0.3251, simple_loss=0.2573, pruned_loss=0.1964, over 3075.00 frames. ], tot_loss[loss=0.2884, simple_loss=0.243, pruned_loss=0.1669, over 1081639.23 frames. ], batch size: 284, lr: 4.20e-02, grad_scale: 16.0 +2022-11-15 14:08:09,091 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.161e+02 2.045e+02 2.576e+02 3.694e+02 7.012e+02, threshold=5.152e+02, percent-clipped=6.0 +2022-11-15 14:09:01,953 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.93 vs. limit=2.0 +2022-11-15 14:09:21,845 INFO [train.py:876] (2/4) Epoch 1, batch 5100, loss[loss=0.3123, simple_loss=0.2619, pruned_loss=0.1813, over 5710.00 frames. ], tot_loss[loss=0.2892, simple_loss=0.2443, pruned_loss=0.167, over 1089302.27 frames. ], batch size: 34, lr: 4.18e-02, grad_scale: 16.0 +2022-11-15 14:09:22,490 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.065e+02 2.161e+02 2.601e+02 3.354e+02 8.150e+02, threshold=5.203e+02, percent-clipped=5.0 +2022-11-15 14:09:34,911 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.2354, 3.7854, 3.8504, 3.8175, 3.4210, 3.2014, 2.3862, 3.6319], + device='cuda:2'), covar=tensor([0.1778, 0.0189, 0.0193, 0.0122, 0.0232, 0.0633, 0.3239, 0.0151], + device='cuda:2'), in_proj_covar=tensor([0.0118, 0.0068, 0.0074, 0.0059, 0.0066, 0.0095, 0.0152, 0.0067], + device='cuda:2'), out_proj_covar=tensor([1.2553e-04, 6.4355e-05, 7.2292e-05, 5.4491e-05, 6.4176e-05, 9.9369e-05, + 1.7238e-04, 6.2396e-05], device='cuda:2') +2022-11-15 14:10:04,303 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.03 vs. limit=2.0 +2022-11-15 14:10:25,170 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0828, 1.3377, 2.3111, 1.9804, 2.4756, 1.8218, 1.6473, 1.7184], + device='cuda:2'), covar=tensor([0.0176, 0.0299, 0.0111, 0.0173, 0.0173, 0.0489, 0.0228, 0.0240], + device='cuda:2'), in_proj_covar=tensor([0.0016, 0.0015, 0.0014, 0.0016, 0.0015, 0.0013, 0.0015, 0.0015], + device='cuda:2'), out_proj_covar=tensor([1.9091e-05, 1.7674e-05, 1.5473e-05, 1.9410e-05, 1.9570e-05, 1.6978e-05, + 1.7847e-05, 1.7960e-05], device='cuda:2') +2022-11-15 14:10:33,659 INFO [train.py:876] (2/4) Epoch 1, batch 5200, loss[loss=0.2206, simple_loss=0.1988, pruned_loss=0.1212, over 5506.00 frames. ], tot_loss[loss=0.2884, simple_loss=0.2438, pruned_loss=0.1665, over 1082861.78 frames. ], batch size: 10, lr: 4.16e-02, grad_scale: 16.0 +2022-11-15 14:10:34,300 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.259e+02 2.030e+02 2.662e+02 3.916e+02 1.299e+03, threshold=5.323e+02, percent-clipped=9.0 +2022-11-15 14:10:40,795 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.0652, 1.0005, 1.1086, 1.2782, 1.6284, 1.6377, 0.8662, 1.0212], + device='cuda:2'), covar=tensor([0.0363, 0.0329, 0.0285, 0.0623, 0.0518, 0.0190, 0.0437, 0.0356], + device='cuda:2'), in_proj_covar=tensor([0.0022, 0.0022, 0.0022, 0.0026, 0.0021, 0.0022, 0.0023, 0.0022], + device='cuda:2'), out_proj_covar=tensor([2.5236e-05, 2.7458e-05, 2.5909e-05, 3.7137e-05, 2.6209e-05, 2.5358e-05, + 2.7041e-05, 2.6954e-05], device='cuda:2') +2022-11-15 14:11:46,390 INFO [train.py:876] (2/4) Epoch 1, batch 5300, loss[loss=0.3596, simple_loss=0.2867, pruned_loss=0.2162, over 5570.00 frames. ], tot_loss[loss=0.2872, simple_loss=0.2434, pruned_loss=0.1655, over 1088814.46 frames. ], batch size: 43, lr: 4.14e-02, grad_scale: 16.0 +2022-11-15 14:11:47,375 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.166e+02 2.088e+02 2.621e+02 3.205e+02 6.242e+02, threshold=5.243e+02, percent-clipped=4.0 +2022-11-15 14:11:50,929 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.6191, 4.4542, 4.7590, 4.4054, 5.0628, 4.6346, 4.3204, 4.4765], + device='cuda:2'), covar=tensor([0.0392, 0.0242, 0.0315, 0.0257, 0.0322, 0.0172, 0.0231, 0.0305], + device='cuda:2'), in_proj_covar=tensor([0.0048, 0.0050, 0.0039, 0.0047, 0.0050, 0.0039, 0.0039, 0.0040], + device='cuda:2'), out_proj_covar=tensor([8.0286e-05, 7.5769e-05, 6.2270e-05, 6.9314e-05, 9.8476e-05, 6.0074e-05, + 5.9061e-05, 6.3009e-05], device='cuda:2') +2022-11-15 14:12:02,717 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.31 vs. limit=5.0 +2022-11-15 14:12:24,697 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.6315, 4.5143, 4.1982, 4.6545, 4.4687, 4.0210, 4.0017, 3.4521], + device='cuda:2'), covar=tensor([0.0423, 0.0227, 0.0199, 0.0118, 0.0157, 0.0233, 0.0353, 0.0387], + device='cuda:2'), in_proj_covar=tensor([0.0041, 0.0036, 0.0050, 0.0036, 0.0044, 0.0045, 0.0040, 0.0038], + device='cuda:2'), out_proj_covar=tensor([5.8428e-05, 5.4030e-05, 7.0108e-05, 5.1061e-05, 6.0200e-05, 5.9191e-05, + 5.5753e-05, 5.3904e-05], device='cuda:2') +2022-11-15 14:13:00,083 INFO [train.py:876] (2/4) Epoch 1, batch 5400, loss[loss=0.3932, simple_loss=0.3068, pruned_loss=0.2399, over 5434.00 frames. ], tot_loss[loss=0.2883, simple_loss=0.244, pruned_loss=0.1663, over 1086855.87 frames. ], batch size: 58, lr: 4.12e-02, grad_scale: 16.0 +2022-11-15 14:13:00,711 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.074e+02 2.076e+02 2.742e+02 3.471e+02 5.546e+02, threshold=5.484e+02, percent-clipped=1.0 +2022-11-15 14:13:15,804 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.5375, 4.5174, 4.6901, 4.6892, 4.3806, 4.1043, 5.1055, 4.2989], + device='cuda:2'), covar=tensor([0.0355, 0.0626, 0.0314, 0.0525, 0.0469, 0.0296, 0.0526, 0.0351], + device='cuda:2'), in_proj_covar=tensor([0.0044, 0.0067, 0.0052, 0.0058, 0.0041, 0.0037, 0.0058, 0.0048], + device='cuda:2'), out_proj_covar=tensor([6.5460e-05, 1.0511e-04, 7.7382e-05, 8.5549e-05, 6.3557e-05, 5.7095e-05, + 9.8611e-05, 6.9933e-05], device='cuda:2') +2022-11-15 14:13:15,862 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.4955, 4.3827, 4.3737, 4.2784, 3.9800, 3.1140, 2.8025, 3.9837], + device='cuda:2'), covar=tensor([0.2198, 0.0196, 0.0169, 0.0171, 0.0229, 0.0973, 0.3222, 0.0144], + device='cuda:2'), in_proj_covar=tensor([0.0132, 0.0073, 0.0078, 0.0064, 0.0072, 0.0104, 0.0159, 0.0074], + device='cuda:2'), out_proj_covar=tensor([1.4151e-04, 6.9746e-05, 7.6758e-05, 6.1694e-05, 7.1791e-05, 1.1088e-04, + 1.7808e-04, 7.2401e-05], device='cuda:2') +2022-11-15 14:13:34,296 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=11.15 vs. limit=5.0 +2022-11-15 14:13:47,592 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.9222, 1.3906, 2.0261, 2.2875, 2.0026, 2.0450, 1.7105, 1.2291], + device='cuda:2'), covar=tensor([0.0175, 0.0222, 0.0129, 0.0176, 0.0237, 0.0371, 0.0423, 0.0283], + device='cuda:2'), in_proj_covar=tensor([0.0017, 0.0017, 0.0015, 0.0018, 0.0017, 0.0016, 0.0017, 0.0017], + device='cuda:2'), out_proj_covar=tensor([2.1329e-05, 2.0234e-05, 1.6494e-05, 2.2251e-05, 2.2531e-05, 1.9865e-05, + 2.0971e-05, 2.0893e-05], device='cuda:2') +2022-11-15 14:13:52,842 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=11.23 vs. limit=5.0 +2022-11-15 14:14:12,215 INFO [train.py:876] (2/4) Epoch 1, batch 5500, loss[loss=0.246, simple_loss=0.2216, pruned_loss=0.1352, over 5716.00 frames. ], tot_loss[loss=0.2905, simple_loss=0.2453, pruned_loss=0.1679, over 1083627.20 frames. ], batch size: 15, lr: 4.10e-02, grad_scale: 16.0 +2022-11-15 14:14:12,894 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.206e+02 2.146e+02 2.749e+02 3.970e+02 7.189e+02, threshold=5.498e+02, percent-clipped=5.0 +2022-11-15 14:15:05,847 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.8332, 2.3566, 1.3926, 1.6754, 1.3185, 2.4246, 1.8824, 2.1662], + device='cuda:2'), covar=tensor([0.0369, 0.0195, 0.0486, 0.0296, 0.0278, 0.0104, 0.0111, 0.0171], + device='cuda:2'), in_proj_covar=tensor([0.0050, 0.0040, 0.0036, 0.0040, 0.0033, 0.0026, 0.0030, 0.0036], + device='cuda:2'), out_proj_covar=tensor([5.7122e-05, 4.6151e-05, 4.6494e-05, 4.5964e-05, 3.9653e-05, 3.0699e-05, + 3.4695e-05, 4.0172e-05], device='cuda:2') +2022-11-15 14:15:25,190 INFO [train.py:876] (2/4) Epoch 1, batch 5600, loss[loss=0.2017, simple_loss=0.1855, pruned_loss=0.109, over 5324.00 frames. ], tot_loss[loss=0.2874, simple_loss=0.2435, pruned_loss=0.1657, over 1087209.23 frames. ], batch size: 9, lr: 4.08e-02, grad_scale: 16.0 +2022-11-15 14:15:26,175 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.047e+02 2.152e+02 2.832e+02 3.606e+02 7.262e+02, threshold=5.664e+02, percent-clipped=5.0 +2022-11-15 14:15:30,915 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.10 vs. limit=2.0 +2022-11-15 14:15:37,620 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.89 vs. limit=2.0 +2022-11-15 14:15:40,816 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.6121, 1.6149, 1.3661, 1.9341, 1.4515, 1.3819, 1.2473, 1.8322], + device='cuda:2'), covar=tensor([0.0264, 0.0301, 0.0181, 0.0306, 0.0275, 0.0686, 0.0577, 0.0311], + device='cuda:2'), in_proj_covar=tensor([0.0018, 0.0018, 0.0018, 0.0020, 0.0019, 0.0018, 0.0019, 0.0017], + device='cuda:2'), out_proj_covar=tensor([2.2949e-05, 2.1195e-05, 1.9499e-05, 2.5077e-05, 2.5465e-05, 2.1876e-05, + 2.3495e-05, 2.1237e-05], device='cuda:2') +2022-11-15 14:16:00,747 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=5650.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 14:16:21,257 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.8665, 1.6955, 1.0707, 1.7473, 1.7457, 1.1399, 1.0873, 2.2467], + device='cuda:2'), covar=tensor([0.0254, 0.0389, 0.0188, 0.0576, 0.0440, 0.2006, 0.0532, 0.0225], + device='cuda:2'), in_proj_covar=tensor([0.0017, 0.0016, 0.0016, 0.0018, 0.0018, 0.0016, 0.0017, 0.0016], + device='cuda:2'), out_proj_covar=tensor([2.1064e-05, 1.9372e-05, 1.8134e-05, 2.3297e-05, 2.3658e-05, 2.0534e-05, + 2.1397e-05, 1.9593e-05], device='cuda:2') +2022-11-15 14:16:37,324 INFO [train.py:876] (2/4) Epoch 1, batch 5700, loss[loss=0.2027, simple_loss=0.1949, pruned_loss=0.1052, over 5733.00 frames. ], tot_loss[loss=0.2858, simple_loss=0.2423, pruned_loss=0.1647, over 1084617.85 frames. ], batch size: 13, lr: 4.06e-02, grad_scale: 16.0 +2022-11-15 14:16:37,961 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.160e+02 2.164e+02 2.765e+02 3.457e+02 8.983e+02, threshold=5.530e+02, percent-clipped=5.0 +2022-11-15 14:16:44,907 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=5711.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 14:16:59,363 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.05 vs. limit=2.0 +2022-11-15 14:17:16,891 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=5755.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:17:50,570 INFO [train.py:876] (2/4) Epoch 1, batch 5800, loss[loss=0.1863, simple_loss=0.1723, pruned_loss=0.1001, over 5494.00 frames. ], tot_loss[loss=0.2861, simple_loss=0.2428, pruned_loss=0.1647, over 1087841.56 frames. ], batch size: 12, lr: 4.04e-02, grad_scale: 16.0 +2022-11-15 14:17:51,234 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.770e+01 1.984e+02 2.593e+02 3.696e+02 7.124e+02, threshold=5.186e+02, percent-clipped=5.0 +2022-11-15 14:18:01,238 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=5816.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:18:11,348 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.2152, 4.6936, 4.1467, 4.9659, 4.7065, 4.4018, 4.0274, 3.8749], + device='cuda:2'), covar=tensor([0.0280, 0.0295, 0.0306, 0.0112, 0.0189, 0.0215, 0.0272, 0.0460], + device='cuda:2'), in_proj_covar=tensor([0.0046, 0.0041, 0.0053, 0.0041, 0.0048, 0.0047, 0.0042, 0.0041], + device='cuda:2'), out_proj_covar=tensor([6.6971e-05, 6.4145e-05, 7.5259e-05, 6.1539e-05, 6.9573e-05, 6.4167e-05, + 6.1544e-05, 5.9695e-05], device='cuda:2') +2022-11-15 14:18:24,617 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.97 vs. limit=2.0 +2022-11-15 14:18:25,942 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.8601, 2.3292, 1.8859, 3.2867, 3.1588, 3.2719, 2.6586, 3.8904], + device='cuda:2'), covar=tensor([0.0315, 0.1643, 0.1455, 0.0464, 0.0407, 0.0641, 0.1424, 0.0139], + device='cuda:2'), in_proj_covar=tensor([0.0048, 0.0100, 0.0094, 0.0053, 0.0064, 0.0084, 0.0096, 0.0054], + device='cuda:2'), out_proj_covar=tensor([4.9279e-05, 1.1339e-04, 1.0248e-04, 5.7320e-05, 6.5329e-05, 9.3091e-05, + 1.0583e-04, 5.3903e-05], device='cuda:2') +2022-11-15 14:18:27,635 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=6.18 vs. limit=5.0 +2022-11-15 14:18:31,841 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=5858.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:18:53,116 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.44 vs. limit=5.0 +2022-11-15 14:18:57,609 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.8155, 2.7354, 1.1790, 1.7603, 2.0335, 2.9153, 1.7493, 2.6327], + device='cuda:2'), covar=tensor([0.0535, 0.0230, 0.0634, 0.0422, 0.0236, 0.0075, 0.0188, 0.0203], + device='cuda:2'), in_proj_covar=tensor([0.0060, 0.0045, 0.0041, 0.0047, 0.0038, 0.0028, 0.0033, 0.0040], + device='cuda:2'), out_proj_covar=tensor([6.9983e-05, 5.2134e-05, 5.2581e-05, 5.6445e-05, 4.5401e-05, 3.2961e-05, + 3.9954e-05, 4.4733e-05], device='cuda:2') +2022-11-15 14:19:03,095 INFO [train.py:876] (2/4) Epoch 1, batch 5900, loss[loss=0.2395, simple_loss=0.2194, pruned_loss=0.1297, over 5716.00 frames. ], tot_loss[loss=0.2855, simple_loss=0.243, pruned_loss=0.164, over 1084814.20 frames. ], batch size: 17, lr: 4.02e-02, grad_scale: 16.0 +2022-11-15 14:19:03,748 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.146e+02 1.869e+02 2.719e+02 3.366e+02 7.828e+02, threshold=5.439e+02, percent-clipped=3.0 +2022-11-15 14:19:16,318 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=5919.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:19:30,251 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.6516, 2.9373, 2.7779, 3.0277, 2.8843, 2.6501, 2.6374, 2.5900], + device='cuda:2'), covar=tensor([0.0370, 0.0365, 0.0322, 0.0345, 0.0300, 0.0366, 0.0381, 0.0404], + device='cuda:2'), in_proj_covar=tensor([0.0045, 0.0041, 0.0052, 0.0041, 0.0050, 0.0049, 0.0042, 0.0041], + device='cuda:2'), out_proj_covar=tensor([6.6993e-05, 6.4908e-05, 7.3955e-05, 6.2717e-05, 7.2505e-05, 6.7319e-05, + 6.0571e-05, 5.9159e-05], device='cuda:2') +2022-11-15 14:20:07,710 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=5990.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:20:15,912 INFO [train.py:876] (2/4) Epoch 1, batch 6000, loss[loss=0.3086, simple_loss=0.2625, pruned_loss=0.1773, over 5707.00 frames. ], tot_loss[loss=0.2853, simple_loss=0.2425, pruned_loss=0.1641, over 1082227.96 frames. ], batch size: 17, lr: 4.00e-02, grad_scale: 16.0 +2022-11-15 14:20:15,913 INFO [train.py:899] (2/4) Computing validation loss +2022-11-15 14:20:34,716 INFO [train.py:908] (2/4) Epoch 1, validation: loss=0.2263, simple_loss=0.2274, pruned_loss=0.1126, over 1530663.00 frames. +2022-11-15 14:20:34,717 INFO [train.py:909] (2/4) Maximum memory allocated so far is 4515MB +2022-11-15 14:20:35,402 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.082e+02 2.347e+02 2.873e+02 3.885e+02 1.859e+03, threshold=5.746e+02, percent-clipped=5.0 +2022-11-15 14:20:38,400 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=6006.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 14:20:39,773 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.4139, 1.5737, 1.4880, 1.5368, 0.9065, 0.5409, 1.2940, 1.5421], + device='cuda:2'), covar=tensor([0.0197, 0.0156, 0.0274, 0.0147, 0.0352, 0.0394, 0.0249, 0.0148], + device='cuda:2'), in_proj_covar=tensor([0.0028, 0.0028, 0.0033, 0.0028, 0.0037, 0.0032, 0.0035, 0.0027], + device='cuda:2'), out_proj_covar=tensor([3.4000e-05, 3.3624e-05, 4.2523e-05, 3.7053e-05, 4.7542e-05, 4.6828e-05, + 4.5860e-05, 3.5207e-05], device='cuda:2') +2022-11-15 14:21:00,460 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.77 vs. limit=5.0 +2022-11-15 14:21:05,030 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=6043.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:21:11,061 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=6051.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:21:16,263 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.95 vs. limit=2.0 +2022-11-15 14:21:21,311 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.06 vs. limit=2.0 +2022-11-15 14:21:29,764 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.9340, 1.1900, 1.0655, 0.9108, 1.2197, 1.5165, 0.7349, 1.1898], + device='cuda:2'), covar=tensor([0.0147, 0.0170, 0.0220, 0.0264, 0.0211, 0.0116, 0.0227, 0.0132], + device='cuda:2'), in_proj_covar=tensor([0.0022, 0.0022, 0.0023, 0.0022, 0.0021, 0.0020, 0.0021, 0.0019], + device='cuda:2'), out_proj_covar=tensor([2.4666e-05, 2.6138e-05, 2.7583e-05, 2.5120e-05, 2.3971e-05, 2.3026e-05, + 2.9943e-05, 2.2354e-05], device='cuda:2') +2022-11-15 14:21:47,069 INFO [train.py:876] (2/4) Epoch 1, batch 6100, loss[loss=0.2878, simple_loss=0.247, pruned_loss=0.1643, over 5693.00 frames. ], tot_loss[loss=0.2849, simple_loss=0.2423, pruned_loss=0.1637, over 1082739.27 frames. ], batch size: 19, lr: 3.98e-02, grad_scale: 16.0 +2022-11-15 14:21:47,117 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.1846, 3.2342, 3.3418, 3.3669, 3.2540, 2.9165, 3.7567, 2.9260], + device='cuda:2'), covar=tensor([0.0558, 0.0843, 0.0412, 0.0552, 0.0598, 0.0419, 0.0606, 0.0763], + device='cuda:2'), in_proj_covar=tensor([0.0049, 0.0072, 0.0058, 0.0063, 0.0045, 0.0040, 0.0065, 0.0053], + device='cuda:2'), out_proj_covar=tensor([7.7831e-05, 1.1697e-04, 9.0523e-05, 9.9713e-05, 7.3503e-05, 6.2788e-05, + 1.1765e-04, 8.3156e-05], device='cuda:2') +2022-11-15 14:21:47,728 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.385e+02 2.266e+02 2.673e+02 3.416e+02 6.924e+02, threshold=5.346e+02, percent-clipped=3.0 +2022-11-15 14:21:49,335 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=6104.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:21:54,397 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=6111.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:22:03,560 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.8180, 2.0909, 1.3841, 1.4587, 0.8381, 1.3048, 1.3082, 1.2625], + device='cuda:2'), covar=tensor([0.0237, 0.0131, 0.0148, 0.0220, 0.0543, 0.0389, 0.0225, 0.0278], + device='cuda:2'), in_proj_covar=tensor([0.0018, 0.0016, 0.0018, 0.0019, 0.0019, 0.0016, 0.0016, 0.0015], + device='cuda:2'), out_proj_covar=tensor([2.3255e-05, 1.9271e-05, 2.0662e-05, 2.4605e-05, 2.5655e-05, 2.1943e-05, + 2.0798e-05, 1.9503e-05], device='cuda:2') +2022-11-15 14:22:11,650 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=6135.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:22:14,682 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.00 vs. limit=2.0 +2022-11-15 14:22:32,514 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.95 vs. limit=2.0 +2022-11-15 14:22:37,305 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=6170.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:22:49,276 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.51 vs. limit=5.0 +2022-11-15 14:22:55,829 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=6196.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:22:59,794 INFO [train.py:876] (2/4) Epoch 1, batch 6200, loss[loss=0.3651, simple_loss=0.2968, pruned_loss=0.2167, over 5671.00 frames. ], tot_loss[loss=0.2848, simple_loss=0.2421, pruned_loss=0.1637, over 1086427.14 frames. ], batch size: 34, lr: 3.96e-02, grad_scale: 16.0 +2022-11-15 14:23:00,449 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.055e+02 1.942e+02 2.617e+02 4.109e+02 1.137e+03, threshold=5.234e+02, percent-clipped=10.0 +2022-11-15 14:23:01,344 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=6203.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:23:07,061 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=6211.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:23:09,418 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=6214.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:23:21,835 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=6231.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:23:37,788 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.98 vs. limit=2.0 +2022-11-15 14:23:45,459 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=6264.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:23:46,881 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=6266.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:23:50,195 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.94 vs. limit=5.0 +2022-11-15 14:23:51,482 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=6272.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:24:07,940 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.9907, 3.0575, 3.0177, 3.7818, 4.1518, 3.8178, 3.4607, 3.3538], + device='cuda:2'), covar=tensor([0.0123, 0.1076, 0.0857, 0.0208, 0.0151, 0.0674, 0.0991, 0.0172], + device='cuda:2'), in_proj_covar=tensor([0.0051, 0.0107, 0.0102, 0.0058, 0.0067, 0.0096, 0.0110, 0.0058], + device='cuda:2'), out_proj_covar=tensor([5.3142e-05, 1.2217e-04, 1.1159e-04, 6.8482e-05, 7.1339e-05, 1.0889e-04, + 1.2313e-04, 6.2843e-05], device='cuda:2') +2022-11-15 14:24:12,259 INFO [train.py:876] (2/4) Epoch 1, batch 6300, loss[loss=0.2294, simple_loss=0.2077, pruned_loss=0.1255, over 5301.00 frames. ], tot_loss[loss=0.2824, simple_loss=0.2408, pruned_loss=0.162, over 1084836.08 frames. ], batch size: 9, lr: 3.94e-02, grad_scale: 32.0 +2022-11-15 14:24:12,909 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.397e+02 2.220e+02 2.802e+02 3.554e+02 1.076e+03, threshold=5.605e+02, percent-clipped=6.0 +2022-11-15 14:24:15,819 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=6306.0, num_to_drop=1, layers_to_drop={2} +2022-11-15 14:24:31,164 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=6327.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:24:31,931 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.2880, 1.5298, 1.7421, 1.8150, 2.3170, 1.4092, 1.8854, 2.0631], + device='cuda:2'), covar=tensor([0.0097, 0.0561, 0.0199, 0.0308, 0.0097, 0.0362, 0.0175, 0.0151], + device='cuda:2'), in_proj_covar=tensor([0.0039, 0.0092, 0.0049, 0.0068, 0.0041, 0.0060, 0.0054, 0.0045], + device='cuda:2'), out_proj_covar=tensor([4.7143e-05, 1.1136e-04, 5.8494e-05, 8.1895e-05, 4.8658e-05, 7.2162e-05, + 6.5307e-05, 5.5624e-05], device='cuda:2') +2022-11-15 14:24:44,723 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=6346.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:24:50,573 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=6354.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 14:25:01,981 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.7763, 1.5631, 1.5071, 1.3237, 0.6938, 1.0284, 1.2531, 1.4728], + device='cuda:2'), covar=tensor([0.0219, 0.0170, 0.0275, 0.0385, 0.0527, 0.0363, 0.0372, 0.0166], + device='cuda:2'), in_proj_covar=tensor([0.0028, 0.0026, 0.0032, 0.0027, 0.0037, 0.0029, 0.0036, 0.0027], + device='cuda:2'), out_proj_covar=tensor([3.6108e-05, 3.1991e-05, 4.2786e-05, 3.7321e-05, 5.1545e-05, 4.4120e-05, + 4.8893e-05, 3.6501e-05], device='cuda:2') +2022-11-15 14:25:22,864 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=6399.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:25:24,492 INFO [train.py:876] (2/4) Epoch 1, batch 6400, loss[loss=0.2743, simple_loss=0.2388, pruned_loss=0.1549, over 5646.00 frames. ], tot_loss[loss=0.2805, simple_loss=0.2395, pruned_loss=0.1608, over 1084328.66 frames. ], batch size: 29, lr: 3.92e-02, grad_scale: 32.0 +2022-11-15 14:25:25,171 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.202e+02 2.235e+02 2.872e+02 3.964e+02 7.777e+02, threshold=5.745e+02, percent-clipped=4.0 +2022-11-15 14:25:32,014 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=6411.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:25:34,233 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.72 vs. limit=2.0 +2022-11-15 14:25:46,161 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.92 vs. limit=2.0 +2022-11-15 14:25:56,665 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.89 vs. limit=2.0 +2022-11-15 14:26:06,808 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=6459.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:26:09,017 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=6462.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:26:12,756 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=6467.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:26:15,953 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.88 vs. limit=2.0 +2022-11-15 14:26:30,101 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=6491.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:26:36,866 INFO [train.py:876] (2/4) Epoch 1, batch 6500, loss[loss=0.3603, simple_loss=0.2765, pruned_loss=0.2221, over 2984.00 frames. ], tot_loss[loss=0.2799, simple_loss=0.2392, pruned_loss=0.1603, over 1080722.87 frames. ], batch size: 284, lr: 3.90e-02, grad_scale: 32.0 +2022-11-15 14:26:37,570 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.388e+02 2.150e+02 2.872e+02 3.674e+02 6.857e+02, threshold=5.744e+02, percent-clipped=4.0 +2022-11-15 14:26:46,471 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=6514.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:26:53,203 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=6523.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 14:26:55,175 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=6526.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:26:56,670 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=6528.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:26:59,841 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.7229, 2.7976, 1.5041, 1.8213, 3.1478, 3.1638, 1.7742, 3.2154], + device='cuda:2'), covar=tensor([0.0723, 0.0223, 0.0568, 0.0609, 0.0143, 0.0107, 0.0270, 0.0148], + device='cuda:2'), in_proj_covar=tensor([0.0074, 0.0052, 0.0046, 0.0059, 0.0041, 0.0039, 0.0039, 0.0043], + device='cuda:2'), out_proj_covar=tensor([8.9367e-05, 6.2650e-05, 6.3149e-05, 7.3449e-05, 5.0964e-05, 4.5258e-05, + 4.9775e-05, 4.9564e-05], device='cuda:2') +2022-11-15 14:27:11,651 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.08 vs. limit=2.0 +2022-11-15 14:27:18,840 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=6559.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:27:20,933 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=6562.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:27:24,782 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=6567.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:27:49,420 INFO [train.py:876] (2/4) Epoch 1, batch 6600, loss[loss=0.2322, simple_loss=0.2139, pruned_loss=0.1252, over 5773.00 frames. ], tot_loss[loss=0.2801, simple_loss=0.2398, pruned_loss=0.1602, over 1083959.26 frames. ], batch size: 14, lr: 3.89e-02, grad_scale: 32.0 +2022-11-15 14:27:50,094 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.067e+02 2.099e+02 2.757e+02 3.560e+02 8.696e+02, threshold=5.514e+02, percent-clipped=5.0 +2022-11-15 14:28:04,921 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=6622.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:28:10,140 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=6629.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:28:22,498 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=6646.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:28:29,989 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.9816, 1.3330, 0.7795, 0.7466, 1.1453, 0.7176, 0.5372, 0.5724], + device='cuda:2'), covar=tensor([0.0162, 0.0291, 0.0158, 0.0183, 0.0174, 0.0157, 0.0460, 0.0858], + device='cuda:2'), in_proj_covar=tensor([0.0024, 0.0021, 0.0022, 0.0021, 0.0023, 0.0023, 0.0023, 0.0021], + device='cuda:2'), out_proj_covar=tensor([2.6292e-05, 2.5576e-05, 2.9296e-05, 2.4726e-05, 2.5658e-05, 2.6399e-05, + 3.4602e-05, 2.5412e-05], device='cuda:2') +2022-11-15 14:28:49,229 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.95 vs. limit=2.0 +2022-11-15 14:28:54,042 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=6690.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:28:56,687 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=6694.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:29:00,628 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=6699.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:29:01,867 INFO [train.py:876] (2/4) Epoch 1, batch 6700, loss[loss=0.2603, simple_loss=0.2165, pruned_loss=0.152, over 5008.00 frames. ], tot_loss[loss=0.282, simple_loss=0.2414, pruned_loss=0.1613, over 1085248.78 frames. ], batch size: 109, lr: 3.87e-02, grad_scale: 16.0 +2022-11-15 14:29:03,244 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.203e+02 2.211e+02 2.874e+02 3.707e+02 9.191e+02, threshold=5.749e+02, percent-clipped=7.0 +2022-11-15 14:29:34,899 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=6747.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:30:00,899 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.11 vs. limit=2.0 +2022-11-15 14:30:06,622 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=6791.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:30:13,994 INFO [train.py:876] (2/4) Epoch 1, batch 6800, loss[loss=0.1669, simple_loss=0.1598, pruned_loss=0.08698, over 5772.00 frames. ], tot_loss[loss=0.2794, simple_loss=0.2396, pruned_loss=0.1596, over 1084470.47 frames. ], batch size: 8, lr: 3.85e-02, grad_scale: 16.0 +2022-11-15 14:30:15,298 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.257e+02 2.052e+02 2.561e+02 3.297e+02 6.876e+02, threshold=5.122e+02, percent-clipped=2.0 +2022-11-15 14:30:16,445 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.04 vs. limit=2.0 +2022-11-15 14:30:26,399 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=6818.0, num_to_drop=1, layers_to_drop={2} +2022-11-15 14:30:30,173 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=6823.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:30:32,309 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=6826.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:30:41,204 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=6839.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:30:57,029 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=6859.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:31:02,562 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=6867.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:31:07,534 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=6874.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:31:09,987 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.3188, 1.9065, 0.8421, 1.8521, 1.3038, 0.8868, 1.4356, 1.3468], + device='cuda:2'), covar=tensor([0.0284, 0.0206, 0.0288, 0.0302, 0.0356, 0.0963, 0.0432, 0.0492], + device='cuda:2'), in_proj_covar=tensor([0.0019, 0.0017, 0.0019, 0.0021, 0.0019, 0.0017, 0.0018, 0.0018], + device='cuda:2'), out_proj_covar=tensor([2.6688e-05, 2.0797e-05, 2.2433e-05, 2.7608e-05, 2.7461e-05, 2.4345e-05, + 2.3820e-05, 2.5197e-05], device='cuda:2') +2022-11-15 14:31:10,661 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=6878.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 14:31:26,636 INFO [train.py:876] (2/4) Epoch 1, batch 6900, loss[loss=0.32, simple_loss=0.2694, pruned_loss=0.1853, over 5548.00 frames. ], tot_loss[loss=0.2797, simple_loss=0.24, pruned_loss=0.1597, over 1084087.13 frames. ], batch size: 46, lr: 3.83e-02, grad_scale: 16.0 +2022-11-15 14:31:27,995 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.342e+02 2.317e+02 3.048e+02 4.158e+02 6.462e+02, threshold=6.096e+02, percent-clipped=10.0 +2022-11-15 14:31:28,131 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.1995, 4.0105, 4.2908, 3.6794, 4.5412, 4.1015, 4.0664, 3.8726], + device='cuda:2'), covar=tensor([0.0550, 0.0449, 0.0504, 0.0495, 0.0565, 0.0317, 0.0325, 0.0589], + device='cuda:2'), in_proj_covar=tensor([0.0057, 0.0059, 0.0046, 0.0057, 0.0058, 0.0041, 0.0048, 0.0044], + device='cuda:2'), out_proj_covar=tensor([1.0709e-04, 1.0250e-04, 8.2938e-05, 9.5561e-05, 1.2441e-04, 6.9135e-05, + 8.6014e-05, 7.9802e-05], device='cuda:2') +2022-11-15 14:31:30,927 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=6907.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:31:32,409 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=6909.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:31:33,551 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.8741, 1.5321, 1.4591, 1.6285, 1.8723, 1.3950, 1.8340, 1.9392], + device='cuda:2'), covar=tensor([0.0052, 0.0364, 0.0171, 0.0143, 0.0073, 0.0246, 0.0114, 0.0057], + device='cuda:2'), in_proj_covar=tensor([0.0041, 0.0103, 0.0052, 0.0077, 0.0047, 0.0069, 0.0059, 0.0053], + device='cuda:2'), out_proj_covar=tensor([4.9649e-05, 1.2830e-04, 6.5660e-05, 9.5917e-05, 5.8689e-05, 8.7298e-05, + 7.4182e-05, 6.7036e-05], device='cuda:2') +2022-11-15 14:31:36,821 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=6915.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:31:41,765 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=6922.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:31:54,350 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=6939.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 14:32:17,041 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=6970.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:32:17,166 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=6970.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:32:17,806 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.6142, 1.3713, 0.7556, 1.0564, 0.6713, 1.0250, 1.0238, 0.9786], + device='cuda:2'), covar=tensor([0.0206, 0.0087, 0.0181, 0.0147, 0.0369, 0.0139, 0.0264, 0.0225], + device='cuda:2'), in_proj_covar=tensor([0.0018, 0.0016, 0.0018, 0.0020, 0.0019, 0.0017, 0.0017, 0.0018], + device='cuda:2'), out_proj_covar=tensor([2.5443e-05, 1.9817e-05, 2.1719e-05, 2.6612e-05, 2.7170e-05, 2.3761e-05, + 2.2948e-05, 2.4735e-05], device='cuda:2') +2022-11-15 14:32:27,729 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=6985.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:32:31,278 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.3381, 3.3026, 3.5006, 3.1246, 3.7255, 3.1469, 3.0939, 3.2626], + device='cuda:2'), covar=tensor([0.0546, 0.0402, 0.0495, 0.0441, 0.0360, 0.0452, 0.0458, 0.0411], + device='cuda:2'), in_proj_covar=tensor([0.0054, 0.0055, 0.0044, 0.0053, 0.0052, 0.0038, 0.0046, 0.0041], + device='cuda:2'), out_proj_covar=tensor([1.0052e-04, 9.6079e-05, 8.0471e-05, 8.8678e-05, 1.1268e-04, 6.4488e-05, + 8.2065e-05, 7.4712e-05], device='cuda:2') +2022-11-15 14:32:39,437 INFO [train.py:876] (2/4) Epoch 1, batch 7000, loss[loss=0.2795, simple_loss=0.2438, pruned_loss=0.1576, over 5695.00 frames. ], tot_loss[loss=0.278, simple_loss=0.2391, pruned_loss=0.1584, over 1081507.02 frames. ], batch size: 19, lr: 3.81e-02, grad_scale: 16.0 +2022-11-15 14:32:40,782 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.079e+02 2.319e+02 2.855e+02 3.574e+02 7.700e+02, threshold=5.709e+02, percent-clipped=2.0 +2022-11-15 14:32:45,501 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.5564, 4.9254, 4.4859, 4.5433, 4.3640, 4.6271, 3.2989, 4.4840], + device='cuda:2'), covar=tensor([0.0164, 0.0131, 0.0193, 0.0167, 0.0170, 0.0179, 0.0726, 0.0170], + device='cuda:2'), in_proj_covar=tensor([0.0036, 0.0034, 0.0032, 0.0027, 0.0032, 0.0028, 0.0054, 0.0034], + device='cuda:2'), out_proj_covar=tensor([5.9237e-05, 5.5299e-05, 4.9305e-05, 4.3221e-05, 5.0432e-05, 4.5328e-05, + 8.6282e-05, 5.5470e-05], device='cuda:2') +2022-11-15 14:33:13,375 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=7048.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 14:33:15,051 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.98 vs. limit=2.0 +2022-11-15 14:33:19,197 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.2481, 2.0017, 1.2363, 1.9355, 1.9536, 2.2928, 1.4538, 2.0982], + device='cuda:2'), covar=tensor([0.0245, 0.0164, 0.0395, 0.0172, 0.0153, 0.0115, 0.0323, 0.0158], + device='cuda:2'), in_proj_covar=tensor([0.0018, 0.0014, 0.0018, 0.0019, 0.0017, 0.0017, 0.0021, 0.0017], + device='cuda:2'), out_proj_covar=tensor([2.2943e-05, 1.8944e-05, 2.3318e-05, 2.3337e-05, 2.2673e-05, 2.0421e-05, + 2.8165e-05, 2.0296e-05], device='cuda:2') +2022-11-15 14:33:25,217 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.36 vs. limit=2.0 +2022-11-15 14:33:51,323 INFO [train.py:876] (2/4) Epoch 1, batch 7100, loss[loss=0.2498, simple_loss=0.224, pruned_loss=0.1378, over 5699.00 frames. ], tot_loss[loss=0.276, simple_loss=0.2377, pruned_loss=0.1571, over 1076639.65 frames. ], batch size: 28, lr: 3.79e-02, grad_scale: 16.0 +2022-11-15 14:33:52,659 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.200e+02 2.197e+02 2.721e+02 3.665e+02 9.993e+02, threshold=5.441e+02, percent-clipped=4.0 +2022-11-15 14:33:56,932 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=7109.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 14:34:05,344 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=7118.0, num_to_drop=1, layers_to_drop={2} +2022-11-15 14:34:09,108 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=7123.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:34:38,213 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=7163.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:34:40,171 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=7166.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:34:43,792 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=7171.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:35:05,688 INFO [train.py:876] (2/4) Epoch 1, batch 7200, loss[loss=0.297, simple_loss=0.2499, pruned_loss=0.172, over 5690.00 frames. ], tot_loss[loss=0.2781, simple_loss=0.2391, pruned_loss=0.1585, over 1079673.51 frames. ], batch size: 34, lr: 3.78e-02, grad_scale: 16.0 +2022-11-15 14:35:07,001 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.380e+02 2.253e+02 2.788e+02 3.499e+02 9.174e+02, threshold=5.576e+02, percent-clipped=8.0 +2022-11-15 14:35:08,786 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.04 vs. limit=2.0 +2022-11-15 14:35:22,002 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=7224.0, num_to_drop=1, layers_to_drop={3} +2022-11-15 14:35:29,038 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=7234.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 14:35:35,378 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.31 vs. limit=2.0 +2022-11-15 14:35:39,847 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.1284, 4.4694, 4.1430, 4.6818, 4.6975, 4.2377, 3.8374, 3.6502], + device='cuda:2'), covar=tensor([0.0272, 0.0327, 0.0297, 0.0172, 0.0178, 0.0366, 0.0477, 0.0459], + device='cuda:2'), in_proj_covar=tensor([0.0050, 0.0040, 0.0056, 0.0046, 0.0055, 0.0054, 0.0049, 0.0043], + device='cuda:2'), out_proj_covar=tensor([8.0925e-05, 6.9389e-05, 9.0222e-05, 7.5588e-05, 9.1446e-05, 8.3434e-05, + 8.0245e-05, 6.7171e-05], device='cuda:2') +2022-11-15 14:35:50,127 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=7265.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:37:25,310 INFO [train.py:876] (2/4) Epoch 2, batch 0, loss[loss=0.2203, simple_loss=0.2008, pruned_loss=0.1199, over 4981.00 frames. ], tot_loss[loss=0.2203, simple_loss=0.2008, pruned_loss=0.1199, over 4981.00 frames. ], batch size: 7, lr: 3.69e-02, grad_scale: 16.0 +2022-11-15 14:37:25,310 INFO [train.py:899] (2/4) Computing validation loss +2022-11-15 14:37:42,502 INFO [train.py:908] (2/4) Epoch 2, validation: loss=0.2258, simple_loss=0.228, pruned_loss=0.1118, over 1530663.00 frames. +2022-11-15 14:37:42,504 INFO [train.py:909] (2/4) Maximum memory allocated so far is 4550MB +2022-11-15 14:37:44,102 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=7275.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:37:45,439 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.7658, 3.9528, 3.3858, 2.0053, 3.9129, 3.0507, 3.5366, 2.7121], + device='cuda:2'), covar=tensor([0.0406, 0.0300, 0.0228, 0.2046, 0.0220, 0.0624, 0.0330, 0.0994], + device='cuda:2'), in_proj_covar=tensor([0.0060, 0.0042, 0.0032, 0.0074, 0.0039, 0.0050, 0.0031, 0.0056], + device='cuda:2'), out_proj_covar=tensor([1.0681e-04, 7.2589e-05, 6.1160e-05, 1.2546e-04, 6.5623e-05, 9.0567e-05, + 6.0776e-05, 1.0207e-04], device='cuda:2') +2022-11-15 14:37:51,155 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=7285.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:38:04,586 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.318e+02 2.115e+02 2.889e+02 4.195e+02 1.182e+03, threshold=5.778e+02, percent-clipped=11.0 +2022-11-15 14:38:05,554 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.7194, 1.3248, 0.5624, 0.8145, 1.0141, 0.8588, 0.9449, 0.8442], + device='cuda:2'), covar=tensor([0.0385, 0.0177, 0.0247, 0.0958, 0.0645, 0.0230, 0.0466, 0.0291], + device='cuda:2'), in_proj_covar=tensor([0.0020, 0.0020, 0.0019, 0.0024, 0.0018, 0.0018, 0.0021, 0.0020], + device='cuda:2'), out_proj_covar=tensor([2.8570e-05, 2.7405e-05, 2.7797e-05, 3.9582e-05, 2.6884e-05, 2.5469e-05, + 3.2498e-05, 2.7302e-05], device='cuda:2') +2022-11-15 14:38:22,953 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.45 vs. limit=5.0 +2022-11-15 14:38:25,362 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.96 vs. limit=2.0 +2022-11-15 14:38:26,541 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=7333.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:38:28,809 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=7336.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:38:55,139 INFO [train.py:876] (2/4) Epoch 2, batch 100, loss[loss=0.3208, simple_loss=0.2752, pruned_loss=0.1832, over 5471.00 frames. ], tot_loss[loss=0.2711, simple_loss=0.237, pruned_loss=0.1525, over 439534.14 frames. ], batch size: 49, lr: 3.67e-02, grad_scale: 16.0 +2022-11-15 14:39:12,204 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.01 vs. limit=2.0 +2022-11-15 14:39:17,528 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.188e+01 2.195e+02 2.755e+02 3.428e+02 7.515e+02, threshold=5.510e+02, percent-clipped=5.0 +2022-11-15 14:39:18,279 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=7404.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 14:39:20,777 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=7407.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:39:55,961 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.8282, 3.1504, 1.4404, 1.4359, 2.0401, 2.3746, 2.0229, 2.9338], + device='cuda:2'), covar=tensor([0.0830, 0.0247, 0.0546, 0.0766, 0.0239, 0.0220, 0.0191, 0.0143], + device='cuda:2'), in_proj_covar=tensor([0.0083, 0.0062, 0.0052, 0.0073, 0.0047, 0.0044, 0.0045, 0.0049], + device='cuda:2'), out_proj_covar=tensor([1.0632e-04, 7.6391e-05, 7.2035e-05, 9.3297e-05, 5.9347e-05, 5.4089e-05, + 5.8070e-05, 5.8181e-05], device='cuda:2') +2022-11-15 14:40:05,191 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=7468.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:40:08,500 INFO [train.py:876] (2/4) Epoch 2, batch 200, loss[loss=0.2457, simple_loss=0.2214, pruned_loss=0.135, over 5503.00 frames. ], tot_loss[loss=0.2763, simple_loss=0.2384, pruned_loss=0.1571, over 690987.24 frames. ], batch size: 17, lr: 3.66e-02, grad_scale: 16.0 +2022-11-15 14:40:30,169 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.374e+02 2.136e+02 2.623e+02 3.249e+02 5.222e+02, threshold=5.245e+02, percent-clipped=0.0 +2022-11-15 14:40:32,734 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.94 vs. limit=5.0 +2022-11-15 14:40:38,232 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.11 vs. limit=2.0 +2022-11-15 14:40:41,702 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=7519.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 14:40:52,513 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=7534.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 14:41:01,171 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.4791, 4.0819, 3.2261, 1.4688, 3.6417, 2.0353, 3.2517, 2.3043], + device='cuda:2'), covar=tensor([0.0897, 0.0160, 0.0257, 0.2294, 0.0186, 0.0960, 0.0252, 0.1005], + device='cuda:2'), in_proj_covar=tensor([0.0062, 0.0041, 0.0033, 0.0075, 0.0037, 0.0054, 0.0033, 0.0059], + device='cuda:2'), out_proj_covar=tensor([1.1325e-04, 7.2564e-05, 6.3029e-05, 1.3083e-04, 6.4493e-05, 9.8089e-05, + 6.4780e-05, 1.0911e-04], device='cuda:2') +2022-11-15 14:41:12,956 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.18 vs. limit=5.0 +2022-11-15 14:41:14,922 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=7565.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:41:20,227 INFO [train.py:876] (2/4) Epoch 2, batch 300, loss[loss=0.2866, simple_loss=0.2376, pruned_loss=0.1678, over 5122.00 frames. ], tot_loss[loss=0.2733, simple_loss=0.2359, pruned_loss=0.1554, over 848082.49 frames. ], batch size: 91, lr: 3.64e-02, grad_scale: 16.0 +2022-11-15 14:41:27,233 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=7582.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 14:41:42,307 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.027e+02 2.168e+02 2.646e+02 3.466e+02 1.431e+03, threshold=5.292e+02, percent-clipped=6.0 +2022-11-15 14:41:49,772 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=7613.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:41:57,612 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=7624.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:42:02,261 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=7631.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:42:33,000 INFO [train.py:876] (2/4) Epoch 2, batch 400, loss[loss=0.2868, simple_loss=0.261, pruned_loss=0.1563, over 5606.00 frames. ], tot_loss[loss=0.2731, simple_loss=0.2363, pruned_loss=0.1549, over 938677.73 frames. ], batch size: 22, lr: 3.62e-02, grad_scale: 16.0 +2022-11-15 14:42:41,557 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=7685.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:42:42,671 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.84 vs. limit=2.0 +2022-11-15 14:42:53,926 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.29 vs. limit=2.0 +2022-11-15 14:42:54,871 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.176e+02 2.314e+02 2.984e+02 3.754e+02 8.890e+02, threshold=5.969e+02, percent-clipped=7.0 +2022-11-15 14:42:55,767 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=7704.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 14:43:23,434 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=7743.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:43:30,223 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=7752.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 14:43:37,919 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=7763.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:43:44,738 INFO [train.py:876] (2/4) Epoch 2, batch 500, loss[loss=0.3238, simple_loss=0.2683, pruned_loss=0.1897, over 5450.00 frames. ], tot_loss[loss=0.2745, simple_loss=0.237, pruned_loss=0.156, over 996147.04 frames. ], batch size: 64, lr: 3.61e-02, grad_scale: 16.0 +2022-11-15 14:43:45,939 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.36 vs. limit=5.0 +2022-11-15 14:44:06,100 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=7802.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:44:06,565 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.399e+02 2.366e+02 3.140e+02 3.903e+02 7.653e+02, threshold=6.280e+02, percent-clipped=5.0 +2022-11-15 14:44:07,841 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=7804.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:44:18,472 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=7819.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 14:44:33,493 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.62 vs. limit=5.0 +2022-11-15 14:44:46,579 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.2800, 1.2105, 1.1073, 0.8052, 1.2539, 1.1381, 1.3058, 0.9415], + device='cuda:2'), covar=tensor([0.0097, 0.0073, 0.0080, 0.0149, 0.0055, 0.0128, 0.0082, 0.0100], + device='cuda:2'), in_proj_covar=tensor([0.0017, 0.0015, 0.0016, 0.0019, 0.0017, 0.0017, 0.0019, 0.0016], + device='cuda:2'), out_proj_covar=tensor([2.3010e-05, 1.9584e-05, 2.2702e-05, 2.4332e-05, 2.2600e-05, 2.1645e-05, + 2.6698e-05, 2.0111e-05], device='cuda:2') +2022-11-15 14:44:50,033 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=7863.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:44:52,616 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=7867.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:44:56,654 INFO [train.py:876] (2/4) Epoch 2, batch 600, loss[loss=0.3202, simple_loss=0.2718, pruned_loss=0.1843, over 5700.00 frames. ], tot_loss[loss=0.271, simple_loss=0.2348, pruned_loss=0.1536, over 1025652.13 frames. ], batch size: 34, lr: 3.59e-02, grad_scale: 16.0 +2022-11-15 14:45:18,184 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.234e+02 2.116e+02 2.659e+02 3.486e+02 9.417e+02, threshold=5.318e+02, percent-clipped=5.0 +2022-11-15 14:45:38,473 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=7931.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:46:08,128 INFO [train.py:876] (2/4) Epoch 2, batch 700, loss[loss=0.2742, simple_loss=0.2392, pruned_loss=0.1546, over 5774.00 frames. ], tot_loss[loss=0.2689, simple_loss=0.234, pruned_loss=0.1519, over 1059377.62 frames. ], batch size: 21, lr: 3.57e-02, grad_scale: 16.0 +2022-11-15 14:46:12,994 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=7979.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:46:13,702 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=7980.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:46:26,081 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.9927, 2.8020, 2.9461, 2.7102, 3.0765, 2.8920, 2.9317, 2.8747], + device='cuda:2'), covar=tensor([0.0443, 0.0384, 0.0437, 0.0396, 0.0398, 0.0184, 0.0271, 0.0436], + device='cuda:2'), in_proj_covar=tensor([0.0058, 0.0059, 0.0048, 0.0058, 0.0056, 0.0038, 0.0045, 0.0046], + device='cuda:2'), out_proj_covar=tensor([1.1658e-04, 1.0899e-04, 9.1233e-05, 1.0573e-04, 1.2623e-04, 6.8250e-05, + 8.5317e-05, 8.8264e-05], device='cuda:2') +2022-11-15 14:46:30,156 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.456e+02 2.448e+02 3.316e+02 4.282e+02 8.235e+02, threshold=6.631e+02, percent-clipped=7.0 +2022-11-15 14:47:06,844 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.6427, 3.2885, 2.4211, 1.5767, 2.6011, 3.5233, 2.9106, 3.4999], + device='cuda:2'), covar=tensor([0.0698, 0.0351, 0.0329, 0.0889, 0.0156, 0.0062, 0.0132, 0.0148], + device='cuda:2'), in_proj_covar=tensor([0.0088, 0.0071, 0.0052, 0.0081, 0.0045, 0.0044, 0.0045, 0.0052], + device='cuda:2'), out_proj_covar=tensor([1.1356e-04, 8.8995e-05, 7.3307e-05, 1.0434e-04, 5.7460e-05, 5.6560e-05, + 5.7386e-05, 6.2949e-05], device='cuda:2') +2022-11-15 14:47:13,894 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=8063.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:47:20,644 INFO [train.py:876] (2/4) Epoch 2, batch 800, loss[loss=0.2682, simple_loss=0.2385, pruned_loss=0.149, over 5612.00 frames. ], tot_loss[loss=0.271, simple_loss=0.2352, pruned_loss=0.1534, over 1067342.50 frames. ], batch size: 38, lr: 3.56e-02, grad_scale: 16.0 +2022-11-15 14:47:39,435 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=8099.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:47:42,123 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.171e+02 2.291e+02 2.781e+02 3.438e+02 1.081e+03, threshold=5.561e+02, percent-clipped=3.0 +2022-11-15 14:47:48,147 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=8111.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:47:50,430 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.03 vs. limit=2.0 +2022-11-15 14:47:52,789 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.1491, 1.4012, 1.1365, 1.2763, 1.0992, 1.0964, 1.4566, 1.1841], + device='cuda:2'), covar=tensor([0.0293, 0.0299, 0.0223, 0.0233, 0.0596, 0.0719, 0.0243, 0.0343], + device='cuda:2'), in_proj_covar=tensor([0.0020, 0.0018, 0.0019, 0.0021, 0.0019, 0.0016, 0.0018, 0.0019], + device='cuda:2'), out_proj_covar=tensor([2.9865e-05, 2.3714e-05, 2.4893e-05, 2.9907e-05, 2.9749e-05, 2.5041e-05, + 2.5630e-05, 2.7579e-05], device='cuda:2') +2022-11-15 14:48:21,014 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=8.09 vs. limit=5.0 +2022-11-15 14:48:22,103 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=8158.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:48:24,896 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.8056, 1.3716, 0.6328, 0.9205, 0.9138, 1.1576, 0.8896, 0.8119], + device='cuda:2'), covar=tensor([0.0157, 0.0060, 0.0145, 0.0127, 0.0226, 0.0077, 0.0195, 0.0181], + device='cuda:2'), in_proj_covar=tensor([0.0020, 0.0017, 0.0019, 0.0020, 0.0018, 0.0015, 0.0018, 0.0018], + device='cuda:2'), out_proj_covar=tensor([2.8701e-05, 2.2241e-05, 2.4089e-05, 2.8680e-05, 2.8051e-05, 2.3273e-05, + 2.4433e-05, 2.6618e-05], device='cuda:2') +2022-11-15 14:48:26,322 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.9217, 1.1532, 1.1409, 1.0753, 1.1071, 1.0879, 0.7523, 1.0377], + device='cuda:2'), covar=tensor([0.0090, 0.0047, 0.0074, 0.0130, 0.0092, 0.0083, 0.0110, 0.0120], + device='cuda:2'), in_proj_covar=tensor([0.0021, 0.0020, 0.0018, 0.0020, 0.0020, 0.0020, 0.0021, 0.0019], + device='cuda:2'), out_proj_covar=tensor([2.5465e-05, 2.4440e-05, 2.6158e-05, 2.4069e-05, 2.3541e-05, 2.5474e-05, + 3.4172e-05, 2.4743e-05], device='cuda:2') +2022-11-15 14:48:29,444 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.0368, 1.6941, 1.3002, 1.1138, 1.8216, 1.8056, 1.5127, 1.3699], + device='cuda:2'), covar=tensor([0.0277, 0.0224, 0.0258, 0.0182, 0.0143, 0.0182, 0.0220, 0.0142], + device='cuda:2'), in_proj_covar=tensor([0.0016, 0.0014, 0.0015, 0.0017, 0.0015, 0.0017, 0.0018, 0.0015], + device='cuda:2'), out_proj_covar=tensor([2.1718e-05, 1.8766e-05, 2.1781e-05, 2.2694e-05, 2.0831e-05, 2.2198e-05, + 2.5471e-05, 1.9021e-05], device='cuda:2') +2022-11-15 14:48:32,989 INFO [train.py:876] (2/4) Epoch 2, batch 900, loss[loss=0.296, simple_loss=0.2588, pruned_loss=0.1666, over 5750.00 frames. ], tot_loss[loss=0.2676, simple_loss=0.233, pruned_loss=0.1511, over 1077128.77 frames. ], batch size: 31, lr: 3.54e-02, grad_scale: 16.0 +2022-11-15 14:48:54,941 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.202e+02 2.386e+02 2.834e+02 3.764e+02 8.164e+02, threshold=5.667e+02, percent-clipped=3.0 +2022-11-15 14:48:55,382 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.04 vs. limit=2.0 +2022-11-15 14:49:28,493 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.4032, 4.0679, 3.8031, 3.7957, 3.6312, 3.1144, 2.2833, 3.6369], + device='cuda:2'), covar=tensor([0.2093, 0.0127, 0.0363, 0.0160, 0.0220, 0.0834, 0.2977, 0.0175], + device='cuda:2'), in_proj_covar=tensor([0.0159, 0.0085, 0.0107, 0.0074, 0.0087, 0.0128, 0.0177, 0.0080], + device='cuda:2'), out_proj_covar=tensor([1.7859e-04, 8.9633e-05, 1.2283e-04, 8.3061e-05, 1.0306e-04, 1.5087e-04, + 1.9577e-04, 8.7043e-05], device='cuda:2') +2022-11-15 14:49:31,300 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.3577, 1.9646, 3.3090, 2.4777, 3.2449, 2.1291, 3.2424, 3.2908], + device='cuda:2'), covar=tensor([0.0088, 0.1069, 0.0197, 0.0688, 0.0142, 0.0671, 0.0266, 0.0184], + device='cuda:2'), in_proj_covar=tensor([0.0053, 0.0121, 0.0068, 0.0100, 0.0057, 0.0097, 0.0075, 0.0062], + device='cuda:2'), out_proj_covar=tensor([7.1997e-05, 1.6191e-04, 9.3858e-05, 1.3260e-04, 7.8868e-05, 1.3083e-04, + 1.0281e-04, 8.6140e-05], device='cuda:2') +2022-11-15 14:49:33,290 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.7854, 1.6450, 0.6737, 1.5043, 0.7652, 1.0192, 1.4926, 1.0517], + device='cuda:2'), covar=tensor([0.0317, 0.0138, 0.0214, 0.0217, 0.0501, 0.0753, 0.0240, 0.0291], + device='cuda:2'), in_proj_covar=tensor([0.0022, 0.0018, 0.0020, 0.0023, 0.0020, 0.0016, 0.0019, 0.0020], + device='cuda:2'), out_proj_covar=tensor([3.1743e-05, 2.3555e-05, 2.5883e-05, 3.2128e-05, 3.0516e-05, 2.5151e-05, + 2.6853e-05, 2.9778e-05], device='cuda:2') +2022-11-15 14:49:34,920 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.97 vs. limit=2.0 +2022-11-15 14:49:39,602 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=8265.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:49:41,192 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.02 vs. limit=2.0 +2022-11-15 14:49:44,958 INFO [train.py:876] (2/4) Epoch 2, batch 1000, loss[loss=0.1923, simple_loss=0.1842, pruned_loss=0.1002, over 5726.00 frames. ], tot_loss[loss=0.2667, simple_loss=0.2326, pruned_loss=0.1504, over 1080140.06 frames. ], batch size: 11, lr: 3.53e-02, grad_scale: 16.0 +2022-11-15 14:49:50,698 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=8280.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:49:58,476 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.04 vs. limit=5.0 +2022-11-15 14:50:07,378 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.298e+02 2.271e+02 2.772e+02 3.875e+02 7.231e+02, threshold=5.545e+02, percent-clipped=6.0 +2022-11-15 14:50:22,811 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.0653, 0.9211, 1.0225, 0.8720, 1.1652, 1.1246, 0.9355, 1.3051], + device='cuda:2'), covar=tensor([0.0396, 0.0338, 0.0274, 0.0497, 0.2711, 0.0279, 0.0482, 0.0233], + device='cuda:2'), in_proj_covar=tensor([0.0014, 0.0015, 0.0014, 0.0019, 0.0015, 0.0014, 0.0016, 0.0014], + device='cuda:2'), out_proj_covar=tensor([2.2068e-05, 2.3220e-05, 2.3593e-05, 3.3822e-05, 2.3846e-05, 2.2030e-05, + 2.5452e-05, 2.1990e-05], device='cuda:2') +2022-11-15 14:50:23,522 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=8326.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:50:24,723 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=8328.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:50:33,414 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.11 vs. limit=2.0 +2022-11-15 14:50:57,476 INFO [train.py:876] (2/4) Epoch 2, batch 1100, loss[loss=0.1859, simple_loss=0.1784, pruned_loss=0.09673, over 5372.00 frames. ], tot_loss[loss=0.2657, simple_loss=0.2325, pruned_loss=0.1494, over 1088703.08 frames. ], batch size: 9, lr: 3.51e-02, grad_scale: 16.0 +2022-11-15 14:51:16,581 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=8399.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:51:19,490 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.385e+02 2.261e+02 2.575e+02 3.836e+02 7.235e+02, threshold=5.150e+02, percent-clipped=6.0 +2022-11-15 14:51:21,676 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.09 vs. limit=2.0 +2022-11-15 14:51:36,323 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.4496, 0.9585, 0.5300, 1.0528, 1.0052, 0.9872, 0.5673, 0.9028], + device='cuda:2'), covar=tensor([0.0115, 0.0048, 0.0067, 0.0062, 0.0118, 0.0068, 0.0109, 0.0092], + device='cuda:2'), in_proj_covar=tensor([0.0022, 0.0020, 0.0018, 0.0019, 0.0021, 0.0019, 0.0021, 0.0020], + device='cuda:2'), out_proj_covar=tensor([2.6691e-05, 2.4837e-05, 2.6548e-05, 2.3922e-05, 2.5475e-05, 2.4860e-05, + 3.4631e-05, 2.5915e-05], device='cuda:2') +2022-11-15 14:51:51,358 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=8447.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:51:58,973 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=8458.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:52:03,146 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.8814, 1.4015, 1.7206, 1.6314, 1.9002, 1.0958, 1.6169, 1.8559], + device='cuda:2'), covar=tensor([0.0062, 0.0299, 0.0096, 0.0156, 0.0058, 0.0420, 0.0094, 0.0066], + device='cuda:2'), in_proj_covar=tensor([0.0053, 0.0120, 0.0071, 0.0103, 0.0060, 0.0099, 0.0076, 0.0063], + device='cuda:2'), out_proj_covar=tensor([7.4725e-05, 1.6307e-04, 9.8498e-05, 1.3695e-04, 8.3930e-05, 1.3416e-04, + 1.0576e-04, 8.9495e-05], device='cuda:2') +2022-11-15 14:52:09,820 INFO [train.py:876] (2/4) Epoch 2, batch 1200, loss[loss=0.2222, simple_loss=0.2088, pruned_loss=0.1178, over 5543.00 frames. ], tot_loss[loss=0.2672, simple_loss=0.2335, pruned_loss=0.1504, over 1086269.72 frames. ], batch size: 13, lr: 3.50e-02, grad_scale: 16.0 +2022-11-15 14:52:31,193 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.191e+02 2.113e+02 2.806e+02 3.522e+02 6.703e+02, threshold=5.613e+02, percent-clipped=5.0 +2022-11-15 14:52:33,374 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=8506.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:52:43,716 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.93 vs. limit=2.0 +2022-11-15 14:53:03,481 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.94 vs. limit=2.0 +2022-11-15 14:53:20,999 INFO [train.py:876] (2/4) Epoch 2, batch 1300, loss[loss=0.2348, simple_loss=0.2189, pruned_loss=0.1253, over 5731.00 frames. ], tot_loss[loss=0.2652, simple_loss=0.2327, pruned_loss=0.1489, over 1090358.96 frames. ], batch size: 14, lr: 3.48e-02, grad_scale: 16.0 +2022-11-15 14:53:34,157 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.6111, 4.5482, 3.4255, 1.8503, 4.2079, 2.1218, 3.8645, 2.9927], + device='cuda:2'), covar=tensor([0.0542, 0.0093, 0.0183, 0.2295, 0.0119, 0.1158, 0.0154, 0.1052], + device='cuda:2'), in_proj_covar=tensor([0.0076, 0.0049, 0.0041, 0.0086, 0.0046, 0.0069, 0.0038, 0.0075], + device='cuda:2'), out_proj_covar=tensor([1.4914e-04, 9.3745e-05, 8.4751e-05, 1.6132e-04, 8.5657e-05, 1.3354e-04, + 8.3803e-05, 1.4720e-04], device='cuda:2') +2022-11-15 14:53:42,885 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.205e+02 2.077e+02 2.771e+02 3.615e+02 8.724e+02, threshold=5.542e+02, percent-clipped=7.0 +2022-11-15 14:53:56,863 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=8621.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:54:10,830 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.01 vs. limit=2.0 +2022-11-15 14:54:23,183 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.11 vs. limit=2.0 +2022-11-15 14:54:35,338 INFO [train.py:876] (2/4) Epoch 2, batch 1400, loss[loss=0.3115, simple_loss=0.262, pruned_loss=0.1805, over 5562.00 frames. ], tot_loss[loss=0.2617, simple_loss=0.23, pruned_loss=0.1467, over 1090816.61 frames. ], batch size: 40, lr: 3.46e-02, grad_scale: 32.0 +2022-11-15 14:54:56,929 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.304e+02 2.372e+02 3.042e+02 3.801e+02 7.959e+02, threshold=6.083e+02, percent-clipped=7.0 +2022-11-15 14:54:59,147 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.9806, 2.1384, 2.4005, 2.2180, 1.9203, 3.0348, 2.2410, 1.8037], + device='cuda:2'), covar=tensor([0.0244, 0.0107, 0.0071, 0.0177, 0.0268, 0.0052, 0.0170, 0.0106], + device='cuda:2'), in_proj_covar=tensor([0.0075, 0.0043, 0.0043, 0.0044, 0.0075, 0.0045, 0.0059, 0.0041], + device='cuda:2'), out_proj_covar=tensor([9.8248e-05, 5.7979e-05, 5.5760e-05, 6.4051e-05, 1.0636e-04, 5.6284e-05, + 7.8937e-05, 5.2471e-05], device='cuda:2') +2022-11-15 14:55:37,194 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=8760.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:55:46,196 INFO [train.py:876] (2/4) Epoch 2, batch 1500, loss[loss=0.3016, simple_loss=0.2557, pruned_loss=0.1737, over 5674.00 frames. ], tot_loss[loss=0.2626, simple_loss=0.2305, pruned_loss=0.1474, over 1082944.95 frames. ], batch size: 36, lr: 3.45e-02, grad_scale: 32.0 +2022-11-15 14:56:08,247 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.328e+02 2.321e+02 2.844e+02 3.403e+02 6.170e+02, threshold=5.688e+02, percent-clipped=1.0 +2022-11-15 14:56:20,356 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.2200, 2.5794, 2.8520, 3.8415, 4.1031, 3.2631, 2.9634, 3.2778], + device='cuda:2'), covar=tensor([0.0058, 0.1463, 0.1160, 0.0392, 0.0088, 0.1066, 0.1180, 0.0097], + device='cuda:2'), in_proj_covar=tensor([0.0076, 0.0157, 0.0159, 0.0084, 0.0095, 0.0160, 0.0164, 0.0082], + device='cuda:2'), out_proj_covar=tensor([9.2997e-05, 1.9283e-04, 1.9304e-04, 1.0976e-04, 1.1223e-04, 1.9981e-04, + 2.0052e-04, 9.8213e-05], device='cuda:2') +2022-11-15 14:56:20,984 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=8821.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:56:42,952 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=6.86 vs. limit=5.0 +2022-11-15 14:56:57,533 INFO [train.py:876] (2/4) Epoch 2, batch 1600, loss[loss=0.2925, simple_loss=0.2564, pruned_loss=0.1643, over 5591.00 frames. ], tot_loss[loss=0.2613, simple_loss=0.2298, pruned_loss=0.1464, over 1086897.81 frames. ], batch size: 22, lr: 3.44e-02, grad_scale: 16.0 +2022-11-15 14:57:19,295 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.565e+02 2.087e+02 2.971e+02 3.839e+02 7.053e+02, threshold=5.941e+02, percent-clipped=2.0 +2022-11-15 14:57:31,971 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=8921.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:57:36,143 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=8927.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:57:43,144 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.8543, 1.1979, 1.3904, 1.8739, 0.4798, 1.3582, 1.2977, 1.2026], + device='cuda:2'), covar=tensor([0.0251, 0.0725, 0.0169, 0.0177, 0.0562, 0.0325, 0.0242, 0.0317], + device='cuda:2'), in_proj_covar=tensor([0.0022, 0.0020, 0.0020, 0.0022, 0.0021, 0.0017, 0.0019, 0.0019], + device='cuda:2'), out_proj_covar=tensor([3.3018e-05, 2.7397e-05, 2.6857e-05, 3.1611e-05, 3.3240e-05, 2.8527e-05, + 2.7427e-05, 2.8640e-05], device='cuda:2') +2022-11-15 14:57:57,859 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.7576, 1.1698, 1.1771, 1.7523, 0.6403, 1.3976, 1.1711, 1.3315], + device='cuda:2'), covar=tensor([0.0222, 0.0598, 0.0192, 0.0236, 0.0796, 0.0440, 0.0193, 0.0354], + device='cuda:2'), in_proj_covar=tensor([0.0022, 0.0020, 0.0021, 0.0022, 0.0021, 0.0017, 0.0020, 0.0019], + device='cuda:2'), out_proj_covar=tensor([3.2870e-05, 2.7882e-05, 2.7222e-05, 3.2065e-05, 3.3692e-05, 2.8349e-05, + 2.7756e-05, 2.8885e-05], device='cuda:2') +2022-11-15 14:57:58,486 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.5042, 3.7736, 3.8187, 3.9470, 3.5261, 2.9078, 4.2891, 3.6571], + device='cuda:2'), covar=tensor([0.0912, 0.0887, 0.0533, 0.0625, 0.0543, 0.0539, 0.0710, 0.0605], + device='cuda:2'), in_proj_covar=tensor([0.0047, 0.0071, 0.0059, 0.0066, 0.0043, 0.0040, 0.0066, 0.0053], + device='cuda:2'), out_proj_covar=tensor([8.7588e-05, 1.3412e-04, 1.1102e-04, 1.2201e-04, 8.3606e-05, 7.4673e-05, + 1.4003e-04, 1.0002e-04], device='cuda:2') +2022-11-15 14:58:05,412 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=8969.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:58:08,102 INFO [train.py:876] (2/4) Epoch 2, batch 1700, loss[loss=0.2835, simple_loss=0.2285, pruned_loss=0.1693, over 5448.00 frames. ], tot_loss[loss=0.266, simple_loss=0.2328, pruned_loss=0.1496, over 1086086.71 frames. ], batch size: 53, lr: 3.42e-02, grad_scale: 16.0 +2022-11-15 14:58:18,149 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=8986.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:58:19,535 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=8988.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:58:23,685 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.94 vs. limit=2.0 +2022-11-15 14:58:29,905 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.2942, 4.7254, 4.8004, 4.6093, 3.7709, 3.6478, 5.0249, 4.3292], + device='cuda:2'), covar=tensor([0.0592, 0.0565, 0.0384, 0.0566, 0.0773, 0.0343, 0.0743, 0.0455], + device='cuda:2'), in_proj_covar=tensor([0.0048, 0.0072, 0.0060, 0.0068, 0.0044, 0.0041, 0.0070, 0.0055], + device='cuda:2'), out_proj_covar=tensor([8.9478e-05, 1.3588e-04, 1.1406e-04, 1.2702e-04, 8.5407e-05, 7.6773e-05, + 1.4740e-04, 1.0341e-04], device='cuda:2') +2022-11-15 14:58:30,481 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.407e+02 2.261e+02 2.879e+02 3.540e+02 8.492e+02, threshold=5.758e+02, percent-clipped=3.0 +2022-11-15 14:58:33,190 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.03 vs. limit=2.0 +2022-11-15 14:58:37,291 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.98 vs. limit=2.0 +2022-11-15 14:58:37,617 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=9013.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:59:01,668 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=9047.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:59:02,988 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.8858, 1.5863, 2.3271, 3.5152, 3.1205, 2.5607, 2.6470, 3.2480], + device='cuda:2'), covar=tensor([0.0142, 0.1552, 0.1120, 0.0272, 0.0181, 0.0919, 0.0952, 0.0104], + device='cuda:2'), in_proj_covar=tensor([0.0076, 0.0156, 0.0162, 0.0083, 0.0094, 0.0161, 0.0161, 0.0081], + device='cuda:2'), out_proj_covar=tensor([9.5878e-05, 1.9225e-04, 1.9614e-04, 1.0899e-04, 1.1425e-04, 2.0331e-04, + 1.9969e-04, 9.7054e-05], device='cuda:2') +2022-11-15 14:59:20,368 INFO [train.py:876] (2/4) Epoch 2, batch 1800, loss[loss=0.09646, simple_loss=0.09296, pruned_loss=0.04998, over 4464.00 frames. ], tot_loss[loss=0.2657, simple_loss=0.2326, pruned_loss=0.1494, over 1085625.07 frames. ], batch size: 5, lr: 3.41e-02, grad_scale: 16.0 +2022-11-15 14:59:21,218 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=9074.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 14:59:38,668 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.99 vs. limit=2.0 +2022-11-15 14:59:42,159 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.031e+02 2.362e+02 3.022e+02 3.932e+02 1.031e+03, threshold=6.044e+02, percent-clipped=5.0 +2022-11-15 14:59:50,853 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=9116.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:00:18,041 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.0969, 2.0533, 1.4779, 1.6095, 1.4030, 1.3778, 0.7081, 0.9841], + device='cuda:2'), covar=tensor([0.0188, 0.0070, 0.0181, 0.0092, 0.0232, 0.0281, 0.0190, 0.0535], + device='cuda:2'), in_proj_covar=tensor([0.0023, 0.0021, 0.0020, 0.0022, 0.0022, 0.0023, 0.0022, 0.0019], + device='cuda:2'), out_proj_covar=tensor([2.9456e-05, 2.6422e-05, 2.9539e-05, 2.6548e-05, 2.7614e-05, 2.9541e-05, + 3.6836e-05, 2.5943e-05], device='cuda:2') +2022-11-15 15:00:31,253 INFO [train.py:876] (2/4) Epoch 2, batch 1900, loss[loss=0.2736, simple_loss=0.2463, pruned_loss=0.1505, over 5694.00 frames. ], tot_loss[loss=0.2628, simple_loss=0.2306, pruned_loss=0.1474, over 1084052.28 frames. ], batch size: 19, lr: 3.39e-02, grad_scale: 16.0 +2022-11-15 15:00:53,890 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.186e+02 2.307e+02 3.025e+02 3.862e+02 6.126e+02, threshold=6.049e+02, percent-clipped=1.0 +2022-11-15 15:01:18,279 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.1310, 1.4574, 1.6002, 2.0056, 1.6832, 1.4011, 1.7249, 1.8314], + device='cuda:2'), covar=tensor([0.0356, 0.0288, 0.0413, 0.0374, 0.0416, 0.0453, 0.0354, 0.0303], + device='cuda:2'), in_proj_covar=tensor([0.0032, 0.0034, 0.0039, 0.0030, 0.0041, 0.0034, 0.0040, 0.0026], + device='cuda:2'), out_proj_covar=tensor([4.9768e-05, 5.1693e-05, 6.5182e-05, 4.7826e-05, 7.1664e-05, 5.9462e-05, + 6.5877e-05, 4.3735e-05], device='cuda:2') +2022-11-15 15:01:20,272 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.6844, 2.7496, 2.9680, 2.9778, 2.5196, 2.2184, 3.1731, 2.6999], + device='cuda:2'), covar=tensor([0.0641, 0.0982, 0.0514, 0.0722, 0.0817, 0.0622, 0.0935, 0.0656], + device='cuda:2'), in_proj_covar=tensor([0.0049, 0.0070, 0.0058, 0.0067, 0.0042, 0.0040, 0.0071, 0.0053], + device='cuda:2'), out_proj_covar=tensor([9.3108e-05, 1.3288e-04, 1.0920e-04, 1.2553e-04, 8.3824e-05, 7.6241e-05, + 1.5047e-04, 9.9769e-05], device='cuda:2') +2022-11-15 15:01:42,871 INFO [train.py:876] (2/4) Epoch 2, batch 2000, loss[loss=0.249, simple_loss=0.2456, pruned_loss=0.1262, over 5542.00 frames. ], tot_loss[loss=0.2629, simple_loss=0.23, pruned_loss=0.1478, over 1081363.89 frames. ], batch size: 17, lr: 3.38e-02, grad_scale: 16.0 +2022-11-15 15:01:43,736 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.8402, 3.3861, 2.9644, 3.3281, 2.6116, 2.4569, 1.9478, 2.9890], + device='cuda:2'), covar=tensor([0.1725, 0.0166, 0.0439, 0.0159, 0.0463, 0.1074, 0.2353, 0.0181], + device='cuda:2'), in_proj_covar=tensor([0.0156, 0.0085, 0.0110, 0.0076, 0.0093, 0.0136, 0.0173, 0.0083], + device='cuda:2'), out_proj_covar=tensor([1.7931e-04, 9.4059e-05, 1.3067e-04, 8.9256e-05, 1.1246e-04, 1.6297e-04, + 1.9569e-04, 9.3080e-05], device='cuda:2') +2022-11-15 15:01:50,676 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=9283.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:02:05,762 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.305e+02 2.275e+02 2.942e+02 3.786e+02 7.709e+02, threshold=5.884e+02, percent-clipped=5.0 +2022-11-15 15:02:14,431 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.15 vs. limit=5.0 +2022-11-15 15:02:32,856 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=9342.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:02:46,613 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.7852, 4.2814, 3.9764, 4.4237, 4.4201, 3.8094, 3.7867, 3.4723], + device='cuda:2'), covar=tensor([0.0366, 0.0393, 0.0319, 0.0293, 0.0214, 0.0331, 0.0281, 0.0335], + device='cuda:2'), in_proj_covar=tensor([0.0056, 0.0050, 0.0068, 0.0052, 0.0072, 0.0066, 0.0056, 0.0048], + device='cuda:2'), out_proj_covar=tensor([9.8317e-05, 1.0129e-04, 1.1346e-04, 9.6398e-05, 1.3798e-04, 1.0974e-04, + 9.8012e-05, 8.0875e-05], device='cuda:2') +2022-11-15 15:02:50,457 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.09 vs. limit=2.0 +2022-11-15 15:02:52,309 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=9369.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:02:54,973 INFO [train.py:876] (2/4) Epoch 2, batch 2100, loss[loss=0.2964, simple_loss=0.2577, pruned_loss=0.1675, over 5587.00 frames. ], tot_loss[loss=0.2609, simple_loss=0.2289, pruned_loss=0.1465, over 1082411.00 frames. ], batch size: 43, lr: 3.36e-02, grad_scale: 16.0 +2022-11-15 15:03:17,093 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.205e+02 2.437e+02 2.901e+02 3.645e+02 9.793e+02, threshold=5.801e+02, percent-clipped=2.0 +2022-11-15 15:03:24,894 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=9414.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:03:26,167 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=9416.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:03:30,676 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=6.78 vs. limit=5.0 +2022-11-15 15:04:00,502 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=9464.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:04:06,702 INFO [train.py:876] (2/4) Epoch 2, batch 2200, loss[loss=0.1783, simple_loss=0.1824, pruned_loss=0.08713, over 5708.00 frames. ], tot_loss[loss=0.2593, simple_loss=0.2284, pruned_loss=0.1451, over 1082978.22 frames. ], batch size: 12, lr: 3.35e-02, grad_scale: 16.0 +2022-11-15 15:04:08,204 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=9475.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:04:19,262 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.99 vs. limit=2.0 +2022-11-15 15:04:28,437 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.967e+01 2.258e+02 2.836e+02 4.027e+02 8.312e+02, threshold=5.673e+02, percent-clipped=7.0 +2022-11-15 15:04:53,218 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.47 vs. limit=2.0 +2022-11-15 15:05:02,611 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.6023, 3.9883, 3.9517, 4.0502, 3.7103, 3.8104, 1.6524, 3.3757], + device='cuda:2'), covar=tensor([0.0429, 0.0233, 0.0204, 0.0179, 0.0295, 0.0268, 0.2596, 0.0435], + device='cuda:2'), in_proj_covar=tensor([0.0049, 0.0044, 0.0040, 0.0031, 0.0044, 0.0036, 0.0080, 0.0048], + device='cuda:2'), out_proj_covar=tensor([8.9654e-05, 8.0875e-05, 7.1814e-05, 5.6180e-05, 7.9139e-05, 6.3778e-05, + 1.3721e-04, 8.8151e-05], device='cuda:2') +2022-11-15 15:05:08,750 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.3145, 1.4535, 1.4395, 1.5671, 1.2884, 1.6889, 1.1724, 1.5785], + device='cuda:2'), covar=tensor([0.0058, 0.0134, 0.0194, 0.0061, 0.0069, 0.0072, 0.0136, 0.0068], + device='cuda:2'), in_proj_covar=tensor([0.0016, 0.0016, 0.0017, 0.0018, 0.0017, 0.0016, 0.0018, 0.0016], + device='cuda:2'), out_proj_covar=tensor([2.1566e-05, 2.2566e-05, 2.5135e-05, 2.3844e-05, 2.4476e-05, 2.1204e-05, + 2.5251e-05, 2.0654e-05], device='cuda:2') +2022-11-15 15:05:15,164 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0754, 3.0796, 2.9119, 2.6433, 2.3761, 3.5341, 2.9021, 2.8512], + device='cuda:2'), covar=tensor([0.0376, 0.0179, 0.0111, 0.0221, 0.0346, 0.0069, 0.0193, 0.0075], + device='cuda:2'), in_proj_covar=tensor([0.0080, 0.0045, 0.0048, 0.0048, 0.0082, 0.0048, 0.0064, 0.0043], + device='cuda:2'), out_proj_covar=tensor([1.1071e-04, 6.2524e-05, 6.4330e-05, 7.3200e-05, 1.2199e-04, 6.2038e-05, + 8.9976e-05, 5.8373e-05], device='cuda:2') +2022-11-15 15:05:18,108 INFO [train.py:876] (2/4) Epoch 2, batch 2300, loss[loss=0.3253, simple_loss=0.2708, pruned_loss=0.1899, over 5241.00 frames. ], tot_loss[loss=0.2609, simple_loss=0.2293, pruned_loss=0.1462, over 1084899.32 frames. ], batch size: 79, lr: 3.34e-02, grad_scale: 16.0 +2022-11-15 15:05:25,120 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=9583.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:05:25,871 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=9584.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:05:39,845 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.198e+02 2.289e+02 3.013e+02 4.083e+02 8.581e+02, threshold=6.026e+02, percent-clipped=8.0 +2022-11-15 15:05:47,338 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.33 vs. limit=5.0 +2022-11-15 15:05:58,945 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0107, 3.3103, 3.0211, 3.4206, 2.8462, 2.5820, 2.1890, 3.1612], + device='cuda:2'), covar=tensor([0.1920, 0.0255, 0.0484, 0.0220, 0.0498, 0.0988, 0.2586, 0.0202], + device='cuda:2'), in_proj_covar=tensor([0.0159, 0.0092, 0.0115, 0.0078, 0.0098, 0.0139, 0.0177, 0.0082], + device='cuda:2'), out_proj_covar=tensor([1.8453e-04, 1.0350e-04, 1.3742e-04, 9.2843e-05, 1.1972e-04, 1.6580e-04, + 2.0112e-04, 9.2720e-05], device='cuda:2') +2022-11-15 15:05:59,631 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=9631.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:06:07,379 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=9642.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:06:09,481 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=9645.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:06:18,537 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=9658.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:06:26,549 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=9669.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:06:29,227 INFO [train.py:876] (2/4) Epoch 2, batch 2400, loss[loss=0.3553, simple_loss=0.2713, pruned_loss=0.2197, over 3044.00 frames. ], tot_loss[loss=0.2596, simple_loss=0.2289, pruned_loss=0.1451, over 1088146.18 frames. ], batch size: 284, lr: 3.32e-02, grad_scale: 16.0 +2022-11-15 15:06:37,084 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.1559, 1.8733, 1.0721, 1.7443, 0.7485, 1.5730, 1.2788, 0.9344], + device='cuda:2'), covar=tensor([0.0356, 0.0161, 0.0223, 0.0217, 0.0719, 0.0321, 0.0304, 0.0351], + device='cuda:2'), in_proj_covar=tensor([0.0024, 0.0022, 0.0023, 0.0025, 0.0023, 0.0017, 0.0023, 0.0022], + device='cuda:2'), out_proj_covar=tensor([3.7478e-05, 3.1805e-05, 3.1140e-05, 3.6119e-05, 3.8248e-05, 2.8623e-05, + 3.3331e-05, 3.2789e-05], device='cuda:2') +2022-11-15 15:06:41,903 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=9690.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:06:51,271 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.316e+02 2.170e+02 2.571e+02 3.474e+02 5.585e+02, threshold=5.143e+02, percent-clipped=0.0 +2022-11-15 15:07:00,765 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=9717.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:07:02,280 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=9719.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:07:19,796 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=9743.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:07:23,607 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.72 vs. limit=2.0 +2022-11-15 15:07:26,379 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.09 vs. limit=2.0 +2022-11-15 15:07:27,467 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.6748, 1.9076, 1.5181, 2.1950, 1.2878, 1.9152, 1.7521, 1.4755], + device='cuda:2'), covar=tensor([0.0094, 0.0038, 0.0053, 0.0041, 0.0156, 0.0041, 0.0081, 0.0047], + device='cuda:2'), in_proj_covar=tensor([0.0086, 0.0046, 0.0049, 0.0050, 0.0083, 0.0049, 0.0067, 0.0045], + device='cuda:2'), out_proj_covar=tensor([1.1844e-04, 6.5652e-05, 6.6510e-05, 7.6300e-05, 1.2503e-04, 6.3406e-05, + 9.5318e-05, 6.1028e-05], device='cuda:2') +2022-11-15 15:07:37,252 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.73 vs. limit=5.0 +2022-11-15 15:07:38,166 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=9770.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:07:40,526 INFO [train.py:876] (2/4) Epoch 2, batch 2500, loss[loss=0.2247, simple_loss=0.2023, pruned_loss=0.1236, over 5745.00 frames. ], tot_loss[loss=0.2593, simple_loss=0.2285, pruned_loss=0.145, over 1088204.31 frames. ], batch size: 15, lr: 3.31e-02, grad_scale: 16.0 +2022-11-15 15:08:03,222 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.240e+02 2.186e+02 2.866e+02 3.924e+02 6.368e+02, threshold=5.732e+02, percent-clipped=5.0 +2022-11-15 15:08:03,455 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=9804.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:08:05,091 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.02 vs. limit=2.0 +2022-11-15 15:08:52,034 INFO [train.py:876] (2/4) Epoch 2, batch 2600, loss[loss=0.2439, simple_loss=0.2136, pruned_loss=0.1371, over 5157.00 frames. ], tot_loss[loss=0.2578, simple_loss=0.2282, pruned_loss=0.1437, over 1086922.66 frames. ], batch size: 91, lr: 3.30e-02, grad_scale: 16.0 +2022-11-15 15:08:56,813 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.9760, 2.5855, 2.3106, 2.6465, 1.9527, 2.9794, 2.4666, 2.8351], + device='cuda:2'), covar=tensor([0.0230, 0.0075, 0.0079, 0.0175, 0.0233, 0.0042, 0.0128, 0.0037], + device='cuda:2'), in_proj_covar=tensor([0.0084, 0.0045, 0.0048, 0.0049, 0.0083, 0.0048, 0.0065, 0.0044], + device='cuda:2'), out_proj_covar=tensor([1.1740e-04, 6.4541e-05, 6.6390e-05, 7.5617e-05, 1.2501e-04, 6.3028e-05, + 9.3159e-05, 5.9547e-05], device='cuda:2') +2022-11-15 15:09:14,802 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.239e+02 2.199e+02 2.978e+02 3.710e+02 9.077e+02, threshold=5.957e+02, percent-clipped=5.0 +2022-11-15 15:09:22,242 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.89 vs. limit=2.0 +2022-11-15 15:09:32,904 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.1620, 1.7959, 1.2628, 1.8620, 0.7086, 1.5484, 1.1817, 1.0498], + device='cuda:2'), covar=tensor([0.0346, 0.0308, 0.0203, 0.0206, 0.0836, 0.1119, 0.0443, 0.0612], + device='cuda:2'), in_proj_covar=tensor([0.0026, 0.0024, 0.0024, 0.0026, 0.0025, 0.0019, 0.0024, 0.0025], + device='cuda:2'), out_proj_covar=tensor([4.0139e-05, 3.3881e-05, 3.1940e-05, 3.8039e-05, 4.0756e-05, 3.1801e-05, + 3.5092e-05, 3.6670e-05], device='cuda:2') +2022-11-15 15:09:40,250 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=9940.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:10:03,769 INFO [train.py:876] (2/4) Epoch 2, batch 2700, loss[loss=0.2802, simple_loss=0.2503, pruned_loss=0.1551, over 5642.00 frames. ], tot_loss[loss=0.2596, simple_loss=0.2289, pruned_loss=0.1452, over 1085607.80 frames. ], batch size: 32, lr: 3.28e-02, grad_scale: 16.0 +2022-11-15 15:10:29,554 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.282e+02 2.293e+02 2.993e+02 4.046e+02 1.330e+03, threshold=5.986e+02, percent-clipped=8.0 +2022-11-15 15:10:37,238 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=10014.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:10:54,444 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=10039.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:10:58,232 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1355, 2.3344, 2.1702, 2.2496, 2.1489, 2.2542, 1.2754, 2.1542], + device='cuda:2'), covar=tensor([0.0295, 0.0156, 0.0206, 0.0112, 0.0231, 0.0172, 0.1458, 0.0235], + device='cuda:2'), in_proj_covar=tensor([0.0054, 0.0045, 0.0045, 0.0035, 0.0049, 0.0038, 0.0087, 0.0051], + device='cuda:2'), out_proj_covar=tensor([1.0019e-04, 8.4107e-05, 8.1224e-05, 6.3530e-05, 8.9129e-05, 7.0205e-05, + 1.5108e-04, 9.4140e-05], device='cuda:2') +2022-11-15 15:10:59,053 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.90 vs. limit=2.0 +2022-11-15 15:11:10,953 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=10062.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 15:11:16,386 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=10070.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:11:18,620 INFO [train.py:876] (2/4) Epoch 2, batch 2800, loss[loss=0.1729, simple_loss=0.1611, pruned_loss=0.09241, over 5512.00 frames. ], tot_loss[loss=0.2602, simple_loss=0.2289, pruned_loss=0.1458, over 1081857.77 frames. ], batch size: 12, lr: 3.27e-02, grad_scale: 16.0 +2022-11-15 15:11:36,438 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=10099.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:11:37,190 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=10100.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 15:11:38,248 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.01 vs. limit=2.0 +2022-11-15 15:11:39,751 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.304e+02 2.201e+02 2.831e+02 3.552e+02 8.014e+02, threshold=5.662e+02, percent-clipped=2.0 +2022-11-15 15:11:49,993 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=10118.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:11:53,005 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.09 vs. limit=2.0 +2022-11-15 15:11:53,488 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=10123.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 15:12:04,542 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.57 vs. limit=5.0 +2022-11-15 15:12:29,726 INFO [train.py:876] (2/4) Epoch 2, batch 2900, loss[loss=0.2769, simple_loss=0.2362, pruned_loss=0.1588, over 5061.00 frames. ], tot_loss[loss=0.2556, simple_loss=0.2264, pruned_loss=0.1424, over 1084338.64 frames. ], batch size: 91, lr: 3.26e-02, grad_scale: 16.0 +2022-11-15 15:12:52,051 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.103e+02 2.141e+02 2.737e+02 3.549e+02 7.365e+02, threshold=5.475e+02, percent-clipped=2.0 +2022-11-15 15:13:06,372 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.9258, 3.1293, 2.7681, 2.9729, 2.8223, 2.8756, 1.3406, 2.9128], + device='cuda:2'), covar=tensor([0.0328, 0.0206, 0.0268, 0.0162, 0.0281, 0.0258, 0.1829, 0.0261], + device='cuda:2'), in_proj_covar=tensor([0.0054, 0.0044, 0.0045, 0.0035, 0.0049, 0.0038, 0.0084, 0.0051], + device='cuda:2'), out_proj_covar=tensor([9.9183e-05, 8.1339e-05, 8.1399e-05, 6.3943e-05, 8.9562e-05, 6.9932e-05, + 1.4629e-04, 9.5363e-05], device='cuda:2') +2022-11-15 15:13:18,134 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=10240.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:13:18,278 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.94 vs. limit=2.0 +2022-11-15 15:13:41,344 INFO [train.py:876] (2/4) Epoch 2, batch 3000, loss[loss=0.2994, simple_loss=0.2573, pruned_loss=0.1708, over 5717.00 frames. ], tot_loss[loss=0.2549, simple_loss=0.2261, pruned_loss=0.1418, over 1085341.18 frames. ], batch size: 27, lr: 3.24e-02, grad_scale: 16.0 +2022-11-15 15:13:41,345 INFO [train.py:899] (2/4) Computing validation loss +2022-11-15 15:14:00,265 INFO [train.py:908] (2/4) Epoch 2, validation: loss=0.2049, simple_loss=0.215, pruned_loss=0.09736, over 1530663.00 frames. +2022-11-15 15:14:00,266 INFO [train.py:909] (2/4) Maximum memory allocated so far is 4573MB +2022-11-15 15:14:10,303 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.6546, 1.6248, 2.4577, 1.8727, 2.6442, 1.7806, 2.3537, 2.6692], + device='cuda:2'), covar=tensor([0.0050, 0.0462, 0.0087, 0.0356, 0.0090, 0.0340, 0.0204, 0.0142], + device='cuda:2'), in_proj_covar=tensor([0.0066, 0.0140, 0.0082, 0.0135, 0.0073, 0.0118, 0.0108, 0.0083], + device='cuda:2'), out_proj_covar=tensor([9.5066e-05, 2.0065e-04, 1.1996e-04, 1.8764e-04, 1.0630e-04, 1.6957e-04, + 1.6077e-04, 1.2154e-04], device='cuda:2') +2022-11-15 15:14:10,872 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=10288.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:14:15,218 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.89 vs. limit=2.0 +2022-11-15 15:14:22,009 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.133e+02 2.226e+02 2.767e+02 3.573e+02 6.449e+02, threshold=5.534e+02, percent-clipped=5.0 +2022-11-15 15:14:28,960 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=10314.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:14:35,926 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.0450, 5.1239, 4.5585, 5.2309, 5.1143, 4.6464, 4.4211, 4.0288], + device='cuda:2'), covar=tensor([0.0249, 0.0268, 0.0324, 0.0144, 0.0290, 0.0226, 0.0256, 0.0357], + device='cuda:2'), in_proj_covar=tensor([0.0058, 0.0056, 0.0077, 0.0057, 0.0074, 0.0071, 0.0059, 0.0053], + device='cuda:2'), out_proj_covar=tensor([1.0611e-04, 1.1321e-04, 1.3385e-04, 1.0928e-04, 1.4339e-04, 1.1936e-04, + 1.0472e-04, 9.3552e-05], device='cuda:2') +2022-11-15 15:14:36,227 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.11 vs. limit=2.0 +2022-11-15 15:14:45,273 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=6.77 vs. limit=5.0 +2022-11-15 15:15:03,068 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=10362.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:15:07,021 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.55 vs. limit=5.0 +2022-11-15 15:15:10,524 INFO [train.py:876] (2/4) Epoch 2, batch 3100, loss[loss=0.2287, simple_loss=0.2073, pruned_loss=0.1251, over 5523.00 frames. ], tot_loss[loss=0.2561, simple_loss=0.2271, pruned_loss=0.1425, over 1088321.22 frames. ], batch size: 13, lr: 3.23e-02, grad_scale: 16.0 +2022-11-15 15:15:26,628 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=10395.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 15:15:26,639 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.7768, 3.0003, 2.6025, 2.9229, 2.7287, 2.8130, 1.3402, 2.7341], + device='cuda:2'), covar=tensor([0.0398, 0.0329, 0.0414, 0.0170, 0.0400, 0.0332, 0.2339, 0.0412], + device='cuda:2'), in_proj_covar=tensor([0.0053, 0.0045, 0.0044, 0.0035, 0.0050, 0.0039, 0.0086, 0.0052], + device='cuda:2'), out_proj_covar=tensor([9.8436e-05, 8.4494e-05, 8.1317e-05, 6.3480e-05, 9.0666e-05, 7.1714e-05, + 1.4976e-04, 9.6810e-05], device='cuda:2') +2022-11-15 15:15:29,527 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=10399.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:15:33,064 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.883e+01 2.180e+02 2.990e+02 3.781e+02 9.963e+02, threshold=5.979e+02, percent-clipped=5.0 +2022-11-15 15:15:36,983 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=10409.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:15:43,105 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=10418.0, num_to_drop=1, layers_to_drop={2} +2022-11-15 15:16:02,917 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.47 vs. limit=5.0 +2022-11-15 15:16:04,085 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=10447.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:16:06,312 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.82 vs. limit=2.0 +2022-11-15 15:16:08,972 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=10454.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:16:20,276 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=10470.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:16:22,102 INFO [train.py:876] (2/4) Epoch 2, batch 3200, loss[loss=0.2318, simple_loss=0.2256, pruned_loss=0.119, over 5741.00 frames. ], tot_loss[loss=0.2557, simple_loss=0.2269, pruned_loss=0.1422, over 1084557.50 frames. ], batch size: 14, lr: 3.22e-02, grad_scale: 16.0 +2022-11-15 15:16:44,407 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.221e+01 2.155e+02 2.904e+02 3.416e+02 7.936e+02, threshold=5.808e+02, percent-clipped=4.0 +2022-11-15 15:16:52,747 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=10515.0, num_to_drop=1, layers_to_drop={2} +2022-11-15 15:17:21,397 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.23 vs. limit=2.0 +2022-11-15 15:17:33,911 INFO [train.py:876] (2/4) Epoch 2, batch 3300, loss[loss=0.2122, simple_loss=0.2047, pruned_loss=0.1098, over 5554.00 frames. ], tot_loss[loss=0.253, simple_loss=0.2249, pruned_loss=0.1405, over 1085920.02 frames. ], batch size: 13, lr: 3.21e-02, grad_scale: 16.0 +2022-11-15 15:17:55,744 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.063e+02 1.974e+02 2.609e+02 3.131e+02 6.226e+02, threshold=5.219e+02, percent-clipped=2.0 +2022-11-15 15:18:02,164 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.9936, 3.9880, 3.7979, 4.0959, 3.5036, 2.8144, 4.4467, 3.6786], + device='cuda:2'), covar=tensor([0.0304, 0.0751, 0.0375, 0.0516, 0.0489, 0.0393, 0.0552, 0.0423], + device='cuda:2'), in_proj_covar=tensor([0.0046, 0.0068, 0.0056, 0.0067, 0.0043, 0.0040, 0.0068, 0.0052], + device='cuda:2'), out_proj_covar=tensor([9.1023e-05, 1.3663e-04, 1.0855e-04, 1.3149e-04, 8.7474e-05, 7.8659e-05, + 1.5334e-04, 1.0210e-04], device='cuda:2') +2022-11-15 15:18:45,890 INFO [train.py:876] (2/4) Epoch 2, batch 3400, loss[loss=0.236, simple_loss=0.2084, pruned_loss=0.1317, over 5042.00 frames. ], tot_loss[loss=0.2511, simple_loss=0.2236, pruned_loss=0.1393, over 1084104.51 frames. ], batch size: 110, lr: 3.19e-02, grad_scale: 16.0 +2022-11-15 15:19:01,424 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=10695.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:19:06,236 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.4206, 1.2891, 1.6408, 1.5846, 1.4919, 1.9414, 0.9800, 1.3211], + device='cuda:2'), covar=tensor([0.0112, 0.0210, 0.0130, 0.0102, 0.0177, 0.0092, 0.0224, 0.0128], + device='cuda:2'), in_proj_covar=tensor([0.0018, 0.0017, 0.0017, 0.0020, 0.0020, 0.0019, 0.0023, 0.0019], + device='cuda:2'), out_proj_covar=tensor([2.5338e-05, 2.5647e-05, 2.6280e-05, 2.5746e-05, 2.8165e-05, 2.6583e-05, + 3.2887e-05, 2.5014e-05], device='cuda:2') +2022-11-15 15:19:07,529 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.578e+02 2.426e+02 2.941e+02 3.632e+02 1.443e+03, threshold=5.881e+02, percent-clipped=8.0 +2022-11-15 15:19:18,043 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=10718.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 15:19:36,087 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=10743.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:19:36,914 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=10744.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:19:51,855 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=10765.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:19:52,491 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=10766.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 15:19:58,025 INFO [train.py:876] (2/4) Epoch 2, batch 3500, loss[loss=0.2068, simple_loss=0.2011, pruned_loss=0.1063, over 5771.00 frames. ], tot_loss[loss=0.2524, simple_loss=0.2256, pruned_loss=0.1396, over 1085580.18 frames. ], batch size: 16, lr: 3.18e-02, grad_scale: 16.0 +2022-11-15 15:19:58,854 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=10774.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:19:59,818 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.27 vs. limit=2.0 +2022-11-15 15:20:20,009 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.225e+02 2.227e+02 2.667e+02 3.424e+02 6.980e+02, threshold=5.333e+02, percent-clipped=5.0 +2022-11-15 15:20:20,893 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=10805.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:20:24,280 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=10810.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 15:20:42,597 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=10835.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:20:47,450 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.8997, 0.7635, 0.9342, 0.8053, 1.1727, 1.0587, 0.5249, 0.8662], + device='cuda:2'), covar=tensor([0.0100, 0.0094, 0.0095, 0.0052, 0.0075, 0.0073, 0.0146, 0.0065], + device='cuda:2'), in_proj_covar=tensor([0.0022, 0.0020, 0.0021, 0.0020, 0.0019, 0.0020, 0.0022, 0.0020], + device='cuda:2'), out_proj_covar=tensor([2.9339e-05, 2.7810e-05, 3.1143e-05, 2.4353e-05, 2.7634e-05, 2.7713e-05, + 3.6629e-05, 2.9523e-05], device='cuda:2') +2022-11-15 15:21:08,880 INFO [train.py:876] (2/4) Epoch 2, batch 3600, loss[loss=0.2987, simple_loss=0.2492, pruned_loss=0.1741, over 5140.00 frames. ], tot_loss[loss=0.2505, simple_loss=0.224, pruned_loss=0.1385, over 1090376.80 frames. ], batch size: 91, lr: 3.17e-02, grad_scale: 32.0 +2022-11-15 15:21:18,324 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=10885.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 15:21:31,442 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.316e+02 2.250e+02 2.765e+02 3.840e+02 7.288e+02, threshold=5.531e+02, percent-clipped=6.0 +2022-11-15 15:21:43,922 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.9934, 1.1876, 1.1032, 0.7878, 1.3803, 1.0409, 0.7259, 0.8140], + device='cuda:2'), covar=tensor([0.0929, 0.0308, 0.0356, 0.1006, 0.0431, 0.0491, 0.0487, 0.0752], + device='cuda:2'), in_proj_covar=tensor([0.0012, 0.0012, 0.0011, 0.0013, 0.0010, 0.0011, 0.0013, 0.0011], + device='cuda:2'), out_proj_covar=tensor([2.2086e-05, 2.1821e-05, 2.3402e-05, 2.8381e-05, 1.9004e-05, 2.1285e-05, + 2.5299e-05, 2.0644e-05], device='cuda:2') +2022-11-15 15:21:46,759 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.11 vs. limit=2.0 +2022-11-15 15:21:49,507 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.60 vs. limit=5.0 +2022-11-15 15:21:54,476 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.4571, 2.8935, 3.2477, 3.0518, 2.5219, 3.6942, 3.1445, 3.4618], + device='cuda:2'), covar=tensor([0.0233, 0.0144, 0.0080, 0.0197, 0.0283, 0.0040, 0.0134, 0.0038], + device='cuda:2'), in_proj_covar=tensor([0.0088, 0.0048, 0.0056, 0.0053, 0.0093, 0.0053, 0.0072, 0.0048], + device='cuda:2'), out_proj_covar=tensor([1.2767e-04, 7.2522e-05, 8.1836e-05, 8.5384e-05, 1.4429e-04, 7.4093e-05, + 1.0764e-04, 6.9059e-05], device='cuda:2') +2022-11-15 15:22:01,320 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=10946.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 15:22:07,822 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.87 vs. limit=2.0 +2022-11-15 15:22:15,393 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.7894, 1.7621, 2.3493, 3.6502, 3.8180, 2.6429, 2.1061, 3.3231], + device='cuda:2'), covar=tensor([0.0050, 0.1492, 0.1370, 0.0551, 0.0095, 0.1054, 0.1287, 0.0096], + device='cuda:2'), in_proj_covar=tensor([0.0081, 0.0179, 0.0184, 0.0102, 0.0109, 0.0194, 0.0179, 0.0090], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0002, 0.0002, 0.0001, 0.0001, 0.0003, 0.0002, 0.0001], + device='cuda:2') +2022-11-15 15:22:16,483 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.05 vs. limit=2.0 +2022-11-15 15:22:19,989 INFO [train.py:876] (2/4) Epoch 2, batch 3700, loss[loss=0.2381, simple_loss=0.2168, pruned_loss=0.1297, over 5580.00 frames. ], tot_loss[loss=0.25, simple_loss=0.2233, pruned_loss=0.1383, over 1089455.39 frames. ], batch size: 25, lr: 3.16e-02, grad_scale: 32.0 +2022-11-15 15:22:33,993 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.3876, 3.6840, 3.6281, 3.6486, 3.4770, 3.7170, 1.7318, 3.4022], + device='cuda:2'), covar=tensor([0.0241, 0.0161, 0.0123, 0.0127, 0.0200, 0.0152, 0.1681, 0.0234], + device='cuda:2'), in_proj_covar=tensor([0.0057, 0.0046, 0.0046, 0.0038, 0.0051, 0.0040, 0.0088, 0.0054], + device='cuda:2'), out_proj_covar=tensor([1.0901e-04, 8.8360e-05, 8.6137e-05, 7.1044e-05, 9.4045e-05, 7.7600e-05, + 1.5425e-04, 1.0259e-04], device='cuda:2') +2022-11-15 15:22:42,992 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.438e+02 2.397e+02 3.169e+02 4.273e+02 6.249e+02, threshold=6.338e+02, percent-clipped=7.0 +2022-11-15 15:23:25,738 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=11065.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:23:31,379 INFO [train.py:876] (2/4) Epoch 2, batch 3800, loss[loss=0.2489, simple_loss=0.2329, pruned_loss=0.1325, over 5596.00 frames. ], tot_loss[loss=0.2509, simple_loss=0.2236, pruned_loss=0.1391, over 1084313.00 frames. ], batch size: 23, lr: 3.15e-02, grad_scale: 16.0 +2022-11-15 15:23:50,499 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=11100.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:23:54,057 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.988e+01 2.162e+02 2.820e+02 3.661e+02 7.630e+02, threshold=5.641e+02, percent-clipped=4.0 +2022-11-15 15:23:57,601 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=11110.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 15:23:59,579 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=11113.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:24:02,385 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.8410, 2.8623, 2.5762, 2.9086, 2.9360, 2.5967, 2.5821, 2.4686], + device='cuda:2'), covar=tensor([0.0285, 0.0363, 0.0482, 0.0300, 0.0292, 0.0442, 0.0321, 0.0312], + device='cuda:2'), in_proj_covar=tensor([0.0059, 0.0060, 0.0081, 0.0059, 0.0075, 0.0074, 0.0064, 0.0056], + device='cuda:2'), out_proj_covar=tensor([1.0968e-04, 1.2173e-04, 1.4119e-04, 1.1415e-04, 1.4805e-04, 1.2809e-04, + 1.1289e-04, 9.6244e-05], device='cuda:2') +2022-11-15 15:24:11,333 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.51 vs. limit=2.0 +2022-11-15 15:24:11,517 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=11130.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:24:28,323 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.90 vs. limit=2.0 +2022-11-15 15:24:31,160 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=11158.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:24:41,931 INFO [train.py:876] (2/4) Epoch 2, batch 3900, loss[loss=0.2891, simple_loss=0.2566, pruned_loss=0.1608, over 5596.00 frames. ], tot_loss[loss=0.2518, simple_loss=0.2246, pruned_loss=0.1395, over 1088615.67 frames. ], batch size: 24, lr: 3.13e-02, grad_scale: 16.0 +2022-11-15 15:24:42,735 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=11174.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:24:46,393 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1903, 3.6161, 3.1875, 3.6681, 2.9916, 2.6165, 1.9310, 3.1662], + device='cuda:2'), covar=tensor([0.1389, 0.0161, 0.0422, 0.0150, 0.0359, 0.0870, 0.2061, 0.0174], + device='cuda:2'), in_proj_covar=tensor([0.0163, 0.0097, 0.0124, 0.0086, 0.0102, 0.0139, 0.0176, 0.0087], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0001, 0.0002, 0.0001, 0.0001, 0.0002, 0.0002, 0.0001], + device='cuda:2') +2022-11-15 15:25:04,801 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.114e+02 2.353e+02 2.852e+02 3.627e+02 7.008e+02, threshold=5.704e+02, percent-clipped=3.0 +2022-11-15 15:25:27,366 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=11235.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:25:31,463 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=11241.0, num_to_drop=1, layers_to_drop={3} +2022-11-15 15:25:31,674 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.98 vs. limit=2.0 +2022-11-15 15:25:46,756 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.0092, 1.3643, 1.0563, 0.6110, 1.7049, 1.3109, 1.1689, 1.3345], + device='cuda:2'), covar=tensor([0.0730, 0.0203, 0.0397, 0.0990, 0.0330, 0.0185, 0.0372, 0.0441], + device='cuda:2'), in_proj_covar=tensor([0.0010, 0.0011, 0.0010, 0.0012, 0.0009, 0.0009, 0.0011, 0.0009], + device='cuda:2'), out_proj_covar=tensor([1.9749e-05, 1.9430e-05, 2.2345e-05, 2.6586e-05, 1.8253e-05, 1.8949e-05, + 2.1653e-05, 1.8260e-05], device='cuda:2') +2022-11-15 15:25:54,062 INFO [train.py:876] (2/4) Epoch 2, batch 4000, loss[loss=0.2704, simple_loss=0.232, pruned_loss=0.1544, over 5586.00 frames. ], tot_loss[loss=0.2509, simple_loss=0.2248, pruned_loss=0.1385, over 1091142.12 frames. ], batch size: 43, lr: 3.12e-02, grad_scale: 16.0 +2022-11-15 15:26:07,946 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.9076, 1.7097, 2.6224, 2.2251, 2.5609, 2.0917, 2.5597, 2.7971], + device='cuda:2'), covar=tensor([0.0049, 0.0426, 0.0082, 0.0262, 0.0085, 0.0268, 0.0126, 0.0088], + device='cuda:2'), in_proj_covar=tensor([0.0071, 0.0142, 0.0088, 0.0142, 0.0079, 0.0128, 0.0117, 0.0092], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0002, 0.0001, 0.0002, 0.0001, 0.0002, 0.0002, 0.0001], + device='cuda:2') +2022-11-15 15:26:16,540 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.259e+02 2.176e+02 2.940e+02 3.819e+02 6.622e+02, threshold=5.880e+02, percent-clipped=2.0 +2022-11-15 15:26:59,573 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.97 vs. limit=2.0 +2022-11-15 15:27:04,368 INFO [train.py:876] (2/4) Epoch 2, batch 4100, loss[loss=0.3091, simple_loss=0.2769, pruned_loss=0.1707, over 5754.00 frames. ], tot_loss[loss=0.249, simple_loss=0.2236, pruned_loss=0.1372, over 1094739.02 frames. ], batch size: 20, lr: 3.11e-02, grad_scale: 16.0 +2022-11-15 15:27:24,549 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=11400.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:27:27,789 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.046e+02 2.309e+02 2.774e+02 3.508e+02 5.775e+02, threshold=5.548e+02, percent-clipped=0.0 +2022-11-15 15:27:46,237 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=11430.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:27:58,771 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=11448.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:28:10,060 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.6037, 4.7735, 4.3981, 4.4851, 4.4232, 4.5072, 1.8045, 4.4342], + device='cuda:2'), covar=tensor([0.0218, 0.0643, 0.0498, 0.0197, 0.0224, 0.0269, 0.2113, 0.0241], + device='cuda:2'), in_proj_covar=tensor([0.0061, 0.0048, 0.0049, 0.0039, 0.0055, 0.0043, 0.0096, 0.0059], + device='cuda:2'), out_proj_covar=tensor([1.1714e-04, 9.1856e-05, 9.2904e-05, 7.3115e-05, 1.0368e-04, 8.2123e-05, + 1.6899e-04, 1.1139e-04], device='cuda:2') +2022-11-15 15:28:16,209 INFO [train.py:876] (2/4) Epoch 2, batch 4200, loss[loss=0.2129, simple_loss=0.2054, pruned_loss=0.1102, over 5593.00 frames. ], tot_loss[loss=0.253, simple_loss=0.2263, pruned_loss=0.1398, over 1095595.64 frames. ], batch size: 24, lr: 3.10e-02, grad_scale: 16.0 +2022-11-15 15:28:19,839 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=11478.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:28:22,517 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.9287, 2.9573, 2.6520, 3.0044, 3.0636, 2.7288, 2.6558, 2.4050], + device='cuda:2'), covar=tensor([0.0243, 0.0404, 0.0518, 0.0296, 0.0318, 0.0404, 0.0439, 0.0490], + device='cuda:2'), in_proj_covar=tensor([0.0058, 0.0063, 0.0081, 0.0059, 0.0076, 0.0073, 0.0064, 0.0057], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0002, 0.0001, 0.0001, 0.0001], + device='cuda:2') +2022-11-15 15:28:36,878 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.58 vs. limit=5.0 +2022-11-15 15:28:39,538 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.267e+02 2.116e+02 2.605e+02 3.416e+02 5.601e+02, threshold=5.209e+02, percent-clipped=1.0 +2022-11-15 15:28:54,519 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.11 vs. limit=2.0 +2022-11-15 15:28:54,790 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.2298, 3.9961, 3.4746, 4.0419, 4.1193, 3.3989, 3.4227, 2.9992], + device='cuda:2'), covar=tensor([0.0559, 0.0294, 0.0371, 0.0237, 0.0171, 0.0356, 0.0325, 0.0607], + device='cuda:2'), in_proj_covar=tensor([0.0062, 0.0066, 0.0085, 0.0063, 0.0079, 0.0077, 0.0066, 0.0060], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0002, 0.0001, 0.0001, 0.0001], + device='cuda:2') +2022-11-15 15:28:56,851 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=11530.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:28:58,319 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=11532.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:29:04,732 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=11541.0, num_to_drop=1, layers_to_drop={2} +2022-11-15 15:29:12,978 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.9632, 0.6191, 0.6919, 0.4808, 1.1380, 0.9325, 0.7028, 0.8485], + device='cuda:2'), covar=tensor([0.0164, 0.0103, 0.0191, 0.0398, 0.0148, 0.0136, 0.0294, 0.0396], + device='cuda:2'), in_proj_covar=tensor([0.0010, 0.0011, 0.0010, 0.0011, 0.0010, 0.0010, 0.0011, 0.0010], + device='cuda:2'), out_proj_covar=tensor([2.0005e-05, 2.0387e-05, 2.3490e-05, 2.5269e-05, 1.9661e-05, 2.0272e-05, + 2.2594e-05, 1.9594e-05], device='cuda:2') +2022-11-15 15:29:22,513 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.10 vs. limit=2.0 +2022-11-15 15:29:23,929 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.00 vs. limit=2.0 +2022-11-15 15:29:27,220 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=6.11 vs. limit=5.0 +2022-11-15 15:29:27,547 INFO [train.py:876] (2/4) Epoch 2, batch 4300, loss[loss=0.1868, simple_loss=0.1829, pruned_loss=0.09533, over 5697.00 frames. ], tot_loss[loss=0.251, simple_loss=0.2248, pruned_loss=0.1386, over 1088326.42 frames. ], batch size: 28, lr: 3.09e-02, grad_scale: 16.0 +2022-11-15 15:29:38,725 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=11589.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 15:29:41,469 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=11593.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:29:51,941 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.207e+02 2.389e+02 3.097e+02 3.751e+02 1.482e+03, threshold=6.195e+02, percent-clipped=9.0 +2022-11-15 15:30:05,141 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.2486, 2.2671, 4.2598, 3.0018, 3.8176, 3.2517, 4.1591, 4.3482], + device='cuda:2'), covar=tensor([0.0038, 0.0453, 0.0062, 0.0353, 0.0047, 0.0266, 0.0161, 0.0095], + device='cuda:2'), in_proj_covar=tensor([0.0073, 0.0143, 0.0092, 0.0145, 0.0081, 0.0129, 0.0125, 0.0096], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0002, 0.0001, 0.0002, 0.0001, 0.0002, 0.0002, 0.0001], + device='cuda:2') +2022-11-15 15:30:11,780 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=11635.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:30:39,157 INFO [train.py:876] (2/4) Epoch 2, batch 4400, loss[loss=0.218, simple_loss=0.2068, pruned_loss=0.1146, over 5546.00 frames. ], tot_loss[loss=0.2495, simple_loss=0.224, pruned_loss=0.1376, over 1089225.48 frames. ], batch size: 15, lr: 3.08e-02, grad_scale: 8.0 +2022-11-15 15:30:49,499 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.67 vs. limit=2.0 +2022-11-15 15:30:54,038 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.80 vs. limit=2.0 +2022-11-15 15:30:55,185 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=11696.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:30:57,325 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=11699.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:31:02,563 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.242e+02 2.298e+02 2.735e+02 3.598e+02 7.155e+02, threshold=5.470e+02, percent-clipped=1.0 +2022-11-15 15:31:40,408 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=11760.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:31:49,956 INFO [train.py:876] (2/4) Epoch 2, batch 4500, loss[loss=0.2338, simple_loss=0.2108, pruned_loss=0.1284, over 5727.00 frames. ], tot_loss[loss=0.2468, simple_loss=0.2227, pruned_loss=0.1354, over 1087962.33 frames. ], batch size: 11, lr: 3.07e-02, grad_scale: 8.0 +2022-11-15 15:32:13,965 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.306e+02 2.368e+02 2.947e+02 3.819e+02 5.858e+02, threshold=5.894e+02, percent-clipped=4.0 +2022-11-15 15:32:20,971 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=11816.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:32:31,008 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=11830.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:32:41,971 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=11846.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:32:47,580 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1715, 3.6550, 3.1297, 2.9865, 2.3670, 3.3795, 2.8716, 3.3949], + device='cuda:2'), covar=tensor([0.0241, 0.0033, 0.0072, 0.0190, 0.0256, 0.0050, 0.0114, 0.0024], + device='cuda:2'), in_proj_covar=tensor([0.0098, 0.0052, 0.0063, 0.0057, 0.0101, 0.0059, 0.0081, 0.0050], + device='cuda:2'), out_proj_covar=tensor([1.4546e-04, 8.1865e-05, 9.7629e-05, 9.9261e-05, 1.6148e-04, 8.4056e-05, + 1.2551e-04, 7.4681e-05], device='cuda:2') +2022-11-15 15:32:49,081 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.99 vs. limit=2.0 +2022-11-15 15:33:01,637 INFO [train.py:876] (2/4) Epoch 2, batch 4600, loss[loss=0.2814, simple_loss=0.2354, pruned_loss=0.1637, over 5473.00 frames. ], tot_loss[loss=0.2478, simple_loss=0.2228, pruned_loss=0.1364, over 1086583.02 frames. ], batch size: 64, lr: 3.05e-02, grad_scale: 8.0 +2022-11-15 15:33:04,581 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=11877.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:33:05,517 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=11878.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:33:12,419 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=11888.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:33:25,410 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.212e+02 2.171e+02 2.900e+02 3.774e+02 7.017e+02, threshold=5.800e+02, percent-clipped=1.0 +2022-11-15 15:33:25,623 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=11907.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:33:34,964 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=11920.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:33:37,086 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.5499, 0.7188, 0.6818, 0.5972, 0.7800, 0.9380, 0.6422, 0.8511], + device='cuda:2'), covar=tensor([0.0100, 0.0023, 0.0122, 0.0029, 0.0051, 0.0086, 0.0090, 0.0026], + device='cuda:2'), in_proj_covar=tensor([0.0022, 0.0022, 0.0020, 0.0020, 0.0019, 0.0020, 0.0021, 0.0019], + device='cuda:2'), out_proj_covar=tensor([2.8621e-05, 2.9434e-05, 3.1559e-05, 2.4830e-05, 2.6427e-05, 2.6734e-05, + 3.7767e-05, 2.6645e-05], device='cuda:2') +2022-11-15 15:33:54,833 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=11948.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:34:10,470 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.29 vs. limit=2.0 +2022-11-15 15:34:12,181 INFO [train.py:876] (2/4) Epoch 2, batch 4700, loss[loss=0.2686, simple_loss=0.2368, pruned_loss=0.1502, over 5715.00 frames. ], tot_loss[loss=0.2474, simple_loss=0.2227, pruned_loss=0.1361, over 1084486.81 frames. ], batch size: 28, lr: 3.04e-02, grad_scale: 8.0 +2022-11-15 15:34:18,283 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=11981.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:34:20,680 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=11984.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:34:25,389 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=11991.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:34:36,939 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.194e+02 2.107e+02 2.730e+02 3.355e+02 8.347e+02, threshold=5.461e+02, percent-clipped=3.0 +2022-11-15 15:34:38,489 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=12009.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:35:04,508 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=12045.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:35:11,922 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=12055.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:35:14,194 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.83 vs. limit=2.0 +2022-11-15 15:35:24,089 INFO [train.py:876] (2/4) Epoch 2, batch 4800, loss[loss=0.2541, simple_loss=0.2199, pruned_loss=0.1442, over 5709.00 frames. ], tot_loss[loss=0.2473, simple_loss=0.2232, pruned_loss=0.1358, over 1086827.00 frames. ], batch size: 28, lr: 3.03e-02, grad_scale: 8.0 +2022-11-15 15:35:26,316 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=12076.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:35:48,825 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.190e+02 2.296e+02 2.933e+02 3.523e+02 8.613e+02, threshold=5.866e+02, percent-clipped=4.0 +2022-11-15 15:35:49,978 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.03 vs. limit=2.0 +2022-11-15 15:36:09,338 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=12137.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:36:34,716 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=12172.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:36:35,353 INFO [train.py:876] (2/4) Epoch 2, batch 4900, loss[loss=0.277, simple_loss=0.2549, pruned_loss=0.1495, over 5743.00 frames. ], tot_loss[loss=0.2445, simple_loss=0.2209, pruned_loss=0.134, over 1087346.75 frames. ], batch size: 27, lr: 3.02e-02, grad_scale: 8.0 +2022-11-15 15:36:45,797 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=12188.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:36:55,621 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=12202.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:36:59,038 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.339e+02 2.288e+02 2.922e+02 4.193e+02 1.035e+03, threshold=5.844e+02, percent-clipped=8.0 +2022-11-15 15:37:13,697 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.87 vs. limit=5.0 +2022-11-15 15:37:20,201 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=12236.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:37:41,520 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.7316, 1.7875, 1.9121, 1.8932, 1.4203, 2.0264, 1.4365, 1.0925], + device='cuda:2'), covar=tensor([0.0125, 0.0457, 0.0332, 0.0401, 0.0254, 0.0191, 0.0259, 0.0908], + device='cuda:2'), in_proj_covar=tensor([0.0030, 0.0035, 0.0040, 0.0029, 0.0040, 0.0035, 0.0040, 0.0028], + device='cuda:2'), out_proj_covar=tensor([5.1813e-05, 6.2687e-05, 8.0242e-05, 5.3173e-05, 7.4753e-05, 7.2312e-05, + 7.2651e-05, 5.2707e-05], device='cuda:2') +2022-11-15 15:37:46,254 INFO [train.py:876] (2/4) Epoch 2, batch 5000, loss[loss=0.2479, simple_loss=0.2266, pruned_loss=0.1346, over 5649.00 frames. ], tot_loss[loss=0.2439, simple_loss=0.2205, pruned_loss=0.1337, over 1091493.18 frames. ], batch size: 38, lr: 3.01e-02, grad_scale: 8.0 +2022-11-15 15:37:48,756 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=12276.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:37:49,165 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.05 vs. limit=5.0 +2022-11-15 15:37:59,287 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=12291.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:38:02,524 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.07 vs. limit=2.0 +2022-11-15 15:38:08,022 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=12304.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:38:09,979 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.089e+02 2.107e+02 2.700e+02 3.490e+02 8.758e+02, threshold=5.401e+02, percent-clipped=1.0 +2022-11-15 15:38:10,793 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.5031, 3.5921, 2.7617, 1.6898, 3.5791, 1.0593, 3.5896, 1.8779], + device='cuda:2'), covar=tensor([0.0726, 0.0137, 0.0387, 0.1635, 0.0093, 0.1578, 0.0080, 0.1283], + device='cuda:2'), in_proj_covar=tensor([0.0101, 0.0062, 0.0063, 0.0104, 0.0065, 0.0107, 0.0052, 0.0104], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0001, 0.0001, 0.0002, 0.0001, 0.0002, 0.0001, 0.0002], + device='cuda:2') +2022-11-15 15:38:12,840 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.6368, 4.1298, 4.6018, 4.1331, 4.8025, 4.3557, 4.2311, 4.5074], + device='cuda:2'), covar=tensor([0.0311, 0.0267, 0.0340, 0.0271, 0.0293, 0.0154, 0.0280, 0.0272], + device='cuda:2'), in_proj_covar=tensor([0.0069, 0.0074, 0.0060, 0.0072, 0.0067, 0.0048, 0.0060, 0.0063], + device='cuda:2'), out_proj_covar=tensor([1.5157e-04, 1.5177e-04, 1.2719e-04, 1.4627e-04, 1.5815e-04, 9.8449e-05, + 1.2516e-04, 1.3221e-04], device='cuda:2') +2022-11-15 15:38:16,761 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.88 vs. limit=2.0 +2022-11-15 15:38:21,600 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.9416, 1.8024, 1.4801, 1.5205, 1.1608, 2.0914, 1.6147, 1.3274], + device='cuda:2'), covar=tensor([0.0246, 0.0454, 0.0840, 0.0468, 0.0421, 0.0473, 0.0399, 0.1323], + device='cuda:2'), in_proj_covar=tensor([0.0030, 0.0033, 0.0041, 0.0029, 0.0040, 0.0035, 0.0040, 0.0027], + device='cuda:2'), out_proj_covar=tensor([5.3158e-05, 6.1052e-05, 8.2318e-05, 5.3771e-05, 7.5947e-05, 7.2312e-05, + 7.3306e-05, 5.1162e-05], device='cuda:2') +2022-11-15 15:38:32,896 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=12339.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:38:33,581 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=12340.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:38:35,676 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.1602, 1.1881, 1.1794, 0.7248, 1.2428, 1.1574, 0.7907, 0.9162], + device='cuda:2'), covar=tensor([0.0029, 0.0054, 0.0021, 0.0054, 0.0027, 0.0035, 0.0053, 0.0046], + device='cuda:2'), in_proj_covar=tensor([0.0019, 0.0017, 0.0016, 0.0020, 0.0020, 0.0018, 0.0021, 0.0020], + device='cuda:2'), out_proj_covar=tensor([2.6059e-05, 2.6225e-05, 2.5286e-05, 2.6530e-05, 2.8778e-05, 2.2948e-05, + 2.9107e-05, 2.7370e-05], device='cuda:2') +2022-11-15 15:38:43,939 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=12355.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:38:46,476 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.95 vs. limit=2.0 +2022-11-15 15:38:56,198 INFO [train.py:876] (2/4) Epoch 2, batch 5100, loss[loss=0.249, simple_loss=0.2348, pruned_loss=0.1316, over 5571.00 frames. ], tot_loss[loss=0.2476, simple_loss=0.2236, pruned_loss=0.1358, over 1089047.13 frames. ], batch size: 25, lr: 3.00e-02, grad_scale: 8.0 +2022-11-15 15:39:01,489 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.9446, 5.2718, 5.3096, 5.2122, 4.7395, 4.0172, 5.8214, 5.2420], + device='cuda:2'), covar=tensor([0.0315, 0.0775, 0.0309, 0.0751, 0.0492, 0.0326, 0.0850, 0.0355], + device='cuda:2'), in_proj_covar=tensor([0.0050, 0.0073, 0.0060, 0.0072, 0.0048, 0.0041, 0.0078, 0.0054], + device='cuda:2'), out_proj_covar=tensor([1.0419e-04, 1.5401e-04, 1.2483e-04, 1.4977e-04, 1.0510e-04, 8.6701e-05, + 1.8158e-04, 1.1117e-04], device='cuda:2') +2022-11-15 15:39:15,047 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.65 vs. limit=2.0 +2022-11-15 15:39:17,688 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=12403.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:39:20,359 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.099e+02 2.363e+02 2.989e+02 3.735e+02 9.189e+02, threshold=5.978e+02, percent-clipped=6.0 +2022-11-15 15:39:38,297 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=12432.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:39:48,212 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=12446.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:39:58,431 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.9787, 2.3130, 2.8644, 4.0537, 4.0315, 2.9642, 2.8502, 3.9046], + device='cuda:2'), covar=tensor([0.0048, 0.1104, 0.0769, 0.0303, 0.0073, 0.0810, 0.0825, 0.0039], + device='cuda:2'), in_proj_covar=tensor([0.0098, 0.0191, 0.0199, 0.0107, 0.0118, 0.0212, 0.0189, 0.0096], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0003, 0.0003, 0.0002, 0.0002, 0.0003, 0.0003, 0.0001], + device='cuda:2') +2022-11-15 15:40:04,733 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.98 vs. limit=2.0 +2022-11-15 15:40:05,735 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=12472.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:40:06,291 INFO [train.py:876] (2/4) Epoch 2, batch 5200, loss[loss=0.2554, simple_loss=0.2276, pruned_loss=0.1416, over 5548.00 frames. ], tot_loss[loss=0.2449, simple_loss=0.2212, pruned_loss=0.1343, over 1082514.11 frames. ], batch size: 40, lr: 2.99e-02, grad_scale: 8.0 +2022-11-15 15:40:19,872 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.8483, 2.1956, 1.8502, 1.3038, 2.2197, 0.9443, 2.2941, 1.2505], + device='cuda:2'), covar=tensor([0.0683, 0.0213, 0.0445, 0.1630, 0.0233, 0.1615, 0.0150, 0.1247], + device='cuda:2'), in_proj_covar=tensor([0.0105, 0.0066, 0.0065, 0.0109, 0.0068, 0.0110, 0.0055, 0.0104], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0001, 0.0002], + device='cuda:2') +2022-11-15 15:40:27,643 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=12502.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:40:30,895 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.236e+02 2.247e+02 2.853e+02 3.509e+02 7.106e+02, threshold=5.707e+02, percent-clipped=3.0 +2022-11-15 15:40:31,097 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=12507.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:40:39,938 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=12520.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:41:01,892 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=12550.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:41:12,673 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.86 vs. limit=5.0 +2022-11-15 15:41:17,841 INFO [train.py:876] (2/4) Epoch 2, batch 5300, loss[loss=0.2025, simple_loss=0.2053, pruned_loss=0.09979, over 5455.00 frames. ], tot_loss[loss=0.246, simple_loss=0.2222, pruned_loss=0.1349, over 1088684.95 frames. ], batch size: 10, lr: 2.98e-02, grad_scale: 8.0 +2022-11-15 15:41:20,137 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=12576.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:41:40,300 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=12604.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:41:42,527 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.253e+02 2.286e+02 2.922e+02 3.556e+02 5.667e+02, threshold=5.844e+02, percent-clipped=0.0 +2022-11-15 15:41:54,447 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=12624.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:41:57,454 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.90 vs. limit=2.0 +2022-11-15 15:42:05,334 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=12640.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:42:13,761 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=12652.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:42:29,011 INFO [train.py:876] (2/4) Epoch 2, batch 5400, loss[loss=0.1907, simple_loss=0.1817, pruned_loss=0.09984, over 5756.00 frames. ], tot_loss[loss=0.2429, simple_loss=0.2198, pruned_loss=0.133, over 1090992.19 frames. ], batch size: 20, lr: 2.97e-02, grad_scale: 8.0 +2022-11-15 15:42:39,258 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=12688.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:42:52,240 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.322e+02 2.272e+02 2.868e+02 3.649e+02 6.503e+02, threshold=5.736e+02, percent-clipped=2.0 +2022-11-15 15:43:10,909 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=12732.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:43:11,628 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.2755, 1.0205, 1.0392, 1.3860, 0.3073, 0.9766, 1.0976, 1.1111], + device='cuda:2'), covar=tensor([0.0066, 0.0073, 0.0068, 0.0037, 0.0166, 0.0084, 0.0064, 0.0036], + device='cuda:2'), in_proj_covar=tensor([0.0029, 0.0034, 0.0040, 0.0029, 0.0041, 0.0033, 0.0041, 0.0027], + device='cuda:2'), out_proj_covar=tensor([5.0514e-05, 6.1064e-05, 8.2163e-05, 5.3656e-05, 7.9988e-05, 6.9925e-05, + 7.6579e-05, 5.0833e-05], device='cuda:2') +2022-11-15 15:43:23,520 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.6162, 1.7731, 3.1693, 2.6206, 2.9794, 2.1720, 3.0703, 3.3732], + device='cuda:2'), covar=tensor([0.0024, 0.0383, 0.0066, 0.0285, 0.0061, 0.0267, 0.0123, 0.0084], + device='cuda:2'), in_proj_covar=tensor([0.0078, 0.0156, 0.0099, 0.0164, 0.0089, 0.0141, 0.0140, 0.0107], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 15:43:32,320 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=12763.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:43:40,145 INFO [train.py:876] (2/4) Epoch 2, batch 5500, loss[loss=0.2656, simple_loss=0.2425, pruned_loss=0.1444, over 5513.00 frames. ], tot_loss[loss=0.2431, simple_loss=0.2201, pruned_loss=0.133, over 1091328.86 frames. ], batch size: 49, lr: 2.96e-02, grad_scale: 8.0 +2022-11-15 15:43:45,536 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=12780.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:43:55,907 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=6.66 vs. limit=5.0 +2022-11-15 15:44:01,673 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=12802.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:44:02,059 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.98 vs. limit=2.0 +2022-11-15 15:44:05,124 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.455e+02 2.098e+02 2.585e+02 3.345e+02 6.540e+02, threshold=5.170e+02, percent-clipped=1.0 +2022-11-15 15:44:08,827 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.3853, 4.0553, 4.3124, 4.0787, 4.6136, 4.2408, 4.0417, 4.5289], + device='cuda:2'), covar=tensor([0.0406, 0.0285, 0.0459, 0.0247, 0.0392, 0.0267, 0.0262, 0.0324], + device='cuda:2'), in_proj_covar=tensor([0.0075, 0.0077, 0.0063, 0.0076, 0.0069, 0.0051, 0.0063, 0.0066], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0001, 0.0002, 0.0002, 0.0001, 0.0001, 0.0001], + device='cuda:2') +2022-11-15 15:44:18,795 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=12824.0, num_to_drop=1, layers_to_drop={3} +2022-11-15 15:44:30,022 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.9489, 4.0772, 3.1735, 1.8989, 3.8240, 1.6565, 4.1223, 2.4386], + device='cuda:2'), covar=tensor([0.0822, 0.0125, 0.0477, 0.2023, 0.0180, 0.1673, 0.0118, 0.1395], + device='cuda:2'), in_proj_covar=tensor([0.0110, 0.0067, 0.0064, 0.0107, 0.0070, 0.0113, 0.0057, 0.0107], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0001, 0.0002], + device='cuda:2') +2022-11-15 15:44:53,049 INFO [train.py:876] (2/4) Epoch 2, batch 5600, loss[loss=0.3175, simple_loss=0.2479, pruned_loss=0.1935, over 2987.00 frames. ], tot_loss[loss=0.2453, simple_loss=0.2217, pruned_loss=0.1345, over 1084944.35 frames. ], batch size: 284, lr: 2.95e-02, grad_scale: 8.0 +2022-11-15 15:45:17,178 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.085e+02 2.173e+02 2.659e+02 3.655e+02 7.963e+02, threshold=5.318e+02, percent-clipped=7.0 +2022-11-15 15:45:24,312 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.00 vs. limit=2.0 +2022-11-15 15:46:03,227 INFO [train.py:876] (2/4) Epoch 2, batch 5700, loss[loss=0.3371, simple_loss=0.2706, pruned_loss=0.2018, over 5606.00 frames. ], tot_loss[loss=0.2433, simple_loss=0.2201, pruned_loss=0.1332, over 1082296.37 frames. ], batch size: 50, lr: 2.94e-02, grad_scale: 8.0 +2022-11-15 15:46:28,142 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.438e+02 2.403e+02 3.143e+02 3.984e+02 1.020e+03, threshold=6.287e+02, percent-clipped=10.0 +2022-11-15 15:46:47,751 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.2401, 2.3879, 3.1682, 4.3729, 4.6204, 3.1247, 3.0118, 4.5116], + device='cuda:2'), covar=tensor([0.0041, 0.1319, 0.0834, 0.0183, 0.0060, 0.0976, 0.0825, 0.0027], + device='cuda:2'), in_proj_covar=tensor([0.0105, 0.0199, 0.0207, 0.0118, 0.0131, 0.0220, 0.0197, 0.0103], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0003, 0.0003, 0.0002, 0.0002, 0.0003, 0.0003, 0.0001], + device='cuda:2') +2022-11-15 15:47:03,822 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.32 vs. limit=5.0 +2022-11-15 15:47:14,316 INFO [train.py:876] (2/4) Epoch 2, batch 5800, loss[loss=0.2733, simple_loss=0.2407, pruned_loss=0.153, over 5641.00 frames. ], tot_loss[loss=0.2456, simple_loss=0.2217, pruned_loss=0.1348, over 1078365.18 frames. ], batch size: 38, lr: 2.93e-02, grad_scale: 8.0 +2022-11-15 15:47:35,093 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=13102.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:47:38,323 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.334e+02 2.022e+02 2.861e+02 3.567e+02 9.909e+02, threshold=5.722e+02, percent-clipped=2.0 +2022-11-15 15:47:46,543 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=13119.0, num_to_drop=1, layers_to_drop={3} +2022-11-15 15:48:08,627 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=13150.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:48:15,907 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.4524, 1.1000, 1.1911, 1.4748, 0.7315, 1.2280, 1.0011, 1.3748], + device='cuda:2'), covar=tensor([0.0075, 0.0120, 0.0150, 0.0072, 0.0232, 0.0133, 0.0112, 0.0049], + device='cuda:2'), in_proj_covar=tensor([0.0031, 0.0034, 0.0043, 0.0030, 0.0043, 0.0035, 0.0042, 0.0026], + device='cuda:2'), out_proj_covar=tensor([5.2156e-05, 6.3631e-05, 8.8284e-05, 5.6883e-05, 8.3366e-05, 7.4154e-05, + 7.8117e-05, 5.0051e-05], device='cuda:2') +2022-11-15 15:48:24,658 INFO [train.py:876] (2/4) Epoch 2, batch 5900, loss[loss=0.183, simple_loss=0.1799, pruned_loss=0.09308, over 5201.00 frames. ], tot_loss[loss=0.2425, simple_loss=0.2193, pruned_loss=0.1328, over 1079780.70 frames. ], batch size: 6, lr: 2.92e-02, grad_scale: 8.0 +2022-11-15 15:48:49,673 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.160e+02 2.159e+02 2.799e+02 3.502e+02 4.935e+02, threshold=5.598e+02, percent-clipped=0.0 +2022-11-15 15:48:51,925 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.7418, 1.8052, 1.7563, 2.2141, 0.7948, 1.5238, 1.5477, 2.1244], + device='cuda:2'), covar=tensor([0.0156, 0.0190, 0.0302, 0.0276, 0.0335, 0.0435, 0.0249, 0.0369], + device='cuda:2'), in_proj_covar=tensor([0.0031, 0.0034, 0.0041, 0.0029, 0.0042, 0.0034, 0.0043, 0.0027], + device='cuda:2'), out_proj_covar=tensor([5.2703e-05, 6.2169e-05, 8.5553e-05, 5.4728e-05, 8.2808e-05, 7.3344e-05, + 7.8410e-05, 5.1737e-05], device='cuda:2') +2022-11-15 15:49:24,049 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.04 vs. limit=2.0 +2022-11-15 15:49:30,280 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.4843, 0.9369, 1.3111, 0.7469, 1.5198, 1.1416, 0.9316, 1.0693], + device='cuda:2'), covar=tensor([0.0046, 0.0053, 0.0038, 0.0055, 0.0057, 0.0055, 0.0094, 0.0074], + device='cuda:2'), in_proj_covar=tensor([0.0019, 0.0020, 0.0017, 0.0018, 0.0017, 0.0017, 0.0018, 0.0016], + device='cuda:2'), out_proj_covar=tensor([2.7347e-05, 2.8899e-05, 2.7230e-05, 2.2271e-05, 2.2505e-05, 2.2613e-05, + 3.2144e-05, 2.4480e-05], device='cuda:2') +2022-11-15 15:49:33,027 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.2159, 5.1324, 4.2075, 5.0560, 4.2698, 3.5241, 2.9039, 4.5924], + device='cuda:2'), covar=tensor([0.1127, 0.0247, 0.0382, 0.0103, 0.0178, 0.0563, 0.1608, 0.0081], + device='cuda:2'), in_proj_covar=tensor([0.0174, 0.0102, 0.0141, 0.0093, 0.0118, 0.0155, 0.0188, 0.0093], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0001, 0.0002, 0.0001, 0.0002, 0.0002, 0.0002, 0.0001], + device='cuda:2') +2022-11-15 15:49:36,276 INFO [train.py:876] (2/4) Epoch 2, batch 6000, loss[loss=0.3205, simple_loss=0.2604, pruned_loss=0.1903, over 4671.00 frames. ], tot_loss[loss=0.2431, simple_loss=0.2192, pruned_loss=0.1335, over 1074777.19 frames. ], batch size: 136, lr: 2.91e-02, grad_scale: 8.0 +2022-11-15 15:49:36,277 INFO [train.py:899] (2/4) Computing validation loss +2022-11-15 15:49:43,707 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.5299, 4.8269, 4.6653, 4.9339, 4.3657, 3.6044, 3.0608, 4.2995], + device='cuda:2'), covar=tensor([0.1588, 0.0300, 0.0366, 0.0157, 0.0220, 0.0807, 0.2154, 0.0154], + device='cuda:2'), in_proj_covar=tensor([0.0173, 0.0101, 0.0141, 0.0093, 0.0118, 0.0155, 0.0187, 0.0093], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0001, 0.0002, 0.0001, 0.0002, 0.0002, 0.0002, 0.0001], + device='cuda:2') +2022-11-15 15:49:47,196 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.3995, 3.7877, 3.2860, 3.8555, 3.9858, 3.4402, 3.7436, 3.0841], + device='cuda:2'), covar=tensor([0.0309, 0.0540, 0.0699, 0.0329, 0.0310, 0.0400, 0.0299, 0.0560], + device='cuda:2'), in_proj_covar=tensor([0.0066, 0.0076, 0.0098, 0.0071, 0.0091, 0.0088, 0.0079, 0.0068], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002, 0.0001, 0.0001], + device='cuda:2') +2022-11-15 15:49:47,426 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.8306, 3.2158, 3.1857, 1.2890, 3.1781, 3.6653, 3.1291, 3.6641], + device='cuda:2'), covar=tensor([0.0885, 0.0345, 0.0267, 0.0884, 0.0075, 0.0062, 0.0121, 0.0070], + device='cuda:2'), in_proj_covar=tensor([0.0132, 0.0116, 0.0086, 0.0134, 0.0081, 0.0073, 0.0072, 0.0077], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0001, 0.0002, 0.0001, 0.0001, 0.0001, 0.0001], + device='cuda:2') +2022-11-15 15:49:54,758 INFO [train.py:908] (2/4) Epoch 2, validation: loss=0.1945, simple_loss=0.208, pruned_loss=0.09052, over 1530663.00 frames. +2022-11-15 15:49:54,760 INFO [train.py:909] (2/4) Maximum memory allocated so far is 4573MB +2022-11-15 15:50:09,563 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.65 vs. limit=5.0 +2022-11-15 15:50:11,171 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.31 vs. limit=5.0 +2022-11-15 15:50:19,046 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.409e+02 2.427e+02 3.017e+02 4.086e+02 8.174e+02, threshold=6.035e+02, percent-clipped=9.0 +2022-11-15 15:50:48,550 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.23 vs. limit=2.0 +2022-11-15 15:51:06,261 INFO [train.py:876] (2/4) Epoch 2, batch 6100, loss[loss=0.2252, simple_loss=0.2171, pruned_loss=0.1167, over 5556.00 frames. ], tot_loss[loss=0.2427, simple_loss=0.2193, pruned_loss=0.133, over 1080814.46 frames. ], batch size: 14, lr: 2.90e-02, grad_scale: 8.0 +2022-11-15 15:51:29,932 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.327e+02 2.166e+02 2.663e+02 3.329e+02 7.571e+02, threshold=5.325e+02, percent-clipped=1.0 +2022-11-15 15:51:38,748 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=13419.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:52:12,695 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=13467.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:52:17,511 INFO [train.py:876] (2/4) Epoch 2, batch 6200, loss[loss=0.2722, simple_loss=0.2321, pruned_loss=0.1561, over 5598.00 frames. ], tot_loss[loss=0.2444, simple_loss=0.2205, pruned_loss=0.1342, over 1080401.03 frames. ], batch size: 38, lr: 2.89e-02, grad_scale: 8.0 +2022-11-15 15:52:41,229 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.148e+02 2.479e+02 3.223e+02 3.937e+02 9.344e+02, threshold=6.446e+02, percent-clipped=9.0 +2022-11-15 15:53:13,086 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.8459, 2.0661, 1.0625, 1.0363, 0.3476, 1.8311, 1.6557, 0.7396], + device='cuda:2'), covar=tensor([0.0223, 0.0094, 0.0203, 0.0262, 0.1220, 0.0327, 0.0168, 0.0424], + device='cuda:2'), in_proj_covar=tensor([0.0030, 0.0026, 0.0030, 0.0032, 0.0030, 0.0022, 0.0025, 0.0030], + device='cuda:2'), out_proj_covar=tensor([4.8880e-05, 3.6005e-05, 4.4496e-05, 4.9704e-05, 5.3241e-05, 3.8483e-05, + 4.0569e-05, 4.6469e-05], device='cuda:2') +2022-11-15 15:53:27,906 INFO [train.py:876] (2/4) Epoch 2, batch 6300, loss[loss=0.2524, simple_loss=0.2174, pruned_loss=0.1437, over 5009.00 frames. ], tot_loss[loss=0.2426, simple_loss=0.2195, pruned_loss=0.1329, over 1085785.48 frames. ], batch size: 109, lr: 2.88e-02, grad_scale: 8.0 +2022-11-15 15:53:36,009 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.8634, 1.1519, 0.9733, 0.7207, 1.1688, 1.0226, 0.6203, 1.5341], + device='cuda:2'), covar=tensor([0.0117, 0.0057, 0.0062, 0.0042, 0.0053, 0.0129, 0.0163, 0.0108], + device='cuda:2'), in_proj_covar=tensor([0.0020, 0.0020, 0.0018, 0.0019, 0.0017, 0.0018, 0.0020, 0.0017], + device='cuda:2'), out_proj_covar=tensor([2.8731e-05, 2.9768e-05, 2.8357e-05, 2.3387e-05, 2.3308e-05, 2.5010e-05, + 3.5009e-05, 2.5192e-05], device='cuda:2') +2022-11-15 15:53:39,289 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.01 vs. limit=2.0 +2022-11-15 15:53:52,436 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.388e+02 2.381e+02 3.023e+02 3.897e+02 7.097e+02, threshold=6.046e+02, percent-clipped=2.0 +2022-11-15 15:53:57,982 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.5421, 4.6408, 3.7768, 1.9814, 4.5057, 1.8415, 4.1998, 2.7098], + device='cuda:2'), covar=tensor([0.0722, 0.0138, 0.0234, 0.2312, 0.0138, 0.1698, 0.0144, 0.1439], + device='cuda:2'), in_proj_covar=tensor([0.0113, 0.0072, 0.0068, 0.0110, 0.0076, 0.0117, 0.0060, 0.0111], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0002, 0.0003, 0.0002, 0.0003, 0.0001, 0.0003], + device='cuda:2') +2022-11-15 15:54:39,249 INFO [train.py:876] (2/4) Epoch 2, batch 6400, loss[loss=0.2472, simple_loss=0.2341, pruned_loss=0.1302, over 5802.00 frames. ], tot_loss[loss=0.2383, simple_loss=0.2166, pruned_loss=0.13, over 1083760.97 frames. ], batch size: 22, lr: 2.87e-02, grad_scale: 16.0 +2022-11-15 15:55:03,885 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.329e+02 2.247e+02 2.645e+02 3.306e+02 5.348e+02, threshold=5.289e+02, percent-clipped=0.0 +2022-11-15 15:55:10,969 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.8651, 3.2332, 2.6495, 3.1448, 2.2529, 2.3981, 1.9010, 2.7714], + device='cuda:2'), covar=tensor([0.1301, 0.0122, 0.0566, 0.0168, 0.0519, 0.0696, 0.1584, 0.0160], + device='cuda:2'), in_proj_covar=tensor([0.0162, 0.0098, 0.0137, 0.0091, 0.0110, 0.0147, 0.0174, 0.0088], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0001, 0.0002, 0.0001, 0.0002, 0.0002, 0.0002, 0.0001], + device='cuda:2') +2022-11-15 15:55:24,675 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=13737.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:55:50,601 INFO [train.py:876] (2/4) Epoch 2, batch 6500, loss[loss=0.2161, simple_loss=0.2054, pruned_loss=0.1134, over 5625.00 frames. ], tot_loss[loss=0.2438, simple_loss=0.2203, pruned_loss=0.1337, over 1079843.55 frames. ], batch size: 23, lr: 2.86e-02, grad_scale: 16.0 +2022-11-15 15:55:59,103 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.40 vs. limit=5.0 +2022-11-15 15:56:09,395 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=13798.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 15:56:15,578 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.258e+02 2.395e+02 2.916e+02 4.086e+02 9.118e+02, threshold=5.832e+02, percent-clipped=10.0 +2022-11-15 15:56:24,025 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.8116, 1.8307, 1.0348, 0.8750, 0.5918, 1.7178, 1.2551, 0.6377], + device='cuda:2'), covar=tensor([0.0135, 0.0036, 0.0149, 0.0169, 0.0330, 0.0096, 0.0151, 0.0216], + device='cuda:2'), in_proj_covar=tensor([0.0029, 0.0024, 0.0028, 0.0030, 0.0026, 0.0021, 0.0024, 0.0028], + device='cuda:2'), out_proj_covar=tensor([4.6223e-05, 3.3308e-05, 4.1335e-05, 4.6791e-05, 4.6354e-05, 3.6168e-05, + 3.7433e-05, 4.3947e-05], device='cuda:2') +2022-11-15 15:56:58,251 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.6041, 1.6841, 1.7737, 2.6484, 1.3212, 2.0918, 2.7100, 2.5619], + device='cuda:2'), covar=tensor([0.0111, 0.0149, 0.0271, 0.0655, 0.0192, 0.0151, 0.0169, 0.0590], + device='cuda:2'), in_proj_covar=tensor([0.0033, 0.0038, 0.0042, 0.0029, 0.0044, 0.0034, 0.0045, 0.0027], + device='cuda:2'), out_proj_covar=tensor([5.6513e-05, 6.9960e-05, 9.0270e-05, 5.6758e-05, 8.8592e-05, 7.3142e-05, + 8.6257e-05, 5.2512e-05], device='cuda:2') +2022-11-15 15:57:02,020 INFO [train.py:876] (2/4) Epoch 2, batch 6600, loss[loss=0.2378, simple_loss=0.227, pruned_loss=0.1242, over 5778.00 frames. ], tot_loss[loss=0.2398, simple_loss=0.2182, pruned_loss=0.1307, over 1080814.12 frames. ], batch size: 21, lr: 2.85e-02, grad_scale: 16.0 +2022-11-15 15:57:25,733 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.321e+02 2.160e+02 2.814e+02 3.632e+02 6.196e+02, threshold=5.627e+02, percent-clipped=2.0 +2022-11-15 15:58:12,910 INFO [train.py:876] (2/4) Epoch 2, batch 6700, loss[loss=0.2623, simple_loss=0.2166, pruned_loss=0.154, over 4180.00 frames. ], tot_loss[loss=0.2387, simple_loss=0.2172, pruned_loss=0.1301, over 1086316.34 frames. ], batch size: 181, lr: 2.85e-02, grad_scale: 16.0 +2022-11-15 15:58:36,326 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.165e+02 2.069e+02 2.722e+02 3.403e+02 5.968e+02, threshold=5.443e+02, percent-clipped=2.0 +2022-11-15 15:59:22,987 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.3151, 1.7622, 1.9428, 1.5660, 1.1955, 2.0518, 1.5477, 1.8802], + device='cuda:2'), covar=tensor([0.0113, 0.0303, 0.0075, 0.0070, 0.0202, 0.0058, 0.0096, 0.0061], + device='cuda:2'), in_proj_covar=tensor([0.0017, 0.0014, 0.0014, 0.0017, 0.0016, 0.0016, 0.0019, 0.0016], + device='cuda:2'), out_proj_covar=tensor([2.4366e-05, 2.1238e-05, 2.2099e-05, 2.2748e-05, 2.2051e-05, 2.0954e-05, + 2.4784e-05, 2.1767e-05], device='cuda:2') +2022-11-15 15:59:23,700 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=14072.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:59:24,579 INFO [train.py:876] (2/4) Epoch 2, batch 6800, loss[loss=0.1828, simple_loss=0.1873, pruned_loss=0.08914, over 5498.00 frames. ], tot_loss[loss=0.2362, simple_loss=0.2157, pruned_loss=0.1284, over 1079161.41 frames. ], batch size: 17, lr: 2.84e-02, grad_scale: 16.0 +2022-11-15 15:59:38,921 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=14093.0, num_to_drop=1, layers_to_drop={3} +2022-11-15 15:59:45,928 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=14103.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 15:59:48,496 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.444e+02 2.309e+02 2.876e+02 3.823e+02 9.866e+02, threshold=5.752e+02, percent-clipped=3.0 +2022-11-15 15:59:54,471 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.2109, 5.1050, 4.4008, 5.3289, 5.3777, 4.5123, 4.6928, 4.0591], + device='cuda:2'), covar=tensor([0.0189, 0.0303, 0.0581, 0.0200, 0.0201, 0.0427, 0.0214, 0.0468], + device='cuda:2'), in_proj_covar=tensor([0.0069, 0.0079, 0.0110, 0.0072, 0.0098, 0.0093, 0.0084, 0.0074], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002, 0.0002, 0.0001], + device='cuda:2') +2022-11-15 15:59:58,373 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=14120.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 16:00:08,020 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=14133.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:00:26,666 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.23 vs. limit=2.0 +2022-11-15 16:00:29,239 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=14164.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:00:35,848 INFO [train.py:876] (2/4) Epoch 2, batch 6900, loss[loss=0.2476, simple_loss=0.2129, pruned_loss=0.1412, over 5130.00 frames. ], tot_loss[loss=0.239, simple_loss=0.2175, pruned_loss=0.1302, over 1073648.04 frames. ], batch size: 7, lr: 2.83e-02, grad_scale: 16.0 +2022-11-15 16:00:41,928 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=14181.0, num_to_drop=1, layers_to_drop={2} +2022-11-15 16:01:00,099 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.482e+02 2.412e+02 3.170e+02 4.129e+02 8.263e+02, threshold=6.339e+02, percent-clipped=8.0 +2022-11-15 16:01:31,585 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.98 vs. limit=2.0 +2022-11-15 16:01:47,608 INFO [train.py:876] (2/4) Epoch 2, batch 7000, loss[loss=0.2346, simple_loss=0.1988, pruned_loss=0.1351, over 5693.00 frames. ], tot_loss[loss=0.2409, simple_loss=0.2181, pruned_loss=0.1319, over 1076259.54 frames. ], batch size: 11, lr: 2.82e-02, grad_scale: 16.0 +2022-11-15 16:02:11,794 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.160e+02 2.311e+02 3.068e+02 3.846e+02 6.793e+02, threshold=6.137e+02, percent-clipped=2.0 +2022-11-15 16:02:58,389 INFO [train.py:876] (2/4) Epoch 2, batch 7100, loss[loss=0.2609, simple_loss=0.219, pruned_loss=0.1514, over 4702.00 frames. ], tot_loss[loss=0.2406, simple_loss=0.2182, pruned_loss=0.1315, over 1078141.86 frames. ], batch size: 135, lr: 2.81e-02, grad_scale: 16.0 +2022-11-15 16:03:03,331 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.9618, 1.1968, 1.1703, 0.9150, 1.5450, 1.2694, 1.2315, 1.3074], + device='cuda:2'), covar=tensor([0.1371, 0.0429, 0.0415, 0.1036, 0.0746, 0.0380, 0.0615, 0.0363], + device='cuda:2'), in_proj_covar=tensor([0.0010, 0.0011, 0.0009, 0.0009, 0.0009, 0.0009, 0.0011, 0.0009], + device='cuda:2'), out_proj_covar=tensor([2.2648e-05, 2.3379e-05, 2.3484e-05, 2.5759e-05, 2.1406e-05, 2.0745e-05, + 2.6371e-05, 2.0468e-05], device='cuda:2') +2022-11-15 16:03:05,965 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.16 vs. limit=5.0 +2022-11-15 16:03:08,111 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.87 vs. limit=2.0 +2022-11-15 16:03:13,672 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=14393.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:03:20,070 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.3043, 2.8380, 2.3445, 1.4337, 2.6990, 0.9751, 2.7657, 1.6185], + device='cuda:2'), covar=tensor([0.0865, 0.0227, 0.0459, 0.2332, 0.0310, 0.2284, 0.0231, 0.1868], + device='cuda:2'), in_proj_covar=tensor([0.0116, 0.0074, 0.0073, 0.0113, 0.0077, 0.0118, 0.0065, 0.0113], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0002, 0.0003, 0.0002, 0.0003, 0.0002, 0.0003], + device='cuda:2') +2022-11-15 16:03:23,296 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.146e+02 2.305e+02 2.820e+02 3.726e+02 7.277e+02, threshold=5.640e+02, percent-clipped=2.0 +2022-11-15 16:03:27,741 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=14413.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:03:37,782 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=14428.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:03:47,213 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=14441.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:03:49,388 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.8183, 1.5226, 2.5638, 1.9673, 2.3517, 1.8379, 2.1829, 2.6340], + device='cuda:2'), covar=tensor([0.0053, 0.0395, 0.0092, 0.0310, 0.0143, 0.0279, 0.0224, 0.0125], + device='cuda:2'), in_proj_covar=tensor([0.0092, 0.0165, 0.0112, 0.0171, 0.0100, 0.0149, 0.0155, 0.0117], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0003, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 16:04:00,313 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=14459.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:04:01,832 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.63 vs. limit=5.0 +2022-11-15 16:04:09,795 INFO [train.py:876] (2/4) Epoch 2, batch 7200, loss[loss=0.2346, simple_loss=0.2048, pruned_loss=0.1322, over 5523.00 frames. ], tot_loss[loss=0.2402, simple_loss=0.2179, pruned_loss=0.1313, over 1080903.71 frames. ], batch size: 13, lr: 2.80e-02, grad_scale: 16.0 +2022-11-15 16:04:10,623 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=14474.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:04:11,849 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=14476.0, num_to_drop=1, layers_to_drop={2} +2022-11-15 16:04:19,429 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=14487.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:04:29,621 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.01 vs. limit=2.0 +2022-11-15 16:04:33,139 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.449e+02 2.188e+02 2.926e+02 3.996e+02 7.445e+02, threshold=5.852e+02, percent-clipped=7.0 +2022-11-15 16:05:50,252 INFO [train.py:876] (2/4) Epoch 3, batch 0, loss[loss=0.2385, simple_loss=0.2177, pruned_loss=0.1296, over 5298.00 frames. ], tot_loss[loss=0.2385, simple_loss=0.2177, pruned_loss=0.1296, over 5298.00 frames. ], batch size: 79, lr: 2.66e-02, grad_scale: 16.0 +2022-11-15 16:05:50,252 INFO [train.py:899] (2/4) Computing validation loss +2022-11-15 16:05:59,676 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.8597, 1.4844, 1.6498, 2.0607, 1.4303, 1.1957, 1.2957, 2.2273], + device='cuda:2'), covar=tensor([0.0131, 0.0267, 0.0348, 0.0199, 0.0394, 0.0700, 0.0335, 0.0116], + device='cuda:2'), in_proj_covar=tensor([0.0034, 0.0037, 0.0042, 0.0031, 0.0048, 0.0037, 0.0044, 0.0028], + device='cuda:2'), out_proj_covar=tensor([6.1677e-05, 7.2798e-05, 9.3187e-05, 6.1843e-05, 9.7840e-05, 7.9605e-05, + 8.7051e-05, 5.5923e-05], device='cuda:2') +2022-11-15 16:06:07,523 INFO [train.py:908] (2/4) Epoch 3, validation: loss=0.1917, simple_loss=0.2065, pruned_loss=0.08845, over 1530663.00 frames. +2022-11-15 16:06:07,523 INFO [train.py:909] (2/4) Maximum memory allocated so far is 4573MB +2022-11-15 16:06:09,668 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=14548.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:06:17,499 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.57 vs. limit=5.0 +2022-11-15 16:06:20,561 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.4143, 4.5795, 4.7594, 4.6928, 4.2956, 3.9851, 5.3777, 4.4205], + device='cuda:2'), covar=tensor([0.0336, 0.0680, 0.0275, 0.0462, 0.0405, 0.0245, 0.0466, 0.0274], + device='cuda:2'), in_proj_covar=tensor([0.0051, 0.0072, 0.0062, 0.0070, 0.0051, 0.0043, 0.0081, 0.0052], + device='cuda:2'), out_proj_covar=tensor([1.1499e-04, 1.5899e-04, 1.3780e-04, 1.5559e-04, 1.1355e-04, 9.6604e-05, + 1.9816e-04, 1.1347e-04], device='cuda:2') +2022-11-15 16:06:31,418 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.1303, 4.1452, 3.2843, 1.9676, 4.1499, 1.7607, 3.6941, 2.5565], + device='cuda:2'), covar=tensor([0.0826, 0.0161, 0.0418, 0.2173, 0.0128, 0.1687, 0.0235, 0.1492], + device='cuda:2'), in_proj_covar=tensor([0.0118, 0.0077, 0.0076, 0.0116, 0.0081, 0.0119, 0.0067, 0.0116], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0002, 0.0003, 0.0002, 0.0003, 0.0002, 0.0003], + device='cuda:2') +2022-11-15 16:06:52,130 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.043e+02 2.390e+02 2.831e+02 3.708e+02 1.001e+03, threshold=5.662e+02, percent-clipped=6.0 +2022-11-15 16:07:19,302 INFO [train.py:876] (2/4) Epoch 3, batch 100, loss[loss=0.1594, simple_loss=0.1662, pruned_loss=0.0763, over 5310.00 frames. ], tot_loss[loss=0.2362, simple_loss=0.217, pruned_loss=0.1277, over 429561.03 frames. ], batch size: 9, lr: 2.65e-02, grad_scale: 16.0 +2022-11-15 16:07:34,602 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.19 vs. limit=2.0 +2022-11-15 16:08:03,406 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.336e+02 2.280e+02 2.660e+02 3.506e+02 7.201e+02, threshold=5.320e+02, percent-clipped=1.0 +2022-11-15 16:08:18,646 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=14728.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:08:27,255 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=14740.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:08:30,530 INFO [train.py:876] (2/4) Epoch 3, batch 200, loss[loss=0.1896, simple_loss=0.1905, pruned_loss=0.0944, over 5445.00 frames. ], tot_loss[loss=0.2373, simple_loss=0.2168, pruned_loss=0.1289, over 690003.76 frames. ], batch size: 11, lr: 2.64e-02, grad_scale: 16.0 +2022-11-15 16:08:41,082 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=14759.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:08:41,105 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0994, 1.4087, 1.5132, 1.6740, 1.3825, 1.8261, 1.6817, 1.9907], + device='cuda:2'), covar=tensor([0.0045, 0.0140, 0.0094, 0.0046, 0.0071, 0.0032, 0.0069, 0.0040], + device='cuda:2'), in_proj_covar=tensor([0.0015, 0.0015, 0.0014, 0.0016, 0.0015, 0.0015, 0.0019, 0.0015], + device='cuda:2'), out_proj_covar=tensor([2.1422e-05, 2.1366e-05, 2.1446e-05, 1.9710e-05, 2.1170e-05, 1.9090e-05, + 2.5068e-05, 2.0940e-05], device='cuda:2') +2022-11-15 16:08:48,055 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=14769.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:08:50,800 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.90 vs. limit=2.0 +2022-11-15 16:08:53,272 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=14776.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:08:53,359 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=14776.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 16:08:56,279 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.0680, 0.8430, 0.9718, 0.8750, 1.5322, 0.5522, 1.4527, 1.6143], + device='cuda:2'), covar=tensor([0.1585, 0.0552, 0.1215, 0.0976, 0.0612, 0.1877, 0.0371, 0.0302], + device='cuda:2'), in_proj_covar=tensor([0.0008, 0.0010, 0.0009, 0.0009, 0.0008, 0.0009, 0.0009, 0.0008], + device='cuda:2'), out_proj_covar=tensor([2.0772e-05, 2.2440e-05, 2.2772e-05, 2.4328e-05, 2.0303e-05, 2.0464e-05, + 2.4261e-05, 1.9393e-05], device='cuda:2') +2022-11-15 16:09:11,189 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=14801.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:09:15,916 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.042e+02 2.295e+02 2.977e+02 3.816e+02 8.125e+02, threshold=5.953e+02, percent-clipped=6.0 +2022-11-15 16:09:16,007 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=14807.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:09:27,546 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=14824.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 16:09:31,793 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=6.01 vs. limit=5.0 +2022-11-15 16:09:41,492 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=14843.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:09:42,822 INFO [train.py:876] (2/4) Epoch 3, batch 300, loss[loss=0.2188, simple_loss=0.217, pruned_loss=0.1103, over 5708.00 frames. ], tot_loss[loss=0.2323, simple_loss=0.2137, pruned_loss=0.1254, over 850334.04 frames. ], batch size: 15, lr: 2.63e-02, grad_scale: 16.0 +2022-11-15 16:10:08,924 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.98 vs. limit=2.0 +2022-11-15 16:10:27,143 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.433e+02 2.202e+02 2.979e+02 3.584e+02 7.564e+02, threshold=5.959e+02, percent-clipped=7.0 +2022-11-15 16:10:55,019 INFO [train.py:876] (2/4) Epoch 3, batch 400, loss[loss=0.1558, simple_loss=0.1645, pruned_loss=0.07351, over 5751.00 frames. ], tot_loss[loss=0.2352, simple_loss=0.2158, pruned_loss=0.1273, over 940879.60 frames. ], batch size: 13, lr: 2.62e-02, grad_scale: 16.0 +2022-11-15 16:11:43,138 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.230e+02 2.415e+02 2.867e+02 3.380e+02 8.857e+02, threshold=5.735e+02, percent-clipped=3.0 +2022-11-15 16:11:46,929 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.1190, 1.8181, 3.9967, 2.7864, 4.0066, 2.5652, 3.6175, 3.9237], + device='cuda:2'), covar=tensor([0.0036, 0.0499, 0.0068, 0.0472, 0.0049, 0.0336, 0.0171, 0.0096], + device='cuda:2'), in_proj_covar=tensor([0.0093, 0.0166, 0.0115, 0.0174, 0.0101, 0.0151, 0.0160, 0.0122], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 16:11:55,560 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.6115, 1.6194, 1.5537, 1.7136, 1.7599, 1.4789, 1.3276, 1.9136], + device='cuda:2'), covar=tensor([0.0029, 0.0063, 0.0064, 0.0035, 0.0026, 0.0036, 0.0078, 0.0050], + device='cuda:2'), in_proj_covar=tensor([0.0015, 0.0014, 0.0013, 0.0016, 0.0014, 0.0015, 0.0018, 0.0014], + device='cuda:2'), out_proj_covar=tensor([2.1385e-05, 2.0745e-05, 2.0550e-05, 1.9548e-05, 1.9866e-05, 1.8771e-05, + 2.5187e-05, 1.9557e-05], device='cuda:2') +2022-11-15 16:12:05,174 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.43 vs. limit=2.0 +2022-11-15 16:12:10,978 INFO [train.py:876] (2/4) Epoch 3, batch 500, loss[loss=0.1461, simple_loss=0.1502, pruned_loss=0.07098, over 5341.00 frames. ], tot_loss[loss=0.2357, simple_loss=0.2155, pruned_loss=0.128, over 990884.37 frames. ], batch size: 9, lr: 2.62e-02, grad_scale: 16.0 +2022-11-15 16:12:27,960 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=15069.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:12:41,417 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.8728, 1.3006, 1.5739, 1.2230, 0.4293, 1.6857, 0.9802, 0.8339], + device='cuda:2'), covar=tensor([0.0225, 0.0186, 0.0117, 0.0202, 0.0472, 0.0264, 0.0528, 0.0314], + device='cuda:2'), in_proj_covar=tensor([0.0030, 0.0025, 0.0026, 0.0030, 0.0027, 0.0022, 0.0025, 0.0028], + device='cuda:2'), out_proj_covar=tensor([4.9186e-05, 3.4334e-05, 3.8802e-05, 4.7566e-05, 4.6643e-05, 3.8032e-05, + 4.0184e-05, 4.5221e-05], device='cuda:2') +2022-11-15 16:12:48,136 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=15096.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:12:55,531 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.456e+02 2.220e+02 2.761e+02 3.566e+02 7.983e+02, threshold=5.522e+02, percent-clipped=3.0 +2022-11-15 16:13:02,513 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=15117.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:13:02,880 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.95 vs. limit=2.0 +2022-11-15 16:13:13,821 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.07 vs. limit=2.0 +2022-11-15 16:13:21,495 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=15143.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:13:22,749 INFO [train.py:876] (2/4) Epoch 3, batch 600, loss[loss=0.2338, simple_loss=0.2194, pruned_loss=0.1241, over 5749.00 frames. ], tot_loss[loss=0.235, simple_loss=0.2155, pruned_loss=0.1272, over 1031776.43 frames. ], batch size: 27, lr: 2.61e-02, grad_scale: 16.0 +2022-11-15 16:13:53,414 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.2521, 1.9370, 3.9493, 2.4947, 3.7931, 2.5935, 3.6217, 3.8028], + device='cuda:2'), covar=tensor([0.0031, 0.0371, 0.0066, 0.0358, 0.0059, 0.0278, 0.0143, 0.0102], + device='cuda:2'), in_proj_covar=tensor([0.0093, 0.0162, 0.0114, 0.0174, 0.0104, 0.0153, 0.0163, 0.0127], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 16:13:55,648 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=15191.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:13:58,624 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=15195.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:14:07,258 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.247e+02 2.220e+02 2.668e+02 3.263e+02 7.749e+02, threshold=5.336e+02, percent-clipped=2.0 +2022-11-15 16:14:33,896 INFO [train.py:876] (2/4) Epoch 3, batch 700, loss[loss=0.2622, simple_loss=0.2476, pruned_loss=0.1384, over 5787.00 frames. ], tot_loss[loss=0.2331, simple_loss=0.2149, pruned_loss=0.1256, over 1058861.91 frames. ], batch size: 21, lr: 2.60e-02, grad_scale: 16.0 +2022-11-15 16:14:42,119 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=15256.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 16:15:14,134 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.74 vs. limit=5.0 +2022-11-15 16:15:18,348 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.235e+02 2.381e+02 2.880e+02 4.091e+02 8.657e+02, threshold=5.760e+02, percent-clipped=7.0 +2022-11-15 16:15:21,014 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.98 vs. limit=2.0 +2022-11-15 16:15:37,707 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.49 vs. limit=2.0 +2022-11-15 16:15:42,419 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.9807, 3.4240, 2.5214, 2.5078, 2.0308, 3.0376, 2.1431, 2.6722], + device='cuda:2'), covar=tensor([0.0242, 0.0057, 0.0153, 0.0162, 0.0232, 0.0063, 0.0166, 0.0055], + device='cuda:2'), in_proj_covar=tensor([0.0121, 0.0063, 0.0083, 0.0078, 0.0123, 0.0076, 0.0103, 0.0067], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0001, 0.0001, 0.0001, 0.0002, 0.0001, 0.0002, 0.0001], + device='cuda:2') +2022-11-15 16:15:44,960 INFO [train.py:876] (2/4) Epoch 3, batch 800, loss[loss=0.232, simple_loss=0.2175, pruned_loss=0.1232, over 5606.00 frames. ], tot_loss[loss=0.2305, simple_loss=0.2125, pruned_loss=0.1243, over 1066065.02 frames. ], batch size: 18, lr: 2.59e-02, grad_scale: 16.0 +2022-11-15 16:15:54,721 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.5019, 2.4644, 2.2454, 1.2183, 2.3681, 3.0093, 2.3901, 3.1219], + device='cuda:2'), covar=tensor([0.0970, 0.0399, 0.0421, 0.0930, 0.0138, 0.0138, 0.0125, 0.0096], + device='cuda:2'), in_proj_covar=tensor([0.0155, 0.0133, 0.0101, 0.0155, 0.0098, 0.0086, 0.0085, 0.0094], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0001, 0.0001, 0.0001], + device='cuda:2') +2022-11-15 16:15:56,722 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.0691, 4.0752, 4.0620, 3.5229, 4.1641, 3.9605, 1.4026, 3.9484], + device='cuda:2'), covar=tensor([0.0246, 0.0199, 0.0187, 0.0304, 0.0198, 0.0189, 0.2834, 0.0259], + device='cuda:2'), in_proj_covar=tensor([0.0076, 0.0059, 0.0059, 0.0047, 0.0069, 0.0049, 0.0113, 0.0073], + device='cuda:2'), out_proj_covar=tensor([1.5652e-04, 1.1612e-04, 1.1831e-04, 9.5484e-05, 1.4008e-04, 1.0218e-04, + 2.0762e-04, 1.4422e-04], device='cuda:2') +2022-11-15 16:16:08,440 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.08 vs. limit=2.0 +2022-11-15 16:16:21,721 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=15396.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:16:30,195 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.130e+02 2.163e+02 2.793e+02 3.391e+02 6.505e+02, threshold=5.586e+02, percent-clipped=1.0 +2022-11-15 16:16:56,552 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=15444.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:16:57,168 INFO [train.py:876] (2/4) Epoch 3, batch 900, loss[loss=0.2414, simple_loss=0.2235, pruned_loss=0.1297, over 5740.00 frames. ], tot_loss[loss=0.2296, simple_loss=0.2119, pruned_loss=0.1236, over 1066349.87 frames. ], batch size: 27, lr: 2.59e-02, grad_scale: 16.0 +2022-11-15 16:17:24,046 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.00 vs. limit=5.0 +2022-11-15 16:17:41,563 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.169e+02 2.326e+02 2.765e+02 3.451e+02 5.869e+02, threshold=5.530e+02, percent-clipped=2.0 +2022-11-15 16:18:09,248 INFO [train.py:876] (2/4) Epoch 3, batch 1000, loss[loss=0.2794, simple_loss=0.2431, pruned_loss=0.1579, over 5510.00 frames. ], tot_loss[loss=0.2302, simple_loss=0.2126, pruned_loss=0.1239, over 1075665.17 frames. ], batch size: 53, lr: 2.58e-02, grad_scale: 16.0 +2022-11-15 16:18:13,441 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=15551.0, num_to_drop=1, layers_to_drop={3} +2022-11-15 16:18:34,258 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.19 vs. limit=2.0 +2022-11-15 16:18:53,835 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.311e+02 2.148e+02 2.818e+02 3.595e+02 6.939e+02, threshold=5.636e+02, percent-clipped=2.0 +2022-11-15 16:19:06,662 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.87 vs. limit=2.0 +2022-11-15 16:19:19,925 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.15 vs. limit=2.0 +2022-11-15 16:19:20,895 INFO [train.py:876] (2/4) Epoch 3, batch 1100, loss[loss=0.2148, simple_loss=0.2198, pruned_loss=0.1049, over 5520.00 frames. ], tot_loss[loss=0.2292, simple_loss=0.2122, pruned_loss=0.1231, over 1080891.60 frames. ], batch size: 14, lr: 2.57e-02, grad_scale: 32.0 +2022-11-15 16:20:05,750 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.351e+02 2.101e+02 2.719e+02 3.370e+02 6.510e+02, threshold=5.438e+02, percent-clipped=5.0 +2022-11-15 16:20:18,819 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.27 vs. limit=5.0 +2022-11-15 16:20:32,642 INFO [train.py:876] (2/4) Epoch 3, batch 1200, loss[loss=0.3006, simple_loss=0.2578, pruned_loss=0.1717, over 5430.00 frames. ], tot_loss[loss=0.2297, simple_loss=0.2127, pruned_loss=0.1233, over 1081010.57 frames. ], batch size: 58, lr: 2.56e-02, grad_scale: 16.0 +2022-11-15 16:20:35,673 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.61 vs. limit=2.0 +2022-11-15 16:21:04,352 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.7681, 4.1431, 3.1447, 2.0609, 3.9297, 1.6236, 3.9216, 2.4385], + device='cuda:2'), covar=tensor([0.0768, 0.0107, 0.0383, 0.1485, 0.0125, 0.1428, 0.0097, 0.1246], + device='cuda:2'), in_proj_covar=tensor([0.0116, 0.0081, 0.0078, 0.0112, 0.0080, 0.0122, 0.0069, 0.0117], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0002, 0.0003, 0.0002, 0.0003, 0.0002, 0.0003], + device='cuda:2') +2022-11-15 16:21:08,500 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.57 vs. limit=2.0 +2022-11-15 16:21:17,792 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.458e+02 2.202e+02 2.751e+02 3.254e+02 5.977e+02, threshold=5.502e+02, percent-clipped=4.0 +2022-11-15 16:21:37,557 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.4458, 1.5582, 1.1199, 1.3122, 1.8154, 1.4223, 1.2284, 1.6836], + device='cuda:2'), covar=tensor([0.0067, 0.0124, 0.0100, 0.0046, 0.0031, 0.0047, 0.0062, 0.0044], + device='cuda:2'), in_proj_covar=tensor([0.0015, 0.0014, 0.0014, 0.0016, 0.0014, 0.0015, 0.0017, 0.0014], + device='cuda:2'), out_proj_covar=tensor([2.1135e-05, 2.0392e-05, 2.0303e-05, 2.0346e-05, 1.9174e-05, 1.8307e-05, + 2.2373e-05, 1.8786e-05], device='cuda:2') +2022-11-15 16:21:43,807 INFO [train.py:876] (2/4) Epoch 3, batch 1300, loss[loss=0.172, simple_loss=0.185, pruned_loss=0.07949, over 5081.00 frames. ], tot_loss[loss=0.2293, simple_loss=0.2126, pruned_loss=0.123, over 1082800.66 frames. ], batch size: 7, lr: 2.56e-02, grad_scale: 16.0 +2022-11-15 16:21:48,895 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=15851.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:22:11,345 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=15883.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:22:16,639 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.5486, 4.8388, 5.3843, 4.9656, 5.5534, 5.4132, 4.6651, 5.4370], + device='cuda:2'), covar=tensor([0.0232, 0.0243, 0.0328, 0.0204, 0.0209, 0.0060, 0.0158, 0.0172], + device='cuda:2'), in_proj_covar=tensor([0.0076, 0.0082, 0.0069, 0.0083, 0.0078, 0.0050, 0.0069, 0.0070], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 16:22:22,432 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=15899.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:22:29,933 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.357e+02 2.010e+02 2.682e+02 3.501e+02 1.988e+03, threshold=5.365e+02, percent-clipped=5.0 +2022-11-15 16:22:54,781 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=15944.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:22:55,285 INFO [train.py:876] (2/4) Epoch 3, batch 1400, loss[loss=0.2179, simple_loss=0.2077, pruned_loss=0.1141, over 5613.00 frames. ], tot_loss[loss=0.2293, simple_loss=0.2122, pruned_loss=0.1232, over 1083172.99 frames. ], batch size: 38, lr: 2.55e-02, grad_scale: 8.0 +2022-11-15 16:23:18,110 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.3170, 4.4552, 3.3027, 2.0275, 4.2436, 1.5385, 4.2675, 2.6621], + device='cuda:2'), covar=tensor([0.0804, 0.0113, 0.0404, 0.1843, 0.0137, 0.1862, 0.0164, 0.1425], + device='cuda:2'), in_proj_covar=tensor([0.0119, 0.0081, 0.0077, 0.0113, 0.0082, 0.0124, 0.0070, 0.0116], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0002, 0.0003, 0.0002, 0.0003, 0.0002, 0.0003], + device='cuda:2') +2022-11-15 16:23:23,392 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.5906, 4.1866, 3.6230, 4.2635, 4.2404, 3.4801, 3.8581, 3.2676], + device='cuda:2'), covar=tensor([0.0446, 0.0332, 0.0792, 0.0290, 0.0317, 0.0502, 0.0259, 0.0485], + device='cuda:2'), in_proj_covar=tensor([0.0076, 0.0084, 0.0118, 0.0086, 0.0108, 0.0102, 0.0089, 0.0081], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001], + device='cuda:2') +2022-11-15 16:23:36,099 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=16001.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:23:42,150 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.208e+02 2.137e+02 2.778e+02 3.586e+02 7.557e+02, threshold=5.556e+02, percent-clipped=2.0 +2022-11-15 16:24:07,055 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.04 vs. limit=2.0 +2022-11-15 16:24:07,309 INFO [train.py:876] (2/4) Epoch 3, batch 1500, loss[loss=0.1424, simple_loss=0.1562, pruned_loss=0.06427, over 5718.00 frames. ], tot_loss[loss=0.2269, simple_loss=0.2105, pruned_loss=0.1216, over 1081124.26 frames. ], batch size: 13, lr: 2.54e-02, grad_scale: 8.0 +2022-11-15 16:24:19,082 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=16062.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:24:36,019 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=16085.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:24:52,645 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.193e+02 2.433e+02 2.944e+02 3.861e+02 5.407e+02, threshold=5.888e+02, percent-clipped=0.0 +2022-11-15 16:25:18,690 INFO [train.py:876] (2/4) Epoch 3, batch 1600, loss[loss=0.2215, simple_loss=0.22, pruned_loss=0.1115, over 5733.00 frames. ], tot_loss[loss=0.2279, simple_loss=0.212, pruned_loss=0.1219, over 1086685.11 frames. ], batch size: 20, lr: 2.53e-02, grad_scale: 8.0 +2022-11-15 16:25:19,555 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=16146.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:25:21,623 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=16149.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:25:27,534 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.4071, 1.3413, 1.7366, 1.8614, 0.3604, 1.7060, 1.0540, 1.3547], + device='cuda:2'), covar=tensor([0.0165, 0.0186, 0.0063, 0.0379, 0.0774, 0.3761, 0.0212, 0.0299], + device='cuda:2'), in_proj_covar=tensor([0.0029, 0.0027, 0.0025, 0.0030, 0.0028, 0.0024, 0.0026, 0.0029], + device='cuda:2'), out_proj_covar=tensor([4.8642e-05, 3.7781e-05, 3.7650e-05, 4.7437e-05, 4.8947e-05, 4.1641e-05, + 4.2268e-05, 4.7417e-05], device='cuda:2') +2022-11-15 16:25:46,500 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.8515, 1.9030, 1.3945, 2.3542, 1.4526, 1.6341, 1.5526, 2.2028], + device='cuda:2'), covar=tensor([0.0257, 0.0266, 0.0550, 0.0382, 0.0492, 0.0372, 0.0479, 0.1169], + device='cuda:2'), in_proj_covar=tensor([0.0037, 0.0039, 0.0044, 0.0034, 0.0048, 0.0040, 0.0047, 0.0034], + device='cuda:2'), out_proj_covar=tensor([7.1751e-05, 7.8796e-05, 1.0522e-04, 7.0815e-05, 1.0334e-04, 9.0479e-05, + 9.5779e-05, 6.8216e-05], device='cuda:2') +2022-11-15 16:26:01,051 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0159, 3.3804, 2.8364, 2.7107, 1.9175, 3.3278, 2.1428, 2.8960], + device='cuda:2'), covar=tensor([0.0189, 0.0049, 0.0072, 0.0170, 0.0209, 0.0036, 0.0148, 0.0032], + device='cuda:2'), in_proj_covar=tensor([0.0124, 0.0066, 0.0086, 0.0082, 0.0123, 0.0077, 0.0104, 0.0072], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0001, 0.0001, 0.0001, 0.0002, 0.0001, 0.0002, 0.0001], + device='cuda:2') +2022-11-15 16:26:04,923 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.227e+02 2.236e+02 2.729e+02 3.361e+02 8.285e+02, threshold=5.459e+02, percent-clipped=7.0 +2022-11-15 16:26:05,766 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=16210.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:26:12,844 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0115, 3.9618, 2.8935, 3.7936, 3.1650, 2.7697, 1.9098, 3.5305], + device='cuda:2'), covar=tensor([0.2126, 0.0225, 0.1086, 0.0338, 0.0506, 0.1095, 0.2805, 0.0238], + device='cuda:2'), in_proj_covar=tensor([0.0170, 0.0101, 0.0149, 0.0097, 0.0126, 0.0159, 0.0181, 0.0101], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0001, 0.0002, 0.0001, 0.0002, 0.0002, 0.0002, 0.0001], + device='cuda:2') +2022-11-15 16:26:26,178 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=16239.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:26:30,135 INFO [train.py:876] (2/4) Epoch 3, batch 1700, loss[loss=0.2593, simple_loss=0.234, pruned_loss=0.1423, over 5563.00 frames. ], tot_loss[loss=0.2271, simple_loss=0.2114, pruned_loss=0.1214, over 1083847.06 frames. ], batch size: 40, lr: 2.53e-02, grad_scale: 8.0 +2022-11-15 16:26:32,135 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.03 vs. limit=2.0 +2022-11-15 16:27:09,115 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.3884, 1.6846, 3.2479, 2.3771, 3.3998, 2.0980, 2.9770, 3.5470], + device='cuda:2'), covar=tensor([0.0054, 0.0436, 0.0109, 0.0342, 0.0077, 0.0355, 0.0220, 0.0119], + device='cuda:2'), in_proj_covar=tensor([0.0096, 0.0165, 0.0120, 0.0175, 0.0109, 0.0156, 0.0167, 0.0125], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 16:27:15,062 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.295e+02 2.221e+02 2.797e+02 3.532e+02 1.022e+03, threshold=5.594e+02, percent-clipped=2.0 +2022-11-15 16:27:41,524 INFO [train.py:876] (2/4) Epoch 3, batch 1800, loss[loss=0.2341, simple_loss=0.2109, pruned_loss=0.1286, over 5542.00 frames. ], tot_loss[loss=0.2273, simple_loss=0.2112, pruned_loss=0.1217, over 1084502.27 frames. ], batch size: 25, lr: 2.52e-02, grad_scale: 8.0 +2022-11-15 16:27:49,682 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=16357.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:27:59,860 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=16371.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:28:08,247 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.7614, 2.8301, 2.4211, 2.7636, 1.7018, 2.3055, 1.7250, 2.4520], + device='cuda:2'), covar=tensor([0.1317, 0.0164, 0.0541, 0.0207, 0.0656, 0.0706, 0.1362, 0.0209], + device='cuda:2'), in_proj_covar=tensor([0.0172, 0.0102, 0.0147, 0.0097, 0.0127, 0.0161, 0.0181, 0.0100], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0001, 0.0002, 0.0001, 0.0002, 0.0002, 0.0002, 0.0001], + device='cuda:2') +2022-11-15 16:28:26,562 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.507e+02 2.508e+02 2.970e+02 3.858e+02 6.298e+02, threshold=5.940e+02, percent-clipped=3.0 +2022-11-15 16:28:33,546 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.9786, 3.6108, 3.9068, 3.5208, 4.0629, 3.4581, 3.6404, 3.9519], + device='cuda:2'), covar=tensor([0.0342, 0.0251, 0.0328, 0.0309, 0.0283, 0.0279, 0.0248, 0.0265], + device='cuda:2'), in_proj_covar=tensor([0.0076, 0.0083, 0.0067, 0.0084, 0.0078, 0.0052, 0.0067, 0.0071], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0001, 0.0002, 0.0002, 0.0001, 0.0001, 0.0002], + device='cuda:2') +2022-11-15 16:28:42,863 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=16432.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:28:49,196 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=16441.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:28:51,930 INFO [train.py:876] (2/4) Epoch 3, batch 1900, loss[loss=0.2334, simple_loss=0.213, pruned_loss=0.1269, over 5634.00 frames. ], tot_loss[loss=0.2291, simple_loss=0.2122, pruned_loss=0.123, over 1089863.19 frames. ], batch size: 32, lr: 2.51e-02, grad_scale: 8.0 +2022-11-15 16:29:00,399 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.9082, 4.1920, 3.8365, 4.3122, 3.5281, 2.7662, 4.5968, 3.7307], + device='cuda:2'), covar=tensor([0.0416, 0.0790, 0.0498, 0.0521, 0.0485, 0.0589, 0.0793, 0.0428], + device='cuda:2'), in_proj_covar=tensor([0.0053, 0.0076, 0.0064, 0.0071, 0.0053, 0.0044, 0.0084, 0.0058], + device='cuda:2'), out_proj_covar=tensor([1.1854e-04, 1.7329e-04, 1.4633e-04, 1.5677e-04, 1.2080e-04, 9.9468e-05, + 2.1031e-04, 1.2856e-04], device='cuda:2') +2022-11-15 16:29:13,764 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.21 vs. limit=2.0 +2022-11-15 16:29:34,095 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=16505.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:29:37,425 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.237e+02 2.066e+02 2.615e+02 3.411e+02 7.424e+02, threshold=5.231e+02, percent-clipped=4.0 +2022-11-15 16:29:40,365 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.9985, 1.1735, 1.1974, 0.7236, 1.3431, 1.7505, 0.6135, 1.0443], + device='cuda:2'), covar=tensor([0.0033, 0.0031, 0.0043, 0.0025, 0.0026, 0.0031, 0.0087, 0.0029], + device='cuda:2'), in_proj_covar=tensor([0.0020, 0.0020, 0.0020, 0.0020, 0.0021, 0.0017, 0.0022, 0.0018], + device='cuda:2'), out_proj_covar=tensor([2.6704e-05, 2.9333e-05, 2.6688e-05, 2.3710e-05, 2.4165e-05, 2.1845e-05, + 3.8036e-05, 2.3118e-05], device='cuda:2') +2022-11-15 16:29:58,064 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=16539.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:30:02,346 INFO [train.py:876] (2/4) Epoch 3, batch 2000, loss[loss=0.2346, simple_loss=0.2214, pruned_loss=0.1239, over 5607.00 frames. ], tot_loss[loss=0.2288, simple_loss=0.2116, pruned_loss=0.123, over 1082408.97 frames. ], batch size: 43, lr: 2.51e-02, grad_scale: 8.0 +2022-11-15 16:30:28,130 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.88 vs. limit=2.0 +2022-11-15 16:30:32,355 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=16587.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:30:45,418 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.94 vs. limit=2.0 +2022-11-15 16:30:48,526 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.413e+02 2.235e+02 2.805e+02 3.719e+02 8.798e+02, threshold=5.609e+02, percent-clipped=4.0 +2022-11-15 16:31:14,090 INFO [train.py:876] (2/4) Epoch 3, batch 2100, loss[loss=0.2133, simple_loss=0.2173, pruned_loss=0.1047, over 5732.00 frames. ], tot_loss[loss=0.2272, simple_loss=0.2112, pruned_loss=0.1216, over 1084446.87 frames. ], batch size: 17, lr: 2.50e-02, grad_scale: 8.0 +2022-11-15 16:31:22,846 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=16657.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:31:48,452 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.81 vs. limit=2.0 +2022-11-15 16:31:56,395 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=16705.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:31:59,088 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.519e+01 2.019e+02 2.564e+02 3.147e+02 5.259e+02, threshold=5.129e+02, percent-clipped=0.0 +2022-11-15 16:32:12,149 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=16727.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:32:22,623 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=16741.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:32:25,234 INFO [train.py:876] (2/4) Epoch 3, batch 2200, loss[loss=0.2064, simple_loss=0.2066, pruned_loss=0.1031, over 5764.00 frames. ], tot_loss[loss=0.2271, simple_loss=0.2115, pruned_loss=0.1213, over 1081999.11 frames. ], batch size: 31, lr: 2.49e-02, grad_scale: 8.0 +2022-11-15 16:32:41,369 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.43 vs. limit=2.0 +2022-11-15 16:32:43,809 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.8382, 4.4391, 4.8072, 4.4445, 4.9014, 4.7281, 4.2850, 4.9413], + device='cuda:2'), covar=tensor([0.0382, 0.0210, 0.0340, 0.0209, 0.0388, 0.0111, 0.0180, 0.0184], + device='cuda:2'), in_proj_covar=tensor([0.0080, 0.0086, 0.0069, 0.0087, 0.0082, 0.0054, 0.0068, 0.0075], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 16:32:56,716 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=16789.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:33:07,906 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=16805.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:33:10,449 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.364e+02 2.121e+02 2.774e+02 3.414e+02 1.004e+03, threshold=5.548e+02, percent-clipped=7.0 +2022-11-15 16:33:35,561 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.16 vs. limit=5.0 +2022-11-15 16:33:36,644 INFO [train.py:876] (2/4) Epoch 3, batch 2300, loss[loss=0.262, simple_loss=0.2246, pruned_loss=0.1497, over 5569.00 frames. ], tot_loss[loss=0.2284, simple_loss=0.212, pruned_loss=0.1224, over 1085007.26 frames. ], batch size: 43, lr: 2.49e-02, grad_scale: 8.0 +2022-11-15 16:33:42,403 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=16853.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:33:53,225 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=16868.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 16:34:22,209 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=16906.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:34:24,020 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.385e+02 2.301e+02 2.850e+02 3.623e+02 1.087e+03, threshold=5.699e+02, percent-clipped=3.0 +2022-11-15 16:34:38,436 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=16929.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 16:34:40,866 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1959, 4.5809, 3.5696, 4.2470, 3.6223, 3.1396, 2.2192, 4.0488], + device='cuda:2'), covar=tensor([0.2141, 0.0120, 0.0803, 0.0296, 0.0469, 0.1138, 0.2448, 0.0181], + device='cuda:2'), in_proj_covar=tensor([0.0187, 0.0113, 0.0163, 0.0110, 0.0138, 0.0176, 0.0195, 0.0113], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 16:34:50,017 INFO [train.py:876] (2/4) Epoch 3, batch 2400, loss[loss=0.3049, simple_loss=0.2532, pruned_loss=0.1783, over 5407.00 frames. ], tot_loss[loss=0.2243, simple_loss=0.2093, pruned_loss=0.1197, over 1087784.81 frames. ], batch size: 70, lr: 2.48e-02, grad_scale: 8.0 +2022-11-15 16:35:05,717 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=16967.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 16:35:23,016 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.9095, 3.6984, 2.9261, 3.6926, 2.7543, 2.6254, 1.8561, 3.3341], + device='cuda:2'), covar=tensor([0.1873, 0.0169, 0.0859, 0.0161, 0.0662, 0.1070, 0.2399, 0.0230], + device='cuda:2'), in_proj_covar=tensor([0.0184, 0.0112, 0.0162, 0.0107, 0.0137, 0.0175, 0.0193, 0.0113], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 16:35:36,096 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.257e+02 2.211e+02 2.830e+02 3.499e+02 8.414e+02, threshold=5.661e+02, percent-clipped=2.0 +2022-11-15 16:35:48,475 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=17027.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:36:01,858 INFO [train.py:876] (2/4) Epoch 3, batch 2500, loss[loss=0.2333, simple_loss=0.2063, pruned_loss=0.1301, over 5454.00 frames. ], tot_loss[loss=0.2279, simple_loss=0.2115, pruned_loss=0.1221, over 1088119.34 frames. ], batch size: 11, lr: 2.47e-02, grad_scale: 8.0 +2022-11-15 16:36:18,832 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.8150, 2.7960, 2.4624, 2.5066, 1.6516, 2.4836, 1.7987, 2.3781], + device='cuda:2'), covar=tensor([0.0215, 0.0049, 0.0080, 0.0099, 0.0196, 0.0058, 0.0152, 0.0045], + device='cuda:2'), in_proj_covar=tensor([0.0131, 0.0072, 0.0091, 0.0089, 0.0129, 0.0086, 0.0110, 0.0073], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0001, 0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0001], + device='cuda:2') +2022-11-15 16:36:22,799 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=17075.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:36:47,307 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.241e+02 2.290e+02 2.908e+02 3.668e+02 6.942e+02, threshold=5.815e+02, percent-clipped=2.0 +2022-11-15 16:36:48,859 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.9005, 5.2043, 4.0231, 5.0604, 3.9138, 3.8475, 2.8335, 4.5023], + device='cuda:2'), covar=tensor([0.1201, 0.0074, 0.0518, 0.0133, 0.0279, 0.0505, 0.1571, 0.0121], + device='cuda:2'), in_proj_covar=tensor([0.0181, 0.0109, 0.0160, 0.0108, 0.0134, 0.0170, 0.0188, 0.0112], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 16:36:51,582 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.2556, 4.4327, 4.4566, 4.6215, 4.2286, 3.9013, 5.0059, 4.0908], + device='cuda:2'), covar=tensor([0.0557, 0.0806, 0.0462, 0.0582, 0.0397, 0.0262, 0.0656, 0.0433], + device='cuda:2'), in_proj_covar=tensor([0.0053, 0.0076, 0.0065, 0.0073, 0.0053, 0.0044, 0.0086, 0.0057], + device='cuda:2'), out_proj_covar=tensor([1.1984e-04, 1.7269e-04, 1.4949e-04, 1.6286e-04, 1.1950e-04, 9.9616e-05, + 2.1763e-04, 1.2695e-04], device='cuda:2') +2022-11-15 16:37:05,981 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.5260, 1.7397, 0.9459, 1.2614, 0.9596, 1.0811, 1.1881, 1.6592], + device='cuda:2'), covar=tensor([0.0251, 0.0474, 0.0909, 0.1030, 0.1014, 0.0770, 0.0560, 0.1641], + device='cuda:2'), in_proj_covar=tensor([0.0036, 0.0041, 0.0045, 0.0032, 0.0050, 0.0040, 0.0047, 0.0034], + device='cuda:2'), out_proj_covar=tensor([7.0756e-05, 8.5237e-05, 1.0682e-04, 7.1133e-05, 1.0898e-04, 9.2107e-05, + 9.8316e-05, 7.2883e-05], device='cuda:2') +2022-11-15 16:37:12,392 INFO [train.py:876] (2/4) Epoch 3, batch 2600, loss[loss=0.2104, simple_loss=0.2049, pruned_loss=0.1079, over 5571.00 frames. ], tot_loss[loss=0.226, simple_loss=0.2101, pruned_loss=0.1209, over 1090285.54 frames. ], batch size: 16, lr: 2.47e-02, grad_scale: 8.0 +2022-11-15 16:37:28,359 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.1440, 3.1079, 2.7240, 3.1013, 3.1320, 2.6896, 2.7281, 2.4703], + device='cuda:2'), covar=tensor([0.0185, 0.0344, 0.0687, 0.0282, 0.0287, 0.0415, 0.0405, 0.0587], + device='cuda:2'), in_proj_covar=tensor([0.0079, 0.0093, 0.0131, 0.0087, 0.0117, 0.0106, 0.0097, 0.0088], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 16:37:36,457 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.7124, 0.9254, 1.2544, 0.8621, 0.2921, 1.1020, 0.7639, 0.6472], + device='cuda:2'), covar=tensor([0.0112, 0.0097, 0.0053, 0.0102, 0.0271, 0.0023, 0.0141, 0.0176], + device='cuda:2'), in_proj_covar=tensor([0.0027, 0.0026, 0.0026, 0.0028, 0.0026, 0.0023, 0.0023, 0.0026], + device='cuda:2'), out_proj_covar=tensor([4.4176e-05, 3.8321e-05, 3.8859e-05, 4.4082e-05, 4.5233e-05, 4.0434e-05, + 3.9236e-05, 4.3734e-05], device='cuda:2') +2022-11-15 16:37:41,711 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.5326, 5.0393, 3.9312, 4.8353, 4.8573, 4.6664, 4.6854, 4.3114], + device='cuda:2'), covar=tensor([0.0199, 0.0382, 0.1176, 0.0522, 0.0582, 0.0380, 0.0280, 0.0412], + device='cuda:2'), in_proj_covar=tensor([0.0078, 0.0092, 0.0130, 0.0087, 0.0117, 0.0104, 0.0096, 0.0086], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 16:37:43,076 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=17189.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:37:51,136 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.01 vs. limit=2.0 +2022-11-15 16:37:57,337 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.374e+02 2.163e+02 2.684e+02 3.226e+02 5.923e+02, threshold=5.368e+02, percent-clipped=1.0 +2022-11-15 16:38:08,598 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=17224.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 16:38:19,558 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.9352, 2.0331, 3.4983, 2.6296, 3.7653, 2.5337, 3.3221, 3.8128], + device='cuda:2'), covar=tensor([0.0054, 0.0456, 0.0104, 0.0410, 0.0053, 0.0330, 0.0198, 0.0095], + device='cuda:2'), in_proj_covar=tensor([0.0102, 0.0167, 0.0122, 0.0180, 0.0111, 0.0160, 0.0173, 0.0132], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 16:38:22,715 INFO [train.py:876] (2/4) Epoch 3, batch 2700, loss[loss=0.1698, simple_loss=0.1673, pruned_loss=0.08618, over 5708.00 frames. ], tot_loss[loss=0.224, simple_loss=0.209, pruned_loss=0.1195, over 1087625.83 frames. ], batch size: 11, lr: 2.46e-02, grad_scale: 8.0 +2022-11-15 16:38:26,437 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=17250.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:38:33,518 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.47 vs. limit=2.0 +2022-11-15 16:38:34,856 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=17262.0, num_to_drop=1, layers_to_drop={3} +2022-11-15 16:39:07,891 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.169e+02 2.430e+02 3.020e+02 3.987e+02 6.630e+02, threshold=6.040e+02, percent-clipped=4.0 +2022-11-15 16:39:25,740 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.7042, 1.0364, 1.5233, 1.2809, 0.1762, 1.3566, 1.1157, 0.6130], + device='cuda:2'), covar=tensor([0.0388, 0.0143, 0.0195, 0.0209, 0.0884, 0.0423, 0.0447, 0.0434], + device='cuda:2'), in_proj_covar=tensor([0.0028, 0.0027, 0.0027, 0.0029, 0.0027, 0.0024, 0.0023, 0.0027], + device='cuda:2'), out_proj_covar=tensor([4.6834e-05, 3.8958e-05, 4.0804e-05, 4.5874e-05, 4.7227e-05, 4.1997e-05, + 4.0220e-05, 4.5208e-05], device='cuda:2') +2022-11-15 16:39:33,471 INFO [train.py:876] (2/4) Epoch 3, batch 2800, loss[loss=0.2958, simple_loss=0.2612, pruned_loss=0.1652, over 5526.00 frames. ], tot_loss[loss=0.2228, simple_loss=0.2087, pruned_loss=0.1185, over 1086663.93 frames. ], batch size: 49, lr: 2.45e-02, grad_scale: 8.0 +2022-11-15 16:39:45,074 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=17362.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:39:49,875 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.3454, 3.8722, 4.1552, 3.9037, 4.4498, 3.9813, 3.9305, 4.3293], + device='cuda:2'), covar=tensor([0.0323, 0.0282, 0.0484, 0.0321, 0.0337, 0.0255, 0.0239, 0.0293], + device='cuda:2'), in_proj_covar=tensor([0.0082, 0.0087, 0.0072, 0.0089, 0.0088, 0.0056, 0.0071, 0.0077], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 16:40:00,291 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=17383.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 16:40:04,403 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0178, 3.4044, 2.9429, 2.8096, 1.8603, 3.2193, 2.0944, 2.6014], + device='cuda:2'), covar=tensor([0.0138, 0.0041, 0.0046, 0.0078, 0.0157, 0.0030, 0.0099, 0.0032], + device='cuda:2'), in_proj_covar=tensor([0.0128, 0.0070, 0.0090, 0.0089, 0.0130, 0.0086, 0.0107, 0.0075], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0001, 0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0001], + device='cuda:2') +2022-11-15 16:40:18,908 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.389e+02 2.246e+02 2.631e+02 3.087e+02 4.773e+02, threshold=5.262e+02, percent-clipped=0.0 +2022-11-15 16:40:28,885 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=17423.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:40:29,546 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.5650, 4.0946, 3.4935, 4.0344, 3.9624, 3.4567, 3.4923, 3.3314], + device='cuda:2'), covar=tensor([0.0443, 0.0244, 0.0642, 0.0244, 0.0267, 0.0347, 0.0339, 0.0412], + device='cuda:2'), in_proj_covar=tensor([0.0080, 0.0091, 0.0132, 0.0087, 0.0119, 0.0103, 0.0098, 0.0087], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 16:40:44,638 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=17444.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 16:40:45,100 INFO [train.py:876] (2/4) Epoch 3, batch 2900, loss[loss=0.2496, simple_loss=0.2162, pruned_loss=0.1415, over 5705.00 frames. ], tot_loss[loss=0.2207, simple_loss=0.2068, pruned_loss=0.1173, over 1079309.39 frames. ], batch size: 36, lr: 2.45e-02, grad_scale: 8.0 +2022-11-15 16:41:00,270 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.4419, 1.0780, 0.9751, 1.4610, 1.7250, 1.0814, 1.2114, 1.6473], + device='cuda:2'), covar=tensor([0.0308, 0.0440, 0.1401, 0.0603, 0.3779, 0.2341, 0.0686, 0.0326], + device='cuda:2'), in_proj_covar=tensor([0.0009, 0.0011, 0.0009, 0.0009, 0.0009, 0.0010, 0.0010, 0.0009], + device='cuda:2'), out_proj_covar=tensor([2.5525e-05, 2.8909e-05, 2.6743e-05, 2.8529e-05, 2.5082e-05, 2.6889e-05, + 2.7890e-05, 2.4809e-05], device='cuda:2') +2022-11-15 16:41:09,620 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.95 vs. limit=2.0 +2022-11-15 16:41:27,950 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.90 vs. limit=2.0 +2022-11-15 16:41:30,297 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.327e+02 2.189e+02 2.747e+02 3.512e+02 7.345e+02, threshold=5.495e+02, percent-clipped=3.0 +2022-11-15 16:41:40,751 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.3386, 1.9438, 1.6880, 1.8878, 1.1267, 1.6732, 1.3176, 1.8893], + device='cuda:2'), covar=tensor([0.0723, 0.0133, 0.0605, 0.0168, 0.0599, 0.0458, 0.0907, 0.0165], + device='cuda:2'), in_proj_covar=tensor([0.0175, 0.0108, 0.0159, 0.0104, 0.0129, 0.0169, 0.0183, 0.0108], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0001, 0.0002, 0.0001, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 16:41:40,756 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=17524.0, num_to_drop=1, layers_to_drop={2} +2022-11-15 16:41:54,816 INFO [train.py:876] (2/4) Epoch 3, batch 3000, loss[loss=0.1808, simple_loss=0.1943, pruned_loss=0.08368, over 5589.00 frames. ], tot_loss[loss=0.2203, simple_loss=0.2065, pruned_loss=0.117, over 1079508.31 frames. ], batch size: 15, lr: 2.44e-02, grad_scale: 8.0 +2022-11-15 16:41:54,817 INFO [train.py:899] (2/4) Computing validation loss +2022-11-15 16:42:10,921 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.7322, 1.7798, 1.6066, 1.5947, 1.9618, 1.4948, 1.6375, 1.7959], + device='cuda:2'), covar=tensor([0.0030, 0.0103, 0.0106, 0.0061, 0.0022, 0.0049, 0.0035, 0.0034], + device='cuda:2'), in_proj_covar=tensor([0.0015, 0.0012, 0.0013, 0.0016, 0.0014, 0.0015, 0.0015, 0.0014], + device='cuda:2'), out_proj_covar=tensor([2.0555e-05, 1.8156e-05, 1.8636e-05, 2.0339e-05, 1.7269e-05, 1.8757e-05, + 2.0154e-05, 1.8794e-05], device='cuda:2') +2022-11-15 16:42:13,676 INFO [train.py:908] (2/4) Epoch 3, validation: loss=0.1847, simple_loss=0.2015, pruned_loss=0.08391, over 1530663.00 frames. +2022-11-15 16:42:13,676 INFO [train.py:909] (2/4) Maximum memory allocated so far is 4573MB +2022-11-15 16:42:13,753 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=17545.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:42:17,922 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.4588, 3.9441, 3.4635, 3.9983, 3.9809, 3.3671, 3.3031, 2.8365], + device='cuda:2'), covar=tensor([0.0410, 0.0515, 0.0920, 0.0236, 0.0317, 0.0442, 0.0429, 0.0936], + device='cuda:2'), in_proj_covar=tensor([0.0081, 0.0094, 0.0136, 0.0089, 0.0117, 0.0104, 0.0098, 0.0087], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 16:42:25,428 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=17562.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 16:42:32,288 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=17572.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 16:42:51,046 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.50 vs. limit=5.0 +2022-11-15 16:42:55,453 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.03 vs. limit=2.0 +2022-11-15 16:42:59,053 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.183e+02 2.309e+02 3.014e+02 4.120e+02 1.212e+03, threshold=6.029e+02, percent-clipped=7.0 +2022-11-15 16:43:00,217 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=17610.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:43:24,411 INFO [train.py:876] (2/4) Epoch 3, batch 3100, loss[loss=0.2862, simple_loss=0.2434, pruned_loss=0.1645, over 5567.00 frames. ], tot_loss[loss=0.2237, simple_loss=0.2089, pruned_loss=0.1193, over 1080421.34 frames. ], batch size: 54, lr: 2.43e-02, grad_scale: 8.0 +2022-11-15 16:44:04,515 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=17700.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:44:10,832 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.139e+02 2.237e+02 2.895e+02 3.474e+02 7.762e+02, threshold=5.791e+02, percent-clipped=3.0 +2022-11-15 16:44:16,898 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=17718.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:44:17,569 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.7242, 4.2875, 4.5201, 4.4294, 4.8263, 4.4307, 4.2932, 4.7809], + device='cuda:2'), covar=tensor([0.0446, 0.0234, 0.0420, 0.0187, 0.0362, 0.0216, 0.0160, 0.0195], + device='cuda:2'), in_proj_covar=tensor([0.0075, 0.0082, 0.0066, 0.0083, 0.0081, 0.0054, 0.0067, 0.0072], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0001, 0.0002, 0.0002, 0.0001, 0.0001, 0.0002], + device='cuda:2') +2022-11-15 16:44:25,301 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=17730.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:44:31,326 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=17739.0, num_to_drop=1, layers_to_drop={3} +2022-11-15 16:44:35,377 INFO [train.py:876] (2/4) Epoch 3, batch 3200, loss[loss=0.1967, simple_loss=0.207, pruned_loss=0.09322, over 5502.00 frames. ], tot_loss[loss=0.2232, simple_loss=0.2085, pruned_loss=0.1189, over 1078695.06 frames. ], batch size: 17, lr: 2.43e-02, grad_scale: 8.0 +2022-11-15 16:44:47,651 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=17761.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:45:08,551 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=17791.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:45:12,624 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.5557, 4.0504, 3.6090, 4.0081, 4.0107, 3.4163, 3.5318, 3.2994], + device='cuda:2'), covar=tensor([0.0418, 0.0269, 0.0689, 0.0219, 0.0288, 0.0364, 0.0303, 0.0429], + device='cuda:2'), in_proj_covar=tensor([0.0082, 0.0097, 0.0138, 0.0091, 0.0120, 0.0107, 0.0099, 0.0091], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 16:45:21,429 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.240e+02 2.197e+02 2.889e+02 3.896e+02 6.814e+02, threshold=5.779e+02, percent-clipped=2.0 +2022-11-15 16:45:26,866 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.28 vs. limit=2.0 +2022-11-15 16:45:47,452 INFO [train.py:876] (2/4) Epoch 3, batch 3300, loss[loss=0.1973, simple_loss=0.1853, pruned_loss=0.1046, over 5507.00 frames. ], tot_loss[loss=0.2193, simple_loss=0.2061, pruned_loss=0.1162, over 1083998.75 frames. ], batch size: 17, lr: 2.42e-02, grad_scale: 8.0 +2022-11-15 16:45:47,585 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=17845.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:46:21,795 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=17893.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:46:32,461 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.181e+02 2.187e+02 2.686e+02 3.512e+02 8.499e+02, threshold=5.372e+02, percent-clipped=2.0 +2022-11-15 16:46:34,155 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.93 vs. limit=2.0 +2022-11-15 16:46:41,649 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.8629, 1.6503, 3.4789, 2.4792, 3.3776, 2.6069, 2.9715, 3.6434], + device='cuda:2'), covar=tensor([0.0055, 0.0589, 0.0098, 0.0518, 0.0135, 0.0305, 0.0280, 0.0138], + device='cuda:2'), in_proj_covar=tensor([0.0106, 0.0166, 0.0122, 0.0181, 0.0113, 0.0160, 0.0176, 0.0134], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 16:46:58,448 INFO [train.py:876] (2/4) Epoch 3, batch 3400, loss[loss=0.2541, simple_loss=0.2373, pruned_loss=0.1354, over 5633.00 frames. ], tot_loss[loss=0.2209, simple_loss=0.2074, pruned_loss=0.1172, over 1093598.27 frames. ], batch size: 29, lr: 2.41e-02, grad_scale: 16.0 +2022-11-15 16:47:08,840 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=17960.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:47:19,824 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.9993, 1.0592, 1.1806, 2.2456, 1.5376, 0.9358, 1.4143, 1.1265], + device='cuda:2'), covar=tensor([0.0029, 0.0021, 0.0021, 0.0011, 0.0022, 0.0117, 0.0019, 0.0017], + device='cuda:2'), in_proj_covar=tensor([0.0020, 0.0020, 0.0019, 0.0018, 0.0020, 0.0016, 0.0019, 0.0019], + device='cuda:2'), out_proj_covar=tensor([2.7079e-05, 2.8439e-05, 2.3220e-05, 1.9357e-05, 2.2233e-05, 1.9949e-05, + 3.0502e-05, 2.4301e-05], device='cuda:2') +2022-11-15 16:47:43,699 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.311e+02 2.080e+02 2.540e+02 3.324e+02 6.629e+02, threshold=5.080e+02, percent-clipped=5.0 +2022-11-15 16:47:50,076 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=18018.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:47:52,114 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=18021.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:48:04,390 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=18039.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 16:48:09,015 INFO [train.py:876] (2/4) Epoch 3, batch 3500, loss[loss=0.1732, simple_loss=0.1875, pruned_loss=0.07947, over 5799.00 frames. ], tot_loss[loss=0.2158, simple_loss=0.2044, pruned_loss=0.1136, over 1094734.89 frames. ], batch size: 21, lr: 2.41e-02, grad_scale: 16.0 +2022-11-15 16:48:13,630 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.3786, 3.8467, 3.3264, 3.8953, 3.9189, 3.3570, 3.5020, 3.2120], + device='cuda:2'), covar=tensor([0.0605, 0.0425, 0.1095, 0.0296, 0.0307, 0.0356, 0.0308, 0.0443], + device='cuda:2'), in_proj_covar=tensor([0.0086, 0.0100, 0.0145, 0.0093, 0.0124, 0.0110, 0.0101, 0.0093], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 16:48:17,489 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=18056.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:48:24,391 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=18066.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:48:28,802 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.40 vs. limit=5.0 +2022-11-15 16:48:38,156 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=18086.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:48:38,775 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=18087.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 16:48:55,270 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.292e+02 2.252e+02 2.672e+02 3.554e+02 6.890e+02, threshold=5.344e+02, percent-clipped=4.0 +2022-11-15 16:49:20,259 INFO [train.py:876] (2/4) Epoch 3, batch 3600, loss[loss=0.3055, simple_loss=0.264, pruned_loss=0.1735, over 5461.00 frames. ], tot_loss[loss=0.2196, simple_loss=0.2064, pruned_loss=0.1164, over 1091068.58 frames. ], batch size: 53, lr: 2.40e-02, grad_scale: 16.0 +2022-11-15 16:49:41,598 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=18174.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:49:42,184 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.9202, 2.7841, 2.7685, 3.0021, 2.6827, 2.2962, 3.2556, 2.8733], + device='cuda:2'), covar=tensor([0.0469, 0.0880, 0.0603, 0.0712, 0.0641, 0.0500, 0.0932, 0.0495], + device='cuda:2'), in_proj_covar=tensor([0.0053, 0.0078, 0.0063, 0.0074, 0.0056, 0.0046, 0.0087, 0.0056], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0002, 0.0001, 0.0002, 0.0001, 0.0001, 0.0002, 0.0001], + device='cuda:2') +2022-11-15 16:49:42,287 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=18175.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:50:06,564 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.401e+02 2.223e+02 2.647e+02 3.378e+02 5.745e+02, threshold=5.295e+02, percent-clipped=2.0 +2022-11-15 16:50:15,375 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.0608, 1.5057, 1.1242, 1.4869, 1.1241, 1.3671, 0.6902, 1.2102], + device='cuda:2'), covar=tensor([0.0045, 0.0016, 0.0020, 0.0016, 0.0030, 0.0029, 0.0039, 0.0028], + device='cuda:2'), in_proj_covar=tensor([0.0019, 0.0019, 0.0019, 0.0018, 0.0020, 0.0016, 0.0019, 0.0018], + device='cuda:2'), out_proj_covar=tensor([2.5795e-05, 2.6218e-05, 2.3440e-05, 1.8784e-05, 2.2280e-05, 1.9301e-05, + 2.9786e-05, 2.3251e-05], device='cuda:2') +2022-11-15 16:50:25,754 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=18235.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:50:26,443 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=18236.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:50:32,383 INFO [train.py:876] (2/4) Epoch 3, batch 3700, loss[loss=0.2599, simple_loss=0.2448, pruned_loss=0.1375, over 5595.00 frames. ], tot_loss[loss=0.2212, simple_loss=0.2075, pruned_loss=0.1175, over 1087744.22 frames. ], batch size: 24, lr: 2.40e-02, grad_scale: 16.0 +2022-11-15 16:51:17,618 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.280e+02 2.074e+02 2.681e+02 3.314e+02 6.700e+02, threshold=5.362e+02, percent-clipped=3.0 +2022-11-15 16:51:22,576 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=18316.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:51:26,874 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.4471, 2.2098, 1.2770, 2.2283, 1.2468, 2.0137, 2.1197, 1.5475], + device='cuda:2'), covar=tensor([0.0405, 0.0601, 0.1607, 0.0927, 0.1006, 0.0451, 0.0497, 0.2954], + device='cuda:2'), in_proj_covar=tensor([0.0040, 0.0041, 0.0049, 0.0032, 0.0053, 0.0041, 0.0048, 0.0032], + device='cuda:2'), out_proj_covar=tensor([8.2023e-05, 9.1298e-05, 1.1413e-04, 7.5273e-05, 1.1685e-04, 9.7246e-05, + 1.0382e-04, 7.4078e-05], device='cuda:2') +2022-11-15 16:51:43,711 INFO [train.py:876] (2/4) Epoch 3, batch 3800, loss[loss=0.2566, simple_loss=0.2358, pruned_loss=0.1387, over 5594.00 frames. ], tot_loss[loss=0.2166, simple_loss=0.2049, pruned_loss=0.1142, over 1092202.69 frames. ], batch size: 24, lr: 2.39e-02, grad_scale: 16.0 +2022-11-15 16:51:51,398 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=18356.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:52:01,287 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.03 vs. limit=2.0 +2022-11-15 16:52:12,767 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=18386.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:52:25,089 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=18404.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:52:28,472 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.144e+02 2.160e+02 2.623e+02 3.183e+02 6.237e+02, threshold=5.245e+02, percent-clipped=1.0 +2022-11-15 16:52:29,586 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.80 vs. limit=5.0 +2022-11-15 16:52:46,227 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=18434.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:52:52,430 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.02 vs. limit=2.0 +2022-11-15 16:52:54,048 INFO [train.py:876] (2/4) Epoch 3, batch 3900, loss[loss=0.1301, simple_loss=0.1387, pruned_loss=0.06073, over 4443.00 frames. ], tot_loss[loss=0.2178, simple_loss=0.2051, pruned_loss=0.1153, over 1087293.18 frames. ], batch size: 5, lr: 2.38e-02, grad_scale: 16.0 +2022-11-15 16:53:06,756 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.08 vs. limit=2.0 +2022-11-15 16:53:26,674 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=18490.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:53:35,154 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.07 vs. limit=2.0 +2022-11-15 16:53:39,712 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=18508.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 16:53:40,177 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.408e+02 2.213e+02 2.841e+02 3.897e+02 8.408e+02, threshold=5.683e+02, percent-clipped=7.0 +2022-11-15 16:53:54,780 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=18530.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:53:55,476 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=18531.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:54:05,587 INFO [train.py:876] (2/4) Epoch 3, batch 4000, loss[loss=0.2962, simple_loss=0.2325, pruned_loss=0.1799, over 3058.00 frames. ], tot_loss[loss=0.2181, simple_loss=0.2049, pruned_loss=0.1156, over 1081102.50 frames. ], batch size: 284, lr: 2.38e-02, grad_scale: 16.0 +2022-11-15 16:54:10,043 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=18551.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:54:22,439 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=18569.0, num_to_drop=1, layers_to_drop={3} +2022-11-15 16:54:29,260 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.0269, 1.7676, 0.8729, 2.0377, 0.9445, 1.3564, 1.1551, 1.6277], + device='cuda:2'), covar=tensor([0.0413, 0.0499, 0.1105, 0.0326, 0.0910, 0.0876, 0.0746, 0.0716], + device='cuda:2'), in_proj_covar=tensor([0.0041, 0.0043, 0.0050, 0.0033, 0.0051, 0.0041, 0.0050, 0.0031], + device='cuda:2'), out_proj_covar=tensor([8.6122e-05, 9.5154e-05, 1.1853e-04, 7.6202e-05, 1.1476e-04, 9.8584e-05, + 1.0663e-04, 7.2251e-05], device='cuda:2') +2022-11-15 16:54:51,287 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.420e+02 2.156e+02 2.771e+02 3.318e+02 9.148e+02, threshold=5.542e+02, percent-clipped=3.0 +2022-11-15 16:54:56,227 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=18616.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:55:00,724 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.54 vs. limit=5.0 +2022-11-15 16:55:16,120 INFO [train.py:876] (2/4) Epoch 3, batch 4100, loss[loss=0.2233, simple_loss=0.2182, pruned_loss=0.1142, over 5587.00 frames. ], tot_loss[loss=0.217, simple_loss=0.2037, pruned_loss=0.1151, over 1084691.25 frames. ], batch size: 22, lr: 2.37e-02, grad_scale: 16.0 +2022-11-15 16:55:19,054 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.61 vs. limit=5.0 +2022-11-15 16:55:29,861 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=18664.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:55:37,271 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=18674.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:55:41,590 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.93 vs. limit=2.0 +2022-11-15 16:56:01,443 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.072e+02 2.068e+02 2.779e+02 3.549e+02 8.529e+02, threshold=5.559e+02, percent-clipped=5.0 +2022-11-15 16:56:13,894 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.1531, 0.9611, 1.9558, 2.2098, 2.3303, 2.0401, 2.2971, 2.3308], + device='cuda:2'), covar=tensor([0.0033, 0.0130, 0.0037, 0.0022, 0.0016, 0.0022, 0.0026, 0.0023], + device='cuda:2'), in_proj_covar=tensor([0.0015, 0.0014, 0.0012, 0.0015, 0.0014, 0.0014, 0.0016, 0.0015], + device='cuda:2'), out_proj_covar=tensor([2.0302e-05, 1.8855e-05, 1.6680e-05, 1.9136e-05, 1.5620e-05, 1.7713e-05, + 1.9518e-05, 1.9033e-05], device='cuda:2') +2022-11-15 16:56:20,124 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=18735.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:56:26,897 INFO [train.py:876] (2/4) Epoch 3, batch 4200, loss[loss=0.1416, simple_loss=0.1481, pruned_loss=0.06761, over 5689.00 frames. ], tot_loss[loss=0.2186, simple_loss=0.2054, pruned_loss=0.116, over 1086422.77 frames. ], batch size: 11, lr: 2.37e-02, grad_scale: 16.0 +2022-11-15 16:56:38,061 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.7810, 1.7023, 1.7528, 0.9091, 0.5443, 2.1704, 2.0558, 1.1999], + device='cuda:2'), covar=tensor([0.0189, 0.0194, 0.0274, 0.0349, 0.1948, 0.0413, 0.0328, 0.0488], + device='cuda:2'), in_proj_covar=tensor([0.0028, 0.0028, 0.0030, 0.0030, 0.0029, 0.0024, 0.0025, 0.0028], + device='cuda:2'), out_proj_covar=tensor([4.5973e-05, 4.2441e-05, 4.4257e-05, 4.9102e-05, 5.1661e-05, 4.4490e-05, + 4.0343e-05, 4.6864e-05], device='cuda:2') +2022-11-15 16:56:53,385 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1114, 3.6941, 3.1178, 3.0607, 2.0070, 3.3593, 2.1866, 3.0674], + device='cuda:2'), covar=tensor([0.0237, 0.0049, 0.0077, 0.0154, 0.0239, 0.0048, 0.0153, 0.0037], + device='cuda:2'), in_proj_covar=tensor([0.0137, 0.0077, 0.0098, 0.0096, 0.0133, 0.0091, 0.0115, 0.0079], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0001, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001], + device='cuda:2') +2022-11-15 16:57:02,602 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.12 vs. limit=2.0 +2022-11-15 16:57:06,646 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0968, 3.8749, 3.3802, 3.3296, 2.4908, 3.7549, 2.3366, 3.2487], + device='cuda:2'), covar=tensor([0.0258, 0.0072, 0.0096, 0.0123, 0.0248, 0.0050, 0.0171, 0.0029], + device='cuda:2'), in_proj_covar=tensor([0.0136, 0.0076, 0.0097, 0.0095, 0.0132, 0.0091, 0.0114, 0.0079], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0001, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001], + device='cuda:2') +2022-11-15 16:57:11,802 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=6.45 vs. limit=5.0 +2022-11-15 16:57:12,954 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.068e+02 2.085e+02 2.508e+02 3.197e+02 8.828e+02, threshold=5.016e+02, percent-clipped=2.0 +2022-11-15 16:57:28,690 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=18830.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:57:29,363 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=18831.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:57:38,764 INFO [train.py:876] (2/4) Epoch 3, batch 4300, loss[loss=0.2346, simple_loss=0.2224, pruned_loss=0.1234, over 5709.00 frames. ], tot_loss[loss=0.2186, simple_loss=0.2055, pruned_loss=0.1159, over 1080015.26 frames. ], batch size: 17, lr: 2.36e-02, grad_scale: 16.0 +2022-11-15 16:57:39,541 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=18846.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:57:52,361 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=18864.0, num_to_drop=1, layers_to_drop={2} +2022-11-15 16:58:02,524 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=18878.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:58:03,539 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=18879.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:58:17,953 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.63 vs. limit=5.0 +2022-11-15 16:58:24,292 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.372e+02 2.437e+02 2.955e+02 3.758e+02 8.175e+02, threshold=5.909e+02, percent-clipped=8.0 +2022-11-15 16:58:34,910 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.4940, 4.7389, 4.7919, 4.9211, 4.2002, 3.8225, 5.4506, 4.4603], + device='cuda:2'), covar=tensor([0.0270, 0.0643, 0.0239, 0.0437, 0.0384, 0.0270, 0.0444, 0.0234], + device='cuda:2'), in_proj_covar=tensor([0.0052, 0.0076, 0.0060, 0.0074, 0.0054, 0.0044, 0.0086, 0.0055], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0002, 0.0001, 0.0002, 0.0001, 0.0001, 0.0002, 0.0001], + device='cuda:2') +2022-11-15 16:58:40,191 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=18930.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:58:50,711 INFO [train.py:876] (2/4) Epoch 3, batch 4400, loss[loss=0.1661, simple_loss=0.1754, pruned_loss=0.07844, over 5750.00 frames. ], tot_loss[loss=0.2195, simple_loss=0.2061, pruned_loss=0.1164, over 1083907.07 frames. ], batch size: 14, lr: 2.35e-02, grad_scale: 16.0 +2022-11-15 16:59:02,270 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.00 vs. limit=5.0 +2022-11-15 16:59:05,209 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.98 vs. limit=2.0 +2022-11-15 16:59:12,439 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.00 vs. limit=2.0 +2022-11-15 16:59:16,670 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.3457, 1.6337, 1.5005, 1.6390, 0.5316, 1.8262, 1.6495, 1.5610], + device='cuda:2'), covar=tensor([0.0227, 0.0314, 0.0331, 0.0395, 0.0883, 0.1026, 0.0239, 0.0322], + device='cuda:2'), in_proj_covar=tensor([0.0027, 0.0028, 0.0028, 0.0030, 0.0026, 0.0023, 0.0023, 0.0028], + device='cuda:2'), out_proj_covar=tensor([4.4187e-05, 4.1356e-05, 4.2746e-05, 4.8287e-05, 4.7614e-05, 4.2901e-05, + 3.7922e-05, 4.5687e-05], device='cuda:2') +2022-11-15 16:59:18,071 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.4270, 1.7384, 2.8965, 2.4709, 3.1964, 1.9391, 2.7187, 3.2433], + device='cuda:2'), covar=tensor([0.0044, 0.0430, 0.0126, 0.0337, 0.0116, 0.0335, 0.0249, 0.0134], + device='cuda:2'), in_proj_covar=tensor([0.0109, 0.0174, 0.0130, 0.0184, 0.0123, 0.0160, 0.0185, 0.0141], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 16:59:23,545 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=18991.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:59:36,220 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.259e+02 2.090e+02 2.682e+02 3.252e+02 5.703e+02, threshold=5.365e+02, percent-clipped=0.0 +2022-11-15 16:59:51,556 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=19030.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 16:59:57,783 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.99 vs. limit=2.0 +2022-11-15 17:00:02,100 INFO [train.py:876] (2/4) Epoch 3, batch 4500, loss[loss=0.228, simple_loss=0.2142, pruned_loss=0.1209, over 5592.00 frames. ], tot_loss[loss=0.2188, simple_loss=0.2058, pruned_loss=0.1159, over 1087803.23 frames. ], batch size: 22, lr: 2.35e-02, grad_scale: 16.0 +2022-11-15 17:00:16,750 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.52 vs. limit=5.0 +2022-11-15 17:00:24,091 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.76 vs. limit=2.0 +2022-11-15 17:00:48,080 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.336e+02 1.944e+02 2.562e+02 3.151e+02 4.952e+02, threshold=5.125e+02, percent-clipped=0.0 +2022-11-15 17:01:02,220 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=19129.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 17:01:13,364 INFO [train.py:876] (2/4) Epoch 3, batch 4600, loss[loss=0.2406, simple_loss=0.2208, pruned_loss=0.1302, over 5577.00 frames. ], tot_loss[loss=0.2193, simple_loss=0.2063, pruned_loss=0.1162, over 1091586.19 frames. ], batch size: 46, lr: 2.34e-02, grad_scale: 16.0 +2022-11-15 17:01:14,569 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=19146.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:01:27,299 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=19164.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 17:01:46,031 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=19190.0, num_to_drop=1, layers_to_drop={3} +2022-11-15 17:01:48,573 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=19194.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:01:59,681 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.136e+02 2.110e+02 2.501e+02 3.073e+02 9.500e+02, threshold=5.001e+02, percent-clipped=3.0 +2022-11-15 17:02:01,809 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=19212.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 17:02:02,574 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0511, 3.9613, 2.8902, 3.8843, 2.8442, 2.6661, 2.0016, 3.4136], + device='cuda:2'), covar=tensor([0.1531, 0.0122, 0.0737, 0.0132, 0.0495, 0.0759, 0.1762, 0.0170], + device='cuda:2'), in_proj_covar=tensor([0.0171, 0.0109, 0.0152, 0.0102, 0.0133, 0.0166, 0.0179, 0.0108], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 17:02:04,690 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1107, 3.0933, 2.7799, 2.7444, 1.7865, 3.1291, 2.0613, 2.7467], + device='cuda:2'), covar=tensor([0.0132, 0.0036, 0.0047, 0.0091, 0.0141, 0.0031, 0.0102, 0.0027], + device='cuda:2'), in_proj_covar=tensor([0.0130, 0.0075, 0.0097, 0.0094, 0.0130, 0.0087, 0.0112, 0.0076], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0001, 0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0001], + device='cuda:2') +2022-11-15 17:02:24,580 INFO [train.py:876] (2/4) Epoch 3, batch 4700, loss[loss=0.208, simple_loss=0.2054, pruned_loss=0.1053, over 5567.00 frames. ], tot_loss[loss=0.2196, simple_loss=0.2063, pruned_loss=0.1164, over 1091365.62 frames. ], batch size: 22, lr: 2.34e-02, grad_scale: 16.0 +2022-11-15 17:02:53,842 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=19286.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:02:55,978 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.7980, 1.2461, 1.1768, 1.1885, 1.2510, 1.5572, 0.6558, 1.3275], + device='cuda:2'), covar=tensor([0.0026, 0.0013, 0.0015, 0.0014, 0.0017, 0.0017, 0.0032, 0.0019], + device='cuda:2'), in_proj_covar=tensor([0.0020, 0.0019, 0.0019, 0.0019, 0.0020, 0.0016, 0.0020, 0.0018], + device='cuda:2'), out_proj_covar=tensor([2.5213e-05, 2.5224e-05, 2.3103e-05, 1.9654e-05, 2.3030e-05, 1.8585e-05, + 3.0017e-05, 2.2325e-05], device='cuda:2') +2022-11-15 17:03:10,435 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.250e+02 2.139e+02 2.654e+02 3.598e+02 8.917e+02, threshold=5.308e+02, percent-clipped=4.0 +2022-11-15 17:03:17,738 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.3410, 3.9187, 4.0428, 3.8768, 4.3519, 3.9855, 3.8830, 4.3210], + device='cuda:2'), covar=tensor([0.0341, 0.0307, 0.0444, 0.0326, 0.0295, 0.0295, 0.0293, 0.0254], + device='cuda:2'), in_proj_covar=tensor([0.0083, 0.0087, 0.0071, 0.0093, 0.0090, 0.0058, 0.0078, 0.0078], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 17:03:25,174 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=19330.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:03:35,750 INFO [train.py:876] (2/4) Epoch 3, batch 4800, loss[loss=0.2542, simple_loss=0.2285, pruned_loss=0.14, over 5684.00 frames. ], tot_loss[loss=0.2187, simple_loss=0.2056, pruned_loss=0.1159, over 1089348.63 frames. ], batch size: 34, lr: 2.33e-02, grad_scale: 8.0 +2022-11-15 17:03:42,649 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.8612, 2.9315, 2.8388, 2.6721, 2.9749, 2.8820, 0.9440, 2.9180], + device='cuda:2'), covar=tensor([0.0273, 0.0215, 0.0252, 0.0257, 0.0288, 0.0219, 0.2661, 0.0266], + device='cuda:2'), in_proj_covar=tensor([0.0084, 0.0062, 0.0067, 0.0057, 0.0080, 0.0062, 0.0121, 0.0087], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0001, 0.0001, 0.0001, 0.0002, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 17:03:56,629 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.6391, 3.5924, 3.4991, 3.8238, 3.3696, 3.1633, 4.2062, 3.5989], + device='cuda:2'), covar=tensor([0.0421, 0.0881, 0.0492, 0.0600, 0.0561, 0.0303, 0.0687, 0.0467], + device='cuda:2'), in_proj_covar=tensor([0.0054, 0.0077, 0.0063, 0.0075, 0.0056, 0.0046, 0.0090, 0.0059], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0002, 0.0001, 0.0002, 0.0001, 0.0001, 0.0002, 0.0001], + device='cuda:2') +2022-11-15 17:03:59,327 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=19378.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:04:10,749 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=19394.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:04:12,582 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.11 vs. limit=2.0 +2022-11-15 17:04:21,909 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.292e+02 2.238e+02 2.787e+02 3.697e+02 9.222e+02, threshold=5.575e+02, percent-clipped=6.0 +2022-11-15 17:04:47,612 INFO [train.py:876] (2/4) Epoch 3, batch 4900, loss[loss=0.2487, simple_loss=0.2065, pruned_loss=0.1455, over 4717.00 frames. ], tot_loss[loss=0.2181, simple_loss=0.2053, pruned_loss=0.1155, over 1090902.27 frames. ], batch size: 135, lr: 2.32e-02, grad_scale: 8.0 +2022-11-15 17:04:54,583 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=19455.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:05:06,921 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.02 vs. limit=5.0 +2022-11-15 17:05:16,057 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=19485.0, num_to_drop=1, layers_to_drop={3} +2022-11-15 17:05:33,092 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.277e+02 2.112e+02 2.712e+02 3.360e+02 6.764e+02, threshold=5.425e+02, percent-clipped=2.0 +2022-11-15 17:05:40,245 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.15 vs. limit=2.0 +2022-11-15 17:05:58,042 INFO [train.py:876] (2/4) Epoch 3, batch 5000, loss[loss=0.2318, simple_loss=0.2111, pruned_loss=0.1263, over 5625.00 frames. ], tot_loss[loss=0.2168, simple_loss=0.2043, pruned_loss=0.1147, over 1089599.08 frames. ], batch size: 50, lr: 2.32e-02, grad_scale: 8.0 +2022-11-15 17:06:14,201 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.6431, 1.6415, 1.7747, 1.5454, 1.4295, 1.7773, 1.7863, 1.6214], + device='cuda:2'), covar=tensor([0.0025, 0.0044, 0.0039, 0.0025, 0.0022, 0.0034, 0.0023, 0.0030], + device='cuda:2'), in_proj_covar=tensor([0.0015, 0.0013, 0.0013, 0.0016, 0.0015, 0.0015, 0.0016, 0.0016], + device='cuda:2'), out_proj_covar=tensor([1.9813e-05, 1.8249e-05, 1.7340e-05, 1.9591e-05, 1.6076e-05, 1.8388e-05, + 1.9704e-05, 2.0520e-05], device='cuda:2') +2022-11-15 17:06:26,430 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=19586.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:06:44,099 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.388e+02 2.122e+02 2.600e+02 3.385e+02 5.559e+02, threshold=5.201e+02, percent-clipped=2.0 +2022-11-15 17:06:50,201 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=19618.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:07:01,376 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=19634.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:07:09,727 INFO [train.py:876] (2/4) Epoch 3, batch 5100, loss[loss=0.3129, simple_loss=0.2648, pruned_loss=0.1805, over 5454.00 frames. ], tot_loss[loss=0.2188, simple_loss=0.2051, pruned_loss=0.1163, over 1090540.68 frames. ], batch size: 64, lr: 2.31e-02, grad_scale: 8.0 +2022-11-15 17:07:11,301 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.4134, 1.9278, 1.1784, 2.2842, 1.2519, 1.5141, 1.2418, 2.0543], + device='cuda:2'), covar=tensor([0.0354, 0.0527, 0.1174, 0.0297, 0.0764, 0.0650, 0.0781, 0.0434], + device='cuda:2'), in_proj_covar=tensor([0.0042, 0.0047, 0.0055, 0.0035, 0.0055, 0.0045, 0.0055, 0.0036], + device='cuda:2'), out_proj_covar=tensor([9.0691e-05, 1.0629e-04, 1.3205e-04, 8.2820e-05, 1.2544e-04, 1.0905e-04, + 1.2120e-04, 8.3160e-05], device='cuda:2') +2022-11-15 17:07:34,068 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=19679.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:07:47,473 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.9147, 4.1324, 4.1708, 4.3633, 3.4705, 3.2714, 4.9339, 4.1493], + device='cuda:2'), covar=tensor([0.0531, 0.0798, 0.0467, 0.0616, 0.0799, 0.0348, 0.0624, 0.0319], + device='cuda:2'), in_proj_covar=tensor([0.0056, 0.0078, 0.0066, 0.0076, 0.0058, 0.0048, 0.0091, 0.0059], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0002, 0.0002, 0.0002, 0.0001, 0.0001, 0.0002, 0.0001], + device='cuda:2') +2022-11-15 17:07:56,097 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.321e+02 2.277e+02 2.877e+02 3.641e+02 9.302e+02, threshold=5.755e+02, percent-clipped=5.0 +2022-11-15 17:08:20,624 INFO [train.py:876] (2/4) Epoch 3, batch 5200, loss[loss=0.2441, simple_loss=0.2102, pruned_loss=0.139, over 5393.00 frames. ], tot_loss[loss=0.2164, simple_loss=0.2039, pruned_loss=0.1144, over 1085857.10 frames. ], batch size: 70, lr: 2.31e-02, grad_scale: 8.0 +2022-11-15 17:08:24,893 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=19750.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:08:28,064 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.98 vs. limit=2.0 +2022-11-15 17:08:49,759 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=19785.0, num_to_drop=1, layers_to_drop={2} +2022-11-15 17:09:07,141 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.329e+02 2.129e+02 2.534e+02 3.438e+02 1.087e+03, threshold=5.069e+02, percent-clipped=3.0 +2022-11-15 17:09:23,652 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=19833.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 17:09:32,159 INFO [train.py:876] (2/4) Epoch 3, batch 5300, loss[loss=0.1798, simple_loss=0.1843, pruned_loss=0.08769, over 5714.00 frames. ], tot_loss[loss=0.2193, simple_loss=0.206, pruned_loss=0.1164, over 1088219.99 frames. ], batch size: 12, lr: 2.30e-02, grad_scale: 8.0 +2022-11-15 17:10:01,494 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.98 vs. limit=2.0 +2022-11-15 17:10:10,637 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.86 vs. limit=2.0 +2022-11-15 17:10:18,063 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.350e+02 2.053e+02 2.554e+02 3.563e+02 7.883e+02, threshold=5.108e+02, percent-clipped=6.0 +2022-11-15 17:10:38,214 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.84 vs. limit=5.0 +2022-11-15 17:10:43,253 INFO [train.py:876] (2/4) Epoch 3, batch 5400, loss[loss=0.1438, simple_loss=0.1498, pruned_loss=0.06892, over 5348.00 frames. ], tot_loss[loss=0.2186, simple_loss=0.2057, pruned_loss=0.1157, over 1090797.84 frames. ], batch size: 9, lr: 2.30e-02, grad_scale: 8.0 +2022-11-15 17:11:04,383 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=19974.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:11:05,414 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.12 vs. limit=2.0 +2022-11-15 17:11:21,655 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.33 vs. limit=2.0 +2022-11-15 17:11:31,833 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.1622, 0.9967, 1.4245, 1.5801, 1.1423, 0.8978, 0.9867, 1.6242], + device='cuda:2'), covar=tensor([0.0491, 0.0766, 0.0456, 0.0247, 0.0567, 0.1011, 0.0734, 0.0193], + device='cuda:2'), in_proj_covar=tensor([0.0041, 0.0047, 0.0053, 0.0037, 0.0056, 0.0047, 0.0056, 0.0037], + device='cuda:2'), out_proj_covar=tensor([8.7366e-05, 1.0589e-04, 1.3248e-04, 8.5492e-05, 1.2900e-04, 1.1332e-04, + 1.2348e-04, 8.6974e-05], device='cuda:2') +2022-11-15 17:11:33,653 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.198e+02 2.054e+02 2.433e+02 3.312e+02 7.375e+02, threshold=4.866e+02, percent-clipped=2.0 +2022-11-15 17:11:36,717 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.3081, 2.5052, 2.7973, 3.8884, 4.2443, 3.2232, 2.4737, 4.4030], + device='cuda:2'), covar=tensor([0.0080, 0.2667, 0.1903, 0.1720, 0.0263, 0.2171, 0.1947, 0.0064], + device='cuda:2'), in_proj_covar=tensor([0.0135, 0.0226, 0.0231, 0.0214, 0.0180, 0.0239, 0.0223, 0.0134], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0004, 0.0004, 0.0003, 0.0003, 0.0004, 0.0004, 0.0002], + device='cuda:2') +2022-11-15 17:11:40,711 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=20020.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:11:58,780 INFO [train.py:876] (2/4) Epoch 3, batch 5500, loss[loss=0.2351, simple_loss=0.2195, pruned_loss=0.1253, over 5629.00 frames. ], tot_loss[loss=0.2204, simple_loss=0.2068, pruned_loss=0.1171, over 1079052.30 frames. ], batch size: 38, lr: 2.29e-02, grad_scale: 8.0 +2022-11-15 17:12:02,248 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=20050.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:12:20,555 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.39 vs. limit=5.0 +2022-11-15 17:12:24,739 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=20081.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:12:30,163 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=20089.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:12:36,664 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=20098.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:12:39,289 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.74 vs. limit=2.0 +2022-11-15 17:12:45,203 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.380e+02 2.322e+02 3.147e+02 3.867e+02 1.026e+03, threshold=6.293e+02, percent-clipped=11.0 +2022-11-15 17:12:46,846 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=20112.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:12:48,230 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.9433, 0.5841, 1.1546, 0.9910, 1.3227, 0.9996, 1.3204, 1.4164], + device='cuda:2'), covar=tensor([0.0659, 0.0342, 0.0293, 0.0689, 0.0858, 0.1190, 0.0509, 0.0670], + device='cuda:2'), in_proj_covar=tensor([0.0009, 0.0011, 0.0008, 0.0009, 0.0010, 0.0009, 0.0009, 0.0008], + device='cuda:2'), out_proj_covar=tensor([2.8649e-05, 3.0647e-05, 2.6268e-05, 2.8500e-05, 2.8934e-05, 2.7480e-05, + 2.8643e-05, 2.6521e-05], device='cuda:2') +2022-11-15 17:13:10,511 INFO [train.py:876] (2/4) Epoch 3, batch 5600, loss[loss=0.2151, simple_loss=0.2021, pruned_loss=0.1141, over 5565.00 frames. ], tot_loss[loss=0.2194, simple_loss=0.2067, pruned_loss=0.116, over 1082889.02 frames. ], batch size: 25, lr: 2.29e-02, grad_scale: 8.0 +2022-11-15 17:13:14,144 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=20150.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:13:19,546 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.45 vs. limit=5.0 +2022-11-15 17:13:30,567 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=20173.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:13:52,972 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=20204.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:13:56,932 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.161e+02 2.126e+02 2.547e+02 3.418e+02 6.941e+02, threshold=5.093e+02, percent-clipped=1.0 +2022-11-15 17:14:22,613 INFO [train.py:876] (2/4) Epoch 3, batch 5700, loss[loss=0.3686, simple_loss=0.2864, pruned_loss=0.2254, over 3061.00 frames. ], tot_loss[loss=0.2206, simple_loss=0.2073, pruned_loss=0.1169, over 1082524.73 frames. ], batch size: 284, lr: 2.28e-02, grad_scale: 8.0 +2022-11-15 17:14:32,156 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.5873, 4.0447, 3.0065, 3.8797, 3.8391, 3.3224, 3.7857, 3.3282], + device='cuda:2'), covar=tensor([0.0473, 0.0388, 0.1513, 0.0486, 0.0600, 0.0578, 0.0381, 0.0645], + device='cuda:2'), in_proj_covar=tensor([0.0088, 0.0102, 0.0153, 0.0096, 0.0125, 0.0111, 0.0106, 0.0095], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 17:14:36,312 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=20265.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:14:41,520 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.4478, 2.3233, 1.4565, 2.7684, 1.9601, 2.2727, 1.6507, 2.2464], + device='cuda:2'), covar=tensor([0.0397, 0.1684, 0.1602, 0.0465, 0.0971, 0.0632, 0.0878, 0.1516], + device='cuda:2'), in_proj_covar=tensor([0.0040, 0.0046, 0.0054, 0.0038, 0.0057, 0.0046, 0.0055, 0.0039], + device='cuda:2'), out_proj_covar=tensor([8.7059e-05, 1.0552e-04, 1.3409e-04, 8.7271e-05, 1.3150e-04, 1.1211e-04, + 1.2312e-04, 9.1540e-05], device='cuda:2') +2022-11-15 17:14:42,868 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=20274.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:15:08,577 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.155e+02 2.183e+02 2.755e+02 3.255e+02 9.254e+02, threshold=5.510e+02, percent-clipped=4.0 +2022-11-15 17:15:14,178 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.0748, 2.8864, 2.9310, 3.1642, 2.8634, 2.6303, 3.4935, 2.9743], + device='cuda:2'), covar=tensor([0.0516, 0.1146, 0.0564, 0.0750, 0.0658, 0.0497, 0.0856, 0.0573], + device='cuda:2'), in_proj_covar=tensor([0.0057, 0.0082, 0.0066, 0.0075, 0.0059, 0.0050, 0.0092, 0.0061], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0002, 0.0002, 0.0002, 0.0001, 0.0001, 0.0002, 0.0001], + device='cuda:2') +2022-11-15 17:15:16,851 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=20322.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:15:33,292 INFO [train.py:876] (2/4) Epoch 3, batch 5800, loss[loss=0.1937, simple_loss=0.1894, pruned_loss=0.09895, over 5579.00 frames. ], tot_loss[loss=0.2189, simple_loss=0.2062, pruned_loss=0.1158, over 1080958.92 frames. ], batch size: 43, lr: 2.28e-02, grad_scale: 8.0 +2022-11-15 17:15:55,501 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=20376.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:16:04,064 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=20388.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:16:05,977 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.7327, 2.4676, 2.6122, 2.3721, 2.7862, 2.3748, 2.5489, 2.6258], + device='cuda:2'), covar=tensor([0.0421, 0.0392, 0.0476, 0.0462, 0.0375, 0.0273, 0.0346, 0.0490], + device='cuda:2'), in_proj_covar=tensor([0.0087, 0.0090, 0.0073, 0.0096, 0.0093, 0.0058, 0.0082, 0.0085], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 17:16:19,793 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.418e+02 2.270e+02 2.884e+02 3.519e+02 5.566e+02, threshold=5.768e+02, percent-clipped=1.0 +2022-11-15 17:16:25,589 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=20418.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:16:26,244 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.8132, 2.8858, 2.4362, 2.9171, 2.9889, 2.5975, 2.5954, 2.4399], + device='cuda:2'), covar=tensor([0.0262, 0.0418, 0.1052, 0.0396, 0.0328, 0.0483, 0.0435, 0.0498], + device='cuda:2'), in_proj_covar=tensor([0.0088, 0.0106, 0.0154, 0.0098, 0.0125, 0.0111, 0.0107, 0.0095], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 17:16:44,669 INFO [train.py:876] (2/4) Epoch 3, batch 5900, loss[loss=0.2632, simple_loss=0.2318, pruned_loss=0.1473, over 5435.00 frames. ], tot_loss[loss=0.2153, simple_loss=0.204, pruned_loss=0.1133, over 1085239.96 frames. ], batch size: 53, lr: 2.27e-02, grad_scale: 8.0 +2022-11-15 17:16:44,758 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=20445.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:16:47,615 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=20449.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:16:57,853 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.0785, 4.2493, 3.4869, 1.9600, 4.2106, 1.6637, 3.8707, 2.1758], + device='cuda:2'), covar=tensor([0.0818, 0.0121, 0.0430, 0.2265, 0.0137, 0.1852, 0.0170, 0.1783], + device='cuda:2'), in_proj_covar=tensor([0.0125, 0.0087, 0.0089, 0.0119, 0.0092, 0.0129, 0.0076, 0.0122], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0002, 0.0003], + device='cuda:2') +2022-11-15 17:17:01,267 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=20468.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:17:09,321 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=20479.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:17:30,800 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.198e+02 2.074e+02 2.650e+02 3.512e+02 6.887e+02, threshold=5.300e+02, percent-clipped=1.0 +2022-11-15 17:17:42,958 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.2419, 4.4887, 4.4539, 4.7108, 3.9561, 3.3182, 5.0970, 4.3768], + device='cuda:2'), covar=tensor([0.0389, 0.0639, 0.0355, 0.0455, 0.0405, 0.0357, 0.0595, 0.0352], + device='cuda:2'), in_proj_covar=tensor([0.0057, 0.0082, 0.0067, 0.0075, 0.0059, 0.0049, 0.0092, 0.0061], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0002, 0.0002, 0.0002, 0.0001, 0.0001, 0.0002, 0.0001], + device='cuda:2') +2022-11-15 17:17:49,149 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.6047, 4.7893, 5.3646, 4.9846, 5.4817, 5.4843, 4.7640, 5.4066], + device='cuda:2'), covar=tensor([0.0217, 0.0228, 0.0267, 0.0230, 0.0268, 0.0072, 0.0189, 0.0192], + device='cuda:2'), in_proj_covar=tensor([0.0082, 0.0084, 0.0069, 0.0092, 0.0087, 0.0055, 0.0076, 0.0080], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 17:17:55,881 INFO [train.py:876] (2/4) Epoch 3, batch 6000, loss[loss=0.1338, simple_loss=0.14, pruned_loss=0.06377, over 5158.00 frames. ], tot_loss[loss=0.2172, simple_loss=0.2049, pruned_loss=0.1147, over 1083852.04 frames. ], batch size: 8, lr: 2.27e-02, grad_scale: 8.0 +2022-11-15 17:17:55,881 INFO [train.py:899] (2/4) Computing validation loss +2022-11-15 17:18:14,731 INFO [train.py:908] (2/4) Epoch 3, validation: loss=0.1788, simple_loss=0.1971, pruned_loss=0.08032, over 1530663.00 frames. +2022-11-15 17:18:14,732 INFO [train.py:909] (2/4) Maximum memory allocated so far is 4573MB +2022-11-15 17:18:25,274 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=20560.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:18:26,441 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.1827, 0.6676, 1.1215, 1.2620, 1.5740, 1.1867, 1.1904, 1.2189], + device='cuda:2'), covar=tensor([0.1476, 0.0370, 0.0542, 0.0473, 0.0581, 0.1749, 0.0330, 0.2026], + device='cuda:2'), in_proj_covar=tensor([0.0009, 0.0012, 0.0009, 0.0010, 0.0010, 0.0009, 0.0010, 0.0009], + device='cuda:2'), out_proj_covar=tensor([2.9443e-05, 3.2954e-05, 2.8769e-05, 3.1055e-05, 3.0662e-05, 2.9527e-05, + 3.1240e-05, 2.9916e-05], device='cuda:2') +2022-11-15 17:18:42,709 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.9470, 4.1623, 3.9941, 3.9371, 4.1653, 3.6425, 1.7800, 4.2542], + device='cuda:2'), covar=tensor([0.0327, 0.0339, 0.0275, 0.0296, 0.0426, 0.0470, 0.2777, 0.0307], + device='cuda:2'), in_proj_covar=tensor([0.0084, 0.0065, 0.0065, 0.0055, 0.0079, 0.0062, 0.0121, 0.0087], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0001, 0.0001, 0.0001, 0.0002, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 17:18:45,541 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=20588.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:18:59,177 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1805, 3.3056, 2.9782, 3.0082, 2.2156, 3.4497, 2.3114, 2.7893], + device='cuda:2'), covar=tensor([0.0198, 0.0056, 0.0089, 0.0131, 0.0193, 0.0036, 0.0133, 0.0035], + device='cuda:2'), in_proj_covar=tensor([0.0139, 0.0083, 0.0102, 0.0105, 0.0138, 0.0096, 0.0119, 0.0081], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001], + device='cuda:2') +2022-11-15 17:19:00,872 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.234e+02 1.977e+02 2.420e+02 3.254e+02 5.998e+02, threshold=4.840e+02, percent-clipped=5.0 +2022-11-15 17:19:25,866 INFO [train.py:876] (2/4) Epoch 3, batch 6100, loss[loss=0.2486, simple_loss=0.2338, pruned_loss=0.1317, over 5741.00 frames. ], tot_loss[loss=0.2147, simple_loss=0.2036, pruned_loss=0.1129, over 1080729.38 frames. ], batch size: 15, lr: 2.26e-02, grad_scale: 8.0 +2022-11-15 17:19:28,882 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=20649.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:19:47,907 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=20676.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:20:05,242 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.8671, 2.8072, 2.3614, 2.7502, 2.7985, 2.5143, 2.4802, 2.4144], + device='cuda:2'), covar=tensor([0.0228, 0.0352, 0.0996, 0.0416, 0.0357, 0.0413, 0.0430, 0.0446], + device='cuda:2'), in_proj_covar=tensor([0.0091, 0.0104, 0.0156, 0.0099, 0.0125, 0.0112, 0.0108, 0.0095], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 17:20:11,186 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.178e+02 2.120e+02 2.658e+02 3.170e+02 7.084e+02, threshold=5.316e+02, percent-clipped=6.0 +2022-11-15 17:20:11,614 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=6.59 vs. limit=5.0 +2022-11-15 17:20:20,750 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=20724.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:20:23,429 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.15 vs. limit=5.0 +2022-11-15 17:20:30,094 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.7871, 4.3145, 3.6899, 4.2583, 4.1710, 3.6222, 3.7560, 3.5109], + device='cuda:2'), covar=tensor([0.0344, 0.0303, 0.0989, 0.0411, 0.0299, 0.0401, 0.0373, 0.0393], + device='cuda:2'), in_proj_covar=tensor([0.0091, 0.0106, 0.0158, 0.0099, 0.0127, 0.0113, 0.0110, 0.0096], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 17:20:36,121 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=20744.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:20:36,751 INFO [train.py:876] (2/4) Epoch 3, batch 6200, loss[loss=0.2095, simple_loss=0.2036, pruned_loss=0.1077, over 5591.00 frames. ], tot_loss[loss=0.2133, simple_loss=0.203, pruned_loss=0.1118, over 1085019.80 frames. ], batch size: 23, lr: 2.26e-02, grad_scale: 8.0 +2022-11-15 17:20:36,872 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=20745.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:20:51,544 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.9892, 1.3570, 1.9264, 1.7382, 1.8145, 1.2194, 1.6775, 1.9916], + device='cuda:2'), covar=tensor([0.0038, 0.0161, 0.0034, 0.0055, 0.0041, 0.0201, 0.0091, 0.0037], + device='cuda:2'), in_proj_covar=tensor([0.0124, 0.0175, 0.0135, 0.0188, 0.0129, 0.0166, 0.0192, 0.0152], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-15 17:20:52,825 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=20768.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:20:57,642 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=20774.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:21:11,101 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=20793.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:21:22,963 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.536e+02 2.113e+02 2.498e+02 3.399e+02 6.391e+02, threshold=4.996e+02, percent-clipped=4.0 +2022-11-15 17:21:27,275 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=20816.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:21:47,814 INFO [train.py:876] (2/4) Epoch 3, batch 6300, loss[loss=0.203, simple_loss=0.1971, pruned_loss=0.1045, over 5704.00 frames. ], tot_loss[loss=0.2152, simple_loss=0.2038, pruned_loss=0.1133, over 1086502.19 frames. ], batch size: 13, lr: 2.25e-02, grad_scale: 8.0 +2022-11-15 17:21:58,669 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=20860.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:22:20,396 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.88 vs. limit=2.0 +2022-11-15 17:22:21,713 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.96 vs. limit=2.0 +2022-11-15 17:22:32,809 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=20908.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:22:34,075 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.371e+02 2.128e+02 2.723e+02 3.616e+02 7.802e+02, threshold=5.445e+02, percent-clipped=12.0 +2022-11-15 17:22:43,122 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.6980, 2.4624, 1.0902, 2.1316, 1.6095, 1.6950, 1.2141, 2.5542], + device='cuda:2'), covar=tensor([0.0414, 0.0627, 0.1182, 0.0670, 0.0754, 0.1231, 0.0704, 0.0385], + device='cuda:2'), in_proj_covar=tensor([0.0041, 0.0044, 0.0052, 0.0038, 0.0056, 0.0043, 0.0052, 0.0038], + device='cuda:2'), out_proj_covar=tensor([9.0358e-05, 1.0329e-04, 1.3239e-04, 8.9315e-05, 1.3257e-04, 1.0741e-04, + 1.2109e-04, 9.2606e-05], device='cuda:2') +2022-11-15 17:22:58,280 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=20944.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:22:58,895 INFO [train.py:876] (2/4) Epoch 3, batch 6400, loss[loss=0.2571, simple_loss=0.2273, pruned_loss=0.1434, over 5595.00 frames. ], tot_loss[loss=0.212, simple_loss=0.2019, pruned_loss=0.1111, over 1088019.59 frames. ], batch size: 50, lr: 2.25e-02, grad_scale: 8.0 +2022-11-15 17:23:00,498 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.41 vs. limit=5.0 +2022-11-15 17:23:08,923 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.3540, 2.5874, 2.5671, 2.3868, 2.5073, 2.5432, 1.0595, 2.4894], + device='cuda:2'), covar=tensor([0.0272, 0.0164, 0.0179, 0.0186, 0.0225, 0.0180, 0.2083, 0.0259], + device='cuda:2'), in_proj_covar=tensor([0.0083, 0.0065, 0.0064, 0.0054, 0.0079, 0.0059, 0.0121, 0.0086], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0001, 0.0001, 0.0001, 0.0002, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 17:23:46,176 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.067e+02 1.963e+02 2.483e+02 3.263e+02 5.829e+02, threshold=4.966e+02, percent-clipped=2.0 +2022-11-15 17:23:56,328 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.5956, 1.7225, 1.7471, 1.6662, 1.7294, 1.7526, 0.8155, 1.6893], + device='cuda:2'), covar=tensor([0.0503, 0.0329, 0.0328, 0.0240, 0.0355, 0.0208, 0.1973, 0.0468], + device='cuda:2'), in_proj_covar=tensor([0.0085, 0.0066, 0.0066, 0.0056, 0.0080, 0.0061, 0.0123, 0.0089], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0001, 0.0001, 0.0001, 0.0002, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 17:24:10,688 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=21044.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:24:11,260 INFO [train.py:876] (2/4) Epoch 3, batch 6500, loss[loss=0.2549, simple_loss=0.219, pruned_loss=0.1454, over 5176.00 frames. ], tot_loss[loss=0.2158, simple_loss=0.2038, pruned_loss=0.1139, over 1085929.99 frames. ], batch size: 92, lr: 2.24e-02, grad_scale: 8.0 +2022-11-15 17:24:17,486 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=21052.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:24:29,656 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.3756, 1.5934, 1.1995, 1.3780, 1.6688, 2.1092, 1.2604, 1.1892], + device='cuda:2'), covar=tensor([0.0031, 0.0020, 0.0020, 0.0028, 0.0017, 0.0014, 0.0036, 0.0065], + device='cuda:2'), in_proj_covar=tensor([0.0017, 0.0015, 0.0017, 0.0015, 0.0016, 0.0014, 0.0017, 0.0015], + device='cuda:2'), out_proj_covar=tensor([2.1084e-05, 1.9481e-05, 1.9239e-05, 1.6187e-05, 1.7587e-05, 1.4033e-05, + 2.5712e-05, 1.8035e-05], device='cuda:2') +2022-11-15 17:24:32,921 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=21074.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:24:45,243 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=21092.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:24:50,679 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.96 vs. limit=2.0 +2022-11-15 17:24:58,582 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.422e+02 2.262e+02 2.845e+02 3.817e+02 7.458e+02, threshold=5.690e+02, percent-clipped=11.0 +2022-11-15 17:25:00,937 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=21113.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:25:04,400 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=21118.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:25:06,963 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=21122.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:25:22,794 INFO [train.py:876] (2/4) Epoch 3, batch 6600, loss[loss=0.2421, simple_loss=0.2274, pruned_loss=0.1284, over 5676.00 frames. ], tot_loss[loss=0.2151, simple_loss=0.2032, pruned_loss=0.1135, over 1082318.85 frames. ], batch size: 36, lr: 2.23e-02, grad_scale: 8.0 +2022-11-15 17:25:22,874 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.2431, 4.6773, 5.1682, 4.7303, 5.1627, 5.2233, 4.4692, 5.1604], + device='cuda:2'), covar=tensor([0.0300, 0.0204, 0.0295, 0.0210, 0.0387, 0.0065, 0.0221, 0.0243], + device='cuda:2'), in_proj_covar=tensor([0.0086, 0.0090, 0.0073, 0.0096, 0.0093, 0.0058, 0.0080, 0.0086], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 17:25:36,828 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.4654, 4.0111, 3.4040, 3.1939, 2.2530, 3.7708, 2.2289, 3.4558], + device='cuda:2'), covar=tensor([0.0226, 0.0088, 0.0105, 0.0159, 0.0278, 0.0047, 0.0193, 0.0038], + device='cuda:2'), in_proj_covar=tensor([0.0136, 0.0083, 0.0099, 0.0103, 0.0135, 0.0094, 0.0117, 0.0079], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001], + device='cuda:2') +2022-11-15 17:25:47,393 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=21179.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:26:08,288 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.92 vs. limit=2.0 +2022-11-15 17:26:09,146 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.333e+02 2.074e+02 2.594e+02 3.441e+02 9.119e+02, threshold=5.187e+02, percent-clipped=3.0 +2022-11-15 17:26:14,774 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.31 vs. limit=2.0 +2022-11-15 17:26:33,845 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=21244.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:26:34,421 INFO [train.py:876] (2/4) Epoch 3, batch 6700, loss[loss=0.1964, simple_loss=0.1914, pruned_loss=0.1007, over 5589.00 frames. ], tot_loss[loss=0.214, simple_loss=0.2027, pruned_loss=0.1126, over 1085567.02 frames. ], batch size: 25, lr: 2.23e-02, grad_scale: 8.0 +2022-11-15 17:26:48,035 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1865, 3.2579, 2.4758, 1.6166, 3.0717, 1.2134, 3.2145, 1.9078], + device='cuda:2'), covar=tensor([0.0852, 0.0126, 0.0674, 0.1558, 0.0204, 0.1572, 0.0112, 0.1235], + device='cuda:2'), in_proj_covar=tensor([0.0130, 0.0090, 0.0092, 0.0122, 0.0096, 0.0132, 0.0080, 0.0125], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0002, 0.0003, 0.0003, 0.0003, 0.0003, 0.0002, 0.0003], + device='cuda:2') +2022-11-15 17:26:58,844 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.2694, 4.6272, 3.4902, 4.3284, 3.6061, 3.1379, 2.3653, 3.8657], + device='cuda:2'), covar=tensor([0.1490, 0.0103, 0.0651, 0.0357, 0.0317, 0.0738, 0.1716, 0.0124], + device='cuda:2'), in_proj_covar=tensor([0.0175, 0.0115, 0.0161, 0.0114, 0.0145, 0.0177, 0.0187, 0.0116], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 17:27:07,742 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=21292.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:27:20,392 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.449e+02 2.110e+02 2.601e+02 3.417e+02 8.582e+02, threshold=5.201e+02, percent-clipped=1.0 +2022-11-15 17:27:21,997 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=21312.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:27:22,008 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.1659, 1.1400, 1.2506, 0.9751, 1.3277, 1.2512, 1.0842, 1.2157], + device='cuda:2'), covar=tensor([0.0930, 0.0184, 0.0655, 0.1407, 0.0880, 0.0905, 0.0450, 0.0542], + device='cuda:2'), in_proj_covar=tensor([0.0009, 0.0010, 0.0008, 0.0009, 0.0009, 0.0008, 0.0009, 0.0008], + device='cuda:2'), out_proj_covar=tensor([2.8397e-05, 2.9896e-05, 2.6729e-05, 2.9515e-05, 2.8667e-05, 2.6992e-05, + 2.8909e-05, 2.7112e-05], device='cuda:2') +2022-11-15 17:27:27,485 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.1531, 4.9323, 4.1297, 4.7351, 4.7452, 4.0161, 4.1185, 3.8341], + device='cuda:2'), covar=tensor([0.0236, 0.0246, 0.0993, 0.0298, 0.0336, 0.0344, 0.0336, 0.0459], + device='cuda:2'), in_proj_covar=tensor([0.0097, 0.0107, 0.0167, 0.0103, 0.0133, 0.0114, 0.0111, 0.0100], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 17:27:33,709 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=21328.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:27:45,493 INFO [train.py:876] (2/4) Epoch 3, batch 6800, loss[loss=0.231, simple_loss=0.2243, pruned_loss=0.1188, over 5591.00 frames. ], tot_loss[loss=0.2128, simple_loss=0.2022, pruned_loss=0.1117, over 1083570.96 frames. ], batch size: 23, lr: 2.22e-02, grad_scale: 16.0 +2022-11-15 17:28:05,232 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=21373.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:28:17,366 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=21389.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:28:30,800 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=21408.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:28:32,049 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.395e+02 2.037e+02 2.447e+02 3.250e+02 5.677e+02, threshold=4.895e+02, percent-clipped=2.0 +2022-11-15 17:28:57,302 INFO [train.py:876] (2/4) Epoch 3, batch 6900, loss[loss=0.2589, simple_loss=0.236, pruned_loss=0.1409, over 5569.00 frames. ], tot_loss[loss=0.2144, simple_loss=0.2035, pruned_loss=0.1126, over 1089847.70 frames. ], batch size: 43, lr: 2.22e-02, grad_scale: 16.0 +2022-11-15 17:29:17,886 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=21474.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:29:28,909 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.9579, 1.6950, 2.1830, 1.3513, 0.7542, 1.8786, 1.2295, 1.5860], + device='cuda:2'), covar=tensor([0.0188, 0.0221, 0.0110, 0.0428, 0.1092, 0.0588, 0.0640, 0.0274], + device='cuda:2'), in_proj_covar=tensor([0.0029, 0.0029, 0.0028, 0.0034, 0.0032, 0.0026, 0.0030, 0.0031], + device='cuda:2'), out_proj_covar=tensor([4.9590e-05, 4.6071e-05, 4.2890e-05, 5.9496e-05, 5.6750e-05, 4.8557e-05, + 4.7867e-05, 5.0746e-05], device='cuda:2') +2022-11-15 17:29:43,555 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.421e+02 2.216e+02 2.709e+02 3.216e+02 6.064e+02, threshold=5.419e+02, percent-clipped=2.0 +2022-11-15 17:30:08,651 INFO [train.py:876] (2/4) Epoch 3, batch 7000, loss[loss=0.1888, simple_loss=0.1925, pruned_loss=0.09255, over 5668.00 frames. ], tot_loss[loss=0.2119, simple_loss=0.2017, pruned_loss=0.111, over 1088651.24 frames. ], batch size: 19, lr: 2.22e-02, grad_scale: 16.0 +2022-11-15 17:30:43,768 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=21595.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:30:55,043 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.332e+02 2.225e+02 2.783e+02 3.485e+02 7.501e+02, threshold=5.566e+02, percent-clipped=1.0 +2022-11-15 17:30:57,267 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.3617, 2.3023, 1.8108, 2.2850, 1.7818, 2.1741, 2.4527, 2.5879], + device='cuda:2'), covar=tensor([0.0701, 0.0916, 0.1285, 0.0651, 0.1046, 0.0871, 0.0740, 0.0721], + device='cuda:2'), in_proj_covar=tensor([0.0044, 0.0046, 0.0057, 0.0043, 0.0058, 0.0048, 0.0055, 0.0041], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001], + device='cuda:2') +2022-11-15 17:30:57,273 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=21613.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:31:19,550 INFO [train.py:876] (2/4) Epoch 3, batch 7100, loss[loss=0.1637, simple_loss=0.1752, pruned_loss=0.07609, over 5532.00 frames. ], tot_loss[loss=0.2116, simple_loss=0.2016, pruned_loss=0.1108, over 1084605.33 frames. ], batch size: 14, lr: 2.21e-02, grad_scale: 16.0 +2022-11-15 17:31:27,578 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=21656.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:31:36,525 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=21668.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:31:40,751 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=21674.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:31:47,844 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=21684.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:32:04,371 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=21708.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:32:05,617 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.022e+02 2.184e+02 2.618e+02 3.432e+02 5.543e+02, threshold=5.236e+02, percent-clipped=0.0 +2022-11-15 17:32:23,493 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.95 vs. limit=2.0 +2022-11-15 17:32:31,161 INFO [train.py:876] (2/4) Epoch 3, batch 7200, loss[loss=0.2241, simple_loss=0.1886, pruned_loss=0.1298, over 4073.00 frames. ], tot_loss[loss=0.2114, simple_loss=0.2012, pruned_loss=0.1108, over 1080316.08 frames. ], batch size: 181, lr: 2.21e-02, grad_scale: 16.0 +2022-11-15 17:32:38,707 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=21756.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:32:43,676 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=21763.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:32:51,895 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=21774.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:33:16,994 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.323e+02 2.105e+02 2.662e+02 3.428e+02 5.862e+02, threshold=5.324e+02, percent-clipped=2.0 +2022-11-15 17:34:13,376 INFO [train.py:876] (2/4) Epoch 4, batch 0, loss[loss=0.2608, simple_loss=0.2533, pruned_loss=0.1342, over 5748.00 frames. ], tot_loss[loss=0.2608, simple_loss=0.2533, pruned_loss=0.1342, over 5748.00 frames. ], batch size: 16, lr: 2.06e-02, grad_scale: 16.0 +2022-11-15 17:34:13,377 INFO [train.py:899] (2/4) Computing validation loss +2022-11-15 17:34:20,234 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.1502, 3.9887, 4.0502, 4.1440, 4.1841, 4.3572, 4.0391, 4.1768], + device='cuda:2'), covar=tensor([0.0349, 0.0230, 0.0376, 0.0183, 0.0347, 0.0085, 0.0261, 0.0382], + device='cuda:2'), in_proj_covar=tensor([0.0086, 0.0088, 0.0072, 0.0097, 0.0096, 0.0056, 0.0080, 0.0087], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 17:34:21,842 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.3472, 4.3621, 4.6458, 4.3345, 4.2797, 4.2311, 4.9607, 4.4405], + device='cuda:2'), covar=tensor([0.0484, 0.0798, 0.0344, 0.0537, 0.0366, 0.0289, 0.0588, 0.0287], + device='cuda:2'), in_proj_covar=tensor([0.0055, 0.0077, 0.0063, 0.0073, 0.0057, 0.0048, 0.0091, 0.0059], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0002, 0.0002, 0.0002, 0.0001, 0.0001, 0.0002, 0.0001], + device='cuda:2') +2022-11-15 17:34:27,207 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.4407, 0.8576, 1.1368, 1.3907, 1.1737, 1.1405, 0.7569, 1.1693], + device='cuda:2'), covar=tensor([0.0137, 0.0071, 0.0241, 0.0412, 0.0155, 0.0323, 0.0315, 0.0231], + device='cuda:2'), in_proj_covar=tensor([0.0008, 0.0010, 0.0008, 0.0008, 0.0009, 0.0009, 0.0009, 0.0009], + device='cuda:2'), out_proj_covar=tensor([2.9260e-05, 3.0943e-05, 2.6788e-05, 2.9744e-05, 2.9754e-05, 2.9359e-05, + 2.9969e-05, 2.8538e-05], device='cuda:2') +2022-11-15 17:34:29,685 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.1365, 2.3716, 3.6167, 3.1555, 3.8512, 2.3488, 3.2391, 3.9240], + device='cuda:2'), covar=tensor([0.0136, 0.0632, 0.0195, 0.0689, 0.0103, 0.0551, 0.0549, 0.0213], + device='cuda:2'), in_proj_covar=tensor([0.0128, 0.0174, 0.0141, 0.0191, 0.0131, 0.0165, 0.0194, 0.0160], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-15 17:34:30,839 INFO [train.py:908] (2/4) Epoch 4, validation: loss=0.1863, simple_loss=0.204, pruned_loss=0.08431, over 1530663.00 frames. +2022-11-15 17:34:30,841 INFO [train.py:909] (2/4) Maximum memory allocated so far is 4693MB +2022-11-15 17:34:34,343 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=21822.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:34:35,817 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=21824.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:34:55,165 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=21850.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:35:20,456 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.8170, 4.4385, 3.7611, 4.4316, 4.2993, 3.5931, 3.7364, 3.3571], + device='cuda:2'), covar=tensor([0.0338, 0.0317, 0.1160, 0.0311, 0.0388, 0.0387, 0.0404, 0.0744], + device='cuda:2'), in_proj_covar=tensor([0.0093, 0.0109, 0.0167, 0.0102, 0.0131, 0.0115, 0.0110, 0.0096], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 17:35:24,470 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.5867, 2.7275, 2.6239, 2.4262, 2.6496, 2.6816, 1.2353, 2.6526], + device='cuda:2'), covar=tensor([0.0282, 0.0215, 0.0211, 0.0236, 0.0325, 0.0232, 0.2432, 0.0286], + device='cuda:2'), in_proj_covar=tensor([0.0086, 0.0068, 0.0067, 0.0057, 0.0082, 0.0063, 0.0123, 0.0087], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0001, 0.0001, 0.0001, 0.0002, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 17:35:38,005 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.349e+02 2.101e+02 2.586e+02 3.383e+02 7.997e+02, threshold=5.171e+02, percent-clipped=3.0 +2022-11-15 17:35:38,904 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=21911.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:35:40,662 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.98 vs. limit=5.0 +2022-11-15 17:35:42,901 INFO [train.py:876] (2/4) Epoch 4, batch 100, loss[loss=0.2236, simple_loss=0.1969, pruned_loss=0.1252, over 4764.00 frames. ], tot_loss[loss=0.2029, simple_loss=0.1972, pruned_loss=0.1043, over 437787.74 frames. ], batch size: 135, lr: 2.05e-02, grad_scale: 16.0 +2022-11-15 17:36:07,078 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=21951.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:36:16,001 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.71 vs. limit=2.0 +2022-11-15 17:36:19,006 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=21968.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:36:19,642 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=21969.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:36:20,402 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=21970.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:36:29,993 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=21984.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:36:37,579 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.0195, 0.6155, 1.0342, 1.0723, 1.2854, 1.3277, 0.8860, 1.3032], + device='cuda:2'), covar=tensor([0.0495, 0.0219, 0.0450, 0.0873, 0.0248, 0.0981, 0.0345, 0.0651], + device='cuda:2'), in_proj_covar=tensor([0.0009, 0.0011, 0.0009, 0.0009, 0.0009, 0.0009, 0.0010, 0.0009], + device='cuda:2'), out_proj_covar=tensor([3.0637e-05, 3.2177e-05, 2.8503e-05, 3.1054e-05, 3.1595e-05, 3.0421e-05, + 3.1514e-05, 2.9684e-05], device='cuda:2') +2022-11-15 17:36:42,405 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.20 vs. limit=5.0 +2022-11-15 17:36:49,994 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.351e+02 2.318e+02 2.900e+02 3.607e+02 8.310e+02, threshold=5.801e+02, percent-clipped=7.0 +2022-11-15 17:36:53,600 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=22016.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:36:54,234 INFO [train.py:876] (2/4) Epoch 4, batch 200, loss[loss=0.265, simple_loss=0.2339, pruned_loss=0.148, over 5259.00 frames. ], tot_loss[loss=0.2079, simple_loss=0.1996, pruned_loss=0.1081, over 696410.04 frames. ], batch size: 79, lr: 2.05e-02, grad_scale: 8.0 +2022-11-15 17:37:04,765 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=22031.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:37:05,301 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=22032.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:37:31,344 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.97 vs. limit=2.0 +2022-11-15 17:38:01,135 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.3330, 3.0499, 3.1811, 2.9244, 3.3606, 3.1941, 3.2336, 3.3350], + device='cuda:2'), covar=tensor([0.0385, 0.0303, 0.0471, 0.0371, 0.0389, 0.0166, 0.0243, 0.0379], + device='cuda:2'), in_proj_covar=tensor([0.0088, 0.0092, 0.0075, 0.0101, 0.0098, 0.0059, 0.0081, 0.0090], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 17:38:01,664 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.234e+02 1.970e+02 2.360e+02 3.051e+02 4.623e+02, threshold=4.719e+02, percent-clipped=0.0 +2022-11-15 17:38:06,229 INFO [train.py:876] (2/4) Epoch 4, batch 300, loss[loss=0.2066, simple_loss=0.2062, pruned_loss=0.1034, over 5666.00 frames. ], tot_loss[loss=0.2088, simple_loss=0.2001, pruned_loss=0.1087, over 850263.19 frames. ], batch size: 19, lr: 2.05e-02, grad_scale: 8.0 +2022-11-15 17:38:07,662 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=22119.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:38:08,938 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.6634, 2.4937, 2.4773, 2.2900, 2.7242, 2.5476, 2.7412, 2.6109], + device='cuda:2'), covar=tensor([0.0937, 0.0826, 0.1079, 0.1144, 0.0918, 0.0539, 0.0665, 0.1055], + device='cuda:2'), in_proj_covar=tensor([0.0088, 0.0091, 0.0075, 0.0100, 0.0097, 0.0059, 0.0081, 0.0089], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 17:38:15,912 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=22130.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:38:48,222 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=22176.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:38:58,760 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.2062, 3.2982, 3.4045, 1.2012, 2.9064, 3.9560, 3.2687, 3.4522], + device='cuda:2'), covar=tensor([0.1350, 0.0836, 0.0383, 0.1800, 0.0158, 0.0084, 0.0165, 0.0149], + device='cuda:2'), in_proj_covar=tensor([0.0182, 0.0180, 0.0131, 0.0185, 0.0121, 0.0111, 0.0115, 0.0131], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 17:38:59,420 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=22191.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:39:09,870 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=22206.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:39:13,064 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.175e+02 2.158e+02 2.699e+02 3.433e+02 6.693e+02, threshold=5.398e+02, percent-clipped=7.0 +2022-11-15 17:39:15,890 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=22215.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:39:17,104 INFO [train.py:876] (2/4) Epoch 4, batch 400, loss[loss=0.1741, simple_loss=0.1904, pruned_loss=0.07889, over 5710.00 frames. ], tot_loss[loss=0.2103, simple_loss=0.2016, pruned_loss=0.1095, over 946958.93 frames. ], batch size: 28, lr: 2.04e-02, grad_scale: 8.0 +2022-11-15 17:39:31,510 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=22237.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:39:31,533 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=22237.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:39:40,042 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.1447, 1.8335, 1.8365, 1.4795, 0.5455, 2.0758, 1.6701, 1.5749], + device='cuda:2'), covar=tensor([0.0328, 0.0227, 0.0210, 0.0477, 0.0955, 0.0423, 0.0367, 0.0299], + device='cuda:2'), in_proj_covar=tensor([0.0029, 0.0028, 0.0030, 0.0032, 0.0030, 0.0025, 0.0028, 0.0030], + device='cuda:2'), out_proj_covar=tensor([4.9954e-05, 4.4898e-05, 4.5250e-05, 5.7276e-05, 5.3655e-05, 4.6415e-05, + 4.6680e-05, 4.9376e-05], device='cuda:2') +2022-11-15 17:39:41,370 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=22251.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:39:54,919 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=22269.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:39:59,911 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=22276.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:40:15,784 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=22298.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:40:16,319 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=22299.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:40:25,359 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.316e+02 2.107e+02 2.622e+02 3.333e+02 5.947e+02, threshold=5.245e+02, percent-clipped=1.0 +2022-11-15 17:40:29,489 INFO [train.py:876] (2/4) Epoch 4, batch 500, loss[loss=0.1724, simple_loss=0.1754, pruned_loss=0.08473, over 5722.00 frames. ], tot_loss[loss=0.2089, simple_loss=0.2012, pruned_loss=0.1083, over 1002759.80 frames. ], batch size: 12, lr: 2.04e-02, grad_scale: 8.0 +2022-11-15 17:40:29,531 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=22317.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:40:35,643 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=22326.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:40:43,208 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0488, 3.8594, 3.2443, 3.2265, 2.1675, 3.8432, 2.0303, 3.2147], + device='cuda:2'), covar=tensor([0.0249, 0.0068, 0.0091, 0.0167, 0.0244, 0.0051, 0.0173, 0.0034], + device='cuda:2'), in_proj_covar=tensor([0.0141, 0.0087, 0.0106, 0.0106, 0.0141, 0.0101, 0.0120, 0.0082], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0001], + device='cuda:2') +2022-11-15 17:40:58,069 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.04 vs. limit=5.0 +2022-11-15 17:41:17,525 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=22385.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:41:31,040 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.79 vs. limit=5.0 +2022-11-15 17:41:36,526 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.346e+02 2.045e+02 2.785e+02 3.827e+02 6.845e+02, threshold=5.570e+02, percent-clipped=5.0 +2022-11-15 17:41:40,690 INFO [train.py:876] (2/4) Epoch 4, batch 600, loss[loss=0.2068, simple_loss=0.1913, pruned_loss=0.1112, over 5626.00 frames. ], tot_loss[loss=0.207, simple_loss=0.1998, pruned_loss=0.1071, over 1038106.31 frames. ], batch size: 18, lr: 2.03e-02, grad_scale: 8.0 +2022-11-15 17:41:42,567 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=22419.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:42:01,408 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=22446.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:42:16,336 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=22467.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:42:17,168 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.8600, 2.2820, 2.7642, 3.7518, 3.8961, 2.9813, 2.5793, 4.0364], + device='cuda:2'), covar=tensor([0.0137, 0.2784, 0.2263, 0.2091, 0.0428, 0.2224, 0.1844, 0.0113], + device='cuda:2'), in_proj_covar=tensor([0.0139, 0.0229, 0.0230, 0.0238, 0.0190, 0.0232, 0.0204, 0.0140], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-15 17:42:26,597 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.39 vs. limit=2.0 +2022-11-15 17:42:29,715 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=22486.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:42:43,782 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=22506.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:42:47,538 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.340e+02 1.953e+02 2.259e+02 3.253e+02 4.972e+02, threshold=4.518e+02, percent-clipped=0.0 +2022-11-15 17:42:51,953 INFO [train.py:876] (2/4) Epoch 4, batch 700, loss[loss=0.2794, simple_loss=0.2228, pruned_loss=0.168, over 3072.00 frames. ], tot_loss[loss=0.2063, simple_loss=0.1991, pruned_loss=0.1068, over 1056165.97 frames. ], batch size: 284, lr: 2.03e-02, grad_scale: 8.0 +2022-11-15 17:43:02,564 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=22532.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:43:18,015 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=22554.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:43:30,242 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=22571.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:43:34,097 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.9153, 2.0018, 1.3794, 2.5434, 1.4957, 1.6646, 1.5754, 2.4163], + device='cuda:2'), covar=tensor([0.0844, 0.0593, 0.2221, 0.0299, 0.1093, 0.0630, 0.0960, 0.0516], + device='cuda:2'), in_proj_covar=tensor([0.0045, 0.0052, 0.0064, 0.0044, 0.0061, 0.0048, 0.0059, 0.0042], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0002, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001], + device='cuda:2') +2022-11-15 17:43:43,528 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.67 vs. limit=5.0 +2022-11-15 17:43:45,946 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=22593.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:43:58,685 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.225e+02 2.071e+02 2.662e+02 3.271e+02 7.195e+02, threshold=5.325e+02, percent-clipped=7.0 +2022-11-15 17:44:02,730 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.9207, 0.9358, 0.8381, 0.9659, 1.2115, 1.0004, 0.6653, 1.1347], + device='cuda:2'), covar=tensor([0.0348, 0.0401, 0.0602, 0.0885, 0.0419, 0.0275, 0.0844, 0.0627], + device='cuda:2'), in_proj_covar=tensor([0.0008, 0.0011, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0008], + device='cuda:2'), out_proj_covar=tensor([2.9941e-05, 3.3075e-05, 2.9215e-05, 3.1331e-05, 2.9835e-05, 2.9933e-05, + 3.0675e-05, 2.7853e-05], device='cuda:2') +2022-11-15 17:44:03,252 INFO [train.py:876] (2/4) Epoch 4, batch 800, loss[loss=0.2839, simple_loss=0.2511, pruned_loss=0.1584, over 5474.00 frames. ], tot_loss[loss=0.2044, simple_loss=0.1985, pruned_loss=0.1052, over 1073364.51 frames. ], batch size: 58, lr: 2.02e-02, grad_scale: 8.0 +2022-11-15 17:44:09,817 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=22626.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:44:19,312 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=22639.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:44:43,330 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=22674.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:44:49,273 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.7933, 1.9388, 2.5494, 3.7064, 3.8960, 2.7914, 1.9694, 3.7600], + device='cuda:2'), covar=tensor([0.0262, 0.3732, 0.3002, 0.1526, 0.0462, 0.3040, 0.2596, 0.0148], + device='cuda:2'), in_proj_covar=tensor([0.0145, 0.0224, 0.0228, 0.0246, 0.0192, 0.0233, 0.0204, 0.0148], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-15 17:44:57,095 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.1598, 1.8505, 2.2013, 3.1247, 3.2429, 2.4520, 1.8435, 3.1134], + device='cuda:2'), covar=tensor([0.0338, 0.3117, 0.2433, 0.2400, 0.0567, 0.2362, 0.2189, 0.0194], + device='cuda:2'), in_proj_covar=tensor([0.0146, 0.0224, 0.0227, 0.0246, 0.0193, 0.0231, 0.0204, 0.0147], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-15 17:45:01,826 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=22700.0, num_to_drop=1, layers_to_drop={3} +2022-11-15 17:45:09,603 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.215e+02 2.138e+02 2.671e+02 3.427e+02 5.214e+02, threshold=5.342e+02, percent-clipped=0.0 +2022-11-15 17:45:13,838 INFO [train.py:876] (2/4) Epoch 4, batch 900, loss[loss=0.2008, simple_loss=0.1916, pruned_loss=0.105, over 5264.00 frames. ], tot_loss[loss=0.2054, simple_loss=0.1986, pruned_loss=0.1062, over 1079794.33 frames. ], batch size: 79, lr: 2.02e-02, grad_scale: 8.0 +2022-11-15 17:45:26,183 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.36 vs. limit=5.0 +2022-11-15 17:45:31,210 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=22741.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:45:49,207 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.21 vs. limit=5.0 +2022-11-15 17:46:03,083 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=22786.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:46:08,949 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=22794.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:46:20,973 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.117e+02 2.200e+02 2.748e+02 3.509e+02 2.017e+03, threshold=5.496e+02, percent-clipped=6.0 +2022-11-15 17:46:25,453 INFO [train.py:876] (2/4) Epoch 4, batch 1000, loss[loss=0.2608, simple_loss=0.2333, pruned_loss=0.1442, over 5745.00 frames. ], tot_loss[loss=0.2064, simple_loss=0.1989, pruned_loss=0.107, over 1077700.81 frames. ], batch size: 27, lr: 2.02e-02, grad_scale: 8.0 +2022-11-15 17:46:27,055 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.2583, 3.5639, 3.1900, 3.0212, 2.1503, 3.5066, 2.0582, 3.1205], + device='cuda:2'), covar=tensor([0.0227, 0.0062, 0.0081, 0.0143, 0.0212, 0.0051, 0.0175, 0.0041], + device='cuda:2'), in_proj_covar=tensor([0.0145, 0.0086, 0.0105, 0.0110, 0.0143, 0.0102, 0.0121, 0.0085], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 17:46:31,072 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=22825.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:46:35,904 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=22832.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:46:37,246 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=22834.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:46:52,934 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=22855.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:47:04,126 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=22871.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:47:10,244 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=22880.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:47:14,471 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=22886.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:47:19,586 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=22893.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:47:32,216 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.242e+02 1.971e+02 2.339e+02 2.906e+02 5.624e+02, threshold=4.678e+02, percent-clipped=1.0 +2022-11-15 17:47:36,398 INFO [train.py:876] (2/4) Epoch 4, batch 1100, loss[loss=0.1811, simple_loss=0.1869, pruned_loss=0.08768, over 5714.00 frames. ], tot_loss[loss=0.2059, simple_loss=0.1985, pruned_loss=0.1067, over 1080618.13 frames. ], batch size: 28, lr: 2.01e-02, grad_scale: 8.0 +2022-11-15 17:47:37,872 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=22919.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:47:53,252 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=22941.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:48:31,533 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=22995.0, num_to_drop=1, layers_to_drop={2} +2022-11-15 17:48:43,816 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.434e+02 2.126e+02 2.633e+02 3.216e+02 6.006e+02, threshold=5.267e+02, percent-clipped=5.0 +2022-11-15 17:48:47,954 INFO [train.py:876] (2/4) Epoch 4, batch 1200, loss[loss=0.1861, simple_loss=0.1869, pruned_loss=0.09267, over 5717.00 frames. ], tot_loss[loss=0.2027, simple_loss=0.1964, pruned_loss=0.1045, over 1083410.67 frames. ], batch size: 28, lr: 2.01e-02, grad_scale: 8.0 +2022-11-15 17:49:04,970 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=23041.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:49:06,636 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.85 vs. limit=2.0 +2022-11-15 17:49:16,532 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.2132, 2.6323, 1.6271, 3.2432, 2.0780, 2.4432, 2.2444, 2.7546], + device='cuda:2'), covar=tensor([0.0305, 0.0464, 0.1546, 0.0223, 0.0873, 0.0386, 0.0588, 0.3038], + device='cuda:2'), in_proj_covar=tensor([0.0039, 0.0050, 0.0059, 0.0042, 0.0058, 0.0046, 0.0056, 0.0040], + device='cuda:2'), out_proj_covar=tensor([9.7199e-05, 1.2262e-04, 1.5215e-04, 1.0538e-04, 1.4106e-04, 1.1947e-04, + 1.3443e-04, 1.0446e-04], device='cuda:2') +2022-11-15 17:49:32,895 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1047, 4.3186, 3.2147, 4.0652, 3.2367, 2.6209, 2.2679, 3.6778], + device='cuda:2'), covar=tensor([0.1734, 0.0122, 0.0742, 0.0224, 0.0498, 0.1028, 0.1807, 0.0265], + device='cuda:2'), in_proj_covar=tensor([0.0179, 0.0114, 0.0161, 0.0112, 0.0153, 0.0180, 0.0188, 0.0121], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 17:49:38,855 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=23089.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:49:47,158 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.5023, 5.1411, 3.7404, 2.2399, 4.8614, 2.1560, 4.6525, 2.8813], + device='cuda:2'), covar=tensor([0.0919, 0.0062, 0.0293, 0.1836, 0.0104, 0.1589, 0.0107, 0.1371], + device='cuda:2'), in_proj_covar=tensor([0.0126, 0.0088, 0.0093, 0.0116, 0.0094, 0.0128, 0.0081, 0.0119], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0002, 0.0003, 0.0003, 0.0003, 0.0004, 0.0002, 0.0003], + device='cuda:2') +2022-11-15 17:49:52,411 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=23108.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:49:54,240 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.201e+02 2.049e+02 2.605e+02 3.403e+02 6.305e+02, threshold=5.209e+02, percent-clipped=2.0 +2022-11-15 17:49:58,726 INFO [train.py:876] (2/4) Epoch 4, batch 1300, loss[loss=0.2874, simple_loss=0.2425, pruned_loss=0.1661, over 5445.00 frames. ], tot_loss[loss=0.2038, simple_loss=0.1973, pruned_loss=0.1052, over 1084116.75 frames. ], batch size: 64, lr: 2.00e-02, grad_scale: 8.0 +2022-11-15 17:50:08,510 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.6523, 4.2549, 4.4809, 4.2057, 4.6807, 4.4915, 4.1771, 4.6910], + device='cuda:2'), covar=tensor([0.0350, 0.0234, 0.0408, 0.0253, 0.0331, 0.0136, 0.0241, 0.0225], + device='cuda:2'), in_proj_covar=tensor([0.0087, 0.0091, 0.0076, 0.0098, 0.0095, 0.0057, 0.0083, 0.0088], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 17:50:16,519 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.0901, 4.3238, 4.5926, 4.4334, 3.6981, 3.4591, 4.9233, 4.1660], + device='cuda:2'), covar=tensor([0.0480, 0.0862, 0.0351, 0.0680, 0.0626, 0.0327, 0.0843, 0.0413], + device='cuda:2'), in_proj_covar=tensor([0.0060, 0.0080, 0.0067, 0.0078, 0.0061, 0.0049, 0.0097, 0.0064], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0002, 0.0002, 0.0002, 0.0001, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 17:50:21,967 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=23150.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:50:36,148 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=23169.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:50:44,515 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=23181.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:51:05,597 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.170e+01 2.060e+02 2.584e+02 3.363e+02 5.860e+02, threshold=5.168e+02, percent-clipped=2.0 +2022-11-15 17:51:09,743 INFO [train.py:876] (2/4) Epoch 4, batch 1400, loss[loss=0.2185, simple_loss=0.2082, pruned_loss=0.1144, over 5750.00 frames. ], tot_loss[loss=0.2049, simple_loss=0.198, pruned_loss=0.1059, over 1088997.99 frames. ], batch size: 27, lr: 2.00e-02, grad_scale: 8.0 +2022-11-15 17:51:09,910 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=23217.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:51:19,220 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=23229.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:51:53,381 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=23278.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:52:02,690 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=23290.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:52:06,041 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=23295.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:52:17,014 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.385e+02 2.039e+02 2.495e+02 2.958e+02 6.410e+02, threshold=4.991e+02, percent-clipped=2.0 +2022-11-15 17:52:21,019 INFO [train.py:876] (2/4) Epoch 4, batch 1500, loss[loss=0.1869, simple_loss=0.1914, pruned_loss=0.09118, over 5735.00 frames. ], tot_loss[loss=0.2028, simple_loss=0.1964, pruned_loss=0.1045, over 1085569.65 frames. ], batch size: 14, lr: 1.99e-02, grad_scale: 8.0 +2022-11-15 17:52:39,664 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=23343.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:53:26,479 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.7227, 4.4044, 3.4934, 3.3191, 2.6480, 4.4336, 2.4947, 3.7801], + device='cuda:2'), covar=tensor([0.0260, 0.0080, 0.0120, 0.0275, 0.0327, 0.0051, 0.0231, 0.0047], + device='cuda:2'), in_proj_covar=tensor([0.0146, 0.0091, 0.0110, 0.0114, 0.0145, 0.0106, 0.0125, 0.0089], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 17:53:27,575 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.369e+02 2.167e+02 2.612e+02 3.180e+02 6.082e+02, threshold=5.224e+02, percent-clipped=1.0 +2022-11-15 17:53:32,212 INFO [train.py:876] (2/4) Epoch 4, batch 1600, loss[loss=0.1507, simple_loss=0.1611, pruned_loss=0.07009, over 5106.00 frames. ], tot_loss[loss=0.2025, simple_loss=0.196, pruned_loss=0.1045, over 1080487.00 frames. ], batch size: 7, lr: 1.99e-02, grad_scale: 8.0 +2022-11-15 17:53:44,309 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.41 vs. limit=5.0 +2022-11-15 17:53:56,053 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=23450.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:54:05,587 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=23464.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:54:17,876 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=23481.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:54:22,740 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=23488.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:54:29,479 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=23498.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:54:39,498 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.059e+02 1.935e+02 2.419e+02 3.182e+02 7.599e+02, threshold=4.837e+02, percent-clipped=7.0 +2022-11-15 17:54:41,774 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.1772, 2.0876, 3.6292, 2.9778, 4.0582, 2.3058, 3.4172, 4.0417], + device='cuda:2'), covar=tensor([0.0111, 0.0857, 0.0189, 0.0530, 0.0082, 0.0615, 0.0371, 0.0162], + device='cuda:2'), in_proj_covar=tensor([0.0134, 0.0174, 0.0141, 0.0189, 0.0133, 0.0167, 0.0195, 0.0160], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-15 17:54:43,668 INFO [train.py:876] (2/4) Epoch 4, batch 1700, loss[loss=0.1324, simple_loss=0.1449, pruned_loss=0.05995, over 5460.00 frames. ], tot_loss[loss=0.2004, simple_loss=0.1953, pruned_loss=0.1028, over 1088665.83 frames. ], batch size: 10, lr: 1.99e-02, grad_scale: 8.0 +2022-11-15 17:54:51,898 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=23529.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:55:06,120 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=23549.0, num_to_drop=1, layers_to_drop={2} +2022-11-15 17:55:20,488 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.93 vs. limit=2.0 +2022-11-15 17:55:23,673 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=23573.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:55:25,835 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.8241, 1.2686, 1.2809, 0.6327, 1.5115, 1.7179, 0.9328, 1.2159], + device='cuda:2'), covar=tensor([0.0021, 0.0016, 0.0012, 0.0016, 0.0010, 0.0011, 0.0020, 0.0014], + device='cuda:2'), in_proj_covar=tensor([0.0020, 0.0019, 0.0018, 0.0018, 0.0018, 0.0017, 0.0019, 0.0017], + device='cuda:2'), out_proj_covar=tensor([2.3738e-05, 2.3328e-05, 1.8977e-05, 1.8183e-05, 1.9147e-05, 1.6085e-05, + 2.7221e-05, 1.7395e-05], device='cuda:2') +2022-11-15 17:55:26,725 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.96 vs. limit=5.0 +2022-11-15 17:55:32,248 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=23585.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:55:51,050 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.312e+02 2.088e+02 2.577e+02 3.329e+02 6.388e+02, threshold=5.153e+02, percent-clipped=6.0 +2022-11-15 17:55:55,440 INFO [train.py:876] (2/4) Epoch 4, batch 1800, loss[loss=0.2111, simple_loss=0.2082, pruned_loss=0.107, over 5629.00 frames. ], tot_loss[loss=0.2007, simple_loss=0.1951, pruned_loss=0.1031, over 1084284.34 frames. ], batch size: 23, lr: 1.98e-02, grad_scale: 8.0 +2022-11-15 17:55:58,303 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.4094, 1.5766, 1.6399, 2.1808, 1.5788, 1.2016, 1.3565, 1.5333], + device='cuda:2'), covar=tensor([0.0016, 0.0026, 0.0024, 0.0015, 0.0017, 0.0050, 0.0022, 0.0021], + device='cuda:2'), in_proj_covar=tensor([0.0013, 0.0014, 0.0012, 0.0015, 0.0014, 0.0016, 0.0016, 0.0014], + device='cuda:2'), out_proj_covar=tensor([1.6212e-05, 1.6942e-05, 1.5366e-05, 1.7571e-05, 1.4221e-05, 1.8256e-05, + 1.9139e-05, 1.8436e-05], device='cuda:2') +2022-11-15 17:56:02,004 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.92 vs. limit=2.0 +2022-11-15 17:56:10,338 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.3612, 2.1108, 2.8728, 3.4214, 3.6800, 2.6999, 2.3202, 3.9376], + device='cuda:2'), covar=tensor([0.0282, 0.3234, 0.2419, 0.2023, 0.0599, 0.2996, 0.2640, 0.0206], + device='cuda:2'), in_proj_covar=tensor([0.0145, 0.0222, 0.0232, 0.0261, 0.0199, 0.0233, 0.0208, 0.0151], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-15 17:57:01,352 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.273e+02 2.086e+02 2.661e+02 3.167e+02 7.420e+02, threshold=5.321e+02, percent-clipped=1.0 +2022-11-15 17:57:05,418 INFO [train.py:876] (2/4) Epoch 4, batch 1900, loss[loss=0.2027, simple_loss=0.1964, pruned_loss=0.1044, over 5770.00 frames. ], tot_loss[loss=0.2054, simple_loss=0.1984, pruned_loss=0.1062, over 1085973.48 frames. ], batch size: 16, lr: 1.98e-02, grad_scale: 8.0 +2022-11-15 17:57:39,167 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=23764.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:58:09,479 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.5187, 2.3348, 1.6119, 2.4460, 1.7340, 2.1763, 2.1798, 2.1339], + device='cuda:2'), covar=tensor([0.0611, 0.0926, 0.2172, 0.0835, 0.1904, 0.0893, 0.1309, 0.6609], + device='cuda:2'), in_proj_covar=tensor([0.0043, 0.0054, 0.0063, 0.0045, 0.0064, 0.0049, 0.0060, 0.0044], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0002, 0.0001, 0.0002, 0.0001, 0.0001, 0.0001], + device='cuda:2') +2022-11-15 17:58:12,697 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.186e+02 2.073e+02 2.797e+02 3.748e+02 9.524e+02, threshold=5.593e+02, percent-clipped=9.0 +2022-11-15 17:58:13,469 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=23812.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:58:16,812 INFO [train.py:876] (2/4) Epoch 4, batch 2000, loss[loss=0.1429, simple_loss=0.1453, pruned_loss=0.07024, over 5150.00 frames. ], tot_loss[loss=0.2031, simple_loss=0.1966, pruned_loss=0.1048, over 1087315.46 frames. ], batch size: 8, lr: 1.97e-02, grad_scale: 8.0 +2022-11-15 17:58:25,488 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.2571, 3.3911, 3.2083, 1.1371, 3.2648, 3.5234, 3.4326, 3.7932], + device='cuda:2'), covar=tensor([0.1371, 0.0747, 0.0403, 0.1950, 0.0136, 0.0163, 0.0220, 0.0154], + device='cuda:2'), in_proj_covar=tensor([0.0189, 0.0189, 0.0135, 0.0196, 0.0130, 0.0127, 0.0116, 0.0141], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 17:58:35,088 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.94 vs. limit=2.0 +2022-11-15 17:58:36,708 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=23844.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 17:58:51,384 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.7022, 3.7022, 3.5021, 3.8303, 3.0989, 2.9164, 4.1932, 3.4454], + device='cuda:2'), covar=tensor([0.0412, 0.0643, 0.0457, 0.0580, 0.0675, 0.0408, 0.0562, 0.0470], + device='cuda:2'), in_proj_covar=tensor([0.0060, 0.0080, 0.0068, 0.0078, 0.0062, 0.0050, 0.0099, 0.0066], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0002, 0.0002, 0.0002, 0.0001, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 17:58:56,894 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=23873.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:58:59,063 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.23 vs. limit=2.0 +2022-11-15 17:59:05,882 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=23885.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:59:13,176 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.5913, 4.7444, 3.4431, 4.5439, 3.5779, 3.3302, 2.5277, 4.1834], + device='cuda:2'), covar=tensor([0.1301, 0.0104, 0.0595, 0.0173, 0.0333, 0.0660, 0.1481, 0.0114], + device='cuda:2'), in_proj_covar=tensor([0.0181, 0.0119, 0.0168, 0.0115, 0.0156, 0.0180, 0.0191, 0.0122], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 17:59:13,203 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1123, 2.1886, 1.7018, 2.2241, 1.5308, 2.0572, 2.2113, 2.5885], + device='cuda:2'), covar=tensor([0.0469, 0.1202, 0.2052, 0.1834, 0.1618, 0.0892, 0.1409, 0.0992], + device='cuda:2'), in_proj_covar=tensor([0.0041, 0.0052, 0.0062, 0.0045, 0.0061, 0.0048, 0.0057, 0.0043], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0002, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001], + device='cuda:2') +2022-11-15 17:59:21,086 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.6576, 4.9804, 5.4689, 5.0406, 5.7004, 5.6563, 4.7920, 5.6176], + device='cuda:2'), covar=tensor([0.0251, 0.0174, 0.0311, 0.0219, 0.0239, 0.0045, 0.0186, 0.0130], + device='cuda:2'), in_proj_covar=tensor([0.0091, 0.0097, 0.0077, 0.0103, 0.0101, 0.0060, 0.0084, 0.0090], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 17:59:23,775 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.127e+02 2.154e+02 2.632e+02 3.272e+02 6.334e+02, threshold=5.263e+02, percent-clipped=2.0 +2022-11-15 17:59:27,896 INFO [train.py:876] (2/4) Epoch 4, batch 2100, loss[loss=0.2298, simple_loss=0.2202, pruned_loss=0.1197, over 5601.00 frames. ], tot_loss[loss=0.2027, simple_loss=0.1968, pruned_loss=0.1043, over 1091507.64 frames. ], batch size: 24, lr: 1.97e-02, grad_scale: 8.0 +2022-11-15 17:59:31,170 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=23921.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:59:39,170 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=23933.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 17:59:40,606 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.9242, 4.5596, 3.8500, 4.5230, 4.5178, 3.6666, 4.0514, 3.8090], + device='cuda:2'), covar=tensor([0.0518, 0.0374, 0.1328, 0.0321, 0.0391, 0.0437, 0.0648, 0.0846], + device='cuda:2'), in_proj_covar=tensor([0.0103, 0.0115, 0.0179, 0.0111, 0.0144, 0.0125, 0.0123, 0.0110], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 17:59:53,918 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.4031, 2.3282, 2.1678, 2.5101, 2.3571, 2.2059, 2.6820, 2.4506], + device='cuda:2'), covar=tensor([0.0531, 0.0880, 0.0691, 0.0723, 0.0547, 0.0392, 0.0880, 0.0569], + device='cuda:2'), in_proj_covar=tensor([0.0060, 0.0081, 0.0068, 0.0078, 0.0062, 0.0050, 0.0099, 0.0066], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0002, 0.0002, 0.0002, 0.0001, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 18:00:09,323 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.75 vs. limit=2.0 +2022-11-15 18:00:34,706 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.500e+02 2.023e+02 2.446e+02 2.916e+02 5.131e+02, threshold=4.891e+02, percent-clipped=0.0 +2022-11-15 18:00:38,751 INFO [train.py:876] (2/4) Epoch 4, batch 2200, loss[loss=0.2146, simple_loss=0.1829, pruned_loss=0.1231, over 4162.00 frames. ], tot_loss[loss=0.2026, simple_loss=0.1969, pruned_loss=0.1042, over 1087867.88 frames. ], batch size: 181, lr: 1.97e-02, grad_scale: 16.0 +2022-11-15 18:00:53,040 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=24037.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:00:53,744 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.3493, 4.7721, 3.5380, 2.1284, 4.4439, 2.0230, 4.3743, 2.7124], + device='cuda:2'), covar=tensor([0.0808, 0.0094, 0.0384, 0.1798, 0.0123, 0.1586, 0.0101, 0.1374], + device='cuda:2'), in_proj_covar=tensor([0.0126, 0.0090, 0.0095, 0.0118, 0.0095, 0.0129, 0.0082, 0.0123], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0002, 0.0004], + device='cuda:2') +2022-11-15 18:01:04,004 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.9371, 4.5740, 3.9825, 4.5923, 4.5029, 3.8416, 4.1120, 3.6973], + device='cuda:2'), covar=tensor([0.0413, 0.0284, 0.0916, 0.0278, 0.0329, 0.0310, 0.0275, 0.0480], + device='cuda:2'), in_proj_covar=tensor([0.0106, 0.0119, 0.0188, 0.0116, 0.0149, 0.0130, 0.0127, 0.0114], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 18:01:28,979 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.3015, 4.2063, 2.7520, 4.0574, 3.1878, 2.9184, 2.1773, 3.5634], + device='cuda:2'), covar=tensor([0.1490, 0.0118, 0.0969, 0.0195, 0.0419, 0.0862, 0.1813, 0.0191], + device='cuda:2'), in_proj_covar=tensor([0.0176, 0.0116, 0.0165, 0.0113, 0.0153, 0.0177, 0.0188, 0.0121], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 18:01:36,329 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=24098.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:01:45,260 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.225e+02 2.100e+02 2.713e+02 3.440e+02 7.345e+02, threshold=5.426e+02, percent-clipped=3.0 +2022-11-15 18:01:50,143 INFO [train.py:876] (2/4) Epoch 4, batch 2300, loss[loss=0.2167, simple_loss=0.2104, pruned_loss=0.1115, over 5574.00 frames. ], tot_loss[loss=0.2016, simple_loss=0.1963, pruned_loss=0.1035, over 1091340.72 frames. ], batch size: 25, lr: 1.96e-02, grad_scale: 16.0 +2022-11-15 18:02:00,060 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.96 vs. limit=2.0 +2022-11-15 18:02:02,421 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.2187, 4.0558, 2.8861, 3.9608, 3.0294, 2.8902, 2.1077, 3.4236], + device='cuda:2'), covar=tensor([0.1392, 0.0124, 0.0837, 0.0165, 0.0513, 0.0789, 0.1808, 0.0169], + device='cuda:2'), in_proj_covar=tensor([0.0177, 0.0117, 0.0165, 0.0114, 0.0154, 0.0178, 0.0190, 0.0121], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 18:02:08,830 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=24144.0, num_to_drop=1, layers_to_drop={2} +2022-11-15 18:02:20,221 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.6813, 2.4682, 2.5784, 2.3933, 2.7208, 2.4299, 2.6118, 2.6684], + device='cuda:2'), covar=tensor([0.0580, 0.0406, 0.0579, 0.0499, 0.0545, 0.0287, 0.0332, 0.0619], + device='cuda:2'), in_proj_covar=tensor([0.0093, 0.0098, 0.0077, 0.0103, 0.0104, 0.0061, 0.0085, 0.0094], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 18:02:42,465 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=24192.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:02:43,261 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.7514, 5.1310, 5.3608, 5.2549, 5.7853, 5.7726, 5.0840, 5.6678], + device='cuda:2'), covar=tensor([0.0321, 0.0205, 0.0491, 0.0220, 0.0273, 0.0065, 0.0156, 0.0187], + device='cuda:2'), in_proj_covar=tensor([0.0092, 0.0097, 0.0076, 0.0103, 0.0103, 0.0061, 0.0084, 0.0093], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 18:02:56,251 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.235e+02 1.961e+02 2.462e+02 3.190e+02 6.497e+02, threshold=4.924e+02, percent-clipped=3.0 +2022-11-15 18:02:59,802 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.3788, 3.0173, 3.2326, 2.9823, 3.4086, 3.1586, 3.1944, 3.3222], + device='cuda:2'), covar=tensor([0.0389, 0.0319, 0.0427, 0.0373, 0.0374, 0.0162, 0.0258, 0.0439], + device='cuda:2'), in_proj_covar=tensor([0.0091, 0.0097, 0.0076, 0.0102, 0.0102, 0.0061, 0.0084, 0.0093], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 18:03:00,411 INFO [train.py:876] (2/4) Epoch 4, batch 2400, loss[loss=0.1978, simple_loss=0.1826, pruned_loss=0.1065, over 5006.00 frames. ], tot_loss[loss=0.199, simple_loss=0.1939, pruned_loss=0.102, over 1082498.44 frames. ], batch size: 109, lr: 1.96e-02, grad_scale: 16.0 +2022-11-15 18:03:07,386 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=7.41 vs. limit=5.0 +2022-11-15 18:03:57,870 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.7983, 1.5436, 1.6666, 0.8729, 1.1770, 1.2648, 0.9650, 1.5424], + device='cuda:2'), covar=tensor([0.0023, 0.0013, 0.0011, 0.0021, 0.0023, 0.0014, 0.0023, 0.0013], + device='cuda:2'), in_proj_covar=tensor([0.0021, 0.0020, 0.0019, 0.0019, 0.0021, 0.0018, 0.0020, 0.0018], + device='cuda:2'), out_proj_covar=tensor([2.4838e-05, 2.3870e-05, 1.9263e-05, 1.9650e-05, 2.1305e-05, 1.7108e-05, + 2.8649e-05, 1.8832e-05], device='cuda:2') +2022-11-15 18:04:02,081 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.99 vs. limit=2.0 +2022-11-15 18:04:07,099 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.243e+02 2.091e+02 2.435e+02 3.216e+02 5.165e+02, threshold=4.869e+02, percent-clipped=2.0 +2022-11-15 18:04:11,700 INFO [train.py:876] (2/4) Epoch 4, batch 2500, loss[loss=0.2046, simple_loss=0.209, pruned_loss=0.1001, over 5790.00 frames. ], tot_loss[loss=0.1973, simple_loss=0.1931, pruned_loss=0.1007, over 1093343.37 frames. ], batch size: 20, lr: 1.96e-02, grad_scale: 16.0 +2022-11-15 18:04:34,133 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.9108, 0.5949, 0.7791, 1.1168, 1.3337, 1.3859, 1.0524, 1.2020], + device='cuda:2'), covar=tensor([0.0542, 0.0313, 0.0722, 0.0766, 0.0299, 0.0273, 0.0409, 0.0500], + device='cuda:2'), in_proj_covar=tensor([0.0008, 0.0011, 0.0009, 0.0008, 0.0009, 0.0008, 0.0009, 0.0009], + device='cuda:2'), out_proj_covar=tensor([2.9629e-05, 3.4813e-05, 3.1269e-05, 3.2038e-05, 3.2431e-05, 2.9950e-05, + 3.0537e-05, 3.1405e-05], device='cuda:2') +2022-11-15 18:05:05,216 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=24393.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:05:18,631 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.341e+02 2.064e+02 2.482e+02 3.355e+02 5.467e+02, threshold=4.964e+02, percent-clipped=1.0 +2022-11-15 18:05:22,806 INFO [train.py:876] (2/4) Epoch 4, batch 2600, loss[loss=0.1401, simple_loss=0.1585, pruned_loss=0.06088, over 5717.00 frames. ], tot_loss[loss=0.1974, simple_loss=0.1929, pruned_loss=0.101, over 1094068.19 frames. ], batch size: 12, lr: 1.95e-02, grad_scale: 16.0 +2022-11-15 18:06:06,939 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.49 vs. limit=5.0 +2022-11-15 18:06:29,496 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.062e+02 1.874e+02 2.327e+02 3.027e+02 6.734e+02, threshold=4.654e+02, percent-clipped=5.0 +2022-11-15 18:06:33,928 INFO [train.py:876] (2/4) Epoch 4, batch 2700, loss[loss=0.2298, simple_loss=0.202, pruned_loss=0.1288, over 4217.00 frames. ], tot_loss[loss=0.197, simple_loss=0.1924, pruned_loss=0.1008, over 1086800.93 frames. ], batch size: 181, lr: 1.95e-02, grad_scale: 16.0 +2022-11-15 18:06:46,133 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.08 vs. limit=2.0 +2022-11-15 18:06:55,309 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=24547.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:07:11,181 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=24569.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:07:11,529 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.96 vs. limit=2.0 +2022-11-15 18:07:38,828 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=24608.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:07:40,661 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.287e+02 2.178e+02 2.525e+02 3.048e+02 6.598e+02, threshold=5.050e+02, percent-clipped=4.0 +2022-11-15 18:07:45,022 INFO [train.py:876] (2/4) Epoch 4, batch 2800, loss[loss=0.1518, simple_loss=0.162, pruned_loss=0.07081, over 5676.00 frames. ], tot_loss[loss=0.1972, simple_loss=0.193, pruned_loss=0.1007, over 1089472.46 frames. ], batch size: 11, lr: 1.94e-02, grad_scale: 16.0 +2022-11-15 18:07:54,356 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=24630.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:08:38,939 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=24693.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:08:51,552 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.068e+02 1.858e+02 2.475e+02 3.106e+02 5.971e+02, threshold=4.951e+02, percent-clipped=4.0 +2022-11-15 18:08:55,661 INFO [train.py:876] (2/4) Epoch 4, batch 2900, loss[loss=0.1376, simple_loss=0.1562, pruned_loss=0.05948, over 5705.00 frames. ], tot_loss[loss=0.1973, simple_loss=0.1932, pruned_loss=0.1007, over 1091143.30 frames. ], batch size: 12, lr: 1.94e-02, grad_scale: 16.0 +2022-11-15 18:08:56,774 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.30 vs. limit=5.0 +2022-11-15 18:09:13,208 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=24741.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:09:14,931 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.09 vs. limit=2.0 +2022-11-15 18:09:16,468 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.51 vs. limit=2.0 +2022-11-15 18:09:30,516 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=24765.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:10:00,370 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.91 vs. limit=2.0 +2022-11-15 18:10:03,413 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.408e+02 2.004e+02 2.362e+02 2.844e+02 4.612e+02, threshold=4.725e+02, percent-clipped=0.0 +2022-11-15 18:10:04,270 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.4359, 1.1708, 1.1995, 1.7119, 0.9056, 1.1719, 1.2510, 1.6966], + device='cuda:2'), covar=tensor([0.0888, 0.1797, 0.1229, 0.0605, 0.1710, 0.1976, 0.1422, 0.0736], + device='cuda:2'), in_proj_covar=tensor([0.0042, 0.0049, 0.0058, 0.0040, 0.0055, 0.0048, 0.0055, 0.0040], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0002, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001], + device='cuda:2') +2022-11-15 18:10:07,566 INFO [train.py:876] (2/4) Epoch 4, batch 3000, loss[loss=0.183, simple_loss=0.1841, pruned_loss=0.09094, over 5569.00 frames. ], tot_loss[loss=0.1982, simple_loss=0.1931, pruned_loss=0.1016, over 1077383.46 frames. ], batch size: 21, lr: 1.94e-02, grad_scale: 16.0 +2022-11-15 18:10:07,567 INFO [train.py:899] (2/4) Computing validation loss +2022-11-15 18:10:18,363 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.7877, 3.5232, 3.4258, 3.1535, 3.7107, 3.4474, 1.3489, 3.8097], + device='cuda:2'), covar=tensor([0.0395, 0.0478, 0.0340, 0.0404, 0.0422, 0.0341, 0.3558, 0.0304], + device='cuda:2'), in_proj_covar=tensor([0.0090, 0.0068, 0.0070, 0.0060, 0.0084, 0.0064, 0.0122, 0.0091], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0001, 0.0002, 0.0001, 0.0002, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 18:10:26,938 INFO [train.py:908] (2/4) Epoch 4, validation: loss=0.1712, simple_loss=0.1916, pruned_loss=0.07544, over 1530663.00 frames. +2022-11-15 18:10:26,939 INFO [train.py:909] (2/4) Maximum memory allocated so far is 4715MB +2022-11-15 18:10:33,619 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=24826.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:10:48,981 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=24848.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:10:52,319 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.9262, 4.1586, 4.1729, 4.3181, 3.5937, 3.2003, 4.7766, 3.7781], + device='cuda:2'), covar=tensor([0.0537, 0.0784, 0.0396, 0.0724, 0.0555, 0.0330, 0.0718, 0.0464], + device='cuda:2'), in_proj_covar=tensor([0.0060, 0.0078, 0.0065, 0.0077, 0.0061, 0.0048, 0.0096, 0.0063], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0002, 0.0002, 0.0002, 0.0001, 0.0001, 0.0002, 0.0001], + device='cuda:2') +2022-11-15 18:11:04,007 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.06 vs. limit=2.0 +2022-11-15 18:11:28,021 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=24903.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:11:32,531 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=24909.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:11:34,401 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.259e+02 1.907e+02 2.538e+02 3.082e+02 5.544e+02, threshold=5.077e+02, percent-clipped=4.0 +2022-11-15 18:11:37,841 INFO [train.py:876] (2/4) Epoch 4, batch 3100, loss[loss=0.1819, simple_loss=0.1834, pruned_loss=0.09021, over 5680.00 frames. ], tot_loss[loss=0.1975, simple_loss=0.1927, pruned_loss=0.1012, over 1075472.30 frames. ], batch size: 34, lr: 1.93e-02, grad_scale: 8.0 +2022-11-15 18:11:43,320 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=24925.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:12:33,559 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=24995.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:12:47,108 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.01 vs. limit=2.0 +2022-11-15 18:12:48,706 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.164e+02 2.138e+02 2.743e+02 3.438e+02 6.298e+02, threshold=5.486e+02, percent-clipped=1.0 +2022-11-15 18:12:51,282 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.2543, 1.9183, 1.3554, 2.5048, 1.4061, 1.4404, 1.6146, 2.4689], + device='cuda:2'), covar=tensor([0.0435, 0.0986, 0.1888, 0.0763, 0.1536, 0.0956, 0.1200, 0.0505], + device='cuda:2'), in_proj_covar=tensor([0.0043, 0.0050, 0.0063, 0.0041, 0.0056, 0.0049, 0.0056, 0.0041], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0002, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001], + device='cuda:2') +2022-11-15 18:12:52,449 INFO [train.py:876] (2/4) Epoch 4, batch 3200, loss[loss=0.2036, simple_loss=0.1964, pruned_loss=0.1054, over 5686.00 frames. ], tot_loss[loss=0.1989, simple_loss=0.1941, pruned_loss=0.1019, over 1078626.47 frames. ], batch size: 28, lr: 1.93e-02, grad_scale: 8.0 +2022-11-15 18:12:52,815 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.98 vs. limit=2.0 +2022-11-15 18:12:58,742 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.1014, 4.3325, 3.9501, 3.7009, 2.7047, 4.8724, 2.7649, 4.3642], + device='cuda:2'), covar=tensor([0.0267, 0.0144, 0.0130, 0.0167, 0.0323, 0.0041, 0.0209, 0.0031], + device='cuda:2'), in_proj_covar=tensor([0.0150, 0.0097, 0.0113, 0.0119, 0.0149, 0.0111, 0.0129, 0.0094], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 18:13:20,279 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=25056.0, num_to_drop=1, layers_to_drop={3} +2022-11-15 18:13:25,012 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.1813, 4.3977, 3.9559, 3.9236, 4.2995, 3.9344, 1.7495, 4.2975], + device='cuda:2'), covar=tensor([0.0324, 0.0235, 0.0350, 0.0302, 0.0360, 0.0387, 0.2992, 0.0251], + device='cuda:2'), in_proj_covar=tensor([0.0092, 0.0069, 0.0072, 0.0060, 0.0087, 0.0067, 0.0128, 0.0094], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0001, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 18:13:29,702 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.2504, 5.7068, 5.8989, 6.0597, 5.3479, 4.4306, 6.4414, 5.3584], + device='cuda:2'), covar=tensor([0.0301, 0.0281, 0.0235, 0.0330, 0.0215, 0.0228, 0.0432, 0.0424], + device='cuda:2'), in_proj_covar=tensor([0.0060, 0.0078, 0.0066, 0.0078, 0.0060, 0.0050, 0.0097, 0.0064], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0002, 0.0002, 0.0002, 0.0001, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 18:13:46,976 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.54 vs. limit=2.0 +2022-11-15 18:14:01,367 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.332e+02 2.126e+02 2.620e+02 3.307e+02 8.462e+02, threshold=5.241e+02, percent-clipped=1.0 +2022-11-15 18:14:05,061 INFO [train.py:876] (2/4) Epoch 4, batch 3300, loss[loss=0.1802, simple_loss=0.1813, pruned_loss=0.08952, over 5457.00 frames. ], tot_loss[loss=0.2004, simple_loss=0.1953, pruned_loss=0.1027, over 1080017.82 frames. ], batch size: 11, lr: 1.93e-02, grad_scale: 8.0 +2022-11-15 18:14:08,002 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=25121.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:14:18,897 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.40 vs. limit=5.0 +2022-11-15 18:14:33,078 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.00 vs. limit=2.0 +2022-11-15 18:15:06,856 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=25203.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:15:08,286 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=25204.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:15:13,949 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.413e+02 1.803e+02 2.408e+02 3.153e+02 7.479e+02, threshold=4.816e+02, percent-clipped=4.0 +2022-11-15 18:15:17,452 INFO [train.py:876] (2/4) Epoch 4, batch 3400, loss[loss=0.2198, simple_loss=0.1987, pruned_loss=0.1204, over 4715.00 frames. ], tot_loss[loss=0.201, simple_loss=0.1951, pruned_loss=0.1035, over 1083775.37 frames. ], batch size: 135, lr: 1.92e-02, grad_scale: 8.0 +2022-11-15 18:15:23,189 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=25225.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:15:27,979 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.2794, 4.4748, 3.2192, 4.2341, 3.3165, 3.2106, 2.4012, 3.7568], + device='cuda:2'), covar=tensor([0.1446, 0.0114, 0.0666, 0.0163, 0.0514, 0.0664, 0.1689, 0.0167], + device='cuda:2'), in_proj_covar=tensor([0.0175, 0.0116, 0.0159, 0.0114, 0.0150, 0.0171, 0.0187, 0.0118], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 18:15:32,315 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.72 vs. limit=2.0 +2022-11-15 18:15:33,354 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.09 vs. limit=5.0 +2022-11-15 18:15:41,065 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=25251.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:15:56,864 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=25273.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:15:59,287 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.53 vs. limit=5.0 +2022-11-15 18:16:03,219 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.6562, 0.4167, 0.6033, 0.6373, 0.7166, 0.7765, 0.6838, 0.7689], + device='cuda:2'), covar=tensor([0.0220, 0.0293, 0.0210, 0.0271, 0.0182, 0.0110, 0.0218, 0.0265], + device='cuda:2'), in_proj_covar=tensor([0.0008, 0.0011, 0.0010, 0.0009, 0.0009, 0.0008, 0.0010, 0.0009], + device='cuda:2'), out_proj_covar=tensor([3.0428e-05, 3.7731e-05, 3.3867e-05, 3.3082e-05, 3.3573e-05, 3.1164e-05, + 3.3385e-05, 3.1789e-05], device='cuda:2') +2022-11-15 18:16:23,801 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.200e+02 1.900e+02 2.223e+02 2.867e+02 4.399e+02, threshold=4.447e+02, percent-clipped=0.0 +2022-11-15 18:16:27,921 INFO [train.py:876] (2/4) Epoch 4, batch 3500, loss[loss=0.206, simple_loss=0.1818, pruned_loss=0.1151, over 5652.00 frames. ], tot_loss[loss=0.2003, simple_loss=0.1947, pruned_loss=0.1029, over 1085539.80 frames. ], batch size: 29, lr: 1.92e-02, grad_scale: 8.0 +2022-11-15 18:16:51,509 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=25351.0, num_to_drop=1, layers_to_drop={3} +2022-11-15 18:17:01,460 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.5875, 4.6745, 4.7670, 5.0207, 4.2681, 3.6460, 5.5242, 4.6061], + device='cuda:2'), covar=tensor([0.0436, 0.0723, 0.0306, 0.0640, 0.0535, 0.0338, 0.0479, 0.0384], + device='cuda:2'), in_proj_covar=tensor([0.0062, 0.0080, 0.0068, 0.0080, 0.0061, 0.0053, 0.0098, 0.0067], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0002, 0.0002, 0.0002, 0.0001, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 18:17:04,201 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0040, 3.6849, 2.4886, 3.4825, 2.6404, 2.5757, 1.8454, 3.0631], + device='cuda:2'), covar=tensor([0.1355, 0.0133, 0.0871, 0.0186, 0.0580, 0.0839, 0.1828, 0.0185], + device='cuda:2'), in_proj_covar=tensor([0.0179, 0.0120, 0.0166, 0.0117, 0.0154, 0.0177, 0.0192, 0.0119], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 18:17:19,942 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.13 vs. limit=2.0 +2022-11-15 18:17:34,602 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.212e+02 2.249e+02 2.867e+02 3.402e+02 1.165e+03, threshold=5.733e+02, percent-clipped=11.0 +2022-11-15 18:17:38,072 INFO [train.py:876] (2/4) Epoch 4, batch 3600, loss[loss=0.2367, simple_loss=0.221, pruned_loss=0.1262, over 5317.00 frames. ], tot_loss[loss=0.1984, simple_loss=0.1933, pruned_loss=0.1017, over 1090000.59 frames. ], batch size: 79, lr: 1.91e-02, grad_scale: 8.0 +2022-11-15 18:17:41,232 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=25421.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:17:46,136 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=25428.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 18:18:15,273 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=25469.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:18:30,211 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=25489.0, num_to_drop=1, layers_to_drop={3} +2022-11-15 18:18:31,436 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.5268, 3.5212, 3.3556, 3.0995, 3.6045, 3.3884, 1.2986, 3.6061], + device='cuda:2'), covar=tensor([0.0286, 0.0282, 0.0327, 0.0460, 0.0292, 0.0323, 0.3021, 0.0285], + device='cuda:2'), in_proj_covar=tensor([0.0089, 0.0067, 0.0070, 0.0060, 0.0083, 0.0066, 0.0123, 0.0092], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0001, 0.0002, 0.0001, 0.0002, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 18:18:40,592 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=25504.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:18:45,858 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.206e+02 2.020e+02 2.444e+02 3.029e+02 5.693e+02, threshold=4.888e+02, percent-clipped=0.0 +2022-11-15 18:18:49,297 INFO [train.py:876] (2/4) Epoch 4, batch 3700, loss[loss=0.2671, simple_loss=0.2433, pruned_loss=0.1455, over 5430.00 frames. ], tot_loss[loss=0.1985, simple_loss=0.1937, pruned_loss=0.1016, over 1090390.85 frames. ], batch size: 58, lr: 1.91e-02, grad_scale: 8.0 +2022-11-15 18:19:14,619 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=25552.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:19:57,015 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.296e+02 2.082e+02 2.548e+02 3.444e+02 5.526e+02, threshold=5.097e+02, percent-clipped=3.0 +2022-11-15 18:20:00,476 INFO [train.py:876] (2/4) Epoch 4, batch 3800, loss[loss=0.1919, simple_loss=0.1893, pruned_loss=0.09727, over 4969.00 frames. ], tot_loss[loss=0.1974, simple_loss=0.1933, pruned_loss=0.1008, over 1087283.88 frames. ], batch size: 5, lr: 1.91e-02, grad_scale: 8.0 +2022-11-15 18:20:24,494 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=25651.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 18:20:31,261 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.74 vs. limit=5.0 +2022-11-15 18:20:58,561 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=25699.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:21:01,767 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.3955, 1.2270, 2.1159, 1.5560, 1.1874, 0.8066, 2.1481, 1.2662], + device='cuda:2'), covar=tensor([0.0018, 0.0109, 0.0025, 0.0020, 0.0078, 0.0074, 0.0018, 0.0025], + device='cuda:2'), in_proj_covar=tensor([0.0013, 0.0014, 0.0012, 0.0014, 0.0014, 0.0014, 0.0015, 0.0014], + device='cuda:2'), out_proj_covar=tensor([1.4399e-05, 1.6148e-05, 1.4010e-05, 1.6150e-05, 1.4056e-05, 1.6017e-05, + 1.7387e-05, 1.7660e-05], device='cuda:2') +2022-11-15 18:21:08,419 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.968e+01 1.976e+02 2.317e+02 2.935e+02 6.102e+02, threshold=4.634e+02, percent-clipped=2.0 +2022-11-15 18:21:11,942 INFO [train.py:876] (2/4) Epoch 4, batch 3900, loss[loss=0.2263, simple_loss=0.2101, pruned_loss=0.1212, over 5441.00 frames. ], tot_loss[loss=0.1969, simple_loss=0.193, pruned_loss=0.1003, over 1088762.39 frames. ], batch size: 53, lr: 1.90e-02, grad_scale: 8.0 +2022-11-15 18:21:13,842 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.05 vs. limit=2.0 +2022-11-15 18:21:59,891 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=25784.0, num_to_drop=1, layers_to_drop={2} +2022-11-15 18:22:20,040 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.927e+01 1.909e+02 2.418e+02 3.208e+02 6.074e+02, threshold=4.836e+02, percent-clipped=3.0 +2022-11-15 18:22:24,179 INFO [train.py:876] (2/4) Epoch 4, batch 4000, loss[loss=0.237, simple_loss=0.2149, pruned_loss=0.1296, over 5461.00 frames. ], tot_loss[loss=0.197, simple_loss=0.1933, pruned_loss=0.1004, over 1085612.76 frames. ], batch size: 58, lr: 1.90e-02, grad_scale: 8.0 +2022-11-15 18:22:52,065 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=25858.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 18:22:53,078 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.6213, 4.0720, 4.3182, 4.1294, 4.6195, 4.2194, 4.0873, 4.5350], + device='cuda:2'), covar=tensor([0.0259, 0.0229, 0.0432, 0.0250, 0.0302, 0.0210, 0.0195, 0.0244], + device='cuda:2'), in_proj_covar=tensor([0.0088, 0.0094, 0.0079, 0.0101, 0.0100, 0.0059, 0.0086, 0.0089], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 18:23:16,374 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.14 vs. limit=2.0 +2022-11-15 18:23:20,974 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.7579, 1.9744, 2.1018, 2.2974, 0.7820, 2.5403, 2.0750, 1.8649], + device='cuda:2'), covar=tensor([0.0363, 0.0243, 0.0244, 0.0448, 0.1678, 0.0385, 0.0763, 0.0537], + device='cuda:2'), in_proj_covar=tensor([0.0034, 0.0031, 0.0033, 0.0035, 0.0031, 0.0027, 0.0028, 0.0034], + device='cuda:2'), out_proj_covar=tensor([5.8798e-05, 4.9045e-05, 5.0785e-05, 6.6557e-05, 5.5610e-05, 5.0895e-05, + 4.9981e-05, 5.7858e-05], device='cuda:2') +2022-11-15 18:23:30,895 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.220e+02 1.955e+02 2.286e+02 3.069e+02 5.405e+02, threshold=4.573e+02, percent-clipped=2.0 +2022-11-15 18:23:35,102 INFO [train.py:876] (2/4) Epoch 4, batch 4100, loss[loss=0.09573, simple_loss=0.1127, pruned_loss=0.03937, over 4028.00 frames. ], tot_loss[loss=0.195, simple_loss=0.1915, pruned_loss=0.09922, over 1085157.96 frames. ], batch size: 4, lr: 1.90e-02, grad_scale: 8.0 +2022-11-15 18:23:37,052 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=25919.0, num_to_drop=1, layers_to_drop={2} +2022-11-15 18:23:37,686 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.8898, 4.4316, 3.8311, 4.3921, 4.4586, 3.7999, 3.9037, 3.6660], + device='cuda:2'), covar=tensor([0.0307, 0.0287, 0.1104, 0.0375, 0.0344, 0.0327, 0.0401, 0.0435], + device='cuda:2'), in_proj_covar=tensor([0.0102, 0.0116, 0.0186, 0.0118, 0.0148, 0.0131, 0.0123, 0.0109], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 18:23:53,366 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.1835, 1.5551, 1.7543, 1.3167, 0.5170, 1.7569, 1.4943, 1.2815], + device='cuda:2'), covar=tensor([0.0273, 0.0208, 0.0241, 0.0446, 0.0650, 0.0197, 0.0648, 0.0507], + device='cuda:2'), in_proj_covar=tensor([0.0034, 0.0032, 0.0033, 0.0034, 0.0030, 0.0027, 0.0028, 0.0033], + device='cuda:2'), out_proj_covar=tensor([5.7859e-05, 4.9521e-05, 5.0027e-05, 6.5185e-05, 5.4537e-05, 4.9822e-05, + 4.9269e-05, 5.6691e-05], device='cuda:2') +2022-11-15 18:24:42,054 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.193e+02 2.057e+02 2.578e+02 3.231e+02 5.725e+02, threshold=5.157e+02, percent-clipped=4.0 +2022-11-15 18:24:45,827 INFO [train.py:876] (2/4) Epoch 4, batch 4200, loss[loss=0.1718, simple_loss=0.1928, pruned_loss=0.07544, over 5580.00 frames. ], tot_loss[loss=0.1988, simple_loss=0.1941, pruned_loss=0.1018, over 1079127.86 frames. ], batch size: 22, lr: 1.89e-02, grad_scale: 8.0 +2022-11-15 18:25:24,907 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.8794, 0.8509, 0.8084, 0.6774, 1.1026, 1.0129, 1.0672, 1.1252], + device='cuda:2'), covar=tensor([0.0359, 0.0351, 0.0522, 0.0718, 0.0452, 0.0143, 0.0449, 0.0644], + device='cuda:2'), in_proj_covar=tensor([0.0008, 0.0012, 0.0010, 0.0010, 0.0010, 0.0009, 0.0010, 0.0009], + device='cuda:2'), out_proj_covar=tensor([3.1381e-05, 4.0365e-05, 3.6137e-05, 3.7275e-05, 3.6269e-05, 3.2836e-05, + 3.5453e-05, 3.5109e-05], device='cuda:2') +2022-11-15 18:25:34,267 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=26084.0, num_to_drop=1, layers_to_drop={2} +2022-11-15 18:25:46,683 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=26102.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:25:53,376 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.334e+02 1.847e+02 2.412e+02 3.140e+02 6.094e+02, threshold=4.824e+02, percent-clipped=3.0 +2022-11-15 18:25:56,762 INFO [train.py:876] (2/4) Epoch 4, batch 4300, loss[loss=0.2077, simple_loss=0.2068, pruned_loss=0.1043, over 5681.00 frames. ], tot_loss[loss=0.1981, simple_loss=0.1936, pruned_loss=0.1013, over 1082041.47 frames. ], batch size: 36, lr: 1.89e-02, grad_scale: 8.0 +2022-11-15 18:26:08,216 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=26132.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 18:26:29,694 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=26163.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:26:33,031 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=26168.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:26:38,853 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.8039, 2.0572, 2.1600, 2.1582, 1.3205, 2.0317, 1.3267, 1.0432], + device='cuda:2'), covar=tensor([0.0122, 0.0031, 0.0064, 0.0051, 0.0145, 0.0059, 0.0126, 0.0078], + device='cuda:2'), in_proj_covar=tensor([0.0148, 0.0098, 0.0114, 0.0118, 0.0147, 0.0111, 0.0130, 0.0095], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 18:26:46,873 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.2879, 4.8780, 3.7218, 2.1392, 4.7088, 1.6350, 4.5490, 2.7270], + device='cuda:2'), covar=tensor([0.0860, 0.0077, 0.0347, 0.1782, 0.0087, 0.1937, 0.0100, 0.1388], + device='cuda:2'), in_proj_covar=tensor([0.0131, 0.0096, 0.0102, 0.0121, 0.0099, 0.0135, 0.0087, 0.0125], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0003, 0.0004, 0.0003, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-11-15 18:26:48,957 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.1007, 4.6549, 4.0143, 4.6228, 4.5754, 3.7084, 4.0752, 3.8528], + device='cuda:2'), covar=tensor([0.0343, 0.0354, 0.1255, 0.0275, 0.0293, 0.0444, 0.0353, 0.0425], + device='cuda:2'), in_proj_covar=tensor([0.0106, 0.0120, 0.0197, 0.0120, 0.0153, 0.0132, 0.0128, 0.0116], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 18:27:03,605 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.9763, 2.1487, 3.4329, 2.8307, 4.1792, 2.2693, 3.3890, 3.8651], + device='cuda:2'), covar=tensor([0.0182, 0.0822, 0.0368, 0.0814, 0.0142, 0.0807, 0.0541, 0.0406], + device='cuda:2'), in_proj_covar=tensor([0.0155, 0.0180, 0.0158, 0.0192, 0.0146, 0.0172, 0.0213, 0.0174], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003], + device='cuda:2') +2022-11-15 18:27:04,695 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.262e+02 2.078e+02 2.562e+02 3.309e+02 6.433e+02, threshold=5.124e+02, percent-clipped=7.0 +2022-11-15 18:27:06,223 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=26214.0, num_to_drop=1, layers_to_drop={3} +2022-11-15 18:27:08,148 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.68 vs. limit=2.0 +2022-11-15 18:27:08,254 INFO [train.py:876] (2/4) Epoch 4, batch 4400, loss[loss=0.2591, simple_loss=0.2217, pruned_loss=0.1482, over 5625.00 frames. ], tot_loss[loss=0.195, simple_loss=0.1914, pruned_loss=0.09929, over 1082893.55 frames. ], batch size: 32, lr: 1.89e-02, grad_scale: 8.0 +2022-11-15 18:27:16,430 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=26229.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:27:32,220 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=26250.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:27:35,077 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.3523, 3.7753, 3.2323, 3.3580, 2.2750, 3.8481, 2.1667, 2.8764], + device='cuda:2'), covar=tensor([0.0274, 0.0077, 0.0107, 0.0156, 0.0273, 0.0059, 0.0226, 0.0112], + device='cuda:2'), in_proj_covar=tensor([0.0146, 0.0096, 0.0110, 0.0116, 0.0144, 0.0109, 0.0127, 0.0094], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 18:27:47,068 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.4940, 4.7080, 5.2355, 4.8278, 5.4983, 5.2775, 4.4966, 5.4206], + device='cuda:2'), covar=tensor([0.0237, 0.0225, 0.0329, 0.0229, 0.0251, 0.0079, 0.0245, 0.0216], + device='cuda:2'), in_proj_covar=tensor([0.0090, 0.0098, 0.0078, 0.0104, 0.0102, 0.0062, 0.0088, 0.0092], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 18:28:07,281 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.0916, 4.7577, 4.1336, 4.7204, 4.6737, 3.8074, 4.3276, 4.1039], + device='cuda:2'), covar=tensor([0.0311, 0.0280, 0.1145, 0.0292, 0.0272, 0.0472, 0.0374, 0.0487], + device='cuda:2'), in_proj_covar=tensor([0.0105, 0.0121, 0.0195, 0.0120, 0.0152, 0.0130, 0.0127, 0.0115], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 18:28:15,304 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=26311.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:28:15,740 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.392e+02 2.026e+02 2.337e+02 2.838e+02 7.809e+02, threshold=4.674e+02, percent-clipped=2.0 +2022-11-15 18:28:19,244 INFO [train.py:876] (2/4) Epoch 4, batch 4500, loss[loss=0.2021, simple_loss=0.1919, pruned_loss=0.1061, over 5735.00 frames. ], tot_loss[loss=0.195, simple_loss=0.1914, pruned_loss=0.09932, over 1081951.89 frames. ], batch size: 11, lr: 1.88e-02, grad_scale: 8.0 +2022-11-15 18:28:32,707 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.4265, 3.5903, 3.2608, 3.3049, 2.0859, 3.4845, 2.2005, 3.1712], + device='cuda:2'), covar=tensor([0.0242, 0.0077, 0.0093, 0.0162, 0.0273, 0.0074, 0.0208, 0.0052], + device='cuda:2'), in_proj_covar=tensor([0.0148, 0.0098, 0.0112, 0.0120, 0.0147, 0.0111, 0.0131, 0.0096], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 18:29:27,468 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.298e+02 1.949e+02 2.521e+02 3.180e+02 6.694e+02, threshold=5.042e+02, percent-clipped=2.0 +2022-11-15 18:29:29,007 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=26414.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:29:31,201 INFO [train.py:876] (2/4) Epoch 4, batch 4600, loss[loss=0.1576, simple_loss=0.1542, pruned_loss=0.08045, over 4994.00 frames. ], tot_loss[loss=0.1932, simple_loss=0.1906, pruned_loss=0.0979, over 1088311.00 frames. ], batch size: 7, lr: 1.88e-02, grad_scale: 8.0 +2022-11-15 18:29:33,604 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.30 vs. limit=5.0 +2022-11-15 18:29:41,597 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.1154, 1.2021, 1.1486, 0.9562, 1.3145, 0.9735, 1.1990, 1.0180], + device='cuda:2'), covar=tensor([0.0337, 0.0212, 0.0196, 0.0858, 0.0328, 0.0125, 0.0430, 0.0225], + device='cuda:2'), in_proj_covar=tensor([0.0008, 0.0011, 0.0009, 0.0010, 0.0009, 0.0008, 0.0010, 0.0009], + device='cuda:2'), out_proj_covar=tensor([3.0766e-05, 3.8230e-05, 3.4309e-05, 3.7161e-05, 3.3552e-05, 3.0745e-05, + 3.3865e-05, 3.2715e-05], device='cuda:2') +2022-11-15 18:29:42,285 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.6648, 1.8906, 2.7576, 3.1729, 3.6741, 2.6077, 1.8942, 3.9028], + device='cuda:2'), covar=tensor([0.0202, 0.4799, 0.2599, 0.3405, 0.0780, 0.3197, 0.3079, 0.0178], + device='cuda:2'), in_proj_covar=tensor([0.0149, 0.0213, 0.0221, 0.0286, 0.0201, 0.0224, 0.0198, 0.0155], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-15 18:29:53,876 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.9774, 1.3395, 2.0039, 1.7950, 1.9306, 1.2841, 1.6068, 2.0037], + device='cuda:2'), covar=tensor([0.0061, 0.0239, 0.0050, 0.0079, 0.0070, 0.0316, 0.0143, 0.0066], + device='cuda:2'), in_proj_covar=tensor([0.0155, 0.0182, 0.0161, 0.0193, 0.0147, 0.0172, 0.0212, 0.0176], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003], + device='cuda:2') +2022-11-15 18:29:59,368 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=26458.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:30:02,532 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.0778, 2.2883, 3.5435, 3.7089, 4.3434, 3.2828, 2.5064, 4.1845], + device='cuda:2'), covar=tensor([0.0158, 0.4362, 0.2101, 0.4377, 0.0429, 0.3070, 0.2422, 0.0214], + device='cuda:2'), in_proj_covar=tensor([0.0151, 0.0213, 0.0221, 0.0286, 0.0201, 0.0226, 0.0198, 0.0155], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-15 18:30:06,455 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.97 vs. limit=2.0 +2022-11-15 18:30:12,144 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=26475.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:30:14,103 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.8529, 4.3205, 3.6913, 4.3110, 4.3061, 3.5959, 3.8545, 3.4432], + device='cuda:2'), covar=tensor([0.0427, 0.0336, 0.1240, 0.0353, 0.0421, 0.0345, 0.0360, 0.0673], + device='cuda:2'), in_proj_covar=tensor([0.0108, 0.0126, 0.0201, 0.0127, 0.0157, 0.0133, 0.0131, 0.0120], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 18:30:31,833 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.1495, 4.7862, 3.5016, 4.4845, 4.5773, 4.3396, 4.7872, 3.9809], + device='cuda:2'), covar=tensor([0.0378, 0.0508, 0.2460, 0.0783, 0.0670, 0.0464, 0.0331, 0.0889], + device='cuda:2'), in_proj_covar=tensor([0.0109, 0.0127, 0.0200, 0.0127, 0.0157, 0.0134, 0.0131, 0.0119], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 18:30:37,113 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.423e+02 1.888e+02 2.493e+02 3.125e+02 5.400e+02, threshold=4.987e+02, percent-clipped=2.0 +2022-11-15 18:30:38,689 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=26514.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 18:30:40,594 INFO [train.py:876] (2/4) Epoch 4, batch 4700, loss[loss=0.2042, simple_loss=0.2015, pruned_loss=0.1035, over 5575.00 frames. ], tot_loss[loss=0.1926, simple_loss=0.1903, pruned_loss=0.09741, over 1085020.57 frames. ], batch size: 43, lr: 1.88e-02, grad_scale: 8.0 +2022-11-15 18:30:46,192 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=26524.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:31:12,679 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=26562.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 18:31:43,662 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=26606.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:31:47,746 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.209e+02 2.139e+02 2.674e+02 3.379e+02 6.556e+02, threshold=5.348e+02, percent-clipped=3.0 +2022-11-15 18:31:51,244 INFO [train.py:876] (2/4) Epoch 4, batch 4800, loss[loss=0.1813, simple_loss=0.1783, pruned_loss=0.09217, over 5553.00 frames. ], tot_loss[loss=0.1948, simple_loss=0.1913, pruned_loss=0.09918, over 1078454.51 frames. ], batch size: 13, lr: 1.87e-02, grad_scale: 8.0 +2022-11-15 18:32:01,978 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.0267, 2.4882, 3.1649, 3.9092, 4.2551, 3.2532, 2.5607, 4.0388], + device='cuda:2'), covar=tensor([0.0204, 0.3082, 0.2086, 0.2006, 0.0510, 0.2292, 0.2314, 0.0209], + device='cuda:2'), in_proj_covar=tensor([0.0149, 0.0210, 0.0218, 0.0282, 0.0202, 0.0223, 0.0199, 0.0157], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-15 18:32:18,152 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.14 vs. limit=2.0 +2022-11-15 18:32:24,047 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.5639, 1.5920, 2.4010, 1.3189, 1.3238, 2.1273, 1.2628, 1.6411], + device='cuda:2'), covar=tensor([0.0317, 0.0356, 0.0250, 0.0611, 0.1344, 0.0683, 0.0817, 0.0516], + device='cuda:2'), in_proj_covar=tensor([0.0036, 0.0034, 0.0034, 0.0037, 0.0033, 0.0030, 0.0030, 0.0036], + device='cuda:2'), out_proj_covar=tensor([6.2524e-05, 5.3142e-05, 5.3168e-05, 7.0824e-05, 6.0433e-05, 5.4769e-05, + 5.2608e-05, 6.1565e-05], device='cuda:2') +2022-11-15 18:32:26,961 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.00 vs. limit=2.0 +2022-11-15 18:32:58,142 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.132e+02 1.842e+02 2.215e+02 2.988e+02 4.976e+02, threshold=4.429e+02, percent-clipped=0.0 +2022-11-15 18:33:01,598 INFO [train.py:876] (2/4) Epoch 4, batch 4900, loss[loss=0.1804, simple_loss=0.1886, pruned_loss=0.08607, over 5558.00 frames. ], tot_loss[loss=0.1936, simple_loss=0.1907, pruned_loss=0.09822, over 1080295.66 frames. ], batch size: 16, lr: 1.87e-02, grad_scale: 8.0 +2022-11-15 18:33:12,769 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0343, 2.7607, 2.7165, 2.5547, 1.6765, 2.8007, 1.8192, 1.8883], + device='cuda:2'), covar=tensor([0.0163, 0.0061, 0.0068, 0.0110, 0.0216, 0.0063, 0.0166, 0.0078], + device='cuda:2'), in_proj_covar=tensor([0.0149, 0.0099, 0.0112, 0.0121, 0.0148, 0.0111, 0.0132, 0.0097], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 18:33:21,469 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.4160, 5.1553, 4.5571, 5.2044, 5.2855, 4.3298, 4.6055, 4.2113], + device='cuda:2'), covar=tensor([0.0202, 0.0343, 0.0927, 0.0333, 0.0276, 0.0335, 0.0408, 0.0422], + device='cuda:2'), in_proj_covar=tensor([0.0107, 0.0124, 0.0197, 0.0125, 0.0153, 0.0131, 0.0131, 0.0118], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 18:33:21,771 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.96 vs. limit=5.0 +2022-11-15 18:33:21,891 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.72 vs. limit=2.0 +2022-11-15 18:33:31,043 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=26758.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:33:39,433 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=26770.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:34:05,206 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=26806.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:34:09,163 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.286e+02 1.967e+02 2.433e+02 3.040e+02 5.263e+02, threshold=4.865e+02, percent-clipped=3.0 +2022-11-15 18:34:12,162 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.3759, 1.7576, 2.1439, 1.2473, 1.0530, 1.9626, 1.2031, 1.6258], + device='cuda:2'), covar=tensor([0.0324, 0.0522, 0.0299, 0.0656, 0.3427, 0.1714, 0.1245, 0.0697], + device='cuda:2'), in_proj_covar=tensor([0.0036, 0.0035, 0.0035, 0.0039, 0.0035, 0.0032, 0.0031, 0.0038], + device='cuda:2'), out_proj_covar=tensor([6.3026e-05, 5.5199e-05, 5.5469e-05, 7.4542e-05, 6.4178e-05, 5.8195e-05, + 5.5083e-05, 6.5983e-05], device='cuda:2') +2022-11-15 18:34:12,714 INFO [train.py:876] (2/4) Epoch 4, batch 5000, loss[loss=0.1589, simple_loss=0.1722, pruned_loss=0.0728, over 5426.00 frames. ], tot_loss[loss=0.1951, simple_loss=0.1916, pruned_loss=0.09929, over 1078144.61 frames. ], batch size: 11, lr: 1.87e-02, grad_scale: 8.0 +2022-11-15 18:34:17,625 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=26824.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:34:51,697 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=26872.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:35:14,881 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=26904.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 18:35:16,173 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=26906.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:35:20,044 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.243e+02 1.925e+02 2.492e+02 3.201e+02 4.823e+02, threshold=4.984e+02, percent-clipped=0.0 +2022-11-15 18:35:21,838 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.3642, 4.2598, 4.3215, 4.4479, 3.7586, 3.5921, 4.9328, 4.2249], + device='cuda:2'), covar=tensor([0.0381, 0.0762, 0.0443, 0.0694, 0.0564, 0.0350, 0.0615, 0.0467], + device='cuda:2'), in_proj_covar=tensor([0.0060, 0.0079, 0.0066, 0.0080, 0.0062, 0.0053, 0.0101, 0.0066], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0002, 0.0002, 0.0002, 0.0001, 0.0001, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 18:35:23,732 INFO [train.py:876] (2/4) Epoch 4, batch 5100, loss[loss=0.1364, simple_loss=0.156, pruned_loss=0.05844, over 5755.00 frames. ], tot_loss[loss=0.1948, simple_loss=0.1916, pruned_loss=0.09906, over 1082380.29 frames. ], batch size: 13, lr: 1.86e-02, grad_scale: 16.0 +2022-11-15 18:35:46,954 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=26950.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:35:50,396 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=26954.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:35:58,378 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=26965.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 18:36:31,264 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=27011.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:36:31,701 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.350e+02 2.232e+02 2.660e+02 3.188e+02 6.115e+02, threshold=5.320e+02, percent-clipped=2.0 +2022-11-15 18:36:35,111 INFO [train.py:876] (2/4) Epoch 4, batch 5200, loss[loss=0.1226, simple_loss=0.1426, pruned_loss=0.05128, over 4488.00 frames. ], tot_loss[loss=0.1923, simple_loss=0.1899, pruned_loss=0.09738, over 1087290.81 frames. ], batch size: 5, lr: 1.86e-02, grad_scale: 16.0 +2022-11-15 18:36:35,505 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.88 vs. limit=2.0 +2022-11-15 18:36:36,835 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.35 vs. limit=5.0 +2022-11-15 18:36:41,881 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.5927, 1.9285, 2.0251, 1.1984, 0.7438, 2.1933, 1.4153, 1.2431], + device='cuda:2'), covar=tensor([0.0384, 0.0522, 0.0363, 0.1339, 0.2430, 0.0686, 0.1508, 0.0653], + device='cuda:2'), in_proj_covar=tensor([0.0038, 0.0036, 0.0037, 0.0041, 0.0037, 0.0032, 0.0032, 0.0039], + device='cuda:2'), out_proj_covar=tensor([6.5439e-05, 5.7614e-05, 5.8641e-05, 7.7438e-05, 6.7308e-05, 5.8636e-05, + 5.5918e-05, 6.7803e-05], device='cuda:2') +2022-11-15 18:37:12,638 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=27070.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:37:21,657 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.8133, 1.8318, 1.5537, 2.2492, 1.4437, 1.5664, 1.7344, 2.2129], + device='cuda:2'), covar=tensor([0.0606, 0.1220, 0.1506, 0.0625, 0.1552, 0.1068, 0.1173, 0.1082], + device='cuda:2'), in_proj_covar=tensor([0.0047, 0.0055, 0.0067, 0.0043, 0.0062, 0.0051, 0.0061, 0.0045], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0002, 0.0001, 0.0002, 0.0001, 0.0002, 0.0001], + device='cuda:2') +2022-11-15 18:37:42,558 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.152e+02 1.845e+02 2.257e+02 3.098e+02 5.332e+02, threshold=4.514e+02, percent-clipped=1.0 +2022-11-15 18:37:45,681 INFO [train.py:876] (2/4) Epoch 4, batch 5300, loss[loss=0.1883, simple_loss=0.1842, pruned_loss=0.09614, over 5347.00 frames. ], tot_loss[loss=0.1936, simple_loss=0.191, pruned_loss=0.0981, over 1087065.02 frames. ], batch size: 9, lr: 1.86e-02, grad_scale: 8.0 +2022-11-15 18:37:46,367 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=27118.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:38:03,946 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.0451, 1.4364, 1.7789, 1.3659, 1.0547, 1.3854, 1.4928, 1.0430], + device='cuda:2'), covar=tensor([0.0014, 0.0021, 0.0027, 0.0012, 0.0030, 0.0018, 0.0015, 0.0026], + device='cuda:2'), in_proj_covar=tensor([0.0014, 0.0015, 0.0013, 0.0016, 0.0016, 0.0015, 0.0016, 0.0016], + device='cuda:2'), out_proj_covar=tensor([1.6163e-05, 1.7195e-05, 1.4999e-05, 1.7013e-05, 1.6466e-05, 1.6743e-05, + 1.7865e-05, 2.0931e-05], device='cuda:2') +2022-11-15 18:38:07,342 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=27148.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:38:50,773 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=27209.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:38:52,074 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.3151, 1.1443, 1.5702, 1.6519, 0.8256, 1.2968, 1.7070, 1.4168], + device='cuda:2'), covar=tensor([0.0015, 0.0057, 0.0049, 0.0018, 0.0051, 0.0062, 0.0017, 0.0026], + device='cuda:2'), in_proj_covar=tensor([0.0014, 0.0014, 0.0013, 0.0015, 0.0015, 0.0015, 0.0016, 0.0016], + device='cuda:2'), out_proj_covar=tensor([1.5923e-05, 1.6707e-05, 1.4417e-05, 1.6509e-05, 1.5898e-05, 1.6480e-05, + 1.7264e-05, 2.0622e-05], device='cuda:2') +2022-11-15 18:38:53,684 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.431e+02 2.106e+02 2.706e+02 3.414e+02 5.155e+02, threshold=5.411e+02, percent-clipped=7.0 +2022-11-15 18:38:56,361 INFO [train.py:876] (2/4) Epoch 4, batch 5400, loss[loss=0.2012, simple_loss=0.201, pruned_loss=0.1008, over 5080.00 frames. ], tot_loss[loss=0.1941, simple_loss=0.1917, pruned_loss=0.09822, over 1090175.58 frames. ], batch size: 91, lr: 1.85e-02, grad_scale: 8.0 +2022-11-15 18:39:26,839 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=27260.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 18:39:59,499 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=27306.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:40:04,135 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.144e+02 2.138e+02 2.582e+02 3.197e+02 8.943e+02, threshold=5.164e+02, percent-clipped=1.0 +2022-11-15 18:40:07,252 INFO [train.py:876] (2/4) Epoch 4, batch 5500, loss[loss=0.2251, simple_loss=0.2143, pruned_loss=0.118, over 5775.00 frames. ], tot_loss[loss=0.195, simple_loss=0.1921, pruned_loss=0.09897, over 1088655.01 frames. ], batch size: 26, lr: 1.85e-02, grad_scale: 8.0 +2022-11-15 18:40:43,920 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.36 vs. limit=5.0 +2022-11-15 18:40:49,612 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.92 vs. limit=2.0 +2022-11-15 18:41:14,018 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.4411, 1.6793, 1.8240, 0.7715, 1.0646, 1.6397, 1.2494, 1.3482], + device='cuda:2'), covar=tensor([0.0510, 0.0539, 0.0318, 0.1199, 0.1564, 0.3416, 0.0835, 0.0679], + device='cuda:2'), in_proj_covar=tensor([0.0038, 0.0037, 0.0037, 0.0042, 0.0035, 0.0032, 0.0031, 0.0040], + device='cuda:2'), out_proj_covar=tensor([6.6264e-05, 5.9079e-05, 5.9361e-05, 7.9944e-05, 6.4748e-05, 6.0053e-05, + 5.5117e-05, 6.8749e-05], device='cuda:2') +2022-11-15 18:41:15,203 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.296e+02 2.174e+02 2.548e+02 3.213e+02 6.484e+02, threshold=5.095e+02, percent-clipped=3.0 +2022-11-15 18:41:17,889 INFO [train.py:876] (2/4) Epoch 4, batch 5600, loss[loss=0.1925, simple_loss=0.1918, pruned_loss=0.09663, over 5752.00 frames. ], tot_loss[loss=0.1957, simple_loss=0.1929, pruned_loss=0.09922, over 1089304.35 frames. ], batch size: 31, lr: 1.85e-02, grad_scale: 8.0 +2022-11-15 18:41:34,817 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.00 vs. limit=2.0 +2022-11-15 18:42:06,966 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.6538, 1.7101, 1.7552, 1.1175, 1.2426, 1.9127, 1.5305, 1.6262], + device='cuda:2'), covar=tensor([0.0549, 0.0863, 0.0636, 0.1456, 0.1797, 1.2156, 0.0731, 0.0662], + device='cuda:2'), in_proj_covar=tensor([0.0038, 0.0036, 0.0036, 0.0042, 0.0035, 0.0033, 0.0030, 0.0040], + device='cuda:2'), out_proj_covar=tensor([6.5945e-05, 5.8473e-05, 5.9001e-05, 8.0395e-05, 6.5492e-05, 6.0075e-05, + 5.4620e-05, 6.8636e-05], device='cuda:2') +2022-11-15 18:42:17,328 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.8875, 4.3672, 3.3922, 2.1085, 3.9853, 1.5917, 3.7483, 2.2725], + device='cuda:2'), covar=tensor([0.0897, 0.0099, 0.0496, 0.1612, 0.0161, 0.1709, 0.0161, 0.1433], + device='cuda:2'), in_proj_covar=tensor([0.0125, 0.0095, 0.0099, 0.0117, 0.0096, 0.0129, 0.0084, 0.0120], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0003, 0.0004, 0.0003, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-11-15 18:42:19,416 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=27504.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:42:25,764 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.283e+02 2.178e+02 2.574e+02 3.004e+02 8.808e+02, threshold=5.148e+02, percent-clipped=2.0 +2022-11-15 18:42:28,792 INFO [train.py:876] (2/4) Epoch 4, batch 5700, loss[loss=0.2533, simple_loss=0.2416, pruned_loss=0.1325, over 5633.00 frames. ], tot_loss[loss=0.1943, simple_loss=0.1921, pruned_loss=0.09826, over 1087311.50 frames. ], batch size: 38, lr: 1.84e-02, grad_scale: 8.0 +2022-11-15 18:42:59,121 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=27560.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 18:43:32,368 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=27606.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:43:33,705 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=27608.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 18:43:37,000 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.377e+02 1.840e+02 2.310e+02 2.841e+02 5.066e+02, threshold=4.620e+02, percent-clipped=0.0 +2022-11-15 18:43:39,481 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.1531, 4.5274, 4.9016, 4.6952, 5.1752, 4.9685, 4.2393, 5.1404], + device='cuda:2'), covar=tensor([0.0324, 0.0245, 0.0429, 0.0257, 0.0280, 0.0098, 0.0249, 0.0222], + device='cuda:2'), in_proj_covar=tensor([0.0091, 0.0097, 0.0078, 0.0104, 0.0102, 0.0059, 0.0086, 0.0095], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 18:43:40,074 INFO [train.py:876] (2/4) Epoch 4, batch 5800, loss[loss=0.1801, simple_loss=0.1823, pruned_loss=0.08894, over 5713.00 frames. ], tot_loss[loss=0.1954, simple_loss=0.1923, pruned_loss=0.09923, over 1086904.02 frames. ], batch size: 17, lr: 1.84e-02, grad_scale: 8.0 +2022-11-15 18:43:43,114 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.9402, 4.3789, 3.6733, 3.8076, 2.6771, 4.5335, 2.4280, 4.0372], + device='cuda:2'), covar=tensor([0.0306, 0.0112, 0.0138, 0.0176, 0.0364, 0.0059, 0.0289, 0.0048], + device='cuda:2'), in_proj_covar=tensor([0.0152, 0.0100, 0.0118, 0.0127, 0.0151, 0.0116, 0.0135, 0.0100], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 18:44:06,336 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=27654.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:44:36,193 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.0733, 0.8098, 0.7421, 1.0592, 1.3387, 1.8485, 1.2500, 1.3459], + device='cuda:2'), covar=tensor([0.2057, 0.0362, 0.0792, 0.0837, 0.1134, 0.0387, 0.0931, 0.0704], + device='cuda:2'), in_proj_covar=tensor([0.0008, 0.0010, 0.0009, 0.0009, 0.0008, 0.0007, 0.0009, 0.0008], + device='cuda:2'), out_proj_covar=tensor([3.0602e-05, 3.7760e-05, 3.3196e-05, 3.6519e-05, 3.2437e-05, 2.9436e-05, + 3.4155e-05, 3.1466e-05], device='cuda:2') +2022-11-15 18:44:47,986 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.281e+02 1.945e+02 2.573e+02 2.996e+02 4.462e+02, threshold=5.147e+02, percent-clipped=0.0 +2022-11-15 18:44:48,883 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=27714.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 18:44:51,045 INFO [train.py:876] (2/4) Epoch 4, batch 5900, loss[loss=0.1714, simple_loss=0.1825, pruned_loss=0.08012, over 5508.00 frames. ], tot_loss[loss=0.1944, simple_loss=0.1912, pruned_loss=0.09878, over 1080887.16 frames. ], batch size: 13, lr: 1.84e-02, grad_scale: 8.0 +2022-11-15 18:45:17,837 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.8469, 0.9143, 1.1045, 0.6504, 1.1094, 1.1468, 0.6073, 1.1229], + device='cuda:2'), covar=tensor([0.0020, 0.0010, 0.0012, 0.0011, 0.0018, 0.0015, 0.0029, 0.0020], + device='cuda:2'), in_proj_covar=tensor([0.0023, 0.0022, 0.0023, 0.0024, 0.0023, 0.0021, 0.0023, 0.0022], + device='cuda:2'), out_proj_covar=tensor([2.5980e-05, 2.5914e-05, 2.1134e-05, 2.3440e-05, 2.1939e-05, 1.7106e-05, + 2.9340e-05, 2.1707e-05], device='cuda:2') +2022-11-15 18:45:28,208 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.6536, 4.0680, 3.1597, 1.8249, 3.8712, 1.3670, 3.4668, 2.0302], + device='cuda:2'), covar=tensor([0.1167, 0.0122, 0.0747, 0.1932, 0.0158, 0.1942, 0.0279, 0.1693], + device='cuda:2'), in_proj_covar=tensor([0.0127, 0.0096, 0.0101, 0.0119, 0.0097, 0.0129, 0.0086, 0.0123], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0003, 0.0004, 0.0003, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-11-15 18:45:32,359 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=27775.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 18:45:53,380 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=27804.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:45:59,274 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.246e+02 2.057e+02 2.608e+02 3.378e+02 5.938e+02, threshold=5.215e+02, percent-clipped=4.0 +2022-11-15 18:46:02,454 INFO [train.py:876] (2/4) Epoch 4, batch 6000, loss[loss=0.2248, simple_loss=0.2008, pruned_loss=0.1244, over 5482.00 frames. ], tot_loss[loss=0.1922, simple_loss=0.1903, pruned_loss=0.09703, over 1083719.69 frames. ], batch size: 58, lr: 1.83e-02, grad_scale: 8.0 +2022-11-15 18:46:02,454 INFO [train.py:899] (2/4) Computing validation loss +2022-11-15 18:46:07,931 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.3592, 4.2454, 3.4837, 3.3128, 2.3262, 3.6671, 2.1913, 3.4101], + device='cuda:2'), covar=tensor([0.0355, 0.0042, 0.0114, 0.0189, 0.0343, 0.0084, 0.0279, 0.0071], + device='cuda:2'), in_proj_covar=tensor([0.0150, 0.0099, 0.0116, 0.0124, 0.0147, 0.0114, 0.0133, 0.0097], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 18:46:08,377 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.3107, 1.4595, 1.9622, 1.2738, 1.4790, 1.5971, 1.5722, 1.0513], + device='cuda:2'), covar=tensor([0.0013, 0.0050, 0.0015, 0.0018, 0.0029, 0.0031, 0.0015, 0.0025], + device='cuda:2'), in_proj_covar=tensor([0.0014, 0.0015, 0.0013, 0.0015, 0.0015, 0.0014, 0.0016, 0.0016], + device='cuda:2'), out_proj_covar=tensor([1.5781e-05, 1.6997e-05, 1.4681e-05, 1.6463e-05, 1.5330e-05, 1.5971e-05, + 1.7218e-05, 2.0474e-05], device='cuda:2') +2022-11-15 18:46:20,381 INFO [train.py:908] (2/4) Epoch 4, validation: loss=0.1691, simple_loss=0.1898, pruned_loss=0.07419, over 1530663.00 frames. +2022-11-15 18:46:20,382 INFO [train.py:909] (2/4) Maximum memory allocated so far is 4715MB +2022-11-15 18:46:45,373 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=27852.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:47:28,450 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.161e+02 1.961e+02 2.307e+02 2.838e+02 4.570e+02, threshold=4.614e+02, percent-clipped=0.0 +2022-11-15 18:47:30,804 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.34 vs. limit=2.0 +2022-11-15 18:47:31,204 INFO [train.py:876] (2/4) Epoch 4, batch 6100, loss[loss=0.1755, simple_loss=0.1814, pruned_loss=0.08482, over 5610.00 frames. ], tot_loss[loss=0.1896, simple_loss=0.1885, pruned_loss=0.0953, over 1089450.63 frames. ], batch size: 24, lr: 1.83e-02, grad_scale: 8.0 +2022-11-15 18:47:33,730 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.99 vs. limit=2.0 +2022-11-15 18:47:39,914 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.7871, 1.3550, 2.2624, 2.2009, 1.3211, 1.3417, 1.5591, 1.6492], + device='cuda:2'), covar=tensor([0.0012, 0.0048, 0.0022, 0.0011, 0.0023, 0.0080, 0.0022, 0.0018], + device='cuda:2'), in_proj_covar=tensor([0.0014, 0.0015, 0.0013, 0.0016, 0.0015, 0.0015, 0.0017, 0.0016], + device='cuda:2'), out_proj_covar=tensor([1.6059e-05, 1.7566e-05, 1.5214e-05, 1.6536e-05, 1.5830e-05, 1.6589e-05, + 1.8128e-05, 2.0593e-05], device='cuda:2') +2022-11-15 18:48:05,080 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.49 vs. limit=2.0 +2022-11-15 18:48:39,808 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.715e+01 1.962e+02 2.542e+02 3.499e+02 5.895e+02, threshold=5.084e+02, percent-clipped=10.0 +2022-11-15 18:48:42,556 INFO [train.py:876] (2/4) Epoch 4, batch 6200, loss[loss=0.1086, simple_loss=0.1331, pruned_loss=0.04211, over 5233.00 frames. ], tot_loss[loss=0.1928, simple_loss=0.1906, pruned_loss=0.09747, over 1089630.86 frames. ], batch size: 8, lr: 1.83e-02, grad_scale: 8.0 +2022-11-15 18:49:19,320 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.3695, 2.8301, 2.9025, 1.4691, 2.6405, 3.2910, 3.2250, 3.6492], + device='cuda:2'), covar=tensor([0.1168, 0.1186, 0.0504, 0.1870, 0.0193, 0.0276, 0.0173, 0.0251], + device='cuda:2'), in_proj_covar=tensor([0.0183, 0.0175, 0.0125, 0.0184, 0.0133, 0.0129, 0.0123, 0.0144], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 18:49:19,868 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=28070.0, num_to_drop=1, layers_to_drop={3} +2022-11-15 18:49:50,342 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.357e+02 2.008e+02 2.464e+02 2.967e+02 7.434e+02, threshold=4.928e+02, percent-clipped=2.0 +2022-11-15 18:49:53,144 INFO [train.py:876] (2/4) Epoch 4, batch 6300, loss[loss=0.158, simple_loss=0.182, pruned_loss=0.06698, over 5527.00 frames. ], tot_loss[loss=0.192, simple_loss=0.1898, pruned_loss=0.09709, over 1091396.55 frames. ], batch size: 17, lr: 1.82e-02, grad_scale: 8.0 +2022-11-15 18:50:22,707 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.0242, 1.8467, 2.0579, 2.9178, 3.0829, 2.1439, 1.7846, 3.3033], + device='cuda:2'), covar=tensor([0.0403, 0.3102, 0.2710, 0.2485, 0.0805, 0.2864, 0.2320, 0.0294], + device='cuda:2'), in_proj_covar=tensor([0.0152, 0.0210, 0.0215, 0.0283, 0.0199, 0.0218, 0.0195, 0.0161], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-15 18:50:33,865 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.83 vs. limit=2.0 +2022-11-15 18:50:49,425 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.6322, 2.1534, 1.8259, 2.2602, 1.8154, 1.7589, 1.6698, 2.3001], + device='cuda:2'), covar=tensor([0.0876, 0.1252, 0.2290, 0.1509, 0.1436, 0.1183, 0.1367, 0.1067], + device='cuda:2'), in_proj_covar=tensor([0.0049, 0.0054, 0.0071, 0.0048, 0.0064, 0.0052, 0.0065, 0.0047], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002, 0.0002, 0.0001], + device='cuda:2') +2022-11-15 18:51:00,494 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.320e+02 2.107e+02 2.497e+02 3.273e+02 5.114e+02, threshold=4.994e+02, percent-clipped=2.0 +2022-11-15 18:51:04,024 INFO [train.py:876] (2/4) Epoch 4, batch 6400, loss[loss=0.1667, simple_loss=0.1703, pruned_loss=0.08157, over 5590.00 frames. ], tot_loss[loss=0.1925, simple_loss=0.1902, pruned_loss=0.0974, over 1087117.67 frames. ], batch size: 18, lr: 1.82e-02, grad_scale: 8.0 +2022-11-15 18:51:12,356 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=28229.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:51:30,766 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=28255.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 18:51:53,828 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=28287.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:51:55,862 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=28290.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:51:58,314 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.98 vs. limit=5.0 +2022-11-15 18:52:11,778 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.188e+02 2.095e+02 2.644e+02 3.190e+02 6.758e+02, threshold=5.289e+02, percent-clipped=3.0 +2022-11-15 18:52:14,074 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=28316.0, num_to_drop=1, layers_to_drop={3} +2022-11-15 18:52:14,555 INFO [train.py:876] (2/4) Epoch 4, batch 6500, loss[loss=0.1979, simple_loss=0.1968, pruned_loss=0.09954, over 5737.00 frames. ], tot_loss[loss=0.1949, simple_loss=0.1915, pruned_loss=0.09918, over 1078489.46 frames. ], batch size: 20, lr: 1.82e-02, grad_scale: 8.0 +2022-11-15 18:52:36,610 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=28348.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:52:52,109 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=28370.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 18:53:06,011 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.88 vs. limit=2.0 +2022-11-15 18:53:15,757 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.18 vs. limit=2.0 +2022-11-15 18:53:22,965 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.292e+02 1.962e+02 2.374e+02 2.918e+02 5.741e+02, threshold=4.749e+02, percent-clipped=1.0 +2022-11-15 18:53:26,221 INFO [train.py:876] (2/4) Epoch 4, batch 6600, loss[loss=0.2141, simple_loss=0.2019, pruned_loss=0.1132, over 5781.00 frames. ], tot_loss[loss=0.1925, simple_loss=0.1903, pruned_loss=0.09741, over 1073976.22 frames. ], batch size: 21, lr: 1.81e-02, grad_scale: 8.0 +2022-11-15 18:53:26,945 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=28418.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 18:53:33,164 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.4087, 0.8653, 0.9889, 0.6252, 0.9164, 0.8203, 0.7455, 1.0462], + device='cuda:2'), covar=tensor([0.0017, 0.0009, 0.0014, 0.0013, 0.0012, 0.0013, 0.0020, 0.0012], + device='cuda:2'), in_proj_covar=tensor([0.0023, 0.0022, 0.0024, 0.0024, 0.0024, 0.0022, 0.0023, 0.0022], + device='cuda:2'), out_proj_covar=tensor([2.5633e-05, 2.6108e-05, 2.1868e-05, 2.3679e-05, 2.2579e-05, 1.8257e-05, + 2.9919e-05, 2.1205e-05], device='cuda:2') +2022-11-15 18:54:06,943 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.03 vs. limit=2.0 +2022-11-15 18:54:07,196 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.8045, 3.7852, 3.6052, 3.7362, 3.7191, 3.3325, 1.3713, 3.6175], + device='cuda:2'), covar=tensor([0.0448, 0.0470, 0.0446, 0.0572, 0.0511, 0.0630, 0.3803, 0.0560], + device='cuda:2'), in_proj_covar=tensor([0.0092, 0.0070, 0.0071, 0.0058, 0.0085, 0.0068, 0.0124, 0.0093], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 18:54:14,619 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.4649, 3.9828, 3.0165, 1.7481, 3.8350, 1.5281, 3.7812, 2.1351], + device='cuda:2'), covar=tensor([0.1360, 0.0201, 0.0711, 0.2435, 0.0229, 0.2270, 0.0187, 0.1913], + device='cuda:2'), in_proj_covar=tensor([0.0128, 0.0099, 0.0101, 0.0122, 0.0101, 0.0129, 0.0086, 0.0124], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0003, 0.0004, 0.0003, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-11-15 18:54:14,672 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=28486.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:54:19,423 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=28492.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:54:33,674 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.274e+02 2.033e+02 2.580e+02 2.965e+02 5.128e+02, threshold=5.159e+02, percent-clipped=4.0 +2022-11-15 18:54:36,429 INFO [train.py:876] (2/4) Epoch 4, batch 6700, loss[loss=0.132, simple_loss=0.157, pruned_loss=0.05349, over 5207.00 frames. ], tot_loss[loss=0.1934, simple_loss=0.1906, pruned_loss=0.09814, over 1071450.66 frames. ], batch size: 8, lr: 1.81e-02, grad_scale: 8.0 +2022-11-15 18:54:39,630 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.97 vs. limit=2.0 +2022-11-15 18:54:57,422 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=28547.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:54:58,705 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=6.71 vs. limit=5.0 +2022-11-15 18:55:02,294 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=28553.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:55:17,415 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.2100, 1.2490, 1.4454, 0.6431, 1.3564, 1.1555, 0.6150, 1.6845], + device='cuda:2'), covar=tensor([0.0016, 0.0011, 0.0016, 0.0023, 0.0019, 0.0015, 0.0037, 0.0012], + device='cuda:2'), in_proj_covar=tensor([0.0022, 0.0021, 0.0023, 0.0023, 0.0023, 0.0021, 0.0022, 0.0021], + device='cuda:2'), out_proj_covar=tensor([2.4740e-05, 2.4542e-05, 2.0963e-05, 2.2664e-05, 2.1467e-05, 1.7347e-05, + 2.8256e-05, 2.0340e-05], device='cuda:2') +2022-11-15 18:55:24,587 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=28585.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:55:43,573 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=28611.0, num_to_drop=1, layers_to_drop={3} +2022-11-15 18:55:44,796 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.410e+02 2.021e+02 2.337e+02 3.022e+02 5.149e+02, threshold=4.674e+02, percent-clipped=0.0 +2022-11-15 18:55:47,617 INFO [train.py:876] (2/4) Epoch 4, batch 6800, loss[loss=0.1589, simple_loss=0.1776, pruned_loss=0.0701, over 5561.00 frames. ], tot_loss[loss=0.1917, simple_loss=0.19, pruned_loss=0.09669, over 1081020.12 frames. ], batch size: 16, lr: 1.81e-02, grad_scale: 8.0 +2022-11-15 18:56:06,106 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=28643.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:56:53,335 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.36 vs. limit=5.0 +2022-11-15 18:56:56,416 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.127e+02 2.083e+02 2.579e+02 3.189e+02 6.516e+02, threshold=5.157e+02, percent-clipped=5.0 +2022-11-15 18:56:59,139 INFO [train.py:876] (2/4) Epoch 4, batch 6900, loss[loss=0.2176, simple_loss=0.213, pruned_loss=0.1111, over 5691.00 frames. ], tot_loss[loss=0.192, simple_loss=0.1908, pruned_loss=0.09657, over 1084966.76 frames. ], batch size: 34, lr: 1.80e-02, grad_scale: 8.0 +2022-11-15 18:57:13,914 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.5197, 4.0358, 3.1273, 1.7570, 3.8039, 1.4568, 3.9216, 2.1521], + device='cuda:2'), covar=tensor([0.1261, 0.0122, 0.0486, 0.2275, 0.0169, 0.1976, 0.0131, 0.1780], + device='cuda:2'), in_proj_covar=tensor([0.0127, 0.0098, 0.0102, 0.0121, 0.0100, 0.0128, 0.0086, 0.0125], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0003, 0.0004, 0.0003, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-11-15 18:57:20,806 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.7413, 4.4358, 3.3272, 1.8257, 4.1714, 1.4651, 4.2343, 2.3172], + device='cuda:2'), covar=tensor([0.1074, 0.0084, 0.0499, 0.2012, 0.0126, 0.1851, 0.0123, 0.1554], + device='cuda:2'), in_proj_covar=tensor([0.0128, 0.0099, 0.0103, 0.0122, 0.0101, 0.0129, 0.0086, 0.0126], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0003, 0.0004, 0.0003, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-11-15 18:57:48,727 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.6819, 5.4241, 4.3467, 5.1267, 4.0172, 3.5967, 3.0843, 4.8967], + device='cuda:2'), covar=tensor([0.1310, 0.0101, 0.0373, 0.0213, 0.0291, 0.0621, 0.1269, 0.0076], + device='cuda:2'), in_proj_covar=tensor([0.0183, 0.0126, 0.0171, 0.0130, 0.0158, 0.0181, 0.0190, 0.0128], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 18:58:07,455 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.482e+01 2.092e+02 2.563e+02 3.027e+02 6.161e+02, threshold=5.126e+02, percent-clipped=1.0 +2022-11-15 18:58:10,635 INFO [train.py:876] (2/4) Epoch 4, batch 7000, loss[loss=0.128, simple_loss=0.1334, pruned_loss=0.0613, over 5113.00 frames. ], tot_loss[loss=0.1907, simple_loss=0.1897, pruned_loss=0.09581, over 1087823.71 frames. ], batch size: 7, lr: 1.80e-02, grad_scale: 8.0 +2022-11-15 18:58:28,004 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=28842.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:58:32,051 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=28848.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:58:41,450 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=28861.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:58:58,292 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=28885.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:59:16,473 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=28911.0, num_to_drop=1, layers_to_drop={2} +2022-11-15 18:59:17,662 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.222e+02 2.242e+02 2.603e+02 3.137e+02 5.359e+02, threshold=5.206e+02, percent-clipped=2.0 +2022-11-15 18:59:21,223 INFO [train.py:876] (2/4) Epoch 4, batch 7100, loss[loss=0.2419, simple_loss=0.2065, pruned_loss=0.1387, over 3060.00 frames. ], tot_loss[loss=0.191, simple_loss=0.1898, pruned_loss=0.09609, over 1083412.27 frames. ], batch size: 284, lr: 1.80e-02, grad_scale: 8.0 +2022-11-15 18:59:22,090 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.6087, 1.0408, 1.4021, 0.3040, 1.2253, 1.2674, 0.5719, 0.9820], + device='cuda:2'), covar=tensor([0.0028, 0.0018, 0.0026, 0.0042, 0.0021, 0.0021, 0.0042, 0.0031], + device='cuda:2'), in_proj_covar=tensor([0.0024, 0.0023, 0.0024, 0.0024, 0.0024, 0.0022, 0.0023, 0.0022], + device='cuda:2'), out_proj_covar=tensor([2.6803e-05, 2.6204e-05, 2.2222e-05, 2.3600e-05, 2.2248e-05, 1.8235e-05, + 2.8980e-05, 2.1004e-05], device='cuda:2') +2022-11-15 18:59:24,861 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=28922.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:59:30,264 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.97 vs. limit=2.0 +2022-11-15 18:59:32,542 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=28933.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:59:39,666 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=28943.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 18:59:50,847 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=28959.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 19:00:04,185 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.95 vs. limit=2.0 +2022-11-15 19:00:13,729 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=28991.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:00:21,856 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=29002.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:00:29,544 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.083e+02 2.015e+02 2.566e+02 3.105e+02 6.744e+02, threshold=5.133e+02, percent-clipped=2.0 +2022-11-15 19:00:32,329 INFO [train.py:876] (2/4) Epoch 4, batch 7200, loss[loss=0.2228, simple_loss=0.2044, pruned_loss=0.1205, over 5449.00 frames. ], tot_loss[loss=0.1928, simple_loss=0.1906, pruned_loss=0.09748, over 1078790.61 frames. ], batch size: 64, lr: 1.80e-02, grad_scale: 8.0 +2022-11-15 19:01:03,728 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=29063.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:02:11,320 INFO [train.py:876] (2/4) Epoch 5, batch 0, loss[loss=0.2788, simple_loss=0.2437, pruned_loss=0.1569, over 5396.00 frames. ], tot_loss[loss=0.2788, simple_loss=0.2437, pruned_loss=0.1569, over 5396.00 frames. ], batch size: 70, lr: 1.67e-02, grad_scale: 16.0 +2022-11-15 19:02:11,321 INFO [train.py:899] (2/4) Computing validation loss +2022-11-15 19:02:28,899 INFO [train.py:908] (2/4) Epoch 5, validation: loss=0.1679, simple_loss=0.1892, pruned_loss=0.07329, over 1530663.00 frames. +2022-11-15 19:02:28,899 INFO [train.py:909] (2/4) Maximum memory allocated so far is 4715MB +2022-11-15 19:02:46,021 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.011e+02 1.898e+02 2.233e+02 2.969e+02 5.666e+02, threshold=4.467e+02, percent-clipped=2.0 +2022-11-15 19:02:48,884 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.7055, 1.1162, 1.4003, 0.8116, 0.4995, 1.7802, 1.4933, 0.7599], + device='cuda:2'), covar=tensor([0.0921, 0.0582, 0.0791, 0.1151, 0.2095, 0.0335, 0.0492, 0.1050], + device='cuda:2'), in_proj_covar=tensor([0.0040, 0.0039, 0.0040, 0.0046, 0.0040, 0.0036, 0.0033, 0.0040], + device='cuda:2'), out_proj_covar=tensor([7.1577e-05, 6.5847e-05, 6.7446e-05, 8.9114e-05, 7.2453e-05, 6.7925e-05, + 5.9171e-05, 7.0212e-05], device='cuda:2') +2022-11-15 19:02:52,201 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.97 vs. limit=2.0 +2022-11-15 19:03:06,462 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=29142.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:03:10,780 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=29148.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:03:40,113 INFO [train.py:876] (2/4) Epoch 5, batch 100, loss[loss=0.162, simple_loss=0.1578, pruned_loss=0.08304, over 5110.00 frames. ], tot_loss[loss=0.1953, simple_loss=0.1926, pruned_loss=0.09902, over 433313.44 frames. ], batch size: 8, lr: 1.67e-02, grad_scale: 16.0 +2022-11-15 19:03:41,274 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=29190.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:03:45,648 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=29196.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:03:50,576 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.7261, 0.9565, 1.3297, 0.5035, 0.9359, 0.8987, 1.0976, 1.3783], + device='cuda:2'), covar=tensor([0.0032, 0.0030, 0.0026, 0.0027, 0.0026, 0.0021, 0.0033, 0.0034], + device='cuda:2'), in_proj_covar=tensor([0.0025, 0.0024, 0.0024, 0.0025, 0.0025, 0.0023, 0.0024, 0.0022], + device='cuda:2'), out_proj_covar=tensor([2.7826e-05, 2.7212e-05, 2.2446e-05, 2.3895e-05, 2.3109e-05, 1.9427e-05, + 3.0091e-05, 2.1447e-05], device='cuda:2') +2022-11-15 19:03:51,611 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.45 vs. limit=2.0 +2022-11-15 19:03:57,639 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.04 vs. limit=5.0 +2022-11-15 19:03:58,341 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=6.10 vs. limit=5.0 +2022-11-15 19:03:58,637 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.157e+02 1.864e+02 2.257e+02 2.875e+02 4.325e+02, threshold=4.514e+02, percent-clipped=0.0 +2022-11-15 19:04:01,603 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=29217.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:04:01,912 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.80 vs. limit=5.0 +2022-11-15 19:04:04,559 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.01 vs. limit=2.0 +2022-11-15 19:04:16,080 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=29236.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:04:36,331 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.7109, 0.4189, 0.4985, 0.4721, 0.6993, 0.6469, 0.4509, 0.5876], + device='cuda:2'), covar=tensor([0.0218, 0.0359, 0.0223, 0.0262, 0.0194, 0.0176, 0.0418, 0.0232], + device='cuda:2'), in_proj_covar=tensor([0.0009, 0.0011, 0.0010, 0.0010, 0.0009, 0.0008, 0.0011, 0.0009], + device='cuda:2'), out_proj_covar=tensor([3.5954e-05, 4.2130e-05, 3.9091e-05, 4.1057e-05, 3.5786e-05, 3.2311e-05, + 4.0197e-05, 3.6092e-05], device='cuda:2') +2022-11-15 19:04:44,979 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.10 vs. limit=5.0 +2022-11-15 19:04:54,403 INFO [train.py:876] (2/4) Epoch 5, batch 200, loss[loss=0.1838, simple_loss=0.1956, pruned_loss=0.08599, over 5786.00 frames. ], tot_loss[loss=0.1891, simple_loss=0.1878, pruned_loss=0.09518, over 688409.28 frames. ], batch size: 21, lr: 1.66e-02, grad_scale: 16.0 +2022-11-15 19:05:00,116 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=29297.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:05:02,073 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.8522, 4.3883, 3.7105, 4.3612, 4.2980, 3.6199, 3.7842, 3.4538], + device='cuda:2'), covar=tensor([0.0435, 0.0329, 0.1285, 0.0306, 0.0412, 0.0437, 0.0522, 0.0629], + device='cuda:2'), in_proj_covar=tensor([0.0113, 0.0127, 0.0206, 0.0133, 0.0158, 0.0132, 0.0135, 0.0121], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 19:05:05,318 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.0900, 3.9632, 4.1235, 4.1395, 4.0439, 3.7425, 1.5854, 3.9297], + device='cuda:2'), covar=tensor([0.0396, 0.0662, 0.0383, 0.0332, 0.0570, 0.0609, 0.4050, 0.0627], + device='cuda:2'), in_proj_covar=tensor([0.0092, 0.0069, 0.0069, 0.0059, 0.0085, 0.0070, 0.0126, 0.0093], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 19:05:11,281 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.075e+02 1.852e+02 2.311e+02 2.896e+02 4.298e+02, threshold=4.622e+02, percent-clipped=0.0 +2022-11-15 19:05:43,284 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=29358.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:05:52,833 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.8340, 2.0728, 3.0992, 3.7559, 4.1777, 3.1391, 2.3623, 4.1684], + device='cuda:2'), covar=tensor([0.0226, 0.3281, 0.2049, 0.3505, 0.0486, 0.2653, 0.2371, 0.0256], + device='cuda:2'), in_proj_covar=tensor([0.0162, 0.0217, 0.0216, 0.0298, 0.0206, 0.0225, 0.0200, 0.0167], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-15 19:06:00,979 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.00 vs. limit=5.0 +2022-11-15 19:06:04,993 INFO [train.py:876] (2/4) Epoch 5, batch 300, loss[loss=0.1715, simple_loss=0.1803, pruned_loss=0.08139, over 5735.00 frames. ], tot_loss[loss=0.19, simple_loss=0.1885, pruned_loss=0.09579, over 845596.86 frames. ], batch size: 15, lr: 1.66e-02, grad_scale: 8.0 +2022-11-15 19:06:07,944 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.4493, 2.9995, 3.4681, 1.2171, 2.9881, 3.5715, 3.2371, 3.7439], + device='cuda:2'), covar=tensor([0.1492, 0.1380, 0.0408, 0.2404, 0.0273, 0.0242, 0.0298, 0.0277], + device='cuda:2'), in_proj_covar=tensor([0.0192, 0.0196, 0.0136, 0.0198, 0.0145, 0.0140, 0.0132, 0.0165], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-15 19:06:22,461 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.196e+02 1.934e+02 2.477e+02 3.130e+02 8.486e+02, threshold=4.954e+02, percent-clipped=7.0 +2022-11-15 19:06:26,175 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.96 vs. limit=2.0 +2022-11-15 19:06:26,510 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.5932, 3.6377, 3.3916, 3.1764, 2.1729, 3.6900, 2.1951, 3.3049], + device='cuda:2'), covar=tensor([0.0287, 0.0193, 0.0116, 0.0226, 0.0334, 0.0077, 0.0273, 0.0064], + device='cuda:2'), in_proj_covar=tensor([0.0160, 0.0109, 0.0124, 0.0135, 0.0154, 0.0122, 0.0142, 0.0105], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 19:06:48,834 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.2563, 1.6889, 1.7964, 1.7704, 2.1894, 1.6667, 1.3208, 2.3114], + device='cuda:2'), covar=tensor([0.0445, 0.1155, 0.1029, 0.0491, 0.0452, 0.1268, 0.1430, 0.0388], + device='cuda:2'), in_proj_covar=tensor([0.0161, 0.0214, 0.0215, 0.0299, 0.0204, 0.0226, 0.0199, 0.0164], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-15 19:06:53,901 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.46 vs. limit=2.0 +2022-11-15 19:07:04,645 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.8505, 3.8857, 3.8889, 1.5700, 3.4519, 3.6548, 3.5091, 4.2452], + device='cuda:2'), covar=tensor([0.1174, 0.0762, 0.0327, 0.2132, 0.0180, 0.0280, 0.0304, 0.0206], + device='cuda:2'), in_proj_covar=tensor([0.0191, 0.0193, 0.0137, 0.0196, 0.0144, 0.0140, 0.0130, 0.0162], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-15 19:07:15,095 INFO [train.py:876] (2/4) Epoch 5, batch 400, loss[loss=0.1834, simple_loss=0.1904, pruned_loss=0.08824, over 5729.00 frames. ], tot_loss[loss=0.1864, simple_loss=0.1866, pruned_loss=0.09307, over 943521.75 frames. ], batch size: 31, lr: 1.66e-02, grad_scale: 8.0 +2022-11-15 19:07:18,760 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.96 vs. limit=2.0 +2022-11-15 19:07:25,280 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.4816, 4.4272, 2.9021, 4.1973, 3.3216, 2.8004, 2.4523, 3.7259], + device='cuda:2'), covar=tensor([0.1460, 0.0139, 0.0846, 0.0225, 0.0459, 0.1011, 0.1710, 0.0180], + device='cuda:2'), in_proj_covar=tensor([0.0180, 0.0122, 0.0166, 0.0124, 0.0157, 0.0180, 0.0188, 0.0127], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 19:07:32,201 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.241e+02 1.992e+02 2.391e+02 2.865e+02 5.865e+02, threshold=4.782e+02, percent-clipped=1.0 +2022-11-15 19:07:34,761 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=29517.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:07:34,772 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.4629, 1.3686, 1.8609, 1.1440, 0.4953, 1.9106, 1.7879, 1.2475], + device='cuda:2'), covar=tensor([0.0505, 0.0762, 0.0364, 0.1151, 0.1083, 0.1372, 0.0800, 0.0691], + device='cuda:2'), in_proj_covar=tensor([0.0041, 0.0039, 0.0039, 0.0045, 0.0039, 0.0036, 0.0033, 0.0040], + device='cuda:2'), out_proj_covar=tensor([7.2768e-05, 6.6149e-05, 6.5957e-05, 8.8527e-05, 7.1427e-05, 6.8695e-05, + 6.0762e-05, 7.1055e-05], device='cuda:2') +2022-11-15 19:07:58,378 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=29551.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:08:06,307 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.98 vs. limit=2.0 +2022-11-15 19:08:07,906 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=29565.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:08:25,425 INFO [train.py:876] (2/4) Epoch 5, batch 500, loss[loss=0.2323, simple_loss=0.2296, pruned_loss=0.1175, over 5707.00 frames. ], tot_loss[loss=0.186, simple_loss=0.1866, pruned_loss=0.09271, over 1002566.00 frames. ], batch size: 28, lr: 1.66e-02, grad_scale: 8.0 +2022-11-15 19:08:27,521 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=29592.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:08:41,570 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=29612.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:08:42,743 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.299e+01 1.690e+02 2.176e+02 2.799e+02 4.262e+02, threshold=4.352e+02, percent-clipped=0.0 +2022-11-15 19:09:14,443 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=29658.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:09:17,201 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=29662.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:09:30,869 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.92 vs. limit=5.0 +2022-11-15 19:09:36,292 INFO [train.py:876] (2/4) Epoch 5, batch 600, loss[loss=0.2182, simple_loss=0.206, pruned_loss=0.1152, over 5461.00 frames. ], tot_loss[loss=0.1885, simple_loss=0.1884, pruned_loss=0.09428, over 1033897.56 frames. ], batch size: 58, lr: 1.65e-02, grad_scale: 8.0 +2022-11-15 19:09:48,477 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=29706.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:09:51,267 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.6039, 4.2715, 3.3772, 3.3550, 2.3248, 4.0105, 2.0613, 3.5670], + device='cuda:2'), covar=tensor([0.0327, 0.0088, 0.0162, 0.0223, 0.0361, 0.0101, 0.0337, 0.0064], + device='cuda:2'), in_proj_covar=tensor([0.0159, 0.0107, 0.0123, 0.0135, 0.0152, 0.0122, 0.0141, 0.0104], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 19:09:53,666 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.408e+01 2.026e+02 2.597e+02 3.156e+02 6.029e+02, threshold=5.193e+02, percent-clipped=5.0 +2022-11-15 19:09:56,248 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.91 vs. limit=2.0 +2022-11-15 19:09:59,880 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=29723.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 19:10:27,475 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.3920, 3.7592, 2.8766, 1.8919, 3.5061, 1.2480, 3.4519, 1.8702], + device='cuda:2'), covar=tensor([0.1270, 0.0157, 0.0833, 0.1994, 0.0233, 0.2260, 0.0233, 0.1964], + device='cuda:2'), in_proj_covar=tensor([0.0128, 0.0098, 0.0106, 0.0121, 0.0098, 0.0130, 0.0087, 0.0123], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0003, 0.0004, 0.0003, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-11-15 19:10:46,594 INFO [train.py:876] (2/4) Epoch 5, batch 700, loss[loss=0.1714, simple_loss=0.1744, pruned_loss=0.08426, over 5443.00 frames. ], tot_loss[loss=0.1875, simple_loss=0.188, pruned_loss=0.09344, over 1058334.35 frames. ], batch size: 10, lr: 1.65e-02, grad_scale: 8.0 +2022-11-15 19:10:56,332 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=29802.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:11:04,299 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.261e+02 2.007e+02 2.393e+02 2.831e+02 4.977e+02, threshold=4.787e+02, percent-clipped=0.0 +2022-11-15 19:11:39,246 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=29863.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:11:43,991 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.72 vs. limit=5.0 +2022-11-15 19:11:57,077 INFO [train.py:876] (2/4) Epoch 5, batch 800, loss[loss=0.2127, simple_loss=0.2082, pruned_loss=0.1086, over 5653.00 frames. ], tot_loss[loss=0.1856, simple_loss=0.1867, pruned_loss=0.09228, over 1072555.96 frames. ], batch size: 29, lr: 1.65e-02, grad_scale: 8.0 +2022-11-15 19:11:59,151 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=29892.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:12:10,346 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=29907.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:12:15,044 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.261e+02 1.917e+02 2.325e+02 2.760e+02 6.321e+02, threshold=4.650e+02, percent-clipped=1.0 +2022-11-15 19:12:17,303 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.6364, 4.1011, 3.2211, 1.9909, 3.9912, 1.4332, 3.9686, 2.3093], + device='cuda:2'), covar=tensor([0.1189, 0.0144, 0.0632, 0.1941, 0.0128, 0.2089, 0.0138, 0.1653], + device='cuda:2'), in_proj_covar=tensor([0.0130, 0.0098, 0.0107, 0.0122, 0.0098, 0.0132, 0.0088, 0.0125], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0003, 0.0004, 0.0003, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-11-15 19:12:30,325 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.99 vs. limit=2.0 +2022-11-15 19:12:33,174 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=29940.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:12:40,133 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.4207, 2.3230, 1.8429, 2.7162, 1.7196, 2.2233, 2.2326, 2.5437], + device='cuda:2'), covar=tensor([0.0716, 0.1815, 0.3111, 0.0932, 0.2370, 0.1342, 0.1495, 0.3893], + device='cuda:2'), in_proj_covar=tensor([0.0048, 0.0058, 0.0072, 0.0047, 0.0062, 0.0053, 0.0066, 0.0045], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002, 0.0002, 0.0001], + device='cuda:2') +2022-11-15 19:13:07,969 INFO [train.py:876] (2/4) Epoch 5, batch 900, loss[loss=0.2234, simple_loss=0.1921, pruned_loss=0.1274, over 4130.00 frames. ], tot_loss[loss=0.185, simple_loss=0.1864, pruned_loss=0.09177, over 1077368.04 frames. ], batch size: 181, lr: 1.65e-02, grad_scale: 8.0 +2022-11-15 19:13:20,640 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.3817, 4.3062, 3.3655, 3.2246, 2.2220, 4.1390, 2.0936, 3.6045], + device='cuda:2'), covar=tensor([0.0395, 0.0114, 0.0151, 0.0331, 0.0427, 0.0079, 0.0370, 0.0073], + device='cuda:2'), in_proj_covar=tensor([0.0159, 0.0109, 0.0125, 0.0138, 0.0154, 0.0121, 0.0141, 0.0107], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 19:13:29,502 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.538e+01 1.888e+02 2.305e+02 2.835e+02 5.650e+02, threshold=4.611e+02, percent-clipped=1.0 +2022-11-15 19:13:32,287 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=30018.0, num_to_drop=1, layers_to_drop={3} +2022-11-15 19:14:22,090 INFO [train.py:876] (2/4) Epoch 5, batch 1000, loss[loss=0.1508, simple_loss=0.1697, pruned_loss=0.06592, over 5727.00 frames. ], tot_loss[loss=0.1838, simple_loss=0.1854, pruned_loss=0.09108, over 1077694.67 frames. ], batch size: 15, lr: 1.64e-02, grad_scale: 8.0 +2022-11-15 19:14:27,985 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1896, 2.7582, 2.9505, 1.4760, 2.4700, 3.0875, 2.8062, 3.4547], + device='cuda:2'), covar=tensor([0.1419, 0.1389, 0.0804, 0.2300, 0.0318, 0.0625, 0.0476, 0.0294], + device='cuda:2'), in_proj_covar=tensor([0.0191, 0.0189, 0.0135, 0.0193, 0.0143, 0.0139, 0.0129, 0.0161], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-15 19:14:39,366 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.933e+01 1.915e+02 2.282e+02 2.914e+02 7.246e+02, threshold=4.564e+02, percent-clipped=3.0 +2022-11-15 19:14:47,779 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=30125.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:14:49,742 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.7844, 1.1827, 1.0882, 0.5333, 1.1468, 1.1093, 0.8056, 1.4483], + device='cuda:2'), covar=tensor([0.0030, 0.0026, 0.0038, 0.0030, 0.0030, 0.0019, 0.0033, 0.0027], + device='cuda:2'), in_proj_covar=tensor([0.0026, 0.0025, 0.0026, 0.0026, 0.0026, 0.0024, 0.0025, 0.0023], + device='cuda:2'), out_proj_covar=tensor([2.7601e-05, 2.9460e-05, 2.4010e-05, 2.4406e-05, 2.3833e-05, 1.9974e-05, + 3.1115e-05, 2.2100e-05], device='cuda:2') +2022-11-15 19:14:57,305 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.9898, 1.8515, 2.7428, 2.5463, 2.5997, 1.8248, 2.5485, 2.9001], + device='cuda:2'), covar=tensor([0.0209, 0.0647, 0.0244, 0.0526, 0.0244, 0.0569, 0.0388, 0.0235], + device='cuda:2'), in_proj_covar=tensor([0.0165, 0.0182, 0.0166, 0.0197, 0.0161, 0.0176, 0.0211, 0.0184], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003], + device='cuda:2') +2022-11-15 19:15:10,219 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=30158.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:15:30,408 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=30186.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:15:32,247 INFO [train.py:876] (2/4) Epoch 5, batch 1100, loss[loss=0.1249, simple_loss=0.1457, pruned_loss=0.05204, over 5453.00 frames. ], tot_loss[loss=0.1858, simple_loss=0.187, pruned_loss=0.09231, over 1077743.45 frames. ], batch size: 11, lr: 1.64e-02, grad_scale: 8.0 +2022-11-15 19:15:45,334 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=30207.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:15:46,746 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=30209.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:15:49,985 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.297e+02 1.933e+02 2.265e+02 2.887e+02 4.331e+02, threshold=4.530e+02, percent-clipped=0.0 +2022-11-15 19:16:02,471 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.31 vs. limit=2.0 +2022-11-15 19:16:19,261 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=30255.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:16:30,094 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=30270.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:16:36,108 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.0960, 3.7280, 3.9521, 3.7553, 4.1560, 3.8341, 3.7890, 4.1668], + device='cuda:2'), covar=tensor([0.0471, 0.0349, 0.0434, 0.0378, 0.0474, 0.0342, 0.0305, 0.0328], + device='cuda:2'), in_proj_covar=tensor([0.0100, 0.0108, 0.0082, 0.0115, 0.0114, 0.0067, 0.0092, 0.0103], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 19:16:43,121 INFO [train.py:876] (2/4) Epoch 5, batch 1200, loss[loss=0.2271, simple_loss=0.2061, pruned_loss=0.1241, over 4731.00 frames. ], tot_loss[loss=0.1856, simple_loss=0.1862, pruned_loss=0.09249, over 1077388.53 frames. ], batch size: 135, lr: 1.64e-02, grad_scale: 8.0 +2022-11-15 19:16:55,732 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.97 vs. limit=2.0 +2022-11-15 19:17:00,685 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.148e+02 1.969e+02 2.334e+02 2.925e+02 5.027e+02, threshold=4.667e+02, percent-clipped=3.0 +2022-11-15 19:17:03,519 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=30318.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 19:17:33,178 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=30360.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:17:33,391 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.00 vs. limit=2.0 +2022-11-15 19:17:37,207 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=30366.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:17:52,994 INFO [train.py:876] (2/4) Epoch 5, batch 1300, loss[loss=0.178, simple_loss=0.1827, pruned_loss=0.08662, over 5733.00 frames. ], tot_loss[loss=0.1866, simple_loss=0.1872, pruned_loss=0.09294, over 1083202.16 frames. ], batch size: 17, lr: 1.63e-02, grad_scale: 8.0 +2022-11-15 19:18:10,523 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.384e+02 1.960e+02 2.546e+02 3.212e+02 9.234e+02, threshold=5.093e+02, percent-clipped=6.0 +2022-11-15 19:18:15,967 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=30421.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:18:42,364 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=30458.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:18:58,170 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=30481.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:19:04,005 INFO [train.py:876] (2/4) Epoch 5, batch 1400, loss[loss=0.1907, simple_loss=0.1949, pruned_loss=0.09325, over 5573.00 frames. ], tot_loss[loss=0.1832, simple_loss=0.1851, pruned_loss=0.09071, over 1084146.82 frames. ], batch size: 43, lr: 1.63e-02, grad_scale: 8.0 +2022-11-15 19:19:15,744 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=30506.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:19:21,870 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.110e+02 1.868e+02 2.246e+02 2.844e+02 4.562e+02, threshold=4.491e+02, percent-clipped=0.0 +2022-11-15 19:19:21,977 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.3307, 4.3835, 4.3664, 4.6906, 3.6150, 3.2751, 5.1821, 4.2688], + device='cuda:2'), covar=tensor([0.0519, 0.0734, 0.0438, 0.0845, 0.0617, 0.0485, 0.0594, 0.0565], + device='cuda:2'), in_proj_covar=tensor([0.0064, 0.0085, 0.0071, 0.0086, 0.0066, 0.0057, 0.0108, 0.0071], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 19:19:57,521 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=30565.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:20:14,012 INFO [train.py:876] (2/4) Epoch 5, batch 1500, loss[loss=0.2162, simple_loss=0.194, pruned_loss=0.1192, over 5178.00 frames. ], tot_loss[loss=0.1845, simple_loss=0.186, pruned_loss=0.09144, over 1082669.23 frames. ], batch size: 91, lr: 1.63e-02, grad_scale: 8.0 +2022-11-15 19:20:17,804 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.04 vs. limit=2.0 +2022-11-15 19:20:31,689 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.176e+02 1.945e+02 2.500e+02 2.887e+02 6.503e+02, threshold=4.999e+02, percent-clipped=3.0 +2022-11-15 19:20:51,854 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.00 vs. limit=2.0 +2022-11-15 19:20:55,997 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.08 vs. limit=2.0 +2022-11-15 19:21:01,537 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=30655.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:21:25,657 INFO [train.py:876] (2/4) Epoch 5, batch 1600, loss[loss=0.1603, simple_loss=0.1743, pruned_loss=0.07314, over 5524.00 frames. ], tot_loss[loss=0.1843, simple_loss=0.1856, pruned_loss=0.0915, over 1083318.56 frames. ], batch size: 13, lr: 1.63e-02, grad_scale: 8.0 +2022-11-15 19:21:29,155 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.4772, 3.6243, 3.3321, 3.7809, 3.0000, 2.8676, 4.1254, 3.2790], + device='cuda:2'), covar=tensor([0.0510, 0.0908, 0.0706, 0.0992, 0.0985, 0.0579, 0.0897, 0.0731], + device='cuda:2'), in_proj_covar=tensor([0.0064, 0.0084, 0.0071, 0.0087, 0.0067, 0.0057, 0.0109, 0.0070], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 19:21:40,477 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.6156, 2.5634, 2.1695, 2.5426, 2.5784, 2.3494, 2.3203, 2.3390], + device='cuda:2'), covar=tensor([0.0401, 0.0625, 0.1781, 0.0664, 0.0694, 0.0565, 0.0717, 0.0600], + device='cuda:2'), in_proj_covar=tensor([0.0117, 0.0137, 0.0215, 0.0135, 0.0168, 0.0140, 0.0140, 0.0125], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0004, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 19:21:43,137 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.104e+02 1.815e+02 2.150e+02 2.703e+02 4.613e+02, threshold=4.301e+02, percent-clipped=0.0 +2022-11-15 19:21:44,616 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=30716.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:21:44,725 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=30716.0, num_to_drop=1, layers_to_drop={2} +2022-11-15 19:22:31,313 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=30781.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:22:36,915 INFO [train.py:876] (2/4) Epoch 5, batch 1700, loss[loss=0.1667, simple_loss=0.1653, pruned_loss=0.08401, over 5677.00 frames. ], tot_loss[loss=0.1844, simple_loss=0.1853, pruned_loss=0.09173, over 1080091.24 frames. ], batch size: 11, lr: 1.62e-02, grad_scale: 8.0 +2022-11-15 19:22:54,317 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.089e+02 1.936e+02 2.464e+02 3.039e+02 4.896e+02, threshold=4.928e+02, percent-clipped=3.0 +2022-11-15 19:23:04,935 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=30829.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:23:28,555 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.5317, 3.5395, 3.5243, 3.6975, 3.1461, 2.7381, 4.0299, 3.4474], + device='cuda:2'), covar=tensor([0.0541, 0.0727, 0.0406, 0.0740, 0.0697, 0.0517, 0.0708, 0.0627], + device='cuda:2'), in_proj_covar=tensor([0.0063, 0.0082, 0.0070, 0.0086, 0.0066, 0.0056, 0.0106, 0.0068], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 19:23:29,931 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=30865.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:23:45,849 INFO [train.py:876] (2/4) Epoch 5, batch 1800, loss[loss=0.1689, simple_loss=0.1775, pruned_loss=0.08016, over 5765.00 frames. ], tot_loss[loss=0.1835, simple_loss=0.185, pruned_loss=0.09104, over 1084457.94 frames. ], batch size: 31, lr: 1.62e-02, grad_scale: 8.0 +2022-11-15 19:23:55,414 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.1003, 4.8639, 4.2200, 3.6857, 3.0320, 4.8874, 2.9759, 4.5107], + device='cuda:2'), covar=tensor([0.0261, 0.0083, 0.0110, 0.0279, 0.0313, 0.0078, 0.0215, 0.0037], + device='cuda:2'), in_proj_covar=tensor([0.0160, 0.0111, 0.0128, 0.0137, 0.0154, 0.0126, 0.0142, 0.0109], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 19:23:56,004 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=30903.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:24:02,787 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=30913.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:24:03,363 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.082e+02 1.810e+02 2.217e+02 2.817e+02 4.516e+02, threshold=4.435e+02, percent-clipped=0.0 +2022-11-15 19:24:10,703 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=30925.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:24:16,523 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=30934.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:24:23,417 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.3326, 3.9491, 4.0747, 4.0431, 4.3788, 4.2118, 3.9611, 4.3136], + device='cuda:2'), covar=tensor([0.0571, 0.0487, 0.0636, 0.0540, 0.0522, 0.0278, 0.0416, 0.0553], + device='cuda:2'), in_proj_covar=tensor([0.0096, 0.0103, 0.0080, 0.0111, 0.0109, 0.0065, 0.0087, 0.0099], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 19:24:33,970 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.7242, 2.0381, 2.8057, 3.5095, 3.3868, 2.7013, 2.2052, 3.7758], + device='cuda:2'), covar=tensor([0.0216, 0.2999, 0.1906, 0.2008, 0.0866, 0.2752, 0.2200, 0.0178], + device='cuda:2'), in_proj_covar=tensor([0.0161, 0.0212, 0.0214, 0.0302, 0.0213, 0.0230, 0.0210, 0.0164], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0004, 0.0003], + device='cuda:2') +2022-11-15 19:24:37,867 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=30964.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:24:52,275 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=30986.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:24:54,011 INFO [train.py:876] (2/4) Epoch 5, batch 1900, loss[loss=0.1434, simple_loss=0.1455, pruned_loss=0.07064, over 5334.00 frames. ], tot_loss[loss=0.1821, simple_loss=0.184, pruned_loss=0.09008, over 1081848.09 frames. ], batch size: 9, lr: 1.62e-02, grad_scale: 8.0 +2022-11-15 19:24:58,426 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=30995.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:25:10,260 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=31011.0, num_to_drop=1, layers_to_drop={2} +2022-11-15 19:25:10,323 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=31011.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 19:25:12,010 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.128e+02 1.895e+02 2.325e+02 2.903e+02 5.344e+02, threshold=4.651e+02, percent-clipped=5.0 +2022-11-15 19:25:13,454 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=31016.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:25:43,752 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.9042, 1.4710, 1.4189, 1.2951, 2.2422, 1.5474, 1.6470, 1.3551], + device='cuda:2'), covar=tensor([0.0898, 0.0536, 0.0895, 0.1536, 0.0247, 0.0448, 0.0617, 0.0540], + device='cuda:2'), in_proj_covar=tensor([0.0009, 0.0011, 0.0009, 0.0010, 0.0009, 0.0008, 0.0010, 0.0009], + device='cuda:2'), out_proj_covar=tensor([3.5598e-05, 4.3097e-05, 3.7115e-05, 4.1117e-05, 3.6704e-05, 3.3368e-05, + 4.0056e-05, 3.7988e-05], device='cuda:2') +2022-11-15 19:25:46,274 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=31064.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:25:51,717 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=31072.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 19:25:59,347 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.72 vs. limit=2.0 +2022-11-15 19:26:02,817 INFO [train.py:876] (2/4) Epoch 5, batch 2000, loss[loss=0.1747, simple_loss=0.1817, pruned_loss=0.08388, over 5623.00 frames. ], tot_loss[loss=0.183, simple_loss=0.1846, pruned_loss=0.09071, over 1082966.36 frames. ], batch size: 38, lr: 1.62e-02, grad_scale: 8.0 +2022-11-15 19:26:19,615 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.1832, 0.7842, 1.0772, 1.0088, 1.2831, 1.2084, 1.0115, 1.1345], + device='cuda:2'), covar=tensor([0.0710, 0.0469, 0.0482, 0.1131, 0.0734, 0.0573, 0.1127, 0.0415], + device='cuda:2'), in_proj_covar=tensor([0.0009, 0.0012, 0.0009, 0.0010, 0.0009, 0.0008, 0.0010, 0.0009], + device='cuda:2'), out_proj_covar=tensor([3.6689e-05, 4.4271e-05, 3.7820e-05, 4.2388e-05, 3.7720e-05, 3.4221e-05, + 4.0627e-05, 3.8587e-05], device='cuda:2') +2022-11-15 19:26:20,069 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.071e+02 1.737e+02 2.220e+02 2.843e+02 5.773e+02, threshold=4.440e+02, percent-clipped=3.0 +2022-11-15 19:26:52,092 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.68 vs. limit=2.0 +2022-11-15 19:26:59,500 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.16 vs. limit=2.0 +2022-11-15 19:27:06,084 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.25 vs. limit=5.0 +2022-11-15 19:27:10,091 INFO [train.py:876] (2/4) Epoch 5, batch 2100, loss[loss=0.1159, simple_loss=0.1385, pruned_loss=0.04668, over 4917.00 frames. ], tot_loss[loss=0.1818, simple_loss=0.1839, pruned_loss=0.08983, over 1086615.13 frames. ], batch size: 5, lr: 1.61e-02, grad_scale: 8.0 +2022-11-15 19:27:11,469 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.8931, 4.1284, 4.0822, 4.3342, 3.7355, 3.3162, 4.7148, 3.8973], + device='cuda:2'), covar=tensor([0.0412, 0.0582, 0.0310, 0.0586, 0.0413, 0.0399, 0.0667, 0.0440], + device='cuda:2'), in_proj_covar=tensor([0.0065, 0.0085, 0.0073, 0.0087, 0.0068, 0.0057, 0.0112, 0.0072], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 19:27:26,926 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.254e+02 1.972e+02 2.588e+02 3.306e+02 8.013e+02, threshold=5.176e+02, percent-clipped=4.0 +2022-11-15 19:27:43,303 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.79 vs. limit=2.0 +2022-11-15 19:27:47,453 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.07 vs. limit=2.0 +2022-11-15 19:27:58,202 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=31259.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:27:59,761 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=6.37 vs. limit=5.0 +2022-11-15 19:28:13,297 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=31281.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:28:18,981 INFO [train.py:876] (2/4) Epoch 5, batch 2200, loss[loss=0.1918, simple_loss=0.1928, pruned_loss=0.09535, over 5619.00 frames. ], tot_loss[loss=0.1823, simple_loss=0.1848, pruned_loss=0.08993, over 1088944.26 frames. ], batch size: 23, lr: 1.61e-02, grad_scale: 8.0 +2022-11-15 19:28:19,741 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=31290.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:28:33,498 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=31311.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:28:35,403 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.134e+02 2.016e+02 2.479e+02 3.242e+02 5.312e+02, threshold=4.958e+02, percent-clipped=2.0 +2022-11-15 19:28:36,135 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.0422, 3.6227, 3.8341, 3.5415, 4.0528, 3.6550, 3.7314, 4.0452], + device='cuda:2'), covar=tensor([0.0402, 0.0302, 0.0426, 0.0413, 0.0387, 0.0393, 0.0314, 0.0325], + device='cuda:2'), in_proj_covar=tensor([0.0099, 0.0108, 0.0084, 0.0114, 0.0114, 0.0067, 0.0091, 0.0104], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 19:29:06,522 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=31359.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:29:11,637 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=31367.0, num_to_drop=1, layers_to_drop={3} +2022-11-15 19:29:26,903 INFO [train.py:876] (2/4) Epoch 5, batch 2300, loss[loss=0.2004, simple_loss=0.1958, pruned_loss=0.1024, over 5695.00 frames. ], tot_loss[loss=0.1796, simple_loss=0.183, pruned_loss=0.08813, over 1090554.72 frames. ], batch size: 19, lr: 1.61e-02, grad_scale: 16.0 +2022-11-15 19:29:41,104 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=31409.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:29:44,269 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.150e+02 1.900e+02 2.299e+02 2.871e+02 4.466e+02, threshold=4.598e+02, percent-clipped=0.0 +2022-11-15 19:30:02,972 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.5742, 1.4888, 1.8119, 0.8978, 0.8616, 2.1510, 1.4122, 1.3077], + device='cuda:2'), covar=tensor([0.0409, 0.1098, 0.0669, 0.1925, 0.1869, 0.0748, 0.0939, 0.0801], + device='cuda:2'), in_proj_covar=tensor([0.0043, 0.0040, 0.0043, 0.0049, 0.0041, 0.0034, 0.0037, 0.0043], + device='cuda:2'), out_proj_covar=tensor([7.8345e-05, 7.1657e-05, 7.5527e-05, 9.4975e-05, 7.7524e-05, 6.9817e-05, + 6.8615e-05, 7.6750e-05], device='cuda:2') +2022-11-15 19:30:22,592 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=31470.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:30:35,183 INFO [train.py:876] (2/4) Epoch 5, batch 2400, loss[loss=0.2083, simple_loss=0.1899, pruned_loss=0.1133, over 4665.00 frames. ], tot_loss[loss=0.1795, simple_loss=0.1826, pruned_loss=0.08815, over 1084838.18 frames. ], batch size: 135, lr: 1.61e-02, grad_scale: 16.0 +2022-11-15 19:30:37,450 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=31491.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 19:30:53,273 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.223e+02 1.788e+02 2.240e+02 2.771e+02 4.453e+02, threshold=4.479e+02, percent-clipped=0.0 +2022-11-15 19:31:03,132 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.2725, 2.3331, 1.9673, 2.3285, 2.3884, 2.2019, 2.2886, 2.1487], + device='cuda:2'), covar=tensor([0.0423, 0.0596, 0.1541, 0.0545, 0.0444, 0.0474, 0.0572, 0.0534], + device='cuda:2'), in_proj_covar=tensor([0.0111, 0.0131, 0.0210, 0.0132, 0.0158, 0.0134, 0.0139, 0.0123], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 19:31:19,169 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=31552.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 19:31:24,388 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=31559.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:31:35,746 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.99 vs. limit=2.0 +2022-11-15 19:31:38,737 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=31581.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:31:42,324 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.48 vs. limit=2.0 +2022-11-15 19:31:43,847 INFO [train.py:876] (2/4) Epoch 5, batch 2500, loss[loss=0.1781, simple_loss=0.178, pruned_loss=0.08909, over 5454.00 frames. ], tot_loss[loss=0.1814, simple_loss=0.1842, pruned_loss=0.08932, over 1088719.32 frames. ], batch size: 11, lr: 1.60e-02, grad_scale: 16.0 +2022-11-15 19:31:44,623 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=31590.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:31:56,592 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=31607.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:32:01,411 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.094e+02 1.811e+02 2.202e+02 2.747e+02 5.680e+02, threshold=4.404e+02, percent-clipped=5.0 +2022-11-15 19:32:07,188 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.3416, 3.8375, 3.4344, 3.2546, 2.1550, 3.6405, 2.0233, 3.1312], + device='cuda:2'), covar=tensor([0.0359, 0.0131, 0.0135, 0.0274, 0.0386, 0.0094, 0.0325, 0.0070], + device='cuda:2'), in_proj_covar=tensor([0.0156, 0.0112, 0.0127, 0.0137, 0.0152, 0.0124, 0.0142, 0.0111], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 19:32:11,674 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=31629.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:32:17,655 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=31638.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:32:37,450 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=31667.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 19:32:47,556 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.89 vs. limit=2.0 +2022-11-15 19:32:52,310 INFO [train.py:876] (2/4) Epoch 5, batch 2600, loss[loss=0.2315, simple_loss=0.2278, pruned_loss=0.1176, over 5631.00 frames. ], tot_loss[loss=0.1838, simple_loss=0.1855, pruned_loss=0.09107, over 1089029.87 frames. ], batch size: 32, lr: 1.60e-02, grad_scale: 16.0 +2022-11-15 19:33:02,098 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=31704.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:33:08,480 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.148e+02 1.912e+02 2.361e+02 2.985e+02 5.385e+02, threshold=4.723e+02, percent-clipped=4.0 +2022-11-15 19:33:09,210 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=31715.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 19:33:23,867 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.84 vs. limit=2.0 +2022-11-15 19:33:27,769 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.7280, 2.1528, 1.5945, 1.2777, 1.3968, 2.1993, 1.9939, 2.5490], + device='cuda:2'), covar=tensor([0.1319, 0.0838, 0.1168, 0.1819, 0.0670, 0.0384, 0.0244, 0.0517], + device='cuda:2'), in_proj_covar=tensor([0.0189, 0.0187, 0.0140, 0.0195, 0.0146, 0.0146, 0.0132, 0.0162], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0002, 0.0003, 0.0002, 0.0003], + device='cuda:2') +2022-11-15 19:33:43,345 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=31765.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:33:43,443 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=31765.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:33:44,284 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.44 vs. limit=5.0 +2022-11-15 19:34:00,076 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=31788.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:34:00,585 INFO [train.py:876] (2/4) Epoch 5, batch 2700, loss[loss=0.1589, simple_loss=0.1674, pruned_loss=0.07517, over 5689.00 frames. ], tot_loss[loss=0.1835, simple_loss=0.185, pruned_loss=0.09105, over 1083939.12 frames. ], batch size: 28, lr: 1.60e-02, grad_scale: 16.0 +2022-11-15 19:34:16,058 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.1003, 0.9168, 1.1728, 0.9383, 1.0533, 1.2607, 0.9991, 0.8901], + device='cuda:2'), covar=tensor([0.0571, 0.0857, 0.0481, 0.1009, 0.0638, 0.0810, 0.0686, 0.1060], + device='cuda:2'), in_proj_covar=tensor([0.0009, 0.0012, 0.0010, 0.0010, 0.0009, 0.0009, 0.0011, 0.0010], + device='cuda:2'), out_proj_covar=tensor([3.7591e-05, 4.7460e-05, 3.9444e-05, 4.3683e-05, 3.8977e-05, 3.7013e-05, + 4.2551e-05, 4.0004e-05], device='cuda:2') +2022-11-15 19:34:16,096 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.8214, 2.2253, 2.7729, 3.6444, 3.8687, 2.8948, 2.2497, 3.9722], + device='cuda:2'), covar=tensor([0.0238, 0.2963, 0.2806, 0.2665, 0.0756, 0.2883, 0.2482, 0.0243], + device='cuda:2'), in_proj_covar=tensor([0.0172, 0.0218, 0.0215, 0.0310, 0.0216, 0.0225, 0.0211, 0.0172], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-15 19:34:17,214 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.174e+02 1.996e+02 2.440e+02 3.153e+02 7.916e+02, threshold=4.880e+02, percent-clipped=8.0 +2022-11-15 19:34:23,142 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=31823.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:34:33,144 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.0710, 2.6003, 3.1861, 3.8499, 4.1994, 3.3288, 2.6869, 4.1477], + device='cuda:2'), covar=tensor([0.0304, 0.3580, 0.1802, 0.2947, 0.0570, 0.2538, 0.2150, 0.0179], + device='cuda:2'), in_proj_covar=tensor([0.0169, 0.0213, 0.0210, 0.0304, 0.0211, 0.0220, 0.0205, 0.0169], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-15 19:34:40,080 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=31847.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 19:34:41,494 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=31849.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:35:00,781 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=31878.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:35:02,734 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.92 vs. limit=2.0 +2022-11-15 19:35:05,084 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=31884.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:35:08,459 INFO [train.py:876] (2/4) Epoch 5, batch 2800, loss[loss=0.2443, simple_loss=0.2172, pruned_loss=0.1357, over 5374.00 frames. ], tot_loss[loss=0.1808, simple_loss=0.1832, pruned_loss=0.08919, over 1086837.28 frames. ], batch size: 70, lr: 1.60e-02, grad_scale: 16.0 +2022-11-15 19:35:11,063 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.4464, 4.5006, 4.5565, 4.8580, 4.2597, 3.4297, 5.2732, 4.4324], + device='cuda:2'), covar=tensor([0.0295, 0.0673, 0.0296, 0.0594, 0.0386, 0.0487, 0.0464, 0.0370], + device='cuda:2'), in_proj_covar=tensor([0.0063, 0.0082, 0.0067, 0.0084, 0.0066, 0.0054, 0.0106, 0.0068], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 19:35:15,486 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.9486, 0.7426, 0.9041, 0.7446, 0.9181, 1.2424, 0.7642, 0.7769], + device='cuda:2'), covar=tensor([0.1155, 0.0596, 0.1249, 0.1773, 0.0880, 0.1045, 0.1041, 0.0695], + device='cuda:2'), in_proj_covar=tensor([0.0009, 0.0012, 0.0009, 0.0010, 0.0009, 0.0009, 0.0011, 0.0009], + device='cuda:2'), out_proj_covar=tensor([3.6222e-05, 4.5562e-05, 3.8433e-05, 4.2251e-05, 3.7825e-05, 3.5777e-05, + 4.1237e-05, 3.8751e-05], device='cuda:2') +2022-11-15 19:35:24,530 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.6482, 4.6416, 3.2315, 4.4405, 3.5522, 3.2075, 2.0843, 3.8820], + device='cuda:2'), covar=tensor([0.1183, 0.0121, 0.0690, 0.0158, 0.0368, 0.0746, 0.1848, 0.0193], + device='cuda:2'), in_proj_covar=tensor([0.0173, 0.0122, 0.0163, 0.0125, 0.0161, 0.0177, 0.0186, 0.0134], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 19:35:25,009 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.103e+02 1.889e+02 2.395e+02 3.111e+02 5.606e+02, threshold=4.789e+02, percent-clipped=5.0 +2022-11-15 19:35:25,916 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=31915.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:35:32,665 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.01 vs. limit=2.0 +2022-11-15 19:35:42,285 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=31939.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:35:50,331 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=31950.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:36:07,076 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=31976.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:36:15,574 INFO [train.py:876] (2/4) Epoch 5, batch 2900, loss[loss=0.1714, simple_loss=0.1885, pruned_loss=0.07711, over 5513.00 frames. ], tot_loss[loss=0.1804, simple_loss=0.1829, pruned_loss=0.08891, over 1088618.47 frames. ], batch size: 13, lr: 1.59e-02, grad_scale: 16.0 +2022-11-15 19:36:31,853 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=32011.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:36:33,617 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.337e+02 1.876e+02 2.342e+02 2.951e+02 6.735e+02, threshold=4.684e+02, percent-clipped=4.0 +2022-11-15 19:36:53,745 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.31 vs. limit=2.0 +2022-11-15 19:37:04,959 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=32060.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:37:08,620 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=32065.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:37:24,200 INFO [train.py:876] (2/4) Epoch 5, batch 3000, loss[loss=0.1544, simple_loss=0.1725, pruned_loss=0.0681, over 5596.00 frames. ], tot_loss[loss=0.1802, simple_loss=0.1831, pruned_loss=0.08863, over 1093433.83 frames. ], batch size: 43, lr: 1.59e-02, grad_scale: 16.0 +2022-11-15 19:37:24,200 INFO [train.py:899] (2/4) Computing validation loss +2022-11-15 19:37:29,999 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.2370, 3.1412, 3.1081, 3.2711, 3.2260, 3.1170, 1.3882, 3.3495], + device='cuda:2'), covar=tensor([0.0324, 0.0324, 0.0302, 0.0198, 0.0324, 0.0300, 0.2608, 0.0254], + device='cuda:2'), in_proj_covar=tensor([0.0097, 0.0072, 0.0072, 0.0063, 0.0088, 0.0074, 0.0129, 0.0099], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 19:37:38,697 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.9984, 0.8656, 1.0421, 0.9854, 1.1397, 1.4925, 0.9030, 0.4687], + device='cuda:2'), covar=tensor([0.0281, 0.0107, 0.0426, 0.0246, 0.0155, 0.0103, 0.0382, 0.0403], + device='cuda:2'), in_proj_covar=tensor([0.0009, 0.0012, 0.0009, 0.0010, 0.0009, 0.0008, 0.0010, 0.0009], + device='cuda:2'), out_proj_covar=tensor([3.5990e-05, 4.5954e-05, 3.7282e-05, 4.1189e-05, 3.7728e-05, 3.4499e-05, + 4.0361e-05, 3.8347e-05], device='cuda:2') +2022-11-15 19:37:41,542 INFO [train.py:908] (2/4) Epoch 5, validation: loss=0.1632, simple_loss=0.186, pruned_loss=0.07021, over 1530663.00 frames. +2022-11-15 19:37:41,543 INFO [train.py:909] (2/4) Maximum memory allocated so far is 4715MB +2022-11-15 19:37:58,635 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=32113.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:37:59,207 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.218e+02 2.101e+02 2.644e+02 3.414e+02 5.284e+02, threshold=5.288e+02, percent-clipped=5.0 +2022-11-15 19:38:06,136 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.98 vs. limit=2.0 +2022-11-15 19:38:19,080 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=32144.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:38:21,092 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=32147.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 19:38:29,205 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.93 vs. limit=2.0 +2022-11-15 19:38:43,314 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=32179.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:38:49,860 INFO [train.py:876] (2/4) Epoch 5, batch 3100, loss[loss=0.2178, simple_loss=0.2086, pruned_loss=0.1135, over 5587.00 frames. ], tot_loss[loss=0.1824, simple_loss=0.1846, pruned_loss=0.09005, over 1091191.45 frames. ], batch size: 23, lr: 1.59e-02, grad_scale: 16.0 +2022-11-15 19:38:53,729 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=32195.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 19:39:07,484 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.018e+02 2.024e+02 2.459e+02 3.147e+02 6.395e+02, threshold=4.918e+02, percent-clipped=1.0 +2022-11-15 19:39:21,165 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=32234.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:39:46,917 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=32271.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:39:58,837 INFO [train.py:876] (2/4) Epoch 5, batch 3200, loss[loss=0.1789, simple_loss=0.1889, pruned_loss=0.0844, over 5610.00 frames. ], tot_loss[loss=0.183, simple_loss=0.1853, pruned_loss=0.0903, over 1083795.50 frames. ], batch size: 23, lr: 1.59e-02, grad_scale: 16.0 +2022-11-15 19:40:10,369 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=32306.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:40:16,205 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.227e+02 1.788e+02 2.177e+02 2.789e+02 5.595e+02, threshold=4.355e+02, percent-clipped=3.0 +2022-11-15 19:40:17,034 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.2014, 4.5336, 3.5368, 2.0728, 4.1903, 1.8223, 4.6891, 2.3709], + device='cuda:2'), covar=tensor([0.1094, 0.0162, 0.0446, 0.2350, 0.0252, 0.2107, 0.0099, 0.2042], + device='cuda:2'), in_proj_covar=tensor([0.0129, 0.0100, 0.0107, 0.0121, 0.0099, 0.0131, 0.0089, 0.0124], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-11-15 19:40:30,669 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.02 vs. limit=2.0 +2022-11-15 19:40:47,301 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=32360.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:40:52,919 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0480, 3.8807, 2.5424, 3.6493, 2.7997, 2.7999, 1.9011, 3.2532], + device='cuda:2'), covar=tensor([0.1310, 0.0152, 0.0977, 0.0221, 0.0684, 0.0765, 0.1916, 0.0250], + device='cuda:2'), in_proj_covar=tensor([0.0173, 0.0124, 0.0170, 0.0131, 0.0165, 0.0179, 0.0189, 0.0137], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 19:41:07,144 INFO [train.py:876] (2/4) Epoch 5, batch 3300, loss[loss=0.1272, simple_loss=0.1597, pruned_loss=0.04739, over 5709.00 frames. ], tot_loss[loss=0.1797, simple_loss=0.1836, pruned_loss=0.08793, over 1088687.43 frames. ], batch size: 12, lr: 1.58e-02, grad_scale: 16.0 +2022-11-15 19:41:19,787 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=32408.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:41:24,093 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.275e+02 1.817e+02 2.244e+02 2.768e+02 6.144e+02, threshold=4.488e+02, percent-clipped=4.0 +2022-11-15 19:41:44,493 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=32444.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:41:48,218 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.62 vs. limit=2.0 +2022-11-15 19:41:53,942 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.90 vs. limit=2.0 +2022-11-15 19:42:01,508 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.96 vs. limit=2.0 +2022-11-15 19:42:08,057 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=32479.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:42:15,152 INFO [train.py:876] (2/4) Epoch 5, batch 3400, loss[loss=0.1201, simple_loss=0.1278, pruned_loss=0.05621, over 5503.00 frames. ], tot_loss[loss=0.1797, simple_loss=0.183, pruned_loss=0.08815, over 1082534.82 frames. ], batch size: 10, lr: 1.58e-02, grad_scale: 16.0 +2022-11-15 19:42:17,571 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=32492.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:42:21,691 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.0219, 3.5882, 3.0753, 3.5161, 3.5823, 3.2143, 3.0172, 3.1649], + device='cuda:2'), covar=tensor([0.1221, 0.0426, 0.1501, 0.0436, 0.0413, 0.0384, 0.0605, 0.0465], + device='cuda:2'), in_proj_covar=tensor([0.0117, 0.0136, 0.0221, 0.0138, 0.0167, 0.0140, 0.0148, 0.0130], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0004, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 19:42:32,056 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.133e+02 1.882e+02 2.362e+02 2.947e+02 5.374e+02, threshold=4.725e+02, percent-clipped=3.0 +2022-11-15 19:42:40,957 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=32527.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:42:46,047 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=32534.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:42:59,838 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.2124, 1.3949, 1.1409, 0.7291, 1.4755, 1.2098, 0.8298, 1.2436], + device='cuda:2'), covar=tensor([0.0023, 0.0013, 0.0022, 0.0022, 0.0018, 0.0014, 0.0034, 0.0028], + device='cuda:2'), in_proj_covar=tensor([0.0026, 0.0024, 0.0026, 0.0025, 0.0026, 0.0023, 0.0025, 0.0024], + device='cuda:2'), out_proj_covar=tensor([2.7095e-05, 2.7317e-05, 2.3934e-05, 2.3625e-05, 2.4277e-05, 1.9089e-05, + 2.9112e-05, 2.2440e-05], device='cuda:2') +2022-11-15 19:43:10,884 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=32571.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:43:18,440 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=32582.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:43:23,554 INFO [train.py:876] (2/4) Epoch 5, batch 3500, loss[loss=0.1403, simple_loss=0.1566, pruned_loss=0.062, over 5255.00 frames. ], tot_loss[loss=0.1779, simple_loss=0.1822, pruned_loss=0.08683, over 1081862.52 frames. ], batch size: 9, lr: 1.58e-02, grad_scale: 16.0 +2022-11-15 19:43:26,596 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.1817, 4.6091, 4.9694, 4.5680, 5.2069, 5.0480, 4.5347, 5.1129], + device='cuda:2'), covar=tensor([0.0268, 0.0246, 0.0383, 0.0280, 0.0262, 0.0108, 0.0194, 0.0175], + device='cuda:2'), in_proj_covar=tensor([0.0099, 0.0109, 0.0086, 0.0118, 0.0118, 0.0070, 0.0093, 0.0109], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 19:43:35,716 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=32606.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:43:38,068 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.58 vs. limit=5.0 +2022-11-15 19:43:40,883 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.289e+02 1.935e+02 2.270e+02 2.895e+02 5.164e+02, threshold=4.540e+02, percent-clipped=2.0 +2022-11-15 19:43:43,814 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.08 vs. limit=2.0 +2022-11-15 19:43:44,158 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=32619.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:44:08,842 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=32654.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:44:18,561 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.38 vs. limit=5.0 +2022-11-15 19:44:21,570 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.4863, 1.0281, 1.3968, 0.9577, 0.9643, 1.3732, 0.9608, 1.0336], + device='cuda:2'), covar=tensor([0.1081, 0.0794, 0.0908, 0.1694, 0.1097, 0.0439, 0.0634, 0.0926], + device='cuda:2'), in_proj_covar=tensor([0.0008, 0.0012, 0.0009, 0.0010, 0.0009, 0.0008, 0.0010, 0.0009], + device='cuda:2'), out_proj_covar=tensor([3.5116e-05, 4.6045e-05, 3.7280e-05, 4.1069e-05, 3.7668e-05, 3.5093e-05, + 4.0024e-05, 3.8183e-05], device='cuda:2') +2022-11-15 19:44:32,338 INFO [train.py:876] (2/4) Epoch 5, batch 3600, loss[loss=0.1116, simple_loss=0.1354, pruned_loss=0.04392, over 5688.00 frames. ], tot_loss[loss=0.1769, simple_loss=0.1811, pruned_loss=0.08632, over 1082899.96 frames. ], batch size: 11, lr: 1.58e-02, grad_scale: 16.0 +2022-11-15 19:44:49,900 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.117e+02 1.827e+02 2.423e+02 3.084e+02 7.397e+02, threshold=4.846e+02, percent-clipped=5.0 +2022-11-15 19:45:15,327 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.89 vs. limit=2.0 +2022-11-15 19:45:40,993 INFO [train.py:876] (2/4) Epoch 5, batch 3700, loss[loss=0.208, simple_loss=0.1881, pruned_loss=0.1139, over 4745.00 frames. ], tot_loss[loss=0.1792, simple_loss=0.1827, pruned_loss=0.08789, over 1080664.34 frames. ], batch size: 135, lr: 1.58e-02, grad_scale: 16.0 +2022-11-15 19:45:57,989 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.334e+02 2.092e+02 2.533e+02 3.307e+02 5.477e+02, threshold=5.066e+02, percent-clipped=1.0 +2022-11-15 19:45:59,874 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.1081, 1.2103, 1.4292, 1.0337, 0.5835, 2.1456, 1.3693, 0.9507], + device='cuda:2'), covar=tensor([0.0737, 0.0699, 0.0580, 0.1672, 0.2100, 0.0565, 0.0709, 0.0909], + device='cuda:2'), in_proj_covar=tensor([0.0047, 0.0040, 0.0044, 0.0051, 0.0044, 0.0038, 0.0039, 0.0041], + device='cuda:2'), out_proj_covar=tensor([8.7298e-05, 7.5023e-05, 7.8618e-05, 9.9331e-05, 8.3275e-05, 7.6803e-05, + 7.3007e-05, 7.6564e-05], device='cuda:2') +2022-11-15 19:46:49,252 INFO [train.py:876] (2/4) Epoch 5, batch 3800, loss[loss=0.222, simple_loss=0.2082, pruned_loss=0.1179, over 5481.00 frames. ], tot_loss[loss=0.1784, simple_loss=0.1821, pruned_loss=0.08738, over 1084746.86 frames. ], batch size: 53, lr: 1.57e-02, grad_scale: 16.0 +2022-11-15 19:47:05,911 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.156e+02 1.821e+02 2.317e+02 3.245e+02 5.660e+02, threshold=4.635e+02, percent-clipped=3.0 +2022-11-15 19:47:22,354 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.8664, 4.8465, 4.8279, 5.1617, 4.4704, 3.7977, 5.6109, 4.8018], + device='cuda:2'), covar=tensor([0.0292, 0.0585, 0.0340, 0.0557, 0.0468, 0.0362, 0.0499, 0.0287], + device='cuda:2'), in_proj_covar=tensor([0.0066, 0.0085, 0.0071, 0.0088, 0.0068, 0.0057, 0.0109, 0.0072], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 19:47:42,310 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=32966.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:47:47,195 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.40 vs. limit=5.0 +2022-11-15 19:47:57,894 INFO [train.py:876] (2/4) Epoch 5, batch 3900, loss[loss=0.1835, simple_loss=0.1702, pruned_loss=0.09843, over 4171.00 frames. ], tot_loss[loss=0.1782, simple_loss=0.1817, pruned_loss=0.08737, over 1080614.55 frames. ], batch size: 181, lr: 1.57e-02, grad_scale: 16.0 +2022-11-15 19:48:15,131 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.080e+02 1.831e+02 2.324e+02 2.912e+02 5.041e+02, threshold=4.648e+02, percent-clipped=1.0 +2022-11-15 19:48:24,297 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=33027.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:48:28,936 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.87 vs. limit=2.0 +2022-11-15 19:48:40,248 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.4909, 4.0568, 4.3141, 3.9602, 4.5135, 4.2040, 3.9781, 4.4973], + device='cuda:2'), covar=tensor([0.0326, 0.0292, 0.0427, 0.0321, 0.0363, 0.0286, 0.0284, 0.0283], + device='cuda:2'), in_proj_covar=tensor([0.0100, 0.0110, 0.0083, 0.0116, 0.0117, 0.0070, 0.0095, 0.0108], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 19:48:43,835 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.02 vs. limit=2.0 +2022-11-15 19:48:51,435 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.85 vs. limit=2.0 +2022-11-15 19:48:52,400 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=33069.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:49:06,329 INFO [train.py:876] (2/4) Epoch 5, batch 4000, loss[loss=0.1884, simple_loss=0.2025, pruned_loss=0.0872, over 5610.00 frames. ], tot_loss[loss=0.1816, simple_loss=0.1836, pruned_loss=0.0898, over 1082337.21 frames. ], batch size: 24, lr: 1.57e-02, grad_scale: 16.0 +2022-11-15 19:49:23,826 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.282e+02 1.884e+02 2.400e+02 2.913e+02 6.279e+02, threshold=4.801e+02, percent-clipped=5.0 +2022-11-15 19:49:34,182 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=33130.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:49:48,115 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.94 vs. limit=2.0 +2022-11-15 19:49:51,328 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=33155.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:50:03,265 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.3622, 1.7938, 2.9426, 2.5312, 3.2155, 1.9628, 2.8314, 3.4182], + device='cuda:2'), covar=tensor([0.0350, 0.1186, 0.0450, 0.1035, 0.0315, 0.0983, 0.0730, 0.0446], + device='cuda:2'), in_proj_covar=tensor([0.0188, 0.0197, 0.0181, 0.0212, 0.0183, 0.0192, 0.0228, 0.0202], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004, 0.0004], + device='cuda:2') +2022-11-15 19:50:13,958 INFO [train.py:876] (2/4) Epoch 5, batch 4100, loss[loss=0.1206, simple_loss=0.1396, pruned_loss=0.05077, over 5483.00 frames. ], tot_loss[loss=0.1798, simple_loss=0.1826, pruned_loss=0.08851, over 1079876.94 frames. ], batch size: 10, lr: 1.57e-02, grad_scale: 8.0 +2022-11-15 19:50:16,830 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.3018, 2.5790, 1.8395, 3.0790, 2.0352, 2.2179, 2.3671, 2.8082], + device='cuda:2'), covar=tensor([0.0705, 0.1120, 0.2614, 0.0826, 0.1608, 0.0724, 0.1295, 0.1777], + device='cuda:2'), in_proj_covar=tensor([0.0052, 0.0059, 0.0074, 0.0049, 0.0063, 0.0055, 0.0068, 0.0048], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 19:50:32,026 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.183e+02 1.879e+02 2.350e+02 3.001e+02 5.532e+02, threshold=4.700e+02, percent-clipped=2.0 +2022-11-15 19:50:32,859 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=33216.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:50:36,920 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.3701, 4.3144, 4.5195, 4.6446, 3.9935, 3.7501, 5.1323, 4.4638], + device='cuda:2'), covar=tensor([0.0492, 0.0896, 0.0300, 0.0697, 0.0506, 0.0363, 0.0593, 0.0467], + device='cuda:2'), in_proj_covar=tensor([0.0067, 0.0085, 0.0070, 0.0087, 0.0068, 0.0057, 0.0110, 0.0073], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 19:50:48,063 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.3795, 1.0987, 1.8471, 1.1563, 0.7852, 2.4340, 1.5283, 1.3160], + device='cuda:2'), covar=tensor([0.0644, 0.0796, 0.0594, 0.1786, 0.2662, 0.0555, 0.0962, 0.0981], + device='cuda:2'), in_proj_covar=tensor([0.0049, 0.0043, 0.0047, 0.0055, 0.0045, 0.0036, 0.0043, 0.0042], + device='cuda:2'), out_proj_covar=tensor([9.1475e-05, 8.0003e-05, 8.3768e-05, 1.0743e-04, 8.6825e-05, 7.6151e-05, + 7.9901e-05, 7.9516e-05], device='cuda:2') +2022-11-15 19:50:56,041 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.3645, 3.4986, 3.2604, 3.1016, 2.0947, 3.6128, 2.0300, 3.2025], + device='cuda:2'), covar=tensor([0.0325, 0.0197, 0.0158, 0.0203, 0.0375, 0.0079, 0.0312, 0.0081], + device='cuda:2'), in_proj_covar=tensor([0.0167, 0.0119, 0.0135, 0.0145, 0.0161, 0.0128, 0.0151, 0.0114], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 19:51:10,920 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.9053, 2.3458, 2.5566, 2.3684, 1.4920, 2.4497, 1.6073, 1.5583], + device='cuda:2'), covar=tensor([0.0155, 0.0061, 0.0072, 0.0103, 0.0205, 0.0070, 0.0176, 0.0100], + device='cuda:2'), in_proj_covar=tensor([0.0164, 0.0116, 0.0132, 0.0142, 0.0159, 0.0126, 0.0148, 0.0112], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 19:51:22,798 INFO [train.py:876] (2/4) Epoch 5, batch 4200, loss[loss=0.163, simple_loss=0.1783, pruned_loss=0.07384, over 5586.00 frames. ], tot_loss[loss=0.1817, simple_loss=0.1839, pruned_loss=0.08973, over 1077359.24 frames. ], batch size: 16, lr: 1.56e-02, grad_scale: 8.0 +2022-11-15 19:51:40,261 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.92 vs. limit=2.0 +2022-11-15 19:51:40,448 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.157e+02 1.845e+02 2.173e+02 2.647e+02 4.072e+02, threshold=4.345e+02, percent-clipped=0.0 +2022-11-15 19:51:45,060 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=33322.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:52:06,795 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1719, 2.5454, 1.8931, 2.8972, 1.8523, 2.5448, 2.6656, 3.2559], + device='cuda:2'), covar=tensor([0.0988, 0.1192, 0.3422, 0.1868, 0.2335, 0.1368, 0.1667, 0.1095], + device='cuda:2'), in_proj_covar=tensor([0.0055, 0.0063, 0.0079, 0.0051, 0.0066, 0.0057, 0.0071, 0.0049], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 19:52:30,460 INFO [train.py:876] (2/4) Epoch 5, batch 4300, loss[loss=0.1444, simple_loss=0.1612, pruned_loss=0.06378, over 5551.00 frames. ], tot_loss[loss=0.1775, simple_loss=0.1818, pruned_loss=0.08656, over 1083497.14 frames. ], batch size: 14, lr: 1.56e-02, grad_scale: 8.0 +2022-11-15 19:52:44,091 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.01 vs. limit=2.0 +2022-11-15 19:52:49,001 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.372e+01 1.969e+02 2.435e+02 3.163e+02 9.091e+02, threshold=4.870e+02, percent-clipped=6.0 +2022-11-15 19:52:49,887 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.9196, 2.7731, 3.0976, 1.1786, 3.2612, 3.5859, 3.1937, 3.6149], + device='cuda:2'), covar=tensor([0.2283, 0.1627, 0.0890, 0.3155, 0.0239, 0.0459, 0.0255, 0.0424], + device='cuda:2'), in_proj_covar=tensor([0.0189, 0.0186, 0.0142, 0.0195, 0.0142, 0.0148, 0.0134, 0.0172], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0002, 0.0003, 0.0002, 0.0003], + device='cuda:2') +2022-11-15 19:52:55,803 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=33425.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:53:16,442 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.42 vs. limit=2.0 +2022-11-15 19:53:16,616 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.00 vs. limit=2.0 +2022-11-15 19:53:28,335 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=33472.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:53:39,862 INFO [train.py:876] (2/4) Epoch 5, batch 4400, loss[loss=0.1273, simple_loss=0.1418, pruned_loss=0.05641, over 4956.00 frames. ], tot_loss[loss=0.1775, simple_loss=0.1814, pruned_loss=0.08679, over 1080686.16 frames. ], batch size: 5, lr: 1.56e-02, grad_scale: 8.0 +2022-11-15 19:53:45,701 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.1784, 2.4442, 3.1956, 3.5681, 4.1592, 3.4391, 3.0400, 4.2277], + device='cuda:2'), covar=tensor([0.0232, 0.3326, 0.2045, 0.5724, 0.0651, 0.2662, 0.1758, 0.0222], + device='cuda:2'), in_proj_covar=tensor([0.0170, 0.0212, 0.0213, 0.0318, 0.0213, 0.0223, 0.0202, 0.0172], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-15 19:53:55,874 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=33511.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:53:58,512 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.090e+02 1.879e+02 2.441e+02 2.964e+02 5.680e+02, threshold=4.882e+02, percent-clipped=2.0 +2022-11-15 19:53:59,079 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.9016, 0.9160, 1.2721, 1.0363, 0.9575, 0.9064, 0.8679, 1.4214], + device='cuda:2'), covar=tensor([0.0028, 0.0031, 0.0018, 0.0020, 0.0025, 0.0017, 0.0030, 0.0024], + device='cuda:2'), in_proj_covar=tensor([0.0025, 0.0024, 0.0024, 0.0024, 0.0024, 0.0021, 0.0024, 0.0022], + device='cuda:2'), out_proj_covar=tensor([2.5244e-05, 2.6786e-05, 2.2717e-05, 2.1826e-05, 2.2002e-05, 1.7553e-05, + 2.7471e-05, 2.0511e-05], device='cuda:2') +2022-11-15 19:54:04,083 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.03 vs. limit=2.0 +2022-11-15 19:54:05,329 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.97 vs. limit=2.0 +2022-11-15 19:54:12,039 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=33533.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:54:50,506 INFO [train.py:876] (2/4) Epoch 5, batch 4500, loss[loss=0.1708, simple_loss=0.1847, pruned_loss=0.07847, over 5744.00 frames. ], tot_loss[loss=0.1791, simple_loss=0.1823, pruned_loss=0.08797, over 1085915.31 frames. ], batch size: 16, lr: 1.56e-02, grad_scale: 8.0 +2022-11-15 19:55:08,210 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.222e+02 1.892e+02 2.378e+02 2.959e+02 6.563e+02, threshold=4.756e+02, percent-clipped=3.0 +2022-11-15 19:55:13,233 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=33622.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:55:17,119 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.6935, 3.4752, 3.6699, 3.6857, 2.9787, 2.7850, 4.0615, 3.4967], + device='cuda:2'), covar=tensor([0.0353, 0.0813, 0.0380, 0.0715, 0.0606, 0.0484, 0.0626, 0.0523], + device='cuda:2'), in_proj_covar=tensor([0.0067, 0.0087, 0.0072, 0.0088, 0.0068, 0.0059, 0.0111, 0.0073], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 19:55:45,891 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=33670.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:55:50,862 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=33677.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:55:58,832 INFO [train.py:876] (2/4) Epoch 5, batch 4600, loss[loss=0.1776, simple_loss=0.1771, pruned_loss=0.08907, over 5096.00 frames. ], tot_loss[loss=0.1801, simple_loss=0.1831, pruned_loss=0.08853, over 1086466.38 frames. ], batch size: 91, lr: 1.55e-02, grad_scale: 8.0 +2022-11-15 19:56:14,494 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.5030, 3.8191, 3.3753, 3.2944, 2.3596, 3.8678, 2.2218, 3.1439], + device='cuda:2'), covar=tensor([0.0343, 0.0266, 0.0128, 0.0233, 0.0351, 0.0080, 0.0311, 0.0081], + device='cuda:2'), in_proj_covar=tensor([0.0165, 0.0120, 0.0132, 0.0146, 0.0159, 0.0128, 0.0151, 0.0113], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 19:56:16,176 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.284e+02 1.825e+02 2.230e+02 2.892e+02 8.047e+02, threshold=4.459e+02, percent-clipped=4.0 +2022-11-15 19:56:23,208 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=33725.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:56:25,188 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.7266, 1.9363, 3.1296, 2.5681, 3.5475, 2.0667, 3.1314, 3.6077], + device='cuda:2'), covar=tensor([0.0323, 0.1466, 0.0666, 0.1380, 0.0346, 0.1315, 0.0887, 0.0607], + device='cuda:2'), in_proj_covar=tensor([0.0182, 0.0193, 0.0177, 0.0208, 0.0177, 0.0186, 0.0224, 0.0200], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004, 0.0004], + device='cuda:2') +2022-11-15 19:56:32,052 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=33738.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 19:56:35,182 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.99 vs. limit=2.0 +2022-11-15 19:56:39,303 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.2155, 1.1527, 1.3722, 0.8633, 1.0602, 0.8591, 1.1967, 1.6197], + device='cuda:2'), covar=tensor([0.0029, 0.0032, 0.0023, 0.0028, 0.0030, 0.0023, 0.0031, 0.0039], + device='cuda:2'), in_proj_covar=tensor([0.0026, 0.0025, 0.0026, 0.0025, 0.0025, 0.0023, 0.0027, 0.0023], + device='cuda:2'), out_proj_covar=tensor([2.6164e-05, 2.8246e-05, 2.4074e-05, 2.2630e-05, 2.3425e-05, 1.8955e-05, + 3.0339e-05, 2.1670e-05], device='cuda:2') +2022-11-15 19:56:55,511 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=33773.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:56:57,853 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.88 vs. limit=2.0 +2022-11-15 19:57:00,848 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.97 vs. limit=2.0 +2022-11-15 19:57:06,415 INFO [train.py:876] (2/4) Epoch 5, batch 4700, loss[loss=0.18, simple_loss=0.1855, pruned_loss=0.08724, over 5116.00 frames. ], tot_loss[loss=0.1768, simple_loss=0.1806, pruned_loss=0.08647, over 1079913.30 frames. ], batch size: 91, lr: 1.55e-02, grad_scale: 8.0 +2022-11-15 19:57:22,555 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=33811.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:57:25,039 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.135e+02 1.755e+02 2.231e+02 2.801e+02 4.827e+02, threshold=4.463e+02, percent-clipped=3.0 +2022-11-15 19:57:33,505 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=33828.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:57:39,160 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.5791, 2.7353, 2.7726, 2.5453, 2.7129, 2.7257, 1.2061, 2.8057], + device='cuda:2'), covar=tensor([0.0349, 0.0236, 0.0183, 0.0205, 0.0293, 0.0260, 0.2442, 0.0299], + device='cuda:2'), in_proj_covar=tensor([0.0096, 0.0070, 0.0071, 0.0062, 0.0090, 0.0075, 0.0127, 0.0097], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 19:57:55,052 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=33859.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:57:56,155 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.9274, 1.0217, 1.2728, 1.2055, 1.1663, 1.3439, 0.9444, 0.7991], + device='cuda:2'), covar=tensor([0.0022, 0.0033, 0.0021, 0.0026, 0.0023, 0.0028, 0.0022, 0.0039], + device='cuda:2'), in_proj_covar=tensor([0.0017, 0.0016, 0.0016, 0.0017, 0.0017, 0.0018, 0.0018, 0.0017], + device='cuda:2'), out_proj_covar=tensor([1.7483e-05, 1.7199e-05, 1.6822e-05, 1.7545e-05, 1.7159e-05, 1.7929e-05, + 1.9170e-05, 1.9699e-05], device='cuda:2') +2022-11-15 19:58:15,456 INFO [train.py:876] (2/4) Epoch 5, batch 4800, loss[loss=0.1884, simple_loss=0.1949, pruned_loss=0.09098, over 5563.00 frames. ], tot_loss[loss=0.177, simple_loss=0.1808, pruned_loss=0.08663, over 1086112.61 frames. ], batch size: 16, lr: 1.55e-02, grad_scale: 8.0 +2022-11-15 19:58:28,775 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.6946, 4.4953, 4.5557, 3.3854, 4.5387, 4.6230, 1.8893, 4.9632], + device='cuda:2'), covar=tensor([0.0251, 0.0362, 0.0265, 0.0592, 0.0341, 0.0413, 0.3007, 0.0249], + device='cuda:2'), in_proj_covar=tensor([0.0095, 0.0069, 0.0070, 0.0061, 0.0087, 0.0074, 0.0124, 0.0095], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 19:58:33,232 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.053e+02 1.864e+02 2.250e+02 2.859e+02 4.870e+02, threshold=4.500e+02, percent-clipped=2.0 +2022-11-15 19:58:49,521 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=33939.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:59:08,139 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.7177, 1.9375, 1.8599, 1.7354, 1.8896, 1.9223, 0.9106, 1.9576], + device='cuda:2'), covar=tensor([0.0369, 0.0208, 0.0243, 0.0238, 0.0295, 0.0223, 0.1864, 0.0267], + device='cuda:2'), in_proj_covar=tensor([0.0095, 0.0069, 0.0070, 0.0061, 0.0087, 0.0073, 0.0124, 0.0096], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 19:59:23,238 INFO [train.py:876] (2/4) Epoch 5, batch 4900, loss[loss=0.1639, simple_loss=0.1785, pruned_loss=0.07464, over 5558.00 frames. ], tot_loss[loss=0.1753, simple_loss=0.1802, pruned_loss=0.08522, over 1085991.51 frames. ], batch size: 30, lr: 1.55e-02, grad_scale: 8.0 +2022-11-15 19:59:31,339 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=34000.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 19:59:34,076 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.11 vs. limit=5.0 +2022-11-15 19:59:34,224 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.29 vs. limit=2.0 +2022-11-15 19:59:41,366 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.343e+02 1.963e+02 2.433e+02 3.223e+02 8.796e+02, threshold=4.867e+02, percent-clipped=10.0 +2022-11-15 19:59:53,929 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=34033.0, num_to_drop=1, layers_to_drop={3} +2022-11-15 20:00:01,249 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=34044.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:00:32,074 INFO [train.py:876] (2/4) Epoch 5, batch 5000, loss[loss=0.1532, simple_loss=0.1749, pruned_loss=0.06579, over 5569.00 frames. ], tot_loss[loss=0.1733, simple_loss=0.1784, pruned_loss=0.08407, over 1086925.52 frames. ], batch size: 25, lr: 1.55e-02, grad_scale: 8.0 +2022-11-15 20:00:35,466 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.2183, 1.9495, 2.1615, 1.0902, 1.1198, 2.7102, 1.8604, 1.1785], + device='cuda:2'), covar=tensor([0.0569, 0.0468, 0.0280, 0.1835, 0.2227, 0.1223, 0.0821, 0.0724], + device='cuda:2'), in_proj_covar=tensor([0.0045, 0.0039, 0.0040, 0.0051, 0.0042, 0.0035, 0.0037, 0.0039], + device='cuda:2'), out_proj_covar=tensor([8.4834e-05, 7.4822e-05, 7.4583e-05, 1.0026e-04, 8.1925e-05, 7.3080e-05, + 7.2440e-05, 7.5670e-05], device='cuda:2') +2022-11-15 20:00:42,671 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=34105.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:00:49,472 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.164e+02 1.808e+02 2.328e+02 2.773e+02 5.652e+02, threshold=4.656e+02, percent-clipped=1.0 +2022-11-15 20:00:52,565 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.0646, 4.5239, 4.9026, 4.5315, 5.1251, 4.8962, 4.4269, 5.0318], + device='cuda:2'), covar=tensor([0.0315, 0.0216, 0.0336, 0.0251, 0.0302, 0.0104, 0.0232, 0.0201], + device='cuda:2'), in_proj_covar=tensor([0.0100, 0.0109, 0.0083, 0.0113, 0.0114, 0.0067, 0.0094, 0.0106], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 20:00:58,880 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=34128.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:01:02,261 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.7009, 1.8095, 1.8691, 2.3963, 2.6612, 1.9278, 1.5590, 2.8475], + device='cuda:2'), covar=tensor([0.0577, 0.2398, 0.1984, 0.1409, 0.0755, 0.2611, 0.2351, 0.0434], + device='cuda:2'), in_proj_covar=tensor([0.0177, 0.0210, 0.0212, 0.0318, 0.0211, 0.0223, 0.0203, 0.0172], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-15 20:01:10,324 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=34145.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:01:31,152 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=34176.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:01:40,501 INFO [train.py:876] (2/4) Epoch 5, batch 5100, loss[loss=0.1614, simple_loss=0.1793, pruned_loss=0.07172, over 5701.00 frames. ], tot_loss[loss=0.1751, simple_loss=0.1798, pruned_loss=0.08516, over 1094057.23 frames. ], batch size: 28, lr: 1.54e-02, grad_scale: 8.0 +2022-11-15 20:01:43,355 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.3739, 5.0883, 5.3987, 4.9308, 5.3900, 5.4486, 4.7020, 5.5719], + device='cuda:2'), covar=tensor([0.0387, 0.0217, 0.0285, 0.0253, 0.0426, 0.0087, 0.0222, 0.0180], + device='cuda:2'), in_proj_covar=tensor([0.0102, 0.0110, 0.0085, 0.0115, 0.0114, 0.0069, 0.0096, 0.0107], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 20:01:50,719 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.6797, 2.0189, 2.2772, 3.4410, 3.2996, 2.4210, 2.0099, 3.5669], + device='cuda:2'), covar=tensor([0.0304, 0.2997, 0.2408, 0.2156, 0.1086, 0.3194, 0.2082, 0.0412], + device='cuda:2'), in_proj_covar=tensor([0.0180, 0.0214, 0.0214, 0.0318, 0.0215, 0.0227, 0.0204, 0.0174], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-15 20:01:52,694 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=34206.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:01:58,449 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.276e+02 1.924e+02 2.234e+02 2.951e+02 5.133e+02, threshold=4.468e+02, percent-clipped=1.0 +2022-11-15 20:01:58,757 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.32 vs. limit=2.0 +2022-11-15 20:02:18,229 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.44 vs. limit=2.0 +2022-11-15 20:02:49,028 INFO [train.py:876] (2/4) Epoch 5, batch 5200, loss[loss=0.1841, simple_loss=0.1894, pruned_loss=0.08935, over 5693.00 frames. ], tot_loss[loss=0.1747, simple_loss=0.1798, pruned_loss=0.08481, over 1090895.63 frames. ], batch size: 34, lr: 1.54e-02, grad_scale: 8.0 +2022-11-15 20:02:53,492 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=34295.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:03:07,060 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.117e+02 1.891e+02 2.372e+02 3.198e+02 5.762e+02, threshold=4.744e+02, percent-clipped=5.0 +2022-11-15 20:03:19,806 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=34333.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:03:47,943 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.75 vs. limit=2.0 +2022-11-15 20:03:52,327 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=34381.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:03:57,990 INFO [train.py:876] (2/4) Epoch 5, batch 5300, loss[loss=0.2345, simple_loss=0.2194, pruned_loss=0.1248, over 5378.00 frames. ], tot_loss[loss=0.1758, simple_loss=0.1808, pruned_loss=0.08545, over 1085890.94 frames. ], batch size: 70, lr: 1.54e-02, grad_scale: 8.0 +2022-11-15 20:04:01,412 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.7772, 2.1183, 2.6501, 3.5825, 3.8241, 2.7599, 2.4068, 3.6997], + device='cuda:2'), covar=tensor([0.0245, 0.2791, 0.1849, 0.2656, 0.0600, 0.2290, 0.1770, 0.0185], + device='cuda:2'), in_proj_covar=tensor([0.0175, 0.0208, 0.0207, 0.0311, 0.0209, 0.0221, 0.0197, 0.0170], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-15 20:04:05,523 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=34400.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:04:08,552 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.6773, 1.8394, 2.2804, 1.3593, 1.6152, 2.6604, 1.9676, 0.8400], + device='cuda:2'), covar=tensor([0.0518, 0.0731, 0.0613, 0.2012, 0.2273, 0.2193, 0.0673, 0.1068], + device='cuda:2'), in_proj_covar=tensor([0.0047, 0.0042, 0.0044, 0.0053, 0.0045, 0.0039, 0.0039, 0.0042], + device='cuda:2'), out_proj_covar=tensor([8.8736e-05, 8.0034e-05, 8.1445e-05, 1.0487e-04, 8.6420e-05, 7.9690e-05, + 7.6538e-05, 8.0982e-05], device='cuda:2') +2022-11-15 20:04:15,583 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.165e+02 1.782e+02 2.113e+02 2.792e+02 4.181e+02, threshold=4.226e+02, percent-clipped=0.0 +2022-11-15 20:04:34,239 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.3222, 3.3197, 3.1152, 2.8966, 2.0371, 3.4161, 2.0107, 2.8914], + device='cuda:2'), covar=tensor([0.0294, 0.0134, 0.0125, 0.0206, 0.0319, 0.0081, 0.0303, 0.0083], + device='cuda:2'), in_proj_covar=tensor([0.0166, 0.0120, 0.0134, 0.0148, 0.0160, 0.0130, 0.0151, 0.0115], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 20:05:06,441 INFO [train.py:876] (2/4) Epoch 5, batch 5400, loss[loss=0.1679, simple_loss=0.1771, pruned_loss=0.07938, over 5517.00 frames. ], tot_loss[loss=0.1767, simple_loss=0.1814, pruned_loss=0.08598, over 1086963.49 frames. ], batch size: 17, lr: 1.54e-02, grad_scale: 8.0 +2022-11-15 20:05:14,778 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=34501.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:05:21,943 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.34 vs. limit=2.0 +2022-11-15 20:05:22,423 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.0353, 4.5012, 3.9265, 4.6359, 4.5258, 3.6452, 3.9912, 3.7152], + device='cuda:2'), covar=tensor([0.0329, 0.0528, 0.1764, 0.0333, 0.0451, 0.0546, 0.0567, 0.0634], + device='cuda:2'), in_proj_covar=tensor([0.0115, 0.0141, 0.0225, 0.0137, 0.0170, 0.0145, 0.0149, 0.0131], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0004, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 20:05:24,229 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.200e+02 1.783e+02 2.368e+02 3.183e+02 6.760e+02, threshold=4.736e+02, percent-clipped=8.0 +2022-11-15 20:06:01,286 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.43 vs. limit=2.0 +2022-11-15 20:06:11,728 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.99 vs. limit=2.0 +2022-11-15 20:06:14,516 INFO [train.py:876] (2/4) Epoch 5, batch 5500, loss[loss=0.2577, simple_loss=0.234, pruned_loss=0.1407, over 5578.00 frames. ], tot_loss[loss=0.174, simple_loss=0.1796, pruned_loss=0.08419, over 1082553.31 frames. ], batch size: 46, lr: 1.53e-02, grad_scale: 8.0 +2022-11-15 20:06:17,328 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=34593.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:06:18,640 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=34595.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:06:32,592 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.119e+02 1.877e+02 2.418e+02 2.886e+02 5.617e+02, threshold=4.837e+02, percent-clipped=2.0 +2022-11-15 20:06:37,684 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=34622.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:06:40,319 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=34626.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:06:51,544 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=34643.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:06:59,163 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=34654.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:07:04,485 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.2960, 4.8927, 3.4009, 4.6307, 3.6907, 3.1311, 2.5001, 4.2240], + device='cuda:2'), covar=tensor([0.1511, 0.0109, 0.0780, 0.0165, 0.0364, 0.0820, 0.1623, 0.0150], + device='cuda:2'), in_proj_covar=tensor([0.0176, 0.0127, 0.0166, 0.0128, 0.0163, 0.0177, 0.0179, 0.0134], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 20:07:19,519 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=34683.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 20:07:22,498 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=34687.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:07:23,628 INFO [train.py:876] (2/4) Epoch 5, batch 5600, loss[loss=0.3097, simple_loss=0.2513, pruned_loss=0.184, over 3156.00 frames. ], tot_loss[loss=0.175, simple_loss=0.1806, pruned_loss=0.08471, over 1084116.97 frames. ], batch size: 284, lr: 1.53e-02, grad_scale: 8.0 +2022-11-15 20:07:31,131 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=34700.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:07:37,940 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=34710.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 20:07:41,446 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.115e+02 1.835e+02 2.169e+02 2.808e+02 5.282e+02, threshold=4.338e+02, percent-clipped=2.0 +2022-11-15 20:07:42,953 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.4183, 2.2952, 2.2154, 3.3835, 3.4263, 2.5260, 2.0704, 3.7310], + device='cuda:2'), covar=tensor([0.0327, 0.2193, 0.2374, 0.1950, 0.0744, 0.2391, 0.2051, 0.0264], + device='cuda:2'), in_proj_covar=tensor([0.0177, 0.0209, 0.0211, 0.0313, 0.0211, 0.0221, 0.0200, 0.0169], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-15 20:08:03,954 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=34748.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:08:13,387 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.8848, 1.5115, 1.6210, 1.0980, 0.9634, 2.4319, 1.8744, 0.7208], + device='cuda:2'), covar=tensor([0.0498, 0.0796, 0.0528, 0.2124, 0.3105, 0.0599, 0.0770, 0.1032], + device='cuda:2'), in_proj_covar=tensor([0.0048, 0.0043, 0.0046, 0.0053, 0.0044, 0.0039, 0.0040, 0.0042], + device='cuda:2'), out_proj_covar=tensor([9.0517e-05, 8.1176e-05, 8.5074e-05, 1.0644e-04, 8.6955e-05, 8.0154e-05, + 7.8748e-05, 8.1344e-05], device='cuda:2') +2022-11-15 20:08:19,718 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=34771.0, num_to_drop=1, layers_to_drop={2} +2022-11-15 20:08:32,090 INFO [train.py:876] (2/4) Epoch 5, batch 5700, loss[loss=0.2142, simple_loss=0.2229, pruned_loss=0.1027, over 5549.00 frames. ], tot_loss[loss=0.1755, simple_loss=0.1811, pruned_loss=0.08494, over 1085292.89 frames. ], batch size: 21, lr: 1.53e-02, grad_scale: 8.0 +2022-11-15 20:08:40,364 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=34801.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:08:49,699 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.053e+02 1.820e+02 2.216e+02 2.818e+02 4.619e+02, threshold=4.433e+02, percent-clipped=3.0 +2022-11-15 20:09:13,159 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=34849.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:09:40,489 INFO [train.py:876] (2/4) Epoch 5, batch 5800, loss[loss=0.1825, simple_loss=0.1925, pruned_loss=0.0863, over 5771.00 frames. ], tot_loss[loss=0.1763, simple_loss=0.1816, pruned_loss=0.08545, over 1086557.10 frames. ], batch size: 20, lr: 1.53e-02, grad_scale: 8.0 +2022-11-15 20:09:58,513 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.085e+02 1.871e+02 2.247e+02 2.954e+02 6.973e+02, threshold=4.493e+02, percent-clipped=4.0 +2022-11-15 20:10:01,579 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.09 vs. limit=2.0 +2022-11-15 20:10:04,173 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.57 vs. limit=5.0 +2022-11-15 20:10:06,371 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.5263, 5.1284, 3.0349, 4.6821, 3.7373, 3.0750, 2.5643, 4.3636], + device='cuda:2'), covar=tensor([0.1900, 0.0230, 0.1324, 0.0296, 0.0495, 0.1189, 0.2062, 0.0190], + device='cuda:2'), in_proj_covar=tensor([0.0180, 0.0133, 0.0174, 0.0133, 0.0172, 0.0184, 0.0187, 0.0140], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-15 20:10:06,768 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.14 vs. limit=2.0 +2022-11-15 20:10:08,692 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.7996, 0.9581, 1.3759, 0.8472, 1.2851, 1.5094, 1.0009, 1.8774], + device='cuda:2'), covar=tensor([0.0028, 0.0030, 0.0019, 0.0020, 0.0019, 0.0017, 0.0029, 0.0028], + device='cuda:2'), in_proj_covar=tensor([0.0026, 0.0025, 0.0027, 0.0026, 0.0024, 0.0024, 0.0026, 0.0022], + device='cuda:2'), out_proj_covar=tensor([2.5799e-05, 2.7949e-05, 2.4832e-05, 2.3298e-05, 2.2206e-05, 1.9872e-05, + 2.9036e-05, 2.0219e-05], device='cuda:2') +2022-11-15 20:10:21,823 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=34949.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:10:26,267 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.7060, 1.6344, 1.5759, 1.6983, 1.8406, 1.6674, 1.9383, 1.8004], + device='cuda:2'), covar=tensor([0.0846, 0.1106, 0.0890, 0.1134, 0.0745, 0.0608, 0.1122, 0.0833], + device='cuda:2'), in_proj_covar=tensor([0.0069, 0.0088, 0.0074, 0.0091, 0.0070, 0.0060, 0.0117, 0.0076], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 20:10:36,573 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.5482, 2.8187, 2.0717, 2.8289, 1.8383, 2.4704, 2.2292, 3.1370], + device='cuda:2'), covar=tensor([0.0548, 0.1008, 0.2049, 0.1869, 0.1414, 0.1519, 0.1471, 0.1185], + device='cuda:2'), in_proj_covar=tensor([0.0055, 0.0063, 0.0079, 0.0052, 0.0064, 0.0058, 0.0071, 0.0051], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 20:10:40,994 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=34978.0, num_to_drop=1, layers_to_drop={3} +2022-11-15 20:10:44,254 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=34982.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:10:48,559 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.03 vs. limit=2.0 +2022-11-15 20:10:48,846 INFO [train.py:876] (2/4) Epoch 5, batch 5900, loss[loss=0.2199, simple_loss=0.2124, pruned_loss=0.1137, over 5798.00 frames. ], tot_loss[loss=0.1757, simple_loss=0.1803, pruned_loss=0.08555, over 1081905.96 frames. ], batch size: 21, lr: 1.53e-02, grad_scale: 8.0 +2022-11-15 20:11:09,453 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.095e+02 2.067e+02 2.510e+02 3.048e+02 6.634e+02, threshold=5.021e+02, percent-clipped=2.0 +2022-11-15 20:11:44,831 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=35066.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 20:11:53,736 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.4117, 3.3094, 3.2161, 3.4960, 3.4736, 2.8737, 3.8700, 3.2280], + device='cuda:2'), covar=tensor([0.0550, 0.0802, 0.0497, 0.0832, 0.0480, 0.0460, 0.0688, 0.0562], + device='cuda:2'), in_proj_covar=tensor([0.0069, 0.0088, 0.0074, 0.0092, 0.0070, 0.0060, 0.0117, 0.0077], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 20:12:00,909 INFO [train.py:876] (2/4) Epoch 5, batch 6000, loss[loss=0.2252, simple_loss=0.2064, pruned_loss=0.122, over 5569.00 frames. ], tot_loss[loss=0.1763, simple_loss=0.1804, pruned_loss=0.08613, over 1083473.11 frames. ], batch size: 54, lr: 1.52e-02, grad_scale: 8.0 +2022-11-15 20:12:00,909 INFO [train.py:899] (2/4) Computing validation loss +2022-11-15 20:12:11,427 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.9165, 1.8228, 1.6472, 1.0695, 0.7038, 2.3787, 1.7363, 1.3670], + device='cuda:2'), covar=tensor([0.0715, 0.0646, 0.0818, 0.2235, 0.3363, 0.0956, 0.1164, 0.1171], + device='cuda:2'), in_proj_covar=tensor([0.0047, 0.0042, 0.0043, 0.0052, 0.0045, 0.0037, 0.0038, 0.0042], + device='cuda:2'), out_proj_covar=tensor([9.0082e-05, 8.0102e-05, 8.2148e-05, 1.0365e-04, 8.7394e-05, 7.7846e-05, + 7.6009e-05, 8.0931e-05], device='cuda:2') +2022-11-15 20:12:18,599 INFO [train.py:908] (2/4) Epoch 5, validation: loss=0.1648, simple_loss=0.1864, pruned_loss=0.07158, over 1530663.00 frames. +2022-11-15 20:12:18,600 INFO [train.py:909] (2/4) Maximum memory allocated so far is 4715MB +2022-11-15 20:12:20,812 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=35092.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:12:34,416 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0488, 2.8771, 2.7505, 2.7722, 1.8902, 2.7968, 1.8265, 2.2986], + device='cuda:2'), covar=tensor([0.0279, 0.0091, 0.0110, 0.0160, 0.0291, 0.0096, 0.0286, 0.0105], + device='cuda:2'), in_proj_covar=tensor([0.0165, 0.0123, 0.0134, 0.0146, 0.0161, 0.0132, 0.0149, 0.0117], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 20:12:36,106 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.732e+01 1.810e+02 2.246e+02 2.914e+02 5.187e+02, threshold=4.493e+02, percent-clipped=1.0 +2022-11-15 20:12:56,842 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.28 vs. limit=2.0 +2022-11-15 20:13:02,251 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=35153.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:13:15,995 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.45 vs. limit=5.0 +2022-11-15 20:13:26,560 INFO [train.py:876] (2/4) Epoch 5, batch 6100, loss[loss=0.2258, simple_loss=0.2157, pruned_loss=0.1179, over 5577.00 frames. ], tot_loss[loss=0.175, simple_loss=0.1799, pruned_loss=0.08501, over 1084843.04 frames. ], batch size: 40, lr: 1.52e-02, grad_scale: 16.0 +2022-11-15 20:13:32,610 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.5585, 2.7561, 2.1228, 2.9642, 2.0120, 2.3501, 2.2630, 3.3308], + device='cuda:2'), covar=tensor([0.0818, 0.1186, 0.2525, 0.0801, 0.2291, 0.1537, 0.1795, 0.1053], + device='cuda:2'), in_proj_covar=tensor([0.0057, 0.0064, 0.0080, 0.0052, 0.0065, 0.0059, 0.0072, 0.0052], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 20:13:44,499 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.122e+02 1.904e+02 2.292e+02 2.845e+02 6.036e+02, threshold=4.585e+02, percent-clipped=4.0 +2022-11-15 20:14:00,482 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.4524, 1.7645, 1.9884, 2.3812, 2.4865, 1.9871, 1.5889, 2.7153], + device='cuda:2'), covar=tensor([0.0790, 0.2355, 0.1618, 0.1792, 0.0788, 0.2216, 0.2074, 0.0494], + device='cuda:2'), in_proj_covar=tensor([0.0181, 0.0210, 0.0207, 0.0311, 0.0216, 0.0220, 0.0197, 0.0172], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-15 20:14:07,827 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=35249.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:14:28,470 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=35278.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 20:14:31,235 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=35282.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:14:36,326 INFO [train.py:876] (2/4) Epoch 5, batch 6200, loss[loss=0.1822, simple_loss=0.183, pruned_loss=0.09068, over 5673.00 frames. ], tot_loss[loss=0.1747, simple_loss=0.1793, pruned_loss=0.08509, over 1082232.83 frames. ], batch size: 36, lr: 1.52e-02, grad_scale: 16.0 +2022-11-15 20:14:41,930 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=35297.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:14:43,067 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.03 vs. limit=5.0 +2022-11-15 20:14:55,053 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.170e+02 1.804e+02 2.260e+02 2.687e+02 5.215e+02, threshold=4.521e+02, percent-clipped=1.0 +2022-11-15 20:15:02,892 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=35326.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:15:05,749 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=35330.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:15:16,306 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=35345.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 20:15:30,944 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=35366.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 20:15:36,813 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=35374.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:15:39,835 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.5274, 2.5916, 2.1956, 3.0357, 1.8200, 2.1064, 2.1358, 2.8610], + device='cuda:2'), covar=tensor([0.0466, 0.1999, 0.2393, 0.0885, 0.1694, 0.1362, 0.1477, 0.1362], + device='cuda:2'), in_proj_covar=tensor([0.0057, 0.0066, 0.0081, 0.0053, 0.0067, 0.0060, 0.0074, 0.0053], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 20:15:42,662 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.2530, 4.5302, 2.8876, 4.3510, 3.5891, 3.0524, 2.4124, 3.8993], + device='cuda:2'), covar=tensor([0.1585, 0.0154, 0.1059, 0.0261, 0.0414, 0.0945, 0.1912, 0.0181], + device='cuda:2'), in_proj_covar=tensor([0.0180, 0.0134, 0.0173, 0.0132, 0.0170, 0.0182, 0.0186, 0.0139], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-15 20:15:47,101 INFO [train.py:876] (2/4) Epoch 5, batch 6300, loss[loss=0.2579, simple_loss=0.2344, pruned_loss=0.1407, over 5358.00 frames. ], tot_loss[loss=0.1771, simple_loss=0.1809, pruned_loss=0.0866, over 1073110.85 frames. ], batch size: 70, lr: 1.52e-02, grad_scale: 16.0 +2022-11-15 20:15:59,511 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=35406.0, num_to_drop=1, layers_to_drop={3} +2022-11-15 20:16:04,764 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=35414.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 20:16:05,294 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.022e+02 1.779e+02 2.287e+02 3.039e+02 6.063e+02, threshold=4.574e+02, percent-clipped=3.0 +2022-11-15 20:16:14,555 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.88 vs. limit=5.0 +2022-11-15 20:16:20,122 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=35435.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:16:28,623 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=35448.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:16:57,615 INFO [train.py:876] (2/4) Epoch 5, batch 6400, loss[loss=0.1283, simple_loss=0.1416, pruned_loss=0.05747, over 5396.00 frames. ], tot_loss[loss=0.1787, simple_loss=0.1821, pruned_loss=0.08763, over 1074617.72 frames. ], batch size: 11, lr: 1.52e-02, grad_scale: 16.0 +2022-11-15 20:17:09,882 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.4152, 5.0372, 4.5431, 5.0504, 4.9755, 4.1929, 4.4918, 4.3491], + device='cuda:2'), covar=tensor([0.0240, 0.0360, 0.1162, 0.0256, 0.0336, 0.0333, 0.0300, 0.0522], + device='cuda:2'), in_proj_covar=tensor([0.0113, 0.0139, 0.0220, 0.0138, 0.0173, 0.0142, 0.0150, 0.0131], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0004, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 20:17:14,787 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.260e+02 1.875e+02 2.323e+02 3.297e+02 5.699e+02, threshold=4.646e+02, percent-clipped=4.0 +2022-11-15 20:17:25,843 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=35530.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:18:05,890 INFO [train.py:876] (2/4) Epoch 5, batch 6500, loss[loss=0.1543, simple_loss=0.1731, pruned_loss=0.06775, over 5546.00 frames. ], tot_loss[loss=0.1791, simple_loss=0.183, pruned_loss=0.08758, over 1077366.18 frames. ], batch size: 13, lr: 1.51e-02, grad_scale: 16.0 +2022-11-15 20:18:07,320 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=35591.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:18:23,696 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.209e+02 1.920e+02 2.418e+02 3.178e+02 5.825e+02, threshold=4.835e+02, percent-clipped=5.0 +2022-11-15 20:18:53,976 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=6.75 vs. limit=5.0 +2022-11-15 20:19:14,318 INFO [train.py:876] (2/4) Epoch 5, batch 6600, loss[loss=0.2083, simple_loss=0.1995, pruned_loss=0.1085, over 5633.00 frames. ], tot_loss[loss=0.1785, simple_loss=0.1823, pruned_loss=0.0873, over 1078004.65 frames. ], batch size: 32, lr: 1.51e-02, grad_scale: 16.0 +2022-11-15 20:19:22,596 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=35701.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 20:19:31,773 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.292e+02 1.753e+02 2.153e+02 2.756e+02 5.336e+02, threshold=4.306e+02, percent-clipped=2.0 +2022-11-15 20:19:42,133 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=35730.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:19:54,543 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=35748.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:20:09,971 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=35771.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:20:13,187 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0329, 4.3863, 2.7851, 4.1547, 3.4209, 2.8365, 2.0768, 3.6870], + device='cuda:2'), covar=tensor([0.1819, 0.0142, 0.1180, 0.0214, 0.0483, 0.1150, 0.2186, 0.0231], + device='cuda:2'), in_proj_covar=tensor([0.0180, 0.0134, 0.0172, 0.0130, 0.0172, 0.0184, 0.0187, 0.0142], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-15 20:20:21,561 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.9946, 2.9082, 3.1736, 1.1239, 3.0980, 3.4615, 3.1381, 3.7265], + device='cuda:2'), covar=tensor([0.1783, 0.1286, 0.0525, 0.2776, 0.0309, 0.0321, 0.0294, 0.0498], + device='cuda:2'), in_proj_covar=tensor([0.0182, 0.0184, 0.0140, 0.0193, 0.0150, 0.0149, 0.0135, 0.0176], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0002, 0.0003], + device='cuda:2') +2022-11-15 20:20:22,009 INFO [train.py:876] (2/4) Epoch 5, batch 6700, loss[loss=0.1701, simple_loss=0.1876, pruned_loss=0.07627, over 5755.00 frames. ], tot_loss[loss=0.1762, simple_loss=0.1805, pruned_loss=0.0859, over 1076009.13 frames. ], batch size: 16, lr: 1.51e-02, grad_scale: 16.0 +2022-11-15 20:20:26,992 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=35796.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:20:38,041 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=35811.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:20:40,468 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.143e+02 1.856e+02 2.372e+02 2.960e+02 5.756e+02, threshold=4.743e+02, percent-clipped=4.0 +2022-11-15 20:20:52,211 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=35832.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:21:19,391 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=35872.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:21:20,196 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.93 vs. limit=2.0 +2022-11-15 20:21:27,493 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.00 vs. limit=2.0 +2022-11-15 20:21:28,458 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=35886.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:21:30,764 INFO [train.py:876] (2/4) Epoch 5, batch 6800, loss[loss=0.1443, simple_loss=0.1589, pruned_loss=0.06491, over 5471.00 frames. ], tot_loss[loss=0.1755, simple_loss=0.1801, pruned_loss=0.08552, over 1077336.56 frames. ], batch size: 12, lr: 1.51e-02, grad_scale: 16.0 +2022-11-15 20:21:48,383 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.265e+02 1.967e+02 2.535e+02 3.123e+02 6.625e+02, threshold=5.070e+02, percent-clipped=2.0 +2022-11-15 20:22:01,207 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.83 vs. limit=2.0 +2022-11-15 20:22:09,120 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.5999, 3.7464, 3.7643, 3.4649, 3.7093, 3.4835, 1.4096, 3.7840], + device='cuda:2'), covar=tensor([0.0523, 0.0421, 0.0286, 0.0376, 0.0456, 0.0471, 0.3584, 0.0482], + device='cuda:2'), in_proj_covar=tensor([0.0100, 0.0073, 0.0075, 0.0065, 0.0090, 0.0078, 0.0132, 0.0101], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 20:22:38,607 INFO [train.py:876] (2/4) Epoch 5, batch 6900, loss[loss=0.156, simple_loss=0.1783, pruned_loss=0.06687, over 5690.00 frames. ], tot_loss[loss=0.1743, simple_loss=0.1797, pruned_loss=0.08448, over 1080374.08 frames. ], batch size: 28, lr: 1.51e-02, grad_scale: 16.0 +2022-11-15 20:22:46,845 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=36001.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 20:22:56,602 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.081e+02 1.741e+02 2.226e+02 2.702e+02 5.830e+02, threshold=4.452e+02, percent-clipped=1.0 +2022-11-15 20:23:02,473 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.5067, 0.9834, 1.5136, 0.6473, 1.0340, 0.9194, 0.5862, 1.2392], + device='cuda:2'), covar=tensor([0.0027, 0.0017, 0.0031, 0.0024, 0.0016, 0.0026, 0.0052, 0.0019], + device='cuda:2'), in_proj_covar=tensor([0.0030, 0.0027, 0.0030, 0.0028, 0.0026, 0.0026, 0.0028, 0.0024], + device='cuda:2'), out_proj_covar=tensor([2.8868e-05, 2.8455e-05, 2.7283e-05, 2.5649e-05, 2.3936e-05, 2.2482e-05, + 3.0955e-05, 2.1013e-05], device='cuda:2') +2022-11-15 20:23:07,222 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=36030.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:23:10,005 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.2097, 3.5123, 3.1386, 3.1459, 2.1715, 3.4372, 2.1306, 2.9090], + device='cuda:2'), covar=tensor([0.0338, 0.0106, 0.0125, 0.0197, 0.0383, 0.0101, 0.0356, 0.0099], + device='cuda:2'), in_proj_covar=tensor([0.0169, 0.0128, 0.0138, 0.0151, 0.0165, 0.0137, 0.0154, 0.0123], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 20:23:13,895 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.2203, 2.5264, 1.7895, 2.7867, 1.6792, 2.1889, 2.2187, 2.9113], + device='cuda:2'), covar=tensor([0.0725, 0.2016, 0.3048, 0.0851, 0.3008, 0.2914, 0.1964, 0.0948], + device='cuda:2'), in_proj_covar=tensor([0.0060, 0.0066, 0.0083, 0.0054, 0.0068, 0.0060, 0.0075, 0.0054], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 20:23:19,732 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=36049.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 20:23:39,888 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=36078.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:23:42,957 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.3346, 1.5196, 1.6041, 0.9647, 0.7284, 2.4877, 1.7144, 1.2446], + device='cuda:2'), covar=tensor([0.0507, 0.0485, 0.0486, 0.1522, 0.1645, 0.0564, 0.0933, 0.0733], + device='cuda:2'), in_proj_covar=tensor([0.0046, 0.0040, 0.0042, 0.0051, 0.0044, 0.0036, 0.0038, 0.0040], + device='cuda:2'), out_proj_covar=tensor([8.9259e-05, 7.8613e-05, 8.1841e-05, 1.0170e-04, 8.7739e-05, 7.6183e-05, + 7.7498e-05, 7.9353e-05], device='cuda:2') +2022-11-15 20:23:47,622 INFO [train.py:876] (2/4) Epoch 5, batch 7000, loss[loss=0.198, simple_loss=0.1938, pruned_loss=0.1011, over 5718.00 frames. ], tot_loss[loss=0.1769, simple_loss=0.181, pruned_loss=0.08639, over 1084220.85 frames. ], batch size: 19, lr: 1.50e-02, grad_scale: 16.0 +2022-11-15 20:24:01,876 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.8889, 0.6606, 1.6518, 0.5448, 1.1394, 0.8451, 0.7398, 1.3970], + device='cuda:2'), covar=tensor([0.0037, 0.0046, 0.0021, 0.0052, 0.0022, 0.0040, 0.0044, 0.0022], + device='cuda:2'), in_proj_covar=tensor([0.0030, 0.0027, 0.0029, 0.0028, 0.0026, 0.0026, 0.0028, 0.0024], + device='cuda:2'), out_proj_covar=tensor([2.8728e-05, 2.8552e-05, 2.7018e-05, 2.5752e-05, 2.3453e-05, 2.2361e-05, + 3.0555e-05, 2.1059e-05], device='cuda:2') +2022-11-15 20:24:05,016 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.119e+02 1.861e+02 2.290e+02 2.878e+02 5.762e+02, threshold=4.579e+02, percent-clipped=5.0 +2022-11-15 20:24:13,301 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=36127.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:24:40,735 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=36167.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:24:51,024 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=36181.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:24:54,724 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=36186.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:24:56,579 INFO [train.py:876] (2/4) Epoch 5, batch 7100, loss[loss=0.2085, simple_loss=0.1935, pruned_loss=0.1117, over 5211.00 frames. ], tot_loss[loss=0.1757, simple_loss=0.1804, pruned_loss=0.08553, over 1084171.73 frames. ], batch size: 91, lr: 1.50e-02, grad_scale: 16.0 +2022-11-15 20:25:14,390 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.219e+01 1.811e+02 2.272e+02 2.779e+02 4.389e+02, threshold=4.544e+02, percent-clipped=0.0 +2022-11-15 20:25:16,565 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.9726, 0.9538, 1.3591, 0.4878, 1.1407, 0.8752, 0.8544, 1.3091], + device='cuda:2'), covar=tensor([0.0021, 0.0028, 0.0021, 0.0032, 0.0018, 0.0028, 0.0028, 0.0020], + device='cuda:2'), in_proj_covar=tensor([0.0030, 0.0027, 0.0030, 0.0029, 0.0027, 0.0027, 0.0028, 0.0024], + device='cuda:2'), out_proj_covar=tensor([2.9457e-05, 2.9232e-05, 2.7603e-05, 2.6273e-05, 2.4335e-05, 2.2932e-05, + 3.0916e-05, 2.1564e-05], device='cuda:2') +2022-11-15 20:25:27,659 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=36234.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:25:33,405 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=36242.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:25:51,160 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.23 vs. limit=2.0 +2022-11-15 20:26:05,726 INFO [train.py:876] (2/4) Epoch 5, batch 7200, loss[loss=0.1748, simple_loss=0.1773, pruned_loss=0.08618, over 5759.00 frames. ], tot_loss[loss=0.176, simple_loss=0.1806, pruned_loss=0.08567, over 1084119.10 frames. ], batch size: 13, lr: 1.50e-02, grad_scale: 16.0 +2022-11-15 20:26:10,466 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.4537, 4.1692, 4.1249, 3.5827, 4.4289, 4.2594, 1.6748, 4.7305], + device='cuda:2'), covar=tensor([0.0278, 0.0390, 0.0262, 0.0526, 0.0409, 0.0389, 0.2945, 0.0340], + device='cuda:2'), in_proj_covar=tensor([0.0098, 0.0073, 0.0074, 0.0064, 0.0090, 0.0078, 0.0130, 0.0097], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 20:26:10,662 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.11 vs. limit=2.0 +2022-11-15 20:26:22,596 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.732e+01 1.770e+02 2.199e+02 2.603e+02 4.829e+02, threshold=4.399e+02, percent-clipped=1.0 +2022-11-15 20:27:38,814 INFO [train.py:876] (2/4) Epoch 6, batch 0, loss[loss=0.09295, simple_loss=0.1113, pruned_loss=0.03728, over 5202.00 frames. ], tot_loss[loss=0.09295, simple_loss=0.1113, pruned_loss=0.03728, over 5202.00 frames. ], batch size: 8, lr: 1.40e-02, grad_scale: 16.0 +2022-11-15 20:27:38,814 INFO [train.py:899] (2/4) Computing validation loss +2022-11-15 20:27:53,250 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.0002, 1.2780, 1.4876, 1.1243, 1.4287, 0.7439, 0.7384, 1.2834], + device='cuda:2'), covar=tensor([0.0030, 0.0016, 0.0027, 0.0023, 0.0026, 0.0045, 0.0044, 0.0030], + device='cuda:2'), in_proj_covar=tensor([0.0029, 0.0026, 0.0028, 0.0027, 0.0026, 0.0026, 0.0027, 0.0023], + device='cuda:2'), out_proj_covar=tensor([2.7984e-05, 2.7969e-05, 2.6033e-05, 2.4886e-05, 2.3148e-05, 2.2181e-05, + 3.0083e-05, 2.0988e-05], device='cuda:2') +2022-11-15 20:27:55,407 INFO [train.py:908] (2/4) Epoch 6, validation: loss=0.1637, simple_loss=0.1861, pruned_loss=0.07065, over 1530663.00 frames. +2022-11-15 20:27:55,407 INFO [train.py:909] (2/4) Maximum memory allocated so far is 4715MB +2022-11-15 20:27:58,979 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.4699, 1.9248, 3.2144, 2.6919, 3.0811, 2.2604, 2.7357, 3.3182], + device='cuda:2'), covar=tensor([0.0360, 0.1109, 0.0430, 0.0985, 0.0370, 0.0907, 0.0811, 0.0455], + device='cuda:2'), in_proj_covar=tensor([0.0196, 0.0191, 0.0186, 0.0211, 0.0180, 0.0191, 0.0221, 0.0205], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004, 0.0004], + device='cuda:2') +2022-11-15 20:28:23,016 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.2573, 4.6919, 5.0529, 4.6602, 5.3092, 5.1881, 4.5748, 5.2852], + device='cuda:2'), covar=tensor([0.0338, 0.0215, 0.0427, 0.0238, 0.0303, 0.0081, 0.0179, 0.0174], + device='cuda:2'), in_proj_covar=tensor([0.0101, 0.0111, 0.0085, 0.0111, 0.0119, 0.0069, 0.0096, 0.0108], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 20:28:31,820 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.207e+02 1.895e+02 2.226e+02 2.646e+02 4.624e+02, threshold=4.452e+02, percent-clipped=2.0 +2022-11-15 20:28:40,318 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=36427.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:28:54,193 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.0012, 1.1333, 1.2032, 0.6667, 1.0033, 1.1761, 0.8112, 1.3471], + device='cuda:2'), covar=tensor([0.0034, 0.0027, 0.0023, 0.0028, 0.0025, 0.0036, 0.0041, 0.0051], + device='cuda:2'), in_proj_covar=tensor([0.0028, 0.0026, 0.0028, 0.0027, 0.0025, 0.0025, 0.0027, 0.0023], + device='cuda:2'), out_proj_covar=tensor([2.7423e-05, 2.7786e-05, 2.5429e-05, 2.4078e-05, 2.2531e-05, 2.1688e-05, + 2.9421e-05, 2.0311e-05], device='cuda:2') +2022-11-15 20:28:56,897 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.01 vs. limit=2.0 +2022-11-15 20:29:03,068 INFO [train.py:876] (2/4) Epoch 6, batch 100, loss[loss=0.1535, simple_loss=0.1677, pruned_loss=0.06971, over 5735.00 frames. ], tot_loss[loss=0.1694, simple_loss=0.1765, pruned_loss=0.08112, over 433528.92 frames. ], batch size: 13, lr: 1.40e-02, grad_scale: 16.0 +2022-11-15 20:29:07,142 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=36467.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:29:12,766 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=36475.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:29:36,912 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.2775, 3.8672, 4.1167, 3.7780, 4.3102, 4.0188, 3.9367, 4.3219], + device='cuda:2'), covar=tensor([0.0346, 0.0278, 0.0408, 0.0311, 0.0349, 0.0287, 0.0257, 0.0287], + device='cuda:2'), in_proj_covar=tensor([0.0103, 0.0115, 0.0088, 0.0115, 0.0123, 0.0071, 0.0099, 0.0110], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 20:29:40,205 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.130e+02 1.873e+02 2.283e+02 2.904e+02 6.033e+02, threshold=4.566e+02, percent-clipped=4.0 +2022-11-15 20:29:40,288 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=36515.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:29:55,306 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=36537.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:30:03,657 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.4935, 2.5777, 2.0734, 2.9746, 1.8315, 2.7107, 2.7128, 3.1599], + device='cuda:2'), covar=tensor([0.0754, 0.1665, 0.2775, 0.1229, 0.2091, 0.0861, 0.1547, 0.2872], + device='cuda:2'), in_proj_covar=tensor([0.0060, 0.0068, 0.0085, 0.0056, 0.0068, 0.0062, 0.0076, 0.0054], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 20:30:11,765 INFO [train.py:876] (2/4) Epoch 6, batch 200, loss[loss=0.1668, simple_loss=0.1764, pruned_loss=0.0786, over 5320.00 frames. ], tot_loss[loss=0.1681, simple_loss=0.1753, pruned_loss=0.08049, over 690813.47 frames. ], batch size: 79, lr: 1.39e-02, grad_scale: 16.0 +2022-11-15 20:30:21,236 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.0399, 1.9486, 2.0782, 3.1186, 2.9471, 2.1579, 1.8431, 3.2887], + device='cuda:2'), covar=tensor([0.0714, 0.2808, 0.2423, 0.1932, 0.1104, 0.3166, 0.2062, 0.0597], + device='cuda:2'), in_proj_covar=tensor([0.0183, 0.0211, 0.0205, 0.0319, 0.0216, 0.0220, 0.0194, 0.0178], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0006, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-15 20:30:35,236 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.83 vs. limit=2.0 +2022-11-15 20:30:42,303 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=36605.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:30:44,368 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=36608.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:30:46,966 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=36612.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:30:48,739 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.138e+02 1.665e+02 2.149e+02 2.847e+02 6.157e+02, threshold=4.299e+02, percent-clipped=2.0 +2022-11-15 20:30:55,114 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.00 vs. limit=2.0 +2022-11-15 20:31:07,666 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.7971, 2.4844, 2.1024, 1.1707, 2.0601, 2.7323, 2.2118, 2.6642], + device='cuda:2'), covar=tensor([0.1585, 0.1261, 0.1357, 0.2549, 0.0605, 0.0379, 0.0313, 0.0694], + device='cuda:2'), in_proj_covar=tensor([0.0182, 0.0182, 0.0143, 0.0192, 0.0152, 0.0150, 0.0132, 0.0180], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0002, 0.0003], + device='cuda:2') +2022-11-15 20:31:08,941 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.08 vs. limit=5.0 +2022-11-15 20:31:16,085 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.2914, 1.2297, 1.5559, 1.1277, 1.3295, 1.1941, 1.2387, 0.7384], + device='cuda:2'), covar=tensor([0.0017, 0.0033, 0.0016, 0.0024, 0.0025, 0.0033, 0.0018, 0.0043], + device='cuda:2'), in_proj_covar=tensor([0.0017, 0.0016, 0.0016, 0.0019, 0.0017, 0.0016, 0.0018, 0.0019], + device='cuda:2'), out_proj_covar=tensor([1.7001e-05, 1.6855e-05, 1.6310e-05, 1.8575e-05, 1.6550e-05, 1.7007e-05, + 1.8442e-05, 2.1039e-05], device='cuda:2') +2022-11-15 20:31:20,154 INFO [train.py:876] (2/4) Epoch 6, batch 300, loss[loss=0.2069, simple_loss=0.204, pruned_loss=0.1049, over 5626.00 frames. ], tot_loss[loss=0.1708, simple_loss=0.1772, pruned_loss=0.08219, over 845725.65 frames. ], batch size: 38, lr: 1.39e-02, grad_scale: 16.0 +2022-11-15 20:31:23,595 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=36666.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:31:25,561 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=36669.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:31:28,127 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=36673.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:31:31,285 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.1598, 1.1924, 1.4622, 0.3468, 1.4767, 1.1074, 0.8150, 1.2121], + device='cuda:2'), covar=tensor([0.0041, 0.0043, 0.0109, 0.0035, 0.0025, 0.0045, 0.0032, 0.0037], + device='cuda:2'), in_proj_covar=tensor([0.0029, 0.0027, 0.0028, 0.0027, 0.0025, 0.0026, 0.0027, 0.0023], + device='cuda:2'), out_proj_covar=tensor([2.7608e-05, 2.8675e-05, 2.5413e-05, 2.4653e-05, 2.2363e-05, 2.1900e-05, + 3.0371e-05, 2.0652e-05], device='cuda:2') +2022-11-15 20:31:36,361 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.9213, 3.8508, 3.7695, 3.6075, 3.8585, 3.5039, 1.6624, 3.9261], + device='cuda:2'), covar=tensor([0.0344, 0.0402, 0.0338, 0.0287, 0.0334, 0.0500, 0.3332, 0.0421], + device='cuda:2'), in_proj_covar=tensor([0.0099, 0.0074, 0.0075, 0.0064, 0.0091, 0.0080, 0.0130, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 20:31:46,955 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=36700.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 20:31:56,887 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.229e+02 1.849e+02 2.219e+02 2.840e+02 6.377e+02, threshold=4.439e+02, percent-clipped=4.0 +2022-11-15 20:32:11,269 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.91 vs. limit=2.0 +2022-11-15 20:32:27,630 INFO [train.py:876] (2/4) Epoch 6, batch 400, loss[loss=0.1785, simple_loss=0.1863, pruned_loss=0.08535, over 5553.00 frames. ], tot_loss[loss=0.1724, simple_loss=0.1783, pruned_loss=0.08326, over 940928.12 frames. ], batch size: 25, lr: 1.39e-02, grad_scale: 16.0 +2022-11-15 20:32:27,805 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=36761.0, num_to_drop=1, layers_to_drop={3} +2022-11-15 20:32:48,883 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.9671, 1.7162, 1.5941, 1.0441, 0.7871, 2.5562, 1.8236, 1.6421], + device='cuda:2'), covar=tensor([0.0735, 0.0588, 0.0645, 0.1835, 0.2542, 0.0704, 0.2049, 0.0609], + device='cuda:2'), in_proj_covar=tensor([0.0049, 0.0042, 0.0043, 0.0051, 0.0045, 0.0038, 0.0040, 0.0041], + device='cuda:2'), out_proj_covar=tensor([9.3797e-05, 8.2275e-05, 8.3763e-05, 1.0443e-04, 9.0672e-05, 8.0232e-05, + 8.0427e-05, 8.2551e-05], device='cuda:2') +2022-11-15 20:32:58,151 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=36805.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 20:33:04,836 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.163e+02 1.826e+02 2.118e+02 2.813e+02 4.458e+02, threshold=4.236e+02, percent-clipped=1.0 +2022-11-15 20:33:20,216 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=36837.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:33:33,669 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.98 vs. limit=2.0 +2022-11-15 20:33:35,937 INFO [train.py:876] (2/4) Epoch 6, batch 500, loss[loss=0.155, simple_loss=0.1678, pruned_loss=0.07105, over 5500.00 frames. ], tot_loss[loss=0.171, simple_loss=0.1778, pruned_loss=0.08207, over 1001793.74 frames. ], batch size: 17, lr: 1.39e-02, grad_scale: 16.0 +2022-11-15 20:33:40,076 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=36866.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 20:33:53,154 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=36885.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:34:03,145 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.8162, 1.8568, 1.5127, 1.8273, 1.8977, 1.7371, 1.7285, 1.7578], + device='cuda:2'), covar=tensor([0.0433, 0.0686, 0.1634, 0.0648, 0.0647, 0.0553, 0.0930, 0.0599], + device='cuda:2'), in_proj_covar=tensor([0.0115, 0.0145, 0.0231, 0.0142, 0.0178, 0.0149, 0.0155, 0.0139], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0004, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 20:34:10,482 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=36911.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:34:13,305 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.178e+02 1.807e+02 2.308e+02 2.926e+02 6.442e+02, threshold=4.616e+02, percent-clipped=7.0 +2022-11-15 20:34:25,498 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.7179, 1.5664, 1.5465, 0.9186, 0.8590, 2.5077, 1.8453, 1.7203], + device='cuda:2'), covar=tensor([0.0628, 0.0945, 0.0771, 0.2126, 0.3794, 0.1786, 0.1105, 0.0811], + device='cuda:2'), in_proj_covar=tensor([0.0050, 0.0043, 0.0045, 0.0054, 0.0048, 0.0039, 0.0042, 0.0043], + device='cuda:2'), out_proj_covar=tensor([9.6986e-05, 8.4061e-05, 8.8023e-05, 1.0875e-04, 9.5075e-05, 8.2677e-05, + 8.4576e-05, 8.5521e-05], device='cuda:2') +2022-11-15 20:34:43,138 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=36958.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:34:44,966 INFO [train.py:876] (2/4) Epoch 6, batch 600, loss[loss=0.205, simple_loss=0.1998, pruned_loss=0.1051, over 5678.00 frames. ], tot_loss[loss=0.1718, simple_loss=0.1781, pruned_loss=0.08279, over 1025876.68 frames. ], batch size: 36, lr: 1.39e-02, grad_scale: 16.0 +2022-11-15 20:34:45,045 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=36961.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:34:47,030 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=36964.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:34:49,621 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=36968.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:34:52,737 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=36972.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:35:06,721 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.99 vs. limit=2.0 +2022-11-15 20:35:13,039 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.85 vs. limit=5.0 +2022-11-15 20:35:15,263 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=37004.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:35:22,241 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.158e+02 1.873e+02 2.337e+02 2.752e+02 4.766e+02, threshold=4.674e+02, percent-clipped=2.0 +2022-11-15 20:35:25,076 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=37019.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:35:36,146 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.3271, 0.8229, 1.9926, 0.8672, 1.4383, 1.3559, 0.7968, 1.5529], + device='cuda:2'), covar=tensor([0.0025, 0.0032, 0.0015, 0.0020, 0.0022, 0.0023, 0.0025, 0.0021], + device='cuda:2'), in_proj_covar=tensor([0.0028, 0.0026, 0.0027, 0.0027, 0.0025, 0.0026, 0.0028, 0.0023], + device='cuda:2'), out_proj_covar=tensor([2.7292e-05, 2.7497e-05, 2.4620e-05, 2.4226e-05, 2.2840e-05, 2.2019e-05, + 3.0765e-05, 2.0743e-05], device='cuda:2') +2022-11-15 20:35:41,079 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.9351, 4.3280, 3.7444, 4.2682, 4.2576, 3.5511, 3.7561, 3.6303], + device='cuda:2'), covar=tensor([0.0310, 0.0399, 0.1502, 0.0397, 0.0405, 0.0435, 0.0553, 0.0561], + device='cuda:2'), in_proj_covar=tensor([0.0115, 0.0146, 0.0233, 0.0143, 0.0180, 0.0151, 0.0157, 0.0139], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0004, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 20:35:51,218 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=37056.0, num_to_drop=1, layers_to_drop={3} +2022-11-15 20:35:54,386 INFO [train.py:876] (2/4) Epoch 6, batch 700, loss[loss=0.1294, simple_loss=0.1461, pruned_loss=0.05632, over 5621.00 frames. ], tot_loss[loss=0.1708, simple_loss=0.1775, pruned_loss=0.08208, over 1054623.03 frames. ], batch size: 32, lr: 1.38e-02, grad_scale: 16.0 +2022-11-15 20:35:57,288 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=37065.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:36:31,398 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.989e+01 2.001e+02 2.497e+02 2.973e+02 6.578e+02, threshold=4.994e+02, percent-clipped=4.0 +2022-11-15 20:37:02,436 INFO [train.py:876] (2/4) Epoch 6, batch 800, loss[loss=0.182, simple_loss=0.1884, pruned_loss=0.08783, over 5590.00 frames. ], tot_loss[loss=0.1701, simple_loss=0.1771, pruned_loss=0.08152, over 1067726.04 frames. ], batch size: 22, lr: 1.38e-02, grad_scale: 16.0 +2022-11-15 20:37:02,891 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=37161.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 20:37:17,130 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.8172, 5.6231, 4.9383, 5.5804, 5.5641, 4.7831, 5.1367, 4.8973], + device='cuda:2'), covar=tensor([0.0208, 0.0283, 0.1036, 0.0305, 0.0414, 0.0249, 0.0191, 0.0237], + device='cuda:2'), in_proj_covar=tensor([0.0115, 0.0146, 0.0232, 0.0142, 0.0178, 0.0150, 0.0157, 0.0140], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0004, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 20:37:31,310 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.2517, 4.2468, 2.6444, 4.1308, 3.2803, 2.8561, 2.3666, 3.7134], + device='cuda:2'), covar=tensor([0.1471, 0.0209, 0.1073, 0.0174, 0.0586, 0.0926, 0.1759, 0.0294], + device='cuda:2'), in_proj_covar=tensor([0.0174, 0.0130, 0.0167, 0.0130, 0.0166, 0.0179, 0.0180, 0.0142], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-15 20:37:40,547 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.103e+02 1.784e+02 2.226e+02 2.677e+02 4.647e+02, threshold=4.452e+02, percent-clipped=0.0 +2022-11-15 20:37:51,466 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.40 vs. limit=5.0 +2022-11-15 20:38:11,380 INFO [train.py:876] (2/4) Epoch 6, batch 900, loss[loss=0.1659, simple_loss=0.1812, pruned_loss=0.07533, over 5748.00 frames. ], tot_loss[loss=0.1695, simple_loss=0.1769, pruned_loss=0.08107, over 1075511.70 frames. ], batch size: 15, lr: 1.38e-02, grad_scale: 16.0 +2022-11-15 20:38:11,499 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=37261.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:38:13,414 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=37264.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:38:15,269 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=37267.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:38:15,953 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=37268.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:38:15,987 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=37268.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:38:43,255 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=37309.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:38:45,294 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=37312.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:38:46,651 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=37314.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:38:48,597 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.068e+02 1.992e+02 2.315e+02 2.795e+02 5.537e+02, threshold=4.630e+02, percent-clipped=3.0 +2022-11-15 20:38:48,678 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=37316.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:38:57,785 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=37329.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:39:15,682 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=37356.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 20:39:18,339 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=37360.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:39:18,956 INFO [train.py:876] (2/4) Epoch 6, batch 1000, loss[loss=0.16, simple_loss=0.1679, pruned_loss=0.07602, over 5318.00 frames. ], tot_loss[loss=0.1691, simple_loss=0.1762, pruned_loss=0.08102, over 1077565.91 frames. ], batch size: 9, lr: 1.38e-02, grad_scale: 16.0 +2022-11-15 20:39:21,059 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=37364.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:39:48,187 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=37404.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 20:39:52,684 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.63 vs. limit=5.0 +2022-11-15 20:39:54,075 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.72 vs. limit=2.0 +2022-11-15 20:39:56,109 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.744e+01 1.732e+02 2.123e+02 2.683e+02 6.509e+02, threshold=4.246e+02, percent-clipped=3.0 +2022-11-15 20:39:59,649 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.60 vs. limit=5.0 +2022-11-15 20:40:02,456 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=37425.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:40:07,968 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.2800, 0.8511, 0.9453, 1.0047, 0.9737, 0.8016, 0.9552, 1.0583], + device='cuda:2'), covar=tensor([0.0706, 0.0486, 0.0409, 0.0452, 0.0763, 0.2018, 0.0625, 0.0594], + device='cuda:2'), in_proj_covar=tensor([0.0009, 0.0013, 0.0010, 0.0011, 0.0011, 0.0010, 0.0012, 0.0010], + device='cuda:2'), out_proj_covar=tensor([4.0683e-05, 5.5065e-05, 4.3056e-05, 4.9981e-05, 4.6380e-05, 4.2616e-05, + 4.8981e-05, 4.4480e-05], device='cuda:2') +2022-11-15 20:40:26,885 INFO [train.py:876] (2/4) Epoch 6, batch 1100, loss[loss=0.2098, simple_loss=0.1942, pruned_loss=0.1127, over 5588.00 frames. ], tot_loss[loss=0.1691, simple_loss=0.1763, pruned_loss=0.08095, over 1083573.37 frames. ], batch size: 50, lr: 1.38e-02, grad_scale: 16.0 +2022-11-15 20:40:26,996 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=37461.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 20:40:59,804 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=37509.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 20:41:04,181 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.065e+02 1.745e+02 2.111e+02 2.538e+02 7.660e+02, threshold=4.223e+02, percent-clipped=2.0 +2022-11-15 20:41:35,292 INFO [train.py:876] (2/4) Epoch 6, batch 1200, loss[loss=0.2067, simple_loss=0.2094, pruned_loss=0.102, over 5593.00 frames. ], tot_loss[loss=0.1701, simple_loss=0.177, pruned_loss=0.08161, over 1083180.45 frames. ], batch size: 24, lr: 1.38e-02, grad_scale: 16.0 +2022-11-15 20:41:39,294 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=37567.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:42:02,447 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.27 vs. limit=2.0 +2022-11-15 20:42:04,209 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.8700, 4.9500, 3.1458, 4.6381, 3.9257, 3.4366, 2.7258, 4.3976], + device='cuda:2'), covar=tensor([0.1210, 0.0145, 0.1051, 0.0269, 0.0378, 0.0882, 0.1528, 0.0210], + device='cuda:2'), in_proj_covar=tensor([0.0176, 0.0131, 0.0169, 0.0132, 0.0165, 0.0178, 0.0179, 0.0142], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-15 20:42:11,421 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=37614.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:42:12,016 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=37615.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:42:12,628 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.202e+02 1.872e+02 2.304e+02 2.879e+02 7.161e+02, threshold=4.608e+02, percent-clipped=4.0 +2022-11-15 20:42:18,027 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=37624.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:42:42,641 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=37660.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:42:43,130 INFO [train.py:876] (2/4) Epoch 6, batch 1300, loss[loss=0.2237, simple_loss=0.2087, pruned_loss=0.1194, over 5531.00 frames. ], tot_loss[loss=0.1687, simple_loss=0.1758, pruned_loss=0.08083, over 1079666.36 frames. ], batch size: 40, lr: 1.37e-02, grad_scale: 16.0 +2022-11-15 20:42:43,844 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=37662.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:43:02,107 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.4324, 1.0428, 1.2181, 0.7780, 0.7174, 1.0189, 0.5164, 1.0869], + device='cuda:2'), covar=tensor([0.0033, 0.0017, 0.0023, 0.0022, 0.0022, 0.0018, 0.0041, 0.0023], + device='cuda:2'), in_proj_covar=tensor([0.0031, 0.0029, 0.0030, 0.0029, 0.0028, 0.0026, 0.0030, 0.0025], + device='cuda:2'), out_proj_covar=tensor([2.9921e-05, 3.0052e-05, 2.6981e-05, 2.6391e-05, 2.5089e-05, 2.2071e-05, + 3.2207e-05, 2.1916e-05], device='cuda:2') +2022-11-15 20:43:15,367 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=37708.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:43:20,070 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.1381, 1.1487, 1.6515, 0.8920, 1.2388, 1.4549, 0.8126, 1.4760], + device='cuda:2'), covar=tensor([0.0031, 0.0032, 0.0018, 0.0027, 0.0024, 0.0015, 0.0031, 0.0024], + device='cuda:2'), in_proj_covar=tensor([0.0031, 0.0028, 0.0029, 0.0029, 0.0028, 0.0026, 0.0029, 0.0024], + device='cuda:2'), out_proj_covar=tensor([2.9519e-05, 2.9815e-05, 2.6676e-05, 2.6227e-05, 2.4931e-05, 2.1816e-05, + 3.2111e-05, 2.1647e-05], device='cuda:2') +2022-11-15 20:43:20,506 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.128e+02 1.632e+02 1.886e+02 2.277e+02 3.768e+02, threshold=3.772e+02, percent-clipped=0.0 +2022-11-15 20:43:23,149 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=37720.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:43:24,003 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.93 vs. limit=2.0 +2022-11-15 20:43:41,528 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.4678, 5.1898, 4.4994, 5.0675, 5.1650, 4.4685, 4.7015, 4.4464], + device='cuda:2'), covar=tensor([0.0177, 0.0255, 0.1088, 0.0336, 0.0291, 0.0282, 0.0235, 0.0416], + device='cuda:2'), in_proj_covar=tensor([0.0112, 0.0144, 0.0228, 0.0140, 0.0175, 0.0148, 0.0152, 0.0139], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0004, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 20:43:51,594 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.93 vs. limit=2.0 +2022-11-15 20:43:51,757 INFO [train.py:876] (2/4) Epoch 6, batch 1400, loss[loss=0.1988, simple_loss=0.2097, pruned_loss=0.09395, over 5698.00 frames. ], tot_loss[loss=0.17, simple_loss=0.1766, pruned_loss=0.08166, over 1082433.66 frames. ], batch size: 28, lr: 1.37e-02, grad_scale: 16.0 +2022-11-15 20:43:59,618 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.93 vs. limit=2.0 +2022-11-15 20:44:24,124 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.7962, 1.6347, 1.5061, 0.8404, 0.5212, 2.2570, 1.6462, 1.4833], + device='cuda:2'), covar=tensor([0.0558, 0.0734, 0.0456, 0.2136, 0.2208, 0.0383, 0.1626, 0.0755], + device='cuda:2'), in_proj_covar=tensor([0.0050, 0.0043, 0.0046, 0.0054, 0.0046, 0.0038, 0.0043, 0.0044], + device='cuda:2'), out_proj_covar=tensor([9.7608e-05, 8.6149e-05, 9.0725e-05, 1.0995e-04, 9.4864e-05, 8.1805e-05, + 8.6426e-05, 8.7785e-05], device='cuda:2') +2022-11-15 20:44:31,663 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.336e+02 1.794e+02 2.213e+02 2.826e+02 4.726e+02, threshold=4.425e+02, percent-clipped=5.0 +2022-11-15 20:44:36,182 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.4839, 3.9334, 3.4801, 3.3501, 2.0448, 3.6856, 2.1479, 3.0069], + device='cuda:2'), covar=tensor([0.0358, 0.0120, 0.0141, 0.0253, 0.0426, 0.0112, 0.0375, 0.0114], + device='cuda:2'), in_proj_covar=tensor([0.0170, 0.0128, 0.0143, 0.0158, 0.0164, 0.0140, 0.0156, 0.0128], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 20:44:52,769 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.2352, 4.0958, 3.4079, 3.2809, 2.1698, 3.8292, 1.9751, 3.2584], + device='cuda:2'), covar=tensor([0.0510, 0.0143, 0.0169, 0.0351, 0.0515, 0.0118, 0.0462, 0.0130], + device='cuda:2'), in_proj_covar=tensor([0.0169, 0.0128, 0.0143, 0.0158, 0.0165, 0.0140, 0.0156, 0.0128], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 20:45:01,664 INFO [train.py:876] (2/4) Epoch 6, batch 1500, loss[loss=0.1367, simple_loss=0.1642, pruned_loss=0.05459, over 5734.00 frames. ], tot_loss[loss=0.167, simple_loss=0.1754, pruned_loss=0.07935, over 1090402.47 frames. ], batch size: 13, lr: 1.37e-02, grad_scale: 16.0 +2022-11-15 20:45:01,811 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.4779, 2.6718, 2.0004, 2.8764, 1.8551, 2.5954, 2.7444, 3.3932], + device='cuda:2'), covar=tensor([0.0799, 0.1126, 0.3177, 0.0741, 0.1844, 0.1261, 0.1375, 0.0683], + device='cuda:2'), in_proj_covar=tensor([0.0063, 0.0069, 0.0083, 0.0055, 0.0070, 0.0062, 0.0078, 0.0056], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 20:45:28,497 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.47 vs. limit=2.0 +2022-11-15 20:45:32,587 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.00 vs. limit=2.0 +2022-11-15 20:45:34,088 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.73 vs. limit=5.0 +2022-11-15 20:45:38,927 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.056e+02 1.859e+02 2.319e+02 2.620e+02 5.467e+02, threshold=4.638e+02, percent-clipped=1.0 +2022-11-15 20:45:41,223 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0146, 2.5656, 2.7336, 2.5731, 1.6446, 2.6752, 1.7038, 1.8360], + device='cuda:2'), covar=tensor([0.0241, 0.0080, 0.0100, 0.0147, 0.0293, 0.0102, 0.0267, 0.0128], + device='cuda:2'), in_proj_covar=tensor([0.0168, 0.0127, 0.0141, 0.0156, 0.0162, 0.0138, 0.0153, 0.0127], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 20:45:45,053 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=37924.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:45:56,430 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.9314, 1.2773, 1.8362, 1.3108, 1.8250, 1.2650, 1.5868, 1.3393], + device='cuda:2'), covar=tensor([0.0018, 0.0098, 0.0039, 0.0026, 0.0023, 0.0078, 0.0020, 0.0039], + device='cuda:2'), in_proj_covar=tensor([0.0018, 0.0017, 0.0017, 0.0020, 0.0018, 0.0017, 0.0018, 0.0020], + device='cuda:2'), out_proj_covar=tensor([1.7811e-05, 1.7082e-05, 1.7343e-05, 2.0323e-05, 1.7922e-05, 1.7991e-05, + 1.8621e-05, 2.2663e-05], device='cuda:2') +2022-11-15 20:46:10,115 INFO [train.py:876] (2/4) Epoch 6, batch 1600, loss[loss=0.1705, simple_loss=0.1737, pruned_loss=0.08366, over 5649.00 frames. ], tot_loss[loss=0.1645, simple_loss=0.1734, pruned_loss=0.0778, over 1092662.10 frames. ], batch size: 38, lr: 1.37e-02, grad_scale: 16.0 +2022-11-15 20:46:17,398 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=37972.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:46:22,798 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=37979.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:46:47,655 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.872e+01 1.813e+02 2.310e+02 2.979e+02 5.455e+02, threshold=4.619e+02, percent-clipped=4.0 +2022-11-15 20:46:50,392 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=38020.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:46:51,680 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.3983, 3.1198, 3.2572, 3.0232, 3.4574, 3.3484, 3.1884, 3.4074], + device='cuda:2'), covar=tensor([0.0363, 0.0336, 0.0441, 0.0385, 0.0346, 0.0162, 0.0317, 0.0405], + device='cuda:2'), in_proj_covar=tensor([0.0106, 0.0115, 0.0086, 0.0118, 0.0125, 0.0074, 0.0099, 0.0109], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 20:47:03,885 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=38040.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:47:18,028 INFO [train.py:876] (2/4) Epoch 6, batch 1700, loss[loss=0.1649, simple_loss=0.1793, pruned_loss=0.07526, over 5596.00 frames. ], tot_loss[loss=0.1653, simple_loss=0.1744, pruned_loss=0.07813, over 1091628.12 frames. ], batch size: 23, lr: 1.37e-02, grad_scale: 16.0 +2022-11-15 20:47:22,635 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=38068.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:47:55,399 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.122e+02 1.764e+02 2.104e+02 2.685e+02 5.145e+02, threshold=4.208e+02, percent-clipped=1.0 +2022-11-15 20:48:22,203 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.9710, 4.3681, 4.8081, 4.4266, 5.0250, 4.8417, 4.3184, 4.9339], + device='cuda:2'), covar=tensor([0.0280, 0.0248, 0.0335, 0.0261, 0.0245, 0.0101, 0.0237, 0.0256], + device='cuda:2'), in_proj_covar=tensor([0.0108, 0.0117, 0.0087, 0.0119, 0.0124, 0.0074, 0.0099, 0.0110], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 20:48:25,391 INFO [train.py:876] (2/4) Epoch 6, batch 1800, loss[loss=0.178, simple_loss=0.1856, pruned_loss=0.08521, over 5602.00 frames. ], tot_loss[loss=0.1662, simple_loss=0.1748, pruned_loss=0.0788, over 1086629.56 frames. ], batch size: 43, lr: 1.36e-02, grad_scale: 16.0 +2022-11-15 20:49:00,084 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.5208, 3.8369, 3.6301, 3.2935, 1.9767, 3.7911, 2.0329, 3.1753], + device='cuda:2'), covar=tensor([0.0360, 0.0138, 0.0132, 0.0299, 0.0453, 0.0119, 0.0424, 0.0108], + device='cuda:2'), in_proj_covar=tensor([0.0168, 0.0128, 0.0144, 0.0160, 0.0163, 0.0140, 0.0156, 0.0129], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 20:49:03,152 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.164e+02 1.837e+02 2.239e+02 2.713e+02 4.527e+02, threshold=4.477e+02, percent-clipped=1.0 +2022-11-15 20:49:33,779 INFO [train.py:876] (2/4) Epoch 6, batch 1900, loss[loss=0.21, simple_loss=0.1878, pruned_loss=0.1161, over 4970.00 frames. ], tot_loss[loss=0.1681, simple_loss=0.1756, pruned_loss=0.08033, over 1080362.54 frames. ], batch size: 109, lr: 1.36e-02, grad_scale: 16.0 +2022-11-15 20:49:44,077 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.04 vs. limit=2.0 +2022-11-15 20:49:52,652 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0876, 3.3398, 2.6102, 1.6236, 3.2360, 1.2224, 3.1819, 1.5897], + device='cuda:2'), covar=tensor([0.1310, 0.0151, 0.0711, 0.1968, 0.0191, 0.2052, 0.0210, 0.1842], + device='cuda:2'), in_proj_covar=tensor([0.0128, 0.0100, 0.0110, 0.0122, 0.0102, 0.0129, 0.0092, 0.0121], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-11-15 20:50:10,567 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.124e+02 1.862e+02 2.216e+02 2.632e+02 5.559e+02, threshold=4.433e+02, percent-clipped=2.0 +2022-11-15 20:50:14,376 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.2657, 2.9308, 2.9078, 1.5299, 2.6035, 3.3258, 3.0379, 3.3990], + device='cuda:2'), covar=tensor([0.2103, 0.1279, 0.0628, 0.2447, 0.0411, 0.0385, 0.0343, 0.0559], + device='cuda:2'), in_proj_covar=tensor([0.0184, 0.0183, 0.0144, 0.0188, 0.0151, 0.0154, 0.0134, 0.0175], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-15 20:50:24,250 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=38335.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:50:36,971 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.3501, 3.9054, 4.1882, 3.8622, 4.3441, 4.0693, 3.9246, 4.2946], + device='cuda:2'), covar=tensor([0.0385, 0.0302, 0.0380, 0.0319, 0.0429, 0.0250, 0.0268, 0.0295], + device='cuda:2'), in_proj_covar=tensor([0.0107, 0.0115, 0.0087, 0.0118, 0.0124, 0.0075, 0.0099, 0.0112], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 20:50:40,960 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=38360.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:50:41,463 INFO [train.py:876] (2/4) Epoch 6, batch 2000, loss[loss=0.1879, simple_loss=0.2015, pruned_loss=0.08715, over 5607.00 frames. ], tot_loss[loss=0.1675, simple_loss=0.1756, pruned_loss=0.07973, over 1084018.00 frames. ], batch size: 23, lr: 1.36e-02, grad_scale: 16.0 +2022-11-15 20:51:05,239 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.5275, 3.6613, 3.1955, 3.2514, 3.5491, 3.4920, 1.3504, 3.7171], + device='cuda:2'), covar=tensor([0.0323, 0.0217, 0.0354, 0.0329, 0.0395, 0.0375, 0.3363, 0.0340], + device='cuda:2'), in_proj_covar=tensor([0.0102, 0.0077, 0.0078, 0.0068, 0.0095, 0.0081, 0.0131, 0.0100], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 20:51:19,233 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.233e+02 2.027e+02 2.499e+02 3.028e+02 6.402e+02, threshold=4.998e+02, percent-clipped=6.0 +2022-11-15 20:51:22,842 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=38421.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:51:50,153 INFO [train.py:876] (2/4) Epoch 6, batch 2100, loss[loss=0.1695, simple_loss=0.1777, pruned_loss=0.08063, over 5458.00 frames. ], tot_loss[loss=0.1665, simple_loss=0.1749, pruned_loss=0.07905, over 1079748.21 frames. ], batch size: 53, lr: 1.36e-02, grad_scale: 16.0 +2022-11-15 20:52:01,888 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.8230, 2.8411, 2.4673, 2.7757, 2.8447, 2.5170, 2.3472, 2.5206], + device='cuda:2'), covar=tensor([0.0303, 0.0581, 0.1647, 0.0637, 0.0575, 0.0530, 0.0818, 0.0658], + device='cuda:2'), in_proj_covar=tensor([0.0116, 0.0145, 0.0231, 0.0146, 0.0179, 0.0149, 0.0155, 0.0142], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0004, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 20:52:27,551 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.554e+01 1.866e+02 2.249e+02 2.710e+02 6.180e+02, threshold=4.497e+02, percent-clipped=1.0 +2022-11-15 20:52:48,434 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.8173, 5.5047, 5.5500, 5.3266, 5.9255, 5.9164, 4.9617, 5.8579], + device='cuda:2'), covar=tensor([0.0325, 0.0177, 0.0407, 0.0275, 0.0296, 0.0059, 0.0148, 0.0164], + device='cuda:2'), in_proj_covar=tensor([0.0107, 0.0114, 0.0086, 0.0119, 0.0123, 0.0075, 0.0099, 0.0110], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 20:52:58,531 INFO [train.py:876] (2/4) Epoch 6, batch 2200, loss[loss=0.1154, simple_loss=0.1409, pruned_loss=0.04496, over 5332.00 frames. ], tot_loss[loss=0.1643, simple_loss=0.1732, pruned_loss=0.07769, over 1082149.35 frames. ], batch size: 9, lr: 1.36e-02, grad_scale: 16.0 +2022-11-15 20:53:00,076 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.2855, 3.0803, 3.0703, 2.7822, 1.9733, 3.0685, 1.9317, 2.8261], + device='cuda:2'), covar=tensor([0.0279, 0.0147, 0.0108, 0.0199, 0.0321, 0.0110, 0.0311, 0.0096], + device='cuda:2'), in_proj_covar=tensor([0.0173, 0.0131, 0.0144, 0.0162, 0.0166, 0.0141, 0.0158, 0.0132], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 20:53:09,850 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.9969, 4.6102, 4.9656, 4.5810, 5.1336, 5.0128, 4.4882, 5.1372], + device='cuda:2'), covar=tensor([0.0417, 0.0235, 0.0370, 0.0281, 0.0372, 0.0128, 0.0231, 0.0210], + device='cuda:2'), in_proj_covar=tensor([0.0108, 0.0115, 0.0086, 0.0119, 0.0124, 0.0075, 0.0099, 0.0109], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 20:53:20,855 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0867, 1.8181, 1.8321, 0.9116, 0.7713, 2.3194, 1.8086, 1.6993], + device='cuda:2'), covar=tensor([0.0577, 0.0788, 0.0530, 0.1961, 0.2461, 0.2715, 0.2194, 0.0885], + device='cuda:2'), in_proj_covar=tensor([0.0053, 0.0045, 0.0047, 0.0059, 0.0048, 0.0039, 0.0045, 0.0047], + device='cuda:2'), out_proj_covar=tensor([1.0505e-04, 9.1426e-05, 9.4639e-05, 1.1871e-04, 1.0037e-04, 8.6512e-05, + 9.2417e-05, 9.3621e-05], device='cuda:2') +2022-11-15 20:53:35,850 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.7635, 4.7829, 5.0172, 5.1103, 4.6603, 3.9624, 5.7367, 4.8873], + device='cuda:2'), covar=tensor([0.0544, 0.0882, 0.0359, 0.0849, 0.0345, 0.0270, 0.0512, 0.0414], + device='cuda:2'), in_proj_covar=tensor([0.0070, 0.0093, 0.0077, 0.0098, 0.0072, 0.0061, 0.0119, 0.0079], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 20:53:36,406 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.012e+02 1.801e+02 2.106e+02 2.533e+02 3.933e+02, threshold=4.211e+02, percent-clipped=0.0 +2022-11-15 20:53:49,023 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=38635.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:53:54,531 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.7808, 3.0318, 2.8212, 2.9292, 2.8286, 2.7945, 1.2253, 2.8674], + device='cuda:2'), covar=tensor([0.0687, 0.0443, 0.0636, 0.0369, 0.0628, 0.0589, 0.4295, 0.0636], + device='cuda:2'), in_proj_covar=tensor([0.0099, 0.0075, 0.0077, 0.0067, 0.0093, 0.0081, 0.0130, 0.0099], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 20:54:06,826 INFO [train.py:876] (2/4) Epoch 6, batch 2300, loss[loss=0.1886, simple_loss=0.1809, pruned_loss=0.09814, over 4995.00 frames. ], tot_loss[loss=0.1672, simple_loss=0.1745, pruned_loss=0.07999, over 1082227.29 frames. ], batch size: 109, lr: 1.36e-02, grad_scale: 16.0 +2022-11-15 20:54:21,937 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=38683.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:54:31,460 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.5948, 2.5117, 2.1122, 2.6837, 2.1013, 2.4294, 2.3534, 3.0383], + device='cuda:2'), covar=tensor([0.0709, 0.1407, 0.3415, 0.1192, 0.1690, 0.0622, 0.1619, 0.1591], + device='cuda:2'), in_proj_covar=tensor([0.0066, 0.0075, 0.0090, 0.0059, 0.0072, 0.0066, 0.0080, 0.0059], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 20:54:44,926 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.033e+02 1.846e+02 2.203e+02 3.021e+02 7.472e+02, threshold=4.405e+02, percent-clipped=8.0 +2022-11-15 20:54:45,035 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=38716.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:55:15,143 INFO [train.py:876] (2/4) Epoch 6, batch 2400, loss[loss=0.1954, simple_loss=0.1964, pruned_loss=0.09723, over 5434.00 frames. ], tot_loss[loss=0.1665, simple_loss=0.1746, pruned_loss=0.07919, over 1084068.06 frames. ], batch size: 58, lr: 1.35e-02, grad_scale: 16.0 +2022-11-15 20:55:40,262 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=11.82 vs. limit=5.0 +2022-11-15 20:55:52,458 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.056e+02 1.810e+02 2.219e+02 2.775e+02 4.582e+02, threshold=4.438e+02, percent-clipped=1.0 +2022-11-15 20:55:57,938 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=38823.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:56:10,163 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.3895, 0.9583, 1.1991, 0.8253, 1.1939, 1.0765, 0.6308, 1.1800], + device='cuda:2'), covar=tensor([0.0474, 0.0331, 0.0353, 0.0941, 0.0272, 0.0173, 0.1061, 0.0619], + device='cuda:2'), in_proj_covar=tensor([0.0009, 0.0013, 0.0009, 0.0011, 0.0010, 0.0009, 0.0012, 0.0009], + device='cuda:2'), out_proj_covar=tensor([4.1015e-05, 5.3898e-05, 4.2737e-05, 4.9304e-05, 4.6157e-05, 4.1366e-05, + 4.9931e-05, 4.3599e-05], device='cuda:2') +2022-11-15 20:56:23,608 INFO [train.py:876] (2/4) Epoch 6, batch 2500, loss[loss=0.2308, simple_loss=0.1972, pruned_loss=0.1322, over 3087.00 frames. ], tot_loss[loss=0.1659, simple_loss=0.1743, pruned_loss=0.07881, over 1082627.93 frames. ], batch size: 284, lr: 1.35e-02, grad_scale: 16.0 +2022-11-15 20:56:29,581 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.61 vs. limit=5.0 +2022-11-15 20:56:39,549 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=38884.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:56:41,095 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.3014, 4.9718, 3.7059, 2.0758, 4.6654, 2.0864, 4.5968, 2.6853], + device='cuda:2'), covar=tensor([0.1145, 0.0092, 0.0367, 0.2257, 0.0120, 0.1859, 0.0161, 0.1439], + device='cuda:2'), in_proj_covar=tensor([0.0128, 0.0103, 0.0110, 0.0121, 0.0103, 0.0132, 0.0094, 0.0122], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-11-15 20:56:44,454 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.2154, 3.3262, 2.4839, 1.6935, 3.2658, 1.1620, 3.1261, 1.7500], + device='cuda:2'), covar=tensor([0.1089, 0.0166, 0.0979, 0.1759, 0.0180, 0.2093, 0.0215, 0.1402], + device='cuda:2'), in_proj_covar=tensor([0.0129, 0.0103, 0.0110, 0.0121, 0.0103, 0.0132, 0.0094, 0.0122], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-11-15 20:57:01,242 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.111e+02 1.730e+02 2.141e+02 2.666e+02 7.999e+02, threshold=4.283e+02, percent-clipped=2.0 +2022-11-15 20:57:31,862 INFO [train.py:876] (2/4) Epoch 6, batch 2600, loss[loss=0.1584, simple_loss=0.1837, pruned_loss=0.06652, over 5701.00 frames. ], tot_loss[loss=0.1647, simple_loss=0.1736, pruned_loss=0.07786, over 1080903.06 frames. ], batch size: 19, lr: 1.35e-02, grad_scale: 16.0 +2022-11-15 20:57:33,395 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.7973, 0.8237, 0.7724, 0.4582, 1.1149, 0.9359, 0.4890, 0.9425], + device='cuda:2'), covar=tensor([0.0020, 0.0009, 0.0014, 0.0017, 0.0013, 0.0016, 0.0031, 0.0019], + device='cuda:2'), in_proj_covar=tensor([0.0031, 0.0029, 0.0030, 0.0030, 0.0028, 0.0028, 0.0029, 0.0026], + device='cuda:2'), out_proj_covar=tensor([2.9075e-05, 2.9842e-05, 2.7556e-05, 2.7825e-05, 2.4914e-05, 2.3560e-05, + 3.1133e-05, 2.3315e-05], device='cuda:2') +2022-11-15 20:58:01,102 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.94 vs. limit=2.0 +2022-11-15 20:58:09,808 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.231e+02 1.750e+02 2.073e+02 2.691e+02 5.649e+02, threshold=4.146e+02, percent-clipped=5.0 +2022-11-15 20:58:09,972 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=39016.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:58:11,587 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.96 vs. limit=2.0 +2022-11-15 20:58:15,494 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.02 vs. limit=2.0 +2022-11-15 20:58:40,665 INFO [train.py:876] (2/4) Epoch 6, batch 2700, loss[loss=0.08676, simple_loss=0.111, pruned_loss=0.03124, over 5464.00 frames. ], tot_loss[loss=0.1638, simple_loss=0.1733, pruned_loss=0.07711, over 1088103.02 frames. ], batch size: 10, lr: 1.35e-02, grad_scale: 16.0 +2022-11-15 20:58:42,655 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=39064.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 20:58:52,221 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.15 vs. limit=5.0 +2022-11-15 20:59:13,515 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.02 vs. limit=2.0 +2022-11-15 20:59:18,427 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.170e+02 1.741e+02 2.185e+02 2.571e+02 4.809e+02, threshold=4.370e+02, percent-clipped=1.0 +2022-11-15 20:59:36,652 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.79 vs. limit=5.0 +2022-11-15 20:59:49,229 INFO [train.py:876] (2/4) Epoch 6, batch 2800, loss[loss=0.1238, simple_loss=0.1509, pruned_loss=0.04839, over 5522.00 frames. ], tot_loss[loss=0.1639, simple_loss=0.173, pruned_loss=0.07742, over 1088352.77 frames. ], batch size: 13, lr: 1.35e-02, grad_scale: 16.0 +2022-11-15 21:00:01,217 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=39179.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:00:05,540 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=39185.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:00:14,357 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.72 vs. limit=5.0 +2022-11-15 21:00:27,552 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.069e+02 1.875e+02 2.211e+02 2.740e+02 7.049e+02, threshold=4.422e+02, percent-clipped=6.0 +2022-11-15 21:00:47,927 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=39246.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:00:54,103 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.3702, 3.5824, 3.5456, 1.6831, 3.3039, 3.9006, 3.7034, 4.2499], + device='cuda:2'), covar=tensor([0.2026, 0.1231, 0.0498, 0.2642, 0.0410, 0.0336, 0.0361, 0.0379], + device='cuda:2'), in_proj_covar=tensor([0.0188, 0.0193, 0.0147, 0.0198, 0.0158, 0.0160, 0.0139, 0.0184], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-15 21:00:58,299 INFO [train.py:876] (2/4) Epoch 6, batch 2900, loss[loss=0.1742, simple_loss=0.1867, pruned_loss=0.08083, over 5749.00 frames. ], tot_loss[loss=0.1641, simple_loss=0.1732, pruned_loss=0.07746, over 1084491.01 frames. ], batch size: 15, lr: 1.35e-02, grad_scale: 16.0 +2022-11-15 21:01:03,800 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.8938, 2.9703, 3.0256, 2.7169, 2.9217, 2.8618, 1.1469, 3.0336], + device='cuda:2'), covar=tensor([0.0345, 0.0308, 0.0229, 0.0293, 0.0385, 0.0350, 0.3124, 0.0345], + device='cuda:2'), in_proj_covar=tensor([0.0099, 0.0077, 0.0077, 0.0069, 0.0092, 0.0081, 0.0130, 0.0100], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 21:01:13,863 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=39284.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:01:32,397 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.73 vs. limit=5.0 +2022-11-15 21:01:36,728 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.430e+02 2.016e+02 2.489e+02 3.062e+02 5.776e+02, threshold=4.978e+02, percent-clipped=1.0 +2022-11-15 21:01:55,775 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=39345.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:02:06,703 INFO [train.py:876] (2/4) Epoch 6, batch 3000, loss[loss=0.1805, simple_loss=0.1765, pruned_loss=0.09228, over 4972.00 frames. ], tot_loss[loss=0.1641, simple_loss=0.1727, pruned_loss=0.07776, over 1088820.43 frames. ], batch size: 109, lr: 1.34e-02, grad_scale: 16.0 +2022-11-15 21:02:06,703 INFO [train.py:899] (2/4) Computing validation loss +2022-11-15 21:02:17,417 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.2105, 1.6684, 1.4349, 0.7643, 1.6211, 1.4334, 1.0149, 0.8674], + device='cuda:2'), covar=tensor([0.0016, 0.0028, 0.0032, 0.0046, 0.0019, 0.0035, 0.0030, 0.0042], + device='cuda:2'), in_proj_covar=tensor([0.0018, 0.0016, 0.0018, 0.0020, 0.0019, 0.0017, 0.0019, 0.0020], + device='cuda:2'), out_proj_covar=tensor([1.7297e-05, 1.6636e-05, 1.7447e-05, 2.0099e-05, 1.8298e-05, 1.7862e-05, + 1.9033e-05, 2.1589e-05], device='cuda:2') +2022-11-15 21:02:19,588 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.5913, 4.7200, 4.6397, 3.1474, 3.9644, 4.7611, 4.4136, 5.3110], + device='cuda:2'), covar=tensor([0.1100, 0.0567, 0.0179, 0.1558, 0.0221, 0.0177, 0.0404, 0.0174], + device='cuda:2'), in_proj_covar=tensor([0.0183, 0.0192, 0.0143, 0.0197, 0.0156, 0.0159, 0.0136, 0.0181], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-15 21:02:24,359 INFO [train.py:908] (2/4) Epoch 6, validation: loss=0.1626, simple_loss=0.1844, pruned_loss=0.07046, over 1530663.00 frames. +2022-11-15 21:02:24,359 INFO [train.py:909] (2/4) Maximum memory allocated so far is 4715MB +2022-11-15 21:02:46,175 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.57 vs. limit=2.0 +2022-11-15 21:02:52,175 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.8932, 1.6316, 1.4995, 1.2829, 0.9252, 2.5292, 1.9655, 1.2650], + device='cuda:2'), covar=tensor([0.0734, 0.1136, 0.0902, 0.2395, 0.3543, 0.0839, 0.1414, 0.0915], + device='cuda:2'), in_proj_covar=tensor([0.0056, 0.0047, 0.0050, 0.0061, 0.0050, 0.0042, 0.0049, 0.0050], + device='cuda:2'), out_proj_covar=tensor([1.1172e-04, 9.6625e-05, 1.0138e-04, 1.2493e-04, 1.0502e-04, 9.3528e-05, + 1.0029e-04, 1.0199e-04], device='cuda:2') +2022-11-15 21:03:01,502 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.163e+02 1.785e+02 2.182e+02 2.915e+02 6.384e+02, threshold=4.364e+02, percent-clipped=3.0 +2022-11-15 21:03:28,774 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.6807, 1.9136, 1.3842, 1.2626, 1.1549, 2.1796, 2.0727, 1.9503], + device='cuda:2'), covar=tensor([0.0981, 0.0681, 0.1546, 0.1684, 0.0807, 0.0349, 0.0295, 0.0751], + device='cuda:2'), in_proj_covar=tensor([0.0183, 0.0189, 0.0145, 0.0197, 0.0156, 0.0158, 0.0134, 0.0180], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-15 21:03:30,643 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.3598, 4.8902, 5.1405, 4.7629, 5.3597, 5.2409, 4.6260, 5.3541], + device='cuda:2'), covar=tensor([0.0325, 0.0254, 0.0415, 0.0322, 0.0312, 0.0117, 0.0260, 0.0249], + device='cuda:2'), in_proj_covar=tensor([0.0107, 0.0115, 0.0086, 0.0116, 0.0121, 0.0075, 0.0100, 0.0111], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 21:03:31,204 INFO [train.py:876] (2/4) Epoch 6, batch 3100, loss[loss=0.1075, simple_loss=0.1306, pruned_loss=0.04214, over 5708.00 frames. ], tot_loss[loss=0.1646, simple_loss=0.1738, pruned_loss=0.07775, over 1092529.39 frames. ], batch size: 15, lr: 1.34e-02, grad_scale: 16.0 +2022-11-15 21:03:33,047 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=39463.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:03:43,776 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=39479.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:03:46,145 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.7008, 1.8923, 2.0235, 2.7711, 2.8040, 2.1819, 1.6223, 3.0457], + device='cuda:2'), covar=tensor([0.0674, 0.2782, 0.2000, 0.1592, 0.0944, 0.2454, 0.2275, 0.0419], + device='cuda:2'), in_proj_covar=tensor([0.0192, 0.0214, 0.0207, 0.0331, 0.0218, 0.0218, 0.0202, 0.0191], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0006, 0.0005, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-15 21:04:09,321 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.059e+02 1.769e+02 2.283e+02 2.689e+02 7.122e+02, threshold=4.567e+02, percent-clipped=2.0 +2022-11-15 21:04:12,337 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.8114, 1.3013, 0.8575, 0.9643, 1.0467, 1.0640, 0.8372, 1.1923], + device='cuda:2'), covar=tensor([0.0035, 0.0021, 0.0031, 0.0039, 0.0026, 0.0027, 0.0067, 0.0028], + device='cuda:2'), in_proj_covar=tensor([0.0031, 0.0028, 0.0031, 0.0031, 0.0029, 0.0028, 0.0031, 0.0026], + device='cuda:2'), out_proj_covar=tensor([2.9912e-05, 2.8986e-05, 2.8776e-05, 2.8383e-05, 2.5707e-05, 2.3826e-05, + 3.2468e-05, 2.2822e-05], device='cuda:2') +2022-11-15 21:04:14,717 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=39524.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:04:16,933 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=39527.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:04:18,700 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.90 vs. limit=2.0 +2022-11-15 21:04:24,161 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.8240, 5.3332, 5.3379, 5.1631, 5.8172, 5.6331, 4.9370, 5.5013], + device='cuda:2'), covar=tensor([0.0778, 0.0370, 0.0877, 0.0588, 0.0687, 0.0191, 0.0435, 0.0705], + device='cuda:2'), in_proj_covar=tensor([0.0110, 0.0117, 0.0087, 0.0118, 0.0125, 0.0076, 0.0101, 0.0114], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 21:04:26,502 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=39541.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:04:40,019 INFO [train.py:876] (2/4) Epoch 6, batch 3200, loss[loss=0.1892, simple_loss=0.1873, pruned_loss=0.09552, over 5588.00 frames. ], tot_loss[loss=0.1657, simple_loss=0.1744, pruned_loss=0.07851, over 1088338.33 frames. ], batch size: 24, lr: 1.34e-02, grad_scale: 16.0 +2022-11-15 21:05:16,831 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.97 vs. limit=2.0 +2022-11-15 21:05:18,393 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.194e+02 1.784e+02 2.150e+02 2.757e+02 5.278e+02, threshold=4.299e+02, percent-clipped=1.0 +2022-11-15 21:05:31,819 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.3807, 0.7779, 1.0547, 0.8886, 1.0476, 1.2796, 0.8412, 0.9354], + device='cuda:2'), covar=tensor([0.0734, 0.0825, 0.0831, 0.1629, 0.1828, 0.0551, 0.1873, 0.0843], + device='cuda:2'), in_proj_covar=tensor([0.0009, 0.0013, 0.0010, 0.0012, 0.0011, 0.0009, 0.0013, 0.0010], + device='cuda:2'), out_proj_covar=tensor([4.3568e-05, 5.6498e-05, 4.5021e-05, 5.1807e-05, 4.8294e-05, 4.3072e-05, + 5.3042e-05, 4.6284e-05], device='cuda:2') +2022-11-15 21:05:33,687 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=39639.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 21:05:34,271 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=39640.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:05:45,783 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1684, 4.5381, 2.8690, 4.2108, 3.3328, 2.8871, 2.2964, 3.8250], + device='cuda:2'), covar=tensor([0.1537, 0.0098, 0.1009, 0.0209, 0.0464, 0.0918, 0.1828, 0.0202], + device='cuda:2'), in_proj_covar=tensor([0.0175, 0.0129, 0.0169, 0.0132, 0.0163, 0.0178, 0.0177, 0.0139], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-15 21:05:48,169 INFO [train.py:876] (2/4) Epoch 6, batch 3300, loss[loss=0.1446, simple_loss=0.167, pruned_loss=0.06112, over 5602.00 frames. ], tot_loss[loss=0.1656, simple_loss=0.1745, pruned_loss=0.07842, over 1083919.20 frames. ], batch size: 24, lr: 1.34e-02, grad_scale: 16.0 +2022-11-15 21:06:15,366 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=39700.0, num_to_drop=1, layers_to_drop={3} +2022-11-15 21:06:16,326 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.37 vs. limit=5.0 +2022-11-15 21:06:27,140 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.198e+02 1.777e+02 2.323e+02 2.808e+02 5.808e+02, threshold=4.646e+02, percent-clipped=1.0 +2022-11-15 21:06:57,085 INFO [train.py:876] (2/4) Epoch 6, batch 3400, loss[loss=0.1475, simple_loss=0.1733, pruned_loss=0.06086, over 5733.00 frames. ], tot_loss[loss=0.1652, simple_loss=0.1736, pruned_loss=0.0784, over 1080222.22 frames. ], batch size: 17, lr: 1.34e-02, grad_scale: 16.0 +2022-11-15 21:07:01,537 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=39767.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:07:23,715 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.86 vs. limit=2.0 +2022-11-15 21:07:27,939 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.04 vs. limit=5.0 +2022-11-15 21:07:34,881 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=39815.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:07:36,024 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.067e+02 1.758e+02 2.190e+02 2.600e+02 4.839e+02, threshold=4.379e+02, percent-clipped=2.0 +2022-11-15 21:07:37,421 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=39819.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:07:43,453 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=39828.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:07:52,336 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=39841.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:08:06,133 INFO [train.py:876] (2/4) Epoch 6, batch 3500, loss[loss=0.08968, simple_loss=0.1178, pruned_loss=0.03075, over 4684.00 frames. ], tot_loss[loss=0.1645, simple_loss=0.1729, pruned_loss=0.07806, over 1076766.99 frames. ], batch size: 5, lr: 1.34e-02, grad_scale: 16.0 +2022-11-15 21:08:11,353 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.83 vs. limit=5.0 +2022-11-15 21:08:14,382 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.6592, 3.8048, 3.0616, 1.7532, 3.6577, 1.3762, 3.5728, 1.9176], + device='cuda:2'), covar=tensor([0.1112, 0.0149, 0.0703, 0.2093, 0.0212, 0.2230, 0.0190, 0.1856], + device='cuda:2'), in_proj_covar=tensor([0.0130, 0.0102, 0.0111, 0.0120, 0.0105, 0.0132, 0.0097, 0.0123], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0003, 0.0004, 0.0004, 0.0003, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-11-15 21:08:16,410 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=39876.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:08:24,880 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=39889.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:08:43,943 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.235e+02 1.932e+02 2.210e+02 2.836e+02 6.294e+02, threshold=4.421e+02, percent-clipped=3.0 +2022-11-15 21:08:59,720 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=39940.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:09:14,042 INFO [train.py:876] (2/4) Epoch 6, batch 3600, loss[loss=0.1285, simple_loss=0.1531, pruned_loss=0.05188, over 5537.00 frames. ], tot_loss[loss=0.1637, simple_loss=0.1726, pruned_loss=0.0774, over 1082027.54 frames. ], batch size: 13, lr: 1.33e-02, grad_scale: 16.0 +2022-11-15 21:09:32,387 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=39988.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:09:36,964 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=39995.0, num_to_drop=1, layers_to_drop={3} +2022-11-15 21:09:39,053 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.7272, 2.1457, 1.6830, 1.3607, 1.3215, 2.3275, 1.8650, 2.1970], + device='cuda:2'), covar=tensor([0.1320, 0.0997, 0.1266, 0.1932, 0.0835, 0.0384, 0.0432, 0.0745], + device='cuda:2'), in_proj_covar=tensor([0.0190, 0.0192, 0.0148, 0.0200, 0.0159, 0.0162, 0.0139, 0.0185], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-15 21:09:55,016 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.137e+02 1.820e+02 2.295e+02 3.006e+02 4.723e+02, threshold=4.590e+02, percent-clipped=2.0 +2022-11-15 21:10:09,517 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.4152, 4.9410, 4.2336, 4.8998, 4.8476, 4.0985, 4.4244, 4.1981], + device='cuda:2'), covar=tensor([0.0198, 0.0287, 0.1373, 0.0306, 0.0336, 0.0305, 0.0323, 0.0393], + device='cuda:2'), in_proj_covar=tensor([0.0113, 0.0150, 0.0236, 0.0147, 0.0178, 0.0152, 0.0161, 0.0143], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0004, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 21:10:25,138 INFO [train.py:876] (2/4) Epoch 6, batch 3700, loss[loss=0.1388, simple_loss=0.159, pruned_loss=0.05933, over 5460.00 frames. ], tot_loss[loss=0.1646, simple_loss=0.1736, pruned_loss=0.07781, over 1080140.93 frames. ], batch size: 11, lr: 1.33e-02, grad_scale: 16.0 +2022-11-15 21:10:32,698 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.9881, 4.4127, 3.9702, 4.4714, 4.4631, 3.7591, 3.8338, 3.7621], + device='cuda:2'), covar=tensor([0.0386, 0.0417, 0.1089, 0.0283, 0.0318, 0.0364, 0.0517, 0.0477], + device='cuda:2'), in_proj_covar=tensor([0.0113, 0.0151, 0.0237, 0.0147, 0.0178, 0.0153, 0.0162, 0.0144], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0004, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 21:11:03,523 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.059e+02 1.767e+02 2.199e+02 2.713e+02 5.676e+02, threshold=4.399e+02, percent-clipped=2.0 +2022-11-15 21:11:04,989 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=40119.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:11:07,566 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=40123.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:11:33,230 INFO [train.py:876] (2/4) Epoch 6, batch 3800, loss[loss=0.134, simple_loss=0.1536, pruned_loss=0.05718, over 5779.00 frames. ], tot_loss[loss=0.1643, simple_loss=0.1738, pruned_loss=0.07739, over 1080162.13 frames. ], batch size: 20, lr: 1.33e-02, grad_scale: 16.0 +2022-11-15 21:11:37,524 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=40167.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:11:40,245 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=40171.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:11:46,705 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.5696, 1.0261, 1.1709, 0.9304, 1.6052, 1.9069, 1.0802, 1.3147], + device='cuda:2'), covar=tensor([0.2441, 0.0865, 0.0726, 0.2472, 0.2900, 0.0332, 0.0709, 0.0901], + device='cuda:2'), in_proj_covar=tensor([0.0009, 0.0013, 0.0010, 0.0012, 0.0010, 0.0010, 0.0012, 0.0010], + device='cuda:2'), out_proj_covar=tensor([4.4074e-05, 5.7201e-05, 4.5262e-05, 5.2028e-05, 4.7456e-05, 4.4192e-05, + 5.3043e-05, 4.6523e-05], device='cuda:2') +2022-11-15 21:11:49,591 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.00 vs. limit=5.0 +2022-11-15 21:11:57,156 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=40195.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:12:05,988 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.3981, 5.1383, 5.1296, 5.0566, 5.0574, 4.4496, 5.9329, 5.0751], + device='cuda:2'), covar=tensor([0.0367, 0.1048, 0.0272, 0.1322, 0.0319, 0.0255, 0.0616, 0.0462], + device='cuda:2'), in_proj_covar=tensor([0.0069, 0.0092, 0.0075, 0.0096, 0.0073, 0.0062, 0.0122, 0.0079], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 21:12:11,859 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.095e+02 1.630e+02 2.050e+02 2.494e+02 4.190e+02, threshold=4.099e+02, percent-clipped=0.0 +2022-11-15 21:12:13,032 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=40218.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:12:28,245 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.29 vs. limit=5.0 +2022-11-15 21:12:39,064 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=40256.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:12:42,146 INFO [train.py:876] (2/4) Epoch 6, batch 3900, loss[loss=0.1722, simple_loss=0.1835, pruned_loss=0.08045, over 5742.00 frames. ], tot_loss[loss=0.1639, simple_loss=0.1736, pruned_loss=0.07713, over 1079873.24 frames. ], batch size: 31, lr: 1.33e-02, grad_scale: 16.0 +2022-11-15 21:12:49,552 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=40272.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:12:54,506 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=40279.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:13:05,280 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=40295.0, num_to_drop=1, layers_to_drop={2} +2022-11-15 21:13:11,664 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.8581, 1.3859, 1.3590, 1.9446, 2.0280, 1.9694, 2.6097, 1.7104], + device='cuda:2'), covar=tensor([0.0046, 0.0058, 0.0086, 0.0025, 0.0032, 0.0120, 0.0028, 0.0036], + device='cuda:2'), in_proj_covar=tensor([0.0017, 0.0016, 0.0018, 0.0019, 0.0018, 0.0017, 0.0019, 0.0019], + device='cuda:2'), out_proj_covar=tensor([1.6845e-05, 1.6297e-05, 1.7196e-05, 1.9158e-05, 1.7736e-05, 1.7219e-05, + 1.9348e-05, 2.0938e-05], device='cuda:2') +2022-11-15 21:13:20,316 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.159e+02 1.970e+02 2.299e+02 2.764e+02 4.240e+02, threshold=4.598e+02, percent-clipped=2.0 +2022-11-15 21:13:28,949 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=40330.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:13:31,217 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=40333.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:13:37,591 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=40343.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 21:13:49,828 INFO [train.py:876] (2/4) Epoch 6, batch 4000, loss[loss=0.1944, simple_loss=0.1896, pruned_loss=0.09965, over 5131.00 frames. ], tot_loss[loss=0.163, simple_loss=0.1729, pruned_loss=0.07656, over 1086743.23 frames. ], batch size: 91, lr: 1.33e-02, grad_scale: 16.0 +2022-11-15 21:14:10,412 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=40391.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:14:28,143 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.247e+02 1.803e+02 2.226e+02 2.834e+02 5.770e+02, threshold=4.452e+02, percent-clipped=4.0 +2022-11-15 21:14:32,584 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=40423.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:14:58,067 INFO [train.py:876] (2/4) Epoch 6, batch 4100, loss[loss=0.1554, simple_loss=0.1718, pruned_loss=0.06954, over 5750.00 frames. ], tot_loss[loss=0.1612, simple_loss=0.1716, pruned_loss=0.07542, over 1092355.75 frames. ], batch size: 13, lr: 1.33e-02, grad_scale: 16.0 +2022-11-15 21:14:58,584 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.86 vs. limit=2.0 +2022-11-15 21:15:04,545 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=40471.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:15:04,620 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=40471.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:15:19,107 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=40492.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:15:36,008 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.131e+02 1.881e+02 2.239e+02 2.711e+02 4.699e+02, threshold=4.478e+02, percent-clipped=1.0 +2022-11-15 21:15:37,411 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=40519.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:15:42,128 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.7467, 2.7173, 2.2798, 2.4552, 1.7019, 2.3128, 1.7087, 2.4727], + device='cuda:2'), covar=tensor([0.1068, 0.0200, 0.0611, 0.0333, 0.1147, 0.0679, 0.1358, 0.0300], + device='cuda:2'), in_proj_covar=tensor([0.0174, 0.0132, 0.0171, 0.0135, 0.0167, 0.0181, 0.0181, 0.0141], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-15 21:15:59,205 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=40551.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:16:00,604 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=40553.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:16:05,972 INFO [train.py:876] (2/4) Epoch 6, batch 4200, loss[loss=0.1161, simple_loss=0.1437, pruned_loss=0.04429, over 5697.00 frames. ], tot_loss[loss=0.1637, simple_loss=0.1731, pruned_loss=0.07716, over 1087764.45 frames. ], batch size: 19, lr: 1.32e-02, grad_scale: 16.0 +2022-11-15 21:16:14,950 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=40574.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:16:39,089 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1121, 4.2753, 2.7036, 3.9823, 3.2557, 2.7917, 2.2223, 3.5166], + device='cuda:2'), covar=tensor([0.2250, 0.0213, 0.1484, 0.0391, 0.0689, 0.1345, 0.2549, 0.0347], + device='cuda:2'), in_proj_covar=tensor([0.0173, 0.0131, 0.0171, 0.0135, 0.0165, 0.0180, 0.0180, 0.0141], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-15 21:16:44,496 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.208e+02 1.657e+02 2.106e+02 2.806e+02 4.425e+02, threshold=4.213e+02, percent-clipped=0.0 +2022-11-15 21:16:50,336 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.6693, 1.0751, 1.1102, 0.8474, 1.1776, 1.1212, 0.7541, 1.1341], + device='cuda:2'), covar=tensor([0.0312, 0.0365, 0.0409, 0.1143, 0.0903, 0.1002, 0.0581, 0.0383], + device='cuda:2'), in_proj_covar=tensor([0.0010, 0.0014, 0.0010, 0.0012, 0.0011, 0.0010, 0.0013, 0.0010], + device='cuda:2'), out_proj_covar=tensor([4.5281e-05, 5.9527e-05, 4.6181e-05, 5.2869e-05, 4.9538e-05, 4.4984e-05, + 5.4272e-05, 4.6868e-05], device='cuda:2') +2022-11-15 21:16:52,162 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=40628.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:17:08,273 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=40651.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:17:14,812 INFO [train.py:876] (2/4) Epoch 6, batch 4300, loss[loss=0.1271, simple_loss=0.1518, pruned_loss=0.05117, over 5493.00 frames. ], tot_loss[loss=0.1603, simple_loss=0.171, pruned_loss=0.07484, over 1092843.40 frames. ], batch size: 12, lr: 1.32e-02, grad_scale: 16.0 +2022-11-15 21:17:26,392 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.98 vs. limit=2.0 +2022-11-15 21:17:32,056 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=40686.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:17:43,957 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.34 vs. limit=2.0 +2022-11-15 21:17:49,671 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=40712.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:17:52,508 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.33 vs. limit=5.0 +2022-11-15 21:17:52,814 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.535e+01 1.873e+02 2.249e+02 2.828e+02 5.659e+02, threshold=4.498e+02, percent-clipped=5.0 +2022-11-15 21:18:18,243 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.8004, 2.7791, 2.1431, 2.4609, 1.5449, 2.2233, 1.7747, 2.4129], + device='cuda:2'), covar=tensor([0.0985, 0.0192, 0.0707, 0.0326, 0.1208, 0.0713, 0.1290, 0.0261], + device='cuda:2'), in_proj_covar=tensor([0.0170, 0.0129, 0.0168, 0.0134, 0.0163, 0.0177, 0.0177, 0.0138], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-15 21:18:23,395 INFO [train.py:876] (2/4) Epoch 6, batch 4400, loss[loss=0.1469, simple_loss=0.164, pruned_loss=0.06489, over 5731.00 frames. ], tot_loss[loss=0.16, simple_loss=0.1707, pruned_loss=0.07466, over 1091243.15 frames. ], batch size: 12, lr: 1.32e-02, grad_scale: 16.0 +2022-11-15 21:18:56,073 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.9103, 3.4528, 2.7642, 3.3796, 3.3619, 3.1145, 3.2903, 3.1532], + device='cuda:2'), covar=tensor([0.1334, 0.0678, 0.2246, 0.0750, 0.0755, 0.0605, 0.0637, 0.0672], + device='cuda:2'), in_proj_covar=tensor([0.0116, 0.0155, 0.0245, 0.0151, 0.0185, 0.0158, 0.0165, 0.0148], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0004, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 21:19:01,940 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.034e+02 1.711e+02 2.029e+02 2.549e+02 3.838e+02, threshold=4.057e+02, percent-clipped=0.0 +2022-11-15 21:19:11,001 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.4869, 2.0204, 2.4111, 3.2688, 3.2967, 2.5833, 2.1332, 3.5534], + device='cuda:2'), covar=tensor([0.0569, 0.4027, 0.2773, 0.2681, 0.1154, 0.2895, 0.2615, 0.0332], + device='cuda:2'), in_proj_covar=tensor([0.0195, 0.0215, 0.0206, 0.0324, 0.0224, 0.0219, 0.0200, 0.0189], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0005, 0.0004, 0.0006, 0.0005, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-15 21:19:23,030 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=40848.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:19:25,002 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=40851.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:19:32,355 INFO [train.py:876] (2/4) Epoch 6, batch 4500, loss[loss=0.106, simple_loss=0.1368, pruned_loss=0.0376, over 5729.00 frames. ], tot_loss[loss=0.161, simple_loss=0.1717, pruned_loss=0.07512, over 1090122.66 frames. ], batch size: 11, lr: 1.32e-02, grad_scale: 16.0 +2022-11-15 21:19:35,165 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=40865.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:19:41,106 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=40874.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:19:58,083 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=40899.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:19:58,869 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=40900.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:20:06,425 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.3478, 0.8594, 1.0698, 0.8541, 1.1561, 1.1966, 0.8927, 1.0332], + device='cuda:2'), covar=tensor([0.0663, 0.0517, 0.0379, 0.0938, 0.0612, 0.0510, 0.0603, 0.0372], + device='cuda:2'), in_proj_covar=tensor([0.0009, 0.0013, 0.0010, 0.0011, 0.0010, 0.0009, 0.0012, 0.0009], + device='cuda:2'), out_proj_covar=tensor([4.4401e-05, 5.7279e-05, 4.5235e-05, 5.0903e-05, 4.7427e-05, 4.3508e-05, + 5.2799e-05, 4.4495e-05], device='cuda:2') +2022-11-15 21:20:10,193 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.352e+02 1.848e+02 2.167e+02 2.574e+02 4.684e+02, threshold=4.333e+02, percent-clipped=2.0 +2022-11-15 21:20:11,052 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.4474, 1.2785, 1.5229, 1.4072, 1.2315, 1.3136, 1.4283, 1.2769], + device='cuda:2'), covar=tensor([0.3057, 0.0541, 0.0803, 0.1743, 0.2959, 0.3436, 0.2447, 0.2615], + device='cuda:2'), in_proj_covar=tensor([0.0010, 0.0013, 0.0010, 0.0011, 0.0010, 0.0009, 0.0012, 0.0009], + device='cuda:2'), out_proj_covar=tensor([4.4466e-05, 5.7360e-05, 4.5148e-05, 5.0898e-05, 4.7369e-05, 4.3505e-05, + 5.2791e-05, 4.4454e-05], device='cuda:2') +2022-11-15 21:20:13,545 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=40922.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:20:16,368 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=40926.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:20:17,592 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=40928.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:20:39,756 INFO [train.py:876] (2/4) Epoch 6, batch 4600, loss[loss=0.1645, simple_loss=0.1738, pruned_loss=0.07758, over 5726.00 frames. ], tot_loss[loss=0.1631, simple_loss=0.1733, pruned_loss=0.07645, over 1090578.75 frames. ], batch size: 31, lr: 1.32e-02, grad_scale: 16.0 +2022-11-15 21:20:39,940 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=40961.0, num_to_drop=1, layers_to_drop={2} +2022-11-15 21:20:50,074 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=40976.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:20:54,422 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.94 vs. limit=2.0 +2022-11-15 21:20:56,729 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=40986.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:21:04,232 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=40997.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:21:11,465 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=41007.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:21:17,999 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.184e+02 1.825e+02 2.193e+02 2.554e+02 5.272e+02, threshold=4.385e+02, percent-clipped=4.0 +2022-11-15 21:21:29,552 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=41034.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:21:36,929 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=41045.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:21:45,923 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=41058.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:21:48,097 INFO [train.py:876] (2/4) Epoch 6, batch 4700, loss[loss=0.1542, simple_loss=0.1725, pruned_loss=0.06793, over 5715.00 frames. ], tot_loss[loss=0.1636, simple_loss=0.1727, pruned_loss=0.07726, over 1091632.82 frames. ], batch size: 28, lr: 1.32e-02, grad_scale: 16.0 +2022-11-15 21:22:16,159 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=41102.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:22:18,824 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=41106.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:22:26,239 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.964e+01 1.808e+02 2.339e+02 2.942e+02 4.729e+02, threshold=4.678e+02, percent-clipped=2.0 +2022-11-15 21:22:47,070 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=41148.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:22:56,347 INFO [train.py:876] (2/4) Epoch 6, batch 4800, loss[loss=0.2507, simple_loss=0.2271, pruned_loss=0.1372, over 5475.00 frames. ], tot_loss[loss=0.1662, simple_loss=0.1746, pruned_loss=0.07889, over 1089811.51 frames. ], batch size: 64, lr: 1.31e-02, grad_scale: 16.0 +2022-11-15 21:22:57,811 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=41163.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:23:19,866 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=41196.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:23:35,274 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.017e+02 1.896e+02 2.366e+02 3.104e+02 6.971e+02, threshold=4.733e+02, percent-clipped=2.0 +2022-11-15 21:23:37,417 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=41221.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:23:59,984 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.76 vs. limit=5.0 +2022-11-15 21:24:00,965 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=41256.0, num_to_drop=1, layers_to_drop={2} +2022-11-15 21:24:04,484 INFO [train.py:876] (2/4) Epoch 6, batch 4900, loss[loss=0.1553, simple_loss=0.1675, pruned_loss=0.07157, over 5766.00 frames. ], tot_loss[loss=0.1629, simple_loss=0.1722, pruned_loss=0.07676, over 1087483.64 frames. ], batch size: 16, lr: 1.31e-02, grad_scale: 16.0 +2022-11-15 21:24:35,592 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=41307.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:24:43,024 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.121e+02 1.647e+02 1.947e+02 2.444e+02 4.412e+02, threshold=3.894e+02, percent-clipped=0.0 +2022-11-15 21:25:07,255 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=41353.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:25:08,583 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=41355.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:25:12,451 INFO [train.py:876] (2/4) Epoch 6, batch 5000, loss[loss=0.1094, simple_loss=0.1331, pruned_loss=0.04282, over 5055.00 frames. ], tot_loss[loss=0.159, simple_loss=0.17, pruned_loss=0.07397, over 1091441.94 frames. ], batch size: 7, lr: 1.31e-02, grad_scale: 16.0 +2022-11-15 21:25:37,788 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.0269, 2.2862, 3.4548, 2.8510, 3.8140, 2.5103, 3.4664, 3.9365], + device='cuda:2'), covar=tensor([0.0552, 0.1531, 0.0687, 0.1757, 0.0352, 0.1334, 0.0971, 0.0648], + device='cuda:2'), in_proj_covar=tensor([0.0209, 0.0190, 0.0192, 0.0208, 0.0192, 0.0187, 0.0227, 0.0212], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004], + device='cuda:2') +2022-11-15 21:25:39,909 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=41401.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:25:51,845 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.095e+02 1.916e+02 2.252e+02 2.845e+02 5.364e+02, threshold=4.504e+02, percent-clipped=7.0 +2022-11-15 21:26:11,341 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.86 vs. limit=2.0 +2022-11-15 21:26:18,842 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=41458.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:26:20,691 INFO [train.py:876] (2/4) Epoch 6, batch 5100, loss[loss=0.1938, simple_loss=0.174, pruned_loss=0.1068, over 4079.00 frames. ], tot_loss[loss=0.1596, simple_loss=0.1702, pruned_loss=0.07446, over 1087170.45 frames. ], batch size: 181, lr: 1.31e-02, grad_scale: 16.0 +2022-11-15 21:26:59,646 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.110e+02 1.689e+02 2.100e+02 2.600e+02 5.796e+02, threshold=4.200e+02, percent-clipped=2.0 +2022-11-15 21:27:02,079 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=41521.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:27:10,719 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.1234, 2.3298, 3.5406, 3.1636, 4.2257, 2.6858, 3.6002, 4.2406], + device='cuda:2'), covar=tensor([0.0440, 0.1718, 0.0676, 0.1429, 0.0278, 0.1401, 0.1066, 0.0678], + device='cuda:2'), in_proj_covar=tensor([0.0211, 0.0191, 0.0193, 0.0209, 0.0195, 0.0188, 0.0228, 0.0215], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-15 21:27:25,840 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=41556.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 21:27:28,991 INFO [train.py:876] (2/4) Epoch 6, batch 5200, loss[loss=0.1505, simple_loss=0.1748, pruned_loss=0.06311, over 5753.00 frames. ], tot_loss[loss=0.1603, simple_loss=0.1712, pruned_loss=0.07471, over 1092610.39 frames. ], batch size: 14, lr: 1.31e-02, grad_scale: 16.0 +2022-11-15 21:27:30,415 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.1480, 0.8603, 1.0554, 0.9606, 0.8441, 0.8066, 0.9040, 0.8224], + device='cuda:2'), covar=tensor([0.1321, 0.0903, 0.1753, 0.1238, 0.1699, 0.1446, 0.0945, 0.1344], + device='cuda:2'), in_proj_covar=tensor([0.0010, 0.0014, 0.0010, 0.0012, 0.0011, 0.0010, 0.0013, 0.0010], + device='cuda:2'), out_proj_covar=tensor([4.7130e-05, 6.0899e-05, 4.7920e-05, 5.4564e-05, 5.0844e-05, 4.6389e-05, + 5.5791e-05, 4.7236e-05], device='cuda:2') +2022-11-15 21:27:34,217 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=41569.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:27:52,307 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.2542, 4.6917, 5.0326, 4.5948, 5.3474, 5.1489, 4.5546, 5.1952], + device='cuda:2'), covar=tensor([0.0262, 0.0217, 0.0363, 0.0321, 0.0216, 0.0095, 0.0205, 0.0208], + device='cuda:2'), in_proj_covar=tensor([0.0110, 0.0118, 0.0089, 0.0120, 0.0128, 0.0075, 0.0101, 0.0117], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 21:27:58,403 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=41604.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:28:08,060 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.086e+02 1.794e+02 2.245e+02 2.810e+02 6.868e+02, threshold=4.491e+02, percent-clipped=3.0 +2022-11-15 21:28:14,636 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.91 vs. limit=2.0 +2022-11-15 21:28:19,171 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.72 vs. limit=5.0 +2022-11-15 21:28:22,825 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.8885, 1.0810, 1.0743, 0.7829, 0.9943, 1.3264, 0.8697, 1.1456], + device='cuda:2'), covar=tensor([0.0044, 0.0028, 0.0032, 0.0033, 0.0029, 0.0022, 0.0053, 0.0047], + device='cuda:2'), in_proj_covar=tensor([0.0035, 0.0033, 0.0034, 0.0035, 0.0032, 0.0030, 0.0035, 0.0028], + device='cuda:2'), out_proj_covar=tensor([3.3059e-05, 3.2494e-05, 3.1146e-05, 3.2136e-05, 2.8959e-05, 2.5959e-05, + 3.4963e-05, 2.5465e-05], device='cuda:2') +2022-11-15 21:28:32,308 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=41653.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:28:33,037 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=41654.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:28:37,528 INFO [train.py:876] (2/4) Epoch 6, batch 5300, loss[loss=0.1383, simple_loss=0.1561, pruned_loss=0.06023, over 5725.00 frames. ], tot_loss[loss=0.1591, simple_loss=0.1703, pruned_loss=0.07395, over 1089976.92 frames. ], batch size: 15, lr: 1.31e-02, grad_scale: 16.0 +2022-11-15 21:29:05,169 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=41701.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:29:05,246 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=41701.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:29:15,041 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=41715.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 21:29:16,829 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.330e+02 1.882e+02 2.155e+02 2.589e+02 5.849e+02, threshold=4.310e+02, percent-clipped=2.0 +2022-11-15 21:29:33,620 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1622, 1.8637, 2.2996, 1.1064, 1.1602, 2.5192, 2.1536, 1.5790], + device='cuda:2'), covar=tensor([0.0662, 0.0834, 0.0452, 0.2379, 0.2241, 0.2282, 0.0601, 0.0936], + device='cuda:2'), in_proj_covar=tensor([0.0056, 0.0045, 0.0049, 0.0060, 0.0051, 0.0040, 0.0045, 0.0050], + device='cuda:2'), out_proj_covar=tensor([1.1464e-04, 9.4610e-05, 1.0111e-04, 1.2549e-04, 1.0894e-04, 9.0689e-05, + 9.7032e-05, 1.0297e-04], device='cuda:2') +2022-11-15 21:29:38,524 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=41749.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:29:44,841 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=41758.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:29:46,717 INFO [train.py:876] (2/4) Epoch 6, batch 5400, loss[loss=0.2339, simple_loss=0.2033, pruned_loss=0.1323, over 5239.00 frames. ], tot_loss[loss=0.1591, simple_loss=0.1706, pruned_loss=0.07381, over 1089284.31 frames. ], batch size: 79, lr: 1.31e-02, grad_scale: 16.0 +2022-11-15 21:30:09,359 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.93 vs. limit=2.0 +2022-11-15 21:30:12,331 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.9650, 2.2752, 3.5285, 3.0760, 3.9463, 2.5787, 3.5060, 3.9790], + device='cuda:2'), covar=tensor([0.0483, 0.1452, 0.0523, 0.1395, 0.0331, 0.1174, 0.0915, 0.0805], + device='cuda:2'), in_proj_covar=tensor([0.0211, 0.0193, 0.0193, 0.0209, 0.0195, 0.0186, 0.0229, 0.0214], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004], + device='cuda:2') +2022-11-15 21:30:17,683 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=41806.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:30:26,061 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.040e+02 1.844e+02 2.196e+02 2.605e+02 6.100e+02, threshold=4.391e+02, percent-clipped=2.0 +2022-11-15 21:30:33,096 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.4058, 4.9729, 3.0569, 4.6032, 3.7327, 3.3383, 2.7813, 4.2690], + device='cuda:2'), covar=tensor([0.1401, 0.0135, 0.1123, 0.0348, 0.0471, 0.0886, 0.1589, 0.0207], + device='cuda:2'), in_proj_covar=tensor([0.0174, 0.0132, 0.0169, 0.0137, 0.0168, 0.0179, 0.0178, 0.0142], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-15 21:30:39,899 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.96 vs. limit=2.0 +2022-11-15 21:30:55,105 INFO [train.py:876] (2/4) Epoch 6, batch 5500, loss[loss=0.2013, simple_loss=0.2018, pruned_loss=0.1004, over 5158.00 frames. ], tot_loss[loss=0.1605, simple_loss=0.1712, pruned_loss=0.07492, over 1085567.41 frames. ], batch size: 91, lr: 1.30e-02, grad_scale: 16.0 +2022-11-15 21:31:25,968 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=41906.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:31:33,901 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.068e+02 1.786e+02 2.219e+02 2.876e+02 6.539e+02, threshold=4.438e+02, percent-clipped=3.0 +2022-11-15 21:31:34,137 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.0788, 2.2011, 3.2978, 3.1282, 4.0133, 2.2286, 3.3343, 4.0128], + device='cuda:2'), covar=tensor([0.0483, 0.1748, 0.0837, 0.1546, 0.0286, 0.1575, 0.1105, 0.0596], + device='cuda:2'), in_proj_covar=tensor([0.0207, 0.0188, 0.0188, 0.0207, 0.0191, 0.0184, 0.0224, 0.0208], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004], + device='cuda:2') +2022-11-15 21:31:54,708 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=41948.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 21:32:03,410 INFO [train.py:876] (2/4) Epoch 6, batch 5600, loss[loss=0.1728, simple_loss=0.1846, pruned_loss=0.08044, over 5663.00 frames. ], tot_loss[loss=0.1611, simple_loss=0.1714, pruned_loss=0.07539, over 1084365.47 frames. ], batch size: 32, lr: 1.30e-02, grad_scale: 16.0 +2022-11-15 21:32:07,841 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=41967.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:32:36,391 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=42009.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 21:32:36,948 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=42010.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 21:32:42,426 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.145e+02 1.893e+02 2.253e+02 2.739e+02 4.723e+02, threshold=4.505e+02, percent-clipped=2.0 +2022-11-15 21:32:51,269 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.0822, 4.7776, 4.9392, 5.0853, 4.3843, 4.1467, 5.4954, 4.6426], + device='cuda:2'), covar=tensor([0.0303, 0.0843, 0.0239, 0.0814, 0.0457, 0.0277, 0.0685, 0.0437], + device='cuda:2'), in_proj_covar=tensor([0.0069, 0.0089, 0.0075, 0.0095, 0.0073, 0.0061, 0.0118, 0.0079], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 21:32:59,578 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0822, 1.1159, 1.2644, 1.6237, 1.9680, 1.6836, 1.7501, 1.4305], + device='cuda:2'), covar=tensor([0.0020, 0.0076, 0.0076, 0.0018, 0.0018, 0.0048, 0.0017, 0.0023], + device='cuda:2'), in_proj_covar=tensor([0.0018, 0.0017, 0.0018, 0.0020, 0.0018, 0.0017, 0.0019, 0.0020], + device='cuda:2'), out_proj_covar=tensor([1.7478e-05, 1.6706e-05, 1.7563e-05, 2.0068e-05, 1.7422e-05, 1.7776e-05, + 1.8720e-05, 2.1115e-05], device='cuda:2') +2022-11-15 21:33:11,608 INFO [train.py:876] (2/4) Epoch 6, batch 5700, loss[loss=0.2915, simple_loss=0.2435, pruned_loss=0.1698, over 3013.00 frames. ], tot_loss[loss=0.1625, simple_loss=0.172, pruned_loss=0.0765, over 1077292.19 frames. ], batch size: 284, lr: 1.30e-02, grad_scale: 16.0 +2022-11-15 21:33:27,460 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.2141, 5.6951, 4.1440, 2.7339, 5.3744, 2.9060, 5.1162, 3.2959], + device='cuda:2'), covar=tensor([0.0693, 0.0084, 0.0486, 0.1742, 0.0144, 0.1207, 0.0088, 0.1193], + device='cuda:2'), in_proj_covar=tensor([0.0129, 0.0101, 0.0114, 0.0118, 0.0107, 0.0127, 0.0096, 0.0120], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0003, 0.0004, 0.0004, 0.0003, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-11-15 21:33:45,450 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.8085, 1.2430, 1.1253, 1.0396, 0.9582, 1.1535, 0.8105, 1.2428], + device='cuda:2'), covar=tensor([0.0029, 0.0019, 0.0023, 0.0022, 0.0024, 0.0017, 0.0043, 0.0031], + device='cuda:2'), in_proj_covar=tensor([0.0035, 0.0031, 0.0033, 0.0033, 0.0032, 0.0029, 0.0034, 0.0027], + device='cuda:2'), out_proj_covar=tensor([3.2640e-05, 3.0688e-05, 3.0099e-05, 3.0593e-05, 2.8373e-05, 2.4910e-05, + 3.4111e-05, 2.4817e-05], device='cuda:2') +2022-11-15 21:33:51,626 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.009e+02 1.803e+02 2.125e+02 2.518e+02 5.093e+02, threshold=4.250e+02, percent-clipped=1.0 +2022-11-15 21:34:23,228 INFO [train.py:876] (2/4) Epoch 6, batch 5800, loss[loss=0.09297, simple_loss=0.1145, pruned_loss=0.03574, over 4469.00 frames. ], tot_loss[loss=0.1599, simple_loss=0.171, pruned_loss=0.07441, over 1082207.46 frames. ], batch size: 5, lr: 1.30e-02, grad_scale: 16.0 +2022-11-15 21:34:52,577 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.01 vs. limit=2.0 +2022-11-15 21:35:01,872 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.108e+02 1.784e+02 2.140e+02 2.674e+02 6.368e+02, threshold=4.280e+02, percent-clipped=2.0 +2022-11-15 21:35:18,028 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.9960, 1.7507, 1.9035, 1.8626, 2.2338, 1.7966, 1.3408, 2.2873], + device='cuda:2'), covar=tensor([0.0874, 0.1599, 0.0876, 0.0972, 0.0597, 0.1485, 0.1783, 0.0954], + device='cuda:2'), in_proj_covar=tensor([0.0195, 0.0209, 0.0200, 0.0321, 0.0218, 0.0215, 0.0194, 0.0193], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0006, 0.0005, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-15 21:35:31,654 INFO [train.py:876] (2/4) Epoch 6, batch 5900, loss[loss=0.1711, simple_loss=0.1869, pruned_loss=0.07765, over 5559.00 frames. ], tot_loss[loss=0.1587, simple_loss=0.17, pruned_loss=0.0737, over 1078020.48 frames. ], batch size: 14, lr: 1.30e-02, grad_scale: 16.0 +2022-11-15 21:35:32,378 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=42262.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:35:34,493 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.8588, 2.1195, 3.4055, 2.9429, 3.7048, 2.0402, 3.1635, 3.7964], + device='cuda:2'), covar=tensor([0.0479, 0.1808, 0.0659, 0.1644, 0.0702, 0.1572, 0.1361, 0.0790], + device='cuda:2'), in_proj_covar=tensor([0.0212, 0.0193, 0.0192, 0.0211, 0.0197, 0.0188, 0.0232, 0.0217], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-15 21:36:00,792 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=42304.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 21:36:05,201 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=42310.0, num_to_drop=1, layers_to_drop={2} +2022-11-15 21:36:10,907 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.198e+02 1.865e+02 2.230e+02 2.875e+02 6.515e+02, threshold=4.459e+02, percent-clipped=7.0 +2022-11-15 21:36:26,972 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.49 vs. limit=2.0 +2022-11-15 21:36:37,638 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=42358.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:36:37,715 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0672, 3.4220, 2.4155, 3.2037, 2.3478, 2.4325, 1.9416, 2.9137], + device='cuda:2'), covar=tensor([0.1258, 0.0184, 0.0815, 0.0268, 0.0920, 0.0932, 0.1542, 0.0300], + device='cuda:2'), in_proj_covar=tensor([0.0177, 0.0134, 0.0169, 0.0136, 0.0172, 0.0180, 0.0181, 0.0143], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003], + device='cuda:2') +2022-11-15 21:36:39,543 INFO [train.py:876] (2/4) Epoch 6, batch 6000, loss[loss=0.1662, simple_loss=0.1841, pruned_loss=0.07417, over 5633.00 frames. ], tot_loss[loss=0.1594, simple_loss=0.1707, pruned_loss=0.0741, over 1086434.78 frames. ], batch size: 38, lr: 1.30e-02, grad_scale: 8.0 +2022-11-15 21:36:39,543 INFO [train.py:899] (2/4) Computing validation loss +2022-11-15 21:36:47,840 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.0487, 1.2742, 1.4065, 0.8992, 1.1781, 1.5185, 1.1665, 1.4074], + device='cuda:2'), covar=tensor([0.0032, 0.0026, 0.0019, 0.0025, 0.0020, 0.0014, 0.0033, 0.0019], + device='cuda:2'), in_proj_covar=tensor([0.0035, 0.0032, 0.0034, 0.0033, 0.0032, 0.0029, 0.0034, 0.0028], + device='cuda:2'), out_proj_covar=tensor([3.3075e-05, 3.1043e-05, 3.0498e-05, 3.0409e-05, 2.8386e-05, 2.4894e-05, + 3.4381e-05, 2.5196e-05], device='cuda:2') +2022-11-15 21:36:57,490 INFO [train.py:908] (2/4) Epoch 6, validation: loss=0.1626, simple_loss=0.1837, pruned_loss=0.07077, over 1530663.00 frames. +2022-11-15 21:36:57,491 INFO [train.py:909] (2/4) Maximum memory allocated so far is 4715MB +2022-11-15 21:37:09,881 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=42379.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:37:11,921 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.9821, 4.3952, 3.9280, 3.7958, 2.5666, 4.5102, 2.4341, 3.8420], + device='cuda:2'), covar=tensor([0.0470, 0.0117, 0.0144, 0.0269, 0.0411, 0.0085, 0.0365, 0.0083], + device='cuda:2'), in_proj_covar=tensor([0.0170, 0.0136, 0.0148, 0.0163, 0.0167, 0.0146, 0.0159, 0.0133], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 21:37:23,545 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.9997, 2.7306, 3.3658, 4.4986, 4.9304, 4.1490, 3.9106, 4.7842], + device='cuda:2'), covar=tensor([0.0145, 0.3758, 0.1812, 0.3244, 0.0438, 0.1906, 0.1529, 0.0252], + device='cuda:2'), in_proj_covar=tensor([0.0193, 0.0210, 0.0202, 0.0320, 0.0217, 0.0213, 0.0194, 0.0191], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0006, 0.0005, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-15 21:37:37,845 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.029e+02 1.691e+02 2.055e+02 2.547e+02 4.668e+02, threshold=4.111e+02, percent-clipped=1.0 +2022-11-15 21:37:52,175 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=42440.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:38:06,271 INFO [train.py:876] (2/4) Epoch 6, batch 6100, loss[loss=0.1753, simple_loss=0.1858, pruned_loss=0.08241, over 5590.00 frames. ], tot_loss[loss=0.1602, simple_loss=0.1713, pruned_loss=0.07461, over 1090154.93 frames. ], batch size: 24, lr: 1.29e-02, grad_scale: 8.0 +2022-11-15 21:38:37,130 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=42506.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:38:42,730 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.7554, 4.4086, 3.3965, 1.8523, 4.2326, 1.6182, 4.1972, 2.2314], + device='cuda:2'), covar=tensor([0.1364, 0.0129, 0.0458, 0.2216, 0.0166, 0.1942, 0.0166, 0.2019], + device='cuda:2'), in_proj_covar=tensor([0.0129, 0.0102, 0.0114, 0.0119, 0.0107, 0.0127, 0.0097, 0.0120], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0003, 0.0004, 0.0004, 0.0003, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-11-15 21:38:45,894 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.054e+02 1.891e+02 2.304e+02 2.895e+02 7.790e+02, threshold=4.608e+02, percent-clipped=4.0 +2022-11-15 21:38:56,322 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.3250, 3.6451, 2.6898, 1.7317, 3.5610, 1.2428, 3.4549, 1.9488], + device='cuda:2'), covar=tensor([0.1273, 0.0157, 0.0771, 0.1933, 0.0200, 0.2008, 0.0192, 0.1670], + device='cuda:2'), in_proj_covar=tensor([0.0128, 0.0101, 0.0114, 0.0118, 0.0107, 0.0127, 0.0097, 0.0119], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0003, 0.0004, 0.0004, 0.0003, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-11-15 21:39:14,511 INFO [train.py:876] (2/4) Epoch 6, batch 6200, loss[loss=0.189, simple_loss=0.1931, pruned_loss=0.09251, over 4992.00 frames. ], tot_loss[loss=0.1608, simple_loss=0.1714, pruned_loss=0.07512, over 1080678.88 frames. ], batch size: 109, lr: 1.29e-02, grad_scale: 8.0 +2022-11-15 21:39:15,263 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=42562.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:39:18,948 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=42567.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:39:32,239 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.2229, 0.7775, 0.9091, 0.7513, 0.8014, 1.0580, 0.8785, 0.9537], + device='cuda:2'), covar=tensor([0.0971, 0.0926, 0.0630, 0.2341, 0.3154, 0.0407, 0.1049, 0.0951], + device='cuda:2'), in_proj_covar=tensor([0.0010, 0.0014, 0.0011, 0.0013, 0.0012, 0.0010, 0.0013, 0.0010], + device='cuda:2'), out_proj_covar=tensor([4.7918e-05, 6.3468e-05, 4.9617e-05, 5.8556e-05, 5.3671e-05, 4.8138e-05, + 5.8671e-05, 4.8198e-05], device='cuda:2') +2022-11-15 21:39:32,913 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.7126, 2.0365, 3.1660, 2.6393, 3.4961, 2.2434, 2.9804, 3.6488], + device='cuda:2'), covar=tensor([0.0433, 0.1909, 0.0755, 0.1652, 0.0398, 0.1299, 0.1078, 0.0719], + device='cuda:2'), in_proj_covar=tensor([0.0213, 0.0191, 0.0192, 0.0207, 0.0191, 0.0186, 0.0226, 0.0213], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-15 21:39:44,651 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=42604.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 21:39:48,432 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=42610.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:39:54,752 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.838e+01 1.745e+02 2.107e+02 2.704e+02 4.645e+02, threshold=4.215e+02, percent-clipped=1.0 +2022-11-15 21:40:11,521 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.99 vs. limit=2.0 +2022-11-15 21:40:15,378 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=42649.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:40:17,233 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=42652.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 21:40:19,936 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.5131, 2.7183, 2.6617, 2.5808, 2.7406, 2.7810, 1.0963, 2.8198], + device='cuda:2'), covar=tensor([0.0316, 0.0298, 0.0245, 0.0231, 0.0278, 0.0258, 0.2444, 0.0333], + device='cuda:2'), in_proj_covar=tensor([0.0099, 0.0077, 0.0077, 0.0069, 0.0094, 0.0080, 0.0128, 0.0102], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 21:40:23,153 INFO [train.py:876] (2/4) Epoch 6, batch 6300, loss[loss=0.1261, simple_loss=0.1479, pruned_loss=0.05215, over 5715.00 frames. ], tot_loss[loss=0.159, simple_loss=0.1705, pruned_loss=0.07375, over 1084346.68 frames. ], batch size: 11, lr: 1.29e-02, grad_scale: 8.0 +2022-11-15 21:40:27,119 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.6849, 1.1925, 0.8931, 0.9703, 0.9252, 1.3358, 0.9816, 0.7636], + device='cuda:2'), covar=tensor([0.2163, 0.0329, 0.1527, 0.2195, 0.2057, 0.0419, 0.1672, 0.2101], + device='cuda:2'), in_proj_covar=tensor([0.0057, 0.0045, 0.0050, 0.0063, 0.0047, 0.0040, 0.0046, 0.0050], + device='cuda:2'), out_proj_covar=tensor([1.1727e-04, 9.5239e-05, 1.0383e-04, 1.3032e-04, 1.0459e-04, 9.1983e-05, + 9.8865e-05, 1.0461e-04], device='cuda:2') +2022-11-15 21:40:42,824 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.0615, 2.2310, 3.4551, 3.2047, 4.0853, 2.4004, 3.5714, 3.9569], + device='cuda:2'), covar=tensor([0.0443, 0.1646, 0.0817, 0.1531, 0.0333, 0.1512, 0.0968, 0.0598], + device='cuda:2'), in_proj_covar=tensor([0.0214, 0.0193, 0.0193, 0.0210, 0.0193, 0.0187, 0.0227, 0.0214], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-15 21:40:55,890 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=42710.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:41:02,050 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.507e+01 1.912e+02 2.400e+02 2.974e+02 5.468e+02, threshold=4.801e+02, percent-clipped=3.0 +2022-11-15 21:41:03,577 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.9042, 1.9998, 2.3181, 2.1133, 1.3773, 2.0965, 1.3832, 1.7399], + device='cuda:2'), covar=tensor([0.0151, 0.0071, 0.0068, 0.0088, 0.0187, 0.0079, 0.0211, 0.0107], + device='cuda:2'), in_proj_covar=tensor([0.0169, 0.0136, 0.0147, 0.0164, 0.0167, 0.0147, 0.0160, 0.0135], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0002, 0.0003, 0.0003, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 21:41:13,143 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=42735.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:41:31,366 INFO [train.py:876] (2/4) Epoch 6, batch 6400, loss[loss=0.1413, simple_loss=0.1529, pruned_loss=0.0648, over 5721.00 frames. ], tot_loss[loss=0.1581, simple_loss=0.1697, pruned_loss=0.07321, over 1081058.45 frames. ], batch size: 13, lr: 1.29e-02, grad_scale: 8.0 +2022-11-15 21:42:11,497 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.096e+02 1.772e+02 2.214e+02 2.586e+02 5.921e+02, threshold=4.429e+02, percent-clipped=3.0 +2022-11-15 21:42:38,689 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=42859.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:42:39,914 INFO [train.py:876] (2/4) Epoch 6, batch 6500, loss[loss=0.1611, simple_loss=0.1753, pruned_loss=0.07342, over 5752.00 frames. ], tot_loss[loss=0.1596, simple_loss=0.1707, pruned_loss=0.07425, over 1082065.19 frames. ], batch size: 27, lr: 1.29e-02, grad_scale: 8.0 +2022-11-15 21:42:40,668 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=42862.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:42:46,461 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.4132, 4.3937, 3.0060, 4.2629, 3.2804, 3.0559, 2.5388, 3.7403], + device='cuda:2'), covar=tensor([0.1484, 0.0148, 0.0882, 0.0271, 0.0527, 0.0926, 0.1694, 0.0269], + device='cuda:2'), in_proj_covar=tensor([0.0176, 0.0132, 0.0168, 0.0137, 0.0173, 0.0179, 0.0182, 0.0143], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003], + device='cuda:2') +2022-11-15 21:42:50,474 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.30 vs. limit=2.0 +2022-11-15 21:42:52,368 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=42879.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:42:52,603 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.94 vs. limit=2.0 +2022-11-15 21:42:55,661 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.9029, 2.2081, 3.5093, 2.9518, 3.6879, 2.5429, 3.2910, 3.9555], + device='cuda:2'), covar=tensor([0.0562, 0.1428, 0.0592, 0.1420, 0.0436, 0.1207, 0.0850, 0.0545], + device='cuda:2'), in_proj_covar=tensor([0.0210, 0.0191, 0.0188, 0.0207, 0.0191, 0.0185, 0.0224, 0.0209], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004], + device='cuda:2') +2022-11-15 21:43:11,894 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.7662, 4.2662, 4.6166, 4.1347, 4.8627, 4.5756, 4.2869, 4.7426], + device='cuda:2'), covar=tensor([0.0287, 0.0245, 0.0388, 0.0278, 0.0276, 0.0193, 0.0289, 0.0281], + device='cuda:2'), in_proj_covar=tensor([0.0114, 0.0122, 0.0092, 0.0124, 0.0133, 0.0080, 0.0106, 0.0122], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-15 21:43:19,264 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.054e+02 1.824e+02 2.207e+02 2.735e+02 7.842e+02, threshold=4.414e+02, percent-clipped=4.0 +2022-11-15 21:43:20,110 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=42920.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:43:33,786 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=42940.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:43:47,836 INFO [train.py:876] (2/4) Epoch 6, batch 6600, loss[loss=0.1301, simple_loss=0.1432, pruned_loss=0.0585, over 5536.00 frames. ], tot_loss[loss=0.1584, simple_loss=0.1695, pruned_loss=0.07361, over 1083081.02 frames. ], batch size: 10, lr: 1.29e-02, grad_scale: 8.0 +2022-11-15 21:44:18,653 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=43005.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:44:28,197 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.144e+02 1.711e+02 2.097e+02 2.455e+02 4.370e+02, threshold=4.194e+02, percent-clipped=0.0 +2022-11-15 21:44:31,280 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.7280, 1.8766, 1.9022, 1.6197, 1.8784, 1.9175, 0.8136, 1.9094], + device='cuda:2'), covar=tensor([0.0419, 0.0341, 0.0283, 0.0310, 0.0349, 0.0310, 0.2027, 0.0416], + device='cuda:2'), in_proj_covar=tensor([0.0099, 0.0077, 0.0078, 0.0069, 0.0094, 0.0081, 0.0128, 0.0101], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 21:44:39,655 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=43035.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:44:46,520 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.58 vs. limit=2.0 +2022-11-15 21:44:56,954 INFO [train.py:876] (2/4) Epoch 6, batch 6700, loss[loss=0.172, simple_loss=0.1761, pruned_loss=0.08393, over 5653.00 frames. ], tot_loss[loss=0.1595, simple_loss=0.1701, pruned_loss=0.07443, over 1079220.36 frames. ], batch size: 38, lr: 1.29e-02, grad_scale: 8.0 +2022-11-15 21:44:59,679 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.0481, 3.9012, 3.5440, 3.8533, 3.9749, 3.8020, 1.7531, 4.1575], + device='cuda:2'), covar=tensor([0.0244, 0.0436, 0.0486, 0.0292, 0.0434, 0.0366, 0.2691, 0.0304], + device='cuda:2'), in_proj_covar=tensor([0.0099, 0.0077, 0.0077, 0.0069, 0.0093, 0.0080, 0.0127, 0.0100], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 21:45:12,512 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=43083.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:45:36,326 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.495e+01 1.976e+02 2.503e+02 2.987e+02 6.000e+02, threshold=5.005e+02, percent-clipped=7.0 +2022-11-15 21:45:59,109 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.3759, 3.5187, 3.4500, 3.2683, 3.4771, 3.4947, 1.2379, 3.6365], + device='cuda:2'), covar=tensor([0.0371, 0.0267, 0.0284, 0.0351, 0.0351, 0.0327, 0.3311, 0.0324], + device='cuda:2'), in_proj_covar=tensor([0.0100, 0.0078, 0.0078, 0.0069, 0.0094, 0.0081, 0.0128, 0.0101], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 21:46:04,889 INFO [train.py:876] (2/4) Epoch 6, batch 6800, loss[loss=0.1556, simple_loss=0.1736, pruned_loss=0.06879, over 5562.00 frames. ], tot_loss[loss=0.1571, simple_loss=0.169, pruned_loss=0.07258, over 1086262.64 frames. ], batch size: 22, lr: 1.28e-02, grad_scale: 8.0 +2022-11-15 21:46:05,688 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=43162.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:46:23,945 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.27 vs. limit=5.0 +2022-11-15 21:46:28,164 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.28 vs. limit=2.0 +2022-11-15 21:46:38,996 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=43210.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:46:42,326 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=43215.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:46:44,824 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.047e+02 1.755e+02 2.083e+02 2.590e+02 5.758e+02, threshold=4.166e+02, percent-clipped=2.0 +2022-11-15 21:46:55,885 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=43235.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:47:13,949 INFO [train.py:876] (2/4) Epoch 6, batch 6900, loss[loss=0.183, simple_loss=0.1975, pruned_loss=0.08421, over 5606.00 frames. ], tot_loss[loss=0.1565, simple_loss=0.1685, pruned_loss=0.07223, over 1088464.97 frames. ], batch size: 38, lr: 1.28e-02, grad_scale: 8.0 +2022-11-15 21:47:24,063 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.57 vs. limit=2.0 +2022-11-15 21:47:44,592 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=43305.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:47:53,738 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.277e+02 1.786e+02 2.260e+02 2.890e+02 4.786e+02, threshold=4.520e+02, percent-clipped=4.0 +2022-11-15 21:48:14,456 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.43 vs. limit=5.0 +2022-11-15 21:48:16,708 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=43353.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:48:22,251 INFO [train.py:876] (2/4) Epoch 6, batch 7000, loss[loss=0.1887, simple_loss=0.1947, pruned_loss=0.09137, over 5706.00 frames. ], tot_loss[loss=0.1588, simple_loss=0.1697, pruned_loss=0.07395, over 1077252.67 frames. ], batch size: 34, lr: 1.28e-02, grad_scale: 8.0 +2022-11-15 21:48:37,165 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.6142, 2.5298, 1.9248, 2.7289, 1.8561, 2.0161, 2.3305, 2.8690], + device='cuda:2'), covar=tensor([0.0723, 0.1270, 0.2934, 0.0739, 0.1796, 0.0968, 0.1563, 0.1135], + device='cuda:2'), in_proj_covar=tensor([0.0071, 0.0071, 0.0088, 0.0062, 0.0071, 0.0068, 0.0079, 0.0054], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 21:48:57,403 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.7320, 2.7863, 2.0602, 2.7574, 1.9745, 2.3225, 2.6018, 3.1543], + device='cuda:2'), covar=tensor([0.1611, 0.1812, 0.5224, 0.2039, 0.2189, 0.1152, 0.1977, 0.1231], + device='cuda:2'), in_proj_covar=tensor([0.0071, 0.0071, 0.0089, 0.0063, 0.0071, 0.0069, 0.0080, 0.0055], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 21:49:02,130 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.099e+02 1.834e+02 2.170e+02 2.645e+02 5.568e+02, threshold=4.340e+02, percent-clipped=1.0 +2022-11-15 21:49:06,163 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.83 vs. limit=2.0 +2022-11-15 21:49:15,846 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.8959, 2.2588, 3.1866, 3.9105, 3.7768, 3.0553, 2.2688, 3.8217], + device='cuda:2'), covar=tensor([0.0265, 0.2494, 0.1527, 0.2175, 0.0637, 0.2160, 0.1953, 0.0393], + device='cuda:2'), in_proj_covar=tensor([0.0198, 0.0212, 0.0201, 0.0325, 0.0220, 0.0216, 0.0194, 0.0200], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0005, 0.0004, 0.0006, 0.0005, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-15 21:49:16,463 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.8561, 1.8276, 1.6880, 2.1132, 1.5247, 1.5113, 1.6154, 2.0619], + device='cuda:2'), covar=tensor([0.1044, 0.1056, 0.2633, 0.0743, 0.1576, 0.1314, 0.1716, 0.0813], + device='cuda:2'), in_proj_covar=tensor([0.0072, 0.0073, 0.0089, 0.0063, 0.0072, 0.0069, 0.0081, 0.0055], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 21:49:30,789 INFO [train.py:876] (2/4) Epoch 6, batch 7100, loss[loss=0.1452, simple_loss=0.1675, pruned_loss=0.06139, over 5656.00 frames. ], tot_loss[loss=0.1584, simple_loss=0.1702, pruned_loss=0.07329, over 1086965.40 frames. ], batch size: 29, lr: 1.28e-02, grad_scale: 8.0 +2022-11-15 21:49:45,388 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.1026, 3.8990, 3.9147, 1.8850, 3.6235, 4.2102, 3.8215, 4.5571], + device='cuda:2'), covar=tensor([0.1637, 0.1253, 0.0827, 0.2860, 0.0385, 0.0267, 0.0241, 0.0264], + device='cuda:2'), in_proj_covar=tensor([0.0182, 0.0187, 0.0143, 0.0190, 0.0159, 0.0165, 0.0136, 0.0174], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-15 21:50:06,734 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=43514.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:50:07,732 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=43515.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:50:08,050 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.31 vs. limit=2.0 +2022-11-15 21:50:10,160 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.966e+01 1.749e+02 2.086e+02 2.685e+02 6.877e+02, threshold=4.173e+02, percent-clipped=3.0 +2022-11-15 21:50:21,414 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=43535.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:50:38,614 INFO [train.py:876] (2/4) Epoch 6, batch 7200, loss[loss=0.1419, simple_loss=0.1674, pruned_loss=0.05821, over 5595.00 frames. ], tot_loss[loss=0.1581, simple_loss=0.1702, pruned_loss=0.07303, over 1091023.50 frames. ], batch size: 24, lr: 1.28e-02, grad_scale: 8.0 +2022-11-15 21:50:40,025 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=43563.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:50:48,394 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=43575.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:50:53,857 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=43583.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:51:17,829 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.100e+02 1.746e+02 2.258e+02 2.937e+02 6.701e+02, threshold=4.517e+02, percent-clipped=5.0 +2022-11-15 21:52:13,101 INFO [train.py:876] (2/4) Epoch 7, batch 0, loss[loss=0.1448, simple_loss=0.1738, pruned_loss=0.05786, over 5846.00 frames. ], tot_loss[loss=0.1448, simple_loss=0.1738, pruned_loss=0.05786, over 5846.00 frames. ], batch size: 18, lr: 1.20e-02, grad_scale: 8.0 +2022-11-15 21:52:13,102 INFO [train.py:899] (2/4) Computing validation loss +2022-11-15 21:52:28,810 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.8666, 3.0875, 3.0376, 2.9521, 3.0205, 2.9248, 1.4087, 3.1186], + device='cuda:2'), covar=tensor([0.0219, 0.0155, 0.0187, 0.0147, 0.0243, 0.0259, 0.2303, 0.0219], + device='cuda:2'), in_proj_covar=tensor([0.0100, 0.0077, 0.0078, 0.0068, 0.0094, 0.0082, 0.0127, 0.0100], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 21:52:29,702 INFO [train.py:908] (2/4) Epoch 7, validation: loss=0.1631, simple_loss=0.1871, pruned_loss=0.06958, over 1530663.00 frames. +2022-11-15 21:52:29,703 INFO [train.py:909] (2/4) Maximum memory allocated so far is 4715MB +2022-11-15 21:53:28,696 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.081e+02 1.724e+02 2.075e+02 2.427e+02 4.341e+02, threshold=4.150e+02, percent-clipped=0.0 +2022-11-15 21:53:37,921 INFO [train.py:876] (2/4) Epoch 7, batch 100, loss[loss=0.1595, simple_loss=0.1706, pruned_loss=0.07423, over 5667.00 frames. ], tot_loss[loss=0.1603, simple_loss=0.1726, pruned_loss=0.074, over 434179.14 frames. ], batch size: 12, lr: 1.20e-02, grad_scale: 8.0 +2022-11-15 21:54:02,879 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.2938, 3.1006, 3.0805, 1.2310, 3.0828, 3.4179, 3.3501, 3.4857], + device='cuda:2'), covar=tensor([0.2002, 0.1478, 0.0868, 0.3341, 0.0393, 0.0383, 0.0324, 0.0693], + device='cuda:2'), in_proj_covar=tensor([0.0184, 0.0189, 0.0148, 0.0194, 0.0159, 0.0169, 0.0139, 0.0179], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-15 21:54:15,373 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=43787.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:54:34,920 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.0075, 4.0914, 3.8272, 3.6502, 2.4037, 4.2416, 2.2691, 3.3982], + device='cuda:2'), covar=tensor([0.0309, 0.0131, 0.0156, 0.0234, 0.0456, 0.0120, 0.0422, 0.0099], + device='cuda:2'), in_proj_covar=tensor([0.0173, 0.0140, 0.0155, 0.0172, 0.0174, 0.0152, 0.0166, 0.0141], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 21:54:37,878 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.176e+02 1.863e+02 2.234e+02 2.721e+02 4.230e+02, threshold=4.468e+02, percent-clipped=1.0 +2022-11-15 21:54:47,143 INFO [train.py:876] (2/4) Epoch 7, batch 200, loss[loss=0.1343, simple_loss=0.1559, pruned_loss=0.05635, over 5441.00 frames. ], tot_loss[loss=0.1591, simple_loss=0.171, pruned_loss=0.07362, over 686919.28 frames. ], batch size: 10, lr: 1.19e-02, grad_scale: 8.0 +2022-11-15 21:54:49,907 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.7250, 5.1729, 4.5172, 5.3293, 5.2734, 4.2801, 4.8180, 4.0955], + device='cuda:2'), covar=tensor([0.0200, 0.0568, 0.2050, 0.0401, 0.0499, 0.0506, 0.0336, 0.0993], + device='cuda:2'), in_proj_covar=tensor([0.0118, 0.0153, 0.0242, 0.0152, 0.0188, 0.0153, 0.0164, 0.0149], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0004, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 21:54:57,721 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=43848.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:55:12,857 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=43870.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:55:30,749 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.9929, 4.4191, 3.8706, 4.5205, 4.4024, 3.7218, 4.0691, 3.9123], + device='cuda:2'), covar=tensor([0.0375, 0.0490, 0.1547, 0.0373, 0.0455, 0.0483, 0.0644, 0.0699], + device='cuda:2'), in_proj_covar=tensor([0.0117, 0.0151, 0.0242, 0.0152, 0.0186, 0.0153, 0.0164, 0.0148], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0004, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 21:55:36,289 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.7334, 1.8583, 1.5334, 1.8609, 1.9205, 1.7642, 1.7344, 1.8342], + device='cuda:2'), covar=tensor([0.0465, 0.0732, 0.1627, 0.0753, 0.0641, 0.0495, 0.0951, 0.0541], + device='cuda:2'), in_proj_covar=tensor([0.0117, 0.0150, 0.0241, 0.0151, 0.0186, 0.0152, 0.0163, 0.0148], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0004, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 21:55:38,983 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=43909.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:55:46,091 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.324e+02 1.863e+02 2.345e+02 2.603e+02 4.668e+02, threshold=4.689e+02, percent-clipped=1.0 +2022-11-15 21:55:55,863 INFO [train.py:876] (2/4) Epoch 7, batch 300, loss[loss=0.2327, simple_loss=0.2124, pruned_loss=0.1265, over 3070.00 frames. ], tot_loss[loss=0.158, simple_loss=0.17, pruned_loss=0.07304, over 840328.98 frames. ], batch size: 284, lr: 1.19e-02, grad_scale: 8.0 +2022-11-15 21:56:04,622 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0567, 2.9356, 2.3699, 1.5677, 2.8140, 1.0920, 2.8337, 1.6912], + device='cuda:2'), covar=tensor([0.1452, 0.0328, 0.0906, 0.2496, 0.0351, 0.2455, 0.0401, 0.1968], + device='cuda:2'), in_proj_covar=tensor([0.0130, 0.0102, 0.0113, 0.0120, 0.0104, 0.0128, 0.0098, 0.0119], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0003, 0.0004, 0.0004, 0.0003, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-11-15 21:56:13,544 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=43959.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:56:21,761 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=43970.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:56:48,030 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.0414, 3.4995, 3.0544, 3.4793, 3.4991, 2.9377, 3.0784, 3.0918], + device='cuda:2'), covar=tensor([0.1129, 0.0464, 0.1428, 0.0510, 0.0432, 0.0452, 0.0641, 0.0520], + device='cuda:2'), in_proj_covar=tensor([0.0118, 0.0153, 0.0243, 0.0153, 0.0187, 0.0155, 0.0166, 0.0150], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0004, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 21:56:51,406 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.9765, 1.2562, 1.4232, 1.2825, 1.3403, 1.2996, 1.4004, 1.1952], + device='cuda:2'), covar=tensor([0.0685, 0.0619, 0.1643, 0.1340, 0.1934, 0.2368, 0.1930, 0.0762], + device='cuda:2'), in_proj_covar=tensor([0.0010, 0.0014, 0.0011, 0.0012, 0.0011, 0.0010, 0.0013, 0.0010], + device='cuda:2'), out_proj_covar=tensor([4.7421e-05, 6.1416e-05, 4.9231e-05, 5.6939e-05, 5.2102e-05, 4.7901e-05, + 5.7630e-05, 4.7860e-05], device='cuda:2') +2022-11-15 21:56:54,568 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.186e+02 1.760e+02 2.035e+02 2.652e+02 4.590e+02, threshold=4.070e+02, percent-clipped=0.0 +2022-11-15 21:56:55,822 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=44020.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:57:04,546 INFO [train.py:876] (2/4) Epoch 7, batch 400, loss[loss=0.1181, simple_loss=0.1466, pruned_loss=0.0448, over 5470.00 frames. ], tot_loss[loss=0.1565, simple_loss=0.1692, pruned_loss=0.07189, over 939609.29 frames. ], batch size: 12, lr: 1.19e-02, grad_scale: 8.0 +2022-11-15 21:57:05,421 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.4967, 1.3326, 1.6531, 1.1274, 1.3530, 1.5665, 1.3337, 1.3791], + device='cuda:2'), covar=tensor([0.1228, 0.0722, 0.0341, 0.0935, 0.0761, 0.0507, 0.0662, 0.0558], + device='cuda:2'), in_proj_covar=tensor([0.0010, 0.0014, 0.0011, 0.0012, 0.0011, 0.0010, 0.0013, 0.0010], + device='cuda:2'), out_proj_covar=tensor([4.7583e-05, 6.1491e-05, 4.9313e-05, 5.7149e-05, 5.2544e-05, 4.8173e-05, + 5.7899e-05, 4.8142e-05], device='cuda:2') +2022-11-15 21:57:31,253 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.23 vs. limit=2.0 +2022-11-15 21:58:02,998 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.245e+02 1.796e+02 2.180e+02 2.809e+02 7.513e+02, threshold=4.360e+02, percent-clipped=4.0 +2022-11-15 21:58:13,185 INFO [train.py:876] (2/4) Epoch 7, batch 500, loss[loss=0.1708, simple_loss=0.1899, pruned_loss=0.07583, over 5545.00 frames. ], tot_loss[loss=0.1581, simple_loss=0.1694, pruned_loss=0.07334, over 993724.31 frames. ], batch size: 21, lr: 1.19e-02, grad_scale: 8.0 +2022-11-15 21:58:20,228 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=44143.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:58:37,757 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=44170.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:59:10,361 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=44218.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 21:59:10,883 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.014e+02 1.778e+02 2.186e+02 2.838e+02 5.081e+02, threshold=4.371e+02, percent-clipped=2.0 +2022-11-15 21:59:20,348 INFO [train.py:876] (2/4) Epoch 7, batch 600, loss[loss=0.154, simple_loss=0.1721, pruned_loss=0.06795, over 5640.00 frames. ], tot_loss[loss=0.1553, simple_loss=0.1682, pruned_loss=0.07119, over 1033477.79 frames. ], batch size: 32, lr: 1.19e-02, grad_scale: 8.0 +2022-11-15 21:59:42,362 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=44265.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:00:16,427 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=44315.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:00:18,962 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.189e+02 1.730e+02 2.067e+02 2.557e+02 5.519e+02, threshold=4.134e+02, percent-clipped=3.0 +2022-11-15 22:00:28,133 INFO [train.py:876] (2/4) Epoch 7, batch 700, loss[loss=0.1268, simple_loss=0.1455, pruned_loss=0.05407, over 5467.00 frames. ], tot_loss[loss=0.1559, simple_loss=0.1685, pruned_loss=0.07167, over 1057743.98 frames. ], batch size: 12, lr: 1.19e-02, grad_scale: 16.0 +2022-11-15 22:00:47,255 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.6488, 1.6135, 1.9661, 1.1711, 1.1391, 2.2328, 1.7971, 1.5288], + device='cuda:2'), covar=tensor([0.0613, 0.0765, 0.0744, 0.2427, 0.2987, 0.2876, 0.1088, 0.1187], + device='cuda:2'), in_proj_covar=tensor([0.0058, 0.0046, 0.0051, 0.0065, 0.0051, 0.0043, 0.0047, 0.0052], + device='cuda:2'), out_proj_covar=tensor([1.2040e-04, 9.8153e-05, 1.0852e-04, 1.3673e-04, 1.1276e-04, 9.9668e-05, + 1.0310e-04, 1.0906e-04], device='cuda:2') +2022-11-15 22:01:27,714 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.022e+02 1.682e+02 2.055e+02 2.418e+02 5.378e+02, threshold=4.109e+02, percent-clipped=3.0 +2022-11-15 22:01:36,454 INFO [train.py:876] (2/4) Epoch 7, batch 800, loss[loss=0.1626, simple_loss=0.1819, pruned_loss=0.07168, over 5724.00 frames. ], tot_loss[loss=0.157, simple_loss=0.1692, pruned_loss=0.07241, over 1071107.23 frames. ], batch size: 27, lr: 1.19e-02, grad_scale: 8.0 +2022-11-15 22:01:43,084 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=44443.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:02:15,337 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.2461, 2.8443, 2.6957, 1.5349, 2.5916, 2.9516, 2.9479, 3.1735], + device='cuda:2'), covar=tensor([0.1829, 0.1388, 0.0776, 0.2680, 0.0471, 0.0471, 0.0254, 0.0658], + device='cuda:2'), in_proj_covar=tensor([0.0184, 0.0189, 0.0148, 0.0189, 0.0162, 0.0167, 0.0139, 0.0177], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-15 22:02:15,859 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=44491.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:02:35,513 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.359e+02 1.866e+02 2.294e+02 2.881e+02 8.701e+02, threshold=4.588e+02, percent-clipped=3.0 +2022-11-15 22:02:40,770 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=44527.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:02:44,440 INFO [train.py:876] (2/4) Epoch 7, batch 900, loss[loss=0.1014, simple_loss=0.1354, pruned_loss=0.03372, over 5763.00 frames. ], tot_loss[loss=0.1569, simple_loss=0.1695, pruned_loss=0.07216, over 1079276.75 frames. ], batch size: 14, lr: 1.19e-02, grad_scale: 8.0 +2022-11-15 22:02:44,669 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.5000, 3.5089, 3.3749, 3.1775, 2.1048, 3.4031, 1.9817, 3.0185], + device='cuda:2'), covar=tensor([0.0303, 0.0126, 0.0165, 0.0277, 0.0385, 0.0126, 0.0413, 0.0125], + device='cuda:2'), in_proj_covar=tensor([0.0175, 0.0139, 0.0156, 0.0171, 0.0173, 0.0154, 0.0165, 0.0142], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 22:02:56,141 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=44550.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:03:06,233 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=44565.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:03:22,319 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=44588.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:03:26,938 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.7333, 2.5979, 2.9721, 3.7189, 3.7648, 3.0016, 2.1953, 3.9495], + device='cuda:2'), covar=tensor([0.0558, 0.3125, 0.2698, 0.2766, 0.1155, 0.2876, 0.2660, 0.0447], + device='cuda:2'), in_proj_covar=tensor([0.0202, 0.0205, 0.0204, 0.0328, 0.0224, 0.0213, 0.0192, 0.0204], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0005, 0.0006, 0.0005, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-15 22:03:27,208 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.97 vs. limit=2.0 +2022-11-15 22:03:37,751 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=44611.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:03:39,299 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=44613.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:03:40,643 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=44615.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:03:44,048 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.116e+02 1.775e+02 2.262e+02 2.701e+02 4.427e+02, threshold=4.524e+02, percent-clipped=0.0 +2022-11-15 22:03:46,213 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.1574, 1.5227, 1.8052, 1.0501, 0.8847, 2.3434, 1.5068, 1.4993], + device='cuda:2'), covar=tensor([0.0908, 0.0639, 0.0631, 0.1636, 0.2233, 0.0626, 0.1450, 0.1028], + device='cuda:2'), in_proj_covar=tensor([0.0058, 0.0046, 0.0052, 0.0065, 0.0052, 0.0043, 0.0047, 0.0053], + device='cuda:2'), out_proj_covar=tensor([1.2120e-04, 9.8411e-05, 1.0943e-04, 1.3646e-04, 1.1428e-04, 9.9542e-05, + 1.0429e-04, 1.1103e-04], device='cuda:2') +2022-11-15 22:03:52,916 INFO [train.py:876] (2/4) Epoch 7, batch 1000, loss[loss=0.1589, simple_loss=0.1619, pruned_loss=0.078, over 5591.00 frames. ], tot_loss[loss=0.1594, simple_loss=0.1713, pruned_loss=0.07373, over 1080810.45 frames. ], batch size: 18, lr: 1.18e-02, grad_scale: 8.0 +2022-11-15 22:04:13,019 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=44663.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:04:18,333 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.96 vs. limit=2.0 +2022-11-15 22:04:33,740 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.8243, 3.2241, 2.3215, 3.0090, 2.2107, 2.4139, 1.8271, 2.7834], + device='cuda:2'), covar=tensor([0.1423, 0.0245, 0.0865, 0.0301, 0.1126, 0.0959, 0.1849, 0.0382], + device='cuda:2'), in_proj_covar=tensor([0.0171, 0.0132, 0.0166, 0.0135, 0.0173, 0.0181, 0.0180, 0.0147], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-15 22:04:52,369 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.130e+02 1.670e+02 2.114e+02 2.520e+02 5.115e+02, threshold=4.229e+02, percent-clipped=2.0 +2022-11-15 22:05:01,298 INFO [train.py:876] (2/4) Epoch 7, batch 1100, loss[loss=0.08246, simple_loss=0.1126, pruned_loss=0.02618, over 4980.00 frames. ], tot_loss[loss=0.1579, simple_loss=0.1699, pruned_loss=0.07288, over 1078391.30 frames. ], batch size: 7, lr: 1.18e-02, grad_scale: 8.0 +2022-11-15 22:05:18,727 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.0805, 2.2471, 3.0000, 3.8017, 4.1921, 3.0573, 2.3870, 4.0660], + device='cuda:2'), covar=tensor([0.0410, 0.4015, 0.2716, 0.3374, 0.0937, 0.3414, 0.2381, 0.0527], + device='cuda:2'), in_proj_covar=tensor([0.0206, 0.0212, 0.0207, 0.0339, 0.0228, 0.0219, 0.0194, 0.0209], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0004, 0.0005], + device='cuda:2') +2022-11-15 22:05:54,959 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.7580, 4.1480, 3.7512, 3.4561, 2.3309, 4.0782, 2.3055, 3.2806], + device='cuda:2'), covar=tensor([0.0385, 0.0114, 0.0166, 0.0360, 0.0454, 0.0149, 0.0406, 0.0127], + device='cuda:2'), in_proj_covar=tensor([0.0176, 0.0137, 0.0155, 0.0172, 0.0172, 0.0154, 0.0165, 0.0142], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 22:06:00,679 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.112e+02 1.756e+02 2.130e+02 2.754e+02 4.425e+02, threshold=4.261e+02, percent-clipped=1.0 +2022-11-15 22:06:08,895 INFO [train.py:876] (2/4) Epoch 7, batch 1200, loss[loss=0.2095, simple_loss=0.1988, pruned_loss=0.1101, over 5351.00 frames. ], tot_loss[loss=0.158, simple_loss=0.1704, pruned_loss=0.07284, over 1080351.94 frames. ], batch size: 70, lr: 1.18e-02, grad_scale: 8.0 +2022-11-15 22:06:26,625 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0924, 3.8372, 2.7079, 3.6278, 2.9068, 2.8157, 1.9911, 3.2788], + device='cuda:2'), covar=tensor([0.1479, 0.0246, 0.0963, 0.0356, 0.0707, 0.0910, 0.1986, 0.0323], + device='cuda:2'), in_proj_covar=tensor([0.0172, 0.0133, 0.0166, 0.0136, 0.0173, 0.0181, 0.0178, 0.0146], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003], + device='cuda:2') +2022-11-15 22:06:42,976 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=44883.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:06:59,243 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=44906.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:07:08,227 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.257e+02 1.841e+02 2.145e+02 2.633e+02 4.840e+02, threshold=4.289e+02, percent-clipped=5.0 +2022-11-15 22:07:16,623 INFO [train.py:876] (2/4) Epoch 7, batch 1300, loss[loss=0.1515, simple_loss=0.1743, pruned_loss=0.06432, over 5664.00 frames. ], tot_loss[loss=0.1594, simple_loss=0.171, pruned_loss=0.07396, over 1075324.03 frames. ], batch size: 29, lr: 1.18e-02, grad_scale: 8.0 +2022-11-15 22:07:18,154 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.7693, 1.9307, 2.2736, 2.8295, 2.9209, 2.2125, 1.7119, 2.9926], + device='cuda:2'), covar=tensor([0.0710, 0.2379, 0.2065, 0.1543, 0.0977, 0.2512, 0.2117, 0.0631], + device='cuda:2'), in_proj_covar=tensor([0.0201, 0.0207, 0.0203, 0.0329, 0.0222, 0.0216, 0.0193, 0.0207], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0005, 0.0006, 0.0005, 0.0005, 0.0004, 0.0005], + device='cuda:2') +2022-11-15 22:08:19,524 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.856e+01 1.627e+02 2.176e+02 2.698e+02 5.044e+02, threshold=4.352e+02, percent-clipped=2.0 +2022-11-15 22:08:25,663 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.7701, 2.0479, 1.8078, 1.4855, 1.5915, 2.2782, 2.0750, 2.3127], + device='cuda:2'), covar=tensor([0.1500, 0.1209, 0.1516, 0.2224, 0.0991, 0.0623, 0.0526, 0.0814], + device='cuda:2'), in_proj_covar=tensor([0.0181, 0.0189, 0.0150, 0.0188, 0.0163, 0.0171, 0.0141, 0.0179], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-15 22:08:28,069 INFO [train.py:876] (2/4) Epoch 7, batch 1400, loss[loss=0.1527, simple_loss=0.1812, pruned_loss=0.06214, over 5698.00 frames. ], tot_loss[loss=0.1583, simple_loss=0.17, pruned_loss=0.07332, over 1072717.86 frames. ], batch size: 28, lr: 1.18e-02, grad_scale: 8.0 +2022-11-15 22:08:43,534 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=45056.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:08:47,021 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.8967, 2.8723, 2.5025, 2.8423, 2.8504, 2.4669, 2.5509, 2.5706], + device='cuda:2'), covar=tensor([0.0298, 0.0521, 0.1463, 0.0485, 0.0589, 0.0536, 0.0862, 0.0626], + device='cuda:2'), in_proj_covar=tensor([0.0117, 0.0154, 0.0242, 0.0152, 0.0189, 0.0154, 0.0164, 0.0149], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0004, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 22:09:00,836 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=45081.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:09:07,997 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1153, 3.2205, 3.1013, 2.9503, 1.7507, 3.0779, 2.0242, 2.7369], + device='cuda:2'), covar=tensor([0.0363, 0.0123, 0.0137, 0.0243, 0.0432, 0.0127, 0.0367, 0.0117], + device='cuda:2'), in_proj_covar=tensor([0.0180, 0.0141, 0.0158, 0.0177, 0.0175, 0.0159, 0.0169, 0.0147], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-15 22:09:24,843 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=45117.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 22:09:26,571 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.027e+02 1.653e+02 2.032e+02 2.682e+02 5.131e+02, threshold=4.064e+02, percent-clipped=2.0 +2022-11-15 22:09:35,813 INFO [train.py:876] (2/4) Epoch 7, batch 1500, loss[loss=0.2126, simple_loss=0.2136, pruned_loss=0.1058, over 5565.00 frames. ], tot_loss[loss=0.1553, simple_loss=0.1683, pruned_loss=0.07116, over 1082062.28 frames. ], batch size: 54, lr: 1.18e-02, grad_scale: 8.0 +2022-11-15 22:09:41,811 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=45142.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:09:59,688 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.6111, 0.9709, 1.8323, 1.2112, 1.0939, 1.3513, 1.5876, 1.3068], + device='cuda:2'), covar=tensor([0.0078, 0.0041, 0.0033, 0.0049, 0.0104, 0.0057, 0.0036, 0.0049], + device='cuda:2'), in_proj_covar=tensor([0.0018, 0.0019, 0.0019, 0.0023, 0.0021, 0.0018, 0.0022, 0.0022], + device='cuda:2'), out_proj_covar=tensor([1.7675e-05, 1.8669e-05, 1.8312e-05, 2.2761e-05, 2.0198e-05, 1.8700e-05, + 2.2174e-05, 2.3592e-05], device='cuda:2') +2022-11-15 22:10:09,060 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=45183.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:10:16,649 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.23 vs. limit=5.0 +2022-11-15 22:10:24,972 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=45206.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:10:27,688 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.46 vs. limit=2.0 +2022-11-15 22:10:32,051 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.2663, 4.0014, 3.9004, 3.6825, 4.1694, 3.8436, 1.5050, 4.3416], + device='cuda:2'), covar=tensor([0.0281, 0.0394, 0.0278, 0.0531, 0.0345, 0.0533, 0.3303, 0.0301], + device='cuda:2'), in_proj_covar=tensor([0.0103, 0.0078, 0.0079, 0.0071, 0.0099, 0.0083, 0.0132, 0.0104], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 22:10:32,793 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.7524, 1.1641, 1.1292, 0.7081, 1.0223, 1.1513, 0.8723, 1.2334], + device='cuda:2'), covar=tensor([0.0046, 0.0033, 0.0047, 0.0047, 0.0042, 0.0034, 0.0071, 0.0026], + device='cuda:2'), in_proj_covar=tensor([0.0037, 0.0035, 0.0037, 0.0037, 0.0033, 0.0030, 0.0036, 0.0029], + device='cuda:2'), out_proj_covar=tensor([3.4341e-05, 3.3312e-05, 3.3608e-05, 3.4565e-05, 2.9420e-05, 2.5357e-05, + 3.4976e-05, 2.5759e-05], device='cuda:2') +2022-11-15 22:10:34,226 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.120e+01 1.667e+02 2.029e+02 2.581e+02 4.326e+02, threshold=4.059e+02, percent-clipped=3.0 +2022-11-15 22:10:36,362 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.0419, 4.6809, 3.4852, 2.0228, 4.4353, 1.7694, 4.1794, 2.6662], + device='cuda:2'), covar=tensor([0.1216, 0.0136, 0.0389, 0.2244, 0.0147, 0.1960, 0.0244, 0.1527], + device='cuda:2'), in_proj_covar=tensor([0.0130, 0.0102, 0.0113, 0.0118, 0.0105, 0.0129, 0.0097, 0.0120], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0003, 0.0004, 0.0004, 0.0003, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-11-15 22:10:41,637 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=45231.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:10:42,913 INFO [train.py:876] (2/4) Epoch 7, batch 1600, loss[loss=0.1567, simple_loss=0.1703, pruned_loss=0.07153, over 5543.00 frames. ], tot_loss[loss=0.1554, simple_loss=0.1686, pruned_loss=0.07116, over 1087872.81 frames. ], batch size: 25, lr: 1.18e-02, grad_scale: 8.0 +2022-11-15 22:10:47,479 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.7328, 0.8646, 1.8287, 1.2935, 1.0547, 1.4086, 1.4313, 1.3809], + device='cuda:2'), covar=tensor([0.0044, 0.0077, 0.0031, 0.0038, 0.0077, 0.0063, 0.0034, 0.0035], + device='cuda:2'), in_proj_covar=tensor([0.0019, 0.0019, 0.0020, 0.0023, 0.0022, 0.0019, 0.0023, 0.0023], + device='cuda:2'), out_proj_covar=tensor([1.8512e-05, 1.9235e-05, 1.8852e-05, 2.3548e-05, 2.0855e-05, 1.9340e-05, + 2.2859e-05, 2.4869e-05], device='cuda:2') +2022-11-15 22:10:57,951 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=45254.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:11:18,283 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.75 vs. limit=2.0 +2022-11-15 22:11:42,146 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.135e+02 1.688e+02 2.070e+02 2.514e+02 3.710e+02, threshold=4.140e+02, percent-clipped=0.0 +2022-11-15 22:11:51,086 INFO [train.py:876] (2/4) Epoch 7, batch 1700, loss[loss=0.137, simple_loss=0.1561, pruned_loss=0.05901, over 5542.00 frames. ], tot_loss[loss=0.1532, simple_loss=0.167, pruned_loss=0.0697, over 1091384.60 frames. ], batch size: 21, lr: 1.18e-02, grad_scale: 8.0 +2022-11-15 22:11:59,551 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.7918, 1.8511, 1.9209, 2.2255, 1.8524, 1.7166, 1.7157, 2.2045], + device='cuda:2'), covar=tensor([0.1497, 0.1873, 0.2654, 0.0933, 0.1636, 0.2721, 0.1811, 0.0665], + device='cuda:2'), in_proj_covar=tensor([0.0078, 0.0080, 0.0092, 0.0070, 0.0077, 0.0075, 0.0089, 0.0061], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 22:12:01,495 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=45348.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 22:12:03,286 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.4834, 3.0577, 3.1820, 2.9381, 2.0152, 3.0549, 2.1024, 2.5041], + device='cuda:2'), covar=tensor([0.0225, 0.0090, 0.0109, 0.0188, 0.0286, 0.0101, 0.0286, 0.0108], + device='cuda:2'), in_proj_covar=tensor([0.0177, 0.0140, 0.0157, 0.0175, 0.0174, 0.0157, 0.0166, 0.0146], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-15 22:12:18,360 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.82 vs. limit=2.0 +2022-11-15 22:12:21,834 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.9380, 3.0071, 2.3939, 1.3476, 2.7777, 1.0329, 2.9699, 1.7245], + device='cuda:2'), covar=tensor([0.1745, 0.0351, 0.0850, 0.3153, 0.0426, 0.2894, 0.0333, 0.2462], + device='cuda:2'), in_proj_covar=tensor([0.0128, 0.0102, 0.0112, 0.0116, 0.0104, 0.0127, 0.0095, 0.0119], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0003, 0.0004, 0.0004, 0.0003, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-11-15 22:12:42,830 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=45409.0, num_to_drop=1, layers_to_drop={3} +2022-11-15 22:12:45,013 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=45412.0, num_to_drop=1, layers_to_drop={3} +2022-11-15 22:12:50,414 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.019e+02 1.783e+02 2.189e+02 2.743e+02 5.258e+02, threshold=4.378e+02, percent-clipped=6.0 +2022-11-15 22:12:58,915 INFO [train.py:876] (2/4) Epoch 7, batch 1800, loss[loss=0.2054, simple_loss=0.1989, pruned_loss=0.1059, over 5463.00 frames. ], tot_loss[loss=0.1518, simple_loss=0.1658, pruned_loss=0.06885, over 1088762.86 frames. ], batch size: 53, lr: 1.17e-02, grad_scale: 8.0 +2022-11-15 22:13:01,578 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=45437.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:13:54,907 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.28 vs. limit=2.0 +2022-11-15 22:13:57,535 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.508e+01 1.821e+02 2.184e+02 2.835e+02 7.738e+02, threshold=4.367e+02, percent-clipped=5.0 +2022-11-15 22:14:06,287 INFO [train.py:876] (2/4) Epoch 7, batch 1900, loss[loss=0.2199, simple_loss=0.2022, pruned_loss=0.1188, over 4981.00 frames. ], tot_loss[loss=0.155, simple_loss=0.1679, pruned_loss=0.07106, over 1087699.75 frames. ], batch size: 109, lr: 1.17e-02, grad_scale: 8.0 +2022-11-15 22:14:35,701 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.73 vs. limit=2.0 +2022-11-15 22:15:04,862 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.287e+02 1.823e+02 2.221e+02 2.683e+02 3.782e+02, threshold=4.442e+02, percent-clipped=0.0 +2022-11-15 22:15:13,881 INFO [train.py:876] (2/4) Epoch 7, batch 2000, loss[loss=0.2215, simple_loss=0.2176, pruned_loss=0.1127, over 5131.00 frames. ], tot_loss[loss=0.1521, simple_loss=0.1662, pruned_loss=0.06903, over 1091248.16 frames. ], batch size: 91, lr: 1.17e-02, grad_scale: 8.0 +2022-11-15 22:15:47,697 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.2922, 3.8533, 4.1234, 3.9019, 4.3838, 4.1414, 4.0536, 4.3259], + device='cuda:2'), covar=tensor([0.0371, 0.0289, 0.0421, 0.0309, 0.0326, 0.0221, 0.0237, 0.0289], + device='cuda:2'), in_proj_covar=tensor([0.0115, 0.0125, 0.0093, 0.0127, 0.0136, 0.0080, 0.0107, 0.0123], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-15 22:16:01,071 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=45702.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:16:02,308 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=45704.0, num_to_drop=1, layers_to_drop={2} +2022-11-15 22:16:07,690 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=45712.0, num_to_drop=1, layers_to_drop={2} +2022-11-15 22:16:12,593 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.145e+02 1.738e+02 2.020e+02 2.541e+02 5.348e+02, threshold=4.040e+02, percent-clipped=1.0 +2022-11-15 22:16:21,531 INFO [train.py:876] (2/4) Epoch 7, batch 2100, loss[loss=0.1239, simple_loss=0.149, pruned_loss=0.04941, over 5716.00 frames. ], tot_loss[loss=0.1529, simple_loss=0.1667, pruned_loss=0.06958, over 1088149.22 frames. ], batch size: 15, lr: 1.17e-02, grad_scale: 8.0 +2022-11-15 22:16:24,227 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=45737.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:16:39,955 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=45760.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:16:42,035 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=45763.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:16:49,036 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=45774.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:16:55,982 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=45785.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:17:20,075 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.330e+01 1.725e+02 2.150e+02 2.571e+02 5.272e+02, threshold=4.300e+02, percent-clipped=3.0 +2022-11-15 22:17:28,683 INFO [train.py:876] (2/4) Epoch 7, batch 2200, loss[loss=0.128, simple_loss=0.1557, pruned_loss=0.05013, over 5583.00 frames. ], tot_loss[loss=0.1526, simple_loss=0.1668, pruned_loss=0.06924, over 1090748.93 frames. ], batch size: 24, lr: 1.17e-02, grad_scale: 8.0 +2022-11-15 22:17:30,168 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=45835.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:17:38,976 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.3502, 2.2816, 2.5360, 1.4040, 1.2148, 3.3518, 2.1086, 1.8153], + device='cuda:2'), covar=tensor([0.0713, 0.0617, 0.0630, 0.1761, 0.3480, 0.0573, 0.2780, 0.0782], + device='cuda:2'), in_proj_covar=tensor([0.0063, 0.0049, 0.0054, 0.0067, 0.0055, 0.0045, 0.0049, 0.0054], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001], + device='cuda:2') +2022-11-15 22:17:46,369 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=45859.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:18:15,687 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.9777, 2.0935, 3.5776, 3.0244, 4.1641, 2.3846, 3.4769, 4.0127], + device='cuda:2'), covar=tensor([0.0369, 0.1769, 0.0696, 0.1410, 0.0299, 0.1389, 0.0845, 0.0653], + device='cuda:2'), in_proj_covar=tensor([0.0211, 0.0189, 0.0191, 0.0207, 0.0195, 0.0183, 0.0221, 0.0207], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004], + device='cuda:2') +2022-11-15 22:18:22,755 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.80 vs. limit=2.0 +2022-11-15 22:18:27,932 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.053e+02 1.659e+02 2.110e+02 2.622e+02 5.586e+02, threshold=4.221e+02, percent-clipped=2.0 +2022-11-15 22:18:28,127 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=45920.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:18:36,866 INFO [train.py:876] (2/4) Epoch 7, batch 2300, loss[loss=0.2039, simple_loss=0.201, pruned_loss=0.1034, over 5462.00 frames. ], tot_loss[loss=0.1518, simple_loss=0.1661, pruned_loss=0.06877, over 1092838.32 frames. ], batch size: 58, lr: 1.17e-02, grad_scale: 8.0 +2022-11-15 22:19:17,694 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.4301, 5.0500, 3.7713, 2.2288, 4.7065, 2.2369, 4.3176, 3.0583], + device='cuda:2'), covar=tensor([0.0971, 0.0106, 0.0510, 0.1918, 0.0153, 0.1446, 0.0202, 0.1201], + device='cuda:2'), in_proj_covar=tensor([0.0125, 0.0102, 0.0113, 0.0115, 0.0103, 0.0127, 0.0095, 0.0117], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0003, 0.0004, 0.0004, 0.0003, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-11-15 22:19:18,925 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.0534, 4.6431, 4.8737, 4.6456, 5.1267, 5.0398, 4.6229, 5.1344], + device='cuda:2'), covar=tensor([0.0389, 0.0268, 0.0409, 0.0354, 0.0372, 0.0126, 0.0225, 0.0248], + device='cuda:2'), in_proj_covar=tensor([0.0115, 0.0127, 0.0094, 0.0128, 0.0137, 0.0081, 0.0108, 0.0125], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-15 22:19:25,065 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=46004.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 22:19:35,542 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.089e+02 1.800e+02 2.204e+02 2.753e+02 4.636e+02, threshold=4.408e+02, percent-clipped=3.0 +2022-11-15 22:19:44,587 INFO [train.py:876] (2/4) Epoch 7, batch 2400, loss[loss=0.1972, simple_loss=0.1938, pruned_loss=0.1003, over 5545.00 frames. ], tot_loss[loss=0.1542, simple_loss=0.1671, pruned_loss=0.07063, over 1087322.57 frames. ], batch size: 46, lr: 1.17e-02, grad_scale: 8.0 +2022-11-15 22:19:50,980 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.3585, 2.0300, 1.5999, 1.7579, 1.0302, 1.6555, 1.2921, 1.8664], + device='cuda:2'), covar=tensor([0.0724, 0.0219, 0.0632, 0.0406, 0.1205, 0.0684, 0.1151, 0.0339], + device='cuda:2'), in_proj_covar=tensor([0.0173, 0.0135, 0.0169, 0.0141, 0.0174, 0.0180, 0.0181, 0.0147], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-15 22:19:54,695 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.01 vs. limit=2.0 +2022-11-15 22:19:57,578 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=46052.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 22:20:01,771 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=46058.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:20:03,161 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.8540, 2.0408, 2.2577, 1.4903, 1.1600, 2.8343, 2.1715, 1.7810], + device='cuda:2'), covar=tensor([0.0953, 0.0730, 0.0762, 0.2351, 0.1854, 0.2640, 0.0891, 0.1009], + device='cuda:2'), in_proj_covar=tensor([0.0066, 0.0051, 0.0057, 0.0070, 0.0057, 0.0047, 0.0049, 0.0057], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001], + device='cuda:2') +2022-11-15 22:20:43,098 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.181e+02 1.760e+02 2.143e+02 2.682e+02 4.652e+02, threshold=4.287e+02, percent-clipped=1.0 +2022-11-15 22:20:46,477 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.4782, 1.1801, 1.9790, 1.1934, 1.5093, 1.6129, 1.1377, 1.0053], + device='cuda:2'), covar=tensor([0.0022, 0.0053, 0.0017, 0.0037, 0.0026, 0.0041, 0.0031, 0.0041], + device='cuda:2'), in_proj_covar=tensor([0.0019, 0.0019, 0.0019, 0.0023, 0.0021, 0.0018, 0.0023, 0.0023], + device='cuda:2'), out_proj_covar=tensor([1.7604e-05, 1.8711e-05, 1.7979e-05, 2.2864e-05, 1.9936e-05, 1.8395e-05, + 2.2718e-05, 2.4285e-05], device='cuda:2') +2022-11-15 22:20:49,995 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=46130.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:20:51,913 INFO [train.py:876] (2/4) Epoch 7, batch 2500, loss[loss=0.1506, simple_loss=0.1685, pruned_loss=0.06638, over 5692.00 frames. ], tot_loss[loss=0.1519, simple_loss=0.1656, pruned_loss=0.06911, over 1091741.44 frames. ], batch size: 19, lr: 1.17e-02, grad_scale: 8.0 +2022-11-15 22:21:09,706 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1266, 3.1600, 2.4004, 1.7052, 3.0283, 1.2055, 3.0619, 1.7379], + device='cuda:2'), covar=tensor([0.1189, 0.0170, 0.0935, 0.1795, 0.0229, 0.2085, 0.0225, 0.1690], + device='cuda:2'), in_proj_covar=tensor([0.0126, 0.0104, 0.0114, 0.0119, 0.0104, 0.0129, 0.0097, 0.0119], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-11-15 22:21:47,948 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=46215.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:21:51,095 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.093e+02 1.657e+02 1.935e+02 2.390e+02 4.850e+02, threshold=3.869e+02, percent-clipped=1.0 +2022-11-15 22:21:57,593 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.4178, 2.0347, 2.2218, 1.5508, 1.5199, 3.2361, 2.0163, 2.1427], + device='cuda:2'), covar=tensor([0.0750, 0.0706, 0.0588, 0.2298, 0.2431, 0.0964, 0.1420, 0.0900], + device='cuda:2'), in_proj_covar=tensor([0.0067, 0.0052, 0.0058, 0.0071, 0.0057, 0.0046, 0.0051, 0.0058], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001], + device='cuda:2') +2022-11-15 22:21:59,980 INFO [train.py:876] (2/4) Epoch 7, batch 2600, loss[loss=0.1015, simple_loss=0.1366, pruned_loss=0.0332, over 5719.00 frames. ], tot_loss[loss=0.15, simple_loss=0.1643, pruned_loss=0.06785, over 1087737.45 frames. ], batch size: 13, lr: 1.16e-02, grad_scale: 8.0 +2022-11-15 22:22:13,769 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.5798, 5.3638, 5.5878, 5.4253, 5.1887, 4.9805, 6.1005, 5.2689], + device='cuda:2'), covar=tensor([0.0257, 0.0670, 0.0236, 0.0940, 0.0308, 0.0276, 0.0461, 0.0355], + device='cuda:2'), in_proj_covar=tensor([0.0074, 0.0097, 0.0081, 0.0105, 0.0078, 0.0065, 0.0132, 0.0088], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 22:22:29,332 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=46276.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:22:59,257 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.157e+02 1.863e+02 2.232e+02 2.729e+02 4.275e+02, threshold=4.463e+02, percent-clipped=4.0 +2022-11-15 22:23:07,788 INFO [train.py:876] (2/4) Epoch 7, batch 2700, loss[loss=0.1761, simple_loss=0.1904, pruned_loss=0.08097, over 5655.00 frames. ], tot_loss[loss=0.1512, simple_loss=0.1655, pruned_loss=0.06851, over 1089205.80 frames. ], batch size: 29, lr: 1.16e-02, grad_scale: 8.0 +2022-11-15 22:23:10,635 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=46337.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:23:21,646 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.61 vs. limit=5.0 +2022-11-15 22:23:25,127 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=46358.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:23:59,109 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=46406.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:24:09,520 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.125e+02 1.800e+02 2.089e+02 2.479e+02 4.263e+02, threshold=4.179e+02, percent-clipped=0.0 +2022-11-15 22:24:16,894 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=46430.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:24:18,866 INFO [train.py:876] (2/4) Epoch 7, batch 2800, loss[loss=0.1341, simple_loss=0.1577, pruned_loss=0.05526, over 5498.00 frames. ], tot_loss[loss=0.1495, simple_loss=0.164, pruned_loss=0.06756, over 1087050.08 frames. ], batch size: 17, lr: 1.16e-02, grad_scale: 16.0 +2022-11-15 22:24:24,019 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.53 vs. limit=5.0 +2022-11-15 22:24:31,575 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.80 vs. limit=2.0 +2022-11-15 22:24:47,612 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.4014, 1.8782, 1.5458, 0.9022, 1.5016, 1.5022, 1.4043, 1.6053], + device='cuda:2'), covar=tensor([0.0029, 0.0028, 0.0023, 0.0036, 0.0021, 0.0018, 0.0026, 0.0031], + device='cuda:2'), in_proj_covar=tensor([0.0036, 0.0033, 0.0035, 0.0036, 0.0032, 0.0029, 0.0034, 0.0029], + device='cuda:2'), out_proj_covar=tensor([3.3598e-05, 3.1866e-05, 3.1556e-05, 3.2884e-05, 2.8685e-05, 2.5000e-05, + 3.2951e-05, 2.5714e-05], device='cuda:2') +2022-11-15 22:24:49,434 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=46478.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:24:58,836 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.87 vs. limit=2.0 +2022-11-15 22:25:14,648 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=46515.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:25:18,216 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.078e+02 1.794e+02 2.220e+02 2.623e+02 4.968e+02, threshold=4.440e+02, percent-clipped=2.0 +2022-11-15 22:25:27,104 INFO [train.py:876] (2/4) Epoch 7, batch 2900, loss[loss=0.1095, simple_loss=0.1311, pruned_loss=0.04398, over 5750.00 frames. ], tot_loss[loss=0.1493, simple_loss=0.164, pruned_loss=0.06734, over 1081222.64 frames. ], batch size: 21, lr: 1.16e-02, grad_scale: 16.0 +2022-11-15 22:25:34,659 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.94 vs. limit=2.0 +2022-11-15 22:25:46,475 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.2394, 4.6491, 2.9082, 4.4851, 3.5270, 3.0332, 2.4351, 3.7815], + device='cuda:2'), covar=tensor([0.1734, 0.0174, 0.1166, 0.0244, 0.0539, 0.0975, 0.2061, 0.0349], + device='cuda:2'), in_proj_covar=tensor([0.0169, 0.0133, 0.0166, 0.0136, 0.0171, 0.0176, 0.0174, 0.0143], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003], + device='cuda:2') +2022-11-15 22:25:47,079 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=46563.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:25:53,831 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.3167, 2.2127, 2.4066, 1.3757, 1.5139, 3.0775, 2.4057, 1.8863], + device='cuda:2'), covar=tensor([0.0863, 0.0965, 0.0725, 0.3277, 0.2888, 0.0461, 0.1761, 0.1049], + device='cuda:2'), in_proj_covar=tensor([0.0067, 0.0054, 0.0057, 0.0072, 0.0056, 0.0047, 0.0051, 0.0058], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0002, 0.0001, 0.0001, 0.0001, 0.0001], + device='cuda:2') +2022-11-15 22:26:08,953 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=46595.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:26:13,982 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=46602.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:26:19,088 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.6867, 2.4460, 2.2131, 1.6648, 2.3197, 2.8210, 2.6107, 3.0070], + device='cuda:2'), covar=tensor([0.1794, 0.1369, 0.1087, 0.2199, 0.0513, 0.0446, 0.0407, 0.0652], + device='cuda:2'), in_proj_covar=tensor([0.0177, 0.0186, 0.0150, 0.0188, 0.0163, 0.0173, 0.0143, 0.0179], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-15 22:26:20,411 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.26 vs. limit=5.0 +2022-11-15 22:26:26,117 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.182e+02 1.852e+02 2.208e+02 2.830e+02 6.179e+02, threshold=4.416e+02, percent-clipped=4.0 +2022-11-15 22:26:34,366 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=46632.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:26:34,920 INFO [train.py:876] (2/4) Epoch 7, batch 3000, loss[loss=0.1172, simple_loss=0.1426, pruned_loss=0.0459, over 5606.00 frames. ], tot_loss[loss=0.1507, simple_loss=0.1643, pruned_loss=0.06857, over 1077004.62 frames. ], batch size: 18, lr: 1.16e-02, grad_scale: 16.0 +2022-11-15 22:26:34,920 INFO [train.py:899] (2/4) Computing validation loss +2022-11-15 22:26:52,608 INFO [train.py:908] (2/4) Epoch 7, validation: loss=0.1596, simple_loss=0.1815, pruned_loss=0.06886, over 1530663.00 frames. +2022-11-15 22:26:52,609 INFO [train.py:909] (2/4) Maximum memory allocated so far is 4715MB +2022-11-15 22:26:58,046 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.1739, 3.1987, 3.3980, 3.4466, 3.2807, 4.0576, 3.8001, 3.4239], + device='cuda:2'), covar=tensor([0.1083, 0.0874, 0.1872, 0.0885, 0.1478, 0.0792, 0.1224, 0.2905], + device='cuda:2'), in_proj_covar=tensor([0.0078, 0.0080, 0.0093, 0.0072, 0.0078, 0.0075, 0.0087, 0.0061], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 22:27:08,201 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=46656.0, num_to_drop=1, layers_to_drop={3} +2022-11-15 22:27:12,637 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=46663.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:27:19,026 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.97 vs. limit=2.0 +2022-11-15 22:27:50,896 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.581e+01 1.880e+02 2.186e+02 2.617e+02 4.736e+02, threshold=4.372e+02, percent-clipped=3.0 +2022-11-15 22:27:59,286 INFO [train.py:876] (2/4) Epoch 7, batch 3100, loss[loss=0.1658, simple_loss=0.1554, pruned_loss=0.08805, over 4140.00 frames. ], tot_loss[loss=0.1502, simple_loss=0.1643, pruned_loss=0.06801, over 1081366.00 frames. ], batch size: 181, lr: 1.16e-02, grad_scale: 16.0 +2022-11-15 22:28:15,885 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.87 vs. limit=2.0 +2022-11-15 22:28:51,917 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=46810.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:28:58,609 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.858e+01 1.618e+02 2.001e+02 2.391e+02 5.637e+02, threshold=4.002e+02, percent-clipped=1.0 +2022-11-15 22:29:07,484 INFO [train.py:876] (2/4) Epoch 7, batch 3200, loss[loss=0.1298, simple_loss=0.1568, pruned_loss=0.05141, over 5579.00 frames. ], tot_loss[loss=0.1513, simple_loss=0.1656, pruned_loss=0.06855, over 1082871.43 frames. ], batch size: 18, lr: 1.16e-02, grad_scale: 16.0 +2022-11-15 22:29:07,618 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.3661, 1.8556, 2.2102, 1.4146, 1.3028, 2.5576, 2.1341, 1.6568], + device='cuda:2'), covar=tensor([0.0907, 0.1132, 0.0686, 0.2293, 0.2049, 0.0694, 0.0990, 0.1169], + device='cuda:2'), in_proj_covar=tensor([0.0066, 0.0052, 0.0056, 0.0069, 0.0055, 0.0045, 0.0050, 0.0056], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001], + device='cuda:2') +2022-11-15 22:29:14,299 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.6679, 2.1321, 2.6686, 3.3974, 3.6183, 2.5429, 2.0909, 3.5878], + device='cuda:2'), covar=tensor([0.0771, 0.3518, 0.2751, 0.4428, 0.1350, 0.4093, 0.2934, 0.0419], + device='cuda:2'), in_proj_covar=tensor([0.0201, 0.0208, 0.0199, 0.0321, 0.0222, 0.0214, 0.0191, 0.0207], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0005, 0.0004, 0.0006, 0.0005, 0.0005, 0.0004, 0.0005], + device='cuda:2') +2022-11-15 22:29:24,956 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.07 vs. limit=5.0 +2022-11-15 22:29:33,391 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=46871.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:30:06,305 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.017e+02 1.769e+02 2.073e+02 2.656e+02 5.224e+02, threshold=4.147e+02, percent-clipped=4.0 +2022-11-15 22:30:14,554 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=46932.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:30:15,151 INFO [train.py:876] (2/4) Epoch 7, batch 3300, loss[loss=0.1442, simple_loss=0.1604, pruned_loss=0.064, over 5790.00 frames. ], tot_loss[loss=0.1519, simple_loss=0.166, pruned_loss=0.06885, over 1081250.85 frames. ], batch size: 14, lr: 1.16e-02, grad_scale: 16.0 +2022-11-15 22:30:27,237 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=46951.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 22:30:32,493 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=46958.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:30:47,192 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=46980.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:31:14,482 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.138e+02 1.793e+02 2.115e+02 2.507e+02 4.736e+02, threshold=4.229e+02, percent-clipped=3.0 +2022-11-15 22:31:20,481 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.0715, 4.7999, 3.6409, 2.1371, 4.4734, 1.8899, 4.3403, 2.6886], + device='cuda:2'), covar=tensor([0.1273, 0.0112, 0.0420, 0.2220, 0.0156, 0.1797, 0.0199, 0.1633], + device='cuda:2'), in_proj_covar=tensor([0.0126, 0.0103, 0.0114, 0.0119, 0.0105, 0.0127, 0.0095, 0.0120], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-11-15 22:31:23,352 INFO [train.py:876] (2/4) Epoch 7, batch 3400, loss[loss=0.1835, simple_loss=0.1692, pruned_loss=0.0989, over 4167.00 frames. ], tot_loss[loss=0.1532, simple_loss=0.167, pruned_loss=0.06974, over 1077231.70 frames. ], batch size: 181, lr: 1.15e-02, grad_scale: 16.0 +2022-11-15 22:31:37,686 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.82 vs. limit=5.0 +2022-11-15 22:32:02,480 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.30 vs. limit=2.0 +2022-11-15 22:32:22,900 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.015e+02 1.842e+02 2.111e+02 2.660e+02 4.907e+02, threshold=4.221e+02, percent-clipped=3.0 +2022-11-15 22:32:31,444 INFO [train.py:876] (2/4) Epoch 7, batch 3500, loss[loss=0.1538, simple_loss=0.1614, pruned_loss=0.07311, over 5489.00 frames. ], tot_loss[loss=0.1495, simple_loss=0.1638, pruned_loss=0.06762, over 1080590.49 frames. ], batch size: 17, lr: 1.15e-02, grad_scale: 16.0 +2022-11-15 22:32:54,457 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=47166.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:33:23,954 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.98 vs. limit=2.0 +2022-11-15 22:33:30,494 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.3440, 3.0586, 3.2892, 1.6934, 3.2383, 3.6255, 3.6668, 4.2428], + device='cuda:2'), covar=tensor([0.1852, 0.1458, 0.1380, 0.2768, 0.0381, 0.0590, 0.0356, 0.0418], + device='cuda:2'), in_proj_covar=tensor([0.0178, 0.0189, 0.0153, 0.0190, 0.0162, 0.0178, 0.0142, 0.0185], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-15 22:33:30,930 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.117e+02 1.714e+02 2.031e+02 2.581e+02 4.185e+02, threshold=4.062e+02, percent-clipped=0.0 +2022-11-15 22:33:33,091 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.6257, 4.9718, 5.3098, 5.0411, 5.6675, 5.4431, 4.6909, 5.5914], + device='cuda:2'), covar=tensor([0.0244, 0.0219, 0.0388, 0.0233, 0.0266, 0.0110, 0.0205, 0.0168], + device='cuda:2'), in_proj_covar=tensor([0.0116, 0.0123, 0.0093, 0.0121, 0.0137, 0.0080, 0.0106, 0.0120], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 22:33:39,507 INFO [train.py:876] (2/4) Epoch 7, batch 3600, loss[loss=0.1301, simple_loss=0.1495, pruned_loss=0.05536, over 5763.00 frames. ], tot_loss[loss=0.1503, simple_loss=0.165, pruned_loss=0.06774, over 1087027.35 frames. ], batch size: 16, lr: 1.15e-02, grad_scale: 16.0 +2022-11-15 22:33:44,747 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=47241.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:33:52,085 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=47251.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:33:56,655 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=47258.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:33:59,994 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.9897, 1.2123, 1.3156, 0.8195, 0.8546, 0.9481, 1.1778, 1.3034], + device='cuda:2'), covar=tensor([0.0047, 0.0038, 0.0030, 0.0038, 0.0034, 0.0031, 0.0035, 0.0053], + device='cuda:2'), in_proj_covar=tensor([0.0037, 0.0034, 0.0036, 0.0036, 0.0034, 0.0031, 0.0035, 0.0029], + device='cuda:2'), out_proj_covar=tensor([3.4447e-05, 3.2513e-05, 3.2586e-05, 3.3552e-05, 3.0062e-05, 2.6748e-05, + 3.3297e-05, 2.6383e-05], device='cuda:2') +2022-11-15 22:34:00,855 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.77 vs. limit=2.0 +2022-11-15 22:34:23,971 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=47299.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:34:26,057 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=47302.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:34:28,907 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=47306.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:34:38,257 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.049e+02 1.693e+02 2.082e+02 2.562e+02 5.983e+02, threshold=4.163e+02, percent-clipped=3.0 +2022-11-15 22:34:47,574 INFO [train.py:876] (2/4) Epoch 7, batch 3700, loss[loss=0.2068, simple_loss=0.2022, pruned_loss=0.1057, over 5560.00 frames. ], tot_loss[loss=0.1491, simple_loss=0.1645, pruned_loss=0.06678, over 1091122.00 frames. ], batch size: 46, lr: 1.15e-02, grad_scale: 16.0 +2022-11-15 22:34:47,635 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.7286, 2.6906, 2.7208, 2.8111, 2.6324, 2.2319, 3.0628, 2.6801], + device='cuda:2'), covar=tensor([0.0577, 0.0995, 0.0604, 0.1025, 0.0763, 0.0633, 0.1003, 0.0884], + device='cuda:2'), in_proj_covar=tensor([0.0074, 0.0098, 0.0082, 0.0104, 0.0077, 0.0067, 0.0131, 0.0086], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 22:35:18,593 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=47379.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:35:37,651 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.87 vs. limit=5.0 +2022-11-15 22:35:45,327 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.32 vs. limit=5.0 +2022-11-15 22:35:46,989 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.114e+02 1.850e+02 2.309e+02 2.787e+02 6.530e+02, threshold=4.618e+02, percent-clipped=2.0 +2022-11-15 22:35:56,053 INFO [train.py:876] (2/4) Epoch 7, batch 3800, loss[loss=0.1247, simple_loss=0.1567, pruned_loss=0.04632, over 5645.00 frames. ], tot_loss[loss=0.1497, simple_loss=0.1646, pruned_loss=0.06736, over 1090004.73 frames. ], batch size: 18, lr: 1.15e-02, grad_scale: 16.0 +2022-11-15 22:36:01,318 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.5418, 1.0505, 1.1651, 0.7536, 1.4153, 1.1262, 1.1291, 1.1769], + device='cuda:2'), covar=tensor([0.0790, 0.0814, 0.1507, 0.1653, 0.1292, 0.1262, 0.1124, 0.0560], + device='cuda:2'), in_proj_covar=tensor([0.0011, 0.0016, 0.0011, 0.0013, 0.0012, 0.0010, 0.0015, 0.0011], + device='cuda:2'), out_proj_covar=tensor([5.2787e-05, 6.9768e-05, 5.2440e-05, 6.2480e-05, 5.7834e-05, 5.2755e-05, + 6.5519e-05, 5.3679e-05], device='cuda:2') +2022-11-15 22:36:01,332 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=47440.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:36:18,976 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=47466.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:36:21,818 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.81 vs. limit=2.0 +2022-11-15 22:36:23,134 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.4220, 1.9561, 2.3835, 1.5167, 1.5542, 3.1007, 2.4134, 1.7970], + device='cuda:2'), covar=tensor([0.0650, 0.0676, 0.0390, 0.2092, 0.2069, 0.1686, 0.0716, 0.0854], + device='cuda:2'), in_proj_covar=tensor([0.0066, 0.0051, 0.0055, 0.0069, 0.0057, 0.0046, 0.0051, 0.0058], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001], + device='cuda:2') +2022-11-15 22:36:44,712 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.8572, 2.3192, 2.7771, 3.6937, 3.7278, 2.6932, 2.6328, 3.6908], + device='cuda:2'), covar=tensor([0.0413, 0.2756, 0.2093, 0.3434, 0.1027, 0.2985, 0.1883, 0.0482], + device='cuda:2'), in_proj_covar=tensor([0.0209, 0.0213, 0.0204, 0.0328, 0.0231, 0.0220, 0.0197, 0.0214], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0005], + device='cuda:2') +2022-11-15 22:36:51,696 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=47514.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:36:55,757 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.7272, 2.0076, 2.0363, 1.3090, 2.0530, 2.5040, 1.9339, 2.4298], + device='cuda:2'), covar=tensor([0.1915, 0.1636, 0.1409, 0.2706, 0.0666, 0.0592, 0.0477, 0.0947], + device='cuda:2'), in_proj_covar=tensor([0.0179, 0.0188, 0.0150, 0.0189, 0.0161, 0.0174, 0.0141, 0.0182], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-15 22:36:56,173 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.113e+02 1.606e+02 1.959e+02 2.426e+02 3.388e+02, threshold=3.918e+02, percent-clipped=0.0 +2022-11-15 22:37:04,433 INFO [train.py:876] (2/4) Epoch 7, batch 3900, loss[loss=0.1621, simple_loss=0.1776, pruned_loss=0.07328, over 5575.00 frames. ], tot_loss[loss=0.1487, simple_loss=0.1633, pruned_loss=0.06704, over 1086595.05 frames. ], batch size: 22, lr: 1.15e-02, grad_scale: 8.0 +2022-11-15 22:37:15,518 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.55 vs. limit=2.0 +2022-11-15 22:37:47,946 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=47597.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:37:59,335 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.7644, 2.1463, 3.4926, 3.0115, 3.5699, 2.2375, 3.2424, 3.8451], + device='cuda:2'), covar=tensor([0.0622, 0.1708, 0.0745, 0.1666, 0.0508, 0.1609, 0.1172, 0.0670], + device='cuda:2'), in_proj_covar=tensor([0.0214, 0.0191, 0.0196, 0.0209, 0.0205, 0.0183, 0.0224, 0.0217], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004], + device='cuda:2') +2022-11-15 22:38:04,272 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.046e+02 1.833e+02 2.156e+02 2.681e+02 5.878e+02, threshold=4.311e+02, percent-clipped=5.0 +2022-11-15 22:38:12,261 INFO [train.py:876] (2/4) Epoch 7, batch 4000, loss[loss=0.2439, simple_loss=0.2132, pruned_loss=0.1373, over 5464.00 frames. ], tot_loss[loss=0.1507, simple_loss=0.1643, pruned_loss=0.06851, over 1080891.07 frames. ], batch size: 64, lr: 1.15e-02, grad_scale: 8.0 +2022-11-15 22:38:16,001 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.4807, 4.1620, 4.1963, 4.1318, 4.5623, 4.4513, 4.2136, 4.4131], + device='cuda:2'), covar=tensor([0.0784, 0.0735, 0.0955, 0.0848, 0.0696, 0.0436, 0.0656, 0.0937], + device='cuda:2'), in_proj_covar=tensor([0.0122, 0.0128, 0.0096, 0.0127, 0.0140, 0.0083, 0.0108, 0.0124], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-15 22:38:37,651 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.6486, 2.2687, 3.3216, 2.8819, 3.2223, 2.2879, 3.0716, 3.6388], + device='cuda:2'), covar=tensor([0.0479, 0.1329, 0.0646, 0.1250, 0.0654, 0.1236, 0.0926, 0.0649], + device='cuda:2'), in_proj_covar=tensor([0.0217, 0.0192, 0.0198, 0.0212, 0.0206, 0.0184, 0.0225, 0.0218], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004], + device='cuda:2') +2022-11-15 22:39:00,431 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.0935, 4.6307, 4.1345, 4.6524, 4.6959, 3.9277, 4.4446, 3.9571], + device='cuda:2'), covar=tensor([0.0343, 0.0406, 0.1500, 0.0350, 0.0383, 0.0391, 0.0387, 0.0699], + device='cuda:2'), in_proj_covar=tensor([0.0121, 0.0158, 0.0254, 0.0158, 0.0198, 0.0159, 0.0168, 0.0157], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0004, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 22:39:11,190 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.4026, 4.7495, 4.1978, 4.6595, 4.5624, 4.3730, 2.3720, 4.5907], + device='cuda:2'), covar=tensor([0.0362, 0.0534, 0.0324, 0.0182, 0.0452, 0.0397, 0.2733, 0.0256], + device='cuda:2'), in_proj_covar=tensor([0.0099, 0.0076, 0.0078, 0.0072, 0.0096, 0.0082, 0.0128, 0.0100], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 22:39:12,714 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.180e+02 1.745e+02 2.156e+02 2.532e+02 4.633e+02, threshold=4.312e+02, percent-clipped=4.0 +2022-11-15 22:39:18,094 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.9112, 1.5816, 1.9124, 1.1405, 1.2338, 2.6534, 2.3457, 1.5782], + device='cuda:2'), covar=tensor([0.0994, 0.0817, 0.0494, 0.2715, 0.3956, 0.1927, 0.0690, 0.1268], + device='cuda:2'), in_proj_covar=tensor([0.0065, 0.0051, 0.0054, 0.0069, 0.0055, 0.0045, 0.0050, 0.0057], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001], + device='cuda:2') +2022-11-15 22:39:20,699 INFO [train.py:876] (2/4) Epoch 7, batch 4100, loss[loss=0.1798, simple_loss=0.1815, pruned_loss=0.08905, over 4984.00 frames. ], tot_loss[loss=0.1497, simple_loss=0.1641, pruned_loss=0.06766, over 1082141.22 frames. ], batch size: 109, lr: 1.15e-02, grad_scale: 8.0 +2022-11-15 22:39:21,997 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=47735.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:39:24,128 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=47738.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:39:29,520 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.1312, 2.3599, 3.6714, 3.0531, 3.9613, 2.6414, 3.6428, 4.0831], + device='cuda:2'), covar=tensor([0.0526, 0.1505, 0.0740, 0.1580, 0.0539, 0.1479, 0.0975, 0.0641], + device='cuda:2'), in_proj_covar=tensor([0.0217, 0.0193, 0.0200, 0.0213, 0.0207, 0.0185, 0.0225, 0.0218], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-15 22:39:46,926 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.7939, 2.7698, 2.6843, 3.0393, 2.3160, 3.0436, 3.1434, 3.4142], + device='cuda:2'), covar=tensor([0.1214, 0.1644, 0.2411, 0.1998, 0.2105, 0.1370, 0.1550, 0.1182], + device='cuda:2'), in_proj_covar=tensor([0.0078, 0.0082, 0.0094, 0.0074, 0.0079, 0.0076, 0.0088, 0.0059], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 22:39:58,261 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=47788.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:39:58,852 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.3640, 3.8221, 3.3058, 3.8021, 3.8289, 3.1492, 3.3926, 3.2496], + device='cuda:2'), covar=tensor([0.0769, 0.0460, 0.1541, 0.0516, 0.0470, 0.0619, 0.0605, 0.0692], + device='cuda:2'), in_proj_covar=tensor([0.0121, 0.0158, 0.0256, 0.0157, 0.0196, 0.0160, 0.0169, 0.0158], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0004, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 22:40:05,381 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=47799.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:40:12,057 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=47809.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 22:40:20,468 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.020e+02 1.771e+02 2.148e+02 2.600e+02 4.098e+02, threshold=4.296e+02, percent-clipped=0.0 +2022-11-15 22:40:29,024 INFO [train.py:876] (2/4) Epoch 7, batch 4200, loss[loss=0.1381, simple_loss=0.1721, pruned_loss=0.05203, over 5546.00 frames. ], tot_loss[loss=0.1488, simple_loss=0.1635, pruned_loss=0.06705, over 1084194.06 frames. ], batch size: 15, lr: 1.14e-02, grad_scale: 8.0 +2022-11-15 22:40:39,615 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=47849.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:40:49,988 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.1986, 2.3440, 3.0039, 3.9325, 3.9705, 3.4160, 2.7244, 3.9958], + device='cuda:2'), covar=tensor([0.0409, 0.3205, 0.2544, 0.2601, 0.0904, 0.2269, 0.1959, 0.0546], + device='cuda:2'), in_proj_covar=tensor([0.0207, 0.0206, 0.0206, 0.0327, 0.0226, 0.0217, 0.0197, 0.0214], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0005], + device='cuda:2') +2022-11-15 22:40:53,490 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=47870.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 22:41:02,626 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.9058, 3.3335, 2.2257, 3.0488, 2.2951, 2.4453, 1.9292, 2.8737], + device='cuda:2'), covar=tensor([0.1251, 0.0198, 0.0994, 0.0334, 0.0954, 0.0888, 0.1760, 0.0338], + device='cuda:2'), in_proj_covar=tensor([0.0170, 0.0135, 0.0167, 0.0141, 0.0172, 0.0178, 0.0176, 0.0146], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-15 22:41:11,957 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=47897.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:41:13,306 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.3154, 1.3472, 1.7206, 1.2539, 1.2152, 1.6359, 1.1964, 0.8132], + device='cuda:2'), covar=tensor([0.0016, 0.0031, 0.0020, 0.0026, 0.0034, 0.0024, 0.0025, 0.0042], + device='cuda:2'), in_proj_covar=tensor([0.0018, 0.0019, 0.0019, 0.0022, 0.0020, 0.0019, 0.0022, 0.0022], + device='cuda:2'), out_proj_covar=tensor([1.6686e-05, 1.8192e-05, 1.7320e-05, 2.1880e-05, 1.9142e-05, 1.9030e-05, + 2.1848e-05, 2.3482e-05], device='cuda:2') +2022-11-15 22:41:22,828 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=6.23 vs. limit=5.0 +2022-11-15 22:41:27,582 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.072e+02 1.804e+02 2.056e+02 2.615e+02 5.152e+02, threshold=4.112e+02, percent-clipped=3.0 +2022-11-15 22:41:35,088 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.6713, 1.8499, 3.2574, 2.6285, 3.6685, 2.1367, 3.0216, 3.8455], + device='cuda:2'), covar=tensor([0.0758, 0.3026, 0.1215, 0.2194, 0.0712, 0.2134, 0.1751, 0.0833], + device='cuda:2'), in_proj_covar=tensor([0.0218, 0.0193, 0.0198, 0.0211, 0.0206, 0.0184, 0.0224, 0.0218], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004], + device='cuda:2') +2022-11-15 22:41:36,124 INFO [train.py:876] (2/4) Epoch 7, batch 4300, loss[loss=0.1015, simple_loss=0.1299, pruned_loss=0.03661, over 5455.00 frames. ], tot_loss[loss=0.1471, simple_loss=0.1625, pruned_loss=0.06581, over 1079804.16 frames. ], batch size: 10, lr: 1.14e-02, grad_scale: 8.0 +2022-11-15 22:41:44,861 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=47945.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:42:07,886 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.92 vs. limit=2.0 +2022-11-15 22:42:28,755 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.3132, 4.3253, 3.9755, 3.6369, 4.2533, 4.1556, 1.7244, 4.3549], + device='cuda:2'), covar=tensor([0.0229, 0.0243, 0.0301, 0.0537, 0.0328, 0.0324, 0.3066, 0.0269], + device='cuda:2'), in_proj_covar=tensor([0.0099, 0.0077, 0.0079, 0.0073, 0.0096, 0.0082, 0.0128, 0.0100], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 22:42:36,452 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.061e+02 1.735e+02 2.013e+02 2.634e+02 5.022e+02, threshold=4.026e+02, percent-clipped=3.0 +2022-11-15 22:42:44,700 INFO [train.py:876] (2/4) Epoch 7, batch 4400, loss[loss=0.1386, simple_loss=0.1632, pruned_loss=0.05695, over 5774.00 frames. ], tot_loss[loss=0.1483, simple_loss=0.1636, pruned_loss=0.06649, over 1080919.99 frames. ], batch size: 20, lr: 1.14e-02, grad_scale: 8.0 +2022-11-15 22:42:46,474 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=48035.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:43:01,693 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.7420, 1.0130, 1.3080, 0.6049, 0.7869, 1.1043, 0.6121, 0.9690], + device='cuda:2'), covar=tensor([0.0037, 0.0028, 0.0030, 0.0032, 0.0028, 0.0028, 0.0057, 0.0035], + device='cuda:2'), in_proj_covar=tensor([0.0039, 0.0036, 0.0037, 0.0039, 0.0035, 0.0033, 0.0036, 0.0030], + device='cuda:2'), out_proj_covar=tensor([3.5611e-05, 3.3874e-05, 3.3540e-05, 3.5624e-05, 3.1428e-05, 2.8267e-05, + 3.4478e-05, 2.6727e-05], device='cuda:2') +2022-11-15 22:43:11,626 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=48072.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:43:16,912 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.7671, 1.1829, 1.8955, 1.6542, 1.7447, 1.5917, 1.9608, 1.4248], + device='cuda:2'), covar=tensor([0.0028, 0.0062, 0.0016, 0.0027, 0.0035, 0.0066, 0.0022, 0.0047], + device='cuda:2'), in_proj_covar=tensor([0.0017, 0.0018, 0.0019, 0.0022, 0.0020, 0.0019, 0.0022, 0.0022], + device='cuda:2'), out_proj_covar=tensor([1.6124e-05, 1.8009e-05, 1.7181e-05, 2.1866e-05, 1.9145e-05, 1.8883e-05, + 2.1877e-05, 2.3711e-05], device='cuda:2') +2022-11-15 22:43:18,749 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=48083.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:43:18,781 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.2896, 4.8808, 5.0216, 4.9020, 5.4045, 5.3082, 4.7282, 5.3542], + device='cuda:2'), covar=tensor([0.0407, 0.0241, 0.0482, 0.0253, 0.0363, 0.0137, 0.0217, 0.0248], + device='cuda:2'), in_proj_covar=tensor([0.0119, 0.0126, 0.0095, 0.0126, 0.0138, 0.0083, 0.0105, 0.0122], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-15 22:43:26,524 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.02 vs. limit=2.0 +2022-11-15 22:43:26,713 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=48094.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:43:40,039 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.8955, 2.1837, 1.7131, 1.4178, 1.7010, 2.4457, 2.1377, 2.4278], + device='cuda:2'), covar=tensor([0.1316, 0.1185, 0.1280, 0.2285, 0.0847, 0.0505, 0.0345, 0.0772], + device='cuda:2'), in_proj_covar=tensor([0.0186, 0.0195, 0.0156, 0.0196, 0.0171, 0.0179, 0.0145, 0.0189], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-11-15 22:43:42,876 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.99 vs. limit=2.0 +2022-11-15 22:43:44,370 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.086e+02 1.659e+02 2.129e+02 2.741e+02 6.031e+02, threshold=4.258e+02, percent-clipped=4.0 +2022-11-15 22:43:52,259 INFO [train.py:876] (2/4) Epoch 7, batch 4500, loss[loss=0.151, simple_loss=0.1644, pruned_loss=0.06874, over 5494.00 frames. ], tot_loss[loss=0.1497, simple_loss=0.164, pruned_loss=0.06775, over 1082480.99 frames. ], batch size: 49, lr: 1.14e-02, grad_scale: 8.0 +2022-11-15 22:43:52,414 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=48133.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:43:59,760 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=48144.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:44:14,576 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=48165.0, num_to_drop=1, layers_to_drop={2} +2022-11-15 22:44:46,319 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=48212.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 22:44:52,379 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.044e+02 1.660e+02 2.072e+02 2.700e+02 4.661e+02, threshold=4.143e+02, percent-clipped=2.0 +2022-11-15 22:45:00,427 INFO [train.py:876] (2/4) Epoch 7, batch 4600, loss[loss=0.1097, simple_loss=0.1357, pruned_loss=0.04179, over 5445.00 frames. ], tot_loss[loss=0.1506, simple_loss=0.1647, pruned_loss=0.06827, over 1078081.09 frames. ], batch size: 11, lr: 1.14e-02, grad_scale: 8.0 +2022-11-15 22:45:27,976 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=48273.0, num_to_drop=1, layers_to_drop={3} +2022-11-15 22:45:49,434 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.9053, 1.2173, 1.0762, 0.5247, 0.8698, 1.2138, 0.9975, 1.2187], + device='cuda:2'), covar=tensor([0.0038, 0.0037, 0.0034, 0.0046, 0.0040, 0.0025, 0.0047, 0.0053], + device='cuda:2'), in_proj_covar=tensor([0.0038, 0.0036, 0.0037, 0.0038, 0.0035, 0.0032, 0.0036, 0.0029], + device='cuda:2'), out_proj_covar=tensor([3.4709e-05, 3.3468e-05, 3.3035e-05, 3.4724e-05, 3.1522e-05, 2.7677e-05, + 3.4351e-05, 2.6290e-05], device='cuda:2') +2022-11-15 22:45:56,208 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.2050, 4.1117, 3.6290, 3.5417, 4.1402, 3.9823, 1.3345, 4.3170], + device='cuda:2'), covar=tensor([0.0233, 0.0297, 0.0563, 0.0592, 0.0381, 0.0500, 0.3476, 0.0307], + device='cuda:2'), in_proj_covar=tensor([0.0099, 0.0080, 0.0079, 0.0074, 0.0096, 0.0083, 0.0130, 0.0102], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 22:45:58,078 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.34 vs. limit=5.0 +2022-11-15 22:46:00,361 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.286e+02 1.710e+02 2.127e+02 2.706e+02 4.557e+02, threshold=4.254e+02, percent-clipped=4.0 +2022-11-15 22:46:02,397 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=48324.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:46:08,680 INFO [train.py:876] (2/4) Epoch 7, batch 4700, loss[loss=0.1088, simple_loss=0.1296, pruned_loss=0.04398, over 5714.00 frames. ], tot_loss[loss=0.1484, simple_loss=0.1631, pruned_loss=0.06685, over 1078455.40 frames. ], batch size: 11, lr: 1.14e-02, grad_scale: 8.0 +2022-11-15 22:46:44,387 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=48385.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:46:50,278 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=48394.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:46:52,215 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=48397.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:46:58,025 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.97 vs. limit=2.0 +2022-11-15 22:46:58,384 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.3464, 4.6241, 3.2365, 4.3793, 3.4158, 2.9690, 2.1327, 3.9397], + device='cuda:2'), covar=tensor([0.1803, 0.0159, 0.1133, 0.0290, 0.0594, 0.1112, 0.2623, 0.0274], + device='cuda:2'), in_proj_covar=tensor([0.0174, 0.0139, 0.0174, 0.0145, 0.0177, 0.0183, 0.0182, 0.0151], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-15 22:47:08,412 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.075e+02 1.686e+02 2.046e+02 2.615e+02 4.137e+02, threshold=4.091e+02, percent-clipped=0.0 +2022-11-15 22:47:10,258 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=48423.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:47:13,454 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=48428.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:47:16,567 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.7479, 3.5612, 3.7928, 1.9385, 3.7448, 3.9021, 3.9598, 4.6527], + device='cuda:2'), covar=tensor([0.1779, 0.1201, 0.0432, 0.2373, 0.0239, 0.0435, 0.0337, 0.0340], + device='cuda:2'), in_proj_covar=tensor([0.0179, 0.0191, 0.0154, 0.0190, 0.0167, 0.0173, 0.0143, 0.0184], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-11-15 22:47:17,035 INFO [train.py:876] (2/4) Epoch 7, batch 4800, loss[loss=0.1181, simple_loss=0.138, pruned_loss=0.04912, over 5051.00 frames. ], tot_loss[loss=0.1481, simple_loss=0.1631, pruned_loss=0.06659, over 1080906.55 frames. ], batch size: 7, lr: 1.14e-02, grad_scale: 8.0 +2022-11-15 22:47:23,003 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=48442.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:47:24,433 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=48444.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:47:25,654 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.6223, 4.6185, 4.0566, 4.1544, 4.5284, 4.4344, 1.8160, 4.6815], + device='cuda:2'), covar=tensor([0.0188, 0.0201, 0.0324, 0.0454, 0.0238, 0.0252, 0.3080, 0.0296], + device='cuda:2'), in_proj_covar=tensor([0.0097, 0.0078, 0.0078, 0.0073, 0.0094, 0.0081, 0.0128, 0.0100], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 22:47:33,555 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=48458.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:47:34,412 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.06 vs. limit=2.0 +2022-11-15 22:47:36,195 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.9414, 1.0940, 1.1877, 0.8331, 1.0468, 1.1765, 0.7748, 1.1352], + device='cuda:2'), covar=tensor([0.0046, 0.0036, 0.0034, 0.0044, 0.0039, 0.0030, 0.0053, 0.0046], + device='cuda:2'), in_proj_covar=tensor([0.0038, 0.0035, 0.0037, 0.0038, 0.0035, 0.0032, 0.0036, 0.0029], + device='cuda:2'), out_proj_covar=tensor([3.4640e-05, 3.3067e-05, 3.3182e-05, 3.5198e-05, 3.1438e-05, 2.7689e-05, + 3.4959e-05, 2.5947e-05], device='cuda:2') +2022-11-15 22:47:38,232 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=48465.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 22:47:49,819 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.4384, 2.7153, 2.9900, 1.6127, 2.7845, 3.2855, 3.3123, 3.3936], + device='cuda:2'), covar=tensor([0.1425, 0.1340, 0.0607, 0.2329, 0.0414, 0.0401, 0.0229, 0.0649], + device='cuda:2'), in_proj_covar=tensor([0.0177, 0.0190, 0.0152, 0.0188, 0.0166, 0.0171, 0.0141, 0.0183], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-15 22:47:51,730 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=48484.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:47:57,028 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=48492.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:48:10,992 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=48513.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 22:48:16,162 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.121e+02 1.802e+02 2.155e+02 2.691e+02 6.184e+02, threshold=4.310e+02, percent-clipped=3.0 +2022-11-15 22:48:25,056 INFO [train.py:876] (2/4) Epoch 7, batch 4900, loss[loss=0.1895, simple_loss=0.1942, pruned_loss=0.09236, over 5588.00 frames. ], tot_loss[loss=0.1463, simple_loss=0.1619, pruned_loss=0.06542, over 1086846.19 frames. ], batch size: 43, lr: 1.14e-02, grad_scale: 8.0 +2022-11-15 22:48:26,483 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.2718, 1.1840, 1.3922, 0.9318, 1.2114, 1.3454, 0.5791, 1.3306], + device='cuda:2'), covar=tensor([0.0049, 0.0068, 0.0034, 0.0059, 0.0049, 0.0031, 0.0048, 0.0062], + device='cuda:2'), in_proj_covar=tensor([0.0039, 0.0036, 0.0038, 0.0039, 0.0037, 0.0033, 0.0038, 0.0030], + device='cuda:2'), out_proj_covar=tensor([3.5922e-05, 3.4128e-05, 3.4220e-05, 3.5918e-05, 3.2576e-05, 2.8222e-05, + 3.6238e-05, 2.6656e-05], device='cuda:2') +2022-11-15 22:48:46,457 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.35 vs. limit=2.0 +2022-11-15 22:48:48,180 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=48568.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 22:48:48,868 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.4981, 1.1999, 1.7199, 1.2599, 1.1117, 1.5347, 1.1990, 0.9799], + device='cuda:2'), covar=tensor([0.0024, 0.0049, 0.0027, 0.0031, 0.0057, 0.0034, 0.0029, 0.0040], + device='cuda:2'), in_proj_covar=tensor([0.0019, 0.0020, 0.0020, 0.0024, 0.0022, 0.0021, 0.0024, 0.0025], + device='cuda:2'), out_proj_covar=tensor([1.7590e-05, 1.9958e-05, 1.9103e-05, 2.4396e-05, 2.1429e-05, 2.0669e-05, + 2.3498e-05, 2.6024e-05], device='cuda:2') +2022-11-15 22:49:09,778 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.65 vs. limit=2.0 +2022-11-15 22:49:24,581 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.141e+02 1.657e+02 1.917e+02 2.351e+02 4.435e+02, threshold=3.835e+02, percent-clipped=1.0 +2022-11-15 22:49:32,511 INFO [train.py:876] (2/4) Epoch 7, batch 5000, loss[loss=0.1152, simple_loss=0.137, pruned_loss=0.0467, over 5693.00 frames. ], tot_loss[loss=0.1446, simple_loss=0.1607, pruned_loss=0.06423, over 1090394.34 frames. ], batch size: 34, lr: 1.14e-02, grad_scale: 8.0 +2022-11-15 22:49:57,226 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.4977, 4.5500, 4.6208, 4.8028, 4.3099, 3.8286, 5.2136, 4.5545], + device='cuda:2'), covar=tensor([0.0398, 0.0880, 0.0335, 0.0986, 0.0385, 0.0354, 0.0707, 0.0642], + device='cuda:2'), in_proj_covar=tensor([0.0074, 0.0094, 0.0080, 0.0101, 0.0075, 0.0066, 0.0127, 0.0086], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 22:50:04,375 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=48680.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:50:32,601 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.591e+01 1.796e+02 2.203e+02 2.625e+02 4.237e+02, threshold=4.405e+02, percent-clipped=2.0 +2022-11-15 22:50:37,389 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=48728.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:50:39,409 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.8725, 2.3960, 2.8860, 3.6263, 3.8924, 2.9579, 2.5898, 3.8435], + device='cuda:2'), covar=tensor([0.0681, 0.2949, 0.2906, 0.4154, 0.0945, 0.3149, 0.2362, 0.0672], + device='cuda:2'), in_proj_covar=tensor([0.0206, 0.0202, 0.0200, 0.0321, 0.0224, 0.0215, 0.0195, 0.0213], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0005], + device='cuda:2') +2022-11-15 22:50:40,543 INFO [train.py:876] (2/4) Epoch 7, batch 5100, loss[loss=0.2113, simple_loss=0.2123, pruned_loss=0.1052, over 5454.00 frames. ], tot_loss[loss=0.1474, simple_loss=0.163, pruned_loss=0.06589, over 1086773.96 frames. ], batch size: 64, lr: 1.13e-02, grad_scale: 8.0 +2022-11-15 22:50:44,210 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.35 vs. limit=2.0 +2022-11-15 22:50:44,605 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.1695, 1.4463, 1.1397, 1.0312, 1.1678, 1.6738, 1.6977, 1.5677], + device='cuda:2'), covar=tensor([0.0816, 0.0563, 0.1173, 0.1351, 0.0688, 0.0384, 0.0393, 0.0770], + device='cuda:2'), in_proj_covar=tensor([0.0180, 0.0191, 0.0154, 0.0190, 0.0166, 0.0174, 0.0143, 0.0184], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-11-15 22:50:53,604 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=48753.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:50:57,539 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.1966, 3.7153, 3.2298, 3.7304, 3.7366, 3.0685, 3.3483, 3.1858], + device='cuda:2'), covar=tensor([0.1227, 0.0557, 0.1561, 0.0477, 0.0537, 0.0514, 0.0674, 0.0733], + device='cuda:2'), in_proj_covar=tensor([0.0123, 0.0157, 0.0252, 0.0156, 0.0197, 0.0158, 0.0167, 0.0156], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0004, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 22:51:09,841 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=48776.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:51:12,139 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=48779.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:51:39,998 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.029e+02 1.794e+02 2.055e+02 2.515e+02 6.538e+02, threshold=4.110e+02, percent-clipped=4.0 +2022-11-15 22:51:40,528 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=48821.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:51:48,602 INFO [train.py:876] (2/4) Epoch 7, batch 5200, loss[loss=0.152, simple_loss=0.1745, pruned_loss=0.0648, over 5578.00 frames. ], tot_loss[loss=0.1469, simple_loss=0.1626, pruned_loss=0.06557, over 1089120.23 frames. ], batch size: 22, lr: 1.13e-02, grad_scale: 8.0 +2022-11-15 22:51:51,929 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=48838.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:52:11,474 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=48868.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 22:52:21,485 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=48882.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:52:32,606 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=48899.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:52:40,353 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.7197, 1.8425, 1.7763, 2.0601, 1.7307, 1.4172, 1.7604, 1.9812], + device='cuda:2'), covar=tensor([0.1700, 0.1860, 0.3276, 0.1152, 0.2064, 0.2330, 0.2183, 0.0847], + device='cuda:2'), in_proj_covar=tensor([0.0082, 0.0084, 0.0097, 0.0075, 0.0080, 0.0079, 0.0089, 0.0060], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 22:52:43,692 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=48916.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 22:52:46,876 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.035e+02 1.737e+02 2.203e+02 2.681e+02 5.321e+02, threshold=4.406e+02, percent-clipped=3.0 +2022-11-15 22:52:55,132 INFO [train.py:876] (2/4) Epoch 7, batch 5300, loss[loss=0.1406, simple_loss=0.1577, pruned_loss=0.0617, over 5648.00 frames. ], tot_loss[loss=0.148, simple_loss=0.1631, pruned_loss=0.06646, over 1086777.97 frames. ], batch size: 36, lr: 1.13e-02, grad_scale: 8.0 +2022-11-15 22:53:08,290 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.7353, 1.7864, 2.1974, 1.3350, 0.8793, 2.5818, 1.8817, 1.6472], + device='cuda:2'), covar=tensor([0.0868, 0.1128, 0.0696, 0.2406, 0.3779, 0.0627, 0.0960, 0.1191], + device='cuda:2'), in_proj_covar=tensor([0.0067, 0.0054, 0.0056, 0.0071, 0.0055, 0.0045, 0.0052, 0.0058], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001], + device='cuda:2') +2022-11-15 22:53:26,623 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=48980.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:53:32,859 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.92 vs. limit=5.0 +2022-11-15 22:53:45,833 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.00 vs. limit=2.0 +2022-11-15 22:53:54,573 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.048e+02 1.623e+02 1.921e+02 2.407e+02 5.715e+02, threshold=3.842e+02, percent-clipped=1.0 +2022-11-15 22:53:59,202 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=49028.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:54:02,411 INFO [train.py:876] (2/4) Epoch 7, batch 5400, loss[loss=0.1501, simple_loss=0.1709, pruned_loss=0.06466, over 5588.00 frames. ], tot_loss[loss=0.1481, simple_loss=0.1635, pruned_loss=0.06635, over 1086989.70 frames. ], batch size: 23, lr: 1.13e-02, grad_scale: 8.0 +2022-11-15 22:54:16,526 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=49053.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:54:29,575 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1044, 2.9820, 3.0805, 1.5496, 3.1031, 3.7996, 3.3885, 3.9226], + device='cuda:2'), covar=tensor([0.2540, 0.1792, 0.0848, 0.3175, 0.0344, 0.0512, 0.0319, 0.0637], + device='cuda:2'), in_proj_covar=tensor([0.0180, 0.0191, 0.0152, 0.0190, 0.0168, 0.0177, 0.0146, 0.0184], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-11-15 22:54:34,076 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=49079.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:54:48,496 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=49101.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:54:54,140 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.80 vs. limit=2.0 +2022-11-15 22:55:01,509 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.34 vs. limit=2.0 +2022-11-15 22:55:02,428 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.256e+02 1.756e+02 2.023e+02 2.468e+02 4.836e+02, threshold=4.046e+02, percent-clipped=5.0 +2022-11-15 22:55:06,393 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=49127.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:55:10,253 INFO [train.py:876] (2/4) Epoch 7, batch 5500, loss[loss=0.115, simple_loss=0.1439, pruned_loss=0.0431, over 5539.00 frames. ], tot_loss[loss=0.1477, simple_loss=0.1632, pruned_loss=0.0661, over 1090903.30 frames. ], batch size: 21, lr: 1.13e-02, grad_scale: 8.0 +2022-11-15 22:55:37,558 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.0311, 1.4025, 1.1719, 0.5905, 1.3531, 1.2946, 0.7946, 1.1473], + device='cuda:2'), covar=tensor([0.0034, 0.0025, 0.0031, 0.0042, 0.0028, 0.0026, 0.0045, 0.0034], + device='cuda:2'), in_proj_covar=tensor([0.0041, 0.0038, 0.0040, 0.0041, 0.0039, 0.0035, 0.0040, 0.0032], + device='cuda:2'), out_proj_covar=tensor([3.7984e-05, 3.5353e-05, 3.6070e-05, 3.7347e-05, 3.4358e-05, 3.0281e-05, + 3.8627e-05, 2.8870e-05], device='cuda:2') +2022-11-15 22:55:40,401 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=49177.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:55:45,835 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.3082, 4.1531, 4.2367, 4.5671, 4.2656, 4.0566, 1.8311, 4.3762], + device='cuda:2'), covar=tensor([0.0480, 0.0620, 0.0499, 0.0356, 0.0613, 0.0648, 0.4459, 0.0401], + device='cuda:2'), in_proj_covar=tensor([0.0099, 0.0079, 0.0079, 0.0073, 0.0097, 0.0082, 0.0130, 0.0102], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 22:55:51,694 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=49194.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:56:10,385 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.123e+02 1.636e+02 2.022e+02 2.644e+02 5.189e+02, threshold=4.044e+02, percent-clipped=1.0 +2022-11-15 22:56:18,737 INFO [train.py:876] (2/4) Epoch 7, batch 5600, loss[loss=0.1737, simple_loss=0.1831, pruned_loss=0.0822, over 5574.00 frames. ], tot_loss[loss=0.1466, simple_loss=0.1624, pruned_loss=0.06541, over 1088760.12 frames. ], batch size: 21, lr: 1.13e-02, grad_scale: 8.0 +2022-11-15 22:56:20,842 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=49236.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:56:30,209 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.0398, 2.2081, 3.5286, 2.9387, 4.0451, 2.6379, 3.5678, 3.9370], + device='cuda:2'), covar=tensor([0.0599, 0.1773, 0.0885, 0.1838, 0.0544, 0.1570, 0.1111, 0.0837], + device='cuda:2'), in_proj_covar=tensor([0.0215, 0.0190, 0.0194, 0.0207, 0.0207, 0.0184, 0.0220, 0.0215], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004], + device='cuda:2') +2022-11-15 22:56:51,676 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.0041, 1.1251, 1.2164, 1.0322, 1.3230, 1.4328, 1.0657, 1.3533], + device='cuda:2'), covar=tensor([0.0043, 0.0036, 0.0033, 0.0036, 0.0035, 0.0027, 0.0044, 0.0026], + device='cuda:2'), in_proj_covar=tensor([0.0041, 0.0037, 0.0039, 0.0040, 0.0038, 0.0034, 0.0039, 0.0032], + device='cuda:2'), out_proj_covar=tensor([3.7221e-05, 3.4715e-05, 3.5353e-05, 3.6777e-05, 3.3817e-05, 2.9941e-05, + 3.7869e-05, 2.8490e-05], device='cuda:2') +2022-11-15 22:57:02,611 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=49297.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:57:18,855 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.878e+01 1.572e+02 2.070e+02 2.645e+02 4.455e+02, threshold=4.140e+02, percent-clipped=3.0 +2022-11-15 22:57:27,068 INFO [train.py:876] (2/4) Epoch 7, batch 5700, loss[loss=0.1213, simple_loss=0.1285, pruned_loss=0.05707, over 5408.00 frames. ], tot_loss[loss=0.1457, simple_loss=0.1616, pruned_loss=0.06488, over 1088383.20 frames. ], batch size: 9, lr: 1.13e-02, grad_scale: 8.0 +2022-11-15 22:58:18,165 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.9951, 1.0833, 1.2478, 1.2654, 1.4268, 1.3563, 1.0664, 1.2482], + device='cuda:2'), covar=tensor([0.0039, 0.0039, 0.0033, 0.0039, 0.0035, 0.0033, 0.0062, 0.0041], + device='cuda:2'), in_proj_covar=tensor([0.0040, 0.0036, 0.0038, 0.0039, 0.0037, 0.0033, 0.0038, 0.0030], + device='cuda:2'), out_proj_covar=tensor([3.6056e-05, 3.3654e-05, 3.4153e-05, 3.5513e-05, 3.2798e-05, 2.9120e-05, + 3.6587e-05, 2.7121e-05], device='cuda:2') +2022-11-15 22:58:27,013 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.158e+02 1.691e+02 1.989e+02 2.573e+02 4.583e+02, threshold=3.978e+02, percent-clipped=3.0 +2022-11-15 22:58:27,812 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=49422.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:58:34,806 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=49432.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 22:58:35,270 INFO [train.py:876] (2/4) Epoch 7, batch 5800, loss[loss=0.1129, simple_loss=0.1421, pruned_loss=0.04181, over 5561.00 frames. ], tot_loss[loss=0.1475, simple_loss=0.1629, pruned_loss=0.06603, over 1082078.40 frames. ], batch size: 14, lr: 1.13e-02, grad_scale: 8.0 +2022-11-15 22:58:38,047 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.28 vs. limit=2.0 +2022-11-15 22:58:52,627 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.8883, 4.2498, 3.9203, 3.9466, 4.1679, 3.9704, 1.5262, 4.2899], + device='cuda:2'), covar=tensor([0.0334, 0.0219, 0.0382, 0.0355, 0.0309, 0.0308, 0.3248, 0.0242], + device='cuda:2'), in_proj_covar=tensor([0.0100, 0.0078, 0.0079, 0.0075, 0.0098, 0.0083, 0.0131, 0.0102], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 22:59:04,658 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=49477.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:59:08,973 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=49483.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:59:15,732 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=49493.0, num_to_drop=1, layers_to_drop={2} +2022-11-15 22:59:16,294 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=49494.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:59:25,456 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.9120, 5.5716, 5.0592, 5.6648, 5.6235, 4.8335, 5.2560, 4.8024], + device='cuda:2'), covar=tensor([0.0352, 0.0404, 0.1065, 0.0254, 0.0396, 0.0360, 0.0300, 0.0339], + device='cuda:2'), in_proj_covar=tensor([0.0124, 0.0162, 0.0255, 0.0157, 0.0202, 0.0159, 0.0172, 0.0158], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0004, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 22:59:32,019 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.4748, 1.7471, 1.2778, 1.2248, 1.2722, 2.0198, 1.6864, 1.8826], + device='cuda:2'), covar=tensor([0.1084, 0.0671, 0.1415, 0.1555, 0.0884, 0.0503, 0.0470, 0.0665], + device='cuda:2'), in_proj_covar=tensor([0.0176, 0.0189, 0.0152, 0.0188, 0.0165, 0.0175, 0.0144, 0.0179], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-15 22:59:33,818 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.316e+02 1.742e+02 2.165e+02 2.707e+02 6.167e+02, threshold=4.330e+02, percent-clipped=6.0 +2022-11-15 22:59:36,566 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=49525.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 22:59:42,441 INFO [train.py:876] (2/4) Epoch 7, batch 5900, loss[loss=0.1395, simple_loss=0.1613, pruned_loss=0.05887, over 5688.00 frames. ], tot_loss[loss=0.1454, simple_loss=0.1616, pruned_loss=0.06456, over 1082075.25 frames. ], batch size: 19, lr: 1.12e-02, grad_scale: 16.0 +2022-11-15 22:59:48,544 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=49542.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:00:14,550 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.4575, 4.6021, 3.0042, 4.2887, 3.5096, 3.1071, 2.8053, 3.8159], + device='cuda:2'), covar=tensor([0.1569, 0.0186, 0.1039, 0.0352, 0.0589, 0.1053, 0.1624, 0.0307], + device='cuda:2'), in_proj_covar=tensor([0.0170, 0.0134, 0.0164, 0.0139, 0.0170, 0.0176, 0.0172, 0.0148], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-15 23:00:21,904 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=49592.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:00:29,129 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.4084, 3.4446, 3.2622, 3.6247, 3.2470, 3.0489, 3.8876, 3.3840], + device='cuda:2'), covar=tensor([0.0512, 0.0815, 0.0581, 0.0987, 0.0772, 0.0437, 0.0867, 0.0700], + device='cuda:2'), in_proj_covar=tensor([0.0076, 0.0097, 0.0082, 0.0104, 0.0078, 0.0068, 0.0132, 0.0090], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 23:00:32,660 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.7430, 2.1099, 3.2736, 2.8071, 3.3730, 2.2974, 2.9715, 3.6931], + device='cuda:2'), covar=tensor([0.0488, 0.1507, 0.0722, 0.1469, 0.0547, 0.1468, 0.1107, 0.0674], + device='cuda:2'), in_proj_covar=tensor([0.0214, 0.0185, 0.0193, 0.0207, 0.0205, 0.0184, 0.0219, 0.0212], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004], + device='cuda:2') +2022-11-15 23:00:42,047 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.096e+02 1.707e+02 2.021e+02 2.580e+02 5.267e+02, threshold=4.043e+02, percent-clipped=3.0 +2022-11-15 23:00:50,013 INFO [train.py:876] (2/4) Epoch 7, batch 6000, loss[loss=0.2043, simple_loss=0.1868, pruned_loss=0.111, over 3122.00 frames. ], tot_loss[loss=0.1453, simple_loss=0.1616, pruned_loss=0.06449, over 1088369.55 frames. ], batch size: 284, lr: 1.12e-02, grad_scale: 16.0 +2022-11-15 23:00:50,013 INFO [train.py:899] (2/4) Computing validation loss +2022-11-15 23:01:06,062 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.1614, 1.2004, 1.6733, 1.0900, 1.4773, 1.5517, 1.2859, 0.9956], + device='cuda:2'), covar=tensor([0.0018, 0.0044, 0.0026, 0.0039, 0.0025, 0.0020, 0.0031, 0.0040], + device='cuda:2'), in_proj_covar=tensor([0.0018, 0.0020, 0.0021, 0.0024, 0.0022, 0.0020, 0.0024, 0.0025], + device='cuda:2'), out_proj_covar=tensor([1.7134e-05, 1.9721e-05, 1.9342e-05, 2.4018e-05, 2.0715e-05, 1.9725e-05, + 2.3422e-05, 2.6503e-05], device='cuda:2') +2022-11-15 23:01:07,886 INFO [train.py:908] (2/4) Epoch 7, validation: loss=0.1616, simple_loss=0.1829, pruned_loss=0.07014, over 1530663.00 frames. +2022-11-15 23:01:07,887 INFO [train.py:909] (2/4) Maximum memory allocated so far is 4715MB +2022-11-15 23:01:48,087 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.45 vs. limit=2.0 +2022-11-15 23:02:07,550 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.086e+02 1.774e+02 2.178e+02 2.539e+02 4.839e+02, threshold=4.355e+02, percent-clipped=2.0 +2022-11-15 23:02:15,507 INFO [train.py:876] (2/4) Epoch 7, batch 6100, loss[loss=0.1451, simple_loss=0.1823, pruned_loss=0.05399, over 5751.00 frames. ], tot_loss[loss=0.1481, simple_loss=0.1637, pruned_loss=0.06621, over 1083226.47 frames. ], batch size: 27, lr: 1.12e-02, grad_scale: 16.0 +2022-11-15 23:02:38,516 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=49766.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:02:46,271 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=49778.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:02:52,922 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=49788.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 23:03:05,618 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.97 vs. limit=2.0 +2022-11-15 23:03:16,326 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.740e+01 1.709e+02 2.118e+02 2.760e+02 4.478e+02, threshold=4.236e+02, percent-clipped=1.0 +2022-11-15 23:03:16,629 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.29 vs. limit=2.0 +2022-11-15 23:03:20,409 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=49827.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:03:24,172 INFO [train.py:876] (2/4) Epoch 7, batch 6200, loss[loss=0.101, simple_loss=0.1214, pruned_loss=0.0403, over 5460.00 frames. ], tot_loss[loss=0.1461, simple_loss=0.1623, pruned_loss=0.06493, over 1084952.38 frames. ], batch size: 11, lr: 1.12e-02, grad_scale: 16.0 +2022-11-15 23:03:24,293 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.2987, 4.3281, 2.8319, 4.1105, 3.2395, 2.7464, 2.2434, 3.6603], + device='cuda:2'), covar=tensor([0.1553, 0.0179, 0.1079, 0.0296, 0.0649, 0.1013, 0.1980, 0.0264], + device='cuda:2'), in_proj_covar=tensor([0.0169, 0.0135, 0.0165, 0.0139, 0.0171, 0.0175, 0.0173, 0.0149], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-15 23:03:35,329 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.9154, 3.1403, 3.1059, 2.9875, 3.1609, 3.0295, 1.3073, 3.2545], + device='cuda:2'), covar=tensor([0.0344, 0.0321, 0.0298, 0.0257, 0.0315, 0.0350, 0.3048, 0.0297], + device='cuda:2'), in_proj_covar=tensor([0.0102, 0.0081, 0.0080, 0.0077, 0.0099, 0.0085, 0.0133, 0.0105], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 23:03:52,032 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.7579, 2.9864, 2.7391, 2.9359, 2.5155, 2.8780, 3.0624, 3.4536], + device='cuda:2'), covar=tensor([0.0927, 0.1540, 0.2229, 0.1153, 0.1768, 0.0953, 0.1230, 0.2751], + device='cuda:2'), in_proj_covar=tensor([0.0080, 0.0083, 0.0095, 0.0074, 0.0078, 0.0079, 0.0085, 0.0061], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 23:04:03,703 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=49892.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:04:23,026 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.007e+02 1.621e+02 1.970e+02 2.358e+02 3.613e+02, threshold=3.939e+02, percent-clipped=0.0 +2022-11-15 23:04:31,726 INFO [train.py:876] (2/4) Epoch 7, batch 6300, loss[loss=0.1466, simple_loss=0.1627, pruned_loss=0.06524, over 5518.00 frames. ], tot_loss[loss=0.1479, simple_loss=0.1639, pruned_loss=0.06595, over 1086511.63 frames. ], batch size: 17, lr: 1.12e-02, grad_scale: 16.0 +2022-11-15 23:04:36,246 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=49940.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:04:43,558 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.22 vs. limit=2.0 +2022-11-15 23:04:46,687 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.6525, 1.3980, 1.7482, 0.9669, 1.8438, 1.4378, 1.4828, 1.3175], + device='cuda:2'), covar=tensor([0.1472, 0.0952, 0.0610, 0.2636, 0.1612, 0.0176, 0.0621, 0.0492], + device='cuda:2'), in_proj_covar=tensor([0.0011, 0.0016, 0.0011, 0.0014, 0.0013, 0.0011, 0.0015, 0.0011], + device='cuda:2'), out_proj_covar=tensor([5.6021e-05, 7.4906e-05, 5.6092e-05, 6.4663e-05, 6.1389e-05, 5.4580e-05, + 6.9417e-05, 5.7089e-05], device='cuda:2') +2022-11-15 23:05:34,293 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.143e+02 1.742e+02 2.037e+02 2.563e+02 6.362e+02, threshold=4.074e+02, percent-clipped=2.0 +2022-11-15 23:05:42,863 INFO [train.py:876] (2/4) Epoch 7, batch 6400, loss[loss=0.1384, simple_loss=0.1606, pruned_loss=0.05805, over 5729.00 frames. ], tot_loss[loss=0.147, simple_loss=0.1623, pruned_loss=0.06584, over 1074486.51 frames. ], batch size: 17, lr: 1.12e-02, grad_scale: 16.0 +2022-11-15 23:05:57,867 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.5467, 3.6337, 3.5712, 3.5161, 3.6026, 3.5292, 1.3686, 3.7656], + device='cuda:2'), covar=tensor([0.0370, 0.0558, 0.0319, 0.0281, 0.0388, 0.0389, 0.3470, 0.0303], + device='cuda:2'), in_proj_covar=tensor([0.0101, 0.0080, 0.0079, 0.0074, 0.0097, 0.0084, 0.0132, 0.0104], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 23:06:00,425 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.1447, 2.8991, 2.9097, 2.9173, 3.2013, 3.1289, 3.2178, 3.1469], + device='cuda:2'), covar=tensor([0.0802, 0.0859, 0.0927, 0.0795, 0.0806, 0.0429, 0.0578, 0.0906], + device='cuda:2'), in_proj_covar=tensor([0.0122, 0.0128, 0.0096, 0.0128, 0.0141, 0.0084, 0.0108, 0.0126], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-15 23:06:13,106 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=50078.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:06:20,293 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=50088.0, num_to_drop=1, layers_to_drop={2} +2022-11-15 23:06:41,519 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.049e+02 1.674e+02 2.100e+02 2.789e+02 6.462e+02, threshold=4.200e+02, percent-clipped=5.0 +2022-11-15 23:06:42,285 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=50122.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:06:45,244 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=50126.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:06:47,034 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=50128.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:06:50,163 INFO [train.py:876] (2/4) Epoch 7, batch 6500, loss[loss=0.09587, simple_loss=0.132, pruned_loss=0.02989, over 5025.00 frames. ], tot_loss[loss=0.1454, simple_loss=0.1614, pruned_loss=0.06471, over 1081659.28 frames. ], batch size: 7, lr: 1.12e-02, grad_scale: 16.0 +2022-11-15 23:06:52,579 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=50136.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 23:06:55,852 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.86 vs. limit=2.0 +2022-11-15 23:07:13,874 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0933, 1.4343, 1.6597, 1.0915, 1.9190, 1.0752, 1.3940, 1.2286], + device='cuda:2'), covar=tensor([0.0905, 0.0934, 0.0442, 0.1652, 0.0804, 0.1284, 0.0777, 0.2106], + device='cuda:2'), in_proj_covar=tensor([0.0011, 0.0016, 0.0011, 0.0013, 0.0013, 0.0011, 0.0015, 0.0011], + device='cuda:2'), out_proj_covar=tensor([5.5166e-05, 7.4575e-05, 5.5502e-05, 6.4012e-05, 6.1048e-05, 5.4318e-05, + 6.9378e-05, 5.6841e-05], device='cuda:2') +2022-11-15 23:07:18,914 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.5158, 2.0790, 2.1808, 2.8106, 2.7549, 2.2171, 1.7206, 2.8721], + device='cuda:2'), covar=tensor([0.1124, 0.1927, 0.1647, 0.1352, 0.0842, 0.2137, 0.1963, 0.0672], + device='cuda:2'), in_proj_covar=tensor([0.0208, 0.0202, 0.0200, 0.0323, 0.0221, 0.0213, 0.0195, 0.0214], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0005], + device='cuda:2') +2022-11-15 23:07:28,587 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=50189.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:07:34,061 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=50197.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 23:07:49,846 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.163e+02 1.678e+02 2.090e+02 2.492e+02 4.594e+02, threshold=4.179e+02, percent-clipped=1.0 +2022-11-15 23:07:56,608 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.77 vs. limit=5.0 +2022-11-15 23:07:58,111 INFO [train.py:876] (2/4) Epoch 7, batch 6600, loss[loss=0.07635, simple_loss=0.1045, pruned_loss=0.0241, over 5165.00 frames. ], tot_loss[loss=0.1458, simple_loss=0.1617, pruned_loss=0.06492, over 1081926.15 frames. ], batch size: 8, lr: 1.12e-02, grad_scale: 16.0 +2022-11-15 23:08:15,539 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=50258.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 23:08:57,891 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.220e+02 1.709e+02 2.078e+02 2.678e+02 6.792e+02, threshold=4.156e+02, percent-clipped=5.0 +2022-11-15 23:09:05,715 INFO [train.py:876] (2/4) Epoch 7, batch 6700, loss[loss=0.2196, simple_loss=0.2096, pruned_loss=0.1149, over 5451.00 frames. ], tot_loss[loss=0.1458, simple_loss=0.1618, pruned_loss=0.06496, over 1089083.96 frames. ], batch size: 53, lr: 1.12e-02, grad_scale: 16.0 +2022-11-15 23:09:47,982 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.89 vs. limit=2.0 +2022-11-15 23:10:02,988 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.79 vs. limit=2.0 +2022-11-15 23:10:05,743 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.161e+02 1.792e+02 2.397e+02 3.058e+02 5.863e+02, threshold=4.794e+02, percent-clipped=9.0 +2022-11-15 23:10:06,553 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=50422.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:10:13,770 INFO [train.py:876] (2/4) Epoch 7, batch 6800, loss[loss=0.1523, simple_loss=0.1754, pruned_loss=0.0646, over 5561.00 frames. ], tot_loss[loss=0.1454, simple_loss=0.162, pruned_loss=0.06441, over 1095537.63 frames. ], batch size: 25, lr: 1.11e-02, grad_scale: 16.0 +2022-11-15 23:10:34,672 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.8303, 2.1835, 2.6786, 3.5837, 3.6382, 2.6186, 2.1606, 3.6507], + device='cuda:2'), covar=tensor([0.0450, 0.2732, 0.2146, 0.2769, 0.1115, 0.3137, 0.2334, 0.0418], + device='cuda:2'), in_proj_covar=tensor([0.0211, 0.0204, 0.0203, 0.0324, 0.0226, 0.0219, 0.0195, 0.0217], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0005], + device='cuda:2') +2022-11-15 23:10:38,769 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=50470.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:10:48,163 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=50484.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:11:12,538 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.121e+02 1.731e+02 2.163e+02 2.798e+02 6.704e+02, threshold=4.325e+02, percent-clipped=4.0 +2022-11-15 23:11:13,646 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.02 vs. limit=2.0 +2022-11-15 23:11:17,200 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.35 vs. limit=2.0 +2022-11-15 23:11:20,774 INFO [train.py:876] (2/4) Epoch 7, batch 6900, loss[loss=0.1261, simple_loss=0.1407, pruned_loss=0.05581, over 5507.00 frames. ], tot_loss[loss=0.1448, simple_loss=0.1617, pruned_loss=0.06395, over 1093974.16 frames. ], batch size: 17, lr: 1.11e-02, grad_scale: 16.0 +2022-11-15 23:11:22,242 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.6386, 2.7109, 2.5176, 2.8086, 2.2141, 2.3207, 2.5366, 3.0779], + device='cuda:2'), covar=tensor([0.1080, 0.1565, 0.2608, 0.1858, 0.2006, 0.1691, 0.1781, 0.4480], + device='cuda:2'), in_proj_covar=tensor([0.0080, 0.0085, 0.0094, 0.0074, 0.0079, 0.0079, 0.0088, 0.0062], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 23:11:29,482 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=50546.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:11:34,027 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=50553.0, num_to_drop=1, layers_to_drop={3} +2022-11-15 23:11:40,805 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.00 vs. limit=2.0 +2022-11-15 23:12:10,991 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=50607.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:12:20,427 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.075e+02 1.773e+02 2.130e+02 2.487e+02 4.682e+02, threshold=4.260e+02, percent-clipped=1.0 +2022-11-15 23:12:22,527 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.07 vs. limit=2.0 +2022-11-15 23:12:28,774 INFO [train.py:876] (2/4) Epoch 7, batch 7000, loss[loss=0.1371, simple_loss=0.1512, pruned_loss=0.06153, over 4035.00 frames. ], tot_loss[loss=0.143, simple_loss=0.1603, pruned_loss=0.06286, over 1089989.86 frames. ], batch size: 4, lr: 1.11e-02, grad_scale: 16.0 +2022-11-15 23:13:27,406 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.163e+02 1.720e+02 2.093e+02 2.606e+02 4.257e+02, threshold=4.187e+02, percent-clipped=0.0 +2022-11-15 23:13:35,686 INFO [train.py:876] (2/4) Epoch 7, batch 7100, loss[loss=0.1482, simple_loss=0.1814, pruned_loss=0.05745, over 5708.00 frames. ], tot_loss[loss=0.1437, simple_loss=0.1604, pruned_loss=0.06354, over 1081676.19 frames. ], batch size: 17, lr: 1.11e-02, grad_scale: 16.0 +2022-11-15 23:13:45,229 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.8734, 3.7470, 2.5940, 3.4566, 2.8472, 2.5337, 2.0106, 3.2054], + device='cuda:2'), covar=tensor([0.1894, 0.0318, 0.1192, 0.0437, 0.0904, 0.1256, 0.2108, 0.0466], + device='cuda:2'), in_proj_covar=tensor([0.0168, 0.0135, 0.0164, 0.0139, 0.0172, 0.0174, 0.0175, 0.0149], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-15 23:13:54,388 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.2600, 2.3208, 2.1121, 2.4203, 2.0481, 1.7986, 2.1124, 2.5392], + device='cuda:2'), covar=tensor([0.1421, 0.1506, 0.3052, 0.1258, 0.1803, 0.1609, 0.2037, 0.1315], + device='cuda:2'), in_proj_covar=tensor([0.0083, 0.0088, 0.0098, 0.0077, 0.0081, 0.0081, 0.0090, 0.0063], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-15 23:14:11,203 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=50782.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:14:12,517 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=50784.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:14:29,890 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=50808.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 23:14:38,107 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.155e+02 1.697e+02 1.983e+02 2.631e+02 5.249e+02, threshold=3.966e+02, percent-clipped=2.0 +2022-11-15 23:14:45,331 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=50832.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:14:45,961 INFO [train.py:876] (2/4) Epoch 7, batch 7200, loss[loss=0.2489, simple_loss=0.2154, pruned_loss=0.1411, over 3146.00 frames. ], tot_loss[loss=0.1476, simple_loss=0.1634, pruned_loss=0.06597, over 1080870.00 frames. ], batch size: 284, lr: 1.11e-02, grad_scale: 16.0 +2022-11-15 23:14:53,261 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=50843.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:14:56,379 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.8198, 5.1799, 5.4503, 5.1817, 5.8355, 5.6163, 4.9007, 5.7292], + device='cuda:2'), covar=tensor([0.0271, 0.0262, 0.0425, 0.0284, 0.0246, 0.0113, 0.0195, 0.0204], + device='cuda:2'), in_proj_covar=tensor([0.0121, 0.0127, 0.0097, 0.0127, 0.0141, 0.0083, 0.0106, 0.0126], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-15 23:14:59,659 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=50853.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 23:15:10,565 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=50869.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 23:15:17,673 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1875, 3.2450, 3.1840, 2.8949, 1.9212, 3.2953, 1.9770, 2.7813], + device='cuda:2'), covar=tensor([0.0399, 0.0124, 0.0135, 0.0293, 0.0450, 0.0118, 0.0397, 0.0130], + device='cuda:2'), in_proj_covar=tensor([0.0181, 0.0147, 0.0163, 0.0183, 0.0177, 0.0160, 0.0173, 0.0157], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-15 23:15:31,058 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=50901.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 23:15:31,698 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=50902.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:16:19,019 INFO [train.py:876] (2/4) Epoch 8, batch 0, loss[loss=0.1216, simple_loss=0.1551, pruned_loss=0.04405, over 5578.00 frames. ], tot_loss[loss=0.1216, simple_loss=0.1551, pruned_loss=0.04405, over 5578.00 frames. ], batch size: 22, lr: 1.05e-02, grad_scale: 16.0 +2022-11-15 23:16:19,019 INFO [train.py:899] (2/4) Computing validation loss +2022-11-15 23:16:35,654 INFO [train.py:908] (2/4) Epoch 8, validation: loss=0.161, simple_loss=0.1821, pruned_loss=0.06991, over 1530663.00 frames. +2022-11-15 23:16:35,655 INFO [train.py:909] (2/4) Maximum memory allocated so far is 4715MB +2022-11-15 23:16:45,814 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.120e+02 1.842e+02 2.230e+02 2.830e+02 5.263e+02, threshold=4.459e+02, percent-clipped=7.0 +2022-11-15 23:16:58,254 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.5309, 4.5433, 4.6924, 4.7238, 4.3102, 3.9012, 5.1234, 4.5145], + device='cuda:2'), covar=tensor([0.0401, 0.0879, 0.0294, 0.0895, 0.0441, 0.0354, 0.0724, 0.0528], + device='cuda:2'), in_proj_covar=tensor([0.0075, 0.0095, 0.0081, 0.0105, 0.0077, 0.0068, 0.0130, 0.0089], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 23:17:00,339 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.0650, 1.2094, 1.2149, 0.9620, 0.9032, 1.2326, 0.8138, 1.2624], + device='cuda:2'), covar=tensor([0.0036, 0.0024, 0.0036, 0.0040, 0.0036, 0.0028, 0.0052, 0.0038], + device='cuda:2'), in_proj_covar=tensor([0.0040, 0.0035, 0.0038, 0.0039, 0.0036, 0.0033, 0.0037, 0.0031], + device='cuda:2'), out_proj_covar=tensor([3.5973e-05, 3.1989e-05, 3.4514e-05, 3.5765e-05, 3.1863e-05, 2.8815e-05, + 3.5212e-05, 2.7106e-05], device='cuda:2') +2022-11-15 23:17:02,916 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=50946.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:17:42,749 INFO [train.py:876] (2/4) Epoch 8, batch 100, loss[loss=0.1134, simple_loss=0.1362, pruned_loss=0.0453, over 5696.00 frames. ], tot_loss[loss=0.1471, simple_loss=0.1637, pruned_loss=0.06522, over 433198.28 frames. ], batch size: 12, lr: 1.04e-02, grad_scale: 16.0 +2022-11-15 23:17:44,302 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=51007.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:17:53,335 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.018e+02 1.590e+02 1.934e+02 2.468e+02 5.065e+02, threshold=3.869e+02, percent-clipped=2.0 +2022-11-15 23:17:57,739 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.30 vs. limit=5.0 +2022-11-15 23:18:07,923 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.64 vs. limit=2.0 +2022-11-15 23:18:08,867 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.6292, 4.6202, 3.3229, 4.4999, 3.5692, 3.1157, 2.5952, 3.9437], + device='cuda:2'), covar=tensor([0.1386, 0.0240, 0.0855, 0.0281, 0.0508, 0.0989, 0.1902, 0.0312], + device='cuda:2'), in_proj_covar=tensor([0.0170, 0.0139, 0.0166, 0.0142, 0.0176, 0.0178, 0.0178, 0.0151], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-15 23:18:14,687 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0078, 3.8896, 2.7349, 3.7583, 2.9883, 2.6204, 2.1131, 3.3313], + device='cuda:2'), covar=tensor([0.1670, 0.0292, 0.0911, 0.0284, 0.0766, 0.1047, 0.1949, 0.0360], + device='cuda:2'), in_proj_covar=tensor([0.0169, 0.0138, 0.0166, 0.0141, 0.0175, 0.0177, 0.0178, 0.0151], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-15 23:18:21,358 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=51062.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:18:49,824 INFO [train.py:876] (2/4) Epoch 8, batch 200, loss[loss=0.1345, simple_loss=0.152, pruned_loss=0.05852, over 5735.00 frames. ], tot_loss[loss=0.1433, simple_loss=0.1612, pruned_loss=0.06263, over 696959.37 frames. ], batch size: 31, lr: 1.04e-02, grad_scale: 16.0 +2022-11-15 23:18:49,893 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.3943, 3.0893, 3.2464, 2.9722, 3.4752, 3.3259, 3.1660, 3.4120], + device='cuda:2'), covar=tensor([0.0446, 0.0375, 0.0471, 0.0461, 0.0408, 0.0188, 0.0337, 0.0427], + device='cuda:2'), in_proj_covar=tensor([0.0123, 0.0128, 0.0098, 0.0128, 0.0141, 0.0084, 0.0108, 0.0128], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-15 23:19:00,022 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.568e+01 1.812e+02 2.179e+02 2.624e+02 4.566e+02, threshold=4.359e+02, percent-clipped=4.0 +2022-11-15 23:19:01,573 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=51123.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:19:11,893 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=51138.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:19:29,232 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=51164.0, num_to_drop=1, layers_to_drop={2} +2022-11-15 23:19:34,477 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=51172.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 23:19:43,973 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.22 vs. limit=2.0 +2022-11-15 23:19:45,098 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.19 vs. limit=2.0 +2022-11-15 23:19:51,603 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.45 vs. limit=2.0 +2022-11-15 23:19:54,826 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=51202.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:19:56,974 INFO [train.py:876] (2/4) Epoch 8, batch 300, loss[loss=0.1002, simple_loss=0.1226, pruned_loss=0.0389, over 5251.00 frames. ], tot_loss[loss=0.1449, simple_loss=0.162, pruned_loss=0.06392, over 855064.11 frames. ], batch size: 8, lr: 1.04e-02, grad_scale: 16.0 +2022-11-15 23:20:06,883 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.85 vs. limit=2.0 +2022-11-15 23:20:07,740 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.034e+02 1.685e+02 2.012e+02 2.730e+02 5.121e+02, threshold=4.024e+02, percent-clipped=2.0 +2022-11-15 23:20:15,738 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=51233.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 23:20:27,337 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=51250.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:20:36,069 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.5736, 1.6888, 2.3893, 2.2302, 2.3330, 1.6054, 2.1236, 2.5269], + device='cuda:2'), covar=tensor([0.0421, 0.1022, 0.0527, 0.0746, 0.0586, 0.1009, 0.0710, 0.0495], + device='cuda:2'), in_proj_covar=tensor([0.0217, 0.0191, 0.0199, 0.0208, 0.0212, 0.0187, 0.0222, 0.0216], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-15 23:20:37,356 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0574, 3.6656, 2.6113, 3.4247, 2.6133, 2.6644, 2.1505, 3.0841], + device='cuda:2'), covar=tensor([0.1437, 0.0212, 0.0983, 0.0306, 0.0955, 0.0920, 0.1693, 0.0356], + device='cuda:2'), in_proj_covar=tensor([0.0169, 0.0137, 0.0166, 0.0140, 0.0175, 0.0175, 0.0177, 0.0151], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-15 23:20:46,047 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.6315, 1.6036, 1.8602, 1.6090, 1.0364, 1.5281, 1.1900, 1.2819], + device='cuda:2'), covar=tensor([0.0112, 0.0051, 0.0066, 0.0084, 0.0198, 0.0069, 0.0166, 0.0106], + device='cuda:2'), in_proj_covar=tensor([0.0182, 0.0148, 0.0163, 0.0185, 0.0179, 0.0162, 0.0176, 0.0159], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-15 23:21:03,050 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=51302.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:21:03,847 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.8315, 3.7257, 3.8721, 1.9970, 3.4860, 3.8369, 3.9467, 4.5783], + device='cuda:2'), covar=tensor([0.1800, 0.1189, 0.0463, 0.2606, 0.0231, 0.0466, 0.0331, 0.0309], + device='cuda:2'), in_proj_covar=tensor([0.0178, 0.0187, 0.0155, 0.0190, 0.0169, 0.0178, 0.0145, 0.0181], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-11-15 23:21:05,012 INFO [train.py:876] (2/4) Epoch 8, batch 400, loss[loss=0.1469, simple_loss=0.1644, pruned_loss=0.06469, over 5618.00 frames. ], tot_loss[loss=0.1451, simple_loss=0.1619, pruned_loss=0.06412, over 938885.28 frames. ], batch size: 29, lr: 1.04e-02, grad_scale: 16.0 +2022-11-15 23:21:16,263 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.652e+01 1.583e+02 1.915e+02 2.557e+02 6.087e+02, threshold=3.830e+02, percent-clipped=2.0 +2022-11-15 23:21:46,363 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.2465, 2.4732, 2.9956, 3.9037, 3.9132, 3.2889, 2.4233, 3.9279], + device='cuda:2'), covar=tensor([0.0365, 0.2979, 0.2549, 0.2755, 0.0839, 0.2518, 0.2332, 0.0844], + device='cuda:2'), in_proj_covar=tensor([0.0215, 0.0207, 0.0205, 0.0325, 0.0228, 0.0218, 0.0198, 0.0220], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0005], + device='cuda:2') +2022-11-15 23:21:57,197 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=51382.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:21:58,549 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.0566, 1.5006, 1.1828, 0.9369, 1.1327, 1.1405, 0.8075, 1.3125], + device='cuda:2'), covar=tensor([0.0035, 0.0023, 0.0042, 0.0039, 0.0033, 0.0030, 0.0052, 0.0030], + device='cuda:2'), in_proj_covar=tensor([0.0040, 0.0035, 0.0038, 0.0039, 0.0036, 0.0034, 0.0037, 0.0031], + device='cuda:2'), out_proj_covar=tensor([3.6319e-05, 3.2301e-05, 3.5061e-05, 3.5048e-05, 3.1606e-05, 2.8939e-05, + 3.5085e-05, 2.7219e-05], device='cuda:2') +2022-11-15 23:22:03,770 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=51392.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:22:04,496 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.7624, 2.0840, 3.3353, 2.8758, 3.6845, 2.0117, 2.9157, 3.8230], + device='cuda:2'), covar=tensor([0.0649, 0.2034, 0.0837, 0.1561, 0.0546, 0.1930, 0.1405, 0.0701], + device='cuda:2'), in_proj_covar=tensor([0.0217, 0.0191, 0.0198, 0.0208, 0.0212, 0.0186, 0.0220, 0.0214], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-15 23:22:12,922 INFO [train.py:876] (2/4) Epoch 8, batch 500, loss[loss=0.132, simple_loss=0.1571, pruned_loss=0.05348, over 5565.00 frames. ], tot_loss[loss=0.1443, simple_loss=0.1615, pruned_loss=0.06356, over 992901.62 frames. ], batch size: 40, lr: 1.04e-02, grad_scale: 16.0 +2022-11-15 23:22:21,689 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=51418.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:22:21,992 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.05 vs. limit=2.0 +2022-11-15 23:22:23,655 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.464e+01 1.684e+02 2.091e+02 2.743e+02 4.142e+02, threshold=4.181e+02, percent-clipped=1.0 +2022-11-15 23:22:35,557 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=51438.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:22:38,892 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=51443.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:22:45,533 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=51453.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:22:47,384 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.97 vs. limit=2.0 +2022-11-15 23:22:53,370 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=51464.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 23:23:08,423 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=51486.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:23:14,486 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=51495.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:23:21,004 INFO [train.py:876] (2/4) Epoch 8, batch 600, loss[loss=0.1732, simple_loss=0.1888, pruned_loss=0.07882, over 5608.00 frames. ], tot_loss[loss=0.1442, simple_loss=0.1613, pruned_loss=0.06359, over 1032134.66 frames. ], batch size: 22, lr: 1.04e-02, grad_scale: 32.0 +2022-11-15 23:23:26,056 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=51512.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 23:23:32,195 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.319e+01 1.675e+02 2.028e+02 2.576e+02 4.109e+02, threshold=4.056e+02, percent-clipped=0.0 +2022-11-15 23:23:37,232 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=51528.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 23:23:56,179 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=51556.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:24:28,201 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=51602.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:24:28,837 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.9998, 1.9220, 2.2531, 1.4237, 1.0427, 2.9843, 2.1786, 1.5983], + device='cuda:2'), covar=tensor([0.0960, 0.1163, 0.0681, 0.2934, 0.6669, 0.0627, 0.1257, 0.1630], + device='cuda:2'), in_proj_covar=tensor([0.0069, 0.0056, 0.0059, 0.0076, 0.0059, 0.0047, 0.0054, 0.0061], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0002, 0.0001, 0.0001, 0.0001, 0.0001], + device='cuda:2') +2022-11-15 23:24:30,082 INFO [train.py:876] (2/4) Epoch 8, batch 700, loss[loss=0.126, simple_loss=0.1504, pruned_loss=0.05078, over 5558.00 frames. ], tot_loss[loss=0.1469, simple_loss=0.1637, pruned_loss=0.0651, over 1051525.94 frames. ], batch size: 21, lr: 1.04e-02, grad_scale: 32.0 +2022-11-15 23:24:35,029 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.9952, 1.5612, 1.0450, 0.9491, 1.3372, 1.2580, 0.6432, 1.4688], + device='cuda:2'), covar=tensor([0.0029, 0.0018, 0.0028, 0.0032, 0.0025, 0.0023, 0.0056, 0.0023], + device='cuda:2'), in_proj_covar=tensor([0.0040, 0.0036, 0.0039, 0.0040, 0.0037, 0.0034, 0.0038, 0.0031], + device='cuda:2'), out_proj_covar=tensor([3.6497e-05, 3.3278e-05, 3.5828e-05, 3.6052e-05, 3.2611e-05, 2.9060e-05, + 3.6151e-05, 2.7664e-05], device='cuda:2') +2022-11-15 23:24:38,289 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.5685, 2.1889, 2.5623, 3.2626, 3.4076, 2.5208, 2.0886, 3.4734], + device='cuda:2'), covar=tensor([0.0674, 0.3081, 0.2630, 0.4058, 0.1568, 0.3456, 0.2310, 0.0618], + device='cuda:2'), in_proj_covar=tensor([0.0215, 0.0207, 0.0200, 0.0324, 0.0228, 0.0215, 0.0194, 0.0220], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0005], + device='cuda:2') +2022-11-15 23:24:40,707 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.048e+01 1.604e+02 2.114e+02 2.490e+02 4.177e+02, threshold=4.229e+02, percent-clipped=3.0 +2022-11-15 23:25:01,592 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=51650.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:25:39,230 INFO [train.py:876] (2/4) Epoch 8, batch 800, loss[loss=0.1456, simple_loss=0.1442, pruned_loss=0.07349, over 4143.00 frames. ], tot_loss[loss=0.1445, simple_loss=0.1613, pruned_loss=0.06387, over 1053284.24 frames. ], batch size: 181, lr: 1.04e-02, grad_scale: 16.0 +2022-11-15 23:25:47,917 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=51718.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:25:50,399 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.606e+01 1.543e+02 1.960e+02 2.359e+02 4.121e+02, threshold=3.919e+02, percent-clipped=0.0 +2022-11-15 23:25:51,831 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.4152, 5.0403, 4.5816, 4.9569, 5.0646, 4.3476, 4.8698, 4.5491], + device='cuda:2'), covar=tensor([0.0282, 0.0346, 0.1315, 0.0566, 0.0407, 0.0373, 0.0400, 0.0484], + device='cuda:2'), in_proj_covar=tensor([0.0122, 0.0165, 0.0258, 0.0157, 0.0203, 0.0159, 0.0173, 0.0158], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0004, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 23:26:01,938 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=51738.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:26:08,000 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=51747.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:26:08,934 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=51748.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:26:20,931 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=51766.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:26:23,261 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.68 vs. limit=5.0 +2022-11-15 23:26:23,670 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0794, 2.0770, 2.5443, 1.4225, 1.4790, 2.9425, 2.3239, 2.0452], + device='cuda:2'), covar=tensor([0.0788, 0.0865, 0.0443, 0.2539, 0.3042, 0.0668, 0.0945, 0.1158], + device='cuda:2'), in_proj_covar=tensor([0.0069, 0.0057, 0.0060, 0.0075, 0.0057, 0.0048, 0.0053, 0.0061], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0002, 0.0001, 0.0001, 0.0001, 0.0001], + device='cuda:2') +2022-11-15 23:26:41,607 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.72 vs. limit=5.0 +2022-11-15 23:26:47,898 INFO [train.py:876] (2/4) Epoch 8, batch 900, loss[loss=0.1701, simple_loss=0.1836, pruned_loss=0.07829, over 5698.00 frames. ], tot_loss[loss=0.1438, simple_loss=0.1611, pruned_loss=0.06324, over 1071617.61 frames. ], batch size: 28, lr: 1.04e-02, grad_scale: 16.0 +2022-11-15 23:26:50,119 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=51808.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:26:53,813 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.0414, 4.6761, 4.0627, 3.7177, 2.3618, 4.4317, 2.4219, 3.7454], + device='cuda:2'), covar=tensor([0.0307, 0.0061, 0.0173, 0.0330, 0.0467, 0.0097, 0.0406, 0.0093], + device='cuda:2'), in_proj_covar=tensor([0.0176, 0.0144, 0.0159, 0.0179, 0.0172, 0.0158, 0.0171, 0.0155], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-15 23:26:59,511 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.277e+02 1.796e+02 2.172e+02 2.751e+02 5.616e+02, threshold=4.345e+02, percent-clipped=4.0 +2022-11-15 23:27:03,655 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=51828.0, num_to_drop=1, layers_to_drop={2} +2022-11-15 23:27:07,663 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1731, 4.3961, 3.0220, 4.1615, 3.4613, 3.0102, 2.3806, 3.7683], + device='cuda:2'), covar=tensor([0.1712, 0.0203, 0.0965, 0.0332, 0.0623, 0.0893, 0.1858, 0.0315], + device='cuda:2'), in_proj_covar=tensor([0.0169, 0.0137, 0.0165, 0.0141, 0.0174, 0.0176, 0.0175, 0.0149], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-15 23:27:12,052 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.8582, 4.4225, 4.6610, 4.3788, 4.9478, 4.8410, 4.3579, 4.8937], + device='cuda:2'), covar=tensor([0.0424, 0.0288, 0.0460, 0.0280, 0.0364, 0.0119, 0.0252, 0.0278], + device='cuda:2'), in_proj_covar=tensor([0.0124, 0.0132, 0.0100, 0.0130, 0.0146, 0.0087, 0.0110, 0.0132], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-15 23:27:18,387 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.5252, 4.0547, 3.2364, 2.0105, 3.9162, 1.3138, 3.7117, 2.2413], + device='cuda:2'), covar=tensor([0.1485, 0.0171, 0.0569, 0.2034, 0.0174, 0.2236, 0.0268, 0.1593], + device='cuda:2'), in_proj_covar=tensor([0.0129, 0.0108, 0.0118, 0.0117, 0.0107, 0.0128, 0.0102, 0.0117], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-15 23:27:19,697 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=51851.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:27:37,039 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=51876.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 23:27:48,033 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.0903, 1.3967, 1.6654, 1.0252, 1.2431, 1.6068, 1.0643, 0.7063], + device='cuda:2'), covar=tensor([0.0021, 0.0061, 0.0018, 0.0036, 0.0032, 0.0019, 0.0029, 0.0044], + device='cuda:2'), in_proj_covar=tensor([0.0020, 0.0020, 0.0020, 0.0025, 0.0022, 0.0020, 0.0024, 0.0024], + device='cuda:2'), out_proj_covar=tensor([1.8290e-05, 1.9464e-05, 1.8547e-05, 2.4860e-05, 2.1233e-05, 1.9779e-05, + 2.3120e-05, 2.5534e-05], device='cuda:2') +2022-11-15 23:27:57,331 INFO [train.py:876] (2/4) Epoch 8, batch 1000, loss[loss=0.1963, simple_loss=0.1945, pruned_loss=0.09909, over 4676.00 frames. ], tot_loss[loss=0.144, simple_loss=0.1612, pruned_loss=0.06337, over 1076019.15 frames. ], batch size: 135, lr: 1.04e-02, grad_scale: 16.0 +2022-11-15 23:28:05,019 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.3336, 1.0303, 1.0924, 0.7932, 1.1131, 1.0994, 0.7575, 0.8024], + device='cuda:2'), covar=tensor([0.0535, 0.0597, 0.0348, 0.1094, 0.1106, 0.0632, 0.1069, 0.0536], + device='cuda:2'), in_proj_covar=tensor([0.0011, 0.0016, 0.0011, 0.0014, 0.0012, 0.0011, 0.0015, 0.0011], + device='cuda:2'), out_proj_covar=tensor([5.5324e-05, 7.3615e-05, 5.4550e-05, 6.5331e-05, 6.0447e-05, 5.4626e-05, + 6.8145e-05, 5.6243e-05], device='cuda:2') +2022-11-15 23:28:08,740 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.111e+02 1.780e+02 2.132e+02 2.745e+02 5.068e+02, threshold=4.264e+02, percent-clipped=2.0 +2022-11-15 23:28:32,653 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.0112, 4.9657, 4.9991, 5.0956, 4.4453, 3.8507, 5.6317, 4.7873], + device='cuda:2'), covar=tensor([0.0349, 0.0650, 0.0480, 0.1209, 0.0445, 0.0357, 0.0692, 0.0454], + device='cuda:2'), in_proj_covar=tensor([0.0078, 0.0098, 0.0084, 0.0108, 0.0080, 0.0071, 0.0133, 0.0092], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 23:29:04,721 INFO [train.py:876] (2/4) Epoch 8, batch 1100, loss[loss=0.2389, simple_loss=0.2089, pruned_loss=0.1345, over 2918.00 frames. ], tot_loss[loss=0.1454, simple_loss=0.1621, pruned_loss=0.0644, over 1076720.52 frames. ], batch size: 284, lr: 1.03e-02, grad_scale: 16.0 +2022-11-15 23:29:12,352 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.99 vs. limit=2.0 +2022-11-15 23:29:16,572 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.856e+01 1.738e+02 2.117e+02 2.536e+02 5.317e+02, threshold=4.235e+02, percent-clipped=1.0 +2022-11-15 23:29:19,702 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.2955, 1.2917, 1.8307, 1.3253, 1.3236, 1.5274, 1.1910, 1.2786], + device='cuda:2'), covar=tensor([0.0103, 0.0072, 0.0025, 0.0051, 0.0057, 0.0083, 0.0031, 0.0047], + device='cuda:2'), in_proj_covar=tensor([0.0020, 0.0020, 0.0021, 0.0025, 0.0023, 0.0021, 0.0024, 0.0025], + device='cuda:2'), out_proj_covar=tensor([1.8885e-05, 1.9755e-05, 1.9082e-05, 2.5378e-05, 2.1624e-05, 2.0688e-05, + 2.3764e-05, 2.6098e-05], device='cuda:2') +2022-11-15 23:29:25,949 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.36 vs. limit=2.0 +2022-11-15 23:29:26,364 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.2246, 2.8838, 3.0432, 2.7704, 1.8776, 2.9874, 1.9789, 2.6924], + device='cuda:2'), covar=tensor([0.0326, 0.0170, 0.0141, 0.0301, 0.0378, 0.0143, 0.0373, 0.0147], + device='cuda:2'), in_proj_covar=tensor([0.0179, 0.0147, 0.0161, 0.0182, 0.0174, 0.0160, 0.0173, 0.0158], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-15 23:29:27,935 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=52038.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:29:29,213 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=52040.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:29:34,335 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=52048.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:29:45,745 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.19 vs. limit=2.0 +2022-11-15 23:29:55,141 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.78 vs. limit=2.0 +2022-11-15 23:29:59,646 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=52086.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:30:06,071 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=52096.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:30:09,794 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=52101.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:30:10,950 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=52103.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:30:12,208 INFO [train.py:876] (2/4) Epoch 8, batch 1200, loss[loss=0.1676, simple_loss=0.1801, pruned_loss=0.07756, over 5546.00 frames. ], tot_loss[loss=0.1442, simple_loss=0.1611, pruned_loss=0.0636, over 1077697.74 frames. ], batch size: 40, lr: 1.03e-02, grad_scale: 8.0 +2022-11-15 23:30:12,310 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.9685, 4.7231, 3.6247, 1.9835, 4.5019, 1.8849, 4.3675, 2.5769], + device='cuda:2'), covar=tensor([0.1354, 0.0190, 0.0522, 0.2414, 0.0172, 0.2206, 0.0226, 0.1797], + device='cuda:2'), in_proj_covar=tensor([0.0128, 0.0107, 0.0117, 0.0116, 0.0107, 0.0129, 0.0100, 0.0117], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-15 23:30:23,870 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.007e+02 1.672e+02 2.083e+02 2.557e+02 4.587e+02, threshold=4.167e+02, percent-clipped=2.0 +2022-11-15 23:30:43,199 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.99 vs. limit=2.0 +2022-11-15 23:30:43,494 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=52151.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:30:44,163 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.4981, 4.0927, 3.3673, 1.6993, 3.9378, 1.5982, 3.6874, 2.0495], + device='cuda:2'), covar=tensor([0.1975, 0.0363, 0.0783, 0.3145, 0.0363, 0.2894, 0.0470, 0.2655], + device='cuda:2'), in_proj_covar=tensor([0.0128, 0.0107, 0.0117, 0.0116, 0.0107, 0.0129, 0.0100, 0.0117], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-15 23:31:12,355 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=52193.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:31:14,666 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.88 vs. limit=2.0 +2022-11-15 23:31:16,112 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=52199.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:31:16,127 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.6893, 4.4924, 4.7360, 4.7198, 4.3140, 4.0763, 5.1881, 4.4782], + device='cuda:2'), covar=tensor([0.0400, 0.1125, 0.0345, 0.1181, 0.0488, 0.0330, 0.0660, 0.0520], + device='cuda:2'), in_proj_covar=tensor([0.0078, 0.0101, 0.0084, 0.0111, 0.0082, 0.0072, 0.0136, 0.0094], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 23:31:20,578 INFO [train.py:876] (2/4) Epoch 8, batch 1300, loss[loss=0.1179, simple_loss=0.1392, pruned_loss=0.04836, over 5735.00 frames. ], tot_loss[loss=0.144, simple_loss=0.1611, pruned_loss=0.0635, over 1076995.60 frames. ], batch size: 11, lr: 1.03e-02, grad_scale: 8.0 +2022-11-15 23:31:24,727 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.6544, 4.1932, 3.8049, 3.5287, 2.2494, 4.0805, 2.2722, 3.6198], + device='cuda:2'), covar=tensor([0.0429, 0.0182, 0.0172, 0.0324, 0.0539, 0.0131, 0.0495, 0.0152], + device='cuda:2'), in_proj_covar=tensor([0.0180, 0.0146, 0.0160, 0.0181, 0.0174, 0.0160, 0.0173, 0.0158], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-15 23:31:32,436 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.004e+02 1.693e+02 2.062e+02 2.550e+02 7.238e+02, threshold=4.125e+02, percent-clipped=3.0 +2022-11-15 23:31:54,168 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=52254.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:32:26,708 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.6200, 4.1087, 3.6119, 4.0436, 4.0905, 3.4713, 3.7062, 3.5880], + device='cuda:2'), covar=tensor([0.0679, 0.0471, 0.1312, 0.0416, 0.0440, 0.0412, 0.0680, 0.0714], + device='cuda:2'), in_proj_covar=tensor([0.0122, 0.0162, 0.0259, 0.0159, 0.0203, 0.0160, 0.0174, 0.0159], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0004, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 23:32:27,892 INFO [train.py:876] (2/4) Epoch 8, batch 1400, loss[loss=0.1336, simple_loss=0.1667, pruned_loss=0.05027, over 5624.00 frames. ], tot_loss[loss=0.1431, simple_loss=0.161, pruned_loss=0.06265, over 1078584.40 frames. ], batch size: 18, lr: 1.03e-02, grad_scale: 8.0 +2022-11-15 23:32:32,562 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=52312.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:32:39,668 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.080e+02 1.794e+02 2.190e+02 2.627e+02 5.142e+02, threshold=4.380e+02, percent-clipped=4.0 +2022-11-15 23:32:46,415 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.4648, 3.7893, 3.6343, 3.3073, 2.1732, 3.8518, 2.1785, 3.0875], + device='cuda:2'), covar=tensor([0.0405, 0.0211, 0.0155, 0.0344, 0.0502, 0.0127, 0.0498, 0.0145], + device='cuda:2'), in_proj_covar=tensor([0.0184, 0.0150, 0.0164, 0.0186, 0.0179, 0.0163, 0.0177, 0.0163], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-15 23:33:04,364 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.9441, 0.6018, 0.7994, 0.7911, 0.9240, 0.9450, 0.5367, 0.6026], + device='cuda:2'), covar=tensor([0.0267, 0.0390, 0.0307, 0.0536, 0.0333, 0.0299, 0.0931, 0.0435], + device='cuda:2'), in_proj_covar=tensor([0.0011, 0.0016, 0.0011, 0.0013, 0.0012, 0.0010, 0.0015, 0.0011], + device='cuda:2'), out_proj_covar=tensor([5.6044e-05, 7.2768e-05, 5.4260e-05, 6.4467e-05, 5.9915e-05, 5.4531e-05, + 6.8240e-05, 5.5699e-05], device='cuda:2') +2022-11-15 23:33:13,765 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=52373.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:33:25,676 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.73 vs. limit=2.0 +2022-11-15 23:33:29,160 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=52396.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:33:33,941 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=52403.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:33:35,129 INFO [train.py:876] (2/4) Epoch 8, batch 1500, loss[loss=0.0711, simple_loss=0.1081, pruned_loss=0.01704, over 5718.00 frames. ], tot_loss[loss=0.1431, simple_loss=0.1607, pruned_loss=0.06275, over 1085953.19 frames. ], batch size: 13, lr: 1.03e-02, grad_scale: 8.0 +2022-11-15 23:33:47,243 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.037e+02 1.630e+02 1.908e+02 2.524e+02 5.804e+02, threshold=3.816e+02, percent-clipped=2.0 +2022-11-15 23:34:06,269 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=52451.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:34:16,570 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.5222, 3.2350, 3.4585, 3.2909, 3.5494, 3.4493, 1.3475, 3.7191], + device='cuda:2'), covar=tensor([0.0375, 0.0571, 0.0356, 0.0387, 0.0439, 0.0402, 0.3483, 0.0295], + device='cuda:2'), in_proj_covar=tensor([0.0099, 0.0082, 0.0078, 0.0072, 0.0096, 0.0081, 0.0126, 0.0101], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 23:34:42,452 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.82 vs. limit=2.0 +2022-11-15 23:34:42,718 INFO [train.py:876] (2/4) Epoch 8, batch 1600, loss[loss=0.1769, simple_loss=0.1821, pruned_loss=0.08585, over 5647.00 frames. ], tot_loss[loss=0.142, simple_loss=0.1596, pruned_loss=0.06224, over 1080371.73 frames. ], batch size: 32, lr: 1.03e-02, grad_scale: 8.0 +2022-11-15 23:34:55,355 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.098e+02 1.681e+02 2.048e+02 2.311e+02 7.167e+02, threshold=4.097e+02, percent-clipped=4.0 +2022-11-15 23:35:05,994 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.8855, 3.7385, 3.7766, 3.5816, 3.9049, 3.7121, 1.5200, 4.0519], + device='cuda:2'), covar=tensor([0.0329, 0.0491, 0.0317, 0.0422, 0.0337, 0.0406, 0.3324, 0.0318], + device='cuda:2'), in_proj_covar=tensor([0.0100, 0.0082, 0.0079, 0.0073, 0.0097, 0.0082, 0.0126, 0.0101], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 23:35:13,814 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=52549.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:35:18,150 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=52555.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 23:35:54,044 INFO [train.py:876] (2/4) Epoch 8, batch 1700, loss[loss=0.1163, simple_loss=0.1398, pruned_loss=0.04636, over 5466.00 frames. ], tot_loss[loss=0.1431, simple_loss=0.1606, pruned_loss=0.06281, over 1082560.74 frames. ], batch size: 11, lr: 1.03e-02, grad_scale: 8.0 +2022-11-15 23:36:02,203 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=52616.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 23:36:06,802 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.098e+02 1.756e+02 2.105e+02 2.573e+02 4.026e+02, threshold=4.210e+02, percent-clipped=0.0 +2022-11-15 23:36:20,634 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.2984, 4.7987, 4.2556, 4.6776, 4.7956, 3.9097, 4.5310, 4.0682], + device='cuda:2'), covar=tensor([0.0222, 0.0358, 0.1342, 0.0377, 0.0350, 0.0460, 0.0469, 0.0573], + device='cuda:2'), in_proj_covar=tensor([0.0123, 0.0163, 0.0259, 0.0159, 0.0205, 0.0160, 0.0175, 0.0161], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0004, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 23:36:38,980 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=52668.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:36:49,134 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.63 vs. limit=2.0 +2022-11-15 23:36:51,532 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.3342, 2.2816, 3.9471, 3.2574, 4.4042, 2.9385, 3.9439, 4.2158], + device='cuda:2'), covar=tensor([0.0566, 0.2198, 0.0695, 0.1890, 0.0238, 0.1631, 0.1068, 0.0638], + device='cuda:2'), in_proj_covar=tensor([0.0221, 0.0191, 0.0195, 0.0208, 0.0214, 0.0189, 0.0219, 0.0217], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-15 23:36:59,017 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=52696.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:37:05,453 INFO [train.py:876] (2/4) Epoch 8, batch 1800, loss[loss=0.1018, simple_loss=0.1282, pruned_loss=0.03765, over 5504.00 frames. ], tot_loss[loss=0.1412, simple_loss=0.1593, pruned_loss=0.06158, over 1082765.72 frames. ], batch size: 12, lr: 1.03e-02, grad_scale: 8.0 +2022-11-15 23:37:18,120 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.097e+02 1.718e+02 1.975e+02 2.529e+02 4.433e+02, threshold=3.950e+02, percent-clipped=1.0 +2022-11-15 23:37:33,369 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=52744.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:37:38,968 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=52752.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:37:51,668 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.05 vs. limit=2.0 +2022-11-15 23:37:58,474 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.0151, 1.1131, 0.9632, 0.8764, 1.3301, 1.2703, 0.8419, 1.4021], + device='cuda:2'), covar=tensor([0.0036, 0.0019, 0.0042, 0.0048, 0.0026, 0.0024, 0.0079, 0.0027], + device='cuda:2'), in_proj_covar=tensor([0.0043, 0.0039, 0.0041, 0.0041, 0.0039, 0.0035, 0.0040, 0.0034], + device='cuda:2'), out_proj_covar=tensor([3.9597e-05, 3.5190e-05, 3.7128e-05, 3.6779e-05, 3.4698e-05, 3.0441e-05, + 3.8114e-05, 2.9915e-05], device='cuda:2') +2022-11-15 23:38:01,150 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.0959, 4.7753, 3.6858, 2.1083, 4.5316, 2.4371, 4.7480, 2.6786], + device='cuda:2'), covar=tensor([0.1280, 0.0172, 0.0564, 0.2335, 0.0176, 0.1470, 0.0137, 0.1686], + device='cuda:2'), in_proj_covar=tensor([0.0126, 0.0106, 0.0116, 0.0114, 0.0105, 0.0126, 0.0098, 0.0115], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-11-15 23:38:05,680 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.5132, 5.1327, 3.1496, 4.8452, 3.7339, 3.2249, 2.9437, 4.4162], + device='cuda:2'), covar=tensor([0.1380, 0.0126, 0.0847, 0.0199, 0.0419, 0.0883, 0.1515, 0.0140], + device='cuda:2'), in_proj_covar=tensor([0.0168, 0.0139, 0.0165, 0.0141, 0.0174, 0.0177, 0.0174, 0.0149], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-15 23:38:17,349 INFO [train.py:876] (2/4) Epoch 8, batch 1900, loss[loss=0.1858, simple_loss=0.1955, pruned_loss=0.08802, over 5738.00 frames. ], tot_loss[loss=0.1395, simple_loss=0.1584, pruned_loss=0.06029, over 1088615.96 frames. ], batch size: 31, lr: 1.03e-02, grad_scale: 8.0 +2022-11-15 23:38:19,225 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.35 vs. limit=2.0 +2022-11-15 23:38:23,495 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=52813.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:38:28,117 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.88 vs. limit=2.0 +2022-11-15 23:38:30,567 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.099e+02 1.627e+02 1.958e+02 2.548e+02 4.819e+02, threshold=3.916e+02, percent-clipped=3.0 +2022-11-15 23:38:49,593 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=52849.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:38:51,055 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.3209, 2.4550, 3.8086, 2.9929, 4.0856, 2.9309, 3.7309, 4.2837], + device='cuda:2'), covar=tensor([0.0455, 0.1507, 0.0658, 0.1719, 0.0338, 0.1331, 0.1087, 0.0577], + device='cuda:2'), in_proj_covar=tensor([0.0221, 0.0192, 0.0198, 0.0210, 0.0216, 0.0189, 0.0221, 0.0219], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-15 23:39:23,807 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=52897.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:39:29,668 INFO [train.py:876] (2/4) Epoch 8, batch 2000, loss[loss=0.1497, simple_loss=0.1713, pruned_loss=0.06406, over 5833.00 frames. ], tot_loss[loss=0.142, simple_loss=0.16, pruned_loss=0.06202, over 1087726.35 frames. ], batch size: 18, lr: 1.03e-02, grad_scale: 8.0 +2022-11-15 23:39:33,832 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=52911.0, num_to_drop=1, layers_to_drop={2} +2022-11-15 23:39:42,131 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.867e+01 1.663e+02 2.013e+02 2.665e+02 5.051e+02, threshold=4.025e+02, percent-clipped=6.0 +2022-11-15 23:40:14,207 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=52968.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:40:28,462 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.0686, 1.2490, 1.2633, 0.9273, 0.8043, 1.1542, 1.2652, 1.1250], + device='cuda:2'), covar=tensor([0.3367, 0.0746, 0.0676, 0.0895, 0.4432, 0.1928, 0.2034, 0.1590], + device='cuda:2'), in_proj_covar=tensor([0.0011, 0.0016, 0.0011, 0.0014, 0.0013, 0.0011, 0.0015, 0.0012], + device='cuda:2'), out_proj_covar=tensor([5.7103e-05, 7.5665e-05, 5.7389e-05, 6.8093e-05, 6.2585e-05, 5.6872e-05, + 7.1123e-05, 5.8261e-05], device='cuda:2') +2022-11-15 23:40:40,881 INFO [train.py:876] (2/4) Epoch 8, batch 2100, loss[loss=0.1032, simple_loss=0.1365, pruned_loss=0.03498, over 5503.00 frames. ], tot_loss[loss=0.1412, simple_loss=0.1593, pruned_loss=0.06152, over 1093680.81 frames. ], batch size: 11, lr: 1.02e-02, grad_scale: 8.0 +2022-11-15 23:40:41,840 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.06 vs. limit=5.0 +2022-11-15 23:40:44,268 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.37 vs. limit=2.0 +2022-11-15 23:40:48,788 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=53016.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:40:53,708 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.147e+02 1.690e+02 2.165e+02 2.534e+02 4.185e+02, threshold=4.330e+02, percent-clipped=4.0 +2022-11-15 23:41:14,438 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1563, 2.5679, 2.6163, 1.3210, 2.8284, 2.7508, 2.8072, 3.1340], + device='cuda:2'), covar=tensor([0.1824, 0.1526, 0.0949, 0.2893, 0.0582, 0.0792, 0.0421, 0.0757], + device='cuda:2'), in_proj_covar=tensor([0.0179, 0.0187, 0.0158, 0.0191, 0.0169, 0.0180, 0.0151, 0.0185], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-11-15 23:41:17,505 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=53056.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:41:52,189 INFO [train.py:876] (2/4) Epoch 8, batch 2200, loss[loss=0.1102, simple_loss=0.1412, pruned_loss=0.03957, over 5448.00 frames. ], tot_loss[loss=0.1396, simple_loss=0.1582, pruned_loss=0.06047, over 1092919.93 frames. ], batch size: 11, lr: 1.02e-02, grad_scale: 8.0 +2022-11-15 23:41:54,665 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=53108.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:42:01,091 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=53117.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:42:05,399 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.068e+02 1.642e+02 2.019e+02 2.545e+02 4.106e+02, threshold=4.038e+02, percent-clipped=0.0 +2022-11-15 23:42:13,020 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.9440, 4.3975, 3.9674, 3.8803, 2.5488, 4.5180, 2.5859, 3.9606], + device='cuda:2'), covar=tensor([0.0380, 0.0160, 0.0161, 0.0449, 0.0488, 0.0098, 0.0411, 0.0140], + device='cuda:2'), in_proj_covar=tensor([0.0184, 0.0153, 0.0165, 0.0187, 0.0179, 0.0164, 0.0176, 0.0162], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-15 23:42:54,747 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.8217, 2.2853, 3.1669, 2.7003, 3.4583, 2.3780, 3.0524, 3.6746], + device='cuda:2'), covar=tensor([0.0511, 0.1670, 0.0831, 0.1740, 0.0614, 0.1439, 0.1349, 0.0752], + device='cuda:2'), in_proj_covar=tensor([0.0221, 0.0193, 0.0200, 0.0211, 0.0216, 0.0188, 0.0222, 0.0217], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-15 23:43:05,030 INFO [train.py:876] (2/4) Epoch 8, batch 2300, loss[loss=0.1395, simple_loss=0.1757, pruned_loss=0.05168, over 5592.00 frames. ], tot_loss[loss=0.1391, simple_loss=0.1578, pruned_loss=0.0602, over 1089825.10 frames. ], batch size: 18, lr: 1.02e-02, grad_scale: 8.0 +2022-11-15 23:43:09,389 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=53211.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 23:43:13,261 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.2060, 4.2078, 4.4074, 4.0347, 4.2545, 3.9146, 1.6082, 4.4787], + device='cuda:2'), covar=tensor([0.0287, 0.0559, 0.0166, 0.0332, 0.0291, 0.0427, 0.3351, 0.0295], + device='cuda:2'), in_proj_covar=tensor([0.0101, 0.0083, 0.0081, 0.0074, 0.0099, 0.0084, 0.0129, 0.0104], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 23:43:17,886 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.001e+02 1.609e+02 1.989e+02 2.421e+02 4.681e+02, threshold=3.978e+02, percent-clipped=2.0 +2022-11-15 23:43:31,404 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=53241.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:43:43,978 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=53259.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 23:43:52,329 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.56 vs. limit=2.0 +2022-11-15 23:44:06,635 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.9837, 2.7160, 3.0458, 1.4030, 3.0213, 3.1106, 3.1324, 3.6078], + device='cuda:2'), covar=tensor([0.2338, 0.1733, 0.0861, 0.3522, 0.0543, 0.0906, 0.0497, 0.0630], + device='cuda:2'), in_proj_covar=tensor([0.0177, 0.0185, 0.0158, 0.0190, 0.0170, 0.0180, 0.0149, 0.0182], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-11-15 23:44:14,676 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=53302.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:44:16,029 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=53304.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:44:16,591 INFO [train.py:876] (2/4) Epoch 8, batch 2400, loss[loss=0.0943, simple_loss=0.1337, pruned_loss=0.02746, over 5549.00 frames. ], tot_loss[loss=0.1395, simple_loss=0.1584, pruned_loss=0.06024, over 1094622.67 frames. ], batch size: 13, lr: 1.02e-02, grad_scale: 8.0 +2022-11-15 23:44:24,082 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=53315.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:44:29,620 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.094e+02 1.667e+02 1.893e+02 2.315e+02 4.306e+02, threshold=3.787e+02, percent-clipped=3.0 +2022-11-15 23:44:40,165 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.96 vs. limit=2.0 +2022-11-15 23:45:00,210 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=53365.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:45:01,565 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=53367.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:45:07,910 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=53376.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:45:11,686 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.4014, 3.3387, 3.2473, 3.1198, 2.0569, 3.3523, 2.1492, 3.0059], + device='cuda:2'), covar=tensor([0.0320, 0.0127, 0.0152, 0.0281, 0.0412, 0.0140, 0.0395, 0.0108], + device='cuda:2'), in_proj_covar=tensor([0.0185, 0.0154, 0.0165, 0.0187, 0.0180, 0.0166, 0.0177, 0.0163], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-15 23:45:28,841 INFO [train.py:876] (2/4) Epoch 8, batch 2500, loss[loss=0.1686, simple_loss=0.1874, pruned_loss=0.0749, over 5267.00 frames. ], tot_loss[loss=0.1394, simple_loss=0.1585, pruned_loss=0.06017, over 1093106.73 frames. ], batch size: 79, lr: 1.02e-02, grad_scale: 8.0 +2022-11-15 23:45:31,084 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=53408.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:45:33,743 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=53412.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:45:41,344 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.144e+02 1.752e+02 2.233e+02 2.745e+02 4.955e+02, threshold=4.465e+02, percent-clipped=8.0 +2022-11-15 23:45:44,964 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=53428.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 23:46:04,891 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=53456.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:46:15,275 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.11 vs. limit=2.0 +2022-11-15 23:46:39,616 INFO [train.py:876] (2/4) Epoch 8, batch 2600, loss[loss=0.1308, simple_loss=0.158, pruned_loss=0.05184, over 5524.00 frames. ], tot_loss[loss=0.1399, simple_loss=0.1587, pruned_loss=0.06049, over 1094230.61 frames. ], batch size: 21, lr: 1.02e-02, grad_scale: 8.0 +2022-11-15 23:46:52,571 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.516e+01 1.584e+02 1.998e+02 2.446e+02 4.760e+02, threshold=3.997e+02, percent-clipped=2.0 +2022-11-15 23:47:01,730 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.69 vs. limit=2.0 +2022-11-15 23:47:15,536 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.7204, 3.9180, 3.8057, 3.6527, 3.7495, 3.7503, 1.6357, 3.9141], + device='cuda:2'), covar=tensor([0.0316, 0.0317, 0.0279, 0.0245, 0.0343, 0.0358, 0.2873, 0.0293], + device='cuda:2'), in_proj_covar=tensor([0.0101, 0.0083, 0.0081, 0.0074, 0.0099, 0.0085, 0.0127, 0.0105], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-15 23:47:20,656 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.9323, 3.3006, 2.3998, 2.9861, 2.1751, 2.4843, 1.8854, 2.7461], + device='cuda:2'), covar=tensor([0.1469, 0.0252, 0.0972, 0.0396, 0.1220, 0.0912, 0.1804, 0.0433], + device='cuda:2'), in_proj_covar=tensor([0.0171, 0.0143, 0.0167, 0.0142, 0.0177, 0.0179, 0.0177, 0.0152], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-15 23:47:39,502 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.89 vs. limit=2.0 +2022-11-15 23:47:45,386 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=53597.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:47:47,108 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.71 vs. limit=2.0 +2022-11-15 23:47:51,090 INFO [train.py:876] (2/4) Epoch 8, batch 2700, loss[loss=0.07736, simple_loss=0.1131, pruned_loss=0.02079, over 5485.00 frames. ], tot_loss[loss=0.1384, simple_loss=0.1579, pruned_loss=0.05946, over 1087377.56 frames. ], batch size: 10, lr: 1.02e-02, grad_scale: 8.0 +2022-11-15 23:48:04,171 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.093e+02 1.781e+02 2.176e+02 2.706e+02 9.486e+02, threshold=4.353e+02, percent-clipped=5.0 +2022-11-15 23:48:21,588 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.8153, 2.9136, 2.3302, 2.4497, 1.7115, 2.3577, 1.6808, 2.5571], + device='cuda:2'), covar=tensor([0.1335, 0.0266, 0.0816, 0.0492, 0.1454, 0.0792, 0.1659, 0.0357], + device='cuda:2'), in_proj_covar=tensor([0.0168, 0.0142, 0.0164, 0.0141, 0.0176, 0.0176, 0.0176, 0.0150], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-15 23:48:30,815 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=53660.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:48:38,715 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=53671.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:48:48,233 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.2733, 4.3017, 2.8781, 4.1025, 3.2982, 2.9507, 2.2719, 3.6356], + device='cuda:2'), covar=tensor([0.1646, 0.0209, 0.1049, 0.0271, 0.0734, 0.0981, 0.1953, 0.0309], + device='cuda:2'), in_proj_covar=tensor([0.0169, 0.0141, 0.0165, 0.0141, 0.0175, 0.0177, 0.0177, 0.0151], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-15 23:48:49,648 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.7110, 2.0527, 2.6120, 1.5275, 1.3943, 2.9770, 2.3895, 2.2007], + device='cuda:2'), covar=tensor([0.0787, 0.1408, 0.1083, 0.3546, 0.3569, 0.1524, 0.1180, 0.1350], + device='cuda:2'), in_proj_covar=tensor([0.0072, 0.0063, 0.0063, 0.0078, 0.0059, 0.0048, 0.0055, 0.0064], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0001, 0.0001, 0.0002, 0.0001, 0.0001, 0.0001, 0.0001], + device='cuda:2') +2022-11-15 23:49:02,626 INFO [train.py:876] (2/4) Epoch 8, batch 2800, loss[loss=0.2111, simple_loss=0.2109, pruned_loss=0.1057, over 5436.00 frames. ], tot_loss[loss=0.1405, simple_loss=0.1586, pruned_loss=0.0612, over 1081017.32 frames. ], batch size: 58, lr: 1.02e-02, grad_scale: 8.0 +2022-11-15 23:49:06,298 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.4220, 3.7373, 3.4451, 3.3743, 1.9922, 3.7094, 2.0248, 2.9657], + device='cuda:2'), covar=tensor([0.0457, 0.0190, 0.0174, 0.0274, 0.0520, 0.0138, 0.0459, 0.0186], + device='cuda:2'), in_proj_covar=tensor([0.0182, 0.0153, 0.0162, 0.0184, 0.0178, 0.0163, 0.0175, 0.0161], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-15 23:49:07,919 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=53712.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:49:15,686 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.500e+01 1.616e+02 2.009e+02 2.401e+02 5.865e+02, threshold=4.018e+02, percent-clipped=2.0 +2022-11-15 23:49:15,808 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=53723.0, num_to_drop=1, layers_to_drop={1} +2022-11-15 23:49:35,361 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.0602, 3.7452, 3.8684, 3.6871, 4.1468, 3.7224, 3.8180, 4.0995], + device='cuda:2'), covar=tensor([0.0383, 0.0348, 0.0498, 0.0337, 0.0326, 0.0435, 0.0266, 0.0325], + device='cuda:2'), in_proj_covar=tensor([0.0122, 0.0131, 0.0100, 0.0129, 0.0144, 0.0087, 0.0107, 0.0129], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-15 23:49:42,121 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=53760.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:49:42,506 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.93 vs. limit=2.0 +2022-11-15 23:50:03,497 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.0738, 2.4798, 3.0263, 3.8206, 4.0542, 3.2590, 2.6564, 4.0668], + device='cuda:2'), covar=tensor([0.0365, 0.3186, 0.2327, 0.4255, 0.0812, 0.2835, 0.2122, 0.0402], + device='cuda:2'), in_proj_covar=tensor([0.0219, 0.0203, 0.0200, 0.0323, 0.0224, 0.0214, 0.0194, 0.0218], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0005], + device='cuda:2') +2022-11-15 23:50:08,727 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.22 vs. limit=2.0 +2022-11-15 23:50:15,004 INFO [train.py:876] (2/4) Epoch 8, batch 2900, loss[loss=0.1455, simple_loss=0.1659, pruned_loss=0.06259, over 5668.00 frames. ], tot_loss[loss=0.1398, simple_loss=0.158, pruned_loss=0.06086, over 1082933.66 frames. ], batch size: 19, lr: 1.02e-02, grad_scale: 8.0 +2022-11-15 23:50:20,219 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.96 vs. limit=2.0 +2022-11-15 23:50:27,692 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.192e+01 1.630e+02 2.037e+02 2.446e+02 6.104e+02, threshold=4.074e+02, percent-clipped=4.0 +2022-11-15 23:50:39,935 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.1290, 4.5836, 4.9118, 4.6542, 5.1726, 5.1016, 4.5298, 5.0915], + device='cuda:2'), covar=tensor([0.0313, 0.0254, 0.0399, 0.0242, 0.0307, 0.0121, 0.0255, 0.0248], + device='cuda:2'), in_proj_covar=tensor([0.0124, 0.0133, 0.0101, 0.0131, 0.0146, 0.0089, 0.0109, 0.0131], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-15 23:50:44,427 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.2126, 3.2467, 2.5053, 1.7583, 3.1797, 1.2604, 3.0704, 1.7456], + device='cuda:2'), covar=tensor([0.1219, 0.0168, 0.0863, 0.1805, 0.0233, 0.2027, 0.0275, 0.1646], + device='cuda:2'), in_proj_covar=tensor([0.0127, 0.0106, 0.0113, 0.0115, 0.0102, 0.0127, 0.0098, 0.0117], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-11-15 23:50:44,668 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.97 vs. limit=2.0 +2022-11-15 23:50:49,013 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.99 vs. limit=2.0 +2022-11-15 23:51:20,659 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=53897.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:51:26,048 INFO [train.py:876] (2/4) Epoch 8, batch 3000, loss[loss=0.1254, simple_loss=0.1497, pruned_loss=0.05056, over 5586.00 frames. ], tot_loss[loss=0.1389, simple_loss=0.1571, pruned_loss=0.06031, over 1083731.22 frames. ], batch size: 23, lr: 1.02e-02, grad_scale: 8.0 +2022-11-15 23:51:26,049 INFO [train.py:899] (2/4) Computing validation loss +2022-11-15 23:51:44,992 INFO [train.py:908] (2/4) Epoch 8, validation: loss=0.1608, simple_loss=0.1816, pruned_loss=0.06996, over 1530663.00 frames. +2022-11-15 23:51:44,994 INFO [train.py:909] (2/4) Maximum memory allocated so far is 4715MB +2022-11-15 23:51:57,592 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.245e+01 1.684e+02 1.979e+02 2.404e+02 5.002e+02, threshold=3.957e+02, percent-clipped=2.0 +2022-11-15 23:52:13,791 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=53945.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:52:24,742 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=53960.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:52:32,414 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=53971.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:52:57,148 INFO [train.py:876] (2/4) Epoch 8, batch 3100, loss[loss=0.1062, simple_loss=0.1402, pruned_loss=0.03611, over 5763.00 frames. ], tot_loss[loss=0.1382, simple_loss=0.1571, pruned_loss=0.05962, over 1085068.27 frames. ], batch size: 20, lr: 1.02e-02, grad_scale: 8.0 +2022-11-15 23:52:59,294 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=54008.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:53:07,363 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=54019.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:53:09,939 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.766e+01 1.773e+02 2.219e+02 2.737e+02 4.389e+02, threshold=4.437e+02, percent-clipped=4.0 +2022-11-15 23:53:10,096 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=54023.0, num_to_drop=1, layers_to_drop={0} +2022-11-15 23:53:43,826 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=54071.0, num_to_drop=0, layers_to_drop=set() +2022-11-15 23:54:08,043 INFO [train.py:876] (2/4) Epoch 8, batch 3200, loss[loss=0.1044, simple_loss=0.1361, pruned_loss=0.03629, over 5579.00 frames. ], tot_loss[loss=0.1398, simple_loss=0.1579, pruned_loss=0.0609, over 1077490.52 frames. ], batch size: 18, lr: 1.01e-02, grad_scale: 16.0 +2022-11-15 23:54:21,049 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.083e+02 1.665e+02 2.003e+02 2.661e+02 5.081e+02, threshold=4.007e+02, percent-clipped=1.0 +2022-11-15 23:54:30,389 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.83 vs. limit=5.0 +2022-11-15 23:55:20,030 INFO [train.py:876] (2/4) Epoch 8, batch 3300, loss[loss=0.1433, simple_loss=0.1543, pruned_loss=0.06616, over 5564.00 frames. ], tot_loss[loss=0.1411, simple_loss=0.1586, pruned_loss=0.06181, over 1073533.08 frames. ], batch size: 40, lr: 1.01e-02, grad_scale: 16.0 +2022-11-15 23:55:24,576 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.3802, 4.7746, 5.1840, 4.8150, 5.3702, 5.2524, 4.5824, 5.3113], + device='cuda:2'), covar=tensor([0.0285, 0.0279, 0.0401, 0.0292, 0.0301, 0.0157, 0.0215, 0.0204], + device='cuda:2'), in_proj_covar=tensor([0.0126, 0.0134, 0.0101, 0.0134, 0.0149, 0.0089, 0.0110, 0.0133], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-15 23:55:32,992 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.066e+02 1.566e+02 1.855e+02 2.366e+02 3.545e+02, threshold=3.710e+02, percent-clipped=0.0 +2022-11-15 23:56:31,639 INFO [train.py:876] (2/4) Epoch 8, batch 3400, loss[loss=0.1535, simple_loss=0.176, pruned_loss=0.06555, over 5603.00 frames. ], tot_loss[loss=0.1408, simple_loss=0.1588, pruned_loss=0.06142, over 1080726.54 frames. ], batch size: 23, lr: 1.01e-02, grad_scale: 16.0 +2022-11-15 23:56:43,984 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.133e+02 1.624e+02 2.119e+02 2.818e+02 4.148e+02, threshold=4.237e+02, percent-clipped=5.0 +2022-11-15 23:56:55,296 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.3108, 4.8814, 4.3265, 4.9166, 4.8705, 4.1940, 4.5361, 4.2431], + device='cuda:2'), covar=tensor([0.0306, 0.0449, 0.1399, 0.0288, 0.0346, 0.0415, 0.0490, 0.0592], + device='cuda:2'), in_proj_covar=tensor([0.0123, 0.0167, 0.0261, 0.0161, 0.0208, 0.0163, 0.0175, 0.0161], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0004, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-15 23:57:17,492 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.5748, 1.7785, 2.2012, 1.5404, 1.1726, 2.2874, 2.0471, 1.7227], + device='cuda:2'), covar=tensor([0.0900, 0.1193, 0.0658, 0.2696, 0.2574, 0.0689, 0.0737, 0.1487], + device='cuda:2'), in_proj_covar=tensor([0.0075, 0.0065, 0.0064, 0.0080, 0.0061, 0.0050, 0.0057, 0.0066], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0001, 0.0001, 0.0002, 0.0001, 0.0001, 0.0001, 0.0001], + device='cuda:2') +2022-11-15 23:57:35,014 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.7499, 2.1906, 2.8825, 3.5364, 3.5929, 2.7547, 2.3515, 3.6526], + device='cuda:2'), covar=tensor([0.0557, 0.3597, 0.2204, 0.3872, 0.1283, 0.3127, 0.2356, 0.0668], + device='cuda:2'), in_proj_covar=tensor([0.0220, 0.0204, 0.0200, 0.0323, 0.0227, 0.0213, 0.0196, 0.0223], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0005], + device='cuda:2') +2022-11-15 23:57:44,042 INFO [train.py:876] (2/4) Epoch 8, batch 3500, loss[loss=0.1228, simple_loss=0.147, pruned_loss=0.0493, over 5601.00 frames. ], tot_loss[loss=0.1408, simple_loss=0.1591, pruned_loss=0.0613, over 1081377.96 frames. ], batch size: 23, lr: 1.01e-02, grad_scale: 16.0 +2022-11-15 23:57:48,965 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.9193, 1.8972, 1.8386, 1.9509, 1.8184, 1.4880, 1.7340, 2.2012], + device='cuda:2'), covar=tensor([0.1161, 0.2440, 0.2095, 0.1524, 0.1773, 0.2718, 0.1899, 0.1190], + device='cuda:2'), in_proj_covar=tensor([0.0087, 0.0089, 0.0096, 0.0082, 0.0083, 0.0085, 0.0090, 0.0065], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-15 23:57:56,208 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.047e+02 1.748e+02 2.123e+02 2.644e+02 4.958e+02, threshold=4.247e+02, percent-clipped=1.0 +2022-11-15 23:58:13,240 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.3837, 2.1461, 2.6352, 1.6092, 1.5433, 2.7665, 2.4613, 2.0181], + device='cuda:2'), covar=tensor([0.0671, 0.0905, 0.0538, 0.2375, 0.1559, 0.2961, 0.0694, 0.0959], + device='cuda:2'), in_proj_covar=tensor([0.0073, 0.0065, 0.0064, 0.0079, 0.0060, 0.0050, 0.0056, 0.0064], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0001, 0.0001, 0.0002, 0.0001, 0.0001, 0.0001, 0.0001], + device='cuda:2') +2022-11-15 23:58:26,292 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.4336, 1.1466, 1.2614, 0.9371, 1.2339, 1.5360, 0.9102, 0.9914], + device='cuda:2'), covar=tensor([0.0321, 0.0458, 0.0314, 0.0835, 0.1057, 0.0760, 0.0774, 0.0262], + device='cuda:2'), in_proj_covar=tensor([0.0011, 0.0015, 0.0011, 0.0014, 0.0012, 0.0011, 0.0015, 0.0011], + device='cuda:2'), out_proj_covar=tensor([5.4590e-05, 7.1779e-05, 5.4849e-05, 6.6011e-05, 5.9940e-05, 5.5276e-05, + 6.8877e-05, 5.5959e-05], device='cuda:2') +2022-11-15 23:58:53,939 INFO [train.py:876] (2/4) Epoch 8, batch 3600, loss[loss=0.1913, simple_loss=0.2036, pruned_loss=0.08953, over 5591.00 frames. ], tot_loss[loss=0.141, simple_loss=0.1594, pruned_loss=0.06133, over 1082814.20 frames. ], batch size: 22, lr: 1.01e-02, grad_scale: 16.0 +2022-11-15 23:59:02,993 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.60 vs. limit=2.0 +2022-11-15 23:59:03,416 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.95 vs. limit=5.0 +2022-11-15 23:59:05,648 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.193e+02 1.765e+02 2.064e+02 2.542e+02 7.404e+02, threshold=4.127e+02, percent-clipped=4.0 +2022-11-15 23:59:44,706 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.37 vs. limit=5.0 +2022-11-15 23:59:47,921 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.9373, 2.3541, 3.4283, 2.9794, 3.7935, 2.3567, 3.3555, 3.8459], + device='cuda:2'), covar=tensor([0.0860, 0.1933, 0.1026, 0.1919, 0.0559, 0.1921, 0.1537, 0.0940], + device='cuda:2'), in_proj_covar=tensor([0.0221, 0.0197, 0.0204, 0.0213, 0.0217, 0.0191, 0.0222, 0.0219], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 00:00:01,688 INFO [train.py:876] (2/4) Epoch 8, batch 3700, loss[loss=0.1347, simple_loss=0.1532, pruned_loss=0.05814, over 5640.00 frames. ], tot_loss[loss=0.1409, simple_loss=0.1594, pruned_loss=0.06122, over 1082010.97 frames. ], batch size: 38, lr: 1.01e-02, grad_scale: 16.0 +2022-11-16 00:00:14,195 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.054e+02 1.627e+02 2.007e+02 2.375e+02 5.660e+02, threshold=4.014e+02, percent-clipped=3.0 +2022-11-16 00:00:42,274 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=54664.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 00:00:55,689 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.3100, 4.7420, 5.1794, 4.8341, 5.4211, 5.2532, 4.5925, 5.3089], + device='cuda:2'), covar=tensor([0.0343, 0.0271, 0.0344, 0.0266, 0.0291, 0.0138, 0.0215, 0.0189], + device='cuda:2'), in_proj_covar=tensor([0.0125, 0.0134, 0.0101, 0.0133, 0.0147, 0.0089, 0.0110, 0.0132], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-16 00:01:09,320 INFO [train.py:876] (2/4) Epoch 8, batch 3800, loss[loss=0.1049, simple_loss=0.1337, pruned_loss=0.03798, over 5316.00 frames. ], tot_loss[loss=0.1419, simple_loss=0.1604, pruned_loss=0.06173, over 1077230.12 frames. ], batch size: 9, lr: 1.01e-02, grad_scale: 16.0 +2022-11-16 00:01:22,412 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.122e+02 1.662e+02 2.074e+02 2.682e+02 3.562e+02, threshold=4.148e+02, percent-clipped=0.0 +2022-11-16 00:01:23,826 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=54725.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 00:02:17,445 INFO [train.py:876] (2/4) Epoch 8, batch 3900, loss[loss=0.1779, simple_loss=0.1799, pruned_loss=0.08798, over 5456.00 frames. ], tot_loss[loss=0.143, simple_loss=0.1611, pruned_loss=0.06245, over 1079162.69 frames. ], batch size: 64, lr: 1.01e-02, grad_scale: 16.0 +2022-11-16 00:02:29,732 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.833e+01 1.696e+02 2.085e+02 2.412e+02 7.560e+02, threshold=4.170e+02, percent-clipped=1.0 +2022-11-16 00:02:56,185 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=54862.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:03:17,397 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.93 vs. limit=2.0 +2022-11-16 00:03:25,460 INFO [train.py:876] (2/4) Epoch 8, batch 4000, loss[loss=0.07742, simple_loss=0.103, pruned_loss=0.02592, over 5224.00 frames. ], tot_loss[loss=0.1415, simple_loss=0.1595, pruned_loss=0.0618, over 1081871.80 frames. ], batch size: 7, lr: 1.01e-02, grad_scale: 16.0 +2022-11-16 00:03:32,698 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1259, 3.7920, 2.6779, 3.6262, 2.9402, 2.6792, 2.2903, 3.2157], + device='cuda:2'), covar=tensor([0.1575, 0.0274, 0.1077, 0.0291, 0.0834, 0.1089, 0.1681, 0.0425], + device='cuda:2'), in_proj_covar=tensor([0.0169, 0.0143, 0.0165, 0.0141, 0.0175, 0.0178, 0.0172, 0.0151], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-16 00:03:37,028 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.102e+02 1.640e+02 2.022e+02 2.606e+02 3.847e+02, threshold=4.045e+02, percent-clipped=0.0 +2022-11-16 00:03:37,231 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=54923.0, num_to_drop=1, layers_to_drop={2} +2022-11-16 00:04:09,859 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.76 vs. limit=2.0 +2022-11-16 00:04:12,401 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.7211, 4.7859, 4.8132, 4.9798, 4.3764, 4.1553, 5.4310, 4.7498], + device='cuda:2'), covar=tensor([0.0395, 0.0733, 0.0290, 0.0791, 0.0463, 0.0274, 0.0513, 0.0445], + device='cuda:2'), in_proj_covar=tensor([0.0075, 0.0096, 0.0083, 0.0105, 0.0078, 0.0068, 0.0132, 0.0088], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 00:04:39,882 INFO [train.py:876] (2/4) Epoch 8, batch 4100, loss[loss=0.1147, simple_loss=0.1408, pruned_loss=0.04431, over 5776.00 frames. ], tot_loss[loss=0.1405, simple_loss=0.1582, pruned_loss=0.06141, over 1073155.88 frames. ], batch size: 20, lr: 1.01e-02, grad_scale: 16.0 +2022-11-16 00:04:49,729 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=55020.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 00:04:51,561 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.489e+01 1.632e+02 1.927e+02 2.505e+02 4.639e+02, threshold=3.854e+02, percent-clipped=4.0 +2022-11-16 00:05:05,430 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.9796, 3.5897, 2.4749, 3.3163, 2.6258, 2.4815, 2.0546, 3.0042], + device='cuda:2'), covar=tensor([0.1588, 0.0263, 0.1171, 0.0414, 0.0936, 0.1045, 0.1953, 0.0452], + device='cuda:2'), in_proj_covar=tensor([0.0168, 0.0143, 0.0166, 0.0142, 0.0175, 0.0178, 0.0175, 0.0152], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-16 00:05:19,244 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1311, 2.4289, 2.6946, 1.4144, 2.4902, 2.9008, 2.9695, 2.9788], + device='cuda:2'), covar=tensor([0.1800, 0.1565, 0.1125, 0.2750, 0.0600, 0.0684, 0.0401, 0.0880], + device='cuda:2'), in_proj_covar=tensor([0.0175, 0.0186, 0.0159, 0.0191, 0.0171, 0.0184, 0.0150, 0.0187], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-11-16 00:05:47,255 INFO [train.py:876] (2/4) Epoch 8, batch 4200, loss[loss=0.179, simple_loss=0.1778, pruned_loss=0.09014, over 5369.00 frames. ], tot_loss[loss=0.1399, simple_loss=0.1578, pruned_loss=0.061, over 1077168.46 frames. ], batch size: 70, lr: 1.01e-02, grad_scale: 16.0 +2022-11-16 00:05:59,243 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.030e+02 1.648e+02 1.989e+02 2.446e+02 4.173e+02, threshold=3.979e+02, percent-clipped=3.0 +2022-11-16 00:06:16,054 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=55148.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:06:31,988 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.24 vs. limit=2.0 +2022-11-16 00:06:54,342 INFO [train.py:876] (2/4) Epoch 8, batch 4300, loss[loss=0.1218, simple_loss=0.1493, pruned_loss=0.04711, over 5604.00 frames. ], tot_loss[loss=0.1395, simple_loss=0.1584, pruned_loss=0.06032, over 1090809.99 frames. ], batch size: 24, lr: 1.00e-02, grad_scale: 16.0 +2022-11-16 00:06:57,960 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=55209.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:06:59,243 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=55211.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:07:04,170 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=55218.0, num_to_drop=1, layers_to_drop={3} +2022-11-16 00:07:07,359 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.645e+01 1.651e+02 2.028e+02 2.598e+02 5.835e+02, threshold=4.056e+02, percent-clipped=3.0 +2022-11-16 00:07:25,006 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.5105, 3.4934, 3.4914, 3.2476, 3.6107, 3.4473, 1.3478, 3.7112], + device='cuda:2'), covar=tensor([0.0269, 0.0448, 0.0290, 0.0330, 0.0301, 0.0354, 0.2962, 0.0291], + device='cuda:2'), in_proj_covar=tensor([0.0102, 0.0084, 0.0081, 0.0075, 0.0099, 0.0086, 0.0129, 0.0105], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 00:07:40,462 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=55272.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:07:47,635 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.97 vs. limit=2.0 +2022-11-16 00:07:57,032 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.6260, 1.5296, 1.7927, 1.1244, 1.4420, 1.7227, 1.2329, 1.1065], + device='cuda:2'), covar=tensor([0.0019, 0.0044, 0.0028, 0.0043, 0.0049, 0.0030, 0.0030, 0.0038], + device='cuda:2'), in_proj_covar=tensor([0.0021, 0.0021, 0.0021, 0.0027, 0.0024, 0.0023, 0.0025, 0.0025], + device='cuda:2'), out_proj_covar=tensor([1.8738e-05, 2.0435e-05, 1.9257e-05, 2.6624e-05, 2.2196e-05, 2.2194e-05, + 2.4758e-05, 2.6250e-05], device='cuda:2') +2022-11-16 00:08:02,075 INFO [train.py:876] (2/4) Epoch 8, batch 4400, loss[loss=0.1646, simple_loss=0.1748, pruned_loss=0.07724, over 5603.00 frames. ], tot_loss[loss=0.1398, simple_loss=0.1588, pruned_loss=0.06039, over 1093069.99 frames. ], batch size: 24, lr: 1.00e-02, grad_scale: 16.0 +2022-11-16 00:08:09,019 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.60 vs. limit=2.0 +2022-11-16 00:08:12,967 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=55320.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 00:08:14,769 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.189e+02 1.701e+02 2.121e+02 2.893e+02 5.250e+02, threshold=4.241e+02, percent-clipped=3.0 +2022-11-16 00:08:28,384 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.1175, 1.2594, 1.2430, 1.0724, 0.9856, 1.6282, 1.0032, 1.5258], + device='cuda:2'), covar=tensor([0.0040, 0.0034, 0.0030, 0.0035, 0.0035, 0.0026, 0.0060, 0.0027], + device='cuda:2'), in_proj_covar=tensor([0.0045, 0.0040, 0.0043, 0.0042, 0.0041, 0.0037, 0.0041, 0.0035], + device='cuda:2'), out_proj_covar=tensor([4.0637e-05, 3.6285e-05, 3.8771e-05, 3.8019e-05, 3.6580e-05, 3.1885e-05, + 3.8874e-05, 3.1100e-05], device='cuda:2') +2022-11-16 00:08:44,937 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=55368.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 00:09:10,912 INFO [train.py:876] (2/4) Epoch 8, batch 4500, loss[loss=0.2023, simple_loss=0.1943, pruned_loss=0.1051, over 5332.00 frames. ], tot_loss[loss=0.1388, simple_loss=0.1574, pruned_loss=0.06009, over 1084958.89 frames. ], batch size: 70, lr: 1.00e-02, grad_scale: 16.0 +2022-11-16 00:09:22,569 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.579e+01 1.575e+02 1.933e+02 2.382e+02 3.910e+02, threshold=3.866e+02, percent-clipped=0.0 +2022-11-16 00:10:10,615 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.22 vs. limit=2.0 +2022-11-16 00:10:18,058 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=55504.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:10:18,621 INFO [train.py:876] (2/4) Epoch 8, batch 4600, loss[loss=0.1601, simple_loss=0.1717, pruned_loss=0.07425, over 5532.00 frames. ], tot_loss[loss=0.1396, simple_loss=0.1583, pruned_loss=0.06046, over 1087944.70 frames. ], batch size: 40, lr: 1.00e-02, grad_scale: 16.0 +2022-11-16 00:10:18,760 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.7499, 1.5667, 1.8868, 1.1159, 1.6581, 1.7792, 1.3911, 1.1074], + device='cuda:2'), covar=tensor([0.0016, 0.0037, 0.0021, 0.0050, 0.0026, 0.0035, 0.0025, 0.0043], + device='cuda:2'), in_proj_covar=tensor([0.0020, 0.0020, 0.0021, 0.0027, 0.0023, 0.0022, 0.0025, 0.0025], + device='cuda:2'), out_proj_covar=tensor([1.8493e-05, 1.9739e-05, 1.8894e-05, 2.6580e-05, 2.1840e-05, 2.1617e-05, + 2.4292e-05, 2.6049e-05], device='cuda:2') +2022-11-16 00:10:27,249 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=55518.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 00:10:30,361 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.015e+02 1.826e+02 2.052e+02 2.615e+02 3.619e+02, threshold=4.103e+02, percent-clipped=0.0 +2022-11-16 00:10:59,147 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.7755, 3.8943, 3.5843, 3.3673, 2.0646, 3.8526, 2.2346, 2.9529], + device='cuda:2'), covar=tensor([0.0373, 0.0178, 0.0193, 0.0346, 0.0500, 0.0150, 0.0451, 0.0181], + device='cuda:2'), in_proj_covar=tensor([0.0181, 0.0154, 0.0165, 0.0183, 0.0178, 0.0164, 0.0175, 0.0159], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 00:10:59,633 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=55566.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:11:00,333 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=55567.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:11:26,709 INFO [train.py:876] (2/4) Epoch 8, batch 4700, loss[loss=0.1581, simple_loss=0.1711, pruned_loss=0.07257, over 4987.00 frames. ], tot_loss[loss=0.1402, simple_loss=0.1585, pruned_loss=0.06096, over 1084398.23 frames. ], batch size: 109, lr: 1.00e-02, grad_scale: 16.0 +2022-11-16 00:11:38,377 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.046e+02 1.703e+02 2.017e+02 2.735e+02 4.468e+02, threshold=4.034e+02, percent-clipped=2.0 +2022-11-16 00:12:33,872 INFO [train.py:876] (2/4) Epoch 8, batch 4800, loss[loss=0.09835, simple_loss=0.1175, pruned_loss=0.03958, over 5535.00 frames. ], tot_loss[loss=0.1395, simple_loss=0.158, pruned_loss=0.0605, over 1085011.54 frames. ], batch size: 10, lr: 1.00e-02, grad_scale: 16.0 +2022-11-16 00:12:46,303 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.028e+02 1.529e+02 1.884e+02 2.231e+02 4.028e+02, threshold=3.767e+02, percent-clipped=0.0 +2022-11-16 00:13:02,515 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=55748.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:13:14,838 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.5952, 4.6402, 3.5147, 1.9976, 4.4407, 1.5841, 4.3391, 2.3881], + device='cuda:2'), covar=tensor([0.1554, 0.0170, 0.0537, 0.2315, 0.0187, 0.2143, 0.0175, 0.1907], + device='cuda:2'), in_proj_covar=tensor([0.0128, 0.0110, 0.0119, 0.0118, 0.0107, 0.0131, 0.0101, 0.0118], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 00:13:31,933 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.3063, 3.0101, 3.3121, 1.7003, 2.8194, 3.6216, 3.4578, 3.7846], + device='cuda:2'), covar=tensor([0.2250, 0.1566, 0.0729, 0.2962, 0.0649, 0.0533, 0.0384, 0.0629], + device='cuda:2'), in_proj_covar=tensor([0.0174, 0.0186, 0.0159, 0.0188, 0.0172, 0.0180, 0.0152, 0.0184], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-11-16 00:13:40,598 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=55804.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:13:41,119 INFO [train.py:876] (2/4) Epoch 8, batch 4900, loss[loss=0.1122, simple_loss=0.1437, pruned_loss=0.04036, over 5542.00 frames. ], tot_loss[loss=0.1385, simple_loss=0.1574, pruned_loss=0.05979, over 1082977.08 frames. ], batch size: 14, lr: 9.99e-03, grad_scale: 16.0 +2022-11-16 00:13:43,892 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=55809.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:13:51,747 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.7985, 1.1011, 1.5560, 0.9553, 1.4398, 1.8260, 1.1329, 1.2498], + device='cuda:2'), covar=tensor([0.0646, 0.0969, 0.0234, 0.1308, 0.1479, 0.0431, 0.0511, 0.0920], + device='cuda:2'), in_proj_covar=tensor([0.0011, 0.0016, 0.0011, 0.0014, 0.0013, 0.0011, 0.0015, 0.0011], + device='cuda:2'), out_proj_covar=tensor([5.7397e-05, 7.4826e-05, 5.7688e-05, 6.7724e-05, 6.2131e-05, 5.5980e-05, + 6.9644e-05, 5.8068e-05], device='cuda:2') +2022-11-16 00:13:53,467 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.007e+01 1.783e+02 2.085e+02 2.465e+02 4.573e+02, threshold=4.169e+02, percent-clipped=4.0 +2022-11-16 00:14:07,074 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.85 vs. limit=2.0 +2022-11-16 00:14:10,705 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=55848.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:14:13,176 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=55852.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:14:23,010 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=55867.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:14:37,846 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.67 vs. limit=5.0 +2022-11-16 00:14:39,770 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.22 vs. limit=2.0 +2022-11-16 00:14:49,101 INFO [train.py:876] (2/4) Epoch 8, batch 5000, loss[loss=0.1631, simple_loss=0.1805, pruned_loss=0.07283, over 5498.00 frames. ], tot_loss[loss=0.1393, simple_loss=0.1581, pruned_loss=0.06024, over 1081854.01 frames. ], batch size: 49, lr: 9.98e-03, grad_scale: 16.0 +2022-11-16 00:14:51,908 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=55909.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:14:55,679 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=55915.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:15:00,898 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.639e+01 1.479e+02 1.807e+02 2.290e+02 3.768e+02, threshold=3.615e+02, percent-clipped=0.0 +2022-11-16 00:15:12,696 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.35 vs. limit=2.0 +2022-11-16 00:15:22,454 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.60 vs. limit=5.0 +2022-11-16 00:15:48,241 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.77 vs. limit=5.0 +2022-11-16 00:15:57,404 INFO [train.py:876] (2/4) Epoch 8, batch 5100, loss[loss=0.2043, simple_loss=0.2067, pruned_loss=0.101, over 5332.00 frames. ], tot_loss[loss=0.1384, simple_loss=0.1575, pruned_loss=0.05963, over 1086751.28 frames. ], batch size: 70, lr: 9.97e-03, grad_scale: 16.0 +2022-11-16 00:16:07,152 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.0644, 1.9177, 2.2738, 3.2486, 3.1543, 2.4297, 2.0097, 3.4332], + device='cuda:2'), covar=tensor([0.0849, 0.3021, 0.2061, 0.1975, 0.1030, 0.2661, 0.2289, 0.0548], + device='cuda:2'), in_proj_covar=tensor([0.0224, 0.0209, 0.0201, 0.0322, 0.0223, 0.0216, 0.0195, 0.0224], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0005], + device='cuda:2') +2022-11-16 00:16:09,575 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.082e+02 1.582e+02 1.987e+02 2.357e+02 3.737e+02, threshold=3.975e+02, percent-clipped=1.0 +2022-11-16 00:16:18,219 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1562, 2.4270, 2.7349, 2.4552, 1.5766, 2.5549, 1.7756, 1.6892], + device='cuda:2'), covar=tensor([0.0254, 0.0118, 0.0108, 0.0170, 0.0304, 0.0125, 0.0315, 0.0179], + device='cuda:2'), in_proj_covar=tensor([0.0181, 0.0154, 0.0165, 0.0185, 0.0179, 0.0165, 0.0175, 0.0159], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 00:16:40,149 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.54 vs. limit=2.0 +2022-11-16 00:16:41,767 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.0658, 0.7259, 0.7121, 0.7340, 1.1011, 0.9207, 0.6847, 0.7348], + device='cuda:2'), covar=tensor([0.0283, 0.0327, 0.0432, 0.0632, 0.0465, 0.0324, 0.0770, 0.0310], + device='cuda:2'), in_proj_covar=tensor([0.0011, 0.0016, 0.0012, 0.0014, 0.0013, 0.0011, 0.0015, 0.0011], + device='cuda:2'), out_proj_covar=tensor([5.7311e-05, 7.5815e-05, 6.0058e-05, 6.8140e-05, 6.3369e-05, 5.6679e-05, + 7.0812e-05, 5.7971e-05], device='cuda:2') +2022-11-16 00:17:02,303 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.84 vs. limit=5.0 +2022-11-16 00:17:05,613 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=56104.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:17:06,567 INFO [train.py:876] (2/4) Epoch 8, batch 5200, loss[loss=0.1206, simple_loss=0.1531, pruned_loss=0.04401, over 5555.00 frames. ], tot_loss[loss=0.1369, simple_loss=0.1565, pruned_loss=0.05865, over 1087926.69 frames. ], batch size: 16, lr: 9.96e-03, grad_scale: 32.0 +2022-11-16 00:17:18,359 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.081e+02 1.676e+02 2.046e+02 2.590e+02 6.107e+02, threshold=4.093e+02, percent-clipped=5.0 +2022-11-16 00:17:27,642 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.4322, 4.4358, 4.3657, 4.5774, 3.9957, 3.7788, 4.9382, 4.5249], + device='cuda:2'), covar=tensor([0.0367, 0.0702, 0.0377, 0.0780, 0.0541, 0.0380, 0.0692, 0.0498], + device='cuda:2'), in_proj_covar=tensor([0.0080, 0.0103, 0.0090, 0.0112, 0.0083, 0.0073, 0.0141, 0.0096], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 00:17:32,910 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=56145.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:18:08,692 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.2563, 1.1746, 1.6966, 1.5300, 1.6371, 1.7739, 1.6411, 1.5027], + device='cuda:2'), covar=tensor([0.0012, 0.0093, 0.0054, 0.0029, 0.0064, 0.0055, 0.0027, 0.0028], + device='cuda:2'), in_proj_covar=tensor([0.0020, 0.0020, 0.0021, 0.0026, 0.0023, 0.0022, 0.0025, 0.0024], + device='cuda:2'), out_proj_covar=tensor([1.7884e-05, 1.9333e-05, 1.8953e-05, 2.5715e-05, 2.1495e-05, 2.1693e-05, + 2.3974e-05, 2.5101e-05], device='cuda:2') +2022-11-16 00:18:13,847 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=56204.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:18:14,420 INFO [train.py:876] (2/4) Epoch 8, batch 5300, loss[loss=0.08441, simple_loss=0.1093, pruned_loss=0.02979, over 4508.00 frames. ], tot_loss[loss=0.1372, simple_loss=0.1569, pruned_loss=0.05873, over 1092344.85 frames. ], batch size: 5, lr: 9.95e-03, grad_scale: 16.0 +2022-11-16 00:18:15,295 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=56206.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:18:26,631 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.2410, 3.4483, 2.6006, 1.6182, 3.1694, 1.2815, 3.2012, 1.7421], + device='cuda:2'), covar=tensor([0.1339, 0.0207, 0.0892, 0.2063, 0.0283, 0.2255, 0.0287, 0.1823], + device='cuda:2'), in_proj_covar=tensor([0.0126, 0.0107, 0.0115, 0.0117, 0.0106, 0.0128, 0.0098, 0.0116], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 00:18:27,820 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.018e+02 1.491e+02 1.984e+02 2.507e+02 5.516e+02, threshold=3.968e+02, percent-clipped=2.0 +2022-11-16 00:18:29,355 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.8699, 1.3404, 1.8211, 0.9573, 1.7478, 0.9952, 1.2336, 1.1727], + device='cuda:2'), covar=tensor([0.1289, 0.0942, 0.0553, 0.1353, 0.1686, 0.1030, 0.0908, 0.1482], + device='cuda:2'), in_proj_covar=tensor([0.0011, 0.0016, 0.0012, 0.0014, 0.0013, 0.0010, 0.0015, 0.0011], + device='cuda:2'), out_proj_covar=tensor([5.6767e-05, 7.4330e-05, 5.9169e-05, 6.6701e-05, 6.2178e-05, 5.5495e-05, + 6.9589e-05, 5.6907e-05], device='cuda:2') +2022-11-16 00:19:22,484 INFO [train.py:876] (2/4) Epoch 8, batch 5400, loss[loss=0.09593, simple_loss=0.1317, pruned_loss=0.03007, over 5545.00 frames. ], tot_loss[loss=0.1363, simple_loss=0.1558, pruned_loss=0.05835, over 1090639.78 frames. ], batch size: 25, lr: 9.94e-03, grad_scale: 16.0 +2022-11-16 00:19:28,791 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.6338, 2.3072, 2.6518, 1.4655, 1.2111, 3.1316, 2.1799, 2.1622], + device='cuda:2'), covar=tensor([0.0588, 0.0828, 0.0443, 0.2875, 0.4067, 0.0486, 0.1886, 0.0938], + device='cuda:2'), in_proj_covar=tensor([0.0076, 0.0067, 0.0065, 0.0082, 0.0061, 0.0051, 0.0059, 0.0067], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0001, 0.0002, 0.0002, 0.0001, 0.0001, 0.0001, 0.0002], + device='cuda:2') +2022-11-16 00:19:34,817 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.85 vs. limit=5.0 +2022-11-16 00:19:35,868 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.080e+02 1.666e+02 2.112e+02 2.749e+02 4.650e+02, threshold=4.223e+02, percent-clipped=6.0 +2022-11-16 00:20:29,853 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=56404.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:20:30,398 INFO [train.py:876] (2/4) Epoch 8, batch 5500, loss[loss=0.1138, simple_loss=0.1418, pruned_loss=0.04292, over 5530.00 frames. ], tot_loss[loss=0.1359, simple_loss=0.1556, pruned_loss=0.05812, over 1094845.02 frames. ], batch size: 13, lr: 9.94e-03, grad_scale: 16.0 +2022-11-16 00:20:42,594 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.180e+02 1.705e+02 2.205e+02 2.519e+02 5.507e+02, threshold=4.409e+02, percent-clipped=4.0 +2022-11-16 00:20:49,563 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.4795, 4.4189, 2.9730, 4.3510, 3.4637, 2.7273, 2.4000, 3.6887], + device='cuda:2'), covar=tensor([0.1696, 0.0309, 0.1242, 0.0300, 0.0727, 0.1181, 0.2098, 0.0337], + device='cuda:2'), in_proj_covar=tensor([0.0163, 0.0135, 0.0160, 0.0137, 0.0168, 0.0169, 0.0171, 0.0146], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-16 00:21:02,112 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=56452.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:21:15,777 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.5060, 4.0247, 4.2650, 4.0584, 4.6120, 4.4242, 4.2028, 4.6116], + device='cuda:2'), covar=tensor([0.0445, 0.0386, 0.0502, 0.0341, 0.0363, 0.0206, 0.0293, 0.0284], + device='cuda:2'), in_proj_covar=tensor([0.0127, 0.0133, 0.0102, 0.0132, 0.0149, 0.0089, 0.0113, 0.0134], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-16 00:21:35,575 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=56501.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:21:37,723 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=56504.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:21:38,228 INFO [train.py:876] (2/4) Epoch 8, batch 5600, loss[loss=0.2005, simple_loss=0.1968, pruned_loss=0.1021, over 5368.00 frames. ], tot_loss[loss=0.1375, simple_loss=0.1569, pruned_loss=0.05907, over 1096284.84 frames. ], batch size: 70, lr: 9.93e-03, grad_scale: 16.0 +2022-11-16 00:21:50,136 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=56523.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 00:21:50,546 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.094e+02 1.658e+02 2.017e+02 2.598e+02 4.706e+02, threshold=4.034e+02, percent-clipped=2.0 +2022-11-16 00:22:03,477 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.3296, 0.8950, 1.1397, 0.8515, 1.5005, 1.3153, 0.9205, 1.2071], + device='cuda:2'), covar=tensor([0.0546, 0.0662, 0.0798, 0.1073, 0.0931, 0.0386, 0.0597, 0.0732], + device='cuda:2'), in_proj_covar=tensor([0.0011, 0.0016, 0.0012, 0.0014, 0.0013, 0.0011, 0.0015, 0.0011], + device='cuda:2'), out_proj_covar=tensor([5.7274e-05, 7.5765e-05, 6.0273e-05, 6.8111e-05, 6.2412e-05, 5.6466e-05, + 7.0191e-05, 5.6910e-05], device='cuda:2') +2022-11-16 00:22:08,266 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.15 vs. limit=5.0 +2022-11-16 00:22:09,939 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=56552.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:22:31,089 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=56584.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 00:22:38,074 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.3672, 1.4269, 1.4872, 1.0287, 1.2567, 1.1657, 1.3385, 1.5033], + device='cuda:2'), covar=tensor([0.0046, 0.0059, 0.0043, 0.0043, 0.0039, 0.0030, 0.0035, 0.0069], + device='cuda:2'), in_proj_covar=tensor([0.0046, 0.0041, 0.0044, 0.0043, 0.0042, 0.0038, 0.0043, 0.0036], + device='cuda:2'), out_proj_covar=tensor([4.1414e-05, 3.6792e-05, 3.9562e-05, 3.8526e-05, 3.6911e-05, 3.2983e-05, + 4.0294e-05, 3.1893e-05], device='cuda:2') +2022-11-16 00:22:46,106 INFO [train.py:876] (2/4) Epoch 8, batch 5700, loss[loss=0.0957, simple_loss=0.1308, pruned_loss=0.0303, over 5551.00 frames. ], tot_loss[loss=0.1391, simple_loss=0.158, pruned_loss=0.06011, over 1085932.30 frames. ], batch size: 13, lr: 9.92e-03, grad_scale: 16.0 +2022-11-16 00:22:58,516 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.019e+02 1.608e+02 1.896e+02 2.176e+02 4.174e+02, threshold=3.791e+02, percent-clipped=1.0 +2022-11-16 00:23:43,234 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.31 vs. limit=2.0 +2022-11-16 00:23:52,801 INFO [train.py:876] (2/4) Epoch 8, batch 5800, loss[loss=0.07836, simple_loss=0.1131, pruned_loss=0.02184, over 5417.00 frames. ], tot_loss[loss=0.1392, simple_loss=0.1582, pruned_loss=0.06014, over 1085972.66 frames. ], batch size: 11, lr: 9.91e-03, grad_scale: 16.0 +2022-11-16 00:24:05,869 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.015e+02 1.678e+02 1.927e+02 2.406e+02 3.937e+02, threshold=3.853e+02, percent-clipped=1.0 +2022-11-16 00:24:21,623 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.1919, 2.1884, 2.4365, 3.3377, 3.2414, 2.4770, 2.1752, 3.4747], + device='cuda:2'), covar=tensor([0.0764, 0.2480, 0.1919, 0.1968, 0.1062, 0.2484, 0.1897, 0.0555], + device='cuda:2'), in_proj_covar=tensor([0.0223, 0.0204, 0.0197, 0.0319, 0.0219, 0.0210, 0.0193, 0.0223], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0005], + device='cuda:2') +2022-11-16 00:24:40,856 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.75 vs. limit=2.0 +2022-11-16 00:24:43,910 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.8285, 0.9932, 1.1478, 0.8652, 0.6607, 1.0434, 0.8857, 0.8049], + device='cuda:2'), covar=tensor([0.0014, 0.0011, 0.0010, 0.0023, 0.0016, 0.0018, 0.0018, 0.0027], + device='cuda:2'), in_proj_covar=tensor([0.0020, 0.0020, 0.0021, 0.0027, 0.0023, 0.0022, 0.0025, 0.0025], + device='cuda:2'), out_proj_covar=tensor([1.7990e-05, 1.9603e-05, 1.9470e-05, 2.6383e-05, 2.2032e-05, 2.1601e-05, + 2.4047e-05, 2.5441e-05], device='cuda:2') +2022-11-16 00:24:52,818 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.19 vs. limit=2.0 +2022-11-16 00:24:57,435 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=56801.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:24:58,795 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=56803.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:25:00,325 INFO [train.py:876] (2/4) Epoch 8, batch 5900, loss[loss=0.121, simple_loss=0.156, pruned_loss=0.04299, over 5560.00 frames. ], tot_loss[loss=0.1398, simple_loss=0.1578, pruned_loss=0.06085, over 1079332.46 frames. ], batch size: 25, lr: 9.90e-03, grad_scale: 16.0 +2022-11-16 00:25:13,654 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.513e+01 1.636e+02 2.086e+02 2.680e+02 4.178e+02, threshold=4.173e+02, percent-clipped=3.0 +2022-11-16 00:25:18,490 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.7860, 4.2858, 3.8401, 3.7038, 2.1829, 4.0685, 2.2619, 3.2949], + device='cuda:2'), covar=tensor([0.0467, 0.0104, 0.0212, 0.0308, 0.0565, 0.0144, 0.0512, 0.0176], + device='cuda:2'), in_proj_covar=tensor([0.0181, 0.0154, 0.0165, 0.0184, 0.0179, 0.0163, 0.0175, 0.0161], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 00:25:29,996 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=56849.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:25:40,364 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=56864.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:25:50,690 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=56879.0, num_to_drop=1, layers_to_drop={3} +2022-11-16 00:25:54,758 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.2961, 4.1787, 4.2119, 3.9473, 2.6001, 4.5278, 2.4570, 3.9495], + device='cuda:2'), covar=tensor([0.0326, 0.0371, 0.0157, 0.0339, 0.0533, 0.0109, 0.0555, 0.0110], + device='cuda:2'), in_proj_covar=tensor([0.0181, 0.0153, 0.0165, 0.0184, 0.0178, 0.0162, 0.0175, 0.0160], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 00:26:07,761 INFO [train.py:876] (2/4) Epoch 8, batch 6000, loss[loss=0.1164, simple_loss=0.1549, pruned_loss=0.03894, over 5489.00 frames. ], tot_loss[loss=0.1389, simple_loss=0.1574, pruned_loss=0.06021, over 1082754.98 frames. ], batch size: 13, lr: 9.89e-03, grad_scale: 16.0 +2022-11-16 00:26:07,761 INFO [train.py:899] (2/4) Computing validation loss +2022-11-16 00:26:17,089 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.7077, 3.9562, 3.7094, 3.4347, 1.9172, 3.8248, 2.0902, 3.2497], + device='cuda:2'), covar=tensor([0.0539, 0.0155, 0.0184, 0.0414, 0.0757, 0.0168, 0.0659, 0.0178], + device='cuda:2'), in_proj_covar=tensor([0.0181, 0.0153, 0.0165, 0.0185, 0.0179, 0.0162, 0.0176, 0.0160], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 00:26:25,620 INFO [train.py:908] (2/4) Epoch 8, validation: loss=0.1622, simple_loss=0.1823, pruned_loss=0.07105, over 1530663.00 frames. +2022-11-16 00:26:25,620 INFO [train.py:909] (2/4) Maximum memory allocated so far is 4715MB +2022-11-16 00:26:27,747 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=56908.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:26:38,698 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.650e+01 1.622e+02 2.023e+02 2.494e+02 5.348e+02, threshold=4.047e+02, percent-clipped=2.0 +2022-11-16 00:26:51,863 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.0221, 1.1497, 1.0745, 0.7617, 1.3037, 1.2315, 0.5769, 1.3238], + device='cuda:2'), covar=tensor([0.0050, 0.0039, 0.0043, 0.0049, 0.0035, 0.0031, 0.0072, 0.0047], + device='cuda:2'), in_proj_covar=tensor([0.0047, 0.0041, 0.0045, 0.0043, 0.0042, 0.0038, 0.0044, 0.0037], + device='cuda:2'), out_proj_covar=tensor([4.2227e-05, 3.7212e-05, 4.0216e-05, 3.9067e-05, 3.7125e-05, 3.2970e-05, + 4.0886e-05, 3.2634e-05], device='cuda:2') +2022-11-16 00:27:09,216 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=56969.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:27:25,011 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=56993.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:27:26,253 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=56995.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:27:32,829 INFO [train.py:876] (2/4) Epoch 8, batch 6100, loss[loss=0.1872, simple_loss=0.188, pruned_loss=0.09326, over 5562.00 frames. ], tot_loss[loss=0.1389, simple_loss=0.1576, pruned_loss=0.06012, over 1085692.65 frames. ], batch size: 46, lr: 9.88e-03, grad_scale: 16.0 +2022-11-16 00:27:45,774 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.054e+01 1.623e+02 1.859e+02 2.466e+02 4.899e+02, threshold=3.719e+02, percent-clipped=4.0 +2022-11-16 00:28:07,739 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=57054.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:28:09,125 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=57056.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:28:18,060 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.72 vs. limit=5.0 +2022-11-16 00:28:39,157 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.8930, 5.3534, 5.7984, 5.4277, 5.9944, 5.9039, 5.0799, 5.9483], + device='cuda:2'), covar=tensor([0.0382, 0.0273, 0.0345, 0.0263, 0.0319, 0.0118, 0.0179, 0.0242], + device='cuda:2'), in_proj_covar=tensor([0.0128, 0.0135, 0.0105, 0.0133, 0.0152, 0.0088, 0.0114, 0.0138], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-16 00:28:43,877 INFO [train.py:876] (2/4) Epoch 8, batch 6200, loss[loss=0.1824, simple_loss=0.1865, pruned_loss=0.0891, over 5480.00 frames. ], tot_loss[loss=0.137, simple_loss=0.1566, pruned_loss=0.05868, over 1094650.65 frames. ], batch size: 58, lr: 9.88e-03, grad_scale: 16.0 +2022-11-16 00:28:54,874 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1557, 3.3299, 2.5357, 1.7463, 3.0685, 1.1681, 3.1566, 1.5531], + device='cuda:2'), covar=tensor([0.1379, 0.0242, 0.0997, 0.1891, 0.0310, 0.2344, 0.0267, 0.1849], + device='cuda:2'), in_proj_covar=tensor([0.0124, 0.0107, 0.0115, 0.0115, 0.0106, 0.0128, 0.0099, 0.0117], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 00:28:56,826 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.079e+02 1.699e+02 2.012e+02 2.315e+02 3.529e+02, threshold=4.025e+02, percent-clipped=0.0 +2022-11-16 00:29:20,072 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.6586, 4.1070, 3.7334, 3.5122, 2.1597, 4.0695, 2.3389, 3.2811], + device='cuda:2'), covar=tensor([0.0426, 0.0185, 0.0192, 0.0343, 0.0585, 0.0114, 0.0444, 0.0116], + device='cuda:2'), in_proj_covar=tensor([0.0180, 0.0154, 0.0165, 0.0185, 0.0178, 0.0162, 0.0175, 0.0160], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 00:29:22,009 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=57159.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:29:28,804 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.3676, 0.9272, 1.0713, 0.8142, 1.0491, 1.2612, 0.6711, 0.9378], + device='cuda:2'), covar=tensor([0.0608, 0.0923, 0.0847, 0.1289, 0.0692, 0.0692, 0.1192, 0.0595], + device='cuda:2'), in_proj_covar=tensor([0.0012, 0.0017, 0.0013, 0.0015, 0.0013, 0.0011, 0.0016, 0.0012], + device='cuda:2'), out_proj_covar=tensor([6.0587e-05, 8.0676e-05, 6.3437e-05, 7.3271e-05, 6.5870e-05, 5.9823e-05, + 7.4554e-05, 6.0631e-05], device='cuda:2') +2022-11-16 00:29:35,670 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=57179.0, num_to_drop=1, layers_to_drop={2} +2022-11-16 00:29:40,822 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=57186.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:29:49,418 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=57197.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:29:55,086 INFO [train.py:876] (2/4) Epoch 8, batch 6300, loss[loss=0.1364, simple_loss=0.1491, pruned_loss=0.06183, over 5497.00 frames. ], tot_loss[loss=0.1356, simple_loss=0.1554, pruned_loss=0.05789, over 1087487.44 frames. ], batch size: 49, lr: 9.87e-03, grad_scale: 16.0 +2022-11-16 00:30:03,873 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.72 vs. limit=5.0 +2022-11-16 00:30:05,669 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.6560, 3.7332, 3.6124, 3.4352, 2.1914, 3.6883, 2.1636, 3.1314], + device='cuda:2'), covar=tensor([0.0349, 0.0166, 0.0178, 0.0318, 0.0437, 0.0132, 0.0469, 0.0201], + device='cuda:2'), in_proj_covar=tensor([0.0180, 0.0154, 0.0166, 0.0185, 0.0179, 0.0163, 0.0175, 0.0161], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 00:30:08,132 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.308e+02 1.749e+02 2.073e+02 2.619e+02 6.715e+02, threshold=4.147e+02, percent-clipped=6.0 +2022-11-16 00:30:10,295 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=57227.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 00:30:23,463 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=57244.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:30:25,503 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=57247.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:30:33,118 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=57258.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 00:30:37,154 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=57264.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:30:44,151 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.5745, 1.8000, 2.0907, 1.4690, 1.0621, 2.6583, 2.3209, 1.7828], + device='cuda:2'), covar=tensor([0.1128, 0.1069, 0.0924, 0.2354, 0.3658, 0.0670, 0.0933, 0.1347], + device='cuda:2'), in_proj_covar=tensor([0.0074, 0.0065, 0.0065, 0.0080, 0.0059, 0.0051, 0.0056, 0.0065], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0001, 0.0002, 0.0002, 0.0001, 0.0001, 0.0001, 0.0001], + device='cuda:2') +2022-11-16 00:30:52,463 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.30 vs. limit=2.0 +2022-11-16 00:31:03,819 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.1828, 4.6022, 4.1581, 4.5399, 4.6591, 3.9147, 4.1552, 3.9985], + device='cuda:2'), covar=tensor([0.0325, 0.0450, 0.1428, 0.0383, 0.0409, 0.0429, 0.0569, 0.0767], + device='cuda:2'), in_proj_covar=tensor([0.0125, 0.0164, 0.0262, 0.0157, 0.0206, 0.0162, 0.0172, 0.0160], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0004, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-16 00:31:06,481 INFO [train.py:876] (2/4) Epoch 8, batch 6400, loss[loss=0.07601, simple_loss=0.1162, pruned_loss=0.0179, over 5050.00 frames. ], tot_loss[loss=0.1332, simple_loss=0.1537, pruned_loss=0.0564, over 1084100.33 frames. ], batch size: 7, lr: 9.86e-03, grad_scale: 16.0 +2022-11-16 00:31:06,658 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=57305.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:31:19,419 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.947e+01 1.601e+02 2.011e+02 2.621e+02 5.168e+02, threshold=4.022e+02, percent-clipped=2.0 +2022-11-16 00:31:36,919 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=57349.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:31:38,741 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=57351.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:32:03,058 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.8577, 3.7759, 3.6944, 3.5272, 3.7519, 3.5561, 1.3489, 3.8539], + device='cuda:2'), covar=tensor([0.0252, 0.0333, 0.0375, 0.0369, 0.0343, 0.0407, 0.3396, 0.0359], + device='cuda:2'), in_proj_covar=tensor([0.0101, 0.0084, 0.0083, 0.0073, 0.0099, 0.0085, 0.0128, 0.0105], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 00:32:09,897 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.8899, 1.1399, 1.1151, 1.1332, 1.5193, 1.1924, 1.0743, 1.0415], + device='cuda:2'), covar=tensor([0.0328, 0.0859, 0.2253, 0.0949, 0.1971, 0.1618, 0.1384, 0.1147], + device='cuda:2'), in_proj_covar=tensor([0.0012, 0.0017, 0.0012, 0.0015, 0.0013, 0.0011, 0.0016, 0.0012], + device='cuda:2'), out_proj_covar=tensor([6.0420e-05, 8.0488e-05, 6.2531e-05, 7.3726e-05, 6.6426e-05, 5.9760e-05, + 7.4872e-05, 6.0688e-05], device='cuda:2') +2022-11-16 00:32:11,882 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=57398.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:32:17,278 INFO [train.py:876] (2/4) Epoch 8, batch 6500, loss[loss=0.1562, simple_loss=0.1614, pruned_loss=0.07547, over 5556.00 frames. ], tot_loss[loss=0.134, simple_loss=0.1545, pruned_loss=0.05681, over 1083069.67 frames. ], batch size: 46, lr: 9.85e-03, grad_scale: 16.0 +2022-11-16 00:32:30,735 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.052e+02 1.615e+02 1.950e+02 2.299e+02 5.072e+02, threshold=3.899e+02, percent-clipped=1.0 +2022-11-16 00:32:47,611 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.5393, 1.9417, 1.6054, 1.2265, 1.4162, 2.0932, 1.8436, 2.1146], + device='cuda:2'), covar=tensor([0.1830, 0.1392, 0.1720, 0.2682, 0.1306, 0.0878, 0.0808, 0.1278], + device='cuda:2'), in_proj_covar=tensor([0.0177, 0.0186, 0.0160, 0.0191, 0.0174, 0.0184, 0.0159, 0.0189], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-11-16 00:32:55,154 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=57459.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:32:55,209 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=57459.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:32:57,561 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=57462.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:33:20,310 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=57495.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:33:26,885 INFO [train.py:876] (2/4) Epoch 8, batch 6600, loss[loss=0.1331, simple_loss=0.1622, pruned_loss=0.05201, over 5646.00 frames. ], tot_loss[loss=0.1353, simple_loss=0.1555, pruned_loss=0.05756, over 1088798.11 frames. ], batch size: 32, lr: 9.84e-03, grad_scale: 16.0 +2022-11-16 00:33:28,257 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=57507.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:33:39,766 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=57523.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:33:40,181 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.133e+02 1.548e+02 1.936e+02 2.529e+02 5.371e+02, threshold=3.872e+02, percent-clipped=1.0 +2022-11-16 00:33:46,982 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.8936, 2.3776, 3.5642, 2.9599, 3.7603, 2.3204, 3.2939, 3.8999], + device='cuda:2'), covar=tensor([0.0498, 0.1749, 0.0839, 0.1814, 0.0637, 0.1806, 0.1318, 0.0844], + device='cuda:2'), in_proj_covar=tensor([0.0223, 0.0191, 0.0202, 0.0207, 0.0218, 0.0190, 0.0218, 0.0218], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 00:33:52,094 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=57542.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:33:59,273 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=57553.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 00:34:01,300 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=57556.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:34:05,842 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.8836, 4.0328, 3.8469, 3.3895, 2.3214, 4.1325, 2.3584, 3.3005], + device='cuda:2'), covar=tensor([0.0350, 0.0125, 0.0201, 0.0485, 0.0482, 0.0124, 0.0458, 0.0157], + device='cuda:2'), in_proj_covar=tensor([0.0183, 0.0157, 0.0169, 0.0188, 0.0181, 0.0166, 0.0178, 0.0163], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 00:34:06,407 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=57564.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:34:30,739 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=57600.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:34:33,919 INFO [train.py:876] (2/4) Epoch 8, batch 6700, loss[loss=0.1112, simple_loss=0.1439, pruned_loss=0.03931, over 5715.00 frames. ], tot_loss[loss=0.134, simple_loss=0.1544, pruned_loss=0.0568, over 1088453.08 frames. ], batch size: 28, lr: 9.83e-03, grad_scale: 16.0 +2022-11-16 00:34:38,597 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=57612.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:34:39,400 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.2552, 2.7542, 2.8875, 2.6830, 1.6848, 2.8127, 1.9098, 2.3331], + device='cuda:2'), covar=tensor([0.0277, 0.0120, 0.0126, 0.0190, 0.0358, 0.0148, 0.0324, 0.0149], + device='cuda:2'), in_proj_covar=tensor([0.0179, 0.0152, 0.0166, 0.0183, 0.0177, 0.0162, 0.0174, 0.0160], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 00:34:46,348 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.030e+02 1.668e+02 2.078e+02 2.572e+02 7.235e+02, threshold=4.155e+02, percent-clipped=5.0 +2022-11-16 00:34:51,714 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.3462, 4.2374, 3.0621, 4.0914, 3.2641, 3.0353, 2.2763, 3.4947], + device='cuda:2'), covar=tensor([0.1730, 0.0255, 0.0945, 0.0293, 0.0696, 0.0945, 0.2116, 0.0338], + device='cuda:2'), in_proj_covar=tensor([0.0166, 0.0139, 0.0165, 0.0142, 0.0177, 0.0177, 0.0174, 0.0152], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-16 00:35:04,284 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=57649.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:35:05,564 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=57651.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:35:08,678 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.7424, 2.5506, 2.5665, 2.7192, 2.6465, 2.5129, 3.0038, 2.7502], + device='cuda:2'), covar=tensor([0.0585, 0.1167, 0.0895, 0.1308, 0.0798, 0.0530, 0.1088, 0.0922], + device='cuda:2'), in_proj_covar=tensor([0.0079, 0.0101, 0.0089, 0.0110, 0.0082, 0.0073, 0.0138, 0.0093], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 00:35:36,534 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=57697.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:35:37,907 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=57699.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:35:41,754 INFO [train.py:876] (2/4) Epoch 8, batch 6800, loss[loss=0.1173, simple_loss=0.1458, pruned_loss=0.04442, over 5524.00 frames. ], tot_loss[loss=0.135, simple_loss=0.1551, pruned_loss=0.05751, over 1084106.96 frames. ], batch size: 13, lr: 9.82e-03, grad_scale: 16.0 +2022-11-16 00:35:43,217 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.0566, 2.9319, 2.4530, 3.0023, 2.3506, 3.1234, 3.2363, 3.5384], + device='cuda:2'), covar=tensor([0.0935, 0.1629, 0.3034, 0.1725, 0.1633, 0.0795, 0.1211, 0.1310], + device='cuda:2'), in_proj_covar=tensor([0.0089, 0.0089, 0.0097, 0.0085, 0.0081, 0.0086, 0.0090, 0.0065], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 00:35:53,964 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.090e+02 1.669e+02 2.102e+02 2.619e+02 4.252e+02, threshold=4.204e+02, percent-clipped=2.0 +2022-11-16 00:36:00,324 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=57733.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:36:14,310 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.73 vs. limit=2.0 +2022-11-16 00:36:14,618 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=57754.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:36:41,883 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=57794.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:36:45,260 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.5897, 3.6165, 3.5153, 3.4122, 1.9516, 3.5646, 2.2100, 2.9989], + device='cuda:2'), covar=tensor([0.0349, 0.0158, 0.0149, 0.0258, 0.0465, 0.0146, 0.0426, 0.0156], + device='cuda:2'), in_proj_covar=tensor([0.0180, 0.0152, 0.0167, 0.0185, 0.0178, 0.0163, 0.0175, 0.0160], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 00:36:49,278 INFO [train.py:876] (2/4) Epoch 8, batch 6900, loss[loss=0.1821, simple_loss=0.1894, pruned_loss=0.0874, over 5481.00 frames. ], tot_loss[loss=0.1378, simple_loss=0.1574, pruned_loss=0.05912, over 1078461.34 frames. ], batch size: 64, lr: 9.82e-03, grad_scale: 16.0 +2022-11-16 00:36:57,824 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=57818.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:37:01,645 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.095e+02 1.691e+02 2.166e+02 2.756e+02 5.042e+02, threshold=4.332e+02, percent-clipped=3.0 +2022-11-16 00:37:13,697 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.9726, 5.4408, 3.3995, 5.0356, 4.1674, 3.8956, 3.3568, 4.9443], + device='cuda:2'), covar=tensor([0.1495, 0.0319, 0.0964, 0.0239, 0.0443, 0.0757, 0.1723, 0.0178], + device='cuda:2'), in_proj_covar=tensor([0.0166, 0.0139, 0.0165, 0.0142, 0.0176, 0.0177, 0.0174, 0.0152], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-16 00:37:13,702 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=57842.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:37:20,215 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=57851.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:37:21,618 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=57853.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 00:37:45,648 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=57890.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:37:53,400 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=57900.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:37:54,292 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=57901.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:37:56,852 INFO [train.py:876] (2/4) Epoch 8, batch 7000, loss[loss=0.174, simple_loss=0.1906, pruned_loss=0.07871, over 5670.00 frames. ], tot_loss[loss=0.1366, simple_loss=0.1566, pruned_loss=0.05834, over 1081637.46 frames. ], batch size: 38, lr: 9.81e-03, grad_scale: 16.0 +2022-11-16 00:38:09,168 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.116e+02 1.725e+02 2.099e+02 2.587e+02 4.633e+02, threshold=4.198e+02, percent-clipped=1.0 +2022-11-16 00:38:13,881 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.7386, 5.0856, 5.5792, 5.0401, 5.7928, 5.6073, 4.8667, 5.7266], + device='cuda:2'), covar=tensor([0.0267, 0.0260, 0.0354, 0.0269, 0.0267, 0.0136, 0.0224, 0.0206], + device='cuda:2'), in_proj_covar=tensor([0.0126, 0.0135, 0.0103, 0.0133, 0.0152, 0.0089, 0.0114, 0.0136], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-16 00:38:25,214 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=57948.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:38:31,893 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.5546, 2.2019, 3.2314, 2.8745, 3.1440, 2.2004, 2.9930, 3.5163], + device='cuda:2'), covar=tensor([0.0589, 0.1637, 0.0757, 0.1327, 0.0669, 0.1585, 0.1054, 0.0725], + device='cuda:2'), in_proj_covar=tensor([0.0227, 0.0196, 0.0211, 0.0214, 0.0227, 0.0196, 0.0223, 0.0224], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 00:38:51,188 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.78 vs. limit=2.0 +2022-11-16 00:39:03,880 INFO [train.py:876] (2/4) Epoch 8, batch 7100, loss[loss=0.1256, simple_loss=0.1552, pruned_loss=0.04807, over 5739.00 frames. ], tot_loss[loss=0.1389, simple_loss=0.1583, pruned_loss=0.05974, over 1085314.58 frames. ], batch size: 20, lr: 9.80e-03, grad_scale: 16.0 +2022-11-16 00:39:06,522 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.1736, 4.9989, 5.1034, 5.0422, 4.4625, 4.1855, 5.7159, 4.7692], + device='cuda:2'), covar=tensor([0.0347, 0.0904, 0.0383, 0.1761, 0.0471, 0.0449, 0.0618, 0.0677], + device='cuda:2'), in_proj_covar=tensor([0.0077, 0.0098, 0.0087, 0.0109, 0.0081, 0.0073, 0.0135, 0.0093], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 00:39:16,448 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.9377, 3.1926, 2.4256, 2.8998, 2.2146, 2.3899, 1.8646, 2.6010], + device='cuda:2'), covar=tensor([0.1622, 0.0304, 0.1084, 0.0552, 0.1554, 0.1205, 0.2005, 0.0632], + device='cuda:2'), in_proj_covar=tensor([0.0164, 0.0138, 0.0162, 0.0142, 0.0174, 0.0175, 0.0171, 0.0150], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-16 00:39:16,951 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.016e+02 1.751e+02 2.181e+02 2.576e+02 4.312e+02, threshold=4.361e+02, percent-clipped=1.0 +2022-11-16 00:39:36,869 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1771, 3.9607, 2.9768, 3.7473, 3.0056, 2.6776, 2.2168, 3.2559], + device='cuda:2'), covar=tensor([0.1735, 0.0252, 0.0981, 0.0363, 0.0931, 0.1260, 0.2069, 0.0478], + device='cuda:2'), in_proj_covar=tensor([0.0166, 0.0139, 0.0164, 0.0142, 0.0176, 0.0178, 0.0173, 0.0152], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-16 00:39:36,875 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=58054.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:40:00,895 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=58089.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:40:09,399 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=58102.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:40:11,337 INFO [train.py:876] (2/4) Epoch 8, batch 7200, loss[loss=0.1141, simple_loss=0.1432, pruned_loss=0.04255, over 5566.00 frames. ], tot_loss[loss=0.1369, simple_loss=0.157, pruned_loss=0.05839, over 1085435.46 frames. ], batch size: 14, lr: 9.79e-03, grad_scale: 16.0 +2022-11-16 00:40:20,265 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=58118.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:40:24,026 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.930e+01 1.590e+02 1.880e+02 2.458e+02 4.390e+02, threshold=3.761e+02, percent-clipped=1.0 +2022-11-16 00:40:36,098 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=58141.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:40:38,010 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=58144.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:40:42,425 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=58151.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:40:51,880 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=58166.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:41:42,125 INFO [train.py:876] (2/4) Epoch 9, batch 0, loss[loss=0.1661, simple_loss=0.1934, pruned_loss=0.06942, over 5621.00 frames. ], tot_loss[loss=0.1661, simple_loss=0.1934, pruned_loss=0.06942, over 5621.00 frames. ], batch size: 23, lr: 9.26e-03, grad_scale: 16.0 +2022-11-16 00:41:42,125 INFO [train.py:899] (2/4) Computing validation loss +2022-11-16 00:41:50,316 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.8244, 3.9756, 3.3895, 3.9677, 3.8849, 3.3832, 3.7369, 3.6391], + device='cuda:2'), covar=tensor([0.0206, 0.0428, 0.1183, 0.0346, 0.0570, 0.0399, 0.0455, 0.0371], + device='cuda:2'), in_proj_covar=tensor([0.0124, 0.0165, 0.0260, 0.0159, 0.0207, 0.0164, 0.0174, 0.0162], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0004, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-16 00:41:58,752 INFO [train.py:908] (2/4) Epoch 9, validation: loss=0.1631, simple_loss=0.1836, pruned_loss=0.0713, over 1530663.00 frames. +2022-11-16 00:41:58,753 INFO [train.py:909] (2/4) Maximum memory allocated so far is 4715MB +2022-11-16 00:42:07,408 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.4103, 1.4061, 1.3332, 1.1291, 1.2607, 1.3829, 1.1100, 1.5383], + device='cuda:2'), covar=tensor([0.0040, 0.0041, 0.0040, 0.0039, 0.0038, 0.0035, 0.0047, 0.0027], + device='cuda:2'), in_proj_covar=tensor([0.0047, 0.0042, 0.0044, 0.0044, 0.0042, 0.0039, 0.0043, 0.0038], + device='cuda:2'), out_proj_covar=tensor([4.2649e-05, 3.7631e-05, 3.9390e-05, 3.9913e-05, 3.7478e-05, 3.3535e-05, + 3.9605e-05, 3.3448e-05], device='cuda:2') +2022-11-16 00:42:13,716 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=58199.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:42:16,531 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=58202.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:42:18,492 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=58205.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:42:31,155 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.117e+02 1.772e+02 2.208e+02 2.571e+02 4.464e+02, threshold=4.417e+02, percent-clipped=3.0 +2022-11-16 00:43:06,383 INFO [train.py:876] (2/4) Epoch 9, batch 100, loss[loss=0.1549, simple_loss=0.1694, pruned_loss=0.07015, over 5555.00 frames. ], tot_loss[loss=0.1356, simple_loss=0.1565, pruned_loss=0.05728, over 430680.64 frames. ], batch size: 46, lr: 9.26e-03, grad_scale: 16.0 +2022-11-16 00:43:23,365 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.6804, 4.2935, 4.6172, 4.2928, 4.7942, 4.6497, 4.1862, 4.7766], + device='cuda:2'), covar=tensor([0.0369, 0.0294, 0.0401, 0.0266, 0.0312, 0.0177, 0.0287, 0.0293], + device='cuda:2'), in_proj_covar=tensor([0.0126, 0.0136, 0.0103, 0.0133, 0.0151, 0.0088, 0.0112, 0.0136], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-16 00:43:38,872 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.840e+01 1.567e+02 1.869e+02 2.319e+02 4.319e+02, threshold=3.738e+02, percent-clipped=0.0 +2022-11-16 00:44:09,366 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1606, 3.4010, 2.7018, 1.6517, 3.2775, 1.3091, 3.2910, 1.5670], + device='cuda:2'), covar=tensor([0.1598, 0.0189, 0.0686, 0.2062, 0.0241, 0.2216, 0.0270, 0.1974], + device='cuda:2'), in_proj_covar=tensor([0.0128, 0.0108, 0.0116, 0.0118, 0.0109, 0.0128, 0.0099, 0.0118], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 00:44:13,599 INFO [train.py:876] (2/4) Epoch 9, batch 200, loss[loss=0.2046, simple_loss=0.2069, pruned_loss=0.1011, over 5462.00 frames. ], tot_loss[loss=0.1391, simple_loss=0.1585, pruned_loss=0.05981, over 689106.31 frames. ], batch size: 71, lr: 9.25e-03, grad_scale: 16.0 +2022-11-16 00:44:22,162 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=58389.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:44:46,848 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.538e+01 1.502e+02 1.721e+02 2.221e+02 4.054e+02, threshold=3.441e+02, percent-clipped=2.0 +2022-11-16 00:44:55,449 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=58437.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:45:21,752 INFO [train.py:876] (2/4) Epoch 9, batch 300, loss[loss=0.1337, simple_loss=0.1574, pruned_loss=0.05495, over 5259.00 frames. ], tot_loss[loss=0.1372, simple_loss=0.1574, pruned_loss=0.05849, over 852067.88 frames. ], batch size: 79, lr: 9.24e-03, grad_scale: 16.0 +2022-11-16 00:45:35,856 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=58497.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:45:37,824 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=58500.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:45:53,985 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.037e+02 1.573e+02 2.012e+02 2.567e+02 4.757e+02, threshold=4.023e+02, percent-clipped=7.0 +2022-11-16 00:46:18,314 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.30 vs. limit=5.0 +2022-11-16 00:46:29,079 INFO [train.py:876] (2/4) Epoch 9, batch 400, loss[loss=0.1078, simple_loss=0.1407, pruned_loss=0.03741, over 5545.00 frames. ], tot_loss[loss=0.1359, simple_loss=0.1567, pruned_loss=0.05754, over 946439.61 frames. ], batch size: 16, lr: 9.23e-03, grad_scale: 16.0 +2022-11-16 00:46:32,287 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.95 vs. limit=2.0 +2022-11-16 00:46:43,539 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.92 vs. limit=2.0 +2022-11-16 00:47:01,902 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.026e+02 1.664e+02 1.953e+02 2.587e+02 4.932e+02, threshold=3.905e+02, percent-clipped=2.0 +2022-11-16 00:47:30,771 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.0856, 2.3682, 3.7861, 3.1804, 3.9563, 2.5347, 3.5541, 4.0955], + device='cuda:2'), covar=tensor([0.0663, 0.1859, 0.0676, 0.1645, 0.0523, 0.1780, 0.1207, 0.0815], + device='cuda:2'), in_proj_covar=tensor([0.0224, 0.0192, 0.0201, 0.0207, 0.0221, 0.0190, 0.0217, 0.0222], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 00:47:36,809 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.8436, 2.0769, 1.5419, 1.2559, 1.4957, 2.2753, 1.9302, 2.3795], + device='cuda:2'), covar=tensor([0.1497, 0.1371, 0.1883, 0.2743, 0.1230, 0.1027, 0.0664, 0.1008], + device='cuda:2'), in_proj_covar=tensor([0.0173, 0.0182, 0.0163, 0.0187, 0.0172, 0.0185, 0.0155, 0.0186], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-11-16 00:47:37,289 INFO [train.py:876] (2/4) Epoch 9, batch 500, loss[loss=0.1606, simple_loss=0.1669, pruned_loss=0.07717, over 5112.00 frames. ], tot_loss[loss=0.1338, simple_loss=0.1555, pruned_loss=0.05605, over 1006672.04 frames. ], batch size: 91, lr: 9.22e-03, grad_scale: 16.0 +2022-11-16 00:47:40,005 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=58681.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:47:57,041 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.4073, 2.0155, 1.6006, 0.8975, 1.3658, 1.3307, 1.3731, 1.7844], + device='cuda:2'), covar=tensor([0.0037, 0.0035, 0.0031, 0.0037, 0.0032, 0.0027, 0.0030, 0.0040], + device='cuda:2'), in_proj_covar=tensor([0.0048, 0.0043, 0.0045, 0.0045, 0.0043, 0.0039, 0.0044, 0.0038], + device='cuda:2'), out_proj_covar=tensor([4.4079e-05, 3.8453e-05, 4.0167e-05, 4.0713e-05, 3.8347e-05, 3.4009e-05, + 4.0528e-05, 3.3730e-05], device='cuda:2') +2022-11-16 00:48:09,722 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.047e+02 1.707e+02 2.003e+02 2.573e+02 4.379e+02, threshold=4.006e+02, percent-clipped=1.0 +2022-11-16 00:48:20,977 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=58742.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:48:32,601 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.67 vs. limit=2.0 +2022-11-16 00:48:36,099 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.5029, 3.1457, 2.9761, 1.7113, 3.0534, 3.1826, 3.5742, 3.7339], + device='cuda:2'), covar=tensor([0.1678, 0.1259, 0.1224, 0.2716, 0.0460, 0.0900, 0.0366, 0.0545], + device='cuda:2'), in_proj_covar=tensor([0.0174, 0.0182, 0.0163, 0.0189, 0.0172, 0.0184, 0.0156, 0.0187], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-11-16 00:48:43,684 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=58775.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:48:44,828 INFO [train.py:876] (2/4) Epoch 9, batch 600, loss[loss=0.1224, simple_loss=0.1495, pruned_loss=0.04761, over 5792.00 frames. ], tot_loss[loss=0.1328, simple_loss=0.1545, pruned_loss=0.0555, over 1039169.85 frames. ], batch size: 21, lr: 9.22e-03, grad_scale: 16.0 +2022-11-16 00:48:52,537 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=6.67 vs. limit=5.0 +2022-11-16 00:48:58,081 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=58797.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:49:00,344 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=58800.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:49:11,407 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.9319, 1.5978, 1.3436, 1.4017, 1.0372, 1.9688, 1.6638, 1.0941], + device='cuda:2'), covar=tensor([0.1928, 0.0855, 0.2150, 0.2115, 0.2547, 0.0758, 0.1306, 0.2102], + device='cuda:2'), in_proj_covar=tensor([0.0077, 0.0068, 0.0068, 0.0080, 0.0060, 0.0052, 0.0057, 0.0066], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0001, 0.0001, 0.0002], + device='cuda:2') +2022-11-16 00:49:17,737 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.043e+02 1.543e+02 1.852e+02 2.303e+02 3.719e+02, threshold=3.705e+02, percent-clipped=0.0 +2022-11-16 00:49:25,162 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=58836.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:49:30,910 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=58845.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:49:32,855 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=58848.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:49:40,647 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.5193, 1.2155, 1.3018, 1.0023, 1.6300, 1.7104, 1.1117, 1.3781], + device='cuda:2'), covar=tensor([0.0914, 0.0546, 0.1023, 0.1005, 0.1038, 0.0485, 0.0808, 0.0551], + device='cuda:2'), in_proj_covar=tensor([0.0012, 0.0018, 0.0012, 0.0015, 0.0014, 0.0011, 0.0016, 0.0012], + device='cuda:2'), out_proj_covar=tensor([6.2673e-05, 8.3606e-05, 6.3468e-05, 7.4515e-05, 6.7692e-05, 6.1226e-05, + 7.7534e-05, 6.1038e-05], device='cuda:2') +2022-11-16 00:49:52,487 INFO [train.py:876] (2/4) Epoch 9, batch 700, loss[loss=0.0891, simple_loss=0.1282, pruned_loss=0.02501, over 5475.00 frames. ], tot_loss[loss=0.1322, simple_loss=0.154, pruned_loss=0.05517, over 1054647.72 frames. ], batch size: 10, lr: 9.21e-03, grad_scale: 8.0 +2022-11-16 00:50:11,761 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=58906.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:50:25,242 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.031e+02 1.741e+02 2.186e+02 2.790e+02 4.131e+02, threshold=4.372e+02, percent-clipped=5.0 +2022-11-16 00:50:47,583 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.3370, 2.6306, 2.9135, 2.6435, 1.8134, 2.8217, 1.8889, 2.3234], + device='cuda:2'), covar=tensor([0.0288, 0.0156, 0.0130, 0.0247, 0.0373, 0.0154, 0.0393, 0.0155], + device='cuda:2'), in_proj_covar=tensor([0.0184, 0.0156, 0.0169, 0.0188, 0.0179, 0.0169, 0.0177, 0.0165], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 00:50:52,785 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=58967.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:50:59,891 INFO [train.py:876] (2/4) Epoch 9, batch 800, loss[loss=0.1154, simple_loss=0.1399, pruned_loss=0.04546, over 5787.00 frames. ], tot_loss[loss=0.132, simple_loss=0.1534, pruned_loss=0.05531, over 1063636.59 frames. ], batch size: 21, lr: 9.20e-03, grad_scale: 8.0 +2022-11-16 00:51:28,078 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.6782, 4.2474, 3.2630, 1.8035, 4.1757, 1.6703, 4.0247, 2.1954], + device='cuda:2'), covar=tensor([0.1362, 0.0198, 0.0854, 0.2419, 0.0213, 0.2123, 0.0223, 0.1727], + device='cuda:2'), in_proj_covar=tensor([0.0124, 0.0106, 0.0114, 0.0114, 0.0106, 0.0125, 0.0098, 0.0114], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 00:51:33,806 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.601e+01 1.692e+02 2.170e+02 2.841e+02 6.263e+02, threshold=4.339e+02, percent-clipped=3.0 +2022-11-16 00:51:41,842 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=59037.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:52:08,567 INFO [train.py:876] (2/4) Epoch 9, batch 900, loss[loss=0.2174, simple_loss=0.1974, pruned_loss=0.1187, over 3050.00 frames. ], tot_loss[loss=0.1336, simple_loss=0.1543, pruned_loss=0.0565, over 1070032.50 frames. ], batch size: 284, lr: 9.19e-03, grad_scale: 8.0 +2022-11-16 00:52:10,557 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=6.21 vs. limit=5.0 +2022-11-16 00:52:20,187 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.95 vs. limit=2.0 +2022-11-16 00:52:42,261 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.620e+01 1.634e+02 1.980e+02 2.467e+02 5.009e+02, threshold=3.960e+02, percent-clipped=1.0 +2022-11-16 00:52:44,385 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.1596, 4.7985, 4.2997, 4.7479, 4.7354, 4.1755, 4.3051, 3.8766], + device='cuda:2'), covar=tensor([0.0340, 0.0453, 0.1378, 0.0394, 0.0411, 0.0358, 0.0619, 0.0726], + device='cuda:2'), in_proj_covar=tensor([0.0128, 0.0167, 0.0266, 0.0163, 0.0209, 0.0169, 0.0176, 0.0163], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0004, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-16 00:52:46,080 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=59131.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:52:55,946 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.4414, 4.8792, 5.2695, 4.8854, 5.5451, 5.5048, 4.6321, 5.4430], + device='cuda:2'), covar=tensor([0.0371, 0.0312, 0.0484, 0.0396, 0.0357, 0.0129, 0.0322, 0.0263], + device='cuda:2'), in_proj_covar=tensor([0.0126, 0.0138, 0.0106, 0.0136, 0.0157, 0.0089, 0.0116, 0.0139], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-16 00:52:57,300 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.8027, 2.3077, 2.7420, 1.7387, 1.5355, 3.1832, 2.7313, 2.2064], + device='cuda:2'), covar=tensor([0.0532, 0.0909, 0.0433, 0.2506, 0.2244, 1.0746, 0.0756, 0.0729], + device='cuda:2'), in_proj_covar=tensor([0.0079, 0.0069, 0.0067, 0.0083, 0.0061, 0.0050, 0.0058, 0.0067], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0001, 0.0002], + device='cuda:2') +2022-11-16 00:53:08,454 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=59164.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:53:16,820 INFO [train.py:876] (2/4) Epoch 9, batch 1000, loss[loss=0.117, simple_loss=0.1433, pruned_loss=0.04531, over 5620.00 frames. ], tot_loss[loss=0.1321, simple_loss=0.1531, pruned_loss=0.0555, over 1071626.37 frames. ], batch size: 24, lr: 9.19e-03, grad_scale: 8.0 +2022-11-16 00:53:50,619 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=59225.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:53:51,081 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.814e+01 1.686e+02 1.957e+02 2.486e+02 6.337e+02, threshold=3.914e+02, percent-clipped=1.0 +2022-11-16 00:54:18,046 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=59262.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:54:28,528 INFO [train.py:876] (2/4) Epoch 9, batch 1100, loss[loss=0.107, simple_loss=0.1359, pruned_loss=0.03907, over 5533.00 frames. ], tot_loss[loss=0.1327, simple_loss=0.1536, pruned_loss=0.0559, over 1073256.94 frames. ], batch size: 21, lr: 9.18e-03, grad_scale: 8.0 +2022-11-16 00:55:01,718 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.767e+01 1.618e+02 1.962e+02 2.354e+02 6.461e+02, threshold=3.925e+02, percent-clipped=2.0 +2022-11-16 00:55:03,859 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.7735, 1.1155, 1.6047, 1.0020, 1.6021, 1.2484, 0.9933, 1.4117], + device='cuda:2'), covar=tensor([0.0448, 0.0408, 0.1396, 0.0761, 0.1288, 0.1064, 0.0659, 0.0336], + device='cuda:2'), in_proj_covar=tensor([0.0012, 0.0018, 0.0013, 0.0016, 0.0014, 0.0012, 0.0017, 0.0012], + device='cuda:2'), out_proj_covar=tensor([6.3966e-05, 8.4870e-05, 6.5596e-05, 7.7010e-05, 7.0128e-05, 6.3028e-05, + 7.9317e-05, 6.2578e-05], device='cuda:2') +2022-11-16 00:55:08,995 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=59337.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:55:12,430 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.03 vs. limit=2.0 +2022-11-16 00:55:12,785 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.2536, 4.7656, 4.2544, 4.6905, 4.7384, 3.9617, 4.3249, 4.1528], + device='cuda:2'), covar=tensor([0.0324, 0.0423, 0.1292, 0.0402, 0.0396, 0.0477, 0.0548, 0.0402], + device='cuda:2'), in_proj_covar=tensor([0.0129, 0.0168, 0.0265, 0.0164, 0.0210, 0.0169, 0.0177, 0.0162], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0004, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-16 00:55:36,029 INFO [train.py:876] (2/4) Epoch 9, batch 1200, loss[loss=0.09155, simple_loss=0.1159, pruned_loss=0.03358, over 4516.00 frames. ], tot_loss[loss=0.1315, simple_loss=0.1529, pruned_loss=0.055, over 1079897.86 frames. ], batch size: 5, lr: 9.17e-03, grad_scale: 8.0 +2022-11-16 00:55:41,356 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=59385.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:55:51,442 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.6623, 4.4323, 3.3828, 2.0069, 4.2643, 1.6792, 4.0987, 2.3975], + device='cuda:2'), covar=tensor([0.1239, 0.0138, 0.0543, 0.2096, 0.0189, 0.1885, 0.0200, 0.1571], + device='cuda:2'), in_proj_covar=tensor([0.0123, 0.0107, 0.0115, 0.0113, 0.0105, 0.0124, 0.0098, 0.0115], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 00:56:09,404 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.118e+02 1.585e+02 1.981e+02 2.415e+02 4.268e+02, threshold=3.961e+02, percent-clipped=1.0 +2022-11-16 00:56:12,839 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=59431.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:56:18,750 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.7137, 1.5547, 1.4873, 1.0370, 1.2395, 1.5607, 1.1096, 1.2330], + device='cuda:2'), covar=tensor([0.0021, 0.0032, 0.0025, 0.0050, 0.0043, 0.0047, 0.0035, 0.0034], + device='cuda:2'), in_proj_covar=tensor([0.0021, 0.0021, 0.0022, 0.0027, 0.0024, 0.0023, 0.0026, 0.0026], + device='cuda:2'), out_proj_covar=tensor([1.9150e-05, 2.0191e-05, 1.9576e-05, 2.6777e-05, 2.2509e-05, 2.2010e-05, + 2.4999e-05, 2.6133e-05], device='cuda:2') +2022-11-16 00:56:43,602 INFO [train.py:876] (2/4) Epoch 9, batch 1300, loss[loss=0.1072, simple_loss=0.1457, pruned_loss=0.03437, over 5483.00 frames. ], tot_loss[loss=0.1306, simple_loss=0.1528, pruned_loss=0.05421, over 1087249.56 frames. ], batch size: 17, lr: 9.16e-03, grad_scale: 8.0 +2022-11-16 00:56:45,395 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=59479.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:57:00,733 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=59502.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:57:13,467 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=59520.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:57:17,256 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.599e+01 1.572e+02 1.990e+02 2.382e+02 3.719e+02, threshold=3.980e+02, percent-clipped=0.0 +2022-11-16 00:57:40,984 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=59562.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:57:41,699 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=59563.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:57:51,216 INFO [train.py:876] (2/4) Epoch 9, batch 1400, loss[loss=0.09604, simple_loss=0.1292, pruned_loss=0.03142, over 5550.00 frames. ], tot_loss[loss=0.1313, simple_loss=0.1536, pruned_loss=0.05453, over 1089626.71 frames. ], batch size: 14, lr: 9.15e-03, grad_scale: 8.0 +2022-11-16 00:58:13,887 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=59610.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 00:58:24,720 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.063e+02 1.626e+02 1.961e+02 2.375e+02 3.725e+02, threshold=3.922e+02, percent-clipped=0.0 +2022-11-16 00:58:34,659 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.2751, 2.3582, 2.7719, 2.4816, 1.5892, 2.6104, 1.7917, 2.1450], + device='cuda:2'), covar=tensor([0.0244, 0.0136, 0.0158, 0.0225, 0.0354, 0.0154, 0.0356, 0.0165], + device='cuda:2'), in_proj_covar=tensor([0.0186, 0.0160, 0.0173, 0.0194, 0.0183, 0.0173, 0.0182, 0.0170], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 00:58:40,809 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0891, 2.9058, 3.1948, 1.5956, 2.9543, 3.3502, 3.3064, 3.7226], + device='cuda:2'), covar=tensor([0.2139, 0.1811, 0.1103, 0.3302, 0.0516, 0.0791, 0.0405, 0.0711], + device='cuda:2'), in_proj_covar=tensor([0.0174, 0.0182, 0.0162, 0.0189, 0.0170, 0.0183, 0.0154, 0.0187], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-11-16 00:58:43,495 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.6250, 1.0355, 1.1811, 0.6829, 1.2974, 1.1876, 0.7129, 1.1694], + device='cuda:2'), covar=tensor([0.0323, 0.0344, 0.0599, 0.1368, 0.0555, 0.0704, 0.1339, 0.0379], + device='cuda:2'), in_proj_covar=tensor([0.0012, 0.0018, 0.0013, 0.0016, 0.0014, 0.0012, 0.0017, 0.0012], + device='cuda:2'), out_proj_covar=tensor([6.2511e-05, 8.3813e-05, 6.4609e-05, 7.6039e-05, 6.9685e-05, 6.2603e-05, + 7.8593e-05, 6.1999e-05], device='cuda:2') +2022-11-16 00:58:58,834 INFO [train.py:876] (2/4) Epoch 9, batch 1500, loss[loss=0.1306, simple_loss=0.1534, pruned_loss=0.05391, over 5701.00 frames. ], tot_loss[loss=0.1302, simple_loss=0.1532, pruned_loss=0.05357, over 1094721.56 frames. ], batch size: 17, lr: 9.15e-03, grad_scale: 8.0 +2022-11-16 00:59:10,232 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.2644, 5.2309, 3.7813, 2.0659, 5.0088, 2.2264, 4.6678, 2.7246], + device='cuda:2'), covar=tensor([0.1168, 0.0169, 0.0561, 0.2439, 0.0208, 0.1812, 0.0180, 0.1633], + device='cuda:2'), in_proj_covar=tensor([0.0123, 0.0106, 0.0115, 0.0115, 0.0105, 0.0124, 0.0098, 0.0115], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 00:59:10,969 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.0434, 1.4120, 0.9802, 0.7564, 1.3580, 1.2265, 0.5908, 1.1469], + device='cuda:2'), covar=tensor([0.0042, 0.0030, 0.0042, 0.0037, 0.0039, 0.0027, 0.0075, 0.0037], + device='cuda:2'), in_proj_covar=tensor([0.0049, 0.0043, 0.0046, 0.0046, 0.0044, 0.0039, 0.0044, 0.0039], + device='cuda:2'), out_proj_covar=tensor([4.4713e-05, 3.9014e-05, 4.0866e-05, 4.1133e-05, 3.9046e-05, 3.4347e-05, + 4.0389e-05, 3.5003e-05], device='cuda:2') +2022-11-16 00:59:31,912 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.993e+01 1.447e+02 1.801e+02 2.298e+02 3.676e+02, threshold=3.602e+02, percent-clipped=0.0 +2022-11-16 01:00:06,516 INFO [train.py:876] (2/4) Epoch 9, batch 1600, loss[loss=0.1296, simple_loss=0.1504, pruned_loss=0.05447, over 5293.00 frames. ], tot_loss[loss=0.1311, simple_loss=0.1533, pruned_loss=0.05442, over 1085351.49 frames. ], batch size: 79, lr: 9.14e-03, grad_scale: 8.0 +2022-11-16 01:00:36,440 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=59820.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:00:40,153 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.161e+01 1.571e+02 1.942e+02 2.410e+02 4.577e+02, threshold=3.885e+02, percent-clipped=5.0 +2022-11-16 01:00:45,613 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.43 vs. limit=2.0 +2022-11-16 01:00:55,353 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=59848.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:01:01,634 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=59858.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:01:08,520 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=59868.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:01:14,347 INFO [train.py:876] (2/4) Epoch 9, batch 1700, loss[loss=0.1739, simple_loss=0.1725, pruned_loss=0.0877, over 5470.00 frames. ], tot_loss[loss=0.1321, simple_loss=0.1535, pruned_loss=0.05537, over 1088070.24 frames. ], batch size: 64, lr: 9.13e-03, grad_scale: 8.0 +2022-11-16 01:01:22,425 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=59889.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:01:35,859 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=59909.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:01:47,043 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.735e+01 1.514e+02 1.939e+02 2.316e+02 5.054e+02, threshold=3.877e+02, percent-clipped=3.0 +2022-11-16 01:02:01,923 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.35 vs. limit=2.0 +2022-11-16 01:02:02,989 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=59950.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:02:21,363 INFO [train.py:876] (2/4) Epoch 9, batch 1800, loss[loss=0.1249, simple_loss=0.1495, pruned_loss=0.05012, over 5765.00 frames. ], tot_loss[loss=0.1331, simple_loss=0.1544, pruned_loss=0.05592, over 1089348.99 frames. ], batch size: 26, lr: 9.12e-03, grad_scale: 8.0 +2022-11-16 01:02:59,240 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.022e+02 1.503e+02 1.882e+02 2.236e+02 4.811e+02, threshold=3.764e+02, percent-clipped=2.0 +2022-11-16 01:03:08,921 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=60040.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:03:16,846 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=60052.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:03:22,488 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.3405, 2.3818, 2.0637, 2.3585, 2.3965, 2.1643, 2.1252, 2.2336], + device='cuda:2'), covar=tensor([0.0475, 0.0872, 0.1832, 0.0668, 0.0822, 0.0589, 0.1041, 0.0732], + device='cuda:2'), in_proj_covar=tensor([0.0130, 0.0170, 0.0265, 0.0165, 0.0212, 0.0168, 0.0176, 0.0165], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0004, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-16 01:03:25,140 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=60065.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:03:28,967 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.8116, 3.1801, 2.3029, 2.8395, 2.0649, 2.4565, 1.8476, 2.7680], + device='cuda:2'), covar=tensor([0.1472, 0.0256, 0.0986, 0.0444, 0.1323, 0.0888, 0.1748, 0.0440], + device='cuda:2'), in_proj_covar=tensor([0.0164, 0.0140, 0.0163, 0.0141, 0.0174, 0.0175, 0.0172, 0.0153], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-16 01:03:33,731 INFO [train.py:876] (2/4) Epoch 9, batch 1900, loss[loss=0.1173, simple_loss=0.1501, pruned_loss=0.0422, over 5743.00 frames. ], tot_loss[loss=0.1309, simple_loss=0.1524, pruned_loss=0.0547, over 1086901.62 frames. ], batch size: 15, lr: 9.12e-03, grad_scale: 8.0 +2022-11-16 01:03:34,464 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=60078.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:03:49,823 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=60101.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 01:03:57,574 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=60113.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 01:04:06,951 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.557e+01 1.713e+02 2.167e+02 2.668e+02 5.052e+02, threshold=4.333e+02, percent-clipped=7.0 +2022-11-16 01:04:07,150 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=60126.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:04:10,576 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.05 vs. limit=2.0 +2022-11-16 01:04:15,577 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=60139.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:04:26,451 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.49 vs. limit=2.0 +2022-11-16 01:04:28,112 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=60158.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:04:32,073 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.8136, 1.1503, 1.5181, 0.9630, 1.6580, 1.5371, 1.0090, 1.6069], + device='cuda:2'), covar=tensor([0.0296, 0.0690, 0.0383, 0.1153, 0.1695, 0.0638, 0.1292, 0.0816], + device='cuda:2'), in_proj_covar=tensor([0.0012, 0.0018, 0.0013, 0.0016, 0.0014, 0.0012, 0.0016, 0.0012], + device='cuda:2'), out_proj_covar=tensor([6.3101e-05, 8.3636e-05, 6.4291e-05, 7.6195e-05, 6.8705e-05, 6.2240e-05, + 7.7976e-05, 6.1639e-05], device='cuda:2') +2022-11-16 01:04:40,601 INFO [train.py:876] (2/4) Epoch 9, batch 2000, loss[loss=0.1812, simple_loss=0.1882, pruned_loss=0.08706, over 5629.00 frames. ], tot_loss[loss=0.1305, simple_loss=0.1523, pruned_loss=0.05434, over 1083012.72 frames. ], batch size: 38, lr: 9.11e-03, grad_scale: 8.0 +2022-11-16 01:04:49,194 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.89 vs. limit=2.0 +2022-11-16 01:04:55,775 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=60198.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:04:59,905 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=60204.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:05:00,593 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.2988, 4.6025, 4.1983, 4.5823, 4.5994, 3.9035, 4.2580, 4.0708], + device='cuda:2'), covar=tensor([0.0307, 0.0342, 0.1152, 0.0376, 0.0357, 0.0481, 0.0647, 0.0610], + device='cuda:2'), in_proj_covar=tensor([0.0129, 0.0169, 0.0264, 0.0164, 0.0207, 0.0168, 0.0176, 0.0163], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0004, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-16 01:05:01,212 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=60206.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:05:13,432 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.9622, 3.2136, 3.4324, 1.5574, 2.9309, 3.7012, 3.4915, 3.7499], + device='cuda:2'), covar=tensor([0.2319, 0.1374, 0.0838, 0.2723, 0.0675, 0.0477, 0.0333, 0.0628], + device='cuda:2'), in_proj_covar=tensor([0.0174, 0.0183, 0.0161, 0.0187, 0.0170, 0.0185, 0.0155, 0.0189], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-11-16 01:05:14,562 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.098e+02 1.629e+02 2.002e+02 2.495e+02 4.506e+02, threshold=4.003e+02, percent-clipped=3.0 +2022-11-16 01:05:27,713 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=60245.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:05:37,280 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=60259.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:05:49,243 INFO [train.py:876] (2/4) Epoch 9, batch 2100, loss[loss=0.162, simple_loss=0.1613, pruned_loss=0.08133, over 5508.00 frames. ], tot_loss[loss=0.1302, simple_loss=0.1525, pruned_loss=0.05393, over 1084422.86 frames. ], batch size: 49, lr: 9.10e-03, grad_scale: 8.0 +2022-11-16 01:06:19,873 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0005, 1.9156, 1.7974, 1.7079, 1.4645, 1.8606, 2.1506, 1.8059], + device='cuda:2'), covar=tensor([0.0044, 0.0087, 0.0017, 0.0031, 0.0116, 0.0102, 0.0026, 0.0032], + device='cuda:2'), in_proj_covar=tensor([0.0021, 0.0021, 0.0022, 0.0028, 0.0024, 0.0023, 0.0026, 0.0026], + device='cuda:2'), out_proj_covar=tensor([1.9395e-05, 2.0071e-05, 1.9511e-05, 2.7715e-05, 2.2885e-05, 2.2543e-05, + 2.5381e-05, 2.5908e-05], device='cuda:2') +2022-11-16 01:06:22,292 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.109e+02 1.667e+02 2.049e+02 2.499e+02 3.972e+02, threshold=4.098e+02, percent-clipped=0.0 +2022-11-16 01:06:51,806 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.3726, 2.4227, 2.1048, 2.3978, 2.4318, 2.1886, 2.1438, 2.2859], + device='cuda:2'), covar=tensor([0.0405, 0.0682, 0.1612, 0.0582, 0.0543, 0.0607, 0.1165, 0.0578], + device='cuda:2'), in_proj_covar=tensor([0.0128, 0.0170, 0.0262, 0.0163, 0.0207, 0.0166, 0.0176, 0.0163], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0004, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-16 01:06:56,813 INFO [train.py:876] (2/4) Epoch 9, batch 2200, loss[loss=0.1303, simple_loss=0.1627, pruned_loss=0.04898, over 5617.00 frames. ], tot_loss[loss=0.1319, simple_loss=0.1538, pruned_loss=0.05505, over 1081579.37 frames. ], batch size: 43, lr: 9.09e-03, grad_scale: 8.0 +2022-11-16 01:07:09,678 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=60396.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 01:07:18,006 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=60408.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 01:07:26,516 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=60421.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:07:30,027 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.086e+02 1.625e+02 1.909e+02 2.506e+02 7.347e+02, threshold=3.819e+02, percent-clipped=1.0 +2022-11-16 01:07:35,204 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=60434.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:08:04,143 INFO [train.py:876] (2/4) Epoch 9, batch 2300, loss[loss=0.08565, simple_loss=0.1281, pruned_loss=0.0216, over 5514.00 frames. ], tot_loss[loss=0.1294, simple_loss=0.1516, pruned_loss=0.05356, over 1085913.83 frames. ], batch size: 14, lr: 9.09e-03, grad_scale: 8.0 +2022-11-16 01:08:12,631 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.1929, 1.5437, 1.1423, 0.7116, 1.4219, 1.2510, 0.6162, 1.2314], + device='cuda:2'), covar=tensor([0.0045, 0.0025, 0.0037, 0.0041, 0.0029, 0.0034, 0.0056, 0.0034], + device='cuda:2'), in_proj_covar=tensor([0.0048, 0.0041, 0.0043, 0.0044, 0.0043, 0.0038, 0.0042, 0.0037], + device='cuda:2'), out_proj_covar=tensor([4.3407e-05, 3.7420e-05, 3.8880e-05, 3.9801e-05, 3.7909e-05, 3.3355e-05, + 3.9049e-05, 3.2902e-05], device='cuda:2') +2022-11-16 01:08:22,442 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=60504.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:08:35,174 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.2222, 4.2338, 4.2009, 4.4294, 3.6728, 3.4743, 4.8508, 4.3213], + device='cuda:2'), covar=tensor([0.0438, 0.0754, 0.0491, 0.0966, 0.0570, 0.0471, 0.0746, 0.0477], + device='cuda:2'), in_proj_covar=tensor([0.0081, 0.0103, 0.0089, 0.0114, 0.0085, 0.0074, 0.0140, 0.0095], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 01:08:37,685 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.337e+01 1.638e+02 1.971e+02 2.511e+02 5.541e+02, threshold=3.943e+02, percent-clipped=0.0 +2022-11-16 01:08:50,480 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=60545.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:08:54,914 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=60552.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:08:56,297 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=60554.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:09:12,106 INFO [train.py:876] (2/4) Epoch 9, batch 2400, loss[loss=0.1025, simple_loss=0.1354, pruned_loss=0.03483, over 5014.00 frames. ], tot_loss[loss=0.1305, simple_loss=0.1527, pruned_loss=0.05421, over 1087329.09 frames. ], batch size: 7, lr: 9.08e-03, grad_scale: 8.0 +2022-11-16 01:09:21,234 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.3465, 2.3840, 2.6831, 1.6416, 1.1369, 3.1242, 2.2828, 2.3419], + device='cuda:2'), covar=tensor([0.0870, 0.0992, 0.0627, 0.2721, 0.2321, 0.1275, 0.1656, 0.1048], + device='cuda:2'), in_proj_covar=tensor([0.0079, 0.0070, 0.0070, 0.0081, 0.0062, 0.0050, 0.0060, 0.0068], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0001, 0.0002], + device='cuda:2') +2022-11-16 01:09:22,867 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=60593.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:09:37,447 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.2483, 2.4189, 3.7346, 3.2262, 4.0896, 2.6366, 3.5099, 4.2171], + device='cuda:2'), covar=tensor([0.0714, 0.1797, 0.0768, 0.1551, 0.0498, 0.1640, 0.1229, 0.0643], + device='cuda:2'), in_proj_covar=tensor([0.0232, 0.0193, 0.0205, 0.0209, 0.0226, 0.0193, 0.0223, 0.0222], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 01:09:38,075 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.3906, 2.1451, 2.5661, 3.4437, 3.4001, 2.5570, 2.1113, 3.4949], + device='cuda:2'), covar=tensor([0.0838, 0.2684, 0.2003, 0.2614, 0.1148, 0.2935, 0.2276, 0.0620], + device='cuda:2'), in_proj_covar=tensor([0.0229, 0.0202, 0.0195, 0.0318, 0.0224, 0.0206, 0.0193, 0.0229], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0005], + device='cuda:2') +2022-11-16 01:09:40,333 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.4940, 1.8759, 1.2197, 1.1082, 1.4353, 2.1956, 1.8478, 2.0015], + device='cuda:2'), covar=tensor([0.1141, 0.0712, 0.1666, 0.2097, 0.0948, 0.0593, 0.0577, 0.1000], + device='cuda:2'), in_proj_covar=tensor([0.0174, 0.0184, 0.0163, 0.0190, 0.0170, 0.0186, 0.0157, 0.0188], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-11-16 01:09:44,907 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.073e+02 1.802e+02 2.307e+02 2.858e+02 9.175e+02, threshold=4.615e+02, percent-clipped=3.0 +2022-11-16 01:09:57,578 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.7560, 1.1138, 1.1019, 0.8871, 1.4024, 1.3947, 0.8183, 1.1138], + device='cuda:2'), covar=tensor([0.0310, 0.0574, 0.0630, 0.1243, 0.1313, 0.0390, 0.0904, 0.0889], + device='cuda:2'), in_proj_covar=tensor([0.0012, 0.0018, 0.0012, 0.0016, 0.0014, 0.0012, 0.0016, 0.0012], + device='cuda:2'), out_proj_covar=tensor([6.3602e-05, 8.4745e-05, 6.4433e-05, 7.7247e-05, 6.9578e-05, 6.2583e-05, + 7.8651e-05, 6.3821e-05], device='cuda:2') +2022-11-16 01:10:19,001 INFO [train.py:876] (2/4) Epoch 9, batch 2500, loss[loss=0.1144, simple_loss=0.1499, pruned_loss=0.0395, over 5500.00 frames. ], tot_loss[loss=0.1309, simple_loss=0.1524, pruned_loss=0.05469, over 1087606.86 frames. ], batch size: 17, lr: 9.07e-03, grad_scale: 8.0 +2022-11-16 01:10:31,964 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=60696.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:10:40,130 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=60708.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:10:41,351 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.0448, 1.4800, 1.8590, 1.4618, 1.0191, 2.3422, 1.7889, 1.6923], + device='cuda:2'), covar=tensor([0.1360, 0.1231, 0.0820, 0.2177, 0.2134, 0.0715, 0.1117, 0.1586], + device='cuda:2'), in_proj_covar=tensor([0.0080, 0.0071, 0.0070, 0.0082, 0.0063, 0.0050, 0.0061, 0.0069], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0001, 0.0002], + device='cuda:2') +2022-11-16 01:10:46,715 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=60718.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:10:48,600 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=60721.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:10:51,736 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.647e+01 1.621e+02 1.973e+02 2.538e+02 4.790e+02, threshold=3.945e+02, percent-clipped=1.0 +2022-11-16 01:10:56,344 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.97 vs. limit=2.0 +2022-11-16 01:10:57,745 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=60734.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:11:04,185 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=60744.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:11:11,860 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=60756.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:11:12,659 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1859, 1.3668, 1.0851, 1.0917, 1.3197, 1.4784, 1.1217, 1.4328], + device='cuda:2'), covar=tensor([0.0301, 0.0566, 0.0897, 0.0844, 0.1855, 0.0843, 0.0832, 0.0402], + device='cuda:2'), in_proj_covar=tensor([0.0012, 0.0018, 0.0012, 0.0016, 0.0014, 0.0012, 0.0016, 0.0012], + device='cuda:2'), out_proj_covar=tensor([6.3517e-05, 8.3624e-05, 6.3297e-05, 7.6070e-05, 6.8748e-05, 6.1738e-05, + 7.7628e-05, 6.2648e-05], device='cuda:2') +2022-11-16 01:11:20,528 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=60769.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:11:21,235 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.4498, 3.6070, 3.7059, 3.4254, 3.5585, 3.5326, 1.4135, 3.6857], + device='cuda:2'), covar=tensor([0.0293, 0.0294, 0.0259, 0.0243, 0.0312, 0.0332, 0.2809, 0.0285], + device='cuda:2'), in_proj_covar=tensor([0.0104, 0.0082, 0.0083, 0.0075, 0.0099, 0.0085, 0.0128, 0.0105], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 01:11:25,653 INFO [train.py:876] (2/4) Epoch 9, batch 2600, loss[loss=0.1345, simple_loss=0.1619, pruned_loss=0.05355, over 5782.00 frames. ], tot_loss[loss=0.1316, simple_loss=0.153, pruned_loss=0.05508, over 1086049.94 frames. ], batch size: 14, lr: 9.06e-03, grad_scale: 8.0 +2022-11-16 01:11:27,137 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=60779.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:11:28,890 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=60782.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:11:35,405 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.7179, 4.7227, 3.5998, 2.0415, 4.4187, 2.0799, 4.4396, 2.5511], + device='cuda:2'), covar=tensor([0.1357, 0.0132, 0.0507, 0.1980, 0.0128, 0.1545, 0.0153, 0.1394], + device='cuda:2'), in_proj_covar=tensor([0.0125, 0.0106, 0.0113, 0.0114, 0.0103, 0.0125, 0.0097, 0.0114], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 01:11:39,290 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1493, 2.2171, 2.2475, 2.3064, 2.0056, 1.6726, 2.0245, 2.5987], + device='cuda:2'), covar=tensor([0.1038, 0.1574, 0.2355, 0.1336, 0.1940, 0.1741, 0.2000, 0.2568], + device='cuda:2'), in_proj_covar=tensor([0.0091, 0.0089, 0.0098, 0.0084, 0.0083, 0.0088, 0.0090, 0.0067], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 01:11:58,494 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.32 vs. limit=5.0 +2022-11-16 01:11:58,599 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.024e+02 1.669e+02 1.987e+02 2.479e+02 5.022e+02, threshold=3.974e+02, percent-clipped=3.0 +2022-11-16 01:12:18,173 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=60854.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:12:33,336 INFO [train.py:876] (2/4) Epoch 9, batch 2700, loss[loss=0.1695, simple_loss=0.168, pruned_loss=0.08547, over 4710.00 frames. ], tot_loss[loss=0.1305, simple_loss=0.1525, pruned_loss=0.0543, over 1083630.13 frames. ], batch size: 135, lr: 9.06e-03, grad_scale: 16.0 +2022-11-16 01:12:50,327 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=60902.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:13:06,004 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.108e+02 1.571e+02 1.865e+02 2.553e+02 5.202e+02, threshold=3.729e+02, percent-clipped=2.0 +2022-11-16 01:13:30,015 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.55 vs. limit=5.0 +2022-11-16 01:13:40,754 INFO [train.py:876] (2/4) Epoch 9, batch 2800, loss[loss=0.1493, simple_loss=0.1668, pruned_loss=0.06592, over 5595.00 frames. ], tot_loss[loss=0.1294, simple_loss=0.152, pruned_loss=0.05338, over 1087064.99 frames. ], batch size: 43, lr: 9.05e-03, grad_scale: 16.0 +2022-11-16 01:14:13,879 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.037e+02 1.565e+02 1.869e+02 2.208e+02 4.201e+02, threshold=3.737e+02, percent-clipped=2.0 +2022-11-16 01:14:25,819 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.7452, 0.6043, 0.6157, 0.4705, 0.6944, 0.6852, 0.4100, 0.6066], + device='cuda:2'), covar=tensor([0.0193, 0.0244, 0.0223, 0.0278, 0.0207, 0.0164, 0.0454, 0.0248], + device='cuda:2'), in_proj_covar=tensor([0.0012, 0.0018, 0.0012, 0.0016, 0.0014, 0.0012, 0.0016, 0.0012], + device='cuda:2'), out_proj_covar=tensor([6.4135e-05, 8.5457e-05, 6.4482e-05, 7.7259e-05, 7.0664e-05, 6.3489e-05, + 7.8981e-05, 6.3810e-05], device='cuda:2') +2022-11-16 01:14:37,287 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=61061.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:14:46,629 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=61074.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:14:48,549 INFO [train.py:876] (2/4) Epoch 9, batch 2900, loss[loss=0.0912, simple_loss=0.1277, pruned_loss=0.02733, over 5557.00 frames. ], tot_loss[loss=0.1288, simple_loss=0.1514, pruned_loss=0.05306, over 1085581.26 frames. ], batch size: 16, lr: 9.04e-03, grad_scale: 16.0 +2022-11-16 01:15:15,085 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.4440, 1.5656, 1.8103, 1.7936, 1.6363, 1.4967, 1.5617, 1.7076], + device='cuda:2'), covar=tensor([0.3375, 0.3032, 0.3262, 0.1885, 0.2516, 0.3369, 0.2490, 0.1028], + device='cuda:2'), in_proj_covar=tensor([0.0094, 0.0092, 0.0100, 0.0086, 0.0086, 0.0090, 0.0092, 0.0069], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 01:15:18,596 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=61122.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:15:21,365 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.003e+02 1.585e+02 1.881e+02 2.261e+02 3.985e+02, threshold=3.762e+02, percent-clipped=2.0 +2022-11-16 01:15:39,507 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0584, 3.6495, 2.4363, 3.3141, 2.7322, 2.6253, 1.9338, 3.0730], + device='cuda:2'), covar=tensor([0.1427, 0.0284, 0.1156, 0.0413, 0.0990, 0.1007, 0.2019, 0.0457], + device='cuda:2'), in_proj_covar=tensor([0.0162, 0.0138, 0.0163, 0.0141, 0.0174, 0.0174, 0.0171, 0.0155], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-16 01:15:55,641 INFO [train.py:876] (2/4) Epoch 9, batch 3000, loss[loss=0.103, simple_loss=0.1328, pruned_loss=0.03661, over 5729.00 frames. ], tot_loss[loss=0.13, simple_loss=0.1519, pruned_loss=0.05409, over 1083940.84 frames. ], batch size: 11, lr: 9.03e-03, grad_scale: 16.0 +2022-11-16 01:15:55,642 INFO [train.py:899] (2/4) Computing validation loss +2022-11-16 01:16:08,315 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.7235, 1.2232, 1.1691, 0.8291, 1.3808, 1.5909, 0.7752, 1.3295], + device='cuda:2'), covar=tensor([0.0447, 0.0411, 0.0550, 0.1188, 0.0362, 0.0174, 0.0811, 0.0364], + device='cuda:2'), in_proj_covar=tensor([0.0012, 0.0018, 0.0012, 0.0016, 0.0014, 0.0012, 0.0016, 0.0012], + device='cuda:2'), out_proj_covar=tensor([6.4046e-05, 8.4073e-05, 6.3287e-05, 7.6124e-05, 6.9519e-05, 6.2400e-05, + 7.8048e-05, 6.2975e-05], device='cuda:2') +2022-11-16 01:16:16,568 INFO [train.py:908] (2/4) Epoch 9, validation: loss=0.1637, simple_loss=0.1831, pruned_loss=0.07219, over 1530663.00 frames. +2022-11-16 01:16:16,569 INFO [train.py:909] (2/4) Maximum memory allocated so far is 4715MB +2022-11-16 01:16:39,732 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.8746, 1.3314, 1.1967, 0.8174, 1.0154, 1.2870, 0.8948, 1.0806], + device='cuda:2'), covar=tensor([0.0057, 0.0046, 0.0056, 0.0054, 0.0044, 0.0047, 0.0071, 0.0075], + device='cuda:2'), in_proj_covar=tensor([0.0049, 0.0044, 0.0045, 0.0046, 0.0045, 0.0040, 0.0044, 0.0039], + device='cuda:2'), out_proj_covar=tensor([4.4738e-05, 3.9742e-05, 4.0633e-05, 4.1208e-05, 4.0143e-05, 3.4886e-05, + 4.0748e-05, 3.4399e-05], device='cuda:2') +2022-11-16 01:16:42,258 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.2319, 4.7122, 5.0376, 4.6764, 5.3008, 5.2116, 4.6287, 5.2541], + device='cuda:2'), covar=tensor([0.0321, 0.0305, 0.0419, 0.0309, 0.0309, 0.0118, 0.0244, 0.0228], + device='cuda:2'), in_proj_covar=tensor([0.0132, 0.0142, 0.0107, 0.0141, 0.0165, 0.0094, 0.0119, 0.0146], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-16 01:16:49,304 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.005e+02 1.614e+02 2.000e+02 2.380e+02 4.865e+02, threshold=4.000e+02, percent-clipped=3.0 +2022-11-16 01:17:23,851 INFO [train.py:876] (2/4) Epoch 9, batch 3100, loss[loss=0.1051, simple_loss=0.1362, pruned_loss=0.03704, over 5501.00 frames. ], tot_loss[loss=0.1318, simple_loss=0.1532, pruned_loss=0.05526, over 1084872.76 frames. ], batch size: 12, lr: 9.03e-03, grad_scale: 16.0 +2022-11-16 01:17:56,596 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.8335, 3.7140, 3.6179, 3.6079, 3.9939, 3.8487, 3.8340, 4.0187], + device='cuda:2'), covar=tensor([0.1113, 0.0732, 0.1046, 0.0838, 0.0945, 0.0499, 0.0674, 0.0786], + device='cuda:2'), in_proj_covar=tensor([0.0129, 0.0140, 0.0104, 0.0138, 0.0162, 0.0092, 0.0117, 0.0144], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-16 01:17:57,139 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.004e+02 1.659e+02 1.981e+02 2.507e+02 4.168e+02, threshold=3.962e+02, percent-clipped=1.0 +2022-11-16 01:18:17,676 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.96 vs. limit=2.0 +2022-11-16 01:18:30,148 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=61374.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:18:31,952 INFO [train.py:876] (2/4) Epoch 9, batch 3200, loss[loss=0.1038, simple_loss=0.1319, pruned_loss=0.0378, over 5709.00 frames. ], tot_loss[loss=0.1314, simple_loss=0.1529, pruned_loss=0.05496, over 1084472.62 frames. ], batch size: 11, lr: 9.02e-03, grad_scale: 16.0 +2022-11-16 01:18:36,100 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.26 vs. limit=2.0 +2022-11-16 01:18:59,219 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=61417.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:19:02,493 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=61422.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:19:04,949 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.027e+02 1.596e+02 1.869e+02 2.254e+02 4.160e+02, threshold=3.737e+02, percent-clipped=1.0 +2022-11-16 01:19:39,535 INFO [train.py:876] (2/4) Epoch 9, batch 3300, loss[loss=0.1054, simple_loss=0.1331, pruned_loss=0.03882, over 5565.00 frames. ], tot_loss[loss=0.1312, simple_loss=0.153, pruned_loss=0.05466, over 1089941.03 frames. ], batch size: 13, lr: 9.01e-03, grad_scale: 16.0 +2022-11-16 01:19:41,093 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.33 vs. limit=2.0 +2022-11-16 01:19:53,966 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.98 vs. limit=2.0 +2022-11-16 01:20:01,256 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.3182, 3.9346, 4.1590, 3.8913, 4.3797, 4.1648, 4.0356, 4.3124], + device='cuda:2'), covar=tensor([0.0338, 0.0339, 0.0410, 0.0353, 0.0355, 0.0240, 0.0295, 0.0420], + device='cuda:2'), in_proj_covar=tensor([0.0130, 0.0140, 0.0103, 0.0138, 0.0161, 0.0093, 0.0117, 0.0143], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-16 01:20:06,201 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.75 vs. limit=2.0 +2022-11-16 01:20:07,499 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.02 vs. limit=5.0 +2022-11-16 01:20:12,868 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.100e+02 1.659e+02 1.971e+02 2.388e+02 4.331e+02, threshold=3.941e+02, percent-clipped=1.0 +2022-11-16 01:20:34,619 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1959, 3.3184, 2.3879, 1.7114, 3.1384, 1.3298, 3.0989, 1.8302], + device='cuda:2'), covar=tensor([0.1318, 0.0185, 0.1016, 0.1925, 0.0243, 0.2000, 0.0279, 0.1490], + device='cuda:2'), in_proj_covar=tensor([0.0123, 0.0104, 0.0113, 0.0113, 0.0102, 0.0122, 0.0098, 0.0113], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 01:20:47,471 INFO [train.py:876] (2/4) Epoch 9, batch 3400, loss[loss=0.08339, simple_loss=0.1198, pruned_loss=0.02348, over 5130.00 frames. ], tot_loss[loss=0.1331, simple_loss=0.1548, pruned_loss=0.05571, over 1083452.71 frames. ], batch size: 6, lr: 9.01e-03, grad_scale: 16.0 +2022-11-16 01:20:48,652 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.54 vs. limit=2.0 +2022-11-16 01:20:55,347 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.92 vs. limit=2.0 +2022-11-16 01:21:20,963 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.197e+01 1.625e+02 1.885e+02 2.405e+02 4.262e+02, threshold=3.771e+02, percent-clipped=2.0 +2022-11-16 01:21:23,786 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.3021, 0.8852, 0.9758, 0.8543, 1.1075, 1.2944, 0.6698, 0.7950], + device='cuda:2'), covar=tensor([0.0245, 0.0316, 0.0319, 0.0439, 0.0349, 0.0222, 0.0604, 0.0293], + device='cuda:2'), in_proj_covar=tensor([0.0012, 0.0018, 0.0012, 0.0016, 0.0014, 0.0012, 0.0016, 0.0012], + device='cuda:2'), out_proj_covar=tensor([6.4235e-05, 8.4420e-05, 6.4149e-05, 7.6573e-05, 6.9421e-05, 6.2352e-05, + 7.8996e-05, 6.3183e-05], device='cuda:2') +2022-11-16 01:21:53,963 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.2701, 2.4178, 3.2758, 4.0498, 4.0683, 3.2874, 2.5101, 3.9929], + device='cuda:2'), covar=tensor([0.0435, 0.3133, 0.1983, 0.2755, 0.1409, 0.2548, 0.2301, 0.0867], + device='cuda:2'), in_proj_covar=tensor([0.0227, 0.0202, 0.0193, 0.0313, 0.0224, 0.0202, 0.0192, 0.0227], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0005], + device='cuda:2') +2022-11-16 01:21:55,073 INFO [train.py:876] (2/4) Epoch 9, batch 3500, loss[loss=0.09373, simple_loss=0.1035, pruned_loss=0.04199, over 5183.00 frames. ], tot_loss[loss=0.1313, simple_loss=0.1534, pruned_loss=0.05458, over 1086788.45 frames. ], batch size: 8, lr: 9.00e-03, grad_scale: 16.0 +2022-11-16 01:22:22,298 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=61717.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:22:28,020 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.034e+02 1.632e+02 2.023e+02 2.591e+02 5.866e+02, threshold=4.047e+02, percent-clipped=6.0 +2022-11-16 01:22:32,173 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=61732.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:22:39,386 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=61743.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:22:41,646 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.6236, 0.5828, 0.6777, 0.4106, 0.6922, 0.7010, 0.3223, 0.6303], + device='cuda:2'), covar=tensor([0.0242, 0.0288, 0.0254, 0.0312, 0.0246, 0.0195, 0.0583, 0.0257], + device='cuda:2'), in_proj_covar=tensor([0.0012, 0.0018, 0.0013, 0.0016, 0.0014, 0.0012, 0.0017, 0.0012], + device='cuda:2'), out_proj_covar=tensor([6.5543e-05, 8.6517e-05, 6.5223e-05, 7.8017e-05, 7.0683e-05, 6.3258e-05, + 8.0644e-05, 6.3889e-05], device='cuda:2') +2022-11-16 01:22:52,487 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.8339, 2.6482, 2.7506, 1.4350, 2.7919, 3.2252, 2.9855, 3.2115], + device='cuda:2'), covar=tensor([0.2595, 0.1626, 0.1102, 0.2976, 0.0790, 0.0695, 0.0724, 0.0963], + device='cuda:2'), in_proj_covar=tensor([0.0174, 0.0182, 0.0164, 0.0187, 0.0170, 0.0188, 0.0160, 0.0184], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-11-16 01:22:54,374 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=61765.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:23:02,302 INFO [train.py:876] (2/4) Epoch 9, batch 3600, loss[loss=0.1353, simple_loss=0.1515, pruned_loss=0.05955, over 5614.00 frames. ], tot_loss[loss=0.1316, simple_loss=0.1533, pruned_loss=0.05492, over 1086516.72 frames. ], batch size: 50, lr: 8.99e-03, grad_scale: 16.0 +2022-11-16 01:23:11,043 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.9266, 3.8004, 3.7382, 3.6461, 2.1154, 3.9488, 2.2963, 3.0763], + device='cuda:2'), covar=tensor([0.0333, 0.0168, 0.0150, 0.0271, 0.0474, 0.0137, 0.0465, 0.0170], + device='cuda:2'), in_proj_covar=tensor([0.0186, 0.0160, 0.0168, 0.0189, 0.0182, 0.0169, 0.0180, 0.0170], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 01:23:12,912 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=61793.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:23:20,618 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=61804.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:23:23,183 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.0415, 4.4488, 4.7656, 4.4196, 5.0377, 4.9007, 4.4248, 4.9912], + device='cuda:2'), covar=tensor([0.0231, 0.0331, 0.0437, 0.0326, 0.0306, 0.0168, 0.0245, 0.0256], + device='cuda:2'), in_proj_covar=tensor([0.0128, 0.0140, 0.0103, 0.0139, 0.0159, 0.0092, 0.0116, 0.0142], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-16 01:23:27,213 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.45 vs. limit=5.0 +2022-11-16 01:23:35,479 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.175e+02 1.617e+02 2.067e+02 2.692e+02 5.357e+02, threshold=4.135e+02, percent-clipped=2.0 +2022-11-16 01:23:35,742 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.2024, 2.6856, 3.7026, 3.1780, 4.0404, 2.6016, 3.5764, 4.1374], + device='cuda:2'), covar=tensor([0.0583, 0.1735, 0.0998, 0.1570, 0.0562, 0.1572, 0.1227, 0.0770], + device='cuda:2'), in_proj_covar=tensor([0.0234, 0.0195, 0.0207, 0.0211, 0.0230, 0.0192, 0.0226, 0.0225], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 01:23:40,608 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.91 vs. limit=2.0 +2022-11-16 01:23:44,159 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.3354, 2.2249, 2.5603, 3.3430, 3.1783, 2.5362, 2.0462, 3.4475], + device='cuda:2'), covar=tensor([0.0857, 0.2389, 0.2216, 0.2607, 0.1218, 0.2731, 0.2308, 0.0725], + device='cuda:2'), in_proj_covar=tensor([0.0227, 0.0199, 0.0194, 0.0315, 0.0219, 0.0203, 0.0192, 0.0229], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0005], + device='cuda:2') +2022-11-16 01:23:51,070 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.45 vs. limit=5.0 +2022-11-16 01:24:10,278 INFO [train.py:876] (2/4) Epoch 9, batch 3700, loss[loss=0.1176, simple_loss=0.1444, pruned_loss=0.04539, over 5639.00 frames. ], tot_loss[loss=0.1319, simple_loss=0.1533, pruned_loss=0.05525, over 1080425.03 frames. ], batch size: 43, lr: 8.98e-03, grad_scale: 16.0 +2022-11-16 01:24:11,055 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.4143, 1.2207, 1.3677, 1.1685, 1.8126, 1.6223, 1.1022, 1.5558], + device='cuda:2'), covar=tensor([0.1794, 0.0764, 0.0737, 0.0976, 0.1537, 0.0539, 0.0571, 0.0296], + device='cuda:2'), in_proj_covar=tensor([0.0012, 0.0018, 0.0013, 0.0016, 0.0014, 0.0012, 0.0017, 0.0012], + device='cuda:2'), out_proj_covar=tensor([6.5309e-05, 8.6195e-05, 6.4897e-05, 7.7911e-05, 7.0885e-05, 6.3459e-05, + 8.0142e-05, 6.3817e-05], device='cuda:2') +2022-11-16 01:24:34,716 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=61914.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:24:35,414 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.9787, 1.4066, 1.2527, 0.8770, 1.0377, 1.2624, 0.8683, 1.2653], + device='cuda:2'), covar=tensor([0.0061, 0.0039, 0.0046, 0.0065, 0.0065, 0.0036, 0.0061, 0.0043], + device='cuda:2'), in_proj_covar=tensor([0.0051, 0.0046, 0.0047, 0.0048, 0.0047, 0.0042, 0.0045, 0.0040], + device='cuda:2'), out_proj_covar=tensor([4.6290e-05, 4.1618e-05, 4.2500e-05, 4.3053e-05, 4.1581e-05, 3.6454e-05, + 4.1612e-05, 3.5981e-05], device='cuda:2') +2022-11-16 01:24:42,850 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.652e+01 1.593e+02 2.000e+02 2.466e+02 4.913e+02, threshold=3.999e+02, percent-clipped=2.0 +2022-11-16 01:24:51,779 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.6202, 2.7466, 2.4949, 2.9949, 2.3005, 2.6400, 2.7840, 2.9852], + device='cuda:2'), covar=tensor([0.1357, 0.1553, 0.2281, 0.1640, 0.1898, 0.2076, 0.1522, 0.4023], + device='cuda:2'), in_proj_covar=tensor([0.0096, 0.0094, 0.0098, 0.0087, 0.0087, 0.0090, 0.0091, 0.0069], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 01:25:04,322 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.4639, 0.8777, 1.3397, 1.0937, 0.8256, 1.1027, 1.0082, 1.1478], + device='cuda:2'), covar=tensor([0.0040, 0.0079, 0.0043, 0.0090, 0.0123, 0.0120, 0.0050, 0.0054], + device='cuda:2'), in_proj_covar=tensor([0.0021, 0.0020, 0.0022, 0.0028, 0.0024, 0.0022, 0.0026, 0.0025], + device='cuda:2'), out_proj_covar=tensor([1.8848e-05, 1.9338e-05, 1.9659e-05, 2.7524e-05, 2.2804e-05, 2.1252e-05, + 2.5165e-05, 2.4820e-05], device='cuda:2') +2022-11-16 01:25:15,832 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=61975.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:25:16,951 INFO [train.py:876] (2/4) Epoch 9, batch 3800, loss[loss=0.1332, simple_loss=0.1662, pruned_loss=0.05014, over 5812.00 frames. ], tot_loss[loss=0.1319, simple_loss=0.1538, pruned_loss=0.05498, over 1084392.45 frames. ], batch size: 21, lr: 8.98e-03, grad_scale: 16.0 +2022-11-16 01:25:29,447 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.83 vs. limit=2.0 +2022-11-16 01:25:36,090 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0174, 1.1740, 1.5973, 1.0822, 1.1809, 1.4582, 1.1783, 1.4780], + device='cuda:2'), covar=tensor([0.0024, 0.0155, 0.0042, 0.0051, 0.0091, 0.0053, 0.0038, 0.0036], + device='cuda:2'), in_proj_covar=tensor([0.0021, 0.0020, 0.0022, 0.0028, 0.0024, 0.0022, 0.0026, 0.0025], + device='cuda:2'), out_proj_covar=tensor([1.8813e-05, 1.9499e-05, 1.9786e-05, 2.7538e-05, 2.2888e-05, 2.1334e-05, + 2.5310e-05, 2.5020e-05], device='cuda:2') +2022-11-16 01:25:50,163 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.003e+01 1.597e+02 1.892e+02 2.479e+02 4.563e+02, threshold=3.783e+02, percent-clipped=2.0 +2022-11-16 01:25:58,665 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.2549, 3.8301, 2.7771, 3.5449, 2.8532, 2.8233, 2.0726, 3.3079], + device='cuda:2'), covar=tensor([0.1394, 0.0251, 0.0953, 0.0401, 0.0877, 0.0996, 0.1985, 0.0370], + device='cuda:2'), in_proj_covar=tensor([0.0163, 0.0138, 0.0163, 0.0141, 0.0175, 0.0174, 0.0172, 0.0155], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-16 01:26:14,481 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.45 vs. limit=5.0 +2022-11-16 01:26:15,111 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.94 vs. limit=5.0 +2022-11-16 01:26:17,658 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1619, 2.7758, 3.3088, 1.8813, 2.9302, 3.4928, 3.5285, 3.9945], + device='cuda:2'), covar=tensor([0.2103, 0.1646, 0.0617, 0.2806, 0.0814, 0.1005, 0.0372, 0.0575], + device='cuda:2'), in_proj_covar=tensor([0.0172, 0.0180, 0.0163, 0.0186, 0.0168, 0.0188, 0.0157, 0.0183], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-11-16 01:26:24,517 INFO [train.py:876] (2/4) Epoch 9, batch 3900, loss[loss=0.1245, simple_loss=0.1507, pruned_loss=0.04917, over 5685.00 frames. ], tot_loss[loss=0.1308, simple_loss=0.153, pruned_loss=0.05428, over 1086426.98 frames. ], batch size: 28, lr: 8.97e-03, grad_scale: 16.0 +2022-11-16 01:26:31,650 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=62088.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:26:35,168 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.7331, 2.6891, 2.6375, 2.8233, 2.8312, 2.6328, 3.0589, 2.8334], + device='cuda:2'), covar=tensor([0.0484, 0.0955, 0.0585, 0.1069, 0.0532, 0.0411, 0.0910, 0.0724], + device='cuda:2'), in_proj_covar=tensor([0.0081, 0.0101, 0.0089, 0.0111, 0.0083, 0.0074, 0.0141, 0.0093], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 01:26:39,492 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=62099.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:26:55,163 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=62122.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:26:57,083 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.7701, 1.6313, 2.3155, 1.5451, 1.0768, 2.6420, 2.0646, 1.8044], + device='cuda:2'), covar=tensor([0.1243, 0.1409, 0.0796, 0.2572, 0.3263, 0.0821, 0.1629, 0.1490], + device='cuda:2'), in_proj_covar=tensor([0.0084, 0.0074, 0.0074, 0.0088, 0.0065, 0.0056, 0.0062, 0.0073], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-11-16 01:26:57,546 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.082e+02 1.676e+02 2.020e+02 2.333e+02 7.144e+02, threshold=4.039e+02, percent-clipped=2.0 +2022-11-16 01:27:32,466 INFO [train.py:876] (2/4) Epoch 9, batch 4000, loss[loss=0.0828, simple_loss=0.1156, pruned_loss=0.025, over 5549.00 frames. ], tot_loss[loss=0.1285, simple_loss=0.1513, pruned_loss=0.05285, over 1086486.09 frames. ], batch size: 10, lr: 8.96e-03, grad_scale: 16.0 +2022-11-16 01:27:36,462 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=62183.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:28:05,700 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.381e+01 1.660e+02 2.083e+02 2.529e+02 5.015e+02, threshold=4.165e+02, percent-clipped=1.0 +2022-11-16 01:28:35,676 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=62270.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:28:37,726 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=62273.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:28:40,194 INFO [train.py:876] (2/4) Epoch 9, batch 4100, loss[loss=0.1132, simple_loss=0.1472, pruned_loss=0.0396, over 5708.00 frames. ], tot_loss[loss=0.129, simple_loss=0.151, pruned_loss=0.05348, over 1082549.98 frames. ], batch size: 19, lr: 8.96e-03, grad_scale: 16.0 +2022-11-16 01:29:13,574 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.063e+02 1.483e+02 1.988e+02 2.459e+02 5.062e+02, threshold=3.976e+02, percent-clipped=4.0 +2022-11-16 01:29:19,048 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=62334.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:29:27,106 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.24 vs. limit=2.0 +2022-11-16 01:29:48,373 INFO [train.py:876] (2/4) Epoch 9, batch 4200, loss[loss=0.1409, simple_loss=0.1775, pruned_loss=0.05214, over 5613.00 frames. ], tot_loss[loss=0.1276, simple_loss=0.1498, pruned_loss=0.05266, over 1079083.67 frames. ], batch size: 18, lr: 8.95e-03, grad_scale: 16.0 +2022-11-16 01:29:53,716 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.0289, 2.6139, 3.6379, 3.1978, 3.9141, 2.4790, 3.4775, 4.0271], + device='cuda:2'), covar=tensor([0.0651, 0.1439, 0.0848, 0.1501, 0.0544, 0.1610, 0.1170, 0.0741], + device='cuda:2'), in_proj_covar=tensor([0.0232, 0.0193, 0.0207, 0.0206, 0.0225, 0.0190, 0.0221, 0.0222], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 01:29:55,578 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=62388.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:29:57,575 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=62391.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:30:02,848 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=62399.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:30:21,289 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.948e+01 1.584e+02 2.002e+02 2.529e+02 4.819e+02, threshold=4.003e+02, percent-clipped=3.0 +2022-11-16 01:30:28,436 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=62436.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:30:35,829 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=62447.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:30:39,208 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=62452.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:30:55,989 INFO [train.py:876] (2/4) Epoch 9, batch 4300, loss[loss=0.07645, simple_loss=0.116, pruned_loss=0.01844, over 5561.00 frames. ], tot_loss[loss=0.1277, simple_loss=0.1508, pruned_loss=0.0523, over 1085042.20 frames. ], batch size: 13, lr: 8.94e-03, grad_scale: 16.0 +2022-11-16 01:30:57,019 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=62478.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:31:05,873 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.4283, 1.3697, 1.7850, 0.8960, 1.3178, 1.3953, 1.1041, 1.4435], + device='cuda:2'), covar=tensor([0.0048, 0.0055, 0.0041, 0.0056, 0.0044, 0.0037, 0.0045, 0.0055], + device='cuda:2'), in_proj_covar=tensor([0.0051, 0.0045, 0.0047, 0.0048, 0.0047, 0.0042, 0.0045, 0.0040], + device='cuda:2'), out_proj_covar=tensor([4.6389e-05, 4.1234e-05, 4.2395e-05, 4.3207e-05, 4.1592e-05, 3.6802e-05, + 4.1752e-05, 3.5679e-05], device='cuda:2') +2022-11-16 01:31:28,851 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.018e+02 1.728e+02 2.092e+02 2.577e+02 4.668e+02, threshold=4.183e+02, percent-clipped=3.0 +2022-11-16 01:31:42,544 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=62545.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:31:58,783 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=62570.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:32:03,252 INFO [train.py:876] (2/4) Epoch 9, batch 4400, loss[loss=0.1003, simple_loss=0.1389, pruned_loss=0.03084, over 5721.00 frames. ], tot_loss[loss=0.1298, simple_loss=0.1519, pruned_loss=0.05385, over 1084874.56 frames. ], batch size: 15, lr: 8.93e-03, grad_scale: 16.0 +2022-11-16 01:32:08,965 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=62585.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:32:10,881 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.19 vs. limit=2.0 +2022-11-16 01:32:18,776 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.8453, 1.2132, 1.6688, 1.1020, 1.4266, 1.5893, 1.2050, 1.0323], + device='cuda:2'), covar=tensor([0.0019, 0.0039, 0.0029, 0.0034, 0.0043, 0.0027, 0.0031, 0.0036], + device='cuda:2'), in_proj_covar=tensor([0.0021, 0.0021, 0.0021, 0.0028, 0.0024, 0.0022, 0.0026, 0.0025], + device='cuda:2'), out_proj_covar=tensor([1.9068e-05, 1.9679e-05, 1.9408e-05, 2.7024e-05, 2.2554e-05, 2.1213e-05, + 2.5610e-05, 2.5508e-05], device='cuda:2') +2022-11-16 01:32:23,696 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=62606.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:32:31,376 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=62618.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:32:36,538 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.118e+02 1.591e+02 1.968e+02 2.413e+02 5.184e+02, threshold=3.935e+02, percent-clipped=2.0 +2022-11-16 01:32:38,752 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=62629.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:32:51,039 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=62646.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:33:02,143 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.93 vs. limit=5.0 +2022-11-16 01:33:04,323 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=62666.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:33:11,149 INFO [train.py:876] (2/4) Epoch 9, batch 4500, loss[loss=0.1725, simple_loss=0.1852, pruned_loss=0.07991, over 5475.00 frames. ], tot_loss[loss=0.1316, simple_loss=0.153, pruned_loss=0.05504, over 1083515.47 frames. ], batch size: 53, lr: 8.93e-03, grad_scale: 16.0 +2022-11-16 01:33:41,727 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.8355, 1.2606, 1.1019, 1.2135, 1.1732, 1.2132, 1.0454, 1.1043], + device='cuda:2'), covar=tensor([0.1997, 0.1049, 0.1350, 0.0774, 0.0997, 0.1141, 0.1066, 0.0480], + device='cuda:2'), in_proj_covar=tensor([0.0096, 0.0095, 0.0100, 0.0087, 0.0086, 0.0091, 0.0092, 0.0069], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 01:33:44,305 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.986e+01 1.583e+02 2.006e+02 2.339e+02 4.798e+02, threshold=4.012e+02, percent-clipped=3.0 +2022-11-16 01:33:45,183 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=62727.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:33:58,105 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=62747.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:34:16,533 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=62773.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:34:18,990 INFO [train.py:876] (2/4) Epoch 9, batch 4600, loss[loss=0.1091, simple_loss=0.1454, pruned_loss=0.03637, over 5568.00 frames. ], tot_loss[loss=0.1314, simple_loss=0.1534, pruned_loss=0.05469, over 1087822.06 frames. ], batch size: 14, lr: 8.92e-03, grad_scale: 16.0 +2022-11-16 01:34:19,703 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=62778.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:34:25,527 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.6649, 4.4367, 4.6942, 4.1404, 4.9004, 4.0723, 2.5703, 4.9782], + device='cuda:2'), covar=tensor([0.0258, 0.0464, 0.0314, 0.0412, 0.0292, 0.0657, 0.2164, 0.0288], + device='cuda:2'), in_proj_covar=tensor([0.0104, 0.0083, 0.0085, 0.0076, 0.0100, 0.0087, 0.0129, 0.0107], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 01:34:38,397 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.4481, 1.4435, 1.7417, 0.9986, 1.3357, 1.4464, 1.2877, 1.6553], + device='cuda:2'), covar=tensor([0.0036, 0.0043, 0.0032, 0.0038, 0.0035, 0.0027, 0.0034, 0.0030], + device='cuda:2'), in_proj_covar=tensor([0.0051, 0.0045, 0.0047, 0.0047, 0.0046, 0.0041, 0.0045, 0.0040], + device='cuda:2'), out_proj_covar=tensor([4.6543e-05, 4.0623e-05, 4.2411e-05, 4.2793e-05, 4.0986e-05, 3.6124e-05, + 4.1545e-05, 3.5356e-05], device='cuda:2') +2022-11-16 01:34:41,028 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.9425, 1.4354, 1.4240, 1.2736, 0.8998, 1.9693, 1.4872, 1.0055], + device='cuda:2'), covar=tensor([0.2514, 0.0850, 0.2313, 0.2738, 0.3260, 0.0502, 0.1802, 0.3116], + device='cuda:2'), in_proj_covar=tensor([0.0082, 0.0073, 0.0072, 0.0085, 0.0064, 0.0053, 0.0061, 0.0072], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-11-16 01:34:52,261 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.537e+01 1.546e+02 1.870e+02 2.337e+02 4.153e+02, threshold=3.740e+02, percent-clipped=1.0 +2022-11-16 01:34:52,349 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=62826.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:34:57,792 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=62834.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:35:05,000 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.7769, 4.1327, 3.6941, 4.0593, 4.1561, 3.4217, 3.7710, 3.4267], + device='cuda:2'), covar=tensor([0.0614, 0.0449, 0.1421, 0.0425, 0.0436, 0.0601, 0.0754, 0.0690], + device='cuda:2'), in_proj_covar=tensor([0.0130, 0.0169, 0.0266, 0.0167, 0.0207, 0.0167, 0.0180, 0.0166], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0004, 0.0002, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 01:35:11,129 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.97 vs. limit=2.0 +2022-11-16 01:35:27,135 INFO [train.py:876] (2/4) Epoch 9, batch 4700, loss[loss=0.1173, simple_loss=0.1447, pruned_loss=0.04495, over 5716.00 frames. ], tot_loss[loss=0.1282, simple_loss=0.1507, pruned_loss=0.05282, over 1081382.48 frames. ], batch size: 19, lr: 8.91e-03, grad_scale: 32.0 +2022-11-16 01:35:42,716 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=62901.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:36:00,632 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.037e+02 1.617e+02 2.054e+02 2.509e+02 4.948e+02, threshold=4.108e+02, percent-clipped=3.0 +2022-11-16 01:36:02,090 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=62929.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:36:09,995 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=62941.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:36:34,658 INFO [train.py:876] (2/4) Epoch 9, batch 4800, loss[loss=0.09181, simple_loss=0.1187, pruned_loss=0.03244, over 5032.00 frames. ], tot_loss[loss=0.1275, simple_loss=0.1504, pruned_loss=0.05224, over 1079134.61 frames. ], batch size: 5, lr: 8.91e-03, grad_scale: 16.0 +2022-11-16 01:36:34,703 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=62977.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:37:05,679 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=63022.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:37:09,061 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.223e+01 1.584e+02 1.833e+02 2.198e+02 5.399e+02, threshold=3.666e+02, percent-clipped=2.0 +2022-11-16 01:37:22,501 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=63047.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:37:42,694 INFO [train.py:876] (2/4) Epoch 9, batch 4900, loss[loss=0.1144, simple_loss=0.1472, pruned_loss=0.04076, over 5604.00 frames. ], tot_loss[loss=0.1294, simple_loss=0.1516, pruned_loss=0.05363, over 1076920.48 frames. ], batch size: 22, lr: 8.90e-03, grad_scale: 16.0 +2022-11-16 01:37:46,516 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.3614, 1.5210, 1.6065, 0.9995, 1.3216, 1.2894, 1.0722, 1.3760], + device='cuda:2'), covar=tensor([0.0041, 0.0040, 0.0037, 0.0046, 0.0043, 0.0038, 0.0055, 0.0081], + device='cuda:2'), in_proj_covar=tensor([0.0051, 0.0045, 0.0047, 0.0047, 0.0047, 0.0041, 0.0046, 0.0040], + device='cuda:2'), out_proj_covar=tensor([4.6478e-05, 4.0869e-05, 4.2289e-05, 4.2439e-05, 4.1056e-05, 3.6081e-05, + 4.1773e-05, 3.5473e-05], device='cuda:2') +2022-11-16 01:37:54,810 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=63095.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:38:00,880 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.3961, 1.1443, 1.4180, 0.9980, 1.2358, 1.2664, 1.0078, 0.7236], + device='cuda:2'), covar=tensor([0.0022, 0.0038, 0.0025, 0.0042, 0.0037, 0.0031, 0.0036, 0.0052], + device='cuda:2'), in_proj_covar=tensor([0.0021, 0.0021, 0.0022, 0.0028, 0.0025, 0.0023, 0.0027, 0.0026], + device='cuda:2'), out_proj_covar=tensor([1.9615e-05, 2.0425e-05, 2.0298e-05, 2.7832e-05, 2.3506e-05, 2.2196e-05, + 2.6248e-05, 2.6457e-05], device='cuda:2') +2022-11-16 01:38:17,347 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.017e+02 1.788e+02 2.171e+02 2.737e+02 6.659e+02, threshold=4.342e+02, percent-clipped=3.0 +2022-11-16 01:38:18,115 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=63129.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:38:49,086 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.2428, 4.3024, 2.8671, 4.0311, 3.3031, 2.9217, 2.3268, 3.7606], + device='cuda:2'), covar=tensor([0.1568, 0.0188, 0.1067, 0.0316, 0.0689, 0.0980, 0.1850, 0.0280], + device='cuda:2'), in_proj_covar=tensor([0.0162, 0.0137, 0.0162, 0.0142, 0.0176, 0.0172, 0.0169, 0.0155], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-16 01:38:49,665 INFO [train.py:876] (2/4) Epoch 9, batch 5000, loss[loss=0.1053, simple_loss=0.1292, pruned_loss=0.04067, over 5791.00 frames. ], tot_loss[loss=0.1278, simple_loss=0.1507, pruned_loss=0.05246, over 1085529.49 frames. ], batch size: 22, lr: 8.89e-03, grad_scale: 8.0 +2022-11-16 01:39:06,643 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=63201.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:39:16,550 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.33 vs. limit=2.0 +2022-11-16 01:39:19,720 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=63221.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:39:20,296 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.5275, 3.8931, 3.0479, 3.7922, 3.7253, 3.6257, 4.0309, 3.7760], + device='cuda:2'), covar=tensor([0.0839, 0.0789, 0.2101, 0.0844, 0.0985, 0.0442, 0.0521, 0.0531], + device='cuda:2'), in_proj_covar=tensor([0.0131, 0.0170, 0.0267, 0.0169, 0.0210, 0.0168, 0.0183, 0.0168], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0004, 0.0003, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 01:39:24,001 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.228e+01 1.555e+02 1.927e+02 2.355e+02 4.069e+02, threshold=3.855e+02, percent-clipped=0.0 +2022-11-16 01:39:32,915 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=63241.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:39:38,709 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=63249.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:39:52,771 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.8702, 3.5105, 2.2773, 3.1561, 2.5228, 2.3881, 1.7269, 3.0743], + device='cuda:2'), covar=tensor([0.1410, 0.0221, 0.1124, 0.0397, 0.1062, 0.1084, 0.2007, 0.0377], + device='cuda:2'), in_proj_covar=tensor([0.0161, 0.0137, 0.0162, 0.0141, 0.0176, 0.0172, 0.0169, 0.0156], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-16 01:39:57,147 INFO [train.py:876] (2/4) Epoch 9, batch 5100, loss[loss=0.1346, simple_loss=0.1643, pruned_loss=0.05244, over 5662.00 frames. ], tot_loss[loss=0.1268, simple_loss=0.1501, pruned_loss=0.05175, over 1089550.41 frames. ], batch size: 32, lr: 8.88e-03, grad_scale: 8.0 +2022-11-16 01:40:00,545 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=63282.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:40:05,000 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=63289.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:40:27,681 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=63322.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:40:31,503 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.064e+01 1.589e+02 1.945e+02 2.471e+02 3.864e+02, threshold=3.889e+02, percent-clipped=1.0 +2022-11-16 01:40:39,066 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.30 vs. limit=2.0 +2022-11-16 01:40:59,525 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=63370.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:41:04,333 INFO [train.py:876] (2/4) Epoch 9, batch 5200, loss[loss=0.1277, simple_loss=0.1466, pruned_loss=0.05443, over 5747.00 frames. ], tot_loss[loss=0.1253, simple_loss=0.1489, pruned_loss=0.05084, over 1087071.37 frames. ], batch size: 27, lr: 8.88e-03, grad_scale: 8.0 +2022-11-16 01:41:33,508 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=63420.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:41:34,802 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=63422.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:41:38,978 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.008e+02 1.457e+02 1.778e+02 2.243e+02 4.193e+02, threshold=3.556e+02, percent-clipped=1.0 +2022-11-16 01:41:39,821 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=63429.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:42:00,115 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1633, 2.0641, 2.5314, 1.5444, 1.1082, 3.0802, 2.2713, 2.1795], + device='cuda:2'), covar=tensor([0.1039, 0.1447, 0.0732, 0.2819, 0.4372, 0.0961, 0.2422, 0.1314], + device='cuda:2'), in_proj_covar=tensor([0.0083, 0.0075, 0.0074, 0.0085, 0.0064, 0.0055, 0.0064, 0.0073], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-11-16 01:42:11,809 INFO [train.py:876] (2/4) Epoch 9, batch 5300, loss[loss=0.1106, simple_loss=0.1395, pruned_loss=0.04086, over 5773.00 frames. ], tot_loss[loss=0.126, simple_loss=0.1495, pruned_loss=0.05128, over 1087993.93 frames. ], batch size: 26, lr: 8.87e-03, grad_scale: 8.0 +2022-11-16 01:42:11,846 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=63477.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:42:14,596 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=63481.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:42:16,189 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=63483.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:42:16,846 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.8905, 2.3482, 2.8511, 3.6739, 3.7425, 2.6779, 2.3401, 3.6578], + device='cuda:2'), covar=tensor([0.0703, 0.2924, 0.2185, 0.2685, 0.1003, 0.3264, 0.2245, 0.0686], + device='cuda:2'), in_proj_covar=tensor([0.0231, 0.0198, 0.0192, 0.0312, 0.0220, 0.0207, 0.0191, 0.0226], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0005], + device='cuda:2') +2022-11-16 01:42:24,689 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0568, 1.8819, 2.2860, 2.1136, 1.3623, 2.1001, 1.4329, 1.6960], + device='cuda:2'), covar=tensor([0.0172, 0.0093, 0.0087, 0.0116, 0.0252, 0.0099, 0.0239, 0.0137], + device='cuda:2'), in_proj_covar=tensor([0.0188, 0.0163, 0.0172, 0.0192, 0.0185, 0.0172, 0.0184, 0.0172], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 01:42:31,989 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.9134, 4.5167, 3.4726, 2.0863, 4.2821, 1.7757, 4.1453, 2.4385], + device='cuda:2'), covar=tensor([0.1174, 0.0109, 0.0652, 0.1759, 0.0165, 0.1671, 0.0182, 0.1546], + device='cuda:2'), in_proj_covar=tensor([0.0123, 0.0105, 0.0115, 0.0114, 0.0105, 0.0124, 0.0099, 0.0113], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 01:42:41,996 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.6784, 1.8587, 2.4777, 2.3510, 2.2321, 1.7812, 2.2759, 2.6217], + device='cuda:2'), covar=tensor([0.0745, 0.1695, 0.0743, 0.1196, 0.1033, 0.1544, 0.1032, 0.0897], + device='cuda:2'), in_proj_covar=tensor([0.0232, 0.0194, 0.0206, 0.0207, 0.0229, 0.0191, 0.0225, 0.0224], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 01:42:46,192 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.109e+01 1.553e+02 1.965e+02 2.267e+02 4.174e+02, threshold=3.929e+02, percent-clipped=2.0 +2022-11-16 01:43:13,466 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=63569.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:43:19,371 INFO [train.py:876] (2/4) Epoch 9, batch 5400, loss[loss=0.1761, simple_loss=0.1813, pruned_loss=0.08545, over 5594.00 frames. ], tot_loss[loss=0.1266, simple_loss=0.1498, pruned_loss=0.05174, over 1080349.44 frames. ], batch size: 50, lr: 8.86e-03, grad_scale: 8.0 +2022-11-16 01:43:19,442 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=63577.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:43:24,434 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0143, 1.2966, 2.0380, 1.5661, 1.7185, 1.6530, 1.7195, 1.5053], + device='cuda:2'), covar=tensor([0.0024, 0.0068, 0.0031, 0.0033, 0.0037, 0.0125, 0.0029, 0.0032], + device='cuda:2'), in_proj_covar=tensor([0.0022, 0.0021, 0.0023, 0.0029, 0.0025, 0.0023, 0.0027, 0.0027], + device='cuda:2'), out_proj_covar=tensor([1.9807e-05, 2.0392e-05, 2.0364e-05, 2.8351e-05, 2.3302e-05, 2.2671e-05, + 2.6128e-05, 2.6870e-05], device='cuda:2') +2022-11-16 01:43:25,758 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=63586.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:43:26,003 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.95 vs. limit=2.0 +2022-11-16 01:43:55,258 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.021e+02 1.654e+02 2.116e+02 2.560e+02 5.660e+02, threshold=4.233e+02, percent-clipped=4.0 +2022-11-16 01:43:55,490 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.5270, 0.7756, 1.0040, 0.6145, 0.7984, 0.6281, 0.4579, 0.9200], + device='cuda:2'), covar=tensor([0.0062, 0.0026, 0.0039, 0.0046, 0.0031, 0.0047, 0.0063, 0.0031], + device='cuda:2'), in_proj_covar=tensor([0.0052, 0.0046, 0.0048, 0.0048, 0.0047, 0.0043, 0.0046, 0.0041], + device='cuda:2'), out_proj_covar=tensor([4.6714e-05, 4.1406e-05, 4.2938e-05, 4.3396e-05, 4.1775e-05, 3.7015e-05, + 4.2299e-05, 3.6052e-05], device='cuda:2') +2022-11-16 01:43:56,868 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=63630.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:44:09,232 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=63647.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:44:18,454 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.6282, 1.6519, 2.3153, 1.6094, 0.9124, 2.7760, 2.1184, 1.9806], + device='cuda:2'), covar=tensor([0.1115, 0.1875, 0.1057, 0.2942, 0.4144, 0.0521, 0.1342, 0.1701], + device='cuda:2'), in_proj_covar=tensor([0.0081, 0.0074, 0.0072, 0.0084, 0.0063, 0.0053, 0.0062, 0.0072], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-11-16 01:44:30,044 INFO [train.py:876] (2/4) Epoch 9, batch 5500, loss[loss=0.09572, simple_loss=0.1313, pruned_loss=0.03007, over 5590.00 frames. ], tot_loss[loss=0.1291, simple_loss=0.1512, pruned_loss=0.05351, over 1073503.45 frames. ], batch size: 22, lr: 8.86e-03, grad_scale: 8.0 +2022-11-16 01:45:04,282 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.831e+01 1.633e+02 1.870e+02 2.382e+02 5.170e+02, threshold=3.741e+02, percent-clipped=1.0 +2022-11-16 01:45:36,944 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=63776.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:45:37,521 INFO [train.py:876] (2/4) Epoch 9, batch 5600, loss[loss=0.1026, simple_loss=0.1463, pruned_loss=0.02944, over 5565.00 frames. ], tot_loss[loss=0.1288, simple_loss=0.1513, pruned_loss=0.05319, over 1082218.70 frames. ], batch size: 25, lr: 8.85e-03, grad_scale: 8.0 +2022-11-16 01:45:38,275 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=63778.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:45:45,351 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.6057, 2.8281, 2.8776, 2.6559, 2.7678, 2.7102, 1.2926, 2.8575], + device='cuda:2'), covar=tensor([0.0951, 0.0619, 0.0620, 0.0617, 0.0803, 0.0789, 0.4244, 0.0842], + device='cuda:2'), in_proj_covar=tensor([0.0104, 0.0082, 0.0086, 0.0078, 0.0101, 0.0086, 0.0131, 0.0107], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 01:46:00,504 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=63810.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:46:12,218 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.070e+02 1.664e+02 2.053e+02 2.570e+02 4.903e+02, threshold=4.106e+02, percent-clipped=3.0 +2022-11-16 01:46:17,312 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.57 vs. limit=2.0 +2022-11-16 01:46:23,811 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=63845.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:46:41,790 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=63871.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:46:45,513 INFO [train.py:876] (2/4) Epoch 9, batch 5700, loss[loss=0.1112, simple_loss=0.1386, pruned_loss=0.04188, over 5684.00 frames. ], tot_loss[loss=0.129, simple_loss=0.1512, pruned_loss=0.05338, over 1083300.32 frames. ], batch size: 34, lr: 8.84e-03, grad_scale: 8.0 +2022-11-16 01:46:45,639 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=63877.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:46:58,402 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.52 vs. limit=2.0 +2022-11-16 01:47:05,279 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=63906.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:47:12,356 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=63916.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:47:18,379 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=63925.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:47:18,412 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=63925.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:47:20,209 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.040e+01 1.586e+02 1.908e+02 2.453e+02 4.306e+02, threshold=3.816e+02, percent-clipped=1.0 +2022-11-16 01:47:29,396 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=63942.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:47:53,511 INFO [train.py:876] (2/4) Epoch 9, batch 5800, loss[loss=0.1303, simple_loss=0.1535, pruned_loss=0.05351, over 5771.00 frames. ], tot_loss[loss=0.1307, simple_loss=0.1523, pruned_loss=0.05448, over 1080488.89 frames. ], batch size: 21, lr: 8.84e-03, grad_scale: 8.0 +2022-11-16 01:47:53,669 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=63977.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:48:06,706 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.6448, 1.2266, 1.5519, 0.9999, 1.4647, 1.4603, 1.1064, 0.8731], + device='cuda:2'), covar=tensor([0.0019, 0.0043, 0.0035, 0.0049, 0.0030, 0.0051, 0.0039, 0.0053], + device='cuda:2'), in_proj_covar=tensor([0.0022, 0.0022, 0.0023, 0.0030, 0.0025, 0.0024, 0.0028, 0.0028], + device='cuda:2'), out_proj_covar=tensor([2.0162e-05, 2.1242e-05, 2.1064e-05, 2.8896e-05, 2.3860e-05, 2.3133e-05, + 2.6979e-05, 2.7795e-05], device='cuda:2') +2022-11-16 01:48:25,315 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=64023.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:48:28,341 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.198e+01 1.501e+02 1.757e+02 2.288e+02 3.552e+02, threshold=3.514e+02, percent-clipped=0.0 +2022-11-16 01:48:56,305 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.8315, 0.8046, 1.0151, 0.6239, 0.7541, 0.9193, 0.6143, 1.1219], + device='cuda:2'), covar=tensor([0.0058, 0.0028, 0.0041, 0.0044, 0.0036, 0.0046, 0.0079, 0.0031], + device='cuda:2'), in_proj_covar=tensor([0.0050, 0.0044, 0.0046, 0.0046, 0.0046, 0.0041, 0.0044, 0.0039], + device='cuda:2'), out_proj_covar=tensor([4.5099e-05, 4.0059e-05, 4.1415e-05, 4.1255e-05, 4.0379e-05, 3.5368e-05, + 4.0304e-05, 3.4663e-05], device='cuda:2') +2022-11-16 01:49:00,496 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=64076.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:49:01,354 INFO [train.py:876] (2/4) Epoch 9, batch 5900, loss[loss=0.09428, simple_loss=0.124, pruned_loss=0.03229, over 5761.00 frames. ], tot_loss[loss=0.1314, simple_loss=0.1531, pruned_loss=0.05484, over 1080637.28 frames. ], batch size: 15, lr: 8.83e-03, grad_scale: 8.0 +2022-11-16 01:49:02,112 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=64078.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:49:06,069 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=64084.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:49:07,086 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.14 vs. limit=5.0 +2022-11-16 01:49:32,765 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=64124.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:49:34,400 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=64126.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:49:35,642 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.966e+01 1.656e+02 2.045e+02 2.527e+02 4.457e+02, threshold=4.090e+02, percent-clipped=4.0 +2022-11-16 01:49:54,696 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.35 vs. limit=2.0 +2022-11-16 01:49:54,939 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.3216, 3.3341, 3.3015, 3.3900, 3.2848, 2.9650, 3.7537, 3.3143], + device='cuda:2'), covar=tensor([0.0615, 0.0917, 0.0563, 0.1089, 0.0560, 0.0488, 0.0773, 0.0804], + device='cuda:2'), in_proj_covar=tensor([0.0084, 0.0106, 0.0089, 0.0115, 0.0085, 0.0076, 0.0140, 0.0097], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 01:50:00,883 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=64166.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:50:08,146 INFO [train.py:876] (2/4) Epoch 9, batch 6000, loss[loss=0.1506, simple_loss=0.1715, pruned_loss=0.06484, over 5567.00 frames. ], tot_loss[loss=0.1296, simple_loss=0.152, pruned_loss=0.05366, over 1083270.39 frames. ], batch size: 50, lr: 8.82e-03, grad_scale: 8.0 +2022-11-16 01:50:08,146 INFO [train.py:899] (2/4) Computing validation loss +2022-11-16 01:50:25,851 INFO [train.py:908] (2/4) Epoch 9, validation: loss=0.1648, simple_loss=0.1829, pruned_loss=0.07333, over 1530663.00 frames. +2022-11-16 01:50:25,852 INFO [train.py:909] (2/4) Maximum memory allocated so far is 4715MB +2022-11-16 01:50:42,278 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=64201.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:50:57,241 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.8736, 4.4103, 3.8374, 4.3668, 4.3864, 3.7795, 4.1033, 3.7146], + device='cuda:2'), covar=tensor([0.0666, 0.0509, 0.1736, 0.0494, 0.0442, 0.0518, 0.0530, 0.0580], + device='cuda:2'), in_proj_covar=tensor([0.0131, 0.0170, 0.0265, 0.0166, 0.0209, 0.0168, 0.0180, 0.0166], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0004, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-16 01:50:57,933 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=64225.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:50:59,715 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.067e+01 1.626e+02 1.990e+02 2.309e+02 5.533e+02, threshold=3.980e+02, percent-clipped=3.0 +2022-11-16 01:51:02,877 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=64232.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:51:10,126 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=64242.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:51:14,245 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.8201, 4.1248, 2.2978, 3.9904, 3.2444, 2.5274, 2.3899, 3.5941], + device='cuda:2'), covar=tensor([0.2419, 0.0379, 0.1866, 0.0438, 0.0928, 0.1671, 0.2352, 0.0467], + device='cuda:2'), in_proj_covar=tensor([0.0162, 0.0138, 0.0164, 0.0143, 0.0177, 0.0174, 0.0170, 0.0158], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-16 01:51:29,858 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=64272.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:51:30,475 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=64273.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:51:33,038 INFO [train.py:876] (2/4) Epoch 9, batch 6100, loss[loss=0.1026, simple_loss=0.1327, pruned_loss=0.03622, over 5521.00 frames. ], tot_loss[loss=0.1282, simple_loss=0.1512, pruned_loss=0.05259, over 1086337.09 frames. ], batch size: 13, lr: 8.82e-03, grad_scale: 8.0 +2022-11-16 01:51:41,915 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=64290.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:51:44,145 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=64293.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:52:07,148 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.015e+02 1.572e+02 1.862e+02 2.390e+02 5.405e+02, threshold=3.724e+02, percent-clipped=2.0 +2022-11-16 01:52:13,903 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.6876, 1.2680, 1.5929, 1.1213, 1.5683, 1.4893, 1.3165, 1.0025], + device='cuda:2'), covar=tensor([0.0025, 0.0042, 0.0044, 0.0052, 0.0043, 0.0040, 0.0033, 0.0043], + device='cuda:2'), in_proj_covar=tensor([0.0022, 0.0022, 0.0023, 0.0030, 0.0025, 0.0024, 0.0028, 0.0028], + device='cuda:2'), out_proj_covar=tensor([2.0344e-05, 2.0987e-05, 2.1230e-05, 2.9011e-05, 2.3960e-05, 2.3277e-05, + 2.7641e-05, 2.7701e-05], device='cuda:2') +2022-11-16 01:52:18,850 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=64345.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:52:24,688 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.2500, 2.5696, 3.3703, 4.0280, 4.0997, 3.3307, 2.8970, 4.1902], + device='cuda:2'), covar=tensor([0.0565, 0.3870, 0.2743, 0.3970, 0.1044, 0.3133, 0.2452, 0.0572], + device='cuda:2'), in_proj_covar=tensor([0.0235, 0.0201, 0.0193, 0.0314, 0.0222, 0.0206, 0.0194, 0.0234], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0005], + device='cuda:2') +2022-11-16 01:52:40,447 INFO [train.py:876] (2/4) Epoch 9, batch 6200, loss[loss=0.1838, simple_loss=0.1821, pruned_loss=0.09277, over 5486.00 frames. ], tot_loss[loss=0.1299, simple_loss=0.1518, pruned_loss=0.05404, over 1079237.44 frames. ], batch size: 58, lr: 8.81e-03, grad_scale: 8.0 +2022-11-16 01:52:41,777 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=64379.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:52:44,395 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.7218, 1.9188, 1.9304, 1.6951, 1.8926, 1.9004, 0.8067, 1.9680], + device='cuda:2'), covar=tensor([0.0365, 0.0290, 0.0333, 0.0343, 0.0287, 0.0359, 0.2096, 0.0370], + device='cuda:2'), in_proj_covar=tensor([0.0103, 0.0081, 0.0085, 0.0076, 0.0100, 0.0085, 0.0129, 0.0107], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 01:52:48,060 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.91 vs. limit=2.0 +2022-11-16 01:53:00,245 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=64406.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:53:14,912 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.123e+02 1.648e+02 1.944e+02 2.421e+02 4.973e+02, threshold=3.888e+02, percent-clipped=5.0 +2022-11-16 01:53:39,854 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=64465.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:53:40,800 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=64466.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:53:48,094 INFO [train.py:876] (2/4) Epoch 9, batch 6300, loss[loss=0.1349, simple_loss=0.1591, pruned_loss=0.0554, over 5739.00 frames. ], tot_loss[loss=0.129, simple_loss=0.1512, pruned_loss=0.05341, over 1079752.73 frames. ], batch size: 20, lr: 8.80e-03, grad_scale: 8.0 +2022-11-16 01:54:03,902 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=64501.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:54:12,489 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=64514.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:54:21,622 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=64526.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:54:22,703 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.211e+02 1.573e+02 1.938e+02 2.415e+02 6.064e+02, threshold=3.877e+02, percent-clipped=4.0 +2022-11-16 01:54:36,387 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=64549.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:54:44,566 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.8768, 3.4829, 2.3566, 3.2660, 2.5417, 2.4298, 1.8519, 2.9473], + device='cuda:2'), covar=tensor([0.1565, 0.0305, 0.1335, 0.0406, 0.1266, 0.1145, 0.2097, 0.0480], + device='cuda:2'), in_proj_covar=tensor([0.0161, 0.0138, 0.0164, 0.0143, 0.0175, 0.0171, 0.0168, 0.0157], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-16 01:54:50,262 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.5309, 2.4688, 2.6093, 3.5952, 3.4038, 2.5128, 2.1106, 3.5317], + device='cuda:2'), covar=tensor([0.0849, 0.2372, 0.2053, 0.1915, 0.1155, 0.2976, 0.2258, 0.0703], + device='cuda:2'), in_proj_covar=tensor([0.0234, 0.0199, 0.0193, 0.0311, 0.0220, 0.0205, 0.0191, 0.0233], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0005], + device='cuda:2') +2022-11-16 01:54:52,156 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=64572.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:54:55,721 INFO [train.py:876] (2/4) Epoch 9, batch 6400, loss[loss=0.1342, simple_loss=0.1515, pruned_loss=0.05843, over 5062.00 frames. ], tot_loss[loss=0.1287, simple_loss=0.1512, pruned_loss=0.05312, over 1086375.16 frames. ], batch size: 91, lr: 8.80e-03, grad_scale: 8.0 +2022-11-16 01:55:01,693 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.7710, 2.5708, 2.6643, 2.4391, 2.8377, 2.7143, 2.7025, 2.7360], + device='cuda:2'), covar=tensor([0.0454, 0.0501, 0.0484, 0.0538, 0.0475, 0.0286, 0.0399, 0.0672], + device='cuda:2'), in_proj_covar=tensor([0.0131, 0.0138, 0.0103, 0.0137, 0.0160, 0.0093, 0.0116, 0.0141], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-16 01:55:03,014 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=64588.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:55:03,719 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=64589.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:55:24,424 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=64620.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:55:24,467 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.0529, 4.5288, 4.1245, 4.5332, 4.5687, 3.9174, 4.1262, 3.9414], + device='cuda:2'), covar=tensor([0.0393, 0.0408, 0.1247, 0.0384, 0.0384, 0.0478, 0.0443, 0.0482], + device='cuda:2'), in_proj_covar=tensor([0.0130, 0.0171, 0.0265, 0.0166, 0.0209, 0.0169, 0.0180, 0.0166], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0004, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-16 01:55:30,096 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.065e+02 1.639e+02 1.887e+02 2.519e+02 5.774e+02, threshold=3.775e+02, percent-clipped=3.0 +2022-11-16 01:55:40,434 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.1495, 4.2233, 4.2785, 4.4126, 3.8382, 3.7612, 4.8497, 4.1993], + device='cuda:2'), covar=tensor([0.0476, 0.0965, 0.0354, 0.1052, 0.0483, 0.0366, 0.0709, 0.0461], + device='cuda:2'), in_proj_covar=tensor([0.0084, 0.0105, 0.0089, 0.0115, 0.0084, 0.0075, 0.0141, 0.0097], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 01:55:43,727 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=64648.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:55:44,272 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.7980, 4.2678, 4.6845, 4.3013, 4.8235, 4.7260, 4.2206, 4.8030], + device='cuda:2'), covar=tensor([0.0340, 0.0312, 0.0327, 0.0309, 0.0379, 0.0175, 0.0294, 0.0263], + device='cuda:2'), in_proj_covar=tensor([0.0133, 0.0139, 0.0102, 0.0138, 0.0162, 0.0093, 0.0117, 0.0142], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-16 01:55:45,010 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=64650.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:56:02,884 INFO [train.py:876] (2/4) Epoch 9, batch 6500, loss[loss=0.1238, simple_loss=0.1569, pruned_loss=0.04536, over 5742.00 frames. ], tot_loss[loss=0.1273, simple_loss=0.15, pruned_loss=0.05224, over 1083705.38 frames. ], batch size: 27, lr: 8.79e-03, grad_scale: 8.0 +2022-11-16 01:56:04,251 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=64679.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:56:15,329 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.01 vs. limit=2.0 +2022-11-16 01:56:19,470 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=64701.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:56:20,839 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=64703.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:56:24,835 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=64709.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:56:35,500 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.0455, 2.7689, 2.5483, 3.1038, 2.6866, 3.6176, 3.1575, 3.0947], + device='cuda:2'), covar=tensor([0.0752, 0.1714, 0.2344, 0.1679, 0.1947, 0.0667, 0.1482, 0.5807], + device='cuda:2'), in_proj_covar=tensor([0.0098, 0.0097, 0.0101, 0.0090, 0.0088, 0.0092, 0.0094, 0.0071], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 01:56:36,723 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=64727.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:56:37,278 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.652e+01 1.582e+02 1.894e+02 2.392e+02 5.037e+02, threshold=3.789e+02, percent-clipped=1.0 +2022-11-16 01:56:49,585 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=64745.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:56:54,305 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=64752.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:57:02,050 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=64764.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:57:10,225 INFO [train.py:876] (2/4) Epoch 9, batch 6600, loss[loss=0.1128, simple_loss=0.1497, pruned_loss=0.03791, over 5594.00 frames. ], tot_loss[loss=0.1247, simple_loss=0.1483, pruned_loss=0.05055, over 1085036.64 frames. ], batch size: 25, lr: 8.78e-03, grad_scale: 8.0 +2022-11-16 01:57:30,814 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=64806.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:57:35,342 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=64813.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:57:40,464 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=64821.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:57:44,982 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.136e+02 1.581e+02 1.869e+02 2.492e+02 4.006e+02, threshold=3.739e+02, percent-clipped=2.0 +2022-11-16 01:57:51,484 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1196, 1.9973, 2.4883, 1.6203, 1.2269, 3.1960, 2.3547, 2.1640], + device='cuda:2'), covar=tensor([0.1140, 0.1395, 0.0852, 0.3006, 0.3933, 0.0923, 0.1109, 0.1506], + device='cuda:2'), in_proj_covar=tensor([0.0085, 0.0076, 0.0074, 0.0088, 0.0066, 0.0054, 0.0063, 0.0074], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-11-16 01:57:56,909 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.4837, 4.5336, 4.7309, 4.8563, 4.3031, 3.8928, 5.2093, 4.6202], + device='cuda:2'), covar=tensor([0.0472, 0.1001, 0.0340, 0.1156, 0.0466, 0.0402, 0.0657, 0.0539], + device='cuda:2'), in_proj_covar=tensor([0.0085, 0.0106, 0.0090, 0.0116, 0.0086, 0.0076, 0.0143, 0.0099], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 01:58:10,999 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1237, 3.3611, 2.5418, 1.6219, 3.1663, 1.2208, 3.2143, 1.6466], + device='cuda:2'), covar=tensor([0.1459, 0.0194, 0.0962, 0.2210, 0.0299, 0.2394, 0.0313, 0.1895], + device='cuda:2'), in_proj_covar=tensor([0.0121, 0.0104, 0.0114, 0.0113, 0.0103, 0.0122, 0.0099, 0.0113], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 01:58:18,046 INFO [train.py:876] (2/4) Epoch 9, batch 6700, loss[loss=0.1111, simple_loss=0.1446, pruned_loss=0.03873, over 5531.00 frames. ], tot_loss[loss=0.125, simple_loss=0.1486, pruned_loss=0.05077, over 1089879.04 frames. ], batch size: 21, lr: 8.77e-03, grad_scale: 8.0 +2022-11-16 01:58:25,444 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=64888.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:58:40,096 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.0530, 4.5382, 4.8187, 4.5959, 5.0888, 5.0221, 4.3950, 5.0814], + device='cuda:2'), covar=tensor([0.0288, 0.0339, 0.0398, 0.0316, 0.0318, 0.0145, 0.0251, 0.0244], + device='cuda:2'), in_proj_covar=tensor([0.0132, 0.0140, 0.0103, 0.0138, 0.0161, 0.0093, 0.0116, 0.0140], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-16 01:58:52,431 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.525e+01 1.574e+02 2.010e+02 2.472e+02 4.884e+02, threshold=4.021e+02, percent-clipped=4.0 +2022-11-16 01:58:57,701 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=64936.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:59:03,713 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=64945.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:59:25,580 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=64976.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:59:26,079 INFO [train.py:876] (2/4) Epoch 9, batch 6800, loss[loss=0.1203, simple_loss=0.141, pruned_loss=0.04979, over 5566.00 frames. ], tot_loss[loss=0.1262, simple_loss=0.1498, pruned_loss=0.05128, over 1092166.00 frames. ], batch size: 43, lr: 8.77e-03, grad_scale: 8.0 +2022-11-16 01:59:45,806 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=65001.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 01:59:48,101 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=65004.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:00:03,706 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.16 vs. limit=5.0 +2022-11-16 02:00:04,568 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.135e+02 1.642e+02 2.032e+02 2.678e+02 4.129e+02, threshold=4.063e+02, percent-clipped=1.0 +2022-11-16 02:00:10,609 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=65037.0, num_to_drop=1, layers_to_drop={3} +2022-11-16 02:00:18,372 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=65049.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:00:22,724 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=65055.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:00:25,194 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=65059.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:00:33,156 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.8268, 4.8586, 5.0390, 4.9381, 4.5007, 4.2917, 5.5353, 4.9101], + device='cuda:2'), covar=tensor([0.0356, 0.0743, 0.0298, 0.1041, 0.0432, 0.0362, 0.0492, 0.0594], + device='cuda:2'), in_proj_covar=tensor([0.0083, 0.0103, 0.0087, 0.0113, 0.0083, 0.0074, 0.0138, 0.0097], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 02:00:37,936 INFO [train.py:876] (2/4) Epoch 9, batch 6900, loss[loss=0.101, simple_loss=0.1307, pruned_loss=0.03567, over 5743.00 frames. ], tot_loss[loss=0.1262, simple_loss=0.1501, pruned_loss=0.05112, over 1087281.36 frames. ], batch size: 14, lr: 8.76e-03, grad_scale: 8.0 +2022-11-16 02:00:53,226 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.2294, 5.0790, 3.7799, 2.1954, 4.5174, 2.2637, 4.5422, 2.9682], + device='cuda:2'), covar=tensor([0.1134, 0.0105, 0.0464, 0.2061, 0.0206, 0.1585, 0.0220, 0.1272], + device='cuda:2'), in_proj_covar=tensor([0.0120, 0.0102, 0.0113, 0.0112, 0.0102, 0.0121, 0.0098, 0.0112], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 02:00:53,904 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=65101.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:00:58,441 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=65108.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:01:04,196 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=65116.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:01:07,819 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=65121.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:01:12,219 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.2680, 2.9763, 3.1789, 1.6386, 2.8574, 3.3671, 3.2782, 3.7964], + device='cuda:2'), covar=tensor([0.2045, 0.1688, 0.1409, 0.3203, 0.0487, 0.0925, 0.0460, 0.0738], + device='cuda:2'), in_proj_covar=tensor([0.0167, 0.0179, 0.0163, 0.0185, 0.0173, 0.0190, 0.0157, 0.0183], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-11-16 02:01:12,982 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.038e+02 1.555e+02 1.818e+02 2.213e+02 4.720e+02, threshold=3.636e+02, percent-clipped=2.0 +2022-11-16 02:01:15,841 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.25 vs. limit=2.0 +2022-11-16 02:01:32,678 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0668, 1.8628, 2.0547, 2.1048, 1.8806, 1.6171, 1.7526, 2.3055], + device='cuda:2'), covar=tensor([0.1551, 0.2326, 0.2312, 0.1809, 0.2174, 0.2502, 0.2072, 0.1220], + device='cuda:2'), in_proj_covar=tensor([0.0097, 0.0096, 0.0099, 0.0089, 0.0086, 0.0091, 0.0094, 0.0070], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 02:01:40,199 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=65169.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:01:45,809 INFO [train.py:876] (2/4) Epoch 9, batch 7000, loss[loss=0.183, simple_loss=0.1951, pruned_loss=0.08546, over 5638.00 frames. ], tot_loss[loss=0.1275, simple_loss=0.1509, pruned_loss=0.05204, over 1085438.09 frames. ], batch size: 38, lr: 8.75e-03, grad_scale: 16.0 +2022-11-16 02:02:16,379 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.6530, 2.3900, 1.9665, 1.9525, 1.2510, 1.9886, 1.5213, 2.1556], + device='cuda:2'), covar=tensor([0.1089, 0.0322, 0.0903, 0.0673, 0.2160, 0.0923, 0.1667, 0.0575], + device='cuda:2'), in_proj_covar=tensor([0.0160, 0.0138, 0.0162, 0.0141, 0.0173, 0.0173, 0.0168, 0.0156], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-16 02:02:19,940 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.109e+02 1.702e+02 2.050e+02 2.485e+02 3.887e+02, threshold=4.100e+02, percent-clipped=2.0 +2022-11-16 02:02:32,227 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=65245.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:02:41,338 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.3860, 5.0955, 3.7675, 2.1517, 4.7933, 2.0963, 4.8153, 2.9095], + device='cuda:2'), covar=tensor([0.1170, 0.0153, 0.0549, 0.2202, 0.0201, 0.1933, 0.0165, 0.1566], + device='cuda:2'), in_proj_covar=tensor([0.0122, 0.0103, 0.0113, 0.0111, 0.0102, 0.0121, 0.0098, 0.0112], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 02:02:53,106 INFO [train.py:876] (2/4) Epoch 9, batch 7100, loss[loss=0.113, simple_loss=0.1464, pruned_loss=0.03981, over 5481.00 frames. ], tot_loss[loss=0.128, simple_loss=0.1508, pruned_loss=0.05257, over 1079225.33 frames. ], batch size: 17, lr: 8.75e-03, grad_scale: 16.0 +2022-11-16 02:02:58,817 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=65285.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:03:04,615 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=65293.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:03:07,706 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.56 vs. limit=5.0 +2022-11-16 02:03:12,358 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=65304.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:03:28,151 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.907e+01 1.625e+02 1.964e+02 2.489e+02 4.704e+02, threshold=3.927e+02, percent-clipped=1.0 +2022-11-16 02:03:31,219 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=65332.0, num_to_drop=1, layers_to_drop={3} +2022-11-16 02:03:41,020 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=65346.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:03:45,050 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=65352.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:03:49,612 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=65359.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:04:01,558 INFO [train.py:876] (2/4) Epoch 9, batch 7200, loss[loss=0.1711, simple_loss=0.1908, pruned_loss=0.07569, over 5559.00 frames. ], tot_loss[loss=0.1283, simple_loss=0.1506, pruned_loss=0.05296, over 1072187.91 frames. ], batch size: 46, lr: 8.74e-03, grad_scale: 16.0 +2022-11-16 02:04:08,911 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.7727, 3.4136, 3.6249, 3.3571, 3.8396, 3.7195, 3.5674, 3.7863], + device='cuda:2'), covar=tensor([0.0370, 0.0375, 0.0429, 0.0424, 0.0389, 0.0203, 0.0301, 0.0450], + device='cuda:2'), in_proj_covar=tensor([0.0133, 0.0141, 0.0104, 0.0139, 0.0162, 0.0093, 0.0117, 0.0142], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-16 02:04:18,244 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=65401.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:04:22,062 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=65407.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:04:22,790 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=65408.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:04:24,640 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=65411.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:04:27,824 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.2881, 4.2296, 4.3918, 4.3541, 4.0409, 3.9413, 4.8881, 4.4318], + device='cuda:2'), covar=tensor([0.0446, 0.0948, 0.0362, 0.1453, 0.0424, 0.0351, 0.0750, 0.0514], + device='cuda:2'), in_proj_covar=tensor([0.0085, 0.0105, 0.0089, 0.0116, 0.0085, 0.0075, 0.0140, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 02:04:29,430 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.26 vs. limit=5.0 +2022-11-16 02:04:30,817 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.34 vs. limit=2.0 +2022-11-16 02:04:35,328 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.623e+01 1.548e+02 1.861e+02 2.163e+02 4.412e+02, threshold=3.722e+02, percent-clipped=1.0 +2022-11-16 02:05:33,812 INFO [train.py:876] (2/4) Epoch 10, batch 0, loss[loss=0.1464, simple_loss=0.1664, pruned_loss=0.06323, over 5564.00 frames. ], tot_loss[loss=0.1464, simple_loss=0.1664, pruned_loss=0.06323, over 5564.00 frames. ], batch size: 24, lr: 8.31e-03, grad_scale: 16.0 +2022-11-16 02:05:33,813 INFO [train.py:899] (2/4) Computing validation loss +2022-11-16 02:05:40,970 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0417, 1.3683, 2.1455, 1.5156, 1.4648, 1.9762, 1.7617, 1.6743], + device='cuda:2'), covar=tensor([0.0027, 0.0127, 0.0017, 0.0068, 0.0056, 0.0055, 0.0032, 0.0044], + device='cuda:2'), in_proj_covar=tensor([0.0023, 0.0023, 0.0024, 0.0030, 0.0026, 0.0024, 0.0029, 0.0028], + device='cuda:2'), out_proj_covar=tensor([2.1427e-05, 2.1643e-05, 2.1729e-05, 2.9859e-05, 2.4173e-05, 2.3394e-05, + 2.7580e-05, 2.8184e-05], device='cuda:2') +2022-11-16 02:05:43,061 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.7805, 3.6267, 3.5455, 3.5446, 3.8777, 3.7472, 3.8009, 3.8854], + device='cuda:2'), covar=tensor([0.0430, 0.0387, 0.0509, 0.0474, 0.0458, 0.0262, 0.0258, 0.0394], + device='cuda:2'), in_proj_covar=tensor([0.0131, 0.0139, 0.0104, 0.0137, 0.0161, 0.0092, 0.0116, 0.0140], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-16 02:05:50,440 INFO [train.py:908] (2/4) Epoch 10, validation: loss=0.1665, simple_loss=0.1839, pruned_loss=0.07458, over 1530663.00 frames. +2022-11-16 02:05:50,440 INFO [train.py:909] (2/4) Maximum memory allocated so far is 4715MB +2022-11-16 02:05:50,491 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=65449.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:05:55,449 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=65456.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:06:11,539 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.79 vs. limit=2.0 +2022-11-16 02:06:24,049 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.7418, 2.7467, 2.5587, 2.9892, 2.3574, 2.4487, 2.7462, 3.2104], + device='cuda:2'), covar=tensor([0.0829, 0.1531, 0.2039, 0.0995, 0.1392, 0.1011, 0.1322, 0.1664], + device='cuda:2'), in_proj_covar=tensor([0.0096, 0.0095, 0.0098, 0.0090, 0.0085, 0.0091, 0.0093, 0.0070], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 02:06:43,807 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.371e+01 1.569e+02 1.978e+02 2.490e+02 6.089e+02, threshold=3.956e+02, percent-clipped=4.0 +2022-11-16 02:06:44,733 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0667, 1.9473, 2.3327, 2.1384, 1.2826, 1.8805, 1.5358, 1.6036], + device='cuda:2'), covar=tensor([0.0162, 0.0090, 0.0099, 0.0130, 0.0263, 0.0137, 0.0258, 0.0163], + device='cuda:2'), in_proj_covar=tensor([0.0187, 0.0164, 0.0174, 0.0196, 0.0185, 0.0173, 0.0184, 0.0175], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 02:06:46,658 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=65532.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 02:06:57,657 INFO [train.py:876] (2/4) Epoch 10, batch 100, loss[loss=0.1094, simple_loss=0.1453, pruned_loss=0.03669, over 5801.00 frames. ], tot_loss[loss=0.1243, simple_loss=0.1479, pruned_loss=0.05031, over 426126.89 frames. ], batch size: 21, lr: 8.30e-03, grad_scale: 16.0 +2022-11-16 02:07:22,733 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.37 vs. limit=2.0 +2022-11-16 02:07:27,703 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=65593.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 02:07:31,592 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.8498, 4.7744, 4.7123, 4.9471, 4.3306, 4.0767, 5.4944, 4.7000], + device='cuda:2'), covar=tensor([0.0360, 0.0709, 0.0250, 0.1066, 0.0412, 0.0271, 0.0687, 0.0561], + device='cuda:2'), in_proj_covar=tensor([0.0084, 0.0104, 0.0088, 0.0114, 0.0084, 0.0074, 0.0138, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 02:07:38,637 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.9570, 2.1465, 2.3308, 3.2318, 3.0256, 2.4635, 2.1545, 3.1893], + device='cuda:2'), covar=tensor([0.1177, 0.2825, 0.2186, 0.2123, 0.1280, 0.2830, 0.2143, 0.0819], + device='cuda:2'), in_proj_covar=tensor([0.0238, 0.0204, 0.0197, 0.0314, 0.0225, 0.0210, 0.0193, 0.0239], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0005], + device='cuda:2') +2022-11-16 02:07:51,875 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.404e+01 1.672e+02 2.008e+02 2.472e+02 6.251e+02, threshold=4.017e+02, percent-clipped=3.0 +2022-11-16 02:07:54,629 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=65632.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:08:00,535 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=65641.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:08:05,595 INFO [train.py:876] (2/4) Epoch 10, batch 200, loss[loss=0.1628, simple_loss=0.1747, pruned_loss=0.07547, over 5513.00 frames. ], tot_loss[loss=0.126, simple_loss=0.1499, pruned_loss=0.05108, over 691176.28 frames. ], batch size: 46, lr: 8.30e-03, grad_scale: 16.0 +2022-11-16 02:08:26,067 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=65680.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:08:47,441 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=65711.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:08:58,246 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.788e+01 1.559e+02 1.915e+02 2.411e+02 5.348e+02, threshold=3.830e+02, percent-clipped=2.0 +2022-11-16 02:09:03,891 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.7022, 2.0600, 3.2730, 2.8047, 3.4843, 2.2265, 3.0255, 3.6858], + device='cuda:2'), covar=tensor([0.0759, 0.1899, 0.0946, 0.1707, 0.0773, 0.1906, 0.1533, 0.0985], + device='cuda:2'), in_proj_covar=tensor([0.0233, 0.0192, 0.0208, 0.0206, 0.0229, 0.0190, 0.0224, 0.0226], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 02:09:05,133 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.34 vs. limit=2.0 +2022-11-16 02:09:12,972 INFO [train.py:876] (2/4) Epoch 10, batch 300, loss[loss=0.1463, simple_loss=0.1564, pruned_loss=0.06813, over 5181.00 frames. ], tot_loss[loss=0.1265, simple_loss=0.1502, pruned_loss=0.05142, over 849355.07 frames. ], batch size: 91, lr: 8.29e-03, grad_scale: 16.0 +2022-11-16 02:09:19,521 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=65759.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:09:30,575 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.48 vs. limit=2.0 +2022-11-16 02:09:36,203 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.83 vs. limit=2.0 +2022-11-16 02:09:37,577 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.2150, 3.2925, 2.5131, 1.7419, 3.2126, 1.1884, 3.1545, 1.5963], + device='cuda:2'), covar=tensor([0.1215, 0.0195, 0.0913, 0.1710, 0.0244, 0.2148, 0.0248, 0.1663], + device='cuda:2'), in_proj_covar=tensor([0.0121, 0.0103, 0.0112, 0.0113, 0.0102, 0.0123, 0.0099, 0.0111], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 02:10:05,914 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.600e+01 1.581e+02 1.964e+02 2.494e+02 5.554e+02, threshold=3.929e+02, percent-clipped=0.0 +2022-11-16 02:10:12,711 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=65838.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 02:10:20,651 INFO [train.py:876] (2/4) Epoch 10, batch 400, loss[loss=0.07931, simple_loss=0.1167, pruned_loss=0.02096, over 5755.00 frames. ], tot_loss[loss=0.1296, simple_loss=0.1521, pruned_loss=0.05353, over 936859.37 frames. ], batch size: 13, lr: 8.28e-03, grad_scale: 16.0 +2022-11-16 02:10:31,436 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.5050, 4.7066, 3.1881, 4.5494, 3.5721, 3.2802, 2.7934, 4.1182], + device='cuda:2'), covar=tensor([0.1412, 0.0207, 0.1041, 0.0251, 0.0604, 0.0862, 0.1710, 0.0271], + device='cuda:2'), in_proj_covar=tensor([0.0162, 0.0142, 0.0162, 0.0142, 0.0176, 0.0174, 0.0171, 0.0160], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-16 02:10:46,441 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=65888.0, num_to_drop=1, layers_to_drop={3} +2022-11-16 02:10:53,792 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=65899.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 02:10:55,364 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.73 vs. limit=5.0 +2022-11-16 02:11:08,706 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.4560, 3.5755, 3.4653, 3.6320, 3.2036, 3.1276, 4.0736, 3.6060], + device='cuda:2'), covar=tensor([0.0536, 0.0792, 0.0539, 0.0895, 0.0619, 0.0446, 0.0621, 0.0623], + device='cuda:2'), in_proj_covar=tensor([0.0084, 0.0103, 0.0088, 0.0113, 0.0083, 0.0074, 0.0139, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 02:11:12,318 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.19 vs. limit=2.0 +2022-11-16 02:11:13,839 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.863e+01 1.601e+02 1.999e+02 2.596e+02 7.493e+02, threshold=3.998e+02, percent-clipped=4.0 +2022-11-16 02:11:22,469 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=65941.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:11:26,496 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.9968, 3.6070, 2.5220, 3.4093, 2.6975, 2.5920, 1.9896, 3.1558], + device='cuda:2'), covar=tensor([0.1650, 0.0289, 0.1230, 0.0418, 0.1100, 0.1227, 0.2087, 0.0488], + device='cuda:2'), in_proj_covar=tensor([0.0160, 0.0140, 0.0160, 0.0141, 0.0174, 0.0172, 0.0169, 0.0157], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-16 02:11:27,627 INFO [train.py:876] (2/4) Epoch 10, batch 500, loss[loss=0.108, simple_loss=0.143, pruned_loss=0.03655, over 5536.00 frames. ], tot_loss[loss=0.1277, simple_loss=0.1513, pruned_loss=0.05199, over 999225.37 frames. ], batch size: 15, lr: 8.28e-03, grad_scale: 16.0 +2022-11-16 02:11:55,177 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=65989.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:12:21,981 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.783e+01 1.631e+02 2.001e+02 2.434e+02 4.617e+02, threshold=4.002e+02, percent-clipped=2.0 +2022-11-16 02:12:31,680 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.74 vs. limit=2.0 +2022-11-16 02:12:35,743 INFO [train.py:876] (2/4) Epoch 10, batch 600, loss[loss=0.1109, simple_loss=0.1403, pruned_loss=0.04077, over 5613.00 frames. ], tot_loss[loss=0.1274, simple_loss=0.1512, pruned_loss=0.05182, over 1035640.06 frames. ], batch size: 24, lr: 8.27e-03, grad_scale: 16.0 +2022-11-16 02:12:57,211 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.9345, 2.2316, 3.5165, 3.0001, 3.6734, 2.5890, 3.3384, 3.9124], + device='cuda:2'), covar=tensor([0.0587, 0.1917, 0.0904, 0.1594, 0.0630, 0.1568, 0.1236, 0.0917], + device='cuda:2'), in_proj_covar=tensor([0.0231, 0.0190, 0.0206, 0.0204, 0.0228, 0.0190, 0.0221, 0.0226], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 02:13:00,140 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.7635, 0.3918, 0.7774, 0.5875, 0.7234, 0.7651, 0.3657, 0.7712], + device='cuda:2'), covar=tensor([0.0192, 0.0205, 0.0148, 0.0213, 0.0165, 0.0156, 0.0440, 0.0160], + device='cuda:2'), in_proj_covar=tensor([0.0013, 0.0019, 0.0013, 0.0017, 0.0014, 0.0013, 0.0018, 0.0013], + device='cuda:2'), out_proj_covar=tensor([6.7632e-05, 9.2580e-05, 6.9585e-05, 8.3507e-05, 7.4127e-05, 6.8334e-05, + 8.6098e-05, 6.7936e-05], device='cuda:2') +2022-11-16 02:13:22,379 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.82 vs. limit=2.0 +2022-11-16 02:13:27,938 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.503e+01 1.627e+02 2.016e+02 2.685e+02 4.936e+02, threshold=4.031e+02, percent-clipped=2.0 +2022-11-16 02:13:39,964 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=66144.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:13:43,123 INFO [train.py:876] (2/4) Epoch 10, batch 700, loss[loss=0.1273, simple_loss=0.1438, pruned_loss=0.05538, over 5562.00 frames. ], tot_loss[loss=0.125, simple_loss=0.1496, pruned_loss=0.05018, over 1062570.51 frames. ], batch size: 43, lr: 8.26e-03, grad_scale: 16.0 +2022-11-16 02:13:50,238 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=66160.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 02:13:53,166 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.60 vs. limit=2.0 +2022-11-16 02:14:08,604 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=66188.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 02:14:12,740 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=66194.0, num_to_drop=1, layers_to_drop={2} +2022-11-16 02:14:21,170 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=66205.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 02:14:31,684 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=66221.0, num_to_drop=1, layers_to_drop={3} +2022-11-16 02:14:35,985 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.700e+01 1.569e+02 1.911e+02 2.505e+02 4.164e+02, threshold=3.822e+02, percent-clipped=1.0 +2022-11-16 02:14:41,298 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=66236.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 02:14:50,399 INFO [train.py:876] (2/4) Epoch 10, batch 800, loss[loss=0.1654, simple_loss=0.1791, pruned_loss=0.07584, over 5672.00 frames. ], tot_loss[loss=0.1258, simple_loss=0.1499, pruned_loss=0.05082, over 1074495.19 frames. ], batch size: 36, lr: 8.26e-03, grad_scale: 16.0 +2022-11-16 02:14:57,087 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=66258.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:15:38,161 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=66319.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:15:42,081 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.5134, 3.0311, 3.3338, 1.6731, 3.1504, 3.8671, 3.6292, 3.9467], + device='cuda:2'), covar=tensor([0.1716, 0.1489, 0.0902, 0.2906, 0.0691, 0.0505, 0.0324, 0.0477], + device='cuda:2'), in_proj_covar=tensor([0.0172, 0.0185, 0.0166, 0.0191, 0.0179, 0.0195, 0.0162, 0.0189], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-11-16 02:15:43,768 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.006e+02 1.565e+02 1.875e+02 2.256e+02 4.217e+02, threshold=3.749e+02, percent-clipped=3.0 +2022-11-16 02:15:57,485 INFO [train.py:876] (2/4) Epoch 10, batch 900, loss[loss=0.1231, simple_loss=0.1524, pruned_loss=0.04686, over 5471.00 frames. ], tot_loss[loss=0.1232, simple_loss=0.1484, pruned_loss=0.04895, over 1081034.93 frames. ], batch size: 17, lr: 8.25e-03, grad_scale: 16.0 +2022-11-16 02:16:23,193 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.2460, 1.0675, 1.6018, 0.9851, 1.8086, 1.5478, 1.0034, 1.1074], + device='cuda:2'), covar=tensor([0.0786, 0.0944, 0.0233, 0.1124, 0.0223, 0.1209, 0.1020, 0.0550], + device='cuda:2'), in_proj_covar=tensor([0.0012, 0.0019, 0.0013, 0.0017, 0.0014, 0.0013, 0.0017, 0.0013], + device='cuda:2'), out_proj_covar=tensor([6.6639e-05, 9.1337e-05, 6.8264e-05, 8.1981e-05, 7.2723e-05, 6.7516e-05, + 8.4663e-05, 6.7545e-05], device='cuda:2') +2022-11-16 02:16:51,649 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.310e+02 1.944e+02 2.274e+02 2.934e+02 5.796e+02, threshold=4.548e+02, percent-clipped=10.0 +2022-11-16 02:16:58,357 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.6450, 4.6688, 3.5414, 1.8436, 4.2629, 1.8648, 4.2626, 2.3573], + device='cuda:2'), covar=tensor([0.1335, 0.0123, 0.0507, 0.2251, 0.0152, 0.1754, 0.0158, 0.1708], + device='cuda:2'), in_proj_covar=tensor([0.0122, 0.0103, 0.0114, 0.0114, 0.0103, 0.0122, 0.0100, 0.0112], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 02:17:05,484 INFO [train.py:876] (2/4) Epoch 10, batch 1000, loss[loss=0.1465, simple_loss=0.1633, pruned_loss=0.06483, over 5273.00 frames. ], tot_loss[loss=0.1241, simple_loss=0.1489, pruned_loss=0.04967, over 1090921.94 frames. ], batch size: 79, lr: 8.25e-03, grad_scale: 16.0 +2022-11-16 02:17:08,676 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.1746, 3.2532, 2.9561, 3.2918, 3.3154, 2.9453, 2.8919, 3.0287], + device='cuda:2'), covar=tensor([0.0859, 0.0559, 0.1369, 0.0489, 0.0522, 0.0478, 0.0786, 0.0581], + device='cuda:2'), in_proj_covar=tensor([0.0130, 0.0172, 0.0270, 0.0166, 0.0211, 0.0169, 0.0182, 0.0167], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0004, 0.0002, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 02:17:13,257 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.8134, 4.8739, 3.6834, 2.0724, 4.5342, 1.7520, 4.5899, 2.7809], + device='cuda:2'), covar=tensor([0.1390, 0.0154, 0.0664, 0.2450, 0.0190, 0.2197, 0.0187, 0.1567], + device='cuda:2'), in_proj_covar=tensor([0.0122, 0.0103, 0.0114, 0.0113, 0.0102, 0.0122, 0.0100, 0.0112], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 02:17:32,762 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.1057, 4.5546, 4.1268, 4.5702, 4.5850, 3.7880, 4.1716, 3.9157], + device='cuda:2'), covar=tensor([0.0489, 0.0484, 0.1410, 0.0459, 0.0449, 0.0548, 0.0622, 0.0525], + device='cuda:2'), in_proj_covar=tensor([0.0129, 0.0171, 0.0268, 0.0165, 0.0210, 0.0168, 0.0180, 0.0166], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0004, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-16 02:17:35,870 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.3952, 4.2008, 3.1539, 1.9314, 3.8620, 1.6127, 3.7840, 2.2573], + device='cuda:2'), covar=tensor([0.1244, 0.0104, 0.0734, 0.1877, 0.0174, 0.1743, 0.0175, 0.1392], + device='cuda:2'), in_proj_covar=tensor([0.0122, 0.0103, 0.0114, 0.0113, 0.0102, 0.0121, 0.0100, 0.0111], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 02:17:35,888 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=66494.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 02:17:39,661 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=66500.0, num_to_drop=1, layers_to_drop={3} +2022-11-16 02:17:50,094 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=66516.0, num_to_drop=1, layers_to_drop={3} +2022-11-16 02:17:53,480 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=66521.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 02:17:58,497 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.268e+01 1.607e+02 1.990e+02 2.689e+02 6.236e+02, threshold=3.979e+02, percent-clipped=3.0 +2022-11-16 02:18:08,459 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=66542.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 02:18:13,075 INFO [train.py:876] (2/4) Epoch 10, batch 1100, loss[loss=0.1471, simple_loss=0.1817, pruned_loss=0.05626, over 5698.00 frames. ], tot_loss[loss=0.1251, simple_loss=0.1492, pruned_loss=0.05045, over 1086710.35 frames. ], batch size: 28, lr: 8.24e-03, grad_scale: 16.0 +2022-11-16 02:18:35,130 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=66582.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 02:18:38,852 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.75 vs. limit=2.0 +2022-11-16 02:18:44,494 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.41 vs. limit=5.0 +2022-11-16 02:18:56,848 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=66614.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:19:05,750 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.897e+01 1.663e+02 2.010e+02 2.428e+02 5.298e+02, threshold=4.020e+02, percent-clipped=1.0 +2022-11-16 02:19:13,233 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.2341, 2.4531, 2.7603, 2.5221, 1.6098, 2.5531, 1.8413, 1.8710], + device='cuda:2'), covar=tensor([0.0225, 0.0118, 0.0121, 0.0166, 0.0343, 0.0130, 0.0295, 0.0189], + device='cuda:2'), in_proj_covar=tensor([0.0185, 0.0163, 0.0171, 0.0191, 0.0182, 0.0170, 0.0181, 0.0172], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 02:19:20,781 INFO [train.py:876] (2/4) Epoch 10, batch 1200, loss[loss=0.1582, simple_loss=0.1727, pruned_loss=0.07183, over 5351.00 frames. ], tot_loss[loss=0.1265, simple_loss=0.15, pruned_loss=0.05153, over 1083503.65 frames. ], batch size: 70, lr: 8.23e-03, grad_scale: 16.0 +2022-11-16 02:19:21,565 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.4283, 1.0151, 1.1499, 0.9010, 1.3071, 1.4867, 0.7696, 1.2054], + device='cuda:2'), covar=tensor([0.0429, 0.0405, 0.0439, 0.1273, 0.0470, 0.0256, 0.0813, 0.0319], + device='cuda:2'), in_proj_covar=tensor([0.0013, 0.0020, 0.0013, 0.0017, 0.0014, 0.0013, 0.0018, 0.0013], + device='cuda:2'), out_proj_covar=tensor([6.8052e-05, 9.3279e-05, 6.9770e-05, 8.3510e-05, 7.4400e-05, 6.8610e-05, + 8.7397e-05, 6.8467e-05], device='cuda:2') +2022-11-16 02:19:47,922 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.68 vs. limit=5.0 +2022-11-16 02:20:13,038 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.003e+02 1.536e+02 1.899e+02 2.389e+02 5.504e+02, threshold=3.797e+02, percent-clipped=2.0 +2022-11-16 02:20:15,164 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=66731.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 02:20:27,608 INFO [train.py:876] (2/4) Epoch 10, batch 1300, loss[loss=0.1149, simple_loss=0.1511, pruned_loss=0.0393, over 5702.00 frames. ], tot_loss[loss=0.1249, simple_loss=0.1495, pruned_loss=0.05013, over 1086866.78 frames. ], batch size: 28, lr: 8.23e-03, grad_scale: 16.0 +2022-11-16 02:20:56,012 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=66792.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 02:21:01,743 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=66800.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 02:21:13,051 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=66816.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 02:21:20,623 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.135e+02 1.546e+02 1.789e+02 2.414e+02 5.527e+02, threshold=3.579e+02, percent-clipped=2.0 +2022-11-16 02:21:33,580 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=66848.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:21:34,171 INFO [train.py:876] (2/4) Epoch 10, batch 1400, loss[loss=0.08308, simple_loss=0.1186, pruned_loss=0.02376, over 5703.00 frames. ], tot_loss[loss=0.1253, simple_loss=0.1497, pruned_loss=0.05044, over 1086256.84 frames. ], batch size: 17, lr: 8.22e-03, grad_scale: 16.0 +2022-11-16 02:21:44,991 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=66864.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 02:21:53,524 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=66877.0, num_to_drop=1, layers_to_drop={3} +2022-11-16 02:22:05,271 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.2128, 2.3046, 2.2571, 2.4418, 2.1747, 1.7848, 2.1564, 2.5576], + device='cuda:2'), covar=tensor([0.1353, 0.1643, 0.2315, 0.1165, 0.1493, 0.1808, 0.1640, 0.1158], + device='cuda:2'), in_proj_covar=tensor([0.0099, 0.0096, 0.0100, 0.0092, 0.0088, 0.0092, 0.0096, 0.0071], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 02:22:09,673 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.7530, 4.7486, 4.6190, 4.8091, 4.4968, 3.9548, 5.2728, 4.6875], + device='cuda:2'), covar=tensor([0.0429, 0.0701, 0.0455, 0.1158, 0.0473, 0.0456, 0.0665, 0.0608], + device='cuda:2'), in_proj_covar=tensor([0.0087, 0.0108, 0.0092, 0.0119, 0.0088, 0.0077, 0.0144, 0.0102], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 02:22:10,062 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.22 vs. limit=5.0 +2022-11-16 02:22:18,744 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=66914.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:22:27,876 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.435e+01 1.629e+02 1.952e+02 2.381e+02 3.716e+02, threshold=3.904e+02, percent-clipped=1.0 +2022-11-16 02:22:41,672 INFO [train.py:876] (2/4) Epoch 10, batch 1500, loss[loss=0.144, simple_loss=0.165, pruned_loss=0.06145, over 5564.00 frames. ], tot_loss[loss=0.1255, simple_loss=0.1496, pruned_loss=0.05068, over 1086678.74 frames. ], batch size: 21, lr: 8.21e-03, grad_scale: 16.0 +2022-11-16 02:22:43,686 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.2223, 1.5121, 1.8215, 1.4251, 1.1734, 2.1728, 1.7722, 1.4658], + device='cuda:2'), covar=tensor([0.1816, 0.1482, 0.1465, 0.2688, 0.2919, 0.1117, 0.1525, 0.2136], + device='cuda:2'), in_proj_covar=tensor([0.0087, 0.0077, 0.0077, 0.0088, 0.0065, 0.0057, 0.0064, 0.0077], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-16 02:22:46,952 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=66957.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:22:47,921 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.11 vs. limit=2.0 +2022-11-16 02:22:50,116 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=66962.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:22:50,910 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.1833, 2.7202, 3.1176, 4.1025, 4.1948, 3.3612, 2.9228, 4.0984], + device='cuda:2'), covar=tensor([0.0572, 0.2772, 0.2126, 0.2610, 0.0891, 0.2824, 0.2142, 0.0482], + device='cuda:2'), in_proj_covar=tensor([0.0237, 0.0200, 0.0192, 0.0309, 0.0223, 0.0205, 0.0188, 0.0233], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0005], + device='cuda:2') +2022-11-16 02:22:53,595 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.70 vs. limit=2.0 +2022-11-16 02:23:17,578 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.3811, 2.5613, 2.4468, 2.6344, 2.1704, 1.8734, 2.3671, 2.9398], + device='cuda:2'), covar=tensor([0.1645, 0.1867, 0.2540, 0.1232, 0.2171, 0.1976, 0.1883, 0.2581], + device='cuda:2'), in_proj_covar=tensor([0.0101, 0.0098, 0.0102, 0.0094, 0.0090, 0.0094, 0.0098, 0.0073], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 02:23:23,964 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.8882, 0.4842, 0.7986, 0.7492, 0.8110, 0.8262, 0.3615, 0.7886], + device='cuda:2'), covar=tensor([0.0243, 0.0457, 0.0360, 0.0411, 0.0358, 0.0295, 0.0738, 0.0370], + device='cuda:2'), in_proj_covar=tensor([0.0013, 0.0020, 0.0013, 0.0017, 0.0015, 0.0013, 0.0018, 0.0013], + device='cuda:2'), out_proj_covar=tensor([6.8799e-05, 9.5095e-05, 7.0560e-05, 8.4098e-05, 7.5098e-05, 6.9131e-05, + 8.8532e-05, 6.8745e-05], device='cuda:2') +2022-11-16 02:23:27,986 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=67018.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 02:23:30,086 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.99 vs. limit=5.0 +2022-11-16 02:23:34,296 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.428e+01 1.625e+02 1.914e+02 2.331e+02 6.825e+02, threshold=3.828e+02, percent-clipped=2.0 +2022-11-16 02:23:49,312 INFO [train.py:876] (2/4) Epoch 10, batch 1600, loss[loss=0.1114, simple_loss=0.1318, pruned_loss=0.04554, over 5494.00 frames. ], tot_loss[loss=0.1227, simple_loss=0.1475, pruned_loss=0.04891, over 1085517.14 frames. ], batch size: 12, lr: 8.21e-03, grad_scale: 16.0 +2022-11-16 02:24:09,943 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.7898, 4.6889, 4.0991, 4.1330, 4.7184, 4.2568, 1.6805, 4.8682], + device='cuda:2'), covar=tensor([0.0199, 0.0285, 0.0440, 0.0438, 0.0320, 0.0539, 0.3464, 0.0486], + device='cuda:2'), in_proj_covar=tensor([0.0103, 0.0083, 0.0085, 0.0075, 0.0100, 0.0086, 0.0129, 0.0106], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 02:24:11,987 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=67083.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:24:15,230 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=67087.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 02:24:32,385 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.9262, 4.0217, 3.8568, 3.6138, 2.1044, 4.1973, 2.2694, 3.5422], + device='cuda:2'), covar=tensor([0.0401, 0.0217, 0.0209, 0.0411, 0.0709, 0.0152, 0.0526, 0.0155], + device='cuda:2'), in_proj_covar=tensor([0.0188, 0.0165, 0.0175, 0.0194, 0.0185, 0.0172, 0.0184, 0.0175], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 02:24:41,917 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.077e+02 1.648e+02 1.940e+02 2.386e+02 4.578e+02, threshold=3.880e+02, percent-clipped=4.0 +2022-11-16 02:24:53,232 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=67144.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:24:56,734 INFO [train.py:876] (2/4) Epoch 10, batch 1700, loss[loss=0.1683, simple_loss=0.1649, pruned_loss=0.08589, over 4751.00 frames. ], tot_loss[loss=0.1239, simple_loss=0.1485, pruned_loss=0.04962, over 1090564.74 frames. ], batch size: 135, lr: 8.20e-03, grad_scale: 16.0 +2022-11-16 02:25:00,157 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.88 vs. limit=2.0 +2022-11-16 02:25:15,540 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=67177.0, num_to_drop=1, layers_to_drop={2} +2022-11-16 02:25:16,927 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.7412, 4.2781, 3.9124, 3.7907, 2.0340, 4.0386, 2.2586, 3.3118], + device='cuda:2'), covar=tensor([0.0391, 0.0140, 0.0158, 0.0330, 0.0647, 0.0150, 0.0506, 0.0192], + device='cuda:2'), in_proj_covar=tensor([0.0188, 0.0166, 0.0175, 0.0195, 0.0186, 0.0172, 0.0185, 0.0175], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 02:25:24,225 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.5108, 4.3882, 4.2872, 4.3046, 4.5119, 4.2502, 2.0199, 4.6925], + device='cuda:2'), covar=tensor([0.0196, 0.0321, 0.0270, 0.0181, 0.0277, 0.0338, 0.2726, 0.0267], + device='cuda:2'), in_proj_covar=tensor([0.0103, 0.0084, 0.0086, 0.0076, 0.0101, 0.0087, 0.0130, 0.0107], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 02:25:48,251 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=67225.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 02:25:50,859 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.967e+01 1.444e+02 1.809e+02 2.360e+02 5.215e+02, threshold=3.618e+02, percent-clipped=3.0 +2022-11-16 02:26:04,154 INFO [train.py:876] (2/4) Epoch 10, batch 1800, loss[loss=0.1982, simple_loss=0.1853, pruned_loss=0.1056, over 4736.00 frames. ], tot_loss[loss=0.1245, simple_loss=0.1487, pruned_loss=0.05012, over 1081032.87 frames. ], batch size: 135, lr: 8.20e-03, grad_scale: 16.0 +2022-11-16 02:26:11,498 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.9087, 1.6735, 1.5918, 1.0389, 1.3808, 1.6243, 1.3243, 1.0237], + device='cuda:2'), covar=tensor([0.0018, 0.0038, 0.0038, 0.0051, 0.0043, 0.0077, 0.0030, 0.0045], + device='cuda:2'), in_proj_covar=tensor([0.0023, 0.0022, 0.0023, 0.0030, 0.0025, 0.0024, 0.0027, 0.0028], + device='cuda:2'), out_proj_covar=tensor([2.1121e-05, 2.1151e-05, 2.0819e-05, 2.9050e-05, 2.3674e-05, 2.3061e-05, + 2.6203e-05, 2.7554e-05], device='cuda:2') +2022-11-16 02:26:17,967 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.1054, 0.6990, 0.8447, 0.7032, 0.8264, 1.0404, 0.5786, 0.8124], + device='cuda:2'), covar=tensor([0.0309, 0.0379, 0.0299, 0.0535, 0.0334, 0.0257, 0.0761, 0.0306], + device='cuda:2'), in_proj_covar=tensor([0.0013, 0.0020, 0.0013, 0.0017, 0.0014, 0.0012, 0.0018, 0.0013], + device='cuda:2'), out_proj_covar=tensor([6.8070e-05, 9.3611e-05, 6.8579e-05, 8.2251e-05, 7.3140e-05, 6.7384e-05, + 8.5838e-05, 6.7011e-05], device='cuda:2') +2022-11-16 02:26:26,746 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.44 vs. limit=5.0 +2022-11-16 02:26:47,961 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=67313.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 02:26:58,163 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.062e+02 1.612e+02 1.959e+02 2.585e+02 8.694e+02, threshold=3.917e+02, percent-clipped=8.0 +2022-11-16 02:27:11,094 INFO [train.py:876] (2/4) Epoch 10, batch 1900, loss[loss=0.1243, simple_loss=0.1487, pruned_loss=0.04995, over 5694.00 frames. ], tot_loss[loss=0.1235, simple_loss=0.1478, pruned_loss=0.04959, over 1088022.72 frames. ], batch size: 36, lr: 8.19e-03, grad_scale: 16.0 +2022-11-16 02:27:28,844 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.3997, 2.6124, 4.1492, 3.5048, 4.4803, 2.9594, 3.9995, 4.5838], + device='cuda:2'), covar=tensor([0.0526, 0.2096, 0.0823, 0.1486, 0.0444, 0.1542, 0.1032, 0.0634], + device='cuda:2'), in_proj_covar=tensor([0.0235, 0.0193, 0.0211, 0.0207, 0.0231, 0.0194, 0.0225, 0.0229], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 02:27:37,293 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=67387.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 02:28:03,865 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.06 vs. limit=2.0 +2022-11-16 02:28:05,970 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.083e+02 1.608e+02 1.927e+02 2.290e+02 4.521e+02, threshold=3.854e+02, percent-clipped=3.0 +2022-11-16 02:28:10,019 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=67435.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 02:28:12,565 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=67439.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:28:16,702 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.4133, 4.3255, 4.4353, 4.6998, 4.0431, 4.0732, 4.9695, 4.3433], + device='cuda:2'), covar=tensor([0.0504, 0.0961, 0.0361, 0.0934, 0.0454, 0.0314, 0.0655, 0.0548], + device='cuda:2'), in_proj_covar=tensor([0.0087, 0.0106, 0.0090, 0.0117, 0.0086, 0.0075, 0.0143, 0.0099], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 02:28:19,258 INFO [train.py:876] (2/4) Epoch 10, batch 2000, loss[loss=0.09254, simple_loss=0.1252, pruned_loss=0.02996, over 5577.00 frames. ], tot_loss[loss=0.1228, simple_loss=0.1473, pruned_loss=0.04921, over 1087036.91 frames. ], batch size: 16, lr: 8.18e-03, grad_scale: 16.0 +2022-11-16 02:28:51,426 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.8040, 2.8932, 3.0465, 2.7867, 2.9676, 2.8803, 1.0482, 3.0593], + device='cuda:2'), covar=tensor([0.0317, 0.0333, 0.0267, 0.0317, 0.0348, 0.0345, 0.3044, 0.0298], + device='cuda:2'), in_proj_covar=tensor([0.0103, 0.0084, 0.0086, 0.0076, 0.0101, 0.0087, 0.0130, 0.0106], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 02:29:02,515 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=67513.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:29:14,261 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.999e+01 1.503e+02 1.762e+02 2.315e+02 5.487e+02, threshold=3.525e+02, percent-clipped=4.0 +2022-11-16 02:29:27,327 INFO [train.py:876] (2/4) Epoch 10, batch 2100, loss[loss=0.1422, simple_loss=0.1583, pruned_loss=0.06298, over 5608.00 frames. ], tot_loss[loss=0.1238, simple_loss=0.1479, pruned_loss=0.04985, over 1089390.39 frames. ], batch size: 23, lr: 8.18e-03, grad_scale: 16.0 +2022-11-16 02:29:44,920 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=67574.0, num_to_drop=1, layers_to_drop={2} +2022-11-16 02:30:04,957 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.2133, 2.0950, 1.9389, 2.2389, 1.8579, 1.6735, 2.0187, 2.3828], + device='cuda:2'), covar=tensor([0.1206, 0.1934, 0.2682, 0.1458, 0.2258, 0.2694, 0.1873, 0.1666], + device='cuda:2'), in_proj_covar=tensor([0.0099, 0.0096, 0.0100, 0.0092, 0.0087, 0.0093, 0.0095, 0.0074], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 02:30:10,757 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=67613.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:30:21,802 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.895e+01 1.533e+02 1.917e+02 2.461e+02 4.676e+02, threshold=3.833e+02, percent-clipped=3.0 +2022-11-16 02:30:28,664 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.4836, 4.0952, 4.3065, 4.0747, 4.5679, 4.3712, 4.1739, 4.5311], + device='cuda:2'), covar=tensor([0.0372, 0.0316, 0.0407, 0.0301, 0.0343, 0.0231, 0.0256, 0.0297], + device='cuda:2'), in_proj_covar=tensor([0.0132, 0.0141, 0.0103, 0.0136, 0.0161, 0.0095, 0.0115, 0.0140], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-16 02:30:33,503 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.4116, 3.2285, 3.2735, 3.0489, 1.9667, 3.1997, 2.1427, 2.7846], + device='cuda:2'), covar=tensor([0.0304, 0.0129, 0.0133, 0.0262, 0.0409, 0.0159, 0.0415, 0.0146], + device='cuda:2'), in_proj_covar=tensor([0.0189, 0.0166, 0.0173, 0.0195, 0.0184, 0.0172, 0.0185, 0.0175], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 02:30:35,261 INFO [train.py:876] (2/4) Epoch 10, batch 2200, loss[loss=0.1496, simple_loss=0.1659, pruned_loss=0.06664, over 5278.00 frames. ], tot_loss[loss=0.1239, simple_loss=0.1478, pruned_loss=0.05, over 1085570.78 frames. ], batch size: 79, lr: 8.17e-03, grad_scale: 16.0 +2022-11-16 02:30:43,330 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=67661.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:30:56,580 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=67680.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:31:04,408 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=67692.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:31:28,761 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.008e+02 1.628e+02 1.848e+02 2.167e+02 3.547e+02, threshold=3.696e+02, percent-clipped=0.0 +2022-11-16 02:31:36,136 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=67739.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:31:37,515 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=67741.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:31:42,391 INFO [train.py:876] (2/4) Epoch 10, batch 2300, loss[loss=0.1041, simple_loss=0.137, pruned_loss=0.03562, over 5750.00 frames. ], tot_loss[loss=0.1232, simple_loss=0.1471, pruned_loss=0.04962, over 1082764.42 frames. ], batch size: 20, lr: 8.17e-03, grad_scale: 16.0 +2022-11-16 02:31:43,517 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.6902, 2.7657, 2.9266, 2.7225, 2.7222, 2.7680, 1.0951, 2.8648], + device='cuda:2'), covar=tensor([0.0575, 0.0479, 0.0469, 0.0430, 0.0585, 0.0620, 0.3851, 0.0537], + device='cuda:2'), in_proj_covar=tensor([0.0103, 0.0085, 0.0086, 0.0077, 0.0102, 0.0087, 0.0130, 0.0106], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 02:31:45,495 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=67753.0, num_to_drop=1, layers_to_drop={3} +2022-11-16 02:31:47,081 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.01 vs. limit=2.0 +2022-11-16 02:32:08,286 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=67787.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:32:36,230 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.663e+01 1.643e+02 1.965e+02 2.550e+02 4.357e+02, threshold=3.931e+02, percent-clipped=5.0 +2022-11-16 02:32:47,325 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.3798, 3.4351, 3.3545, 3.5961, 3.0699, 3.1449, 3.8894, 3.3619], + device='cuda:2'), covar=tensor([0.0570, 0.0904, 0.0536, 0.1121, 0.0802, 0.0425, 0.0869, 0.0810], + device='cuda:2'), in_proj_covar=tensor([0.0085, 0.0105, 0.0089, 0.0115, 0.0085, 0.0076, 0.0141, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 02:32:50,270 INFO [train.py:876] (2/4) Epoch 10, batch 2400, loss[loss=0.06744, simple_loss=0.113, pruned_loss=0.01096, over 5217.00 frames. ], tot_loss[loss=0.1243, simple_loss=0.1483, pruned_loss=0.05013, over 1082832.68 frames. ], batch size: 7, lr: 8.16e-03, grad_scale: 16.0 +2022-11-16 02:33:03,315 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=67869.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 02:33:16,133 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.3661, 2.8441, 3.7849, 3.3394, 4.1837, 3.0132, 3.8439, 4.2351], + device='cuda:2'), covar=tensor([0.0431, 0.1307, 0.0801, 0.1373, 0.0539, 0.1243, 0.1022, 0.0670], + device='cuda:2'), in_proj_covar=tensor([0.0236, 0.0193, 0.0210, 0.0210, 0.0232, 0.0191, 0.0226, 0.0228], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 02:33:43,934 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.957e+01 1.533e+02 1.852e+02 2.262e+02 4.255e+02, threshold=3.703e+02, percent-clipped=1.0 +2022-11-16 02:33:53,482 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.5424, 2.0720, 2.3980, 3.3965, 3.3569, 2.5267, 2.2678, 3.3607], + device='cuda:2'), covar=tensor([0.0751, 0.2985, 0.2368, 0.3512, 0.1497, 0.3252, 0.2284, 0.0986], + device='cuda:2'), in_proj_covar=tensor([0.0234, 0.0200, 0.0189, 0.0307, 0.0220, 0.0204, 0.0189, 0.0236], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0005], + device='cuda:2') +2022-11-16 02:33:58,908 INFO [train.py:876] (2/4) Epoch 10, batch 2500, loss[loss=0.2194, simple_loss=0.1953, pruned_loss=0.1217, over 3025.00 frames. ], tot_loss[loss=0.1233, simple_loss=0.1481, pruned_loss=0.04921, over 1081044.11 frames. ], batch size: 284, lr: 8.15e-03, grad_scale: 16.0 +2022-11-16 02:34:16,006 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.91 vs. limit=5.0 +2022-11-16 02:34:22,087 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=67981.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:34:54,545 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.078e+02 1.495e+02 1.844e+02 2.229e+02 3.731e+02, threshold=3.687e+02, percent-clipped=1.0 +2022-11-16 02:34:59,157 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=68036.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:35:03,043 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=68042.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:35:06,807 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=68048.0, num_to_drop=1, layers_to_drop={2} +2022-11-16 02:35:07,303 INFO [train.py:876] (2/4) Epoch 10, batch 2600, loss[loss=0.1641, simple_loss=0.167, pruned_loss=0.08056, over 5407.00 frames. ], tot_loss[loss=0.1243, simple_loss=0.1484, pruned_loss=0.05008, over 1079669.70 frames. ], batch size: 70, lr: 8.15e-03, grad_scale: 16.0 +2022-11-16 02:35:28,692 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.03 vs. limit=2.0 +2022-11-16 02:35:33,573 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=68087.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 02:36:01,566 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.022e+02 1.584e+02 1.812e+02 2.334e+02 4.676e+02, threshold=3.625e+02, percent-clipped=1.0 +2022-11-16 02:36:14,138 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=68148.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 02:36:14,627 INFO [train.py:876] (2/4) Epoch 10, batch 2700, loss[loss=0.1102, simple_loss=0.1378, pruned_loss=0.04128, over 5545.00 frames. ], tot_loss[loss=0.1229, simple_loss=0.1474, pruned_loss=0.04921, over 1081010.64 frames. ], batch size: 13, lr: 8.14e-03, grad_scale: 16.0 +2022-11-16 02:36:14,864 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.72 vs. limit=2.0 +2022-11-16 02:36:28,851 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=68169.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:36:36,608 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.80 vs. limit=2.0 +2022-11-16 02:37:00,604 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=68217.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:37:09,358 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.044e+01 1.607e+02 1.920e+02 2.533e+02 3.834e+02, threshold=3.840e+02, percent-clipped=3.0 +2022-11-16 02:37:22,802 INFO [train.py:876] (2/4) Epoch 10, batch 2800, loss[loss=0.08084, simple_loss=0.112, pruned_loss=0.02485, over 5509.00 frames. ], tot_loss[loss=0.1204, simple_loss=0.1459, pruned_loss=0.04741, over 1082270.54 frames. ], batch size: 12, lr: 8.14e-03, grad_scale: 16.0 +2022-11-16 02:37:53,966 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.85 vs. limit=2.0 +2022-11-16 02:38:09,697 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.25 vs. limit=2.0 +2022-11-16 02:38:16,586 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.672e+01 1.605e+02 1.834e+02 2.410e+02 3.703e+02, threshold=3.668e+02, percent-clipped=0.0 +2022-11-16 02:38:21,757 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=68336.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:38:22,374 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=68337.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:38:25,020 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.4438, 2.4695, 2.2704, 2.3884, 2.1545, 2.1293, 2.1641, 2.7539], + device='cuda:2'), covar=tensor([0.1191, 0.1500, 0.2457, 0.1725, 0.1715, 0.1652, 0.2005, 0.1625], + device='cuda:2'), in_proj_covar=tensor([0.0100, 0.0097, 0.0100, 0.0093, 0.0087, 0.0094, 0.0095, 0.0073], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 02:38:30,000 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=68348.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:38:30,511 INFO [train.py:876] (2/4) Epoch 10, batch 2900, loss[loss=0.119, simple_loss=0.1458, pruned_loss=0.04609, over 5523.00 frames. ], tot_loss[loss=0.1233, simple_loss=0.148, pruned_loss=0.0493, over 1081208.52 frames. ], batch size: 17, lr: 8.13e-03, grad_scale: 16.0 +2022-11-16 02:38:34,608 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.7840, 3.8375, 3.5732, 3.3170, 2.1998, 3.9165, 2.2623, 3.1078], + device='cuda:2'), covar=tensor([0.0360, 0.0168, 0.0245, 0.0311, 0.0493, 0.0132, 0.0488, 0.0190], + device='cuda:2'), in_proj_covar=tensor([0.0187, 0.0163, 0.0172, 0.0194, 0.0183, 0.0169, 0.0182, 0.0173], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 02:38:38,346 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0772, 3.3308, 2.4574, 1.8587, 3.1654, 1.2940, 3.1005, 1.9215], + device='cuda:2'), covar=tensor([0.1370, 0.0177, 0.0955, 0.1755, 0.0225, 0.1977, 0.0279, 0.1378], + device='cuda:2'), in_proj_covar=tensor([0.0123, 0.0105, 0.0114, 0.0115, 0.0102, 0.0123, 0.0100, 0.0112], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 02:38:53,227 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=68384.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:39:02,043 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=68396.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:39:04,129 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=68399.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:39:23,978 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.125e+02 1.585e+02 1.960e+02 2.484e+02 4.720e+02, threshold=3.919e+02, percent-clipped=5.0 +2022-11-16 02:39:33,258 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=68443.0, num_to_drop=1, layers_to_drop={3} +2022-11-16 02:39:38,225 INFO [train.py:876] (2/4) Epoch 10, batch 3000, loss[loss=0.1301, simple_loss=0.1604, pruned_loss=0.04986, over 5717.00 frames. ], tot_loss[loss=0.1217, simple_loss=0.1466, pruned_loss=0.04838, over 1085631.01 frames. ], batch size: 19, lr: 8.12e-03, grad_scale: 16.0 +2022-11-16 02:39:38,225 INFO [train.py:899] (2/4) Computing validation loss +2022-11-16 02:39:47,602 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.1374, 2.7998, 3.2742, 3.1163, 3.1127, 3.3852, 3.4655, 3.3163], + device='cuda:2'), covar=tensor([0.0604, 0.1673, 0.0482, 0.1181, 0.0844, 0.0359, 0.0815, 0.0849], + device='cuda:2'), in_proj_covar=tensor([0.0085, 0.0106, 0.0089, 0.0115, 0.0085, 0.0076, 0.0141, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 02:39:48,035 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.5872, 3.5019, 3.4898, 3.2641, 1.9491, 3.4513, 2.2279, 3.0771], + device='cuda:2'), covar=tensor([0.0358, 0.0157, 0.0136, 0.0247, 0.0558, 0.0167, 0.0493, 0.0184], + device='cuda:2'), in_proj_covar=tensor([0.0191, 0.0167, 0.0176, 0.0199, 0.0188, 0.0174, 0.0187, 0.0177], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 02:39:55,643 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.6749, 1.9885, 2.3950, 2.8942, 2.8322, 2.3639, 1.9293, 2.6933], + device='cuda:2'), covar=tensor([0.1322, 0.1731, 0.1100, 0.0455, 0.0441, 0.1159, 0.1475, 0.1143], + device='cuda:2'), in_proj_covar=tensor([0.0244, 0.0206, 0.0196, 0.0315, 0.0226, 0.0208, 0.0194, 0.0241], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0005], + device='cuda:2') +2022-11-16 02:39:56,212 INFO [train.py:908] (2/4) Epoch 10, validation: loss=0.1681, simple_loss=0.1842, pruned_loss=0.07602, over 1530663.00 frames. +2022-11-16 02:39:56,213 INFO [train.py:909] (2/4) Maximum memory allocated so far is 4715MB +2022-11-16 02:40:03,625 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=68460.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:40:13,748 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.5563, 1.4143, 1.4367, 1.0488, 1.2386, 1.4093, 1.2743, 0.8598], + device='cuda:2'), covar=tensor([0.0025, 0.0036, 0.0032, 0.0052, 0.0040, 0.0044, 0.0036, 0.0053], + device='cuda:2'), in_proj_covar=tensor([0.0024, 0.0023, 0.0023, 0.0031, 0.0027, 0.0025, 0.0029, 0.0029], + device='cuda:2'), out_proj_covar=tensor([2.2381e-05, 2.2243e-05, 2.1223e-05, 3.0692e-05, 2.5229e-05, 2.4095e-05, + 2.8261e-05, 2.8535e-05], device='cuda:2') +2022-11-16 02:40:14,957 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0934, 3.9299, 2.5973, 3.6553, 3.0353, 2.6012, 1.9350, 3.3678], + device='cuda:2'), covar=tensor([0.1634, 0.0310, 0.1251, 0.0450, 0.0790, 0.1174, 0.2264, 0.0426], + device='cuda:2'), in_proj_covar=tensor([0.0158, 0.0138, 0.0161, 0.0146, 0.0175, 0.0172, 0.0166, 0.0158], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-16 02:40:16,871 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.8521, 3.6754, 3.6225, 3.4817, 3.6752, 3.6315, 1.5856, 4.0140], + device='cuda:2'), covar=tensor([0.0256, 0.0400, 0.0331, 0.0374, 0.0342, 0.0445, 0.2980, 0.0229], + device='cuda:2'), in_proj_covar=tensor([0.0102, 0.0085, 0.0084, 0.0076, 0.0101, 0.0088, 0.0128, 0.0106], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 02:40:17,597 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1322, 2.9632, 2.7652, 1.6062, 2.8027, 3.1887, 3.0638, 3.6327], + device='cuda:2'), covar=tensor([0.1774, 0.1531, 0.1266, 0.2704, 0.0760, 0.0793, 0.0515, 0.0589], + device='cuda:2'), in_proj_covar=tensor([0.0173, 0.0186, 0.0165, 0.0188, 0.0179, 0.0196, 0.0168, 0.0186], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 02:40:23,257 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.6320, 2.3563, 2.9585, 3.7375, 3.7304, 2.8938, 2.3273, 3.5630], + device='cuda:2'), covar=tensor([0.0995, 0.3240, 0.2094, 0.2400, 0.1251, 0.3227, 0.2336, 0.0867], + device='cuda:2'), in_proj_covar=tensor([0.0242, 0.0204, 0.0194, 0.0312, 0.0224, 0.0208, 0.0192, 0.0238], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0005], + device='cuda:2') +2022-11-16 02:40:49,560 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.217e+01 1.619e+02 2.006e+02 2.438e+02 5.141e+02, threshold=4.012e+02, percent-clipped=2.0 +2022-11-16 02:41:02,562 INFO [train.py:876] (2/4) Epoch 10, batch 3100, loss[loss=0.1869, simple_loss=0.1948, pruned_loss=0.08949, over 4785.00 frames. ], tot_loss[loss=0.121, simple_loss=0.1464, pruned_loss=0.04781, over 1091682.73 frames. ], batch size: 135, lr: 8.12e-03, grad_scale: 16.0 +2022-11-16 02:41:07,760 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.63 vs. limit=2.0 +2022-11-16 02:41:08,674 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.6918, 1.1897, 1.4400, 1.0238, 2.0929, 1.4264, 1.4160, 1.4890], + device='cuda:2'), covar=tensor([0.0830, 0.0985, 0.0898, 0.1915, 0.0364, 0.1927, 0.0719, 0.0543], + device='cuda:2'), in_proj_covar=tensor([0.0013, 0.0021, 0.0014, 0.0018, 0.0015, 0.0013, 0.0019, 0.0013], + device='cuda:2'), out_proj_covar=tensor([7.2096e-05, 9.7867e-05, 7.2840e-05, 8.7361e-05, 7.7543e-05, 7.0432e-05, + 9.1007e-05, 7.1539e-05], device='cuda:2') +2022-11-16 02:41:22,504 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=68578.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:41:50,983 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.1456, 5.0714, 3.5295, 2.2775, 4.6292, 2.5201, 4.7457, 2.8748], + device='cuda:2'), covar=tensor([0.1045, 0.0081, 0.0612, 0.1804, 0.0137, 0.1300, 0.0132, 0.1125], + device='cuda:2'), in_proj_covar=tensor([0.0121, 0.0103, 0.0113, 0.0113, 0.0101, 0.0123, 0.0100, 0.0112], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 02:41:57,045 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.658e+01 1.525e+02 1.984e+02 2.613e+02 4.758e+02, threshold=3.969e+02, percent-clipped=4.0 +2022-11-16 02:41:57,948 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.0331, 2.6356, 3.0644, 3.9864, 4.0111, 3.1637, 2.4676, 3.8718], + device='cuda:2'), covar=tensor([0.0563, 0.3361, 0.2049, 0.2678, 0.0883, 0.3034, 0.2303, 0.0594], + device='cuda:2'), in_proj_covar=tensor([0.0244, 0.0204, 0.0196, 0.0314, 0.0227, 0.0210, 0.0194, 0.0240], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0005], + device='cuda:2') +2022-11-16 02:42:02,888 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=68637.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:42:04,172 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=68639.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:42:10,467 INFO [train.py:876] (2/4) Epoch 10, batch 3200, loss[loss=0.1002, simple_loss=0.1338, pruned_loss=0.03336, over 5737.00 frames. ], tot_loss[loss=0.1199, simple_loss=0.1464, pruned_loss=0.04672, over 1094272.32 frames. ], batch size: 15, lr: 8.11e-03, grad_scale: 16.0 +2022-11-16 02:42:35,039 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=68685.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:42:35,150 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=68685.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:43:04,535 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.027e+02 1.562e+02 1.856e+02 2.201e+02 3.564e+02, threshold=3.712e+02, percent-clipped=0.0 +2022-11-16 02:43:14,442 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=68743.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 02:43:16,474 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=68746.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:43:17,042 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.1209, 5.0242, 3.7756, 2.3018, 4.7184, 2.0261, 4.5889, 2.7137], + device='cuda:2'), covar=tensor([0.1051, 0.0092, 0.0409, 0.1820, 0.0122, 0.1594, 0.0146, 0.1378], + device='cuda:2'), in_proj_covar=tensor([0.0122, 0.0104, 0.0113, 0.0114, 0.0101, 0.0123, 0.0100, 0.0112], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 02:43:18,252 INFO [train.py:876] (2/4) Epoch 10, batch 3300, loss[loss=0.1, simple_loss=0.1345, pruned_loss=0.03282, over 5500.00 frames. ], tot_loss[loss=0.1225, simple_loss=0.1479, pruned_loss=0.04852, over 1087416.74 frames. ], batch size: 17, lr: 8.11e-03, grad_scale: 16.0 +2022-11-16 02:43:22,189 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=68755.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:43:46,725 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=68791.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 02:44:12,127 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.117e+01 1.492e+02 1.907e+02 2.389e+02 5.299e+02, threshold=3.813e+02, percent-clipped=1.0 +2022-11-16 02:44:25,995 INFO [train.py:876] (2/4) Epoch 10, batch 3400, loss[loss=0.1345, simple_loss=0.1557, pruned_loss=0.05668, over 5611.00 frames. ], tot_loss[loss=0.1223, simple_loss=0.1469, pruned_loss=0.04886, over 1086822.91 frames. ], batch size: 23, lr: 8.10e-03, grad_scale: 16.0 +2022-11-16 02:44:42,001 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.16 vs. limit=5.0 +2022-11-16 02:45:07,931 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.7160, 2.2254, 3.3180, 2.8142, 3.3918, 2.2982, 3.0775, 3.6777], + device='cuda:2'), covar=tensor([0.0620, 0.1765, 0.0942, 0.1529, 0.0663, 0.1654, 0.1282, 0.1078], + device='cuda:2'), in_proj_covar=tensor([0.0234, 0.0191, 0.0206, 0.0209, 0.0230, 0.0190, 0.0224, 0.0225], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 02:45:20,579 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.200e+01 1.439e+02 1.799e+02 2.243e+02 3.372e+02, threshold=3.599e+02, percent-clipped=0.0 +2022-11-16 02:45:24,304 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=68934.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:45:34,200 INFO [train.py:876] (2/4) Epoch 10, batch 3500, loss[loss=0.09896, simple_loss=0.1294, pruned_loss=0.03428, over 5707.00 frames. ], tot_loss[loss=0.1218, simple_loss=0.1462, pruned_loss=0.04868, over 1084608.30 frames. ], batch size: 19, lr: 8.10e-03, grad_scale: 16.0 +2022-11-16 02:45:51,243 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.3871, 2.7901, 3.8847, 3.2888, 4.1872, 3.1096, 3.8375, 4.3998], + device='cuda:2'), covar=tensor([0.0609, 0.1432, 0.0745, 0.1168, 0.0535, 0.1239, 0.1123, 0.0556], + device='cuda:2'), in_proj_covar=tensor([0.0233, 0.0190, 0.0205, 0.0206, 0.0229, 0.0190, 0.0223, 0.0224], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 02:46:19,705 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.0510, 5.0407, 3.8477, 2.2319, 4.6757, 2.2948, 4.5668, 3.0299], + device='cuda:2'), covar=tensor([0.1167, 0.0109, 0.0417, 0.2004, 0.0142, 0.1571, 0.0234, 0.1145], + device='cuda:2'), in_proj_covar=tensor([0.0123, 0.0104, 0.0113, 0.0114, 0.0102, 0.0124, 0.0100, 0.0112], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 02:46:28,138 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.544e+01 1.634e+02 1.923e+02 2.372e+02 5.246e+02, threshold=3.846e+02, percent-clipped=2.0 +2022-11-16 02:46:36,709 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=69041.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:46:38,115 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=69043.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:46:41,871 INFO [train.py:876] (2/4) Epoch 10, batch 3600, loss[loss=0.1497, simple_loss=0.1692, pruned_loss=0.06506, over 5758.00 frames. ], tot_loss[loss=0.1219, simple_loss=0.1464, pruned_loss=0.04874, over 1085669.28 frames. ], batch size: 20, lr: 8.09e-03, grad_scale: 16.0 +2022-11-16 02:46:46,006 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=69055.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:46:54,265 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.6453, 3.7251, 3.6497, 3.3466, 2.1002, 3.7866, 2.3703, 3.0204], + device='cuda:2'), covar=tensor([0.0361, 0.0189, 0.0171, 0.0327, 0.0567, 0.0147, 0.0438, 0.0215], + device='cuda:2'), in_proj_covar=tensor([0.0187, 0.0164, 0.0172, 0.0193, 0.0184, 0.0171, 0.0183, 0.0174], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 02:47:18,344 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=69103.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:47:19,110 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=69104.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:47:24,469 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.8695, 4.0260, 3.9680, 3.6734, 3.9286, 3.8759, 1.5006, 3.9449], + device='cuda:2'), covar=tensor([0.0420, 0.0480, 0.0354, 0.0534, 0.0478, 0.0521, 0.4248, 0.0484], + device='cuda:2'), in_proj_covar=tensor([0.0103, 0.0086, 0.0085, 0.0078, 0.0101, 0.0088, 0.0130, 0.0108], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 02:47:27,861 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.9640, 1.4425, 1.7641, 1.5906, 1.6626, 1.6418, 1.8745, 1.6845], + device='cuda:2'), covar=tensor([0.0033, 0.0083, 0.0052, 0.0045, 0.0070, 0.0131, 0.0033, 0.0038], + device='cuda:2'), in_proj_covar=tensor([0.0024, 0.0023, 0.0023, 0.0031, 0.0026, 0.0024, 0.0029, 0.0028], + device='cuda:2'), out_proj_covar=tensor([2.1711e-05, 2.1433e-05, 2.0776e-05, 2.9943e-05, 2.4617e-05, 2.3240e-05, + 2.7765e-05, 2.7789e-05], device='cuda:2') +2022-11-16 02:47:35,521 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.617e+01 1.458e+02 1.877e+02 2.187e+02 3.990e+02, threshold=3.754e+02, percent-clipped=1.0 +2022-11-16 02:47:42,725 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.1900, 3.0090, 3.0627, 2.8129, 3.2433, 3.1130, 3.0234, 3.2417], + device='cuda:2'), covar=tensor([0.0398, 0.0388, 0.0485, 0.0405, 0.0418, 0.0234, 0.0375, 0.0432], + device='cuda:2'), in_proj_covar=tensor([0.0134, 0.0140, 0.0102, 0.0135, 0.0160, 0.0097, 0.0116, 0.0142], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-16 02:47:49,304 INFO [train.py:876] (2/4) Epoch 10, batch 3700, loss[loss=0.2063, simple_loss=0.1949, pruned_loss=0.1089, over 5405.00 frames. ], tot_loss[loss=0.1224, simple_loss=0.147, pruned_loss=0.04891, over 1081729.88 frames. ], batch size: 58, lr: 8.08e-03, grad_scale: 32.0 +2022-11-16 02:47:58,268 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=69162.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:48:02,152 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=69168.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:48:21,464 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.7425, 3.8454, 3.7280, 3.5410, 1.9485, 3.9389, 2.3006, 3.2944], + device='cuda:2'), covar=tensor([0.0401, 0.0237, 0.0183, 0.0324, 0.0688, 0.0153, 0.0487, 0.0149], + device='cuda:2'), in_proj_covar=tensor([0.0186, 0.0164, 0.0172, 0.0194, 0.0185, 0.0171, 0.0183, 0.0175], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 02:48:39,951 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.5745, 4.0895, 4.3182, 4.0956, 4.6364, 4.3406, 4.0472, 4.5662], + device='cuda:2'), covar=tensor([0.0394, 0.0391, 0.0490, 0.0406, 0.0355, 0.0289, 0.0368, 0.0358], + device='cuda:2'), in_proj_covar=tensor([0.0136, 0.0142, 0.0103, 0.0137, 0.0163, 0.0098, 0.0118, 0.0145], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-16 02:48:40,049 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=69223.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:48:41,334 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=69225.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:48:43,921 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=69229.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:48:44,356 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.159e+01 1.529e+02 2.089e+02 2.315e+02 4.555e+02, threshold=4.177e+02, percent-clipped=1.0 +2022-11-16 02:48:47,073 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=69234.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:48:47,446 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.75 vs. limit=2.0 +2022-11-16 02:48:54,975 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=69245.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 02:48:57,846 INFO [train.py:876] (2/4) Epoch 10, batch 3800, loss[loss=0.1827, simple_loss=0.1948, pruned_loss=0.08533, over 5454.00 frames. ], tot_loss[loss=0.1235, simple_loss=0.1479, pruned_loss=0.04952, over 1078398.25 frames. ], batch size: 58, lr: 8.08e-03, grad_scale: 16.0 +2022-11-16 02:49:14,257 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=69273.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:49:20,036 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=69282.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:49:22,748 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=69286.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:49:33,581 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.83 vs. limit=2.0 +2022-11-16 02:49:35,642 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.43 vs. limit=5.0 +2022-11-16 02:49:36,636 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=69306.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 02:49:45,835 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=69320.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:49:50,122 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.9176, 0.7531, 0.8668, 0.6750, 1.1183, 1.0258, 0.5289, 1.1487], + device='cuda:2'), covar=tensor([0.0063, 0.0029, 0.0041, 0.0033, 0.0036, 0.0041, 0.0087, 0.0038], + device='cuda:2'), in_proj_covar=tensor([0.0054, 0.0048, 0.0051, 0.0050, 0.0050, 0.0045, 0.0046, 0.0043], + device='cuda:2'), out_proj_covar=tensor([4.8551e-05, 4.3144e-05, 4.5005e-05, 4.5312e-05, 4.4543e-05, 3.9627e-05, + 4.1922e-05, 3.7661e-05], device='cuda:2') +2022-11-16 02:49:53,221 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.037e+02 1.561e+02 1.843e+02 2.119e+02 2.947e+02, threshold=3.686e+02, percent-clipped=0.0 +2022-11-16 02:49:55,297 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=69334.0, num_to_drop=1, layers_to_drop={3} +2022-11-16 02:49:59,886 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=69341.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:50:05,020 INFO [train.py:876] (2/4) Epoch 10, batch 3900, loss[loss=0.1124, simple_loss=0.1332, pruned_loss=0.04581, over 5716.00 frames. ], tot_loss[loss=0.1226, simple_loss=0.1474, pruned_loss=0.04887, over 1081683.19 frames. ], batch size: 12, lr: 8.07e-03, grad_scale: 8.0 +2022-11-16 02:50:14,764 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.9204, 4.0517, 3.8115, 3.6574, 2.1160, 4.1805, 2.4382, 3.4073], + device='cuda:2'), covar=tensor([0.0388, 0.0166, 0.0195, 0.0286, 0.0595, 0.0152, 0.0480, 0.0318], + device='cuda:2'), in_proj_covar=tensor([0.0186, 0.0165, 0.0172, 0.0195, 0.0184, 0.0173, 0.0184, 0.0176], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 02:50:18,595 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=69368.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:50:27,391 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=69381.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:50:32,635 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=69389.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:50:36,066 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=69394.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:50:39,285 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=69399.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:51:00,408 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=69429.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:51:01,540 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.012e+02 1.631e+02 1.973e+02 2.599e+02 5.372e+02, threshold=3.946e+02, percent-clipped=3.0 +2022-11-16 02:51:13,629 INFO [train.py:876] (2/4) Epoch 10, batch 4000, loss[loss=0.09998, simple_loss=0.1387, pruned_loss=0.0306, over 5496.00 frames. ], tot_loss[loss=0.1215, simple_loss=0.1463, pruned_loss=0.04836, over 1082776.74 frames. ], batch size: 17, lr: 8.07e-03, grad_scale: 8.0 +2022-11-16 02:51:17,626 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=69455.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:51:57,308 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.8610, 1.6075, 1.8783, 1.7367, 1.6093, 1.5903, 1.7051, 1.7702], + device='cuda:2'), covar=tensor([0.0032, 0.0165, 0.0048, 0.0039, 0.0043, 0.0068, 0.0032, 0.0035], + device='cuda:2'), in_proj_covar=tensor([0.0023, 0.0022, 0.0023, 0.0030, 0.0026, 0.0023, 0.0028, 0.0028], + device='cuda:2'), out_proj_covar=tensor([2.1356e-05, 2.0890e-05, 2.0579e-05, 2.8858e-05, 2.4178e-05, 2.2314e-05, + 2.6928e-05, 2.7101e-05], device='cuda:2') +2022-11-16 02:51:59,804 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=69518.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:52:03,557 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=69524.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:52:08,749 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.549e+01 1.604e+02 1.884e+02 2.288e+02 4.952e+02, threshold=3.768e+02, percent-clipped=2.0 +2022-11-16 02:52:20,884 INFO [train.py:876] (2/4) Epoch 10, batch 4100, loss[loss=0.1632, simple_loss=0.1742, pruned_loss=0.07617, over 5470.00 frames. ], tot_loss[loss=0.1215, simple_loss=0.1461, pruned_loss=0.0484, over 1081621.35 frames. ], batch size: 58, lr: 8.06e-03, grad_scale: 8.0 +2022-11-16 02:52:42,300 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=69581.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:52:56,107 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=69601.0, num_to_drop=1, layers_to_drop={2} +2022-11-16 02:53:14,382 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=69629.0, num_to_drop=1, layers_to_drop={2} +2022-11-16 02:53:15,581 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.074e+02 1.609e+02 2.045e+02 2.634e+02 5.309e+02, threshold=4.090e+02, percent-clipped=7.0 +2022-11-16 02:53:28,121 INFO [train.py:876] (2/4) Epoch 10, batch 4200, loss[loss=0.06783, simple_loss=0.1031, pruned_loss=0.01626, over 5327.00 frames. ], tot_loss[loss=0.1212, simple_loss=0.1461, pruned_loss=0.04819, over 1083580.89 frames. ], batch size: 9, lr: 8.05e-03, grad_scale: 8.0 +2022-11-16 02:53:33,891 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.4727, 1.0810, 1.6892, 1.2166, 1.5232, 1.4537, 1.1597, 1.7887], + device='cuda:2'), covar=tensor([0.0063, 0.0060, 0.0040, 0.0052, 0.0056, 0.0043, 0.0049, 0.0040], + device='cuda:2'), in_proj_covar=tensor([0.0055, 0.0050, 0.0052, 0.0052, 0.0052, 0.0047, 0.0048, 0.0044], + device='cuda:2'), out_proj_covar=tensor([4.9432e-05, 4.4999e-05, 4.6119e-05, 4.6750e-05, 4.6288e-05, 4.0861e-05, + 4.3205e-05, 3.9200e-05], device='cuda:2') +2022-11-16 02:53:35,991 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.9581, 2.6073, 3.0507, 3.7405, 3.9491, 3.0450, 2.6533, 3.6958], + device='cuda:2'), covar=tensor([0.0599, 0.3021, 0.2001, 0.3882, 0.0905, 0.2952, 0.2006, 0.0897], + device='cuda:2'), in_proj_covar=tensor([0.0239, 0.0198, 0.0189, 0.0310, 0.0224, 0.0207, 0.0188, 0.0242], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0005], + device='cuda:2') +2022-11-16 02:53:37,960 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.6528, 2.8084, 2.4202, 2.8543, 2.3174, 2.0910, 2.6300, 3.2273], + device='cuda:2'), covar=tensor([0.1242, 0.1525, 0.2169, 0.1639, 0.2167, 0.1996, 0.1796, 0.2547], + device='cuda:2'), in_proj_covar=tensor([0.0103, 0.0099, 0.0101, 0.0094, 0.0088, 0.0096, 0.0095, 0.0074], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 02:53:45,414 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=69674.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 02:53:46,615 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=69676.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:54:01,869 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=69699.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:54:18,346 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=69724.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:54:23,254 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.003e+02 1.585e+02 1.904e+02 2.422e+02 5.734e+02, threshold=3.808e+02, percent-clipped=2.0 +2022-11-16 02:54:26,125 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=69735.0, num_to_drop=1, layers_to_drop={2} +2022-11-16 02:54:34,708 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=69747.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:54:36,045 INFO [train.py:876] (2/4) Epoch 10, batch 4300, loss[loss=0.08938, simple_loss=0.1269, pruned_loss=0.02594, over 5584.00 frames. ], tot_loss[loss=0.1214, simple_loss=0.1465, pruned_loss=0.04809, over 1089576.64 frames. ], batch size: 24, lr: 8.05e-03, grad_scale: 8.0 +2022-11-16 02:54:37,126 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=69750.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:55:23,541 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=69818.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:55:27,397 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=69824.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:55:31,845 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.497e+01 1.489e+02 1.786e+02 2.151e+02 5.165e+02, threshold=3.571e+02, percent-clipped=4.0 +2022-11-16 02:55:44,015 INFO [train.py:876] (2/4) Epoch 10, batch 4400, loss[loss=0.08794, simple_loss=0.1205, pruned_loss=0.02771, over 5752.00 frames. ], tot_loss[loss=0.1222, simple_loss=0.1472, pruned_loss=0.0486, over 1090159.97 frames. ], batch size: 13, lr: 8.04e-03, grad_scale: 8.0 +2022-11-16 02:55:56,381 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=69866.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:56:00,383 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=69872.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:56:06,509 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=69881.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:56:19,541 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=69901.0, num_to_drop=1, layers_to_drop={2} +2022-11-16 02:56:38,887 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=69929.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:56:38,965 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=69929.0, num_to_drop=1, layers_to_drop={2} +2022-11-16 02:56:40,107 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.783e+01 1.456e+02 1.859e+02 2.401e+02 3.905e+02, threshold=3.718e+02, percent-clipped=1.0 +2022-11-16 02:56:52,008 INFO [train.py:876] (2/4) Epoch 10, batch 4500, loss[loss=0.1006, simple_loss=0.135, pruned_loss=0.03309, over 5576.00 frames. ], tot_loss[loss=0.119, simple_loss=0.1448, pruned_loss=0.0466, over 1094563.38 frames. ], batch size: 16, lr: 8.04e-03, grad_scale: 8.0 +2022-11-16 02:56:52,042 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=69949.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 02:57:10,771 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=69976.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:57:11,363 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=69977.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:57:47,881 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=70024.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:57:47,951 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=70024.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:57:52,468 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=70030.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 02:57:52,994 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.972e+01 1.578e+02 1.852e+02 2.258e+02 4.002e+02, threshold=3.705e+02, percent-clipped=2.0 +2022-11-16 02:58:05,285 INFO [train.py:876] (2/4) Epoch 10, batch 4600, loss[loss=0.08062, simple_loss=0.1108, pruned_loss=0.02521, over 5242.00 frames. ], tot_loss[loss=0.1187, simple_loss=0.1448, pruned_loss=0.04628, over 1096964.61 frames. ], batch size: 8, lr: 8.03e-03, grad_scale: 8.0 +2022-11-16 02:58:06,023 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=70050.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:58:14,378 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.36 vs. limit=2.0 +2022-11-16 02:58:20,691 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=70072.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:58:38,728 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=70098.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:58:39,875 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.97 vs. limit=2.0 +2022-11-16 02:58:46,575 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=70110.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 02:58:47,157 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.3731, 3.9331, 4.1344, 3.9720, 4.4314, 4.2077, 4.0627, 4.4521], + device='cuda:2'), covar=tensor([0.0381, 0.0387, 0.0454, 0.0320, 0.0405, 0.0285, 0.0291, 0.0322], + device='cuda:2'), in_proj_covar=tensor([0.0134, 0.0139, 0.0102, 0.0134, 0.0161, 0.0098, 0.0116, 0.0142], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-16 02:59:00,301 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.197e+02 1.693e+02 2.084e+02 2.528e+02 4.221e+02, threshold=4.168e+02, percent-clipped=4.0 +2022-11-16 02:59:11,866 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.0261, 5.0583, 3.6239, 2.2288, 4.6492, 1.9828, 4.3855, 2.7407], + device='cuda:2'), covar=tensor([0.1208, 0.0094, 0.0477, 0.2118, 0.0153, 0.1756, 0.0368, 0.1482], + device='cuda:2'), in_proj_covar=tensor([0.0120, 0.0103, 0.0113, 0.0113, 0.0100, 0.0122, 0.0100, 0.0111], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 02:59:13,008 INFO [train.py:876] (2/4) Epoch 10, batch 4700, loss[loss=0.1426, simple_loss=0.1675, pruned_loss=0.05881, over 5524.00 frames. ], tot_loss[loss=0.1214, simple_loss=0.1465, pruned_loss=0.04817, over 1087596.37 frames. ], batch size: 46, lr: 8.03e-03, grad_scale: 8.0 +2022-11-16 02:59:15,123 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.9876, 5.1165, 3.7676, 2.2083, 4.7163, 2.0301, 4.6826, 2.5817], + device='cuda:2'), covar=tensor([0.1169, 0.0102, 0.0460, 0.2051, 0.0279, 0.1596, 0.0228, 0.1548], + device='cuda:2'), in_proj_covar=tensor([0.0121, 0.0103, 0.0113, 0.0113, 0.0100, 0.0122, 0.0100, 0.0111], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 02:59:23,207 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.30 vs. limit=5.0 +2022-11-16 02:59:27,547 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.9801, 4.5224, 4.7234, 4.4957, 5.0631, 4.8261, 4.4530, 5.0173], + device='cuda:2'), covar=tensor([0.0386, 0.0320, 0.0428, 0.0342, 0.0318, 0.0192, 0.0260, 0.0273], + device='cuda:2'), in_proj_covar=tensor([0.0134, 0.0138, 0.0101, 0.0132, 0.0159, 0.0096, 0.0115, 0.0141], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-16 02:59:27,642 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=70171.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:00:08,314 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.549e+01 1.608e+02 1.952e+02 2.269e+02 5.385e+02, threshold=3.904e+02, percent-clipped=1.0 +2022-11-16 03:00:20,904 INFO [train.py:876] (2/4) Epoch 10, batch 4800, loss[loss=0.12, simple_loss=0.1545, pruned_loss=0.04274, over 5570.00 frames. ], tot_loss[loss=0.1209, simple_loss=0.1464, pruned_loss=0.04767, over 1083099.58 frames. ], batch size: 22, lr: 8.02e-03, grad_scale: 8.0 +2022-11-16 03:00:28,842 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=70260.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:01:10,001 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=70321.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:01:15,776 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=70330.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 03:01:16,226 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.074e+02 1.701e+02 2.058e+02 2.547e+02 4.780e+02, threshold=4.115e+02, percent-clipped=1.0 +2022-11-16 03:01:28,225 INFO [train.py:876] (2/4) Epoch 10, batch 4900, loss[loss=0.1009, simple_loss=0.1318, pruned_loss=0.03499, over 5636.00 frames. ], tot_loss[loss=0.1216, simple_loss=0.1462, pruned_loss=0.04851, over 1080998.52 frames. ], batch size: 32, lr: 8.01e-03, grad_scale: 8.0 +2022-11-16 03:01:47,957 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=70378.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 03:02:13,457 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.5859, 1.0501, 1.1802, 0.8819, 1.1209, 1.3449, 0.7430, 1.1182], + device='cuda:2'), covar=tensor([0.0298, 0.0553, 0.0285, 0.0564, 0.0583, 0.0239, 0.0841, 0.0457], + device='cuda:2'), in_proj_covar=tensor([0.0013, 0.0020, 0.0014, 0.0017, 0.0014, 0.0013, 0.0019, 0.0013], + device='cuda:2'), out_proj_covar=tensor([7.0831e-05, 9.7408e-05, 7.2827e-05, 8.6445e-05, 7.4896e-05, 6.9857e-05, + 9.1101e-05, 7.0136e-05], device='cuda:2') +2022-11-16 03:02:18,127 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.76 vs. limit=2.0 +2022-11-16 03:02:24,079 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.085e+02 1.613e+02 1.862e+02 2.297e+02 4.157e+02, threshold=3.724e+02, percent-clipped=1.0 +2022-11-16 03:02:36,299 INFO [train.py:876] (2/4) Epoch 10, batch 5000, loss[loss=0.09965, simple_loss=0.1406, pruned_loss=0.02935, over 5547.00 frames. ], tot_loss[loss=0.1218, simple_loss=0.1467, pruned_loss=0.04845, over 1089150.10 frames. ], batch size: 16, lr: 8.01e-03, grad_scale: 8.0 +2022-11-16 03:02:48,236 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=70466.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:03:11,936 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0527, 3.1558, 2.4404, 1.6312, 2.9376, 1.2570, 3.0820, 1.6489], + device='cuda:2'), covar=tensor([0.1805, 0.0319, 0.1204, 0.2606, 0.0427, 0.2651, 0.0384, 0.2221], + device='cuda:2'), in_proj_covar=tensor([0.0121, 0.0104, 0.0113, 0.0114, 0.0101, 0.0122, 0.0099, 0.0111], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 03:03:26,453 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.7487, 2.5472, 3.0187, 3.8858, 3.9176, 3.0200, 2.6287, 3.7386], + device='cuda:2'), covar=tensor([0.0671, 0.3393, 0.2142, 0.2420, 0.1050, 0.2646, 0.1931, 0.0739], + device='cuda:2'), in_proj_covar=tensor([0.0246, 0.0204, 0.0190, 0.0314, 0.0228, 0.0209, 0.0192, 0.0244], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0005], + device='cuda:2') +2022-11-16 03:03:32,300 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.646e+01 1.626e+02 2.052e+02 2.453e+02 4.423e+02, threshold=4.104e+02, percent-clipped=3.0 +2022-11-16 03:03:41,560 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0114, 3.5956, 2.5326, 3.2770, 2.6599, 2.5577, 2.0455, 3.0678], + device='cuda:2'), covar=tensor([0.1375, 0.0256, 0.1069, 0.0463, 0.1160, 0.1079, 0.1874, 0.0507], + device='cuda:2'), in_proj_covar=tensor([0.0155, 0.0138, 0.0160, 0.0145, 0.0173, 0.0169, 0.0164, 0.0156], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-16 03:03:44,162 INFO [train.py:876] (2/4) Epoch 10, batch 5100, loss[loss=0.124, simple_loss=0.1415, pruned_loss=0.05327, over 5103.00 frames. ], tot_loss[loss=0.1216, simple_loss=0.1462, pruned_loss=0.04855, over 1086189.52 frames. ], batch size: 91, lr: 8.00e-03, grad_scale: 8.0 +2022-11-16 03:03:45,565 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0143, 3.6392, 2.5441, 3.3871, 2.6597, 2.5850, 2.0068, 3.1794], + device='cuda:2'), covar=tensor([0.1395, 0.0280, 0.1073, 0.0417, 0.1028, 0.1039, 0.1875, 0.0411], + device='cuda:2'), in_proj_covar=tensor([0.0155, 0.0138, 0.0160, 0.0145, 0.0173, 0.0169, 0.0164, 0.0156], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-16 03:04:24,115 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.3432, 3.4685, 3.6548, 3.2776, 3.5158, 3.4155, 1.3688, 3.7021], + device='cuda:2'), covar=tensor([0.0420, 0.0589, 0.0338, 0.0467, 0.0472, 0.0438, 0.3344, 0.0395], + device='cuda:2'), in_proj_covar=tensor([0.0104, 0.0085, 0.0085, 0.0079, 0.0101, 0.0087, 0.0130, 0.0106], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 03:04:30,170 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=70616.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:04:40,395 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.739e+01 1.539e+02 1.803e+02 2.270e+02 4.392e+02, threshold=3.606e+02, percent-clipped=1.0 +2022-11-16 03:04:52,545 INFO [train.py:876] (2/4) Epoch 10, batch 5200, loss[loss=0.1676, simple_loss=0.1844, pruned_loss=0.07542, over 5352.00 frames. ], tot_loss[loss=0.1201, simple_loss=0.1455, pruned_loss=0.0473, over 1085131.11 frames. ], batch size: 70, lr: 8.00e-03, grad_scale: 8.0 +2022-11-16 03:04:57,869 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.7961, 2.3794, 2.3235, 1.3431, 2.4786, 2.8121, 2.5102, 2.8093], + device='cuda:2'), covar=tensor([0.2343, 0.1914, 0.1627, 0.3525, 0.0764, 0.1111, 0.0644, 0.1289], + device='cuda:2'), in_proj_covar=tensor([0.0175, 0.0188, 0.0165, 0.0190, 0.0181, 0.0200, 0.0169, 0.0190], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 03:05:26,811 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.7354, 1.6789, 1.7406, 1.4441, 1.5356, 1.4864, 1.2718, 1.7183], + device='cuda:2'), covar=tensor([0.0042, 0.0046, 0.0038, 0.0044, 0.0046, 0.0033, 0.0055, 0.0051], + device='cuda:2'), in_proj_covar=tensor([0.0053, 0.0049, 0.0051, 0.0052, 0.0052, 0.0046, 0.0048, 0.0044], + device='cuda:2'), out_proj_covar=tensor([4.8040e-05, 4.3847e-05, 4.5559e-05, 4.6631e-05, 4.6434e-05, 4.0174e-05, + 4.3203e-05, 3.8996e-05], device='cuda:2') +2022-11-16 03:05:47,144 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.458e+01 1.499e+02 1.855e+02 2.184e+02 5.918e+02, threshold=3.711e+02, percent-clipped=4.0 +2022-11-16 03:05:48,583 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.4198, 2.6746, 2.6798, 2.4899, 2.6162, 2.6273, 1.2257, 2.7279], + device='cuda:2'), covar=tensor([0.0316, 0.0274, 0.0283, 0.0306, 0.0349, 0.0328, 0.2508, 0.0339], + device='cuda:2'), in_proj_covar=tensor([0.0103, 0.0084, 0.0085, 0.0079, 0.0100, 0.0087, 0.0128, 0.0106], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 03:05:53,857 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.5506, 2.0359, 1.8248, 1.1856, 1.7891, 2.3884, 1.9983, 2.2818], + device='cuda:2'), covar=tensor([0.1872, 0.1711, 0.1631, 0.2964, 0.1192, 0.0881, 0.0676, 0.1333], + device='cuda:2'), in_proj_covar=tensor([0.0174, 0.0188, 0.0164, 0.0188, 0.0180, 0.0200, 0.0168, 0.0189], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 03:05:59,635 INFO [train.py:876] (2/4) Epoch 10, batch 5300, loss[loss=0.1136, simple_loss=0.1452, pruned_loss=0.04101, over 5576.00 frames. ], tot_loss[loss=0.119, simple_loss=0.1451, pruned_loss=0.04651, over 1086314.96 frames. ], batch size: 16, lr: 7.99e-03, grad_scale: 8.0 +2022-11-16 03:06:11,218 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=70766.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:06:17,545 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.2121, 3.5700, 3.5008, 3.4974, 3.5007, 3.4682, 1.3529, 3.5902], + device='cuda:2'), covar=tensor([0.0556, 0.0386, 0.0455, 0.0329, 0.0461, 0.0429, 0.4053, 0.0450], + device='cuda:2'), in_proj_covar=tensor([0.0103, 0.0085, 0.0085, 0.0079, 0.0101, 0.0087, 0.0129, 0.0106], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 03:06:34,919 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.18 vs. limit=2.0 +2022-11-16 03:06:43,996 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=70814.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:06:55,260 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.500e+01 1.530e+02 1.938e+02 2.261e+02 4.134e+02, threshold=3.876e+02, percent-clipped=2.0 +2022-11-16 03:07:07,459 INFO [train.py:876] (2/4) Epoch 10, batch 5400, loss[loss=0.08009, simple_loss=0.1164, pruned_loss=0.02187, over 5735.00 frames. ], tot_loss[loss=0.1209, simple_loss=0.1463, pruned_loss=0.04775, over 1085717.72 frames. ], batch size: 13, lr: 7.99e-03, grad_scale: 8.0 +2022-11-16 03:07:43,696 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.4084, 4.4761, 4.7410, 4.6893, 4.1470, 4.0530, 5.2089, 4.5937], + device='cuda:2'), covar=tensor([0.0471, 0.0711, 0.0293, 0.1016, 0.0481, 0.0268, 0.0450, 0.0486], + device='cuda:2'), in_proj_covar=tensor([0.0089, 0.0108, 0.0093, 0.0120, 0.0089, 0.0080, 0.0145, 0.0104], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 03:07:52,769 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=70916.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:08:02,766 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.058e+02 1.679e+02 2.107e+02 2.563e+02 5.005e+02, threshold=4.215e+02, percent-clipped=8.0 +2022-11-16 03:08:14,643 INFO [train.py:876] (2/4) Epoch 10, batch 5500, loss[loss=0.07579, simple_loss=0.1064, pruned_loss=0.02259, over 4766.00 frames. ], tot_loss[loss=0.1211, simple_loss=0.1466, pruned_loss=0.04781, over 1086662.31 frames. ], batch size: 5, lr: 7.98e-03, grad_scale: 8.0 +2022-11-16 03:08:24,878 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=70964.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:08:30,508 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.1384, 4.0464, 4.3477, 4.2234, 3.7286, 3.7709, 4.6535, 4.2418], + device='cuda:2'), covar=tensor([0.0387, 0.0826, 0.0340, 0.1174, 0.0666, 0.0368, 0.0594, 0.0437], + device='cuda:2'), in_proj_covar=tensor([0.0088, 0.0107, 0.0092, 0.0119, 0.0088, 0.0079, 0.0144, 0.0103], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 03:08:44,002 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=70992.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:09:05,593 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.77 vs. limit=5.0 +2022-11-16 03:09:10,368 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=71030.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:09:10,839 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.028e+02 1.597e+02 2.081e+02 2.610e+02 5.785e+02, threshold=4.161e+02, percent-clipped=1.0 +2022-11-16 03:09:10,961 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.5710, 3.4783, 3.7336, 3.6788, 3.2379, 3.1501, 4.0482, 3.6336], + device='cuda:2'), covar=tensor([0.0591, 0.0862, 0.0492, 0.1240, 0.0781, 0.0532, 0.0771, 0.0718], + device='cuda:2'), in_proj_covar=tensor([0.0088, 0.0107, 0.0092, 0.0120, 0.0088, 0.0079, 0.0145, 0.0103], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 03:09:22,367 INFO [train.py:876] (2/4) Epoch 10, batch 5600, loss[loss=0.1694, simple_loss=0.1879, pruned_loss=0.07542, over 5559.00 frames. ], tot_loss[loss=0.1214, simple_loss=0.1472, pruned_loss=0.04779, over 1084607.98 frames. ], batch size: 40, lr: 7.98e-03, grad_scale: 8.0 +2022-11-16 03:09:25,137 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=71053.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:09:31,397 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.7408, 4.0947, 3.9860, 3.7049, 3.9020, 3.7109, 1.4164, 4.0631], + device='cuda:2'), covar=tensor([0.0458, 0.0306, 0.0353, 0.0409, 0.0445, 0.0439, 0.3802, 0.0399], + device='cuda:2'), in_proj_covar=tensor([0.0104, 0.0086, 0.0086, 0.0079, 0.0103, 0.0088, 0.0129, 0.0107], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 03:09:51,310 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=71091.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:09:57,785 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.9626, 4.3293, 3.9041, 4.3260, 4.3219, 3.6005, 4.0475, 3.8052], + device='cuda:2'), covar=tensor([0.0679, 0.0570, 0.1562, 0.0486, 0.0606, 0.0691, 0.0688, 0.0695], + device='cuda:2'), in_proj_covar=tensor([0.0134, 0.0177, 0.0271, 0.0173, 0.0218, 0.0176, 0.0188, 0.0175], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0004, 0.0003, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 03:10:07,179 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.2392, 2.4911, 2.6012, 3.4969, 3.4047, 2.5398, 2.3152, 3.4527], + device='cuda:2'), covar=tensor([0.0865, 0.2146, 0.2009, 0.2199, 0.1007, 0.2883, 0.1869, 0.0797], + device='cuda:2'), in_proj_covar=tensor([0.0246, 0.0202, 0.0191, 0.0316, 0.0228, 0.0209, 0.0192, 0.0244], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0005], + device='cuda:2') +2022-11-16 03:10:18,060 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.341e+01 1.536e+02 1.905e+02 2.360e+02 4.065e+02, threshold=3.810e+02, percent-clipped=0.0 +2022-11-16 03:10:30,667 INFO [train.py:876] (2/4) Epoch 10, batch 5700, loss[loss=0.1378, simple_loss=0.1633, pruned_loss=0.05618, over 5697.00 frames. ], tot_loss[loss=0.1205, simple_loss=0.1462, pruned_loss=0.04742, over 1089486.41 frames. ], batch size: 36, lr: 7.97e-03, grad_scale: 8.0 +2022-11-16 03:10:41,407 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.8210, 1.4638, 1.7400, 1.2246, 1.4404, 1.5561, 1.3048, 1.2652], + device='cuda:2'), covar=tensor([0.0030, 0.0034, 0.0019, 0.0059, 0.0073, 0.0064, 0.0038, 0.0055], + device='cuda:2'), in_proj_covar=tensor([0.0025, 0.0024, 0.0024, 0.0032, 0.0028, 0.0026, 0.0030, 0.0031], + device='cuda:2'), out_proj_covar=tensor([2.2661e-05, 2.2642e-05, 2.2238e-05, 3.1296e-05, 2.6116e-05, 2.4524e-05, + 2.8914e-05, 3.0597e-05], device='cuda:2') +2022-11-16 03:11:04,233 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.26 vs. limit=2.0 +2022-11-16 03:11:26,880 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.887e+01 1.504e+02 1.909e+02 2.394e+02 3.838e+02, threshold=3.819e+02, percent-clipped=2.0 +2022-11-16 03:11:38,470 INFO [train.py:876] (2/4) Epoch 10, batch 5800, loss[loss=0.1017, simple_loss=0.142, pruned_loss=0.03074, over 5480.00 frames. ], tot_loss[loss=0.1215, simple_loss=0.1471, pruned_loss=0.048, over 1084859.05 frames. ], batch size: 12, lr: 7.96e-03, grad_scale: 4.0 +2022-11-16 03:12:18,821 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.7079, 2.3245, 3.3597, 2.8744, 3.3055, 2.4757, 3.0653, 3.6478], + device='cuda:2'), covar=tensor([0.0697, 0.1581, 0.0892, 0.1803, 0.0779, 0.1600, 0.1160, 0.0805], + device='cuda:2'), in_proj_covar=tensor([0.0238, 0.0193, 0.0211, 0.0211, 0.0236, 0.0191, 0.0225, 0.0229], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 03:12:22,698 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.5073, 1.0869, 1.0622, 0.8686, 1.1789, 1.2420, 0.8007, 1.1821], + device='cuda:2'), covar=tensor([0.0319, 0.0394, 0.0682, 0.0669, 0.0567, 0.0366, 0.0702, 0.0303], + device='cuda:2'), in_proj_covar=tensor([0.0013, 0.0021, 0.0014, 0.0018, 0.0015, 0.0013, 0.0020, 0.0014], + device='cuda:2'), out_proj_covar=tensor([7.2662e-05, 1.0041e-04, 7.6494e-05, 8.9657e-05, 7.7457e-05, 7.1967e-05, + 9.5033e-05, 7.3398e-05], device='cuda:2') +2022-11-16 03:12:34,045 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.935e+01 1.517e+02 1.924e+02 2.485e+02 4.438e+02, threshold=3.847e+02, percent-clipped=4.0 +2022-11-16 03:12:44,749 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=71348.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:12:45,334 INFO [train.py:876] (2/4) Epoch 10, batch 5900, loss[loss=0.1043, simple_loss=0.1323, pruned_loss=0.03811, over 5751.00 frames. ], tot_loss[loss=0.1191, simple_loss=0.145, pruned_loss=0.04663, over 1090043.64 frames. ], batch size: 16, lr: 7.96e-03, grad_scale: 4.0 +2022-11-16 03:12:55,454 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.7517, 4.2522, 3.7973, 3.4503, 2.2472, 4.1301, 2.3178, 3.2826], + device='cuda:2'), covar=tensor([0.0423, 0.0159, 0.0184, 0.0345, 0.0612, 0.0159, 0.0540, 0.0197], + device='cuda:2'), in_proj_covar=tensor([0.0186, 0.0166, 0.0173, 0.0195, 0.0186, 0.0171, 0.0183, 0.0176], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 03:12:55,959 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.7359, 5.0862, 5.3412, 5.0976, 5.7694, 5.5918, 4.7903, 5.6532], + device='cuda:2'), covar=tensor([0.0300, 0.0311, 0.0496, 0.0306, 0.0243, 0.0153, 0.0241, 0.0200], + device='cuda:2'), in_proj_covar=tensor([0.0140, 0.0147, 0.0106, 0.0142, 0.0169, 0.0101, 0.0120, 0.0147], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-16 03:13:02,262 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.7644, 2.3895, 2.7641, 3.7164, 3.7736, 2.7996, 2.5886, 3.5485], + device='cuda:2'), covar=tensor([0.0842, 0.3092, 0.2628, 0.2475, 0.1089, 0.2934, 0.2083, 0.1331], + device='cuda:2'), in_proj_covar=tensor([0.0245, 0.0201, 0.0191, 0.0316, 0.0229, 0.0207, 0.0193, 0.0244], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0006], + device='cuda:2') +2022-11-16 03:13:02,812 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1303, 3.9332, 2.7877, 3.7715, 3.1679, 2.5679, 2.0805, 3.4240], + device='cuda:2'), covar=tensor([0.1638, 0.0292, 0.1163, 0.0349, 0.0854, 0.1289, 0.2134, 0.0418], + device='cuda:2'), in_proj_covar=tensor([0.0157, 0.0140, 0.0162, 0.0146, 0.0176, 0.0171, 0.0165, 0.0158], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 03:13:04,374 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.53 vs. limit=2.0 +2022-11-16 03:13:10,621 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=71386.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:13:12,701 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.9698, 4.2225, 3.9213, 3.8130, 2.2729, 4.1634, 2.4458, 3.4810], + device='cuda:2'), covar=tensor([0.0409, 0.0213, 0.0208, 0.0308, 0.0617, 0.0163, 0.0475, 0.0234], + device='cuda:2'), in_proj_covar=tensor([0.0185, 0.0165, 0.0172, 0.0194, 0.0184, 0.0170, 0.0181, 0.0175], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 03:13:19,407 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.73 vs. limit=5.0 +2022-11-16 03:13:25,600 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.4748, 4.1704, 4.4730, 4.1071, 4.5643, 4.4397, 1.6693, 4.6306], + device='cuda:2'), covar=tensor([0.0218, 0.0514, 0.0285, 0.0262, 0.0231, 0.0213, 0.3130, 0.0236], + device='cuda:2'), in_proj_covar=tensor([0.0104, 0.0087, 0.0086, 0.0079, 0.0102, 0.0088, 0.0130, 0.0107], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 03:13:37,115 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.62 vs. limit=5.0 +2022-11-16 03:13:42,136 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.076e+02 1.550e+02 1.857e+02 2.377e+02 5.014e+02, threshold=3.713e+02, percent-clipped=7.0 +2022-11-16 03:13:53,308 INFO [train.py:876] (2/4) Epoch 10, batch 6000, loss[loss=0.1416, simple_loss=0.1455, pruned_loss=0.06892, over 4708.00 frames. ], tot_loss[loss=0.1174, simple_loss=0.1437, pruned_loss=0.04558, over 1088159.91 frames. ], batch size: 135, lr: 7.95e-03, grad_scale: 8.0 +2022-11-16 03:13:53,309 INFO [train.py:899] (2/4) Computing validation loss +2022-11-16 03:13:59,199 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.5682, 2.2842, 2.7575, 3.6708, 3.6264, 2.6494, 2.4265, 3.7108], + device='cuda:2'), covar=tensor([0.0791, 0.3007, 0.2201, 0.2078, 0.1063, 0.2473, 0.2148, 0.0677], + device='cuda:2'), in_proj_covar=tensor([0.0246, 0.0201, 0.0191, 0.0315, 0.0229, 0.0206, 0.0192, 0.0243], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0005], + device='cuda:2') +2022-11-16 03:14:11,198 INFO [train.py:908] (2/4) Epoch 10, validation: loss=0.1673, simple_loss=0.1835, pruned_loss=0.0755, over 1530663.00 frames. +2022-11-16 03:14:11,199 INFO [train.py:909] (2/4) Maximum memory allocated so far is 4715MB +2022-11-16 03:15:07,801 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.614e+01 1.652e+02 1.842e+02 2.318e+02 4.092e+02, threshold=3.683e+02, percent-clipped=3.0 +2022-11-16 03:15:18,748 INFO [train.py:876] (2/4) Epoch 10, batch 6100, loss[loss=0.07871, simple_loss=0.101, pruned_loss=0.02822, over 5196.00 frames. ], tot_loss[loss=0.1171, simple_loss=0.1439, pruned_loss=0.04513, over 1091143.69 frames. ], batch size: 7, lr: 7.95e-03, grad_scale: 8.0 +2022-11-16 03:15:31,264 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.84 vs. limit=2.0 +2022-11-16 03:16:00,418 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.20 vs. limit=2.0 +2022-11-16 03:16:01,695 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.7846, 1.2064, 1.3642, 1.0911, 1.5885, 1.3176, 1.1883, 1.3529], + device='cuda:2'), covar=tensor([0.0522, 0.0971, 0.0864, 0.0629, 0.1303, 0.1460, 0.0521, 0.0834], + device='cuda:2'), in_proj_covar=tensor([0.0014, 0.0021, 0.0015, 0.0018, 0.0015, 0.0014, 0.0020, 0.0014], + device='cuda:2'), out_proj_covar=tensor([7.5078e-05, 1.0234e-04, 7.8363e-05, 9.1546e-05, 7.9685e-05, 7.4441e-05, + 9.7138e-05, 7.5596e-05], device='cuda:2') +2022-11-16 03:16:04,736 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=71616.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:16:16,026 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.003e+01 1.497e+02 1.833e+02 2.234e+02 4.359e+02, threshold=3.667e+02, percent-clipped=3.0 +2022-11-16 03:16:26,700 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=71648.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:16:27,196 INFO [train.py:876] (2/4) Epoch 10, batch 6200, loss[loss=0.1128, simple_loss=0.1316, pruned_loss=0.04698, over 5696.00 frames. ], tot_loss[loss=0.1189, simple_loss=0.1448, pruned_loss=0.04646, over 1082902.75 frames. ], batch size: 11, lr: 7.94e-03, grad_scale: 8.0 +2022-11-16 03:16:45,863 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=71677.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:16:52,342 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=71686.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:16:58,955 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=71696.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:17:11,481 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.0036, 2.4827, 2.9063, 3.9310, 3.9924, 2.8553, 2.7687, 3.9588], + device='cuda:2'), covar=tensor([0.0630, 0.2787, 0.1732, 0.2291, 0.0894, 0.3051, 0.1821, 0.1102], + device='cuda:2'), in_proj_covar=tensor([0.0249, 0.0202, 0.0193, 0.0319, 0.0231, 0.0210, 0.0192, 0.0245], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0006], + device='cuda:2') +2022-11-16 03:17:23,067 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.9718, 3.6580, 3.7854, 3.6504, 4.0259, 3.6097, 3.6944, 4.0415], + device='cuda:2'), covar=tensor([0.0381, 0.0423, 0.0511, 0.0355, 0.0405, 0.0460, 0.0361, 0.0375], + device='cuda:2'), in_proj_covar=tensor([0.0139, 0.0145, 0.0107, 0.0141, 0.0169, 0.0100, 0.0120, 0.0147], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-16 03:17:23,601 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.040e+02 1.569e+02 1.914e+02 2.269e+02 4.591e+02, threshold=3.828e+02, percent-clipped=5.0 +2022-11-16 03:17:24,937 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=71734.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:17:29,575 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.7643, 2.8650, 2.4732, 2.9339, 2.3242, 2.3739, 2.4072, 3.2814], + device='cuda:2'), covar=tensor([0.0968, 0.1225, 0.1690, 0.1439, 0.1644, 0.1548, 0.1590, 0.1138], + device='cuda:2'), in_proj_covar=tensor([0.0104, 0.0100, 0.0102, 0.0097, 0.0088, 0.0098, 0.0095, 0.0075], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 03:17:34,643 INFO [train.py:876] (2/4) Epoch 10, batch 6300, loss[loss=0.1163, simple_loss=0.1489, pruned_loss=0.04185, over 5675.00 frames. ], tot_loss[loss=0.121, simple_loss=0.1465, pruned_loss=0.04773, over 1081499.67 frames. ], batch size: 34, lr: 7.94e-03, grad_scale: 8.0 +2022-11-16 03:17:38,739 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.8435, 2.6324, 2.1519, 2.2091, 1.5512, 2.2103, 1.6073, 2.3498], + device='cuda:2'), covar=tensor([0.1085, 0.0355, 0.0863, 0.0666, 0.1983, 0.0865, 0.1753, 0.0455], + device='cuda:2'), in_proj_covar=tensor([0.0155, 0.0138, 0.0161, 0.0145, 0.0174, 0.0169, 0.0165, 0.0156], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-16 03:17:47,798 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.4895, 1.8977, 1.5757, 1.2126, 1.7542, 0.9112, 1.9029, 1.0797], + device='cuda:2'), covar=tensor([0.1001, 0.0371, 0.1107, 0.1147, 0.0438, 0.2064, 0.0397, 0.1379], + device='cuda:2'), in_proj_covar=tensor([0.0120, 0.0104, 0.0113, 0.0112, 0.0101, 0.0121, 0.0098, 0.0111], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 03:18:14,814 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.25 vs. limit=2.0 +2022-11-16 03:18:30,422 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.087e+02 1.579e+02 1.935e+02 2.633e+02 4.632e+02, threshold=3.870e+02, percent-clipped=2.0 +2022-11-16 03:18:42,504 INFO [train.py:876] (2/4) Epoch 10, batch 6400, loss[loss=0.09879, simple_loss=0.1285, pruned_loss=0.03456, over 5569.00 frames. ], tot_loss[loss=0.1194, simple_loss=0.1449, pruned_loss=0.04689, over 1078551.81 frames. ], batch size: 15, lr: 7.93e-03, grad_scale: 8.0 +2022-11-16 03:18:57,051 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.06 vs. limit=2.0 +2022-11-16 03:19:18,952 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.5733, 4.4569, 4.5560, 4.6337, 4.0792, 3.9286, 5.0505, 4.4118], + device='cuda:2'), covar=tensor([0.0429, 0.0966, 0.0399, 0.1123, 0.0501, 0.0367, 0.0603, 0.0590], + device='cuda:2'), in_proj_covar=tensor([0.0087, 0.0108, 0.0093, 0.0120, 0.0089, 0.0080, 0.0146, 0.0102], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 03:19:26,929 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1148, 3.8901, 2.6196, 3.7407, 2.9651, 2.5984, 2.0186, 3.3513], + device='cuda:2'), covar=tensor([0.1591, 0.0260, 0.1151, 0.0339, 0.0933, 0.1110, 0.2092, 0.0427], + device='cuda:2'), in_proj_covar=tensor([0.0154, 0.0136, 0.0159, 0.0143, 0.0173, 0.0167, 0.0162, 0.0155], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-16 03:19:29,517 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=71919.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:19:37,764 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.943e+01 1.534e+02 1.967e+02 2.415e+02 6.448e+02, threshold=3.935e+02, percent-clipped=1.0 +2022-11-16 03:19:46,952 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.6946, 2.8127, 2.9618, 2.6858, 2.8967, 2.7896, 1.2248, 3.0033], + device='cuda:2'), covar=tensor([0.0337, 0.0383, 0.0304, 0.0332, 0.0387, 0.0406, 0.2902, 0.0356], + device='cuda:2'), in_proj_covar=tensor([0.0106, 0.0088, 0.0087, 0.0081, 0.0105, 0.0090, 0.0132, 0.0109], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 03:19:50,179 INFO [train.py:876] (2/4) Epoch 10, batch 6500, loss[loss=0.1484, simple_loss=0.1704, pruned_loss=0.06321, over 5585.00 frames. ], tot_loss[loss=0.1193, simple_loss=0.1452, pruned_loss=0.04673, over 1080902.28 frames. ], batch size: 54, lr: 7.93e-03, grad_scale: 8.0 +2022-11-16 03:20:01,164 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.90 vs. limit=2.0 +2022-11-16 03:20:05,472 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=71972.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:20:10,810 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=71980.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:20:20,282 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.4775, 4.2660, 3.2912, 1.9938, 4.0006, 1.6632, 3.9518, 2.3159], + device='cuda:2'), covar=tensor([0.1539, 0.0178, 0.0830, 0.2000, 0.0206, 0.1885, 0.0233, 0.1601], + device='cuda:2'), in_proj_covar=tensor([0.0121, 0.0105, 0.0113, 0.0113, 0.0101, 0.0121, 0.0098, 0.0112], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 03:20:26,754 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.00 vs. limit=2.0 +2022-11-16 03:20:46,046 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.311e+01 1.515e+02 1.926e+02 2.349e+02 4.433e+02, threshold=3.852e+02, percent-clipped=3.0 +2022-11-16 03:20:57,524 INFO [train.py:876] (2/4) Epoch 10, batch 6600, loss[loss=0.1488, simple_loss=0.1661, pruned_loss=0.06574, over 5447.00 frames. ], tot_loss[loss=0.1194, simple_loss=0.1458, pruned_loss=0.04655, over 1085860.35 frames. ], batch size: 53, lr: 7.92e-03, grad_scale: 8.0 +2022-11-16 03:21:19,641 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.44 vs. limit=5.0 +2022-11-16 03:21:49,596 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.2147, 2.6288, 2.6452, 3.3869, 3.2431, 2.5971, 2.1876, 3.4080], + device='cuda:2'), covar=tensor([0.0845, 0.2138, 0.1770, 0.2021, 0.1242, 0.2483, 0.2194, 0.0814], + device='cuda:2'), in_proj_covar=tensor([0.0249, 0.0204, 0.0192, 0.0317, 0.0232, 0.0210, 0.0194, 0.0248], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0006], + device='cuda:2') +2022-11-16 03:21:51,555 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=72129.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:21:53,318 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.616e+01 1.605e+02 1.991e+02 2.572e+02 4.942e+02, threshold=3.982e+02, percent-clipped=4.0 +2022-11-16 03:22:04,584 INFO [train.py:876] (2/4) Epoch 10, batch 6700, loss[loss=0.1337, simple_loss=0.1636, pruned_loss=0.05189, over 5624.00 frames. ], tot_loss[loss=0.1182, simple_loss=0.1446, pruned_loss=0.04591, over 1087198.79 frames. ], batch size: 32, lr: 7.91e-03, grad_scale: 8.0 +2022-11-16 03:22:33,093 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=72190.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:22:36,992 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.6352, 2.4492, 2.8489, 3.7116, 3.3688, 2.6445, 2.4331, 3.6354], + device='cuda:2'), covar=tensor([0.0885, 0.2896, 0.1850, 0.2147, 0.1443, 0.3080, 0.2241, 0.0769], + device='cuda:2'), in_proj_covar=tensor([0.0249, 0.0203, 0.0192, 0.0317, 0.0232, 0.0210, 0.0193, 0.0248], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0006], + device='cuda:2') +2022-11-16 03:22:38,170 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.4454, 2.3295, 2.3129, 2.3932, 2.4511, 2.2299, 2.6526, 2.5228], + device='cuda:2'), covar=tensor([0.0598, 0.1016, 0.0690, 0.1274, 0.0673, 0.0637, 0.1070, 0.0893], + device='cuda:2'), in_proj_covar=tensor([0.0086, 0.0107, 0.0091, 0.0118, 0.0087, 0.0078, 0.0144, 0.0100], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 03:23:00,855 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.0266, 2.3437, 3.6761, 3.2093, 3.8803, 2.7186, 3.4271, 4.0201], + device='cuda:2'), covar=tensor([0.0618, 0.1731, 0.0795, 0.1540, 0.0536, 0.1586, 0.1224, 0.1016], + device='cuda:2'), in_proj_covar=tensor([0.0232, 0.0191, 0.0206, 0.0208, 0.0229, 0.0190, 0.0221, 0.0225], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 03:23:01,934 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.034e+02 1.623e+02 1.994e+02 2.476e+02 5.420e+02, threshold=3.989e+02, percent-clipped=4.0 +2022-11-16 03:23:03,994 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.7884, 1.6010, 1.5177, 1.1716, 1.3052, 1.4169, 1.2233, 1.2195], + device='cuda:2'), covar=tensor([0.0027, 0.0039, 0.0060, 0.0045, 0.0080, 0.0123, 0.0046, 0.0053], + device='cuda:2'), in_proj_covar=tensor([0.0025, 0.0023, 0.0024, 0.0031, 0.0027, 0.0025, 0.0030, 0.0030], + device='cuda:2'), out_proj_covar=tensor([2.2593e-05, 2.1626e-05, 2.2120e-05, 3.0428e-05, 2.5066e-05, 2.3910e-05, + 2.9321e-05, 2.9601e-05], device='cuda:2') +2022-11-16 03:23:13,015 INFO [train.py:876] (2/4) Epoch 10, batch 6800, loss[loss=0.1124, simple_loss=0.1493, pruned_loss=0.03779, over 5765.00 frames. ], tot_loss[loss=0.1177, simple_loss=0.1443, pruned_loss=0.04549, over 1088489.68 frames. ], batch size: 20, lr: 7.91e-03, grad_scale: 8.0 +2022-11-16 03:23:26,222 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.9721, 2.9970, 3.0301, 1.6101, 2.8014, 3.1257, 3.0880, 3.7209], + device='cuda:2'), covar=tensor([0.2241, 0.1532, 0.0984, 0.3012, 0.0523, 0.0888, 0.0642, 0.0768], + device='cuda:2'), in_proj_covar=tensor([0.0171, 0.0182, 0.0165, 0.0185, 0.0179, 0.0198, 0.0170, 0.0188], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 03:23:28,136 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=72272.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:23:30,337 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=72275.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:23:34,339 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.9923, 4.2470, 2.5205, 4.0869, 3.2936, 2.6220, 2.2489, 3.5176], + device='cuda:2'), covar=tensor([0.2691, 0.0359, 0.1905, 0.0393, 0.0943, 0.1826, 0.2595, 0.0673], + device='cuda:2'), in_proj_covar=tensor([0.0158, 0.0140, 0.0162, 0.0144, 0.0177, 0.0171, 0.0165, 0.0158], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 03:24:00,668 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.0283, 4.5591, 4.8791, 4.5058, 5.1095, 4.9794, 4.5014, 5.1215], + device='cuda:2'), covar=tensor([0.0338, 0.0331, 0.0396, 0.0351, 0.0333, 0.0183, 0.0264, 0.0253], + device='cuda:2'), in_proj_covar=tensor([0.0143, 0.0151, 0.0110, 0.0147, 0.0173, 0.0103, 0.0123, 0.0152], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-16 03:24:02,089 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=72320.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:24:06,393 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=72326.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:24:11,067 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.013e+02 1.569e+02 1.889e+02 2.290e+02 4.536e+02, threshold=3.778e+02, percent-clipped=2.0 +2022-11-16 03:24:23,855 INFO [train.py:876] (2/4) Epoch 10, batch 6900, loss[loss=0.08782, simple_loss=0.1184, pruned_loss=0.02863, over 5682.00 frames. ], tot_loss[loss=0.119, simple_loss=0.1447, pruned_loss=0.04666, over 1088927.49 frames. ], batch size: 11, lr: 7.90e-03, grad_scale: 8.0 +2022-11-16 03:24:33,213 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.4739, 3.0821, 3.2352, 2.9429, 1.9792, 3.2265, 2.1454, 2.8998], + device='cuda:2'), covar=tensor([0.0317, 0.0194, 0.0157, 0.0249, 0.0468, 0.0159, 0.0439, 0.0160], + device='cuda:2'), in_proj_covar=tensor([0.0193, 0.0172, 0.0179, 0.0201, 0.0190, 0.0175, 0.0189, 0.0180], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 03:24:49,657 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=72387.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:25:08,852 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.6516, 3.5610, 3.5752, 3.2594, 1.9323, 3.6522, 2.2292, 3.0552], + device='cuda:2'), covar=tensor([0.0356, 0.0147, 0.0201, 0.0332, 0.0532, 0.0167, 0.0461, 0.0182], + device='cuda:2'), in_proj_covar=tensor([0.0191, 0.0172, 0.0177, 0.0199, 0.0188, 0.0173, 0.0187, 0.0178], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 03:25:19,931 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.314e+01 1.574e+02 1.843e+02 2.228e+02 3.077e+02, threshold=3.686e+02, percent-clipped=0.0 +2022-11-16 03:25:32,132 INFO [train.py:876] (2/4) Epoch 10, batch 7000, loss[loss=0.08596, simple_loss=0.1203, pruned_loss=0.02582, over 5711.00 frames. ], tot_loss[loss=0.1203, simple_loss=0.1457, pruned_loss=0.04749, over 1081305.06 frames. ], batch size: 17, lr: 7.90e-03, grad_scale: 8.0 +2022-11-16 03:25:56,126 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=72485.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:26:16,198 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.48 vs. limit=2.0 +2022-11-16 03:26:27,980 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.005e+02 1.555e+02 1.806e+02 2.267e+02 5.483e+02, threshold=3.612e+02, percent-clipped=3.0 +2022-11-16 03:26:39,482 INFO [train.py:876] (2/4) Epoch 10, batch 7100, loss[loss=0.1269, simple_loss=0.1492, pruned_loss=0.05229, over 5295.00 frames. ], tot_loss[loss=0.1193, simple_loss=0.1449, pruned_loss=0.04681, over 1080629.68 frames. ], batch size: 79, lr: 7.89e-03, grad_scale: 8.0 +2022-11-16 03:26:57,715 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=72575.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:27:03,921 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.29 vs. limit=2.0 +2022-11-16 03:27:05,807 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.81 vs. limit=2.0 +2022-11-16 03:27:08,127 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.6087, 1.2047, 1.3621, 1.1347, 1.2762, 1.5287, 1.0502, 1.1052], + device='cuda:2'), covar=tensor([0.0036, 0.0094, 0.0049, 0.0064, 0.0085, 0.0091, 0.0065, 0.0090], + device='cuda:2'), in_proj_covar=tensor([0.0024, 0.0023, 0.0024, 0.0031, 0.0026, 0.0025, 0.0031, 0.0030], + device='cuda:2'), out_proj_covar=tensor([2.2235e-05, 2.1356e-05, 2.1823e-05, 3.0479e-05, 2.4807e-05, 2.3861e-05, + 2.9663e-05, 2.9895e-05], device='cuda:2') +2022-11-16 03:27:20,941 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.21 vs. limit=2.0 +2022-11-16 03:27:30,438 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=72623.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:27:36,241 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.069e+02 1.573e+02 1.962e+02 2.591e+02 6.270e+02, threshold=3.924e+02, percent-clipped=1.0 +2022-11-16 03:27:37,719 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.2269, 1.0384, 1.4437, 1.0334, 1.4790, 1.3698, 0.9264, 1.1734], + device='cuda:2'), covar=tensor([0.1067, 0.0423, 0.0300, 0.0842, 0.0600, 0.0978, 0.0831, 0.0464], + device='cuda:2'), in_proj_covar=tensor([0.0014, 0.0021, 0.0015, 0.0018, 0.0015, 0.0013, 0.0020, 0.0014], + device='cuda:2'), out_proj_covar=tensor([7.6057e-05, 1.0161e-04, 7.7767e-05, 9.1401e-05, 7.9764e-05, 7.3578e-05, + 9.7785e-05, 7.5822e-05], device='cuda:2') +2022-11-16 03:27:47,363 INFO [train.py:876] (2/4) Epoch 10, batch 7200, loss[loss=0.1451, simple_loss=0.166, pruned_loss=0.06207, over 5458.00 frames. ], tot_loss[loss=0.1185, simple_loss=0.1443, pruned_loss=0.04636, over 1082141.68 frames. ], batch size: 58, lr: 7.89e-03, grad_scale: 8.0 +2022-11-16 03:27:48,271 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.6852, 2.3158, 2.6529, 3.5571, 3.6417, 2.7985, 2.4907, 3.6500], + device='cuda:2'), covar=tensor([0.1118, 0.2952, 0.2523, 0.4764, 0.1379, 0.3246, 0.2362, 0.1183], + device='cuda:2'), in_proj_covar=tensor([0.0245, 0.0199, 0.0186, 0.0307, 0.0225, 0.0204, 0.0189, 0.0243], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0005], + device='cuda:2') +2022-11-16 03:27:55,884 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1276, 1.9888, 2.4969, 1.7984, 1.4558, 2.9180, 2.4558, 2.2022], + device='cuda:2'), covar=tensor([0.0975, 0.1366, 0.0779, 0.2756, 0.2806, 0.0962, 0.1701, 0.1325], + device='cuda:2'), in_proj_covar=tensor([0.0093, 0.0082, 0.0084, 0.0093, 0.0068, 0.0061, 0.0069, 0.0080], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-16 03:28:10,212 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=72682.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:28:26,337 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.92 vs. limit=2.0 +2022-11-16 03:29:18,983 INFO [train.py:876] (2/4) Epoch 11, batch 0, loss[loss=0.1227, simple_loss=0.1547, pruned_loss=0.04536, over 5767.00 frames. ], tot_loss[loss=0.1227, simple_loss=0.1547, pruned_loss=0.04536, over 5767.00 frames. ], batch size: 21, lr: 7.53e-03, grad_scale: 8.0 +2022-11-16 03:29:18,983 INFO [train.py:899] (2/4) Computing validation loss +2022-11-16 03:29:24,705 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.3612, 3.1662, 3.3992, 3.1071, 3.2180, 3.3110, 3.6301, 3.3689], + device='cuda:2'), covar=tensor([0.0719, 0.1123, 0.0553, 0.1164, 0.0703, 0.0454, 0.0780, 0.0880], + device='cuda:2'), in_proj_covar=tensor([0.0085, 0.0106, 0.0090, 0.0117, 0.0087, 0.0077, 0.0143, 0.0101], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 03:29:29,147 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.2181, 3.6444, 2.7175, 1.8386, 3.4574, 1.5031, 3.4434, 2.2583], + device='cuda:2'), covar=tensor([0.1112, 0.0143, 0.0697, 0.1809, 0.0178, 0.1497, 0.0183, 0.1204], + device='cuda:2'), in_proj_covar=tensor([0.0119, 0.0104, 0.0113, 0.0112, 0.0101, 0.0119, 0.0097, 0.0111], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 03:29:35,597 INFO [train.py:908] (2/4) Epoch 11, validation: loss=0.1663, simple_loss=0.1831, pruned_loss=0.07475, over 1530663.00 frames. +2022-11-16 03:29:35,598 INFO [train.py:909] (2/4) Maximum memory allocated so far is 4715MB +2022-11-16 03:29:43,100 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.965e+01 1.625e+02 2.129e+02 2.463e+02 4.242e+02, threshold=4.258e+02, percent-clipped=1.0 +2022-11-16 03:30:00,420 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.76 vs. limit=2.0 +2022-11-16 03:30:05,535 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.0076, 2.6057, 2.8165, 3.8777, 3.6971, 2.7364, 2.3837, 3.8560], + device='cuda:2'), covar=tensor([0.0553, 0.2320, 0.2290, 0.2367, 0.1211, 0.2935, 0.2166, 0.0530], + device='cuda:2'), in_proj_covar=tensor([0.0244, 0.0197, 0.0187, 0.0306, 0.0224, 0.0203, 0.0189, 0.0242], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0005], + device='cuda:2') +2022-11-16 03:30:17,733 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=72785.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:30:34,431 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.4013, 3.0866, 3.1574, 2.9232, 1.8732, 3.0234, 2.1061, 2.7008], + device='cuda:2'), covar=tensor([0.0306, 0.0168, 0.0143, 0.0233, 0.0433, 0.0179, 0.0378, 0.0143], + device='cuda:2'), in_proj_covar=tensor([0.0190, 0.0172, 0.0178, 0.0199, 0.0189, 0.0174, 0.0186, 0.0179], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 03:30:36,601 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.5016, 4.6993, 3.1701, 4.4317, 3.6261, 3.1652, 2.5507, 4.1238], + device='cuda:2'), covar=tensor([0.1629, 0.0218, 0.1118, 0.0318, 0.0657, 0.1052, 0.1966, 0.0310], + device='cuda:2'), in_proj_covar=tensor([0.0157, 0.0139, 0.0160, 0.0144, 0.0176, 0.0169, 0.0164, 0.0157], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 03:30:42,334 INFO [train.py:876] (2/4) Epoch 11, batch 100, loss[loss=0.1002, simple_loss=0.1321, pruned_loss=0.03415, over 5733.00 frames. ], tot_loss[loss=0.1133, simple_loss=0.141, pruned_loss=0.04278, over 438384.60 frames. ], batch size: 27, lr: 7.52e-03, grad_scale: 8.0 +2022-11-16 03:30:49,527 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.074e+02 1.571e+02 1.949e+02 2.153e+02 3.381e+02, threshold=3.898e+02, percent-clipped=0.0 +2022-11-16 03:30:50,280 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=72833.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:31:03,653 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.2582, 2.0010, 2.1550, 2.5776, 2.7507, 2.1027, 1.7549, 2.6394], + device='cuda:2'), covar=tensor([0.2455, 0.2110, 0.1593, 0.1061, 0.1030, 0.2747, 0.2030, 0.1308], + device='cuda:2'), in_proj_covar=tensor([0.0244, 0.0198, 0.0186, 0.0305, 0.0223, 0.0202, 0.0189, 0.0242], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0005], + device='cuda:2') +2022-11-16 03:31:31,484 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=72894.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:31:38,412 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.89 vs. limit=2.0 +2022-11-16 03:31:40,359 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.8320, 3.5554, 3.6824, 3.5035, 3.7617, 3.5589, 1.3820, 3.8870], + device='cuda:2'), covar=tensor([0.0282, 0.0464, 0.0409, 0.0333, 0.0326, 0.0394, 0.3468, 0.0304], + device='cuda:2'), in_proj_covar=tensor([0.0104, 0.0085, 0.0085, 0.0079, 0.0102, 0.0089, 0.0131, 0.0108], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 03:31:50,366 INFO [train.py:876] (2/4) Epoch 11, batch 200, loss[loss=0.1133, simple_loss=0.147, pruned_loss=0.03981, over 5729.00 frames. ], tot_loss[loss=0.1153, simple_loss=0.142, pruned_loss=0.04429, over 688878.73 frames. ], batch size: 31, lr: 7.52e-03, grad_scale: 8.0 +2022-11-16 03:31:55,096 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.26 vs. limit=2.0 +2022-11-16 03:31:57,299 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.513e+01 1.538e+02 1.800e+02 2.272e+02 4.125e+02, threshold=3.600e+02, percent-clipped=3.0 +2022-11-16 03:32:12,300 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=72955.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:32:15,559 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.6617, 2.6885, 2.3556, 2.6209, 2.1693, 2.1402, 2.5998, 3.0947], + device='cuda:2'), covar=tensor([0.1178, 0.1881, 0.2609, 0.1899, 0.2141, 0.1716, 0.1589, 0.2879], + device='cuda:2'), in_proj_covar=tensor([0.0107, 0.0103, 0.0105, 0.0099, 0.0091, 0.0100, 0.0096, 0.0078], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 03:32:18,625 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=72964.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 03:32:30,974 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=72982.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:32:46,986 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.9074, 2.3499, 3.4255, 3.1145, 3.7508, 2.7011, 3.3179, 3.8407], + device='cuda:2'), covar=tensor([0.0762, 0.1723, 0.0996, 0.1673, 0.0575, 0.1512, 0.1263, 0.0877], + device='cuda:2'), in_proj_covar=tensor([0.0236, 0.0193, 0.0209, 0.0209, 0.0234, 0.0192, 0.0223, 0.0227], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 03:32:57,227 INFO [train.py:876] (2/4) Epoch 11, batch 300, loss[loss=0.06876, simple_loss=0.1128, pruned_loss=0.01237, over 5550.00 frames. ], tot_loss[loss=0.1167, simple_loss=0.1426, pruned_loss=0.04546, over 837441.37 frames. ], batch size: 13, lr: 7.51e-03, grad_scale: 8.0 +2022-11-16 03:33:00,022 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=73025.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 03:33:03,470 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=73030.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:33:04,727 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.107e+02 1.624e+02 1.928e+02 2.465e+02 5.255e+02, threshold=3.856e+02, percent-clipped=4.0 +2022-11-16 03:33:49,869 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.8026, 2.9127, 2.8749, 2.6998, 2.8252, 2.8343, 1.2300, 2.9911], + device='cuda:2'), covar=tensor([0.0306, 0.0351, 0.0337, 0.0359, 0.0445, 0.0431, 0.3043, 0.0403], + device='cuda:2'), in_proj_covar=tensor([0.0103, 0.0084, 0.0085, 0.0079, 0.0101, 0.0088, 0.0129, 0.0107], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 03:34:03,259 INFO [train.py:876] (2/4) Epoch 11, batch 400, loss[loss=0.1022, simple_loss=0.1337, pruned_loss=0.03536, over 5582.00 frames. ], tot_loss[loss=0.1192, simple_loss=0.1446, pruned_loss=0.04689, over 933430.37 frames. ], batch size: 16, lr: 7.51e-03, grad_scale: 8.0 +2022-11-16 03:34:11,191 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.143e+01 1.547e+02 1.866e+02 2.274e+02 4.703e+02, threshold=3.733e+02, percent-clipped=2.0 +2022-11-16 03:34:17,345 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.50 vs. limit=5.0 +2022-11-16 03:34:17,837 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.18 vs. limit=2.0 +2022-11-16 03:34:28,506 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.0435, 4.6156, 4.0949, 4.6087, 4.6265, 3.7807, 4.1505, 3.9219], + device='cuda:2'), covar=tensor([0.0443, 0.0420, 0.1442, 0.0412, 0.0429, 0.0517, 0.0507, 0.0606], + device='cuda:2'), in_proj_covar=tensor([0.0131, 0.0171, 0.0266, 0.0169, 0.0213, 0.0170, 0.0181, 0.0171], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0004, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-16 03:35:00,976 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.9700, 2.0995, 2.7141, 2.3958, 2.7502, 1.9867, 2.5829, 2.9931], + device='cuda:2'), covar=tensor([0.0621, 0.1342, 0.0872, 0.1162, 0.0819, 0.1265, 0.1008, 0.0823], + device='cuda:2'), in_proj_covar=tensor([0.0232, 0.0189, 0.0205, 0.0206, 0.0229, 0.0190, 0.0219, 0.0223], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 03:35:10,797 INFO [train.py:876] (2/4) Epoch 11, batch 500, loss[loss=0.1302, simple_loss=0.1514, pruned_loss=0.0545, over 5642.00 frames. ], tot_loss[loss=0.1192, simple_loss=0.1449, pruned_loss=0.04676, over 996493.36 frames. ], batch size: 32, lr: 7.50e-03, grad_scale: 8.0 +2022-11-16 03:35:17,968 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.259e+01 1.458e+02 1.748e+02 2.225e+02 4.920e+02, threshold=3.496e+02, percent-clipped=3.0 +2022-11-16 03:35:23,310 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.1543, 4.7046, 4.9409, 4.6427, 5.2849, 5.0799, 4.5076, 5.2187], + device='cuda:2'), covar=tensor([0.0404, 0.0344, 0.0437, 0.0334, 0.0335, 0.0217, 0.0290, 0.0239], + device='cuda:2'), in_proj_covar=tensor([0.0143, 0.0150, 0.0110, 0.0145, 0.0175, 0.0102, 0.0124, 0.0147], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-16 03:35:30,816 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=73250.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:35:36,048 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.4185, 1.6462, 1.7167, 1.7350, 1.5998, 1.4588, 1.6247, 1.7259], + device='cuda:2'), covar=tensor([0.3031, 0.2809, 0.2604, 0.1878, 0.2455, 0.3393, 0.2464, 0.1142], + device='cuda:2'), in_proj_covar=tensor([0.0106, 0.0102, 0.0104, 0.0098, 0.0091, 0.0099, 0.0095, 0.0076], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 03:35:59,499 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=73293.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:36:18,447 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=73320.0, num_to_drop=1, layers_to_drop={3} +2022-11-16 03:36:19,024 INFO [train.py:876] (2/4) Epoch 11, batch 600, loss[loss=0.1291, simple_loss=0.1588, pruned_loss=0.04968, over 5572.00 frames. ], tot_loss[loss=0.1188, simple_loss=0.145, pruned_loss=0.04631, over 1029216.13 frames. ], batch size: 24, lr: 7.50e-03, grad_scale: 16.0 +2022-11-16 03:36:26,010 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.407e+01 1.498e+02 1.818e+02 2.192e+02 5.468e+02, threshold=3.637e+02, percent-clipped=3.0 +2022-11-16 03:36:38,288 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.9083, 4.1823, 3.8265, 3.4575, 1.8886, 4.0385, 2.2476, 3.6549], + device='cuda:2'), covar=tensor([0.0403, 0.0221, 0.0209, 0.0406, 0.0710, 0.0162, 0.0549, 0.0163], + device='cuda:2'), in_proj_covar=tensor([0.0189, 0.0172, 0.0178, 0.0198, 0.0188, 0.0173, 0.0186, 0.0179], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 03:36:41,251 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=73354.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:36:46,690 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.81 vs. limit=2.0 +2022-11-16 03:36:56,036 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.6751, 2.5100, 3.3444, 3.0912, 3.4296, 2.5728, 3.2143, 3.9256], + device='cuda:2'), covar=tensor([0.0773, 0.1423, 0.1026, 0.1458, 0.0705, 0.1378, 0.1182, 0.0658], + device='cuda:2'), in_proj_covar=tensor([0.0238, 0.0193, 0.0209, 0.0210, 0.0234, 0.0194, 0.0223, 0.0226], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 03:37:14,632 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.56 vs. limit=2.0 +2022-11-16 03:37:16,593 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.5833, 2.8156, 2.8385, 2.6071, 2.8204, 2.7404, 1.1576, 2.8863], + device='cuda:2'), covar=tensor([0.0343, 0.0293, 0.0278, 0.0312, 0.0335, 0.0376, 0.2814, 0.0369], + device='cuda:2'), in_proj_covar=tensor([0.0104, 0.0086, 0.0087, 0.0079, 0.0101, 0.0088, 0.0131, 0.0108], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 03:37:26,419 INFO [train.py:876] (2/4) Epoch 11, batch 700, loss[loss=0.1457, simple_loss=0.1699, pruned_loss=0.06075, over 5799.00 frames. ], tot_loss[loss=0.1184, simple_loss=0.1446, pruned_loss=0.04607, over 1051069.95 frames. ], batch size: 21, lr: 7.49e-03, grad_scale: 16.0 +2022-11-16 03:37:31,027 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.47 vs. limit=2.0 +2022-11-16 03:37:33,752 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.643e+01 1.493e+02 1.779e+02 2.171e+02 7.161e+02, threshold=3.558e+02, percent-clipped=3.0 +2022-11-16 03:37:38,537 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.87 vs. limit=2.0 +2022-11-16 03:37:38,773 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.85 vs. limit=2.0 +2022-11-16 03:37:58,141 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.6038, 2.2912, 3.2201, 2.7427, 3.3157, 2.3412, 2.9969, 3.6709], + device='cuda:2'), covar=tensor([0.0680, 0.1643, 0.0927, 0.1597, 0.0744, 0.1609, 0.1289, 0.0789], + device='cuda:2'), in_proj_covar=tensor([0.0237, 0.0192, 0.0210, 0.0210, 0.0235, 0.0194, 0.0223, 0.0226], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 03:38:07,050 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.97 vs. limit=2.0 +2022-11-16 03:38:33,548 INFO [train.py:876] (2/4) Epoch 11, batch 800, loss[loss=0.09194, simple_loss=0.1324, pruned_loss=0.02574, over 5721.00 frames. ], tot_loss[loss=0.1174, simple_loss=0.1439, pruned_loss=0.04544, over 1064939.78 frames. ], batch size: 15, lr: 7.49e-03, grad_scale: 8.0 +2022-11-16 03:38:34,965 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.83 vs. limit=2.0 +2022-11-16 03:38:41,648 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.679e+01 1.505e+02 1.889e+02 2.408e+02 4.187e+02, threshold=3.778e+02, percent-clipped=1.0 +2022-11-16 03:38:41,803 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.9323, 3.7057, 2.3786, 3.5323, 2.8272, 2.5319, 1.9861, 3.2717], + device='cuda:2'), covar=tensor([0.1620, 0.0265, 0.1342, 0.0380, 0.1038, 0.1173, 0.2101, 0.0402], + device='cuda:2'), in_proj_covar=tensor([0.0158, 0.0140, 0.0160, 0.0145, 0.0177, 0.0170, 0.0165, 0.0158], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 03:38:48,743 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.2481, 2.2409, 2.9338, 2.5751, 2.7238, 2.1177, 2.7525, 3.3000], + device='cuda:2'), covar=tensor([0.0672, 0.1313, 0.0745, 0.1180, 0.1021, 0.1440, 0.1037, 0.0685], + device='cuda:2'), in_proj_covar=tensor([0.0241, 0.0194, 0.0213, 0.0211, 0.0237, 0.0195, 0.0226, 0.0229], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 03:38:53,252 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=73550.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:39:20,199 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=73590.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:39:25,556 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=73598.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:39:40,077 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=73620.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 03:39:40,651 INFO [train.py:876] (2/4) Epoch 11, batch 900, loss[loss=0.1587, simple_loss=0.1641, pruned_loss=0.07666, over 4957.00 frames. ], tot_loss[loss=0.1174, simple_loss=0.1436, pruned_loss=0.04561, over 1071288.21 frames. ], batch size: 109, lr: 7.48e-03, grad_scale: 8.0 +2022-11-16 03:39:49,609 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.416e+01 1.675e+02 2.016e+02 2.471e+02 4.865e+02, threshold=4.032e+02, percent-clipped=2.0 +2022-11-16 03:40:00,146 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=73649.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:40:01,544 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=73651.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:40:04,815 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.39 vs. limit=5.0 +2022-11-16 03:40:12,986 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=73668.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 03:40:14,950 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.8143, 1.2956, 1.2994, 1.1749, 1.0630, 1.6589, 1.3661, 1.2646], + device='cuda:2'), covar=tensor([0.2873, 0.0737, 0.1959, 0.2271, 0.1927, 0.0409, 0.1464, 0.1939], + device='cuda:2'), in_proj_covar=tensor([0.0095, 0.0082, 0.0085, 0.0094, 0.0070, 0.0061, 0.0071, 0.0082], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-16 03:40:40,160 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.65 vs. limit=2.0 +2022-11-16 03:40:41,265 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.5112, 1.8756, 1.6727, 1.2916, 1.4618, 1.9860, 1.8603, 2.0130], + device='cuda:2'), covar=tensor([0.1999, 0.1648, 0.2236, 0.2777, 0.1644, 0.1295, 0.0836, 0.1514], + device='cuda:2'), in_proj_covar=tensor([0.0172, 0.0183, 0.0167, 0.0186, 0.0184, 0.0198, 0.0169, 0.0190], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 03:40:47,971 INFO [train.py:876] (2/4) Epoch 11, batch 1000, loss[loss=0.0881, simple_loss=0.1324, pruned_loss=0.02192, over 5544.00 frames. ], tot_loss[loss=0.1185, simple_loss=0.1447, pruned_loss=0.04616, over 1075775.33 frames. ], batch size: 14, lr: 7.48e-03, grad_scale: 8.0 +2022-11-16 03:40:55,188 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=73732.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:40:55,647 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.576e+01 1.693e+02 2.139e+02 2.600e+02 5.774e+02, threshold=4.279e+02, percent-clipped=7.0 +2022-11-16 03:41:15,214 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.4721, 1.7506, 1.8156, 1.6656, 1.4108, 2.6030, 2.0292, 1.5810], + device='cuda:2'), covar=tensor([0.1222, 0.1165, 0.1111, 0.1914, 0.1695, 0.0464, 0.1473, 0.1493], + device='cuda:2'), in_proj_covar=tensor([0.0096, 0.0083, 0.0086, 0.0095, 0.0070, 0.0062, 0.0072, 0.0083], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-16 03:41:30,968 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.3218, 2.9115, 3.4252, 4.1703, 4.0279, 3.2224, 2.9244, 4.2662], + device='cuda:2'), covar=tensor([0.0565, 0.2692, 0.1716, 0.3015, 0.1189, 0.2938, 0.2032, 0.0690], + device='cuda:2'), in_proj_covar=tensor([0.0246, 0.0200, 0.0187, 0.0309, 0.0225, 0.0204, 0.0190, 0.0242], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0005], + device='cuda:2') +2022-11-16 03:41:36,501 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=73793.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:41:55,289 INFO [train.py:876] (2/4) Epoch 11, batch 1100, loss[loss=0.08889, simple_loss=0.1298, pruned_loss=0.024, over 5709.00 frames. ], tot_loss[loss=0.1185, simple_loss=0.1449, pruned_loss=0.04602, over 1080339.76 frames. ], batch size: 15, lr: 7.47e-03, grad_scale: 8.0 +2022-11-16 03:42:00,586 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.4883, 2.2051, 3.1246, 2.7732, 3.1239, 2.0956, 2.8930, 3.5189], + device='cuda:2'), covar=tensor([0.0597, 0.1577, 0.0973, 0.1425, 0.0966, 0.1652, 0.1182, 0.0823], + device='cuda:2'), in_proj_covar=tensor([0.0237, 0.0191, 0.0209, 0.0208, 0.0234, 0.0193, 0.0222, 0.0226], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 03:42:02,966 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.033e+02 1.524e+02 1.845e+02 2.203e+02 3.683e+02, threshold=3.689e+02, percent-clipped=0.0 +2022-11-16 03:42:04,118 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.33 vs. limit=2.0 +2022-11-16 03:43:01,962 INFO [train.py:876] (2/4) Epoch 11, batch 1200, loss[loss=0.09666, simple_loss=0.1394, pruned_loss=0.02698, over 5580.00 frames. ], tot_loss[loss=0.1181, simple_loss=0.1447, pruned_loss=0.0457, over 1083977.64 frames. ], batch size: 24, lr: 7.47e-03, grad_scale: 8.0 +2022-11-16 03:43:05,601 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.04 vs. limit=2.0 +2022-11-16 03:43:10,219 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.261e+01 1.559e+02 1.976e+02 2.426e+02 6.394e+02, threshold=3.952e+02, percent-clipped=4.0 +2022-11-16 03:43:18,857 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=73946.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:43:20,839 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=73949.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:43:53,689 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=73997.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:44:10,604 INFO [train.py:876] (2/4) Epoch 11, batch 1300, loss[loss=0.2225, simple_loss=0.1994, pruned_loss=0.1228, over 2999.00 frames. ], tot_loss[loss=0.1179, simple_loss=0.1443, pruned_loss=0.04573, over 1082044.25 frames. ], batch size: 284, lr: 7.46e-03, grad_scale: 8.0 +2022-11-16 03:44:18,270 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.904e+01 1.577e+02 1.830e+02 2.359e+02 4.082e+02, threshold=3.660e+02, percent-clipped=1.0 +2022-11-16 03:44:22,379 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.28 vs. limit=2.0 +2022-11-16 03:44:30,425 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.27 vs. limit=2.0 +2022-11-16 03:44:38,542 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=74064.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:44:55,166 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=74088.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:44:57,325 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.16 vs. limit=2.0 +2022-11-16 03:45:01,603 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.1753, 4.1344, 4.0338, 3.8149, 4.2252, 4.0030, 1.7359, 4.2810], + device='cuda:2'), covar=tensor([0.0220, 0.0283, 0.0312, 0.0333, 0.0289, 0.0340, 0.3065, 0.0286], + device='cuda:2'), in_proj_covar=tensor([0.0105, 0.0088, 0.0087, 0.0081, 0.0104, 0.0090, 0.0134, 0.0109], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 03:45:07,817 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.7134, 4.0941, 3.9439, 3.7589, 4.0242, 3.7722, 1.6298, 3.9516], + device='cuda:2'), covar=tensor([0.0498, 0.0397, 0.0491, 0.0384, 0.0430, 0.0542, 0.3978, 0.0606], + device='cuda:2'), in_proj_covar=tensor([0.0104, 0.0087, 0.0086, 0.0080, 0.0103, 0.0090, 0.0133, 0.0109], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 03:45:16,722 INFO [train.py:876] (2/4) Epoch 11, batch 1400, loss[loss=0.1206, simple_loss=0.1441, pruned_loss=0.04858, over 5562.00 frames. ], tot_loss[loss=0.1191, simple_loss=0.1449, pruned_loss=0.04663, over 1082894.03 frames. ], batch size: 43, lr: 7.46e-03, grad_scale: 8.0 +2022-11-16 03:45:19,899 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=74125.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:45:23,535 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.97 vs. limit=2.0 +2022-11-16 03:45:25,524 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.235e+01 1.555e+02 1.864e+02 2.328e+02 5.952e+02, threshold=3.728e+02, percent-clipped=5.0 +2022-11-16 03:46:20,069 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.6629, 3.7126, 3.6951, 3.4078, 3.6714, 3.4504, 1.4080, 3.8397], + device='cuda:2'), covar=tensor([0.0270, 0.0306, 0.0275, 0.0392, 0.0359, 0.0372, 0.3228, 0.0293], + device='cuda:2'), in_proj_covar=tensor([0.0105, 0.0087, 0.0086, 0.0080, 0.0104, 0.0090, 0.0133, 0.0109], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 03:46:24,544 INFO [train.py:876] (2/4) Epoch 11, batch 1500, loss[loss=0.08825, simple_loss=0.1265, pruned_loss=0.02502, over 5736.00 frames. ], tot_loss[loss=0.1194, simple_loss=0.1452, pruned_loss=0.04683, over 1082077.47 frames. ], batch size: 13, lr: 7.45e-03, grad_scale: 8.0 +2022-11-16 03:46:32,814 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.078e+02 1.551e+02 1.772e+02 2.146e+02 3.863e+02, threshold=3.544e+02, percent-clipped=1.0 +2022-11-16 03:46:37,962 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=74240.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:46:42,237 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=74246.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:47:15,033 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=74294.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:47:18,515 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.6014, 3.7621, 3.6491, 3.2495, 1.9832, 3.6192, 2.3408, 3.0360], + device='cuda:2'), covar=tensor([0.0411, 0.0259, 0.0169, 0.0324, 0.0605, 0.0174, 0.0490, 0.0227], + device='cuda:2'), in_proj_covar=tensor([0.0189, 0.0170, 0.0175, 0.0197, 0.0187, 0.0173, 0.0185, 0.0178], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 03:47:19,882 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=74301.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:47:20,074 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.26 vs. limit=2.0 +2022-11-16 03:47:21,292 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.16 vs. limit=2.0 +2022-11-16 03:47:33,162 INFO [train.py:876] (2/4) Epoch 11, batch 1600, loss[loss=0.08018, simple_loss=0.1203, pruned_loss=0.02002, over 5751.00 frames. ], tot_loss[loss=0.1162, simple_loss=0.1432, pruned_loss=0.04461, over 1086611.16 frames. ], batch size: 13, lr: 7.45e-03, grad_scale: 8.0 +2022-11-16 03:47:39,784 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.4173, 4.1482, 4.3100, 4.4054, 4.1382, 3.9186, 4.7544, 4.1514], + device='cuda:2'), covar=tensor([0.0354, 0.0777, 0.0426, 0.0927, 0.0473, 0.0384, 0.0645, 0.0645], + device='cuda:2'), in_proj_covar=tensor([0.0084, 0.0106, 0.0092, 0.0116, 0.0088, 0.0076, 0.0142, 0.0100], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 03:47:40,983 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.188e+01 1.509e+02 1.863e+02 2.484e+02 5.200e+02, threshold=3.726e+02, percent-clipped=6.0 +2022-11-16 03:47:57,504 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.32 vs. limit=2.0 +2022-11-16 03:48:18,287 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=74388.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:48:39,917 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=74420.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:48:40,465 INFO [train.py:876] (2/4) Epoch 11, batch 1700, loss[loss=0.1571, simple_loss=0.1637, pruned_loss=0.07529, over 5465.00 frames. ], tot_loss[loss=0.1164, simple_loss=0.1431, pruned_loss=0.04483, over 1091788.18 frames. ], batch size: 58, lr: 7.44e-03, grad_scale: 8.0 +2022-11-16 03:48:48,601 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.147e+02 1.679e+02 2.068e+02 2.361e+02 5.198e+02, threshold=4.137e+02, percent-clipped=4.0 +2022-11-16 03:48:50,644 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=74436.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:49:15,159 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.3264, 3.4353, 3.3702, 3.1027, 3.4318, 3.3333, 1.3461, 3.4977], + device='cuda:2'), covar=tensor([0.0473, 0.0392, 0.0417, 0.0547, 0.0412, 0.0444, 0.3695, 0.0501], + device='cuda:2'), in_proj_covar=tensor([0.0105, 0.0087, 0.0087, 0.0081, 0.0103, 0.0089, 0.0133, 0.0109], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 03:49:48,199 INFO [train.py:876] (2/4) Epoch 11, batch 1800, loss[loss=0.1165, simple_loss=0.1507, pruned_loss=0.04116, over 5724.00 frames. ], tot_loss[loss=0.1155, simple_loss=0.1425, pruned_loss=0.0443, over 1090704.57 frames. ], batch size: 15, lr: 7.44e-03, grad_scale: 8.0 +2022-11-16 03:49:54,455 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.73 vs. limit=2.0 +2022-11-16 03:49:55,854 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.990e+01 1.615e+02 2.043e+02 2.453e+02 6.860e+02, threshold=4.086e+02, percent-clipped=1.0 +2022-11-16 03:50:38,142 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=74595.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:50:38,729 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=74596.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:50:54,950 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.4105, 2.1586, 3.0001, 2.7462, 2.8166, 2.1730, 2.8732, 3.3432], + device='cuda:2'), covar=tensor([0.0798, 0.1580, 0.1019, 0.1496, 0.0882, 0.1697, 0.1153, 0.0903], + device='cuda:2'), in_proj_covar=tensor([0.0237, 0.0193, 0.0210, 0.0209, 0.0234, 0.0194, 0.0223, 0.0225], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 03:50:56,055 INFO [train.py:876] (2/4) Epoch 11, batch 1900, loss[loss=0.1021, simple_loss=0.1294, pruned_loss=0.03737, over 5579.00 frames. ], tot_loss[loss=0.1151, simple_loss=0.1417, pruned_loss=0.04425, over 1079961.66 frames. ], batch size: 16, lr: 7.43e-03, grad_scale: 8.0 +2022-11-16 03:51:00,037 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.87 vs. limit=2.0 +2022-11-16 03:51:04,168 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.647e+01 1.532e+02 1.879e+02 2.235e+02 4.032e+02, threshold=3.759e+02, percent-clipped=0.0 +2022-11-16 03:51:19,906 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=74656.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:51:31,304 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.8308, 4.3142, 3.9258, 4.3862, 4.3372, 3.7145, 3.8952, 3.7844], + device='cuda:2'), covar=tensor([0.0511, 0.0447, 0.1328, 0.0378, 0.0414, 0.0427, 0.0629, 0.0588], + device='cuda:2'), in_proj_covar=tensor([0.0133, 0.0173, 0.0271, 0.0173, 0.0217, 0.0171, 0.0185, 0.0175], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0004, 0.0003, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 03:52:03,033 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=74720.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:52:03,559 INFO [train.py:876] (2/4) Epoch 11, batch 2000, loss[loss=0.1879, simple_loss=0.1914, pruned_loss=0.09221, over 5566.00 frames. ], tot_loss[loss=0.1156, simple_loss=0.1427, pruned_loss=0.04425, over 1083966.50 frames. ], batch size: 54, lr: 7.43e-03, grad_scale: 8.0 +2022-11-16 03:52:12,067 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.989e+01 1.482e+02 1.897e+02 2.343e+02 3.956e+02, threshold=3.795e+02, percent-clipped=2.0 +2022-11-16 03:52:14,068 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.81 vs. limit=5.0 +2022-11-16 03:52:23,426 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.21 vs. limit=2.0 +2022-11-16 03:52:27,667 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0380, 2.5266, 2.1434, 1.4221, 1.7714, 1.7181, 1.4750, 2.4892], + device='cuda:2'), covar=tensor([0.0053, 0.0033, 0.0049, 0.0057, 0.0046, 0.0038, 0.0039, 0.0028], + device='cuda:2'), in_proj_covar=tensor([0.0055, 0.0051, 0.0052, 0.0054, 0.0054, 0.0048, 0.0049, 0.0046], + device='cuda:2'), out_proj_covar=tensor([4.9756e-05, 4.5288e-05, 4.5632e-05, 4.8128e-05, 4.7800e-05, 4.2082e-05, + 4.3898e-05, 4.0799e-05], device='cuda:2') +2022-11-16 03:52:35,420 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=74768.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:52:57,384 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.2502, 2.2919, 2.0378, 2.3092, 1.9418, 1.8492, 2.1567, 2.4931], + device='cuda:2'), covar=tensor([0.1445, 0.1766, 0.2387, 0.1316, 0.2024, 0.2069, 0.1782, 0.2230], + device='cuda:2'), in_proj_covar=tensor([0.0107, 0.0103, 0.0103, 0.0098, 0.0091, 0.0100, 0.0095, 0.0078], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 03:53:05,576 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.57 vs. limit=5.0 +2022-11-16 03:53:11,058 INFO [train.py:876] (2/4) Epoch 11, batch 2100, loss[loss=0.1269, simple_loss=0.1606, pruned_loss=0.04658, over 5732.00 frames. ], tot_loss[loss=0.1165, simple_loss=0.1436, pruned_loss=0.04475, over 1080281.43 frames. ], batch size: 17, lr: 7.42e-03, grad_scale: 8.0 +2022-11-16 03:53:19,079 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.996e+01 1.479e+02 1.848e+02 2.338e+02 4.200e+02, threshold=3.697e+02, percent-clipped=1.0 +2022-11-16 03:53:39,731 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1254, 3.5712, 2.6065, 1.6535, 3.1604, 1.1722, 3.4548, 1.8300], + device='cuda:2'), covar=tensor([0.1755, 0.0287, 0.0956, 0.2215, 0.0417, 0.2711, 0.0350, 0.1900], + device='cuda:2'), in_proj_covar=tensor([0.0121, 0.0105, 0.0114, 0.0113, 0.0101, 0.0123, 0.0099, 0.0111], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 03:54:02,266 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=74896.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:54:10,816 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=74909.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:54:18,891 INFO [train.py:876] (2/4) Epoch 11, batch 2200, loss[loss=0.1218, simple_loss=0.1439, pruned_loss=0.04983, over 4707.00 frames. ], tot_loss[loss=0.1158, simple_loss=0.1428, pruned_loss=0.04434, over 1074491.07 frames. ], batch size: 135, lr: 7.42e-03, grad_scale: 8.0 +2022-11-16 03:54:26,973 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.032e+02 1.575e+02 1.874e+02 2.286e+02 4.723e+02, threshold=3.748e+02, percent-clipped=3.0 +2022-11-16 03:54:33,331 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.29 vs. limit=2.0 +2022-11-16 03:54:34,896 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=74944.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:54:39,527 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=74951.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:54:48,153 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=74964.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:54:52,447 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=74970.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:55:08,147 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.7964, 4.9663, 5.0984, 5.1633, 4.7326, 4.0982, 5.6211, 4.8486], + device='cuda:2'), covar=tensor([0.0350, 0.0743, 0.0309, 0.0946, 0.0464, 0.0372, 0.0586, 0.0382], + device='cuda:2'), in_proj_covar=tensor([0.0086, 0.0108, 0.0093, 0.0118, 0.0089, 0.0077, 0.0143, 0.0101], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 03:55:17,561 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.5064, 1.2656, 1.1824, 1.0052, 1.3730, 1.5157, 0.7518, 1.1277], + device='cuda:2'), covar=tensor([0.0254, 0.0326, 0.0401, 0.1065, 0.0335, 0.0557, 0.0865, 0.0417], + device='cuda:2'), in_proj_covar=tensor([0.0014, 0.0022, 0.0016, 0.0020, 0.0016, 0.0014, 0.0021, 0.0015], + device='cuda:2'), out_proj_covar=tensor([7.9603e-05, 1.0856e-04, 8.2780e-05, 9.8567e-05, 8.4093e-05, 7.8474e-05, + 1.0217e-04, 7.9079e-05], device='cuda:2') +2022-11-16 03:55:26,157 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.83 vs. limit=2.0 +2022-11-16 03:55:30,347 INFO [train.py:876] (2/4) Epoch 11, batch 2300, loss[loss=0.1081, simple_loss=0.15, pruned_loss=0.03312, over 5537.00 frames. ], tot_loss[loss=0.1157, simple_loss=0.1419, pruned_loss=0.04478, over 1074428.28 frames. ], batch size: 21, lr: 7.41e-03, grad_scale: 8.0 +2022-11-16 03:55:33,370 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=75025.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:55:38,336 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.756e+01 1.570e+02 1.933e+02 2.290e+02 4.748e+02, threshold=3.866e+02, percent-clipped=2.0 +2022-11-16 03:55:44,638 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.8462, 1.4507, 1.6636, 1.0688, 1.6742, 1.3837, 1.2600, 1.0765], + device='cuda:2'), covar=tensor([0.0024, 0.0044, 0.0037, 0.0054, 0.0046, 0.0062, 0.0049, 0.0057], + device='cuda:2'), in_proj_covar=tensor([0.0026, 0.0024, 0.0025, 0.0034, 0.0028, 0.0027, 0.0032, 0.0032], + device='cuda:2'), out_proj_covar=tensor([2.3940e-05, 2.2704e-05, 2.2795e-05, 3.2816e-05, 2.6094e-05, 2.5841e-05, + 3.1213e-05, 3.0924e-05], device='cuda:2') +2022-11-16 03:55:54,322 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.9689, 2.5985, 3.3022, 2.0684, 1.7424, 3.7402, 2.8938, 2.4353], + device='cuda:2'), covar=tensor([0.0580, 0.0772, 0.0365, 0.2462, 0.2384, 0.0746, 0.0924, 0.0963], + device='cuda:2'), in_proj_covar=tensor([0.0098, 0.0086, 0.0087, 0.0097, 0.0071, 0.0064, 0.0073, 0.0085], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-16 03:56:36,549 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.5821, 2.2416, 2.8023, 1.8923, 1.6247, 3.2820, 2.6642, 2.2456], + device='cuda:2'), covar=tensor([0.1111, 0.1216, 0.0650, 0.2561, 0.2142, 0.0369, 0.0785, 0.1222], + device='cuda:2'), in_proj_covar=tensor([0.0099, 0.0086, 0.0087, 0.0097, 0.0071, 0.0064, 0.0073, 0.0086], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-16 03:56:37,692 INFO [train.py:876] (2/4) Epoch 11, batch 2400, loss[loss=0.1301, simple_loss=0.1571, pruned_loss=0.05152, over 5545.00 frames. ], tot_loss[loss=0.1156, simple_loss=0.1426, pruned_loss=0.04431, over 1083582.64 frames. ], batch size: 43, lr: 7.41e-03, grad_scale: 8.0 +2022-11-16 03:56:40,528 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.6282, 4.2303, 3.5366, 3.5264, 2.1326, 3.9206, 2.3010, 3.3750], + device='cuda:2'), covar=tensor([0.0525, 0.0167, 0.0227, 0.0362, 0.0684, 0.0203, 0.0527, 0.0216], + device='cuda:2'), in_proj_covar=tensor([0.0189, 0.0171, 0.0174, 0.0196, 0.0186, 0.0172, 0.0186, 0.0178], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 03:56:41,313 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.92 vs. limit=2.0 +2022-11-16 03:56:45,362 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.051e+02 1.612e+02 2.010e+02 2.396e+02 4.325e+02, threshold=4.021e+02, percent-clipped=4.0 +2022-11-16 03:56:56,305 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.1540, 2.4872, 3.5850, 3.0596, 4.1304, 2.5347, 3.5405, 4.2666], + device='cuda:2'), covar=tensor([0.0623, 0.1919, 0.1004, 0.2044, 0.0492, 0.1977, 0.1334, 0.0736], + device='cuda:2'), in_proj_covar=tensor([0.0238, 0.0193, 0.0212, 0.0211, 0.0237, 0.0195, 0.0224, 0.0227], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 03:57:45,189 INFO [train.py:876] (2/4) Epoch 11, batch 2500, loss[loss=0.1166, simple_loss=0.1487, pruned_loss=0.04223, over 5534.00 frames. ], tot_loss[loss=0.1161, simple_loss=0.1432, pruned_loss=0.04448, over 1081977.77 frames. ], batch size: 46, lr: 7.40e-03, grad_scale: 8.0 +2022-11-16 03:57:50,347 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=75228.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:57:53,351 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.180e+01 1.567e+02 1.927e+02 2.439e+02 5.845e+02, threshold=3.854e+02, percent-clipped=5.0 +2022-11-16 03:58:05,557 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=75251.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:58:07,164 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.65 vs. limit=2.0 +2022-11-16 03:58:15,035 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=75265.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:58:18,685 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=75270.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:58:28,155 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.2878, 4.1847, 2.6842, 3.8715, 3.2269, 2.8136, 2.0544, 3.3656], + device='cuda:2'), covar=tensor([0.1557, 0.0273, 0.1209, 0.0417, 0.0777, 0.1058, 0.2089, 0.0450], + device='cuda:2'), in_proj_covar=tensor([0.0158, 0.0140, 0.0160, 0.0146, 0.0177, 0.0169, 0.0162, 0.0160], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 03:58:31,491 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=75289.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:58:38,037 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=75299.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:58:52,027 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=75320.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 03:58:52,587 INFO [train.py:876] (2/4) Epoch 11, batch 2600, loss[loss=0.09739, simple_loss=0.1378, pruned_loss=0.0285, over 5637.00 frames. ], tot_loss[loss=0.117, simple_loss=0.1437, pruned_loss=0.04518, over 1076902.76 frames. ], batch size: 32, lr: 7.40e-03, grad_scale: 8.0 +2022-11-16 03:59:00,299 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=75331.0, num_to_drop=1, layers_to_drop={3} +2022-11-16 03:59:01,317 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.032e+02 1.425e+02 1.750e+02 2.205e+02 4.754e+02, threshold=3.499e+02, percent-clipped=3.0 +2022-11-16 03:59:03,610 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.99 vs. limit=5.0 +2022-11-16 03:59:28,966 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.8546, 3.0403, 2.8585, 2.9225, 2.5670, 3.0672, 3.2085, 3.3752], + device='cuda:2'), covar=tensor([0.0892, 0.0929, 0.1592, 0.1781, 0.1164, 0.0718, 0.1017, 0.1643], + device='cuda:2'), in_proj_covar=tensor([0.0106, 0.0101, 0.0103, 0.0098, 0.0090, 0.0098, 0.0093, 0.0077], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 03:59:59,173 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.61 vs. limit=2.0 +2022-11-16 04:00:00,559 INFO [train.py:876] (2/4) Epoch 11, batch 2700, loss[loss=0.1085, simple_loss=0.1453, pruned_loss=0.03588, over 5633.00 frames. ], tot_loss[loss=0.1158, simple_loss=0.1425, pruned_loss=0.04452, over 1084188.78 frames. ], batch size: 32, lr: 7.39e-03, grad_scale: 8.0 +2022-11-16 04:00:08,242 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.727e+01 1.476e+02 1.842e+02 2.376e+02 5.290e+02, threshold=3.683e+02, percent-clipped=4.0 +2022-11-16 04:00:12,558 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.9367, 3.7228, 2.2931, 3.5775, 3.0962, 2.3901, 1.9523, 3.1663], + device='cuda:2'), covar=tensor([0.2401, 0.0496, 0.1870, 0.0653, 0.1114, 0.1761, 0.2710, 0.0663], + device='cuda:2'), in_proj_covar=tensor([0.0156, 0.0139, 0.0158, 0.0144, 0.0174, 0.0167, 0.0160, 0.0156], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 04:00:24,487 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.1790, 1.5659, 1.5847, 1.5166, 1.0627, 2.3350, 1.8117, 1.3578], + device='cuda:2'), covar=tensor([0.1755, 0.1215, 0.1893, 0.2300, 0.2934, 0.0781, 0.1410, 0.2247], + device='cuda:2'), in_proj_covar=tensor([0.0098, 0.0086, 0.0087, 0.0096, 0.0071, 0.0064, 0.0072, 0.0084], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-16 04:01:07,881 INFO [train.py:876] (2/4) Epoch 11, batch 2800, loss[loss=0.08559, simple_loss=0.1239, pruned_loss=0.02362, over 5534.00 frames. ], tot_loss[loss=0.1152, simple_loss=0.1423, pruned_loss=0.04405, over 1087219.29 frames. ], batch size: 13, lr: 7.39e-03, grad_scale: 16.0 +2022-11-16 04:01:15,805 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.621e+01 1.514e+02 1.754e+02 2.242e+02 3.721e+02, threshold=3.509e+02, percent-clipped=2.0 +2022-11-16 04:01:22,909 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.41 vs. limit=5.0 +2022-11-16 04:01:36,615 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1419, 4.2741, 2.5937, 4.0252, 3.4240, 2.6860, 2.2017, 3.5720], + device='cuda:2'), covar=tensor([0.2125, 0.0381, 0.1801, 0.0447, 0.0729, 0.1621, 0.2538, 0.0491], + device='cuda:2'), in_proj_covar=tensor([0.0157, 0.0140, 0.0159, 0.0145, 0.0175, 0.0169, 0.0162, 0.0157], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 04:01:37,951 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=75565.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:01:50,663 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=75584.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:02:10,830 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=75613.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:02:12,404 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.87 vs. limit=5.0 +2022-11-16 04:02:15,434 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=75620.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:02:15,984 INFO [train.py:876] (2/4) Epoch 11, batch 2900, loss[loss=0.07942, simple_loss=0.109, pruned_loss=0.02491, over 5782.00 frames. ], tot_loss[loss=0.1127, simple_loss=0.1404, pruned_loss=0.04247, over 1089061.59 frames. ], batch size: 9, lr: 7.38e-03, grad_scale: 16.0 +2022-11-16 04:02:19,374 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=75626.0, num_to_drop=1, layers_to_drop={2} +2022-11-16 04:02:23,735 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.526e+01 1.563e+02 1.912e+02 2.291e+02 3.744e+02, threshold=3.824e+02, percent-clipped=2.0 +2022-11-16 04:02:35,588 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.79 vs. limit=2.0 +2022-11-16 04:02:47,902 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=75668.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:02:54,591 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.1793, 1.7391, 1.3509, 1.2604, 1.3954, 1.4092, 1.1094, 1.5097], + device='cuda:2'), covar=tensor([0.0080, 0.0052, 0.0049, 0.0061, 0.0055, 0.0036, 0.0055, 0.0055], + device='cuda:2'), in_proj_covar=tensor([0.0056, 0.0051, 0.0052, 0.0055, 0.0054, 0.0049, 0.0050, 0.0047], + device='cuda:2'), out_proj_covar=tensor([5.0553e-05, 4.5499e-05, 4.5792e-05, 4.9484e-05, 4.8615e-05, 4.2791e-05, + 4.4553e-05, 4.1325e-05], device='cuda:2') +2022-11-16 04:03:04,831 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.9551, 1.3256, 1.2456, 1.1436, 1.1343, 1.6584, 1.2602, 1.2778], + device='cuda:2'), covar=tensor([0.3062, 0.0678, 0.3084, 0.2689, 0.2005, 0.0494, 0.1801, 0.2682], + device='cuda:2'), in_proj_covar=tensor([0.0099, 0.0087, 0.0087, 0.0096, 0.0070, 0.0064, 0.0073, 0.0085], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-16 04:03:23,302 INFO [train.py:876] (2/4) Epoch 11, batch 3000, loss[loss=0.1613, simple_loss=0.1781, pruned_loss=0.07225, over 5359.00 frames. ], tot_loss[loss=0.1135, simple_loss=0.141, pruned_loss=0.04298, over 1091487.81 frames. ], batch size: 70, lr: 7.38e-03, grad_scale: 16.0 +2022-11-16 04:03:23,302 INFO [train.py:899] (2/4) Computing validation loss +2022-11-16 04:03:40,544 INFO [train.py:908] (2/4) Epoch 11, validation: loss=0.1699, simple_loss=0.1855, pruned_loss=0.07718, over 1530663.00 frames. +2022-11-16 04:03:40,545 INFO [train.py:909] (2/4) Maximum memory allocated so far is 4715MB +2022-11-16 04:03:44,511 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.4984, 4.1278, 3.0865, 1.9595, 3.8319, 1.5723, 3.8157, 2.1640], + device='cuda:2'), covar=tensor([0.1392, 0.0208, 0.0894, 0.2022, 0.0262, 0.2104, 0.0235, 0.1743], + device='cuda:2'), in_proj_covar=tensor([0.0120, 0.0103, 0.0113, 0.0113, 0.0101, 0.0123, 0.0099, 0.0112], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 04:03:48,303 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.213e+02 1.514e+02 1.845e+02 2.226e+02 5.649e+02, threshold=3.690e+02, percent-clipped=5.0 +2022-11-16 04:04:45,464 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.04 vs. limit=2.0 +2022-11-16 04:04:49,202 INFO [train.py:876] (2/4) Epoch 11, batch 3100, loss[loss=0.06945, simple_loss=0.1015, pruned_loss=0.01871, over 5136.00 frames. ], tot_loss[loss=0.1151, simple_loss=0.1425, pruned_loss=0.04379, over 1088016.18 frames. ], batch size: 7, lr: 7.37e-03, grad_scale: 16.0 +2022-11-16 04:04:56,948 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.177e+01 1.516e+02 1.803e+02 2.135e+02 3.632e+02, threshold=3.607e+02, percent-clipped=0.0 +2022-11-16 04:05:31,456 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=75884.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:05:41,957 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.3043, 4.1470, 2.9342, 3.9718, 3.3288, 2.8507, 2.2330, 3.5692], + device='cuda:2'), covar=tensor([0.1347, 0.0277, 0.1046, 0.0348, 0.0717, 0.0979, 0.1863, 0.0337], + device='cuda:2'), in_proj_covar=tensor([0.0157, 0.0140, 0.0160, 0.0145, 0.0176, 0.0168, 0.0162, 0.0159], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 04:05:55,894 INFO [train.py:876] (2/4) Epoch 11, batch 3200, loss[loss=0.1162, simple_loss=0.1454, pruned_loss=0.04352, over 5569.00 frames. ], tot_loss[loss=0.1159, simple_loss=0.1434, pruned_loss=0.04419, over 1086788.50 frames. ], batch size: 43, lr: 7.37e-03, grad_scale: 16.0 +2022-11-16 04:05:58,574 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.09 vs. limit=2.0 +2022-11-16 04:05:59,622 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=75926.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 04:06:04,154 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=75932.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:06:04,815 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.709e+01 1.702e+02 2.039e+02 2.411e+02 4.513e+02, threshold=4.077e+02, percent-clipped=5.0 +2022-11-16 04:06:18,011 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.9688, 1.6807, 1.9552, 1.6742, 1.3283, 1.5907, 1.6128, 1.5830], + device='cuda:2'), covar=tensor([0.0042, 0.0079, 0.0028, 0.0044, 0.0141, 0.0197, 0.0044, 0.0047], + device='cuda:2'), in_proj_covar=tensor([0.0027, 0.0025, 0.0025, 0.0034, 0.0028, 0.0027, 0.0033, 0.0032], + device='cuda:2'), out_proj_covar=tensor([2.4366e-05, 2.2992e-05, 2.2534e-05, 3.2721e-05, 2.6170e-05, 2.5815e-05, + 3.1591e-05, 3.1374e-05], device='cuda:2') +2022-11-16 04:06:21,888 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.1708, 4.6201, 4.9823, 4.6899, 5.2577, 5.0684, 4.6136, 5.2239], + device='cuda:2'), covar=tensor([0.0302, 0.0298, 0.0414, 0.0304, 0.0283, 0.0198, 0.0214, 0.0205], + device='cuda:2'), in_proj_covar=tensor([0.0140, 0.0147, 0.0107, 0.0143, 0.0172, 0.0103, 0.0122, 0.0149], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-16 04:06:31,890 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=75974.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:06:37,536 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.5563, 2.5703, 2.2296, 2.6509, 2.2832, 1.7375, 2.6097, 3.0056], + device='cuda:2'), covar=tensor([0.1115, 0.1204, 0.2119, 0.0913, 0.1639, 0.2028, 0.1316, 0.0902], + device='cuda:2'), in_proj_covar=tensor([0.0106, 0.0102, 0.0102, 0.0099, 0.0089, 0.0099, 0.0094, 0.0076], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 04:07:03,631 INFO [train.py:876] (2/4) Epoch 11, batch 3300, loss[loss=0.1157, simple_loss=0.151, pruned_loss=0.04019, over 5569.00 frames. ], tot_loss[loss=0.1153, simple_loss=0.1427, pruned_loss=0.04396, over 1085417.70 frames. ], batch size: 25, lr: 7.36e-03, grad_scale: 16.0 +2022-11-16 04:07:11,809 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.536e+01 1.447e+02 1.827e+02 2.353e+02 6.584e+02, threshold=3.655e+02, percent-clipped=2.0 +2022-11-16 04:07:19,434 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.9088, 1.5814, 1.7415, 1.5119, 1.3818, 1.7010, 1.5389, 1.4439], + device='cuda:2'), covar=tensor([0.0042, 0.0079, 0.0033, 0.0046, 0.0073, 0.0051, 0.0042, 0.0050], + device='cuda:2'), in_proj_covar=tensor([0.0026, 0.0025, 0.0025, 0.0033, 0.0028, 0.0027, 0.0032, 0.0032], + device='cuda:2'), out_proj_covar=tensor([2.4165e-05, 2.2995e-05, 2.2444e-05, 3.2629e-05, 2.6086e-05, 2.5712e-05, + 3.1407e-05, 3.1155e-05], device='cuda:2') +2022-11-16 04:08:10,513 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.6816, 4.6382, 3.2589, 4.4220, 3.5236, 3.1959, 2.7985, 3.8918], + device='cuda:2'), covar=tensor([0.1381, 0.0232, 0.0984, 0.0350, 0.0681, 0.0921, 0.1620, 0.0328], + device='cuda:2'), in_proj_covar=tensor([0.0156, 0.0140, 0.0160, 0.0144, 0.0175, 0.0168, 0.0162, 0.0158], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 04:08:11,218 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.9932, 1.2167, 1.0856, 0.9885, 1.2961, 1.2266, 0.8374, 1.3343], + device='cuda:2'), covar=tensor([0.0063, 0.0038, 0.0061, 0.0054, 0.0052, 0.0053, 0.0095, 0.0050], + device='cuda:2'), in_proj_covar=tensor([0.0058, 0.0052, 0.0053, 0.0056, 0.0056, 0.0050, 0.0051, 0.0048], + device='cuda:2'), out_proj_covar=tensor([5.2123e-05, 4.6046e-05, 4.7200e-05, 5.0709e-05, 4.9681e-05, 4.3899e-05, + 4.5623e-05, 4.2165e-05], device='cuda:2') +2022-11-16 04:08:11,733 INFO [train.py:876] (2/4) Epoch 11, batch 3400, loss[loss=0.1278, simple_loss=0.1655, pruned_loss=0.04501, over 5577.00 frames. ], tot_loss[loss=0.1137, simple_loss=0.1417, pruned_loss=0.04283, over 1089403.53 frames. ], batch size: 46, lr: 7.36e-03, grad_scale: 8.0 +2022-11-16 04:08:20,083 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.046e+02 1.517e+02 1.870e+02 2.344e+02 4.526e+02, threshold=3.741e+02, percent-clipped=4.0 +2022-11-16 04:09:13,826 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.2046, 2.5355, 2.8351, 2.6264, 1.6747, 2.6718, 1.8825, 2.1371], + device='cuda:2'), covar=tensor([0.0284, 0.0165, 0.0134, 0.0240, 0.0380, 0.0172, 0.0348, 0.0191], + device='cuda:2'), in_proj_covar=tensor([0.0193, 0.0174, 0.0179, 0.0200, 0.0190, 0.0178, 0.0190, 0.0181], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 04:09:19,522 INFO [train.py:876] (2/4) Epoch 11, batch 3500, loss[loss=0.08048, simple_loss=0.1139, pruned_loss=0.02353, over 5526.00 frames. ], tot_loss[loss=0.1144, simple_loss=0.1421, pruned_loss=0.04336, over 1079985.81 frames. ], batch size: 10, lr: 7.35e-03, grad_scale: 8.0 +2022-11-16 04:09:27,971 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.997e+01 1.638e+02 2.033e+02 2.357e+02 4.512e+02, threshold=4.066e+02, percent-clipped=3.0 +2022-11-16 04:09:52,295 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=76269.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 04:10:08,546 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=76294.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:10:27,145 INFO [train.py:876] (2/4) Epoch 11, batch 3600, loss[loss=0.07212, simple_loss=0.1052, pruned_loss=0.01953, over 5323.00 frames. ], tot_loss[loss=0.1141, simple_loss=0.1417, pruned_loss=0.0433, over 1086884.78 frames. ], batch size: 9, lr: 7.35e-03, grad_scale: 8.0 +2022-11-16 04:10:28,578 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=76323.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 04:10:33,280 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=76330.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 04:10:35,634 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.020e+02 1.560e+02 1.906e+02 2.408e+02 5.224e+02, threshold=3.812e+02, percent-clipped=4.0 +2022-11-16 04:10:50,286 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=76355.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:11:09,729 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=76384.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 04:11:35,223 INFO [train.py:876] (2/4) Epoch 11, batch 3700, loss[loss=0.104, simple_loss=0.135, pruned_loss=0.03647, over 5552.00 frames. ], tot_loss[loss=0.1155, simple_loss=0.1429, pruned_loss=0.04404, over 1090584.64 frames. ], batch size: 16, lr: 7.34e-03, grad_scale: 8.0 +2022-11-16 04:11:43,593 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.959e+01 1.528e+02 1.916e+02 2.228e+02 3.767e+02, threshold=3.832e+02, percent-clipped=0.0 +2022-11-16 04:11:49,932 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.87 vs. limit=2.0 +2022-11-16 04:11:51,430 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.6142, 5.0588, 5.2858, 5.0168, 5.6878, 5.4947, 4.7867, 5.6424], + device='cuda:2'), covar=tensor([0.0305, 0.0335, 0.0558, 0.0299, 0.0268, 0.0191, 0.0225, 0.0226], + device='cuda:2'), in_proj_covar=tensor([0.0138, 0.0146, 0.0108, 0.0142, 0.0171, 0.0102, 0.0122, 0.0147], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-16 04:12:38,745 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.92 vs. limit=2.0 +2022-11-16 04:12:41,660 INFO [train.py:876] (2/4) Epoch 11, batch 3800, loss[loss=0.1199, simple_loss=0.1364, pruned_loss=0.05167, over 5652.00 frames. ], tot_loss[loss=0.1155, simple_loss=0.1425, pruned_loss=0.0443, over 1089020.50 frames. ], batch size: 29, lr: 7.34e-03, grad_scale: 8.0 +2022-11-16 04:12:50,477 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.425e+01 1.575e+02 2.020e+02 2.543e+02 6.057e+02, threshold=4.040e+02, percent-clipped=8.0 +2022-11-16 04:13:49,823 INFO [train.py:876] (2/4) Epoch 11, batch 3900, loss[loss=0.1254, simple_loss=0.1542, pruned_loss=0.04829, over 5680.00 frames. ], tot_loss[loss=0.1165, simple_loss=0.143, pruned_loss=0.04499, over 1082942.45 frames. ], batch size: 36, lr: 7.33e-03, grad_scale: 8.0 +2022-11-16 04:13:53,093 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=76625.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 04:13:59,641 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.483e+01 1.484e+02 1.748e+02 2.175e+02 4.162e+02, threshold=3.496e+02, percent-clipped=1.0 +2022-11-16 04:14:11,304 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=76650.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:14:14,899 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.4977, 4.9559, 5.3746, 5.0121, 5.5743, 5.4894, 4.7233, 5.5608], + device='cuda:2'), covar=tensor([0.0338, 0.0294, 0.0382, 0.0244, 0.0322, 0.0149, 0.0254, 0.0212], + device='cuda:2'), in_proj_covar=tensor([0.0140, 0.0147, 0.0109, 0.0143, 0.0171, 0.0103, 0.0123, 0.0148], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-16 04:14:19,685 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.1621, 3.2343, 3.0910, 3.1070, 3.0775, 3.3683, 3.8415, 3.4988], + device='cuda:2'), covar=tensor([0.0829, 0.0983, 0.1292, 0.1275, 0.0985, 0.0767, 0.0642, 0.2600], + device='cuda:2'), in_proj_covar=tensor([0.0104, 0.0101, 0.0101, 0.0098, 0.0089, 0.0097, 0.0093, 0.0075], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 04:14:31,201 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=76679.0, num_to_drop=1, layers_to_drop={2} +2022-11-16 04:14:59,834 INFO [train.py:876] (2/4) Epoch 11, batch 4000, loss[loss=0.1226, simple_loss=0.153, pruned_loss=0.04609, over 5639.00 frames. ], tot_loss[loss=0.116, simple_loss=0.1428, pruned_loss=0.04458, over 1086450.00 frames. ], batch size: 32, lr: 7.33e-03, grad_scale: 8.0 +2022-11-16 04:15:05,060 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.5009, 1.1452, 1.1466, 0.9877, 1.3446, 1.5263, 0.8496, 1.2112], + device='cuda:2'), covar=tensor([0.0370, 0.0499, 0.0788, 0.0947, 0.0697, 0.0338, 0.0976, 0.0539], + device='cuda:2'), in_proj_covar=tensor([0.0014, 0.0022, 0.0015, 0.0019, 0.0015, 0.0014, 0.0021, 0.0015], + device='cuda:2'), out_proj_covar=tensor([7.7766e-05, 1.0812e-04, 8.1558e-05, 9.6082e-05, 8.2474e-05, 7.7776e-05, + 1.0192e-04, 7.8792e-05], device='cuda:2') +2022-11-16 04:15:08,075 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.071e+02 1.537e+02 1.803e+02 2.088e+02 3.858e+02, threshold=3.606e+02, percent-clipped=2.0 +2022-11-16 04:15:47,402 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.59 vs. limit=2.0 +2022-11-16 04:15:57,127 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.1535, 2.4323, 3.5745, 3.2128, 4.1322, 2.5450, 3.6413, 4.2019], + device='cuda:2'), covar=tensor([0.0545, 0.1713, 0.0854, 0.1508, 0.0403, 0.1542, 0.1108, 0.0767], + device='cuda:2'), in_proj_covar=tensor([0.0243, 0.0191, 0.0214, 0.0210, 0.0238, 0.0195, 0.0226, 0.0230], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 04:16:07,303 INFO [train.py:876] (2/4) Epoch 11, batch 4100, loss[loss=0.08751, simple_loss=0.128, pruned_loss=0.0235, over 5727.00 frames. ], tot_loss[loss=0.1146, simple_loss=0.1416, pruned_loss=0.04379, over 1080941.41 frames. ], batch size: 17, lr: 7.32e-03, grad_scale: 8.0 +2022-11-16 04:16:07,575 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.64 vs. limit=2.0 +2022-11-16 04:16:15,771 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.037e+02 1.454e+02 1.745e+02 2.235e+02 4.051e+02, threshold=3.490e+02, percent-clipped=2.0 +2022-11-16 04:16:19,235 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=76839.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:16:48,496 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.2320, 3.8185, 3.4209, 3.7897, 3.8078, 3.2408, 3.4619, 3.3685], + device='cuda:2'), covar=tensor([0.1025, 0.0448, 0.1253, 0.0436, 0.0435, 0.0523, 0.0564, 0.0701], + device='cuda:2'), in_proj_covar=tensor([0.0134, 0.0177, 0.0272, 0.0173, 0.0216, 0.0171, 0.0185, 0.0175], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0004, 0.0003, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 04:17:00,352 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=76900.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:17:02,649 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.82 vs. limit=2.0 +2022-11-16 04:17:14,935 INFO [train.py:876] (2/4) Epoch 11, batch 4200, loss[loss=0.21, simple_loss=0.188, pruned_loss=0.116, over 2991.00 frames. ], tot_loss[loss=0.1151, simple_loss=0.1421, pruned_loss=0.04409, over 1079598.50 frames. ], batch size: 284, lr: 7.32e-03, grad_scale: 8.0 +2022-11-16 04:17:17,314 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.32 vs. limit=2.0 +2022-11-16 04:17:17,665 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=76925.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 04:17:23,252 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.852e+01 1.372e+02 1.800e+02 2.122e+02 4.143e+02, threshold=3.599e+02, percent-clipped=4.0 +2022-11-16 04:17:34,009 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=76950.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:17:34,587 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.6546, 4.6200, 4.9555, 4.7764, 4.5887, 4.0530, 5.2714, 4.7007], + device='cuda:2'), covar=tensor([0.0433, 0.0819, 0.0276, 0.1086, 0.0424, 0.0308, 0.0544, 0.0499], + device='cuda:2'), in_proj_covar=tensor([0.0087, 0.0108, 0.0094, 0.0119, 0.0090, 0.0079, 0.0145, 0.0102], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 04:17:38,052 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.87 vs. limit=5.0 +2022-11-16 04:17:49,230 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.6399, 2.7046, 2.4659, 2.9483, 2.2588, 2.2231, 2.6021, 3.0906], + device='cuda:2'), covar=tensor([0.1172, 0.1515, 0.1900, 0.1600, 0.1729, 0.1878, 0.1426, 0.2179], + device='cuda:2'), in_proj_covar=tensor([0.0107, 0.0103, 0.0103, 0.0100, 0.0091, 0.0099, 0.0095, 0.0077], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 04:17:49,769 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=76973.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 04:17:53,747 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=76979.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 04:18:06,027 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=76998.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:18:21,659 INFO [train.py:876] (2/4) Epoch 11, batch 4300, loss[loss=0.08408, simple_loss=0.1213, pruned_loss=0.02345, over 5551.00 frames. ], tot_loss[loss=0.1154, simple_loss=0.1426, pruned_loss=0.04405, over 1082977.03 frames. ], batch size: 14, lr: 7.31e-03, grad_scale: 8.0 +2022-11-16 04:18:25,913 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=77027.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 04:18:30,433 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.745e+01 1.515e+02 1.890e+02 2.345e+02 3.579e+02, threshold=3.779e+02, percent-clipped=0.0 +2022-11-16 04:19:11,525 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=77095.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:19:16,692 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.9466, 1.4866, 1.3893, 1.3507, 1.1069, 1.8926, 1.4580, 1.1549], + device='cuda:2'), covar=tensor([0.3369, 0.1327, 0.2775, 0.3022, 0.3212, 0.1075, 0.2260, 0.3760], + device='cuda:2'), in_proj_covar=tensor([0.0099, 0.0087, 0.0087, 0.0097, 0.0069, 0.0064, 0.0074, 0.0086], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-16 04:19:28,276 INFO [train.py:876] (2/4) Epoch 11, batch 4400, loss[loss=0.1428, simple_loss=0.1597, pruned_loss=0.06299, over 5394.00 frames. ], tot_loss[loss=0.1158, simple_loss=0.1428, pruned_loss=0.04439, over 1078729.42 frames. ], batch size: 70, lr: 7.31e-03, grad_scale: 8.0 +2022-11-16 04:19:37,949 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.506e+01 1.521e+02 1.875e+02 2.343e+02 5.225e+02, threshold=3.749e+02, percent-clipped=3.0 +2022-11-16 04:19:38,762 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.7355, 3.5577, 3.6661, 3.5510, 3.8786, 3.3958, 1.4167, 3.9445], + device='cuda:2'), covar=tensor([0.0345, 0.0509, 0.0376, 0.0432, 0.0361, 0.0518, 0.3521, 0.0325], + device='cuda:2'), in_proj_covar=tensor([0.0106, 0.0087, 0.0089, 0.0081, 0.0103, 0.0090, 0.0133, 0.0108], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 04:19:52,379 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=77156.0, num_to_drop=1, layers_to_drop={2} +2022-11-16 04:20:09,012 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.0209, 4.0791, 4.2166, 3.4413, 2.3304, 4.3615, 2.3741, 3.6758], + device='cuda:2'), covar=tensor([0.0353, 0.0228, 0.0162, 0.0389, 0.0540, 0.0117, 0.0543, 0.0167], + device='cuda:2'), in_proj_covar=tensor([0.0192, 0.0175, 0.0179, 0.0202, 0.0191, 0.0178, 0.0190, 0.0181], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 04:20:19,051 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=77195.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:20:29,857 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=77211.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:20:36,407 INFO [train.py:876] (2/4) Epoch 11, batch 4500, loss[loss=0.1159, simple_loss=0.1564, pruned_loss=0.03773, over 5689.00 frames. ], tot_loss[loss=0.1129, simple_loss=0.1406, pruned_loss=0.04262, over 1082043.07 frames. ], batch size: 28, lr: 7.31e-03, grad_scale: 8.0 +2022-11-16 04:20:45,515 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.537e+01 1.435e+02 1.838e+02 2.199e+02 4.502e+02, threshold=3.675e+02, percent-clipped=1.0 +2022-11-16 04:20:46,682 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.4070, 0.9361, 0.5046, 0.6267, 0.8094, 0.8769, 0.3813, 0.8798], + device='cuda:2'), covar=tensor([0.0074, 0.0030, 0.0048, 0.0034, 0.0038, 0.0049, 0.0077, 0.0045], + device='cuda:2'), in_proj_covar=tensor([0.0057, 0.0052, 0.0053, 0.0056, 0.0055, 0.0050, 0.0050, 0.0048], + device='cuda:2'), out_proj_covar=tensor([5.1544e-05, 4.6652e-05, 4.6742e-05, 5.0286e-05, 4.8951e-05, 4.3916e-05, + 4.4941e-05, 4.2287e-05], device='cuda:2') +2022-11-16 04:21:11,332 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=77272.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:21:11,933 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=77273.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:21:14,569 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=77277.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:21:44,376 INFO [train.py:876] (2/4) Epoch 11, batch 4600, loss[loss=0.1125, simple_loss=0.1464, pruned_loss=0.03928, over 5744.00 frames. ], tot_loss[loss=0.1139, simple_loss=0.1416, pruned_loss=0.04317, over 1080411.10 frames. ], batch size: 27, lr: 7.30e-03, grad_scale: 8.0 +2022-11-16 04:21:52,863 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.709e+01 1.569e+02 2.032e+02 2.456e+02 5.240e+02, threshold=4.063e+02, percent-clipped=2.0 +2022-11-16 04:21:53,371 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=77334.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:21:56,305 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=77338.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:22:08,133 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.8508, 2.2534, 3.0694, 1.9982, 1.8538, 3.5079, 2.9181, 2.2836], + device='cuda:2'), covar=tensor([0.0785, 0.1017, 0.0646, 0.2609, 0.2368, 0.1875, 0.1136, 0.1358], + device='cuda:2'), in_proj_covar=tensor([0.0098, 0.0088, 0.0087, 0.0097, 0.0070, 0.0064, 0.0074, 0.0087], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-16 04:22:15,320 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.7686, 4.9771, 3.7322, 2.1530, 4.5334, 1.8556, 4.5124, 2.5979], + device='cuda:2'), covar=tensor([0.1445, 0.0130, 0.0417, 0.2071, 0.0194, 0.1901, 0.0163, 0.1569], + device='cuda:2'), in_proj_covar=tensor([0.0121, 0.0105, 0.0114, 0.0114, 0.0102, 0.0123, 0.0100, 0.0111], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 04:22:45,866 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.9138, 1.3579, 1.3500, 1.3208, 0.8070, 1.8652, 1.5055, 1.1280], + device='cuda:2'), covar=tensor([0.3779, 0.1389, 0.3203, 0.3364, 0.4078, 0.0944, 0.2533, 0.3619], + device='cuda:2'), in_proj_covar=tensor([0.0098, 0.0088, 0.0087, 0.0097, 0.0071, 0.0064, 0.0074, 0.0087], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-16 04:22:52,448 INFO [train.py:876] (2/4) Epoch 11, batch 4700, loss[loss=0.1015, simple_loss=0.1438, pruned_loss=0.02955, over 5478.00 frames. ], tot_loss[loss=0.114, simple_loss=0.142, pruned_loss=0.04304, over 1086267.81 frames. ], batch size: 12, lr: 7.30e-03, grad_scale: 8.0 +2022-11-16 04:23:00,909 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.694e+01 1.424e+02 1.701e+02 2.094e+02 3.279e+02, threshold=3.401e+02, percent-clipped=0.0 +2022-11-16 04:23:12,368 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=77451.0, num_to_drop=1, layers_to_drop={3} +2022-11-16 04:23:42,074 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=77495.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:24:00,662 INFO [train.py:876] (2/4) Epoch 11, batch 4800, loss[loss=0.1234, simple_loss=0.1445, pruned_loss=0.05113, over 5061.00 frames. ], tot_loss[loss=0.1129, simple_loss=0.1415, pruned_loss=0.04214, over 1090866.77 frames. ], batch size: 91, lr: 7.29e-03, grad_scale: 8.0 +2022-11-16 04:24:09,188 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.251e+01 1.590e+02 1.859e+02 2.447e+02 5.021e+02, threshold=3.719e+02, percent-clipped=6.0 +2022-11-16 04:24:15,209 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=77543.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:24:16,041 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.6703, 2.3756, 3.3749, 3.1251, 3.3296, 2.5052, 3.2325, 3.6401], + device='cuda:2'), covar=tensor([0.0682, 0.1528, 0.0929, 0.1619, 0.0693, 0.1450, 0.1032, 0.0882], + device='cuda:2'), in_proj_covar=tensor([0.0240, 0.0190, 0.0208, 0.0208, 0.0234, 0.0192, 0.0220, 0.0226], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 04:24:32,380 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=77567.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:25:05,982 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=77616.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:25:09,014 INFO [train.py:876] (2/4) Epoch 11, batch 4900, loss[loss=0.08774, simple_loss=0.1266, pruned_loss=0.02446, over 5531.00 frames. ], tot_loss[loss=0.1161, simple_loss=0.1433, pruned_loss=0.04442, over 1090350.62 frames. ], batch size: 14, lr: 7.29e-03, grad_scale: 8.0 +2022-11-16 04:25:14,423 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=77629.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:25:17,035 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=77633.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:25:17,558 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.666e+01 1.544e+02 1.877e+02 2.554e+02 4.573e+02, threshold=3.753e+02, percent-clipped=4.0 +2022-11-16 04:25:40,975 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.46 vs. limit=2.0 +2022-11-16 04:25:46,993 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=77677.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:25:47,013 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=77677.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:26:16,783 INFO [train.py:876] (2/4) Epoch 11, batch 5000, loss[loss=0.09564, simple_loss=0.1348, pruned_loss=0.02823, over 5609.00 frames. ], tot_loss[loss=0.116, simple_loss=0.1429, pruned_loss=0.04459, over 1086724.90 frames. ], batch size: 23, lr: 7.28e-03, grad_scale: 8.0 +2022-11-16 04:26:22,725 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.8063, 3.7125, 3.8795, 3.7249, 3.7954, 3.7526, 1.5886, 3.8988], + device='cuda:2'), covar=tensor([0.0479, 0.0585, 0.0510, 0.0518, 0.0540, 0.0510, 0.4046, 0.0525], + device='cuda:2'), in_proj_covar=tensor([0.0102, 0.0085, 0.0087, 0.0079, 0.0100, 0.0088, 0.0129, 0.0105], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 04:26:25,204 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.970e+01 1.490e+02 1.911e+02 2.304e+02 5.675e+02, threshold=3.822e+02, percent-clipped=3.0 +2022-11-16 04:26:28,008 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=77738.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:26:36,354 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=77751.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 04:27:09,052 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=77799.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:27:23,482 INFO [train.py:876] (2/4) Epoch 11, batch 5100, loss[loss=0.06885, simple_loss=0.1058, pruned_loss=0.01592, over 5726.00 frames. ], tot_loss[loss=0.115, simple_loss=0.1421, pruned_loss=0.04396, over 1087335.24 frames. ], batch size: 11, lr: 7.28e-03, grad_scale: 8.0 +2022-11-16 04:27:32,549 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.765e+01 1.557e+02 2.003e+02 2.576e+02 4.677e+02, threshold=4.007e+02, percent-clipped=1.0 +2022-11-16 04:27:54,799 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=77867.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:28:26,611 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=77915.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:28:29,373 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.2326, 2.1419, 2.6212, 1.8941, 1.3475, 3.0990, 2.6303, 2.1851], + device='cuda:2'), covar=tensor([0.1499, 0.2030, 0.0715, 0.2644, 0.3161, 0.1050, 0.1601, 0.1399], + device='cuda:2'), in_proj_covar=tensor([0.0098, 0.0088, 0.0088, 0.0097, 0.0070, 0.0064, 0.0074, 0.0087], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-16 04:28:30,554 INFO [train.py:876] (2/4) Epoch 11, batch 5200, loss[loss=0.06187, simple_loss=0.09716, pruned_loss=0.01329, over 5284.00 frames. ], tot_loss[loss=0.1154, simple_loss=0.1424, pruned_loss=0.0442, over 1091380.01 frames. ], batch size: 6, lr: 7.27e-03, grad_scale: 8.0 +2022-11-16 04:28:35,985 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=77929.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:28:38,567 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=77933.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:28:39,080 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.034e+02 1.507e+02 1.817e+02 2.273e+02 5.327e+02, threshold=3.634e+02, percent-clipped=3.0 +2022-11-16 04:28:57,823 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.6118, 1.9640, 3.2197, 2.6650, 3.4482, 2.0512, 2.7361, 3.6560], + device='cuda:2'), covar=tensor([0.0728, 0.2371, 0.1019, 0.1907, 0.0815, 0.2249, 0.1848, 0.0909], + device='cuda:2'), in_proj_covar=tensor([0.0245, 0.0194, 0.0213, 0.0212, 0.0240, 0.0195, 0.0226, 0.0228], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 04:29:04,828 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=77972.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:29:08,138 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=77977.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:29:10,793 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=77981.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:29:30,396 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.57 vs. limit=2.0 +2022-11-16 04:29:38,680 INFO [train.py:876] (2/4) Epoch 11, batch 5300, loss[loss=0.128, simple_loss=0.1397, pruned_loss=0.05813, over 4132.00 frames. ], tot_loss[loss=0.1149, simple_loss=0.142, pruned_loss=0.0439, over 1085546.21 frames. ], batch size: 181, lr: 7.27e-03, grad_scale: 8.0 +2022-11-16 04:29:46,505 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=78033.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:29:47,078 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.555e+01 1.542e+02 1.854e+02 2.251e+02 5.839e+02, threshold=3.709e+02, percent-clipped=3.0 +2022-11-16 04:29:52,497 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.5835, 1.1346, 1.3069, 0.7793, 1.1867, 1.2827, 0.8181, 1.0973], + device='cuda:2'), covar=tensor([0.0250, 0.0383, 0.0319, 0.0681, 0.0468, 0.0600, 0.0663, 0.0327], + device='cuda:2'), in_proj_covar=tensor([0.0015, 0.0023, 0.0016, 0.0020, 0.0016, 0.0015, 0.0022, 0.0015], + device='cuda:2'), out_proj_covar=tensor([8.2387e-05, 1.1220e-04, 8.5614e-05, 9.9879e-05, 8.6743e-05, 8.1224e-05, + 1.0764e-04, 8.2495e-05], device='cuda:2') +2022-11-16 04:30:13,464 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.3592, 2.1124, 2.5956, 1.8855, 1.5996, 3.1394, 2.6412, 2.1461], + device='cuda:2'), covar=tensor([0.1075, 0.1469, 0.0694, 0.2541, 0.3054, 0.1113, 0.1008, 0.1330], + device='cuda:2'), in_proj_covar=tensor([0.0098, 0.0088, 0.0089, 0.0098, 0.0071, 0.0064, 0.0075, 0.0087], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-16 04:30:32,314 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.9384, 3.7883, 3.8192, 4.0590, 3.7496, 3.5125, 4.3700, 4.0172], + device='cuda:2'), covar=tensor([0.0472, 0.0803, 0.0477, 0.0928, 0.0572, 0.0396, 0.0714, 0.0588], + device='cuda:2'), in_proj_covar=tensor([0.0088, 0.0111, 0.0095, 0.0120, 0.0090, 0.0081, 0.0147, 0.0104], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 04:30:46,419 INFO [train.py:876] (2/4) Epoch 11, batch 5400, loss[loss=0.1618, simple_loss=0.1598, pruned_loss=0.08185, over 4164.00 frames. ], tot_loss[loss=0.1137, simple_loss=0.1415, pruned_loss=0.04293, over 1087834.48 frames. ], batch size: 181, lr: 7.26e-03, grad_scale: 16.0 +2022-11-16 04:30:51,507 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.5780, 3.6473, 3.5387, 3.3059, 1.9832, 3.4794, 2.1410, 3.1037], + device='cuda:2'), covar=tensor([0.0380, 0.0170, 0.0166, 0.0310, 0.0589, 0.0181, 0.0520, 0.0178], + device='cuda:2'), in_proj_covar=tensor([0.0190, 0.0173, 0.0178, 0.0199, 0.0189, 0.0176, 0.0188, 0.0179], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 04:30:55,260 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.062e+02 1.532e+02 1.889e+02 2.326e+02 4.779e+02, threshold=3.778e+02, percent-clipped=5.0 +2022-11-16 04:31:01,418 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.8566, 4.7064, 4.9489, 4.8320, 4.4797, 4.4574, 5.3855, 4.8961], + device='cuda:2'), covar=tensor([0.0389, 0.0748, 0.0497, 0.1403, 0.0563, 0.0350, 0.0640, 0.0525], + device='cuda:2'), in_proj_covar=tensor([0.0088, 0.0111, 0.0095, 0.0120, 0.0090, 0.0081, 0.0147, 0.0104], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 04:31:29,746 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.4831, 3.9746, 3.5573, 3.9859, 3.9709, 3.4204, 3.6293, 3.5501], + device='cuda:2'), covar=tensor([0.0814, 0.0463, 0.1390, 0.0430, 0.0443, 0.0552, 0.0632, 0.0632], + device='cuda:2'), in_proj_covar=tensor([0.0135, 0.0179, 0.0277, 0.0175, 0.0220, 0.0174, 0.0188, 0.0174], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0004, 0.0003, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 04:31:55,273 INFO [train.py:876] (2/4) Epoch 11, batch 5500, loss[loss=0.1358, simple_loss=0.1414, pruned_loss=0.06504, over 4127.00 frames. ], tot_loss[loss=0.1139, simple_loss=0.1415, pruned_loss=0.04311, over 1081739.46 frames. ], batch size: 181, lr: 7.26e-03, grad_scale: 16.0 +2022-11-16 04:32:02,843 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=78232.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:32:03,999 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.013e+02 1.487e+02 1.903e+02 2.328e+02 5.113e+02, threshold=3.806e+02, percent-clipped=2.0 +2022-11-16 04:32:29,974 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=78272.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:32:44,002 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=78293.0, num_to_drop=1, layers_to_drop={2} +2022-11-16 04:32:58,205 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.5439, 3.5945, 3.6240, 3.1904, 2.0705, 3.5136, 2.2681, 3.0008], + device='cuda:2'), covar=tensor([0.0421, 0.0158, 0.0199, 0.0379, 0.0520, 0.0180, 0.0438, 0.0181], + device='cuda:2'), in_proj_covar=tensor([0.0191, 0.0174, 0.0180, 0.0201, 0.0191, 0.0178, 0.0189, 0.0181], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 04:33:01,995 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=78320.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:33:02,618 INFO [train.py:876] (2/4) Epoch 11, batch 5600, loss[loss=0.09706, simple_loss=0.1357, pruned_loss=0.02921, over 5594.00 frames. ], tot_loss[loss=0.1148, simple_loss=0.1422, pruned_loss=0.04366, over 1083962.43 frames. ], batch size: 23, lr: 7.25e-03, grad_scale: 16.0 +2022-11-16 04:33:10,743 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.3147, 2.9420, 3.1048, 2.9004, 1.9705, 2.9955, 2.1484, 2.7233], + device='cuda:2'), covar=tensor([0.0291, 0.0130, 0.0135, 0.0224, 0.0371, 0.0154, 0.0360, 0.0141], + device='cuda:2'), in_proj_covar=tensor([0.0191, 0.0173, 0.0179, 0.0201, 0.0190, 0.0177, 0.0189, 0.0180], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 04:33:11,301 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=78333.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:33:11,813 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.428e+01 1.405e+02 1.623e+02 2.102e+02 3.893e+02, threshold=3.245e+02, percent-clipped=1.0 +2022-11-16 04:33:20,946 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.22 vs. limit=2.0 +2022-11-16 04:33:38,193 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.91 vs. limit=2.0 +2022-11-16 04:33:43,985 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=78381.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:33:59,905 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.3295, 2.7185, 3.8312, 3.6510, 4.1117, 2.8604, 3.7960, 4.2663], + device='cuda:2'), covar=tensor([0.0458, 0.1337, 0.0819, 0.1268, 0.0492, 0.1481, 0.1000, 0.0499], + device='cuda:2'), in_proj_covar=tensor([0.0240, 0.0190, 0.0210, 0.0208, 0.0236, 0.0192, 0.0222, 0.0225], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 04:34:11,589 INFO [train.py:876] (2/4) Epoch 11, batch 5700, loss[loss=0.09744, simple_loss=0.1218, pruned_loss=0.03653, over 5720.00 frames. ], tot_loss[loss=0.1135, simple_loss=0.1414, pruned_loss=0.04281, over 1089195.86 frames. ], batch size: 12, lr: 7.25e-03, grad_scale: 16.0 +2022-11-16 04:34:20,541 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.114e+01 1.453e+02 1.885e+02 2.462e+02 5.318e+02, threshold=3.770e+02, percent-clipped=5.0 +2022-11-16 04:34:37,886 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=78460.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:34:41,038 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.5325, 3.7933, 3.7776, 3.5070, 3.6433, 3.7101, 1.4793, 3.7743], + device='cuda:2'), covar=tensor([0.0484, 0.0393, 0.0438, 0.0443, 0.0467, 0.0393, 0.3865, 0.0440], + device='cuda:2'), in_proj_covar=tensor([0.0103, 0.0088, 0.0089, 0.0081, 0.0103, 0.0091, 0.0134, 0.0108], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 04:34:57,412 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.66 vs. limit=5.0 +2022-11-16 04:35:08,991 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.3572, 4.7445, 4.2843, 4.7083, 4.6980, 4.0342, 4.3203, 4.0293], + device='cuda:2'), covar=tensor([0.0352, 0.0371, 0.1258, 0.0446, 0.0444, 0.0462, 0.0684, 0.0597], + device='cuda:2'), in_proj_covar=tensor([0.0135, 0.0178, 0.0274, 0.0174, 0.0219, 0.0172, 0.0186, 0.0174], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0004, 0.0003, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 04:35:18,650 INFO [train.py:876] (2/4) Epoch 11, batch 5800, loss[loss=0.08077, simple_loss=0.1171, pruned_loss=0.02222, over 5521.00 frames. ], tot_loss[loss=0.1136, simple_loss=0.142, pruned_loss=0.04262, over 1095114.82 frames. ], batch size: 17, lr: 7.24e-03, grad_scale: 16.0 +2022-11-16 04:35:18,828 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=78521.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:35:27,563 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.244e+01 1.530e+02 1.946e+02 2.399e+02 7.039e+02, threshold=3.892e+02, percent-clipped=3.0 +2022-11-16 04:35:39,083 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.4697, 1.9323, 2.2310, 2.6374, 2.8150, 2.1454, 1.7520, 2.7186], + device='cuda:2'), covar=tensor([0.2036, 0.2318, 0.1838, 0.1244, 0.1104, 0.2675, 0.2054, 0.1511], + device='cuda:2'), in_proj_covar=tensor([0.0253, 0.0199, 0.0189, 0.0303, 0.0225, 0.0207, 0.0189, 0.0246], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0006], + device='cuda:2') +2022-11-16 04:35:42,016 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.33 vs. limit=2.0 +2022-11-16 04:35:42,336 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=78556.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:35:46,008 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.5933, 1.1832, 1.4887, 1.1563, 1.6039, 1.5096, 1.0337, 1.4078], + device='cuda:2'), covar=tensor([0.0711, 0.0536, 0.0377, 0.0806, 0.0721, 0.0816, 0.0646, 0.0468], + device='cuda:2'), in_proj_covar=tensor([0.0015, 0.0023, 0.0016, 0.0020, 0.0016, 0.0015, 0.0022, 0.0015], + device='cuda:2'), out_proj_covar=tensor([8.2019e-05, 1.1312e-04, 8.5592e-05, 1.0102e-04, 8.6756e-05, 8.1444e-05, + 1.0743e-04, 8.1611e-05], device='cuda:2') +2022-11-16 04:36:03,411 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=78588.0, num_to_drop=1, layers_to_drop={3} +2022-11-16 04:36:23,763 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=78617.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:36:26,219 INFO [train.py:876] (2/4) Epoch 11, batch 5900, loss[loss=0.1248, simple_loss=0.152, pruned_loss=0.04884, over 5560.00 frames. ], tot_loss[loss=0.113, simple_loss=0.1414, pruned_loss=0.04231, over 1091919.44 frames. ], batch size: 25, lr: 7.24e-03, grad_scale: 16.0 +2022-11-16 04:36:34,706 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.102e+02 1.483e+02 1.879e+02 2.262e+02 4.846e+02, threshold=3.758e+02, percent-clipped=3.0 +2022-11-16 04:37:33,705 INFO [train.py:876] (2/4) Epoch 11, batch 6000, loss[loss=0.1333, simple_loss=0.1599, pruned_loss=0.05331, over 5621.00 frames. ], tot_loss[loss=0.1126, simple_loss=0.141, pruned_loss=0.04208, over 1090949.13 frames. ], batch size: 38, lr: 7.24e-03, grad_scale: 16.0 +2022-11-16 04:37:33,705 INFO [train.py:899] (2/4) Computing validation loss +2022-11-16 04:37:45,014 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.5193, 4.2708, 4.2472, 4.2372, 4.5839, 4.4933, 4.2861, 4.6066], + device='cuda:2'), covar=tensor([0.0321, 0.0371, 0.0481, 0.0436, 0.0403, 0.0227, 0.0269, 0.0293], + device='cuda:2'), in_proj_covar=tensor([0.0142, 0.0152, 0.0109, 0.0143, 0.0176, 0.0107, 0.0124, 0.0152], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-16 04:37:51,354 INFO [train.py:908] (2/4) Epoch 11, validation: loss=0.1691, simple_loss=0.1834, pruned_loss=0.07744, over 1530663.00 frames. +2022-11-16 04:37:51,355 INFO [train.py:909] (2/4) Maximum memory allocated so far is 4715MB +2022-11-16 04:37:59,808 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.061e+02 1.487e+02 1.823e+02 2.133e+02 3.868e+02, threshold=3.646e+02, percent-clipped=1.0 +2022-11-16 04:38:32,445 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.47 vs. limit=5.0 +2022-11-16 04:38:56,297 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=78816.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:38:59,584 INFO [train.py:876] (2/4) Epoch 11, batch 6100, loss[loss=0.1534, simple_loss=0.1676, pruned_loss=0.06958, over 5551.00 frames. ], tot_loss[loss=0.1134, simple_loss=0.1414, pruned_loss=0.04277, over 1080375.26 frames. ], batch size: 40, lr: 7.23e-03, grad_scale: 16.0 +2022-11-16 04:39:01,020 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1969, 1.5891, 1.8878, 1.5271, 1.7677, 1.9688, 1.5260, 1.4301], + device='cuda:2'), covar=tensor([0.0026, 0.0037, 0.0040, 0.0061, 0.0071, 0.0059, 0.0037, 0.0040], + device='cuda:2'), in_proj_covar=tensor([0.0027, 0.0025, 0.0025, 0.0034, 0.0028, 0.0027, 0.0032, 0.0031], + device='cuda:2'), out_proj_covar=tensor([2.4583e-05, 2.3177e-05, 2.2595e-05, 3.3207e-05, 2.6468e-05, 2.5158e-05, + 3.1246e-05, 3.0164e-05], device='cuda:2') +2022-11-16 04:39:08,233 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.645e+01 1.456e+02 1.777e+02 2.158e+02 4.181e+02, threshold=3.555e+02, percent-clipped=3.0 +2022-11-16 04:39:40,979 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.0972, 4.8309, 5.1408, 5.2945, 4.8744, 4.3868, 5.5707, 4.9337], + device='cuda:2'), covar=tensor([0.0336, 0.0939, 0.0452, 0.0818, 0.0305, 0.0304, 0.0542, 0.0518], + device='cuda:2'), in_proj_covar=tensor([0.0087, 0.0111, 0.0095, 0.0120, 0.0089, 0.0081, 0.0145, 0.0104], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 04:39:44,976 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=78888.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:40:01,685 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=78912.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:40:07,449 INFO [train.py:876] (2/4) Epoch 11, batch 6200, loss[loss=0.1338, simple_loss=0.1531, pruned_loss=0.05724, over 5570.00 frames. ], tot_loss[loss=0.1143, simple_loss=0.1417, pruned_loss=0.04345, over 1070904.64 frames. ], batch size: 22, lr: 7.23e-03, grad_scale: 16.0 +2022-11-16 04:40:16,300 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.628e+01 1.452e+02 1.821e+02 2.109e+02 4.865e+02, threshold=3.642e+02, percent-clipped=3.0 +2022-11-16 04:40:17,681 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=78936.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:40:37,741 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=78965.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:40:44,714 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.8005, 2.6056, 2.6860, 2.4262, 2.8219, 2.7279, 2.7289, 2.7854], + device='cuda:2'), covar=tensor([0.0441, 0.0489, 0.0512, 0.0522, 0.0478, 0.0266, 0.0409, 0.0648], + device='cuda:2'), in_proj_covar=tensor([0.0141, 0.0150, 0.0109, 0.0143, 0.0176, 0.0105, 0.0124, 0.0152], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-16 04:40:59,179 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.7625, 2.6256, 2.2844, 2.8293, 2.2389, 2.4493, 2.2783, 3.1315], + device='cuda:2'), covar=tensor([0.1255, 0.1687, 0.2159, 0.1288, 0.1663, 0.1261, 0.1732, 0.1134], + device='cuda:2'), in_proj_covar=tensor([0.0110, 0.0106, 0.0105, 0.0102, 0.0093, 0.0100, 0.0097, 0.0079], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 04:41:15,715 INFO [train.py:876] (2/4) Epoch 11, batch 6300, loss[loss=0.08727, simple_loss=0.1278, pruned_loss=0.02337, over 5741.00 frames. ], tot_loss[loss=0.1145, simple_loss=0.1415, pruned_loss=0.04379, over 1073532.11 frames. ], batch size: 20, lr: 7.22e-03, grad_scale: 16.0 +2022-11-16 04:41:17,772 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0485, 1.1554, 1.5949, 1.1438, 1.4671, 1.5444, 1.1056, 1.2326], + device='cuda:2'), covar=tensor([0.0261, 0.1048, 0.0707, 0.1009, 0.1611, 0.1945, 0.1324, 0.0877], + device='cuda:2'), in_proj_covar=tensor([0.0014, 0.0022, 0.0015, 0.0019, 0.0016, 0.0014, 0.0021, 0.0015], + device='cuda:2'), out_proj_covar=tensor([7.9011e-05, 1.0899e-04, 8.2121e-05, 9.7907e-05, 8.3796e-05, 7.9408e-05, + 1.0310e-04, 7.8887e-05], device='cuda:2') +2022-11-16 04:41:19,124 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=79026.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:41:22,622 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.16 vs. limit=2.0 +2022-11-16 04:41:24,115 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.143e+02 1.477e+02 1.821e+02 2.187e+02 5.336e+02, threshold=3.643e+02, percent-clipped=2.0 +2022-11-16 04:41:45,203 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.44 vs. limit=2.0 +2022-11-16 04:41:53,648 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.6641, 3.6001, 3.5314, 3.6442, 3.4614, 3.1822, 4.0306, 3.6332], + device='cuda:2'), covar=tensor([0.0470, 0.0839, 0.0514, 0.1150, 0.0584, 0.0450, 0.0701, 0.0674], + device='cuda:2'), in_proj_covar=tensor([0.0087, 0.0111, 0.0095, 0.0119, 0.0090, 0.0081, 0.0145, 0.0103], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 04:42:02,161 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.8427, 3.0649, 3.0826, 2.8201, 2.9631, 2.9584, 1.2772, 3.0670], + device='cuda:2'), covar=tensor([0.0327, 0.0285, 0.0307, 0.0368, 0.0392, 0.0361, 0.3004, 0.0350], + device='cuda:2'), in_proj_covar=tensor([0.0102, 0.0086, 0.0087, 0.0080, 0.0102, 0.0090, 0.0133, 0.0107], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 04:42:19,621 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=79116.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:42:23,418 INFO [train.py:876] (2/4) Epoch 11, batch 6400, loss[loss=0.1124, simple_loss=0.1428, pruned_loss=0.04097, over 5592.00 frames. ], tot_loss[loss=0.1138, simple_loss=0.1412, pruned_loss=0.04323, over 1080163.73 frames. ], batch size: 22, lr: 7.22e-03, grad_scale: 16.0 +2022-11-16 04:42:32,273 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.453e+01 1.576e+02 1.936e+02 2.244e+02 4.119e+02, threshold=3.873e+02, percent-clipped=1.0 +2022-11-16 04:42:52,526 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=79164.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:43:25,628 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=79212.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:43:31,342 INFO [train.py:876] (2/4) Epoch 11, batch 6500, loss[loss=0.1078, simple_loss=0.1199, pruned_loss=0.04787, over 5722.00 frames. ], tot_loss[loss=0.1144, simple_loss=0.1416, pruned_loss=0.04364, over 1077749.47 frames. ], batch size: 11, lr: 7.21e-03, grad_scale: 16.0 +2022-11-16 04:43:35,572 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.95 vs. limit=2.0 +2022-11-16 04:43:40,082 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.904e+01 1.514e+02 1.877e+02 2.297e+02 4.296e+02, threshold=3.754e+02, percent-clipped=3.0 +2022-11-16 04:43:57,781 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=79260.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:44:14,394 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=79285.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 04:44:39,326 INFO [train.py:876] (2/4) Epoch 11, batch 6600, loss[loss=0.08528, simple_loss=0.1288, pruned_loss=0.02088, over 5508.00 frames. ], tot_loss[loss=0.1133, simple_loss=0.1408, pruned_loss=0.0429, over 1080595.52 frames. ], batch size: 17, lr: 7.21e-03, grad_scale: 16.0 +2022-11-16 04:44:39,408 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=79321.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:44:47,242 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=79333.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:44:47,747 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.843e+01 1.552e+02 2.036e+02 2.359e+02 4.515e+02, threshold=4.072e+02, percent-clipped=1.0 +2022-11-16 04:44:56,170 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=79346.0, num_to_drop=1, layers_to_drop={2} +2022-11-16 04:45:08,451 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=79364.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:45:15,653 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=79375.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 04:45:28,437 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=79394.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:45:46,989 INFO [train.py:876] (2/4) Epoch 11, batch 6700, loss[loss=0.06981, simple_loss=0.1033, pruned_loss=0.01817, over 5705.00 frames. ], tot_loss[loss=0.1161, simple_loss=0.1426, pruned_loss=0.0448, over 1081475.08 frames. ], batch size: 11, lr: 7.20e-03, grad_scale: 16.0 +2022-11-16 04:45:48,568 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.66 vs. limit=2.0 +2022-11-16 04:45:49,765 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=79425.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:45:55,339 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.416e+01 1.570e+02 1.880e+02 2.430e+02 4.197e+02, threshold=3.759e+02, percent-clipped=3.0 +2022-11-16 04:45:56,828 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=79436.0, num_to_drop=1, layers_to_drop={3} +2022-11-16 04:46:11,448 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.8959, 1.5036, 2.0119, 1.4313, 1.7713, 1.8174, 1.5827, 1.2503], + device='cuda:2'), covar=tensor([0.0038, 0.0046, 0.0020, 0.0050, 0.0079, 0.0070, 0.0037, 0.0064], + device='cuda:2'), in_proj_covar=tensor([0.0026, 0.0025, 0.0025, 0.0034, 0.0029, 0.0026, 0.0032, 0.0031], + device='cuda:2'), out_proj_covar=tensor([2.4123e-05, 2.3221e-05, 2.2584e-05, 3.2797e-05, 2.6616e-05, 2.4912e-05, + 3.0968e-05, 3.0332e-05], device='cuda:2') +2022-11-16 04:46:24,585 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.8873, 4.7713, 4.9330, 4.8556, 4.2276, 4.0811, 5.4416, 4.8091], + device='cuda:2'), covar=tensor([0.0384, 0.0698, 0.0379, 0.1016, 0.0546, 0.0288, 0.0556, 0.0476], + device='cuda:2'), in_proj_covar=tensor([0.0088, 0.0113, 0.0096, 0.0121, 0.0091, 0.0082, 0.0150, 0.0105], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 04:46:28,205 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.49 vs. limit=5.0 +2022-11-16 04:46:34,704 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.96 vs. limit=2.0 +2022-11-16 04:46:46,702 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.4511, 5.0656, 5.3197, 4.8586, 5.6224, 5.4839, 4.8035, 5.5905], + device='cuda:2'), covar=tensor([0.0428, 0.0309, 0.0381, 0.0261, 0.0308, 0.0185, 0.0238, 0.0215], + device='cuda:2'), in_proj_covar=tensor([0.0141, 0.0151, 0.0108, 0.0142, 0.0175, 0.0105, 0.0123, 0.0150], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-16 04:46:54,265 INFO [train.py:876] (2/4) Epoch 11, batch 6800, loss[loss=0.07619, simple_loss=0.1123, pruned_loss=0.02001, over 5786.00 frames. ], tot_loss[loss=0.1152, simple_loss=0.1419, pruned_loss=0.04426, over 1078480.17 frames. ], batch size: 16, lr: 7.20e-03, grad_scale: 16.0 +2022-11-16 04:47:03,442 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.898e+01 1.544e+02 1.830e+02 2.349e+02 4.429e+02, threshold=3.660e+02, percent-clipped=3.0 +2022-11-16 04:47:16,293 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0933, 1.5946, 1.9753, 1.7325, 1.8198, 1.9034, 1.7457, 1.6167], + device='cuda:2'), covar=tensor([0.0057, 0.0051, 0.0065, 0.0050, 0.0117, 0.0074, 0.0037, 0.0044], + device='cuda:2'), in_proj_covar=tensor([0.0027, 0.0025, 0.0025, 0.0034, 0.0029, 0.0027, 0.0032, 0.0032], + device='cuda:2'), out_proj_covar=tensor([2.4403e-05, 2.3268e-05, 2.2985e-05, 3.3349e-05, 2.6944e-05, 2.5152e-05, + 3.1301e-05, 3.0880e-05], device='cuda:2') +2022-11-16 04:47:38,075 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=79585.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:47:38,475 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.82 vs. limit=2.0 +2022-11-16 04:48:00,876 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1176, 3.7021, 2.6593, 3.5163, 2.8042, 2.7186, 2.0698, 3.0640], + device='cuda:2'), covar=tensor([0.1411, 0.0319, 0.1088, 0.0390, 0.1044, 0.1027, 0.1905, 0.0594], + device='cuda:2'), in_proj_covar=tensor([0.0159, 0.0143, 0.0159, 0.0148, 0.0176, 0.0169, 0.0164, 0.0161], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 04:48:02,173 INFO [train.py:876] (2/4) Epoch 11, batch 6900, loss[loss=0.1262, simple_loss=0.1376, pruned_loss=0.05738, over 4641.00 frames. ], tot_loss[loss=0.1123, simple_loss=0.1401, pruned_loss=0.04225, over 1082040.15 frames. ], batch size: 135, lr: 7.19e-03, grad_scale: 16.0 +2022-11-16 04:48:02,292 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=79621.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:48:10,569 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.132e+01 1.500e+02 1.797e+02 2.147e+02 3.952e+02, threshold=3.594e+02, percent-clipped=1.0 +2022-11-16 04:48:15,836 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=79641.0, num_to_drop=1, layers_to_drop={2} +2022-11-16 04:48:19,222 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=79646.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:48:26,771 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=79657.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:48:34,580 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=79669.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:48:48,421 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=79689.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:49:05,639 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.45 vs. limit=5.0 +2022-11-16 04:49:08,151 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=79718.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:49:09,341 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=79720.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:49:09,915 INFO [train.py:876] (2/4) Epoch 11, batch 7000, loss[loss=0.1057, simple_loss=0.1378, pruned_loss=0.03676, over 5779.00 frames. ], tot_loss[loss=0.1137, simple_loss=0.1412, pruned_loss=0.04308, over 1079847.25 frames. ], batch size: 21, lr: 7.19e-03, grad_scale: 16.0 +2022-11-16 04:49:16,818 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=79731.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 04:49:18,677 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.908e+01 1.674e+02 1.897e+02 2.324e+02 4.828e+02, threshold=3.794e+02, percent-clipped=2.0 +2022-11-16 04:49:38,281 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=79762.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:49:56,268 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.3636, 4.2907, 2.9675, 4.1281, 3.1959, 2.9397, 2.2680, 3.4957], + device='cuda:2'), covar=tensor([0.1512, 0.0214, 0.0986, 0.0324, 0.0722, 0.1046, 0.2014, 0.0409], + device='cuda:2'), in_proj_covar=tensor([0.0158, 0.0142, 0.0158, 0.0146, 0.0175, 0.0168, 0.0163, 0.0160], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 04:50:15,416 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.8615, 1.3133, 1.7121, 1.0798, 1.6703, 1.6772, 1.1953, 1.7197], + device='cuda:2'), covar=tensor([0.0362, 0.0464, 0.0284, 0.0661, 0.1202, 0.0470, 0.0731, 0.0259], + device='cuda:2'), in_proj_covar=tensor([0.0014, 0.0022, 0.0015, 0.0020, 0.0016, 0.0014, 0.0021, 0.0015], + device='cuda:2'), out_proj_covar=tensor([7.9339e-05, 1.0920e-04, 8.2570e-05, 9.8390e-05, 8.4736e-05, 7.9534e-05, + 1.0476e-04, 7.9138e-05], device='cuda:2') +2022-11-16 04:50:18,626 INFO [train.py:876] (2/4) Epoch 11, batch 7100, loss[loss=0.08071, simple_loss=0.1124, pruned_loss=0.02448, over 5302.00 frames. ], tot_loss[loss=0.113, simple_loss=0.1408, pruned_loss=0.04258, over 1085397.97 frames. ], batch size: 9, lr: 7.19e-03, grad_scale: 16.0 +2022-11-16 04:50:19,390 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.4910, 4.2090, 3.2602, 2.0625, 3.9037, 1.5858, 3.9034, 2.1773], + device='cuda:2'), covar=tensor([0.1371, 0.0157, 0.0670, 0.1792, 0.0205, 0.1889, 0.0266, 0.1579], + device='cuda:2'), in_proj_covar=tensor([0.0122, 0.0104, 0.0115, 0.0113, 0.0101, 0.0122, 0.0099, 0.0112], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 04:50:20,098 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=79823.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:50:27,111 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.051e+02 1.428e+02 1.794e+02 2.274e+02 4.053e+02, threshold=3.587e+02, percent-clipped=1.0 +2022-11-16 04:51:17,651 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.73 vs. limit=2.0 +2022-11-16 04:51:27,331 INFO [train.py:876] (2/4) Epoch 11, batch 7200, loss[loss=0.1492, simple_loss=0.1514, pruned_loss=0.07356, over 5769.00 frames. ], tot_loss[loss=0.1118, simple_loss=0.1402, pruned_loss=0.04171, over 1086332.15 frames. ], batch size: 16, lr: 7.18e-03, grad_scale: 16.0 +2022-11-16 04:51:28,755 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.9738, 4.4048, 3.9348, 4.3268, 4.3358, 3.6412, 4.0008, 3.7272], + device='cuda:2'), covar=tensor([0.0571, 0.0580, 0.1729, 0.0604, 0.0612, 0.0731, 0.0964, 0.0780], + device='cuda:2'), in_proj_covar=tensor([0.0137, 0.0182, 0.0276, 0.0176, 0.0221, 0.0176, 0.0190, 0.0174], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0004, 0.0003, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 04:51:35,801 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.528e+01 1.486e+02 1.793e+02 2.179e+02 3.743e+02, threshold=3.587e+02, percent-clipped=3.0 +2022-11-16 04:51:40,437 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=79941.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:51:40,483 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=79941.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 04:52:01,292 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.77 vs. limit=2.0 +2022-11-16 04:52:10,720 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.2061, 2.7368, 3.9055, 3.5822, 4.2730, 3.0374, 3.6638, 4.3860], + device='cuda:2'), covar=tensor([0.0555, 0.1678, 0.0839, 0.1206, 0.0435, 0.1391, 0.1287, 0.0702], + device='cuda:2'), in_proj_covar=tensor([0.0246, 0.0192, 0.0213, 0.0213, 0.0238, 0.0196, 0.0223, 0.0230], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 04:52:11,196 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=79989.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 04:52:11,249 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=79989.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:52:57,789 INFO [train.py:876] (2/4) Epoch 12, batch 0, loss[loss=0.06403, simple_loss=0.1008, pruned_loss=0.01364, over 4727.00 frames. ], tot_loss[loss=0.06403, simple_loss=0.1008, pruned_loss=0.01364, over 4727.00 frames. ], batch size: 5, lr: 6.88e-03, grad_scale: 16.0 +2022-11-16 04:52:57,790 INFO [train.py:899] (2/4) Computing validation loss +2022-11-16 04:53:01,821 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.6089, 4.1688, 3.0764, 4.0814, 3.2757, 3.0396, 2.4312, 3.5208], + device='cuda:2'), covar=tensor([0.1116, 0.0226, 0.0816, 0.0264, 0.0786, 0.0744, 0.1490, 0.0422], + device='cuda:2'), in_proj_covar=tensor([0.0155, 0.0141, 0.0155, 0.0144, 0.0170, 0.0163, 0.0159, 0.0156], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 04:53:03,268 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.9257, 3.7160, 3.7892, 3.4854, 3.9658, 3.8746, 3.7851, 3.9512], + device='cuda:2'), covar=tensor([0.0286, 0.0407, 0.0421, 0.0492, 0.0355, 0.0205, 0.0326, 0.0429], + device='cuda:2'), in_proj_covar=tensor([0.0142, 0.0152, 0.0110, 0.0143, 0.0177, 0.0105, 0.0124, 0.0151], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-16 04:53:14,374 INFO [train.py:908] (2/4) Epoch 12, validation: loss=0.1725, simple_loss=0.1858, pruned_loss=0.07956, over 1530663.00 frames. +2022-11-16 04:53:14,375 INFO [train.py:909] (2/4) Maximum memory allocated so far is 4715MB +2022-11-16 04:53:15,909 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.6895, 2.0134, 1.9483, 1.3354, 2.2108, 2.3900, 2.2159, 2.3650], + device='cuda:2'), covar=tensor([0.1959, 0.1503, 0.1581, 0.2713, 0.0998, 0.0990, 0.0629, 0.1190], + device='cuda:2'), in_proj_covar=tensor([0.0170, 0.0183, 0.0166, 0.0187, 0.0184, 0.0203, 0.0168, 0.0187], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 04:53:31,366 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=80013.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:53:36,402 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=80020.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:53:43,637 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=80031.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 04:53:45,398 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.136e+01 1.488e+02 1.843e+02 2.324e+02 4.228e+02, threshold=3.685e+02, percent-clipped=3.0 +2022-11-16 04:53:47,754 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=80037.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:54:07,299 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.0431, 3.8636, 3.8855, 3.6589, 4.0114, 3.7126, 1.5972, 4.2090], + device='cuda:2'), covar=tensor([0.0247, 0.0431, 0.0360, 0.0330, 0.0323, 0.0424, 0.2925, 0.0279], + device='cuda:2'), in_proj_covar=tensor([0.0103, 0.0086, 0.0088, 0.0081, 0.0101, 0.0090, 0.0131, 0.0107], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 04:54:08,553 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=80068.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:54:15,972 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=80079.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 04:54:17,109 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.73 vs. limit=2.0 +2022-11-16 04:54:25,349 INFO [train.py:876] (2/4) Epoch 12, batch 100, loss[loss=0.1143, simple_loss=0.139, pruned_loss=0.04483, over 5638.00 frames. ], tot_loss[loss=0.1131, simple_loss=0.1417, pruned_loss=0.04229, over 434714.82 frames. ], batch size: 29, lr: 6.87e-03, grad_scale: 16.0 +2022-11-16 04:54:42,308 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=80118.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:54:43,957 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.01 vs. limit=2.0 +2022-11-16 04:54:52,892 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.598e+01 1.606e+02 2.099e+02 2.573e+02 5.035e+02, threshold=4.198e+02, percent-clipped=4.0 +2022-11-16 04:54:55,785 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.9911, 4.5951, 4.1939, 3.7660, 2.2216, 4.4749, 2.5259, 3.9978], + device='cuda:2'), covar=tensor([0.0379, 0.0103, 0.0155, 0.0265, 0.0603, 0.0134, 0.0529, 0.0119], + device='cuda:2'), in_proj_covar=tensor([0.0191, 0.0174, 0.0180, 0.0200, 0.0190, 0.0178, 0.0188, 0.0180], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 04:55:16,213 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.3865, 3.3909, 3.4171, 3.2135, 3.5199, 3.2896, 1.3224, 3.5988], + device='cuda:2'), covar=tensor([0.0312, 0.0338, 0.0349, 0.0312, 0.0316, 0.0396, 0.3320, 0.0313], + device='cuda:2'), in_proj_covar=tensor([0.0101, 0.0085, 0.0086, 0.0079, 0.0100, 0.0089, 0.0130, 0.0105], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 04:55:32,807 INFO [train.py:876] (2/4) Epoch 12, batch 200, loss[loss=0.08909, simple_loss=0.1091, pruned_loss=0.03453, over 5160.00 frames. ], tot_loss[loss=0.1148, simple_loss=0.1428, pruned_loss=0.04343, over 697414.30 frames. ], batch size: 8, lr: 6.87e-03, grad_scale: 16.0 +2022-11-16 04:56:01,418 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.921e+01 1.528e+02 1.725e+02 2.144e+02 5.994e+02, threshold=3.450e+02, percent-clipped=2.0 +2022-11-16 04:56:05,495 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=80241.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:56:33,650 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=80283.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:56:35,950 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.4675, 1.2877, 1.2119, 0.7427, 1.1777, 1.4446, 0.8035, 0.9974], + device='cuda:2'), covar=tensor([0.0198, 0.0588, 0.0264, 0.0489, 0.0250, 0.0219, 0.0575, 0.0340], + device='cuda:2'), in_proj_covar=tensor([0.0014, 0.0023, 0.0015, 0.0019, 0.0016, 0.0014, 0.0021, 0.0015], + device='cuda:2'), out_proj_covar=tensor([7.9731e-05, 1.0959e-04, 8.2410e-05, 9.8145e-05, 8.5108e-05, 7.9448e-05, + 1.0471e-04, 7.9183e-05], device='cuda:2') +2022-11-16 04:56:37,806 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=80289.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:56:40,373 INFO [train.py:876] (2/4) Epoch 12, batch 300, loss[loss=0.08831, simple_loss=0.1214, pruned_loss=0.02763, over 5498.00 frames. ], tot_loss[loss=0.1129, simple_loss=0.1413, pruned_loss=0.04223, over 858026.04 frames. ], batch size: 12, lr: 6.86e-03, grad_scale: 16.0 +2022-11-16 04:56:53,753 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=80313.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:57:08,168 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.446e+01 1.444e+02 1.739e+02 2.202e+02 4.917e+02, threshold=3.479e+02, percent-clipped=4.0 +2022-11-16 04:57:14,551 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=80344.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:57:22,063 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.8999, 1.6007, 1.9745, 1.1211, 1.4598, 1.8003, 1.7696, 1.6467], + device='cuda:2'), covar=tensor([0.1888, 0.1172, 0.1006, 0.1587, 0.2884, 0.2102, 0.0664, 0.0677], + device='cuda:2'), in_proj_covar=tensor([0.0015, 0.0023, 0.0016, 0.0020, 0.0016, 0.0015, 0.0021, 0.0015], + device='cuda:2'), out_proj_covar=tensor([8.1199e-05, 1.1119e-04, 8.3327e-05, 9.9816e-05, 8.6343e-05, 8.0877e-05, + 1.0620e-04, 8.0474e-05], device='cuda:2') +2022-11-16 04:57:25,856 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=80361.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:57:47,437 INFO [train.py:876] (2/4) Epoch 12, batch 400, loss[loss=0.08294, simple_loss=0.1287, pruned_loss=0.01859, over 5722.00 frames. ], tot_loss[loss=0.1133, simple_loss=0.1416, pruned_loss=0.04254, over 950643.29 frames. ], batch size: 15, lr: 6.86e-03, grad_scale: 16.0 +2022-11-16 04:57:47,573 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=80393.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:57:54,803 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.0914, 4.8981, 3.7022, 2.2356, 4.4320, 2.2031, 4.4454, 2.7994], + device='cuda:2'), covar=tensor([0.1155, 0.0134, 0.0544, 0.2137, 0.0181, 0.1717, 0.0205, 0.1461], + device='cuda:2'), in_proj_covar=tensor([0.0121, 0.0104, 0.0115, 0.0113, 0.0101, 0.0121, 0.0100, 0.0111], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 04:58:04,476 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=80418.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:58:06,753 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.03 vs. limit=2.0 +2022-11-16 04:58:15,554 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.295e+01 1.482e+02 1.820e+02 2.321e+02 3.687e+02, threshold=3.641e+02, percent-clipped=1.0 +2022-11-16 04:58:25,628 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.1412, 4.7090, 4.9530, 4.6353, 5.1977, 5.0378, 4.4731, 5.1872], + device='cuda:2'), covar=tensor([0.0353, 0.0314, 0.0407, 0.0309, 0.0323, 0.0176, 0.0255, 0.0234], + device='cuda:2'), in_proj_covar=tensor([0.0144, 0.0156, 0.0112, 0.0146, 0.0180, 0.0107, 0.0128, 0.0154], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-16 04:58:28,308 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=80454.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:58:34,737 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=80463.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:58:36,494 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=80466.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:58:39,720 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.0128, 2.3953, 3.5272, 3.1115, 3.9578, 2.4649, 3.3379, 3.9174], + device='cuda:2'), covar=tensor([0.0613, 0.1613, 0.0830, 0.1522, 0.0585, 0.1628, 0.1268, 0.0791], + device='cuda:2'), in_proj_covar=tensor([0.0242, 0.0192, 0.0213, 0.0212, 0.0238, 0.0197, 0.0225, 0.0231], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 04:58:54,188 INFO [train.py:876] (2/4) Epoch 12, batch 500, loss[loss=0.0787, simple_loss=0.1133, pruned_loss=0.02204, over 5471.00 frames. ], tot_loss[loss=0.1125, simple_loss=0.1409, pruned_loss=0.04208, over 1003681.25 frames. ], batch size: 10, lr: 6.86e-03, grad_scale: 16.0 +2022-11-16 04:58:54,998 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=80494.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:59:15,502 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=80524.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 04:59:22,649 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.271e+01 1.520e+02 1.906e+02 2.365e+02 4.910e+02, threshold=3.812e+02, percent-clipped=3.0 +2022-11-16 04:59:36,595 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=80555.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:00:01,974 INFO [train.py:876] (2/4) Epoch 12, batch 600, loss[loss=0.09747, simple_loss=0.1252, pruned_loss=0.03487, over 5423.00 frames. ], tot_loss[loss=0.113, simple_loss=0.1412, pruned_loss=0.04246, over 1032943.18 frames. ], batch size: 9, lr: 6.85e-03, grad_scale: 16.0 +2022-11-16 05:00:30,597 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.813e+01 1.490e+02 1.792e+02 2.361e+02 3.754e+02, threshold=3.583e+02, percent-clipped=0.0 +2022-11-16 05:00:33,649 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=80639.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:01:10,387 INFO [train.py:876] (2/4) Epoch 12, batch 700, loss[loss=0.1142, simple_loss=0.1484, pruned_loss=0.04, over 5799.00 frames. ], tot_loss[loss=0.1106, simple_loss=0.1393, pruned_loss=0.04091, over 1053811.19 frames. ], batch size: 22, lr: 6.85e-03, grad_scale: 16.0 +2022-11-16 05:01:27,792 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.4884, 1.9216, 1.6868, 1.3509, 1.5813, 2.0856, 2.0135, 2.0912], + device='cuda:2'), covar=tensor([0.1611, 0.1178, 0.2021, 0.2399, 0.1510, 0.1065, 0.0802, 0.1253], + device='cuda:2'), in_proj_covar=tensor([0.0171, 0.0183, 0.0170, 0.0189, 0.0186, 0.0204, 0.0169, 0.0190], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 05:01:27,822 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.5597, 1.8993, 2.3144, 2.2811, 2.3972, 1.7564, 2.2500, 2.5233], + device='cuda:2'), covar=tensor([0.0542, 0.0906, 0.0647, 0.0580, 0.0629, 0.1115, 0.0730, 0.0573], + device='cuda:2'), in_proj_covar=tensor([0.0243, 0.0194, 0.0215, 0.0213, 0.0237, 0.0197, 0.0223, 0.0230], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 05:01:38,616 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.003e+02 1.475e+02 1.756e+02 2.175e+02 4.412e+02, threshold=3.511e+02, percent-clipped=5.0 +2022-11-16 05:01:39,453 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=80736.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:01:48,506 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=80749.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:02:04,773 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=80773.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:02:17,709 INFO [train.py:876] (2/4) Epoch 12, batch 800, loss[loss=0.1201, simple_loss=0.147, pruned_loss=0.04665, over 5612.00 frames. ], tot_loss[loss=0.1113, simple_loss=0.1403, pruned_loss=0.04111, over 1072598.49 frames. ], batch size: 38, lr: 6.84e-03, grad_scale: 16.0 +2022-11-16 05:02:20,489 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=80797.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:02:35,213 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=80819.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:02:45,861 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=80834.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:02:46,324 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.454e+01 1.512e+02 1.839e+02 2.382e+02 3.696e+02, threshold=3.678e+02, percent-clipped=2.0 +2022-11-16 05:02:49,445 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.96 vs. limit=2.0 +2022-11-16 05:02:56,117 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=80850.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:03:15,657 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.56 vs. limit=2.0 +2022-11-16 05:03:22,586 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.82 vs. limit=5.0 +2022-11-16 05:03:25,609 INFO [train.py:876] (2/4) Epoch 12, batch 900, loss[loss=0.1147, simple_loss=0.1426, pruned_loss=0.0434, over 5555.00 frames. ], tot_loss[loss=0.1097, simple_loss=0.1389, pruned_loss=0.04022, over 1084197.57 frames. ], batch size: 25, lr: 6.84e-03, grad_scale: 8.0 +2022-11-16 05:03:30,376 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.3210, 4.4778, 4.1374, 3.7771, 2.3667, 4.5162, 2.4274, 3.7217], + device='cuda:2'), covar=tensor([0.0330, 0.0190, 0.0161, 0.0523, 0.0670, 0.0154, 0.0554, 0.0204], + device='cuda:2'), in_proj_covar=tensor([0.0190, 0.0174, 0.0178, 0.0201, 0.0189, 0.0177, 0.0188, 0.0181], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 05:03:50,449 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.4391, 5.1436, 4.6427, 5.1647, 4.9967, 4.1331, 4.5901, 4.3489], + device='cuda:2'), covar=tensor([0.0367, 0.0296, 0.1122, 0.0289, 0.0441, 0.0568, 0.0407, 0.0379], + device='cuda:2'), in_proj_covar=tensor([0.0136, 0.0179, 0.0277, 0.0174, 0.0221, 0.0173, 0.0190, 0.0175], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0004, 0.0002, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 05:03:56,034 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.157e+02 1.546e+02 1.867e+02 2.262e+02 5.374e+02, threshold=3.734e+02, percent-clipped=6.0 +2022-11-16 05:03:58,281 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=80939.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:04:18,687 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.48 vs. limit=2.0 +2022-11-16 05:04:20,407 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.5585, 5.3875, 4.8533, 5.4322, 5.3992, 4.7151, 4.9817, 4.6846], + device='cuda:2'), covar=tensor([0.0258, 0.0414, 0.1152, 0.0287, 0.0310, 0.0458, 0.0565, 0.0678], + device='cuda:2'), in_proj_covar=tensor([0.0136, 0.0180, 0.0277, 0.0174, 0.0221, 0.0173, 0.0190, 0.0174], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0004, 0.0003, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 05:04:20,741 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.82 vs. limit=2.0 +2022-11-16 05:04:33,108 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=80987.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:04:35,540 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0260, 2.5217, 3.2585, 1.4804, 3.0378, 3.6336, 3.2134, 3.3514], + device='cuda:2'), covar=tensor([0.2494, 0.1828, 0.0831, 0.3184, 0.0815, 0.0743, 0.0678, 0.1047], + device='cuda:2'), in_proj_covar=tensor([0.0170, 0.0183, 0.0168, 0.0186, 0.0186, 0.0203, 0.0167, 0.0188], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 05:04:37,287 INFO [train.py:876] (2/4) Epoch 12, batch 1000, loss[loss=0.07495, simple_loss=0.1191, pruned_loss=0.01542, over 5505.00 frames. ], tot_loss[loss=0.11, simple_loss=0.1386, pruned_loss=0.04074, over 1080078.64 frames. ], batch size: 17, lr: 6.83e-03, grad_scale: 8.0 +2022-11-16 05:05:06,153 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.560e+01 1.507e+02 1.766e+02 2.175e+02 5.874e+02, threshold=3.531e+02, percent-clipped=3.0 +2022-11-16 05:05:15,380 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=81049.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:05:21,899 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.7219, 1.5798, 1.7207, 1.4147, 1.7865, 2.1412, 1.2599, 1.4029], + device='cuda:2'), covar=tensor([0.1081, 0.0604, 0.0668, 0.0728, 0.0779, 0.0378, 0.0851, 0.0528], + device='cuda:2'), in_proj_covar=tensor([0.0014, 0.0023, 0.0016, 0.0020, 0.0016, 0.0015, 0.0021, 0.0015], + device='cuda:2'), out_proj_covar=tensor([8.0063e-05, 1.1132e-04, 8.4239e-05, 9.9190e-05, 8.6286e-05, 8.1297e-05, + 1.0595e-04, 8.0500e-05], device='cuda:2') +2022-11-16 05:05:43,843 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=81092.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:05:44,404 INFO [train.py:876] (2/4) Epoch 12, batch 1100, loss[loss=0.1019, simple_loss=0.1377, pruned_loss=0.03307, over 5596.00 frames. ], tot_loss[loss=0.1102, simple_loss=0.1392, pruned_loss=0.04067, over 1084385.46 frames. ], batch size: 24, lr: 6.83e-03, grad_scale: 8.0 +2022-11-16 05:05:44,572 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.7392, 1.1286, 0.8099, 0.7082, 0.8655, 1.0931, 0.6995, 1.1261], + device='cuda:2'), covar=tensor([0.0084, 0.0042, 0.0078, 0.0049, 0.0064, 0.0058, 0.0100, 0.0050], + device='cuda:2'), in_proj_covar=tensor([0.0059, 0.0056, 0.0056, 0.0060, 0.0058, 0.0053, 0.0052, 0.0050], + device='cuda:2'), out_proj_covar=tensor([5.3476e-05, 5.0031e-05, 4.8991e-05, 5.3601e-05, 5.1005e-05, 4.6555e-05, + 4.6564e-05, 4.3959e-05], device='cuda:2') +2022-11-16 05:05:47,051 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=81097.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:06:00,240 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=81116.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:06:02,123 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=81119.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:06:09,123 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=81129.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:06:13,617 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.779e+01 1.469e+02 1.860e+02 2.387e+02 4.762e+02, threshold=3.720e+02, percent-clipped=3.0 +2022-11-16 05:06:18,532 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.29 vs. limit=5.0 +2022-11-16 05:06:23,038 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=81150.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:06:32,645 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.6515, 3.1022, 3.6003, 4.3638, 4.4349, 3.7261, 3.1211, 4.4290], + device='cuda:2'), covar=tensor([0.0303, 0.2558, 0.1561, 0.2206, 0.0639, 0.1888, 0.1927, 0.0543], + device='cuda:2'), in_proj_covar=tensor([0.0247, 0.0196, 0.0189, 0.0295, 0.0221, 0.0200, 0.0189, 0.0242], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0005], + device='cuda:2') +2022-11-16 05:06:34,435 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=81167.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:06:41,060 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=81177.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:06:44,362 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.89 vs. limit=2.0 +2022-11-16 05:06:52,015 INFO [train.py:876] (2/4) Epoch 12, batch 1200, loss[loss=0.1585, simple_loss=0.1776, pruned_loss=0.06965, over 5619.00 frames. ], tot_loss[loss=0.1116, simple_loss=0.1399, pruned_loss=0.04169, over 1076680.87 frames. ], batch size: 32, lr: 6.83e-03, grad_scale: 8.0 +2022-11-16 05:06:55,259 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=81198.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:07:09,959 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.2733, 4.2390, 3.2916, 1.9336, 3.8951, 1.5422, 3.8063, 2.3298], + device='cuda:2'), covar=tensor([0.1693, 0.0146, 0.0685, 0.1994, 0.0199, 0.2130, 0.0303, 0.1628], + device='cuda:2'), in_proj_covar=tensor([0.0120, 0.0104, 0.0115, 0.0112, 0.0100, 0.0121, 0.0099, 0.0110], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 05:07:20,807 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.019e+02 1.439e+02 1.852e+02 2.463e+02 6.772e+02, threshold=3.705e+02, percent-clipped=5.0 +2022-11-16 05:07:55,702 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.5256, 5.1322, 5.4312, 5.0133, 5.5555, 5.4442, 4.7897, 5.6528], + device='cuda:2'), covar=tensor([0.0373, 0.0321, 0.0381, 0.0283, 0.0410, 0.0213, 0.0265, 0.0198], + device='cuda:2'), in_proj_covar=tensor([0.0142, 0.0155, 0.0112, 0.0145, 0.0179, 0.0107, 0.0127, 0.0153], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-16 05:07:59,530 INFO [train.py:876] (2/4) Epoch 12, batch 1300, loss[loss=0.09415, simple_loss=0.1318, pruned_loss=0.02825, over 5753.00 frames. ], tot_loss[loss=0.1109, simple_loss=0.1393, pruned_loss=0.04126, over 1081058.50 frames. ], batch size: 14, lr: 6.82e-03, grad_scale: 8.0 +2022-11-16 05:08:01,924 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.4625, 3.3133, 3.3087, 3.4788, 3.2121, 2.9309, 3.8149, 3.3456], + device='cuda:2'), covar=tensor([0.0442, 0.0908, 0.0624, 0.1155, 0.0586, 0.0531, 0.0696, 0.0809], + device='cuda:2'), in_proj_covar=tensor([0.0087, 0.0109, 0.0095, 0.0119, 0.0088, 0.0080, 0.0145, 0.0102], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 05:08:03,914 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=81299.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:08:05,231 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=81301.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:08:28,630 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.965e+01 1.370e+02 1.672e+02 2.098e+02 3.683e+02, threshold=3.343e+02, percent-clipped=0.0 +2022-11-16 05:08:37,240 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1860, 4.0601, 2.7076, 3.9193, 3.1816, 2.8952, 2.1276, 3.2713], + device='cuda:2'), covar=tensor([0.1661, 0.0318, 0.1234, 0.0342, 0.0899, 0.1097, 0.2097, 0.0574], + device='cuda:2'), in_proj_covar=tensor([0.0158, 0.0141, 0.0157, 0.0148, 0.0174, 0.0167, 0.0161, 0.0161], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 05:08:44,918 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=81360.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:08:46,217 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=81362.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:08:50,341 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.66 vs. limit=2.0 +2022-11-16 05:09:06,330 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=81392.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:09:06,899 INFO [train.py:876] (2/4) Epoch 12, batch 1400, loss[loss=0.1264, simple_loss=0.1427, pruned_loss=0.05507, over 5142.00 frames. ], tot_loss[loss=0.1115, simple_loss=0.1397, pruned_loss=0.04161, over 1082514.38 frames. ], batch size: 91, lr: 6.82e-03, grad_scale: 8.0 +2022-11-16 05:09:08,349 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.7927, 1.3270, 1.8277, 1.4988, 1.7145, 2.1256, 1.6884, 1.5702], + device='cuda:2'), covar=tensor([0.0048, 0.0106, 0.0032, 0.0057, 0.0096, 0.0077, 0.0042, 0.0058], + device='cuda:2'), in_proj_covar=tensor([0.0029, 0.0027, 0.0027, 0.0036, 0.0031, 0.0029, 0.0035, 0.0034], + device='cuda:2'), out_proj_covar=tensor([2.6647e-05, 2.5140e-05, 2.4398e-05, 3.5060e-05, 2.8902e-05, 2.7066e-05, + 3.4171e-05, 3.2968e-05], device='cuda:2') +2022-11-16 05:09:31,116 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=81429.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:09:31,139 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.3113, 1.4363, 1.9396, 1.8734, 1.9766, 2.2586, 1.9677, 1.7677], + device='cuda:2'), covar=tensor([0.0034, 0.0071, 0.0042, 0.0063, 0.0126, 0.0071, 0.0046, 0.0049], + device='cuda:2'), in_proj_covar=tensor([0.0028, 0.0027, 0.0027, 0.0036, 0.0031, 0.0028, 0.0035, 0.0034], + device='cuda:2'), out_proj_covar=tensor([2.6273e-05, 2.4799e-05, 2.4065e-05, 3.4737e-05, 2.8545e-05, 2.6795e-05, + 3.3959e-05, 3.2636e-05], device='cuda:2') +2022-11-16 05:09:33,399 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.7337, 0.6394, 0.8152, 0.6673, 0.7860, 0.7110, 0.4120, 0.7112], + device='cuda:2'), covar=tensor([0.0167, 0.0275, 0.0198, 0.0228, 0.0184, 0.0188, 0.0461, 0.0180], + device='cuda:2'), in_proj_covar=tensor([0.0014, 0.0023, 0.0016, 0.0020, 0.0016, 0.0015, 0.0021, 0.0015], + device='cuda:2'), out_proj_covar=tensor([8.0445e-05, 1.1137e-04, 8.4326e-05, 9.9048e-05, 8.6961e-05, 8.2078e-05, + 1.0505e-04, 8.0155e-05], device='cuda:2') +2022-11-16 05:09:35,795 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.508e+01 1.575e+02 1.879e+02 2.348e+02 6.067e+02, threshold=3.757e+02, percent-clipped=7.0 +2022-11-16 05:09:38,311 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.7755, 2.5178, 3.4902, 3.0357, 3.4504, 2.4861, 3.1767, 3.7445], + device='cuda:2'), covar=tensor([0.0747, 0.1461, 0.0737, 0.1463, 0.0871, 0.1492, 0.1177, 0.0794], + device='cuda:2'), in_proj_covar=tensor([0.0242, 0.0192, 0.0214, 0.0211, 0.0238, 0.0195, 0.0222, 0.0231], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 05:09:38,802 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=81440.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:09:59,257 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.52 vs. limit=2.0 +2022-11-16 05:10:00,101 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=81472.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:10:03,341 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=81477.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:10:06,609 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.5465, 4.0996, 4.4102, 4.1122, 4.5912, 4.4054, 4.1775, 4.5852], + device='cuda:2'), covar=tensor([0.0348, 0.0385, 0.0392, 0.0298, 0.0364, 0.0263, 0.0316, 0.0302], + device='cuda:2'), in_proj_covar=tensor([0.0139, 0.0151, 0.0109, 0.0140, 0.0176, 0.0104, 0.0123, 0.0149], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-16 05:10:11,585 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.01 vs. limit=2.0 +2022-11-16 05:10:14,236 INFO [train.py:876] (2/4) Epoch 12, batch 1500, loss[loss=0.1276, simple_loss=0.1594, pruned_loss=0.04788, over 5766.00 frames. ], tot_loss[loss=0.1126, simple_loss=0.1406, pruned_loss=0.04231, over 1078622.32 frames. ], batch size: 27, lr: 6.81e-03, grad_scale: 8.0 +2022-11-16 05:10:42,646 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.569e+01 1.444e+02 1.773e+02 2.242e+02 5.218e+02, threshold=3.547e+02, percent-clipped=2.0 +2022-11-16 05:11:21,231 INFO [train.py:876] (2/4) Epoch 12, batch 1600, loss[loss=0.1668, simple_loss=0.1728, pruned_loss=0.08037, over 5355.00 frames. ], tot_loss[loss=0.1133, simple_loss=0.141, pruned_loss=0.04279, over 1079032.82 frames. ], batch size: 70, lr: 6.81e-03, grad_scale: 8.0 +2022-11-16 05:11:34,970 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.10 vs. limit=2.0 +2022-11-16 05:11:51,016 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.578e+01 1.477e+02 1.862e+02 2.272e+02 5.995e+02, threshold=3.723e+02, percent-clipped=4.0 +2022-11-16 05:11:56,795 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.8896, 1.9451, 2.3013, 2.0950, 1.3167, 1.9575, 1.4689, 1.8203], + device='cuda:2'), covar=tensor([0.0216, 0.0119, 0.0124, 0.0151, 0.0322, 0.0141, 0.0327, 0.0176], + device='cuda:2'), in_proj_covar=tensor([0.0190, 0.0175, 0.0179, 0.0199, 0.0190, 0.0178, 0.0188, 0.0182], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 05:12:04,149 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=81655.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:12:05,445 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=81657.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:12:14,081 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=81670.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:12:15,479 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.73 vs. limit=2.0 +2022-11-16 05:12:16,084 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.47 vs. limit=2.0 +2022-11-16 05:12:20,617 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.5033, 1.8295, 1.5822, 1.2010, 1.6528, 0.9512, 1.8800, 1.1589], + device='cuda:2'), covar=tensor([0.1228, 0.0442, 0.1309, 0.1423, 0.0506, 0.2300, 0.0475, 0.1418], + device='cuda:2'), in_proj_covar=tensor([0.0121, 0.0105, 0.0115, 0.0112, 0.0100, 0.0121, 0.0100, 0.0110], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 05:12:29,717 INFO [train.py:876] (2/4) Epoch 12, batch 1700, loss[loss=0.1204, simple_loss=0.1467, pruned_loss=0.04704, over 5578.00 frames. ], tot_loss[loss=0.1121, simple_loss=0.1401, pruned_loss=0.04204, over 1074441.87 frames. ], batch size: 40, lr: 6.80e-03, grad_scale: 8.0 +2022-11-16 05:12:40,808 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=6.63 vs. limit=5.0 +2022-11-16 05:12:55,329 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=81731.0, num_to_drop=1, layers_to_drop={3} +2022-11-16 05:12:59,065 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.982e+01 1.420e+02 1.796e+02 2.143e+02 4.079e+02, threshold=3.592e+02, percent-clipped=2.0 +2022-11-16 05:13:23,668 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=81772.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:13:37,839 INFO [train.py:876] (2/4) Epoch 12, batch 1800, loss[loss=0.1836, simple_loss=0.1956, pruned_loss=0.08584, over 5574.00 frames. ], tot_loss[loss=0.1119, simple_loss=0.1399, pruned_loss=0.04196, over 1080472.26 frames. ], batch size: 43, lr: 6.80e-03, grad_scale: 8.0 +2022-11-16 05:13:56,462 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=81820.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:13:59,037 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.8150, 4.6883, 5.1238, 4.6978, 4.5899, 4.5544, 5.2776, 4.8934], + device='cuda:2'), covar=tensor([0.0342, 0.1318, 0.0342, 0.1494, 0.0369, 0.0281, 0.0747, 0.0505], + device='cuda:2'), in_proj_covar=tensor([0.0088, 0.0111, 0.0097, 0.0122, 0.0091, 0.0082, 0.0148, 0.0104], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 05:14:06,569 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.901e+01 1.448e+02 1.812e+02 2.296e+02 5.003e+02, threshold=3.624e+02, percent-clipped=1.0 +2022-11-16 05:14:45,099 INFO [train.py:876] (2/4) Epoch 12, batch 1900, loss[loss=0.1255, simple_loss=0.1509, pruned_loss=0.05011, over 5581.00 frames. ], tot_loss[loss=0.1126, simple_loss=0.1407, pruned_loss=0.04222, over 1080292.70 frames. ], batch size: 25, lr: 6.80e-03, grad_scale: 8.0 +2022-11-16 05:15:14,249 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.303e+01 1.364e+02 1.744e+02 2.090e+02 4.733e+02, threshold=3.488e+02, percent-clipped=3.0 +2022-11-16 05:15:27,308 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=81955.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:15:28,625 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=81957.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:15:48,681 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=81987.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:15:52,895 INFO [train.py:876] (2/4) Epoch 12, batch 2000, loss[loss=0.08674, simple_loss=0.1292, pruned_loss=0.02216, over 5534.00 frames. ], tot_loss[loss=0.1102, simple_loss=0.139, pruned_loss=0.0407, over 1077690.57 frames. ], batch size: 13, lr: 6.79e-03, grad_scale: 8.0 +2022-11-16 05:15:59,646 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=82003.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:16:01,283 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=82005.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:16:15,350 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=82026.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 05:16:16,138 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0073, 1.5878, 2.1441, 1.5941, 2.2421, 1.9569, 1.5916, 1.4996], + device='cuda:2'), covar=tensor([0.0536, 0.0516, 0.0288, 0.0606, 0.0495, 0.0424, 0.0482, 0.0736], + device='cuda:2'), in_proj_covar=tensor([0.0014, 0.0023, 0.0016, 0.0020, 0.0016, 0.0015, 0.0022, 0.0015], + device='cuda:2'), out_proj_covar=tensor([8.1281e-05, 1.1301e-04, 8.5127e-05, 9.9520e-05, 8.7592e-05, 8.2317e-05, + 1.0701e-04, 8.1337e-05], device='cuda:2') +2022-11-16 05:16:22,027 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.158e+01 1.417e+02 1.705e+02 2.224e+02 2.971e+02, threshold=3.410e+02, percent-clipped=0.0 +2022-11-16 05:16:30,584 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=82048.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:16:35,231 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=82055.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:16:45,537 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.2041, 3.8416, 4.0000, 3.7439, 4.2110, 3.9778, 3.7961, 4.1682], + device='cuda:2'), covar=tensor([0.0357, 0.0347, 0.0469, 0.0317, 0.0352, 0.0312, 0.0376, 0.0362], + device='cuda:2'), in_proj_covar=tensor([0.0138, 0.0149, 0.0108, 0.0139, 0.0174, 0.0103, 0.0122, 0.0149], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-16 05:17:00,878 INFO [train.py:876] (2/4) Epoch 12, batch 2100, loss[loss=0.07789, simple_loss=0.1204, pruned_loss=0.01771, over 5762.00 frames. ], tot_loss[loss=0.1098, simple_loss=0.1389, pruned_loss=0.04035, over 1077836.66 frames. ], batch size: 15, lr: 6.79e-03, grad_scale: 8.0 +2022-11-16 05:17:16,451 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=82116.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:17:29,603 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.760e+01 1.480e+02 1.837e+02 2.085e+02 4.685e+02, threshold=3.673e+02, percent-clipped=4.0 +2022-11-16 05:17:33,199 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.68 vs. limit=2.0 +2022-11-16 05:18:07,797 INFO [train.py:876] (2/4) Epoch 12, batch 2200, loss[loss=0.1006, simple_loss=0.142, pruned_loss=0.0296, over 5786.00 frames. ], tot_loss[loss=0.111, simple_loss=0.1399, pruned_loss=0.04108, over 1083025.04 frames. ], batch size: 21, lr: 6.78e-03, grad_scale: 8.0 +2022-11-16 05:18:17,948 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.97 vs. limit=2.0 +2022-11-16 05:18:27,755 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0459, 4.2879, 2.5662, 3.9889, 3.3061, 2.7930, 2.4154, 3.5149], + device='cuda:2'), covar=tensor([0.1687, 0.0255, 0.1246, 0.0431, 0.0828, 0.1059, 0.1654, 0.0520], + device='cuda:2'), in_proj_covar=tensor([0.0159, 0.0142, 0.0159, 0.0150, 0.0178, 0.0168, 0.0161, 0.0162], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 05:18:28,382 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.8579, 4.4885, 4.6677, 4.3865, 4.9161, 4.6718, 4.3691, 4.8996], + device='cuda:2'), covar=tensor([0.0306, 0.0291, 0.0374, 0.0314, 0.0241, 0.0219, 0.0241, 0.0226], + device='cuda:2'), in_proj_covar=tensor([0.0139, 0.0150, 0.0109, 0.0140, 0.0174, 0.0105, 0.0122, 0.0149], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-16 05:18:37,331 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.617e+01 1.469e+02 1.927e+02 2.568e+02 3.955e+02, threshold=3.853e+02, percent-clipped=1.0 +2022-11-16 05:18:50,692 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.5955, 3.6659, 3.5156, 3.4296, 3.6004, 3.6494, 1.4103, 3.7086], + device='cuda:2'), covar=tensor([0.0293, 0.0308, 0.0511, 0.0313, 0.0403, 0.0412, 0.3203, 0.0379], + device='cuda:2'), in_proj_covar=tensor([0.0103, 0.0088, 0.0090, 0.0082, 0.0103, 0.0090, 0.0133, 0.0109], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 05:19:04,973 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.90 vs. limit=2.0 +2022-11-16 05:19:15,476 INFO [train.py:876] (2/4) Epoch 12, batch 2300, loss[loss=0.09934, simple_loss=0.1373, pruned_loss=0.03068, over 5535.00 frames. ], tot_loss[loss=0.1088, simple_loss=0.138, pruned_loss=0.03986, over 1085014.36 frames. ], batch size: 13, lr: 6.78e-03, grad_scale: 8.0 +2022-11-16 05:19:22,732 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.0184, 4.8610, 4.6068, 4.5975, 4.8992, 4.9140, 2.2915, 5.0985], + device='cuda:2'), covar=tensor([0.0179, 0.0238, 0.0360, 0.0193, 0.0370, 0.0334, 0.2747, 0.0280], + device='cuda:2'), in_proj_covar=tensor([0.0104, 0.0089, 0.0090, 0.0082, 0.0103, 0.0091, 0.0133, 0.0109], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 05:19:34,123 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.4342, 4.1829, 3.2332, 1.8688, 3.8388, 1.6634, 3.7671, 2.0660], + device='cuda:2'), covar=tensor([0.1299, 0.0124, 0.0606, 0.1954, 0.0208, 0.1764, 0.0303, 0.1567], + device='cuda:2'), in_proj_covar=tensor([0.0118, 0.0102, 0.0113, 0.0109, 0.0099, 0.0119, 0.0098, 0.0108], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 05:19:37,392 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=82326.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:19:44,122 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.080e+02 1.401e+02 1.761e+02 2.241e+02 4.168e+02, threshold=3.523e+02, percent-clipped=1.0 +2022-11-16 05:19:49,493 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=82343.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:19:51,462 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=82346.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:20:09,898 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=82374.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:20:10,649 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.5731, 2.5968, 2.2756, 2.7397, 1.9609, 2.0147, 2.4346, 2.9244], + device='cuda:2'), covar=tensor([0.1297, 0.1326, 0.2619, 0.1345, 0.2198, 0.1545, 0.1824, 0.1825], + device='cuda:2'), in_proj_covar=tensor([0.0111, 0.0105, 0.0104, 0.0103, 0.0091, 0.0101, 0.0098, 0.0079], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-16 05:20:22,511 INFO [train.py:876] (2/4) Epoch 12, batch 2400, loss[loss=0.08703, simple_loss=0.1183, pruned_loss=0.0279, over 5724.00 frames. ], tot_loss[loss=0.1087, simple_loss=0.1379, pruned_loss=0.03972, over 1082129.12 frames. ], batch size: 12, lr: 6.78e-03, grad_scale: 8.0 +2022-11-16 05:20:23,983 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.7226, 4.0509, 3.6115, 4.0567, 3.9641, 3.3853, 3.6071, 3.4319], + device='cuda:2'), covar=tensor([0.0574, 0.0422, 0.1350, 0.0404, 0.0485, 0.0554, 0.0708, 0.0638], + device='cuda:2'), in_proj_covar=tensor([0.0133, 0.0177, 0.0273, 0.0172, 0.0219, 0.0170, 0.0187, 0.0174], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0004, 0.0002, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 05:20:27,038 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.3187, 2.2723, 2.4511, 3.5297, 3.3731, 2.7099, 2.3126, 3.4359], + device='cuda:2'), covar=tensor([0.1614, 0.2924, 0.2960, 0.2384, 0.2032, 0.3100, 0.2500, 0.1688], + device='cuda:2'), in_proj_covar=tensor([0.0253, 0.0196, 0.0189, 0.0300, 0.0223, 0.0206, 0.0193, 0.0244], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0006], + device='cuda:2') +2022-11-16 05:20:32,557 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=82407.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:20:34,691 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.16 vs. limit=2.0 +2022-11-16 05:20:35,165 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=82411.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:20:37,953 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.0897, 2.1415, 2.8208, 2.6449, 2.5346, 2.0950, 2.6037, 3.0698], + device='cuda:2'), covar=tensor([0.0587, 0.1161, 0.0926, 0.1155, 0.1241, 0.1250, 0.0964, 0.0709], + device='cuda:2'), in_proj_covar=tensor([0.0247, 0.0198, 0.0220, 0.0217, 0.0242, 0.0201, 0.0228, 0.0234], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 05:20:51,660 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.786e+01 1.416e+02 1.815e+02 2.094e+02 3.842e+02, threshold=3.631e+02, percent-clipped=3.0 +2022-11-16 05:21:29,661 INFO [train.py:876] (2/4) Epoch 12, batch 2500, loss[loss=0.1396, simple_loss=0.1654, pruned_loss=0.05687, over 5683.00 frames. ], tot_loss[loss=0.1103, simple_loss=0.1393, pruned_loss=0.04066, over 1087598.37 frames. ], batch size: 34, lr: 6.77e-03, grad_scale: 8.0 +2022-11-16 05:21:29,821 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=82493.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 05:21:58,347 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.955e+01 1.483e+02 1.924e+02 2.282e+02 6.804e+02, threshold=3.849e+02, percent-clipped=1.0 +2022-11-16 05:22:10,936 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=82554.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 05:22:21,439 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=82569.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:22:24,771 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.6386, 2.1739, 2.6173, 3.6140, 3.6110, 2.5962, 2.3641, 3.6248], + device='cuda:2'), covar=tensor([0.0939, 0.2917, 0.2387, 0.2837, 0.1139, 0.3359, 0.2310, 0.0814], + device='cuda:2'), in_proj_covar=tensor([0.0252, 0.0197, 0.0188, 0.0301, 0.0222, 0.0205, 0.0193, 0.0245], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0006], + device='cuda:2') +2022-11-16 05:22:37,473 INFO [train.py:876] (2/4) Epoch 12, batch 2600, loss[loss=0.0667, simple_loss=0.09503, pruned_loss=0.01919, over 5543.00 frames. ], tot_loss[loss=0.109, simple_loss=0.1383, pruned_loss=0.03989, over 1084213.61 frames. ], batch size: 10, lr: 6.77e-03, grad_scale: 8.0 +2022-11-16 05:22:43,660 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.43 vs. limit=2.0 +2022-11-16 05:23:03,091 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=82630.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:23:06,798 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.089e+01 1.435e+02 1.814e+02 2.327e+02 4.653e+02, threshold=3.628e+02, percent-clipped=2.0 +2022-11-16 05:23:11,768 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=82643.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:23:37,030 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.69 vs. limit=2.0 +2022-11-16 05:23:43,823 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=82691.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:23:45,103 INFO [train.py:876] (2/4) Epoch 12, batch 2700, loss[loss=0.05553, simple_loss=0.08123, pruned_loss=0.01491, over 5381.00 frames. ], tot_loss[loss=0.1085, simple_loss=0.1377, pruned_loss=0.03961, over 1083104.25 frames. ], batch size: 9, lr: 6.76e-03, grad_scale: 8.0 +2022-11-16 05:23:51,439 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=82702.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:23:54,776 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.4342, 2.8346, 4.1201, 3.6579, 4.5656, 3.0653, 3.9561, 4.4601], + device='cuda:2'), covar=tensor([0.0495, 0.1257, 0.0547, 0.1100, 0.0267, 0.1267, 0.1074, 0.0723], + device='cuda:2'), in_proj_covar=tensor([0.0243, 0.0196, 0.0219, 0.0216, 0.0240, 0.0199, 0.0225, 0.0231], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 05:23:57,158 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=82711.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:24:14,435 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.009e+02 1.412e+02 1.711e+02 2.254e+02 4.662e+02, threshold=3.423e+02, percent-clipped=2.0 +2022-11-16 05:24:19,151 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.2119, 2.2310, 2.8359, 1.7607, 1.3368, 3.0305, 2.5993, 2.2989], + device='cuda:2'), covar=tensor([0.1280, 0.1241, 0.0692, 0.2541, 0.2597, 0.1087, 0.0803, 0.1086], + device='cuda:2'), in_proj_covar=tensor([0.0102, 0.0093, 0.0093, 0.0099, 0.0073, 0.0067, 0.0076, 0.0088], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-16 05:24:29,687 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=82759.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:24:43,027 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.2201, 4.6762, 4.2216, 4.7335, 4.6495, 3.8950, 4.3645, 4.0250], + device='cuda:2'), covar=tensor([0.0334, 0.0409, 0.1211, 0.0343, 0.0373, 0.0436, 0.0778, 0.0601], + device='cuda:2'), in_proj_covar=tensor([0.0131, 0.0177, 0.0271, 0.0171, 0.0219, 0.0168, 0.0187, 0.0173], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0004, 0.0002, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 05:24:52,721 INFO [train.py:876] (2/4) Epoch 12, batch 2800, loss[loss=0.2677, simple_loss=0.2288, pruned_loss=0.1533, over 3066.00 frames. ], tot_loss[loss=0.1082, simple_loss=0.1375, pruned_loss=0.03941, over 1084861.94 frames. ], batch size: 284, lr: 6.76e-03, grad_scale: 8.0 +2022-11-16 05:25:21,517 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.023e+02 1.398e+02 1.709e+02 2.150e+02 5.035e+02, threshold=3.419e+02, percent-clipped=1.0 +2022-11-16 05:25:30,442 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=82849.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 05:25:39,216 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.9174, 2.7489, 2.7129, 2.9273, 2.3669, 2.3620, 2.8444, 3.3542], + device='cuda:2'), covar=tensor([0.1053, 0.1653, 0.2092, 0.1389, 0.1742, 0.1567, 0.1347, 0.0759], + device='cuda:2'), in_proj_covar=tensor([0.0112, 0.0106, 0.0105, 0.0105, 0.0093, 0.0103, 0.0099, 0.0081], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-16 05:25:59,799 INFO [train.py:876] (2/4) Epoch 12, batch 2900, loss[loss=0.09738, simple_loss=0.1389, pruned_loss=0.02795, over 5551.00 frames. ], tot_loss[loss=0.1071, simple_loss=0.1369, pruned_loss=0.03867, over 1089171.98 frames. ], batch size: 15, lr: 6.76e-03, grad_scale: 16.0 +2022-11-16 05:26:21,472 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=82925.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:26:28,505 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.244e+01 1.420e+02 1.709e+02 2.033e+02 4.998e+02, threshold=3.418e+02, percent-clipped=2.0 +2022-11-16 05:26:43,108 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.70 vs. limit=2.0 +2022-11-16 05:26:47,410 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=82963.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:26:59,915 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.93 vs. limit=2.0 +2022-11-16 05:27:07,644 INFO [train.py:876] (2/4) Epoch 12, batch 3000, loss[loss=0.09779, simple_loss=0.1363, pruned_loss=0.02964, over 5725.00 frames. ], tot_loss[loss=0.1082, simple_loss=0.1373, pruned_loss=0.03954, over 1089349.43 frames. ], batch size: 31, lr: 6.75e-03, grad_scale: 16.0 +2022-11-16 05:27:07,644 INFO [train.py:899] (2/4) Computing validation loss +2022-11-16 05:27:25,141 INFO [train.py:908] (2/4) Epoch 12, validation: loss=0.1722, simple_loss=0.1854, pruned_loss=0.07947, over 1530663.00 frames. +2022-11-16 05:27:25,142 INFO [train.py:909] (2/4) Maximum memory allocated so far is 4715MB +2022-11-16 05:27:31,381 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=83002.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:27:34,813 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.79 vs. limit=2.0 +2022-11-16 05:27:45,970 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=83024.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:27:53,840 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.307e+01 1.447e+02 1.727e+02 2.150e+02 3.702e+02, threshold=3.454e+02, percent-clipped=1.0 +2022-11-16 05:28:03,084 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=83050.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:28:32,008 INFO [train.py:876] (2/4) Epoch 12, batch 3100, loss[loss=0.06241, simple_loss=0.1011, pruned_loss=0.01186, over 5708.00 frames. ], tot_loss[loss=0.1096, simple_loss=0.1383, pruned_loss=0.04049, over 1089434.13 frames. ], batch size: 15, lr: 6.75e-03, grad_scale: 16.0 +2022-11-16 05:28:46,450 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.52 vs. limit=2.0 +2022-11-16 05:28:54,243 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.32 vs. limit=2.0 +2022-11-16 05:29:01,263 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.033e+02 1.527e+02 1.920e+02 2.292e+02 4.866e+02, threshold=3.839e+02, percent-clipped=2.0 +2022-11-16 05:29:09,695 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=83149.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 05:29:10,548 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.62 vs. limit=2.0 +2022-11-16 05:29:39,571 INFO [train.py:876] (2/4) Epoch 12, batch 3200, loss[loss=0.1017, simple_loss=0.1286, pruned_loss=0.0374, over 5547.00 frames. ], tot_loss[loss=0.1099, simple_loss=0.139, pruned_loss=0.04036, over 1088156.65 frames. ], batch size: 14, lr: 6.74e-03, grad_scale: 16.0 +2022-11-16 05:29:41,737 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=83196.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:29:42,243 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=83197.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 05:30:01,768 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=83225.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:30:09,032 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.049e+02 1.526e+02 1.871e+02 2.239e+02 5.623e+02, threshold=3.742e+02, percent-clipped=1.0 +2022-11-16 05:30:19,796 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.14 vs. limit=2.0 +2022-11-16 05:30:23,052 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=83257.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:30:33,806 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=83273.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:30:41,812 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=83284.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:30:47,951 INFO [train.py:876] (2/4) Epoch 12, batch 3300, loss[loss=0.1184, simple_loss=0.1512, pruned_loss=0.04282, over 5575.00 frames. ], tot_loss[loss=0.1105, simple_loss=0.1396, pruned_loss=0.04072, over 1089411.46 frames. ], batch size: 43, lr: 6.74e-03, grad_scale: 16.0 +2022-11-16 05:30:51,703 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.71 vs. limit=5.0 +2022-11-16 05:30:57,352 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.6578, 5.0743, 3.1326, 4.7783, 3.8833, 3.4794, 2.7116, 4.3520], + device='cuda:2'), covar=tensor([0.1465, 0.0173, 0.0962, 0.0247, 0.0527, 0.0711, 0.1780, 0.0328], + device='cuda:2'), in_proj_covar=tensor([0.0158, 0.0141, 0.0158, 0.0148, 0.0173, 0.0166, 0.0161, 0.0158], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 05:31:05,236 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=83319.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:31:17,550 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.733e+01 1.415e+02 1.663e+02 2.154e+02 3.672e+02, threshold=3.327e+02, percent-clipped=0.0 +2022-11-16 05:31:24,250 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=83345.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:31:47,263 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.9834, 3.9389, 3.9220, 4.1216, 3.9196, 3.6556, 4.5019, 3.9381], + device='cuda:2'), covar=tensor([0.0438, 0.0913, 0.0455, 0.1094, 0.0464, 0.0375, 0.0699, 0.0728], + device='cuda:2'), in_proj_covar=tensor([0.0088, 0.0110, 0.0097, 0.0122, 0.0090, 0.0080, 0.0146, 0.0103], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 05:31:56,029 INFO [train.py:876] (2/4) Epoch 12, batch 3400, loss[loss=0.1154, simple_loss=0.1496, pruned_loss=0.04056, over 5594.00 frames. ], tot_loss[loss=0.1102, simple_loss=0.1394, pruned_loss=0.04054, over 1090973.70 frames. ], batch size: 38, lr: 6.74e-03, grad_scale: 16.0 +2022-11-16 05:32:19,703 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.0276, 5.6882, 5.0948, 5.7808, 5.6764, 4.9612, 5.3446, 5.1169], + device='cuda:2'), covar=tensor([0.0172, 0.0290, 0.1008, 0.0228, 0.0277, 0.0409, 0.0282, 0.0233], + device='cuda:2'), in_proj_covar=tensor([0.0132, 0.0177, 0.0273, 0.0173, 0.0220, 0.0170, 0.0188, 0.0174], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0004, 0.0002, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 05:32:25,092 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.935e+01 1.460e+02 1.839e+02 2.206e+02 3.947e+02, threshold=3.678e+02, percent-clipped=4.0 +2022-11-16 05:32:27,997 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.45 vs. limit=2.0 +2022-11-16 05:33:03,158 INFO [train.py:876] (2/4) Epoch 12, batch 3500, loss[loss=0.08243, simple_loss=0.1077, pruned_loss=0.02857, over 5538.00 frames. ], tot_loss[loss=0.1093, simple_loss=0.1385, pruned_loss=0.04005, over 1088241.28 frames. ], batch size: 13, lr: 6.73e-03, grad_scale: 16.0 +2022-11-16 05:33:15,944 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.86 vs. limit=2.0 +2022-11-16 05:33:16,382 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.3071, 2.4880, 3.8936, 3.4323, 4.3143, 2.8006, 3.6723, 4.2094], + device='cuda:2'), covar=tensor([0.0851, 0.1677, 0.0807, 0.1738, 0.0424, 0.1710, 0.1362, 0.0784], + device='cuda:2'), in_proj_covar=tensor([0.0243, 0.0195, 0.0218, 0.0213, 0.0238, 0.0199, 0.0225, 0.0231], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 05:33:32,641 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.791e+01 1.416e+02 1.765e+02 2.301e+02 4.941e+02, threshold=3.529e+02, percent-clipped=1.0 +2022-11-16 05:33:43,847 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=83552.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:34:11,112 INFO [train.py:876] (2/4) Epoch 12, batch 3600, loss[loss=0.09371, simple_loss=0.1278, pruned_loss=0.02979, over 5546.00 frames. ], tot_loss[loss=0.1091, simple_loss=0.1381, pruned_loss=0.04003, over 1084516.62 frames. ], batch size: 15, lr: 6.73e-03, grad_scale: 16.0 +2022-11-16 05:34:21,914 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.5087, 3.6176, 3.6485, 3.7625, 3.5944, 3.3597, 4.0058, 3.5490], + device='cuda:2'), covar=tensor([0.0603, 0.1058, 0.0497, 0.1167, 0.0548, 0.0438, 0.0837, 0.0814], + device='cuda:2'), in_proj_covar=tensor([0.0087, 0.0108, 0.0097, 0.0121, 0.0090, 0.0079, 0.0147, 0.0102], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 05:34:29,884 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=83619.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:34:40,744 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.534e+01 1.623e+02 1.893e+02 2.295e+02 4.939e+02, threshold=3.787e+02, percent-clipped=6.0 +2022-11-16 05:34:43,454 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=83640.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:35:02,006 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=83667.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:35:19,170 INFO [train.py:876] (2/4) Epoch 12, batch 3700, loss[loss=0.128, simple_loss=0.1578, pruned_loss=0.0491, over 5796.00 frames. ], tot_loss[loss=0.1086, simple_loss=0.138, pruned_loss=0.03957, over 1086389.16 frames. ], batch size: 21, lr: 6.72e-03, grad_scale: 16.0 +2022-11-16 05:35:48,573 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.368e+01 1.468e+02 1.783e+02 2.131e+02 3.533e+02, threshold=3.566e+02, percent-clipped=0.0 +2022-11-16 05:35:58,187 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.3526, 4.2535, 4.4766, 4.4876, 4.1419, 4.2286, 4.8011, 4.4746], + device='cuda:2'), covar=tensor([0.0422, 0.1005, 0.0353, 0.1134, 0.0456, 0.0252, 0.0778, 0.0563], + device='cuda:2'), in_proj_covar=tensor([0.0087, 0.0107, 0.0096, 0.0120, 0.0090, 0.0079, 0.0146, 0.0102], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 05:36:04,207 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=83760.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:36:17,729 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.3804, 2.8405, 3.8378, 3.4613, 4.3599, 3.1512, 3.9565, 4.4598], + device='cuda:2'), covar=tensor([0.0645, 0.1409, 0.0744, 0.1284, 0.0372, 0.1393, 0.1086, 0.0599], + device='cuda:2'), in_proj_covar=tensor([0.0249, 0.0198, 0.0221, 0.0218, 0.0243, 0.0202, 0.0228, 0.0236], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 05:36:27,061 INFO [train.py:876] (2/4) Epoch 12, batch 3800, loss[loss=0.08645, simple_loss=0.1077, pruned_loss=0.03258, over 4552.00 frames. ], tot_loss[loss=0.1079, simple_loss=0.1377, pruned_loss=0.03906, over 1086511.64 frames. ], batch size: 5, lr: 6.72e-03, grad_scale: 16.0 +2022-11-16 05:36:28,562 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.7394, 2.3660, 2.6798, 3.8037, 3.7143, 2.8928, 2.6681, 3.6520], + device='cuda:2'), covar=tensor([0.0941, 0.2809, 0.2145, 0.2395, 0.1143, 0.2892, 0.1924, 0.1019], + device='cuda:2'), in_proj_covar=tensor([0.0251, 0.0197, 0.0187, 0.0300, 0.0223, 0.0203, 0.0188, 0.0246], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0006], + device='cuda:2') +2022-11-16 05:36:43,111 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.2850, 4.0595, 2.7121, 3.9068, 3.2177, 2.8870, 2.3210, 3.4878], + device='cuda:2'), covar=tensor([0.1613, 0.0280, 0.1192, 0.0393, 0.0856, 0.1015, 0.1917, 0.0494], + device='cuda:2'), in_proj_covar=tensor([0.0154, 0.0139, 0.0155, 0.0146, 0.0171, 0.0164, 0.0158, 0.0157], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 05:36:46,172 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=83821.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:36:56,435 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.699e+01 1.465e+02 1.775e+02 2.276e+02 4.868e+02, threshold=3.550e+02, percent-clipped=4.0 +2022-11-16 05:37:01,807 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.29 vs. limit=2.0 +2022-11-16 05:37:07,267 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=83852.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:37:30,630 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.89 vs. limit=2.0 +2022-11-16 05:37:35,199 INFO [train.py:876] (2/4) Epoch 12, batch 3900, loss[loss=0.09225, simple_loss=0.1238, pruned_loss=0.03037, over 5407.00 frames. ], tot_loss[loss=0.1097, simple_loss=0.1386, pruned_loss=0.04036, over 1077821.70 frames. ], batch size: 11, lr: 6.72e-03, grad_scale: 16.0 +2022-11-16 05:37:39,877 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=83900.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:38:04,280 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.940e+01 1.504e+02 1.885e+02 2.376e+02 4.069e+02, threshold=3.770e+02, percent-clipped=4.0 +2022-11-16 05:38:07,065 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=83940.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:38:13,238 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=83949.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:38:37,780 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=83985.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:38:39,631 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=83988.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:38:43,115 INFO [train.py:876] (2/4) Epoch 12, batch 4000, loss[loss=0.1475, simple_loss=0.1659, pruned_loss=0.06449, over 5510.00 frames. ], tot_loss[loss=0.11, simple_loss=0.1384, pruned_loss=0.04082, over 1078439.60 frames. ], batch size: 49, lr: 6.71e-03, grad_scale: 16.0 +2022-11-16 05:38:54,474 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=84010.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 05:39:11,847 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.002e+01 1.513e+02 1.878e+02 2.345e+02 5.861e+02, threshold=3.757e+02, percent-clipped=4.0 +2022-11-16 05:39:18,963 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=84046.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:39:31,026 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.66 vs. limit=2.0 +2022-11-16 05:39:50,159 INFO [train.py:876] (2/4) Epoch 12, batch 4100, loss[loss=0.1122, simple_loss=0.1466, pruned_loss=0.03896, over 5638.00 frames. ], tot_loss[loss=0.1103, simple_loss=0.1388, pruned_loss=0.04089, over 1083645.89 frames. ], batch size: 24, lr: 6.71e-03, grad_scale: 16.0 +2022-11-16 05:39:53,223 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=84097.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:40:06,009 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=84116.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:40:18,967 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.102e+02 1.530e+02 1.828e+02 2.227e+02 3.985e+02, threshold=3.657e+02, percent-clipped=1.0 +2022-11-16 05:40:34,671 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=84158.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:40:51,321 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.0015, 3.9439, 3.9888, 4.2010, 3.7135, 3.3805, 4.4798, 4.1504], + device='cuda:2'), covar=tensor([0.0383, 0.0797, 0.0445, 0.0912, 0.0586, 0.0449, 0.0703, 0.0536], + device='cuda:2'), in_proj_covar=tensor([0.0086, 0.0107, 0.0095, 0.0119, 0.0089, 0.0079, 0.0144, 0.0101], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 05:40:56,964 INFO [train.py:876] (2/4) Epoch 12, batch 4200, loss[loss=0.1092, simple_loss=0.1404, pruned_loss=0.03898, over 5595.00 frames. ], tot_loss[loss=0.1098, simple_loss=0.1389, pruned_loss=0.04036, over 1081650.01 frames. ], batch size: 43, lr: 6.70e-03, grad_scale: 16.0 +2022-11-16 05:41:01,794 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=84199.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:41:13,616 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.69 vs. limit=2.0 +2022-11-16 05:41:26,498 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.244e+01 1.460e+02 1.805e+02 2.127e+02 5.710e+02, threshold=3.610e+02, percent-clipped=1.0 +2022-11-16 05:41:26,698 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=84236.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:41:43,002 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=84260.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:42:04,887 INFO [train.py:876] (2/4) Epoch 12, batch 4300, loss[loss=0.113, simple_loss=0.1425, pruned_loss=0.0418, over 5692.00 frames. ], tot_loss[loss=0.1113, simple_loss=0.1397, pruned_loss=0.04151, over 1084402.26 frames. ], batch size: 34, lr: 6.70e-03, grad_scale: 16.0 +2022-11-16 05:42:07,619 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=84297.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:42:12,739 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=84305.0, num_to_drop=1, layers_to_drop={3} +2022-11-16 05:42:22,213 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.0716, 4.0076, 4.0713, 3.8707, 4.0433, 3.9048, 1.6291, 4.2625], + device='cuda:2'), covar=tensor([0.0256, 0.0321, 0.0323, 0.0276, 0.0309, 0.0362, 0.3199, 0.0297], + device='cuda:2'), in_proj_covar=tensor([0.0104, 0.0089, 0.0090, 0.0083, 0.0103, 0.0091, 0.0132, 0.0108], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 05:42:34,272 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.802e+01 1.531e+02 1.904e+02 2.409e+02 4.137e+02, threshold=3.809e+02, percent-clipped=6.0 +2022-11-16 05:42:34,459 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=84336.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:42:37,628 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=84341.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:42:44,271 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.5716, 2.1513, 2.1935, 2.8761, 2.8419, 2.2383, 2.0012, 2.8970], + device='cuda:2'), covar=tensor([0.1936, 0.2279, 0.1883, 0.1426, 0.1213, 0.2757, 0.1795, 0.1362], + device='cuda:2'), in_proj_covar=tensor([0.0250, 0.0197, 0.0185, 0.0300, 0.0223, 0.0203, 0.0187, 0.0247], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0006], + device='cuda:2') +2022-11-16 05:43:12,339 INFO [train.py:876] (2/4) Epoch 12, batch 4400, loss[loss=0.1043, simple_loss=0.1451, pruned_loss=0.03174, over 5716.00 frames. ], tot_loss[loss=0.1106, simple_loss=0.1396, pruned_loss=0.04077, over 1084965.09 frames. ], batch size: 15, lr: 6.70e-03, grad_scale: 16.0 +2022-11-16 05:43:14,310 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.5479, 4.4070, 4.2846, 3.9904, 4.5102, 4.3303, 1.7826, 4.7030], + device='cuda:2'), covar=tensor([0.0224, 0.0286, 0.0343, 0.0343, 0.0275, 0.0388, 0.3268, 0.0216], + device='cuda:2'), in_proj_covar=tensor([0.0104, 0.0089, 0.0090, 0.0083, 0.0103, 0.0091, 0.0132, 0.0108], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 05:43:15,068 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=84397.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:43:27,627 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=84416.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:43:34,744 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.0815, 4.0236, 4.1293, 4.1537, 3.6907, 3.5838, 4.4312, 3.9542], + device='cuda:2'), covar=tensor([0.0434, 0.0774, 0.0388, 0.1073, 0.0459, 0.0372, 0.0642, 0.0554], + device='cuda:2'), in_proj_covar=tensor([0.0087, 0.0108, 0.0096, 0.0121, 0.0089, 0.0079, 0.0145, 0.0102], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 05:43:41,383 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.927e+01 1.500e+02 1.777e+02 2.208e+02 3.922e+02, threshold=3.553e+02, percent-clipped=1.0 +2022-11-16 05:43:52,910 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=84453.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:43:59,962 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=84464.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:44:19,825 INFO [train.py:876] (2/4) Epoch 12, batch 4500, loss[loss=0.0873, simple_loss=0.1226, pruned_loss=0.02601, over 5584.00 frames. ], tot_loss[loss=0.1093, simple_loss=0.1391, pruned_loss=0.03972, over 1085637.15 frames. ], batch size: 25, lr: 6.69e-03, grad_scale: 16.0 +2022-11-16 05:44:41,055 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=84525.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:44:48,042 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.005e+02 1.515e+02 1.740e+02 2.350e+02 4.214e+02, threshold=3.480e+02, percent-clipped=2.0 +2022-11-16 05:45:01,595 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=84555.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:45:04,550 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.3994, 4.8296, 5.3179, 4.8180, 5.4240, 5.3136, 4.6517, 5.4210], + device='cuda:2'), covar=tensor([0.0307, 0.0358, 0.0297, 0.0285, 0.0374, 0.0188, 0.0264, 0.0270], + device='cuda:2'), in_proj_covar=tensor([0.0144, 0.0154, 0.0111, 0.0145, 0.0183, 0.0109, 0.0128, 0.0154], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-16 05:45:22,417 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=84586.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:45:26,267 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=84592.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:45:26,817 INFO [train.py:876] (2/4) Epoch 12, batch 4600, loss[loss=0.1039, simple_loss=0.1312, pruned_loss=0.03829, over 5096.00 frames. ], tot_loss[loss=0.1088, simple_loss=0.1383, pruned_loss=0.03961, over 1085128.44 frames. ], batch size: 91, lr: 6.69e-03, grad_scale: 16.0 +2022-11-16 05:45:28,274 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.7344, 3.6163, 3.7021, 3.2628, 2.0732, 3.7007, 2.3198, 3.0337], + device='cuda:2'), covar=tensor([0.0372, 0.0237, 0.0160, 0.0317, 0.0578, 0.0161, 0.0509, 0.0241], + device='cuda:2'), in_proj_covar=tensor([0.0197, 0.0181, 0.0183, 0.0206, 0.0195, 0.0182, 0.0191, 0.0185], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 05:45:36,004 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=84605.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:45:54,583 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=84633.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:45:56,355 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.075e+02 1.464e+02 1.836e+02 2.237e+02 3.755e+02, threshold=3.672e+02, percent-clipped=3.0 +2022-11-16 05:45:59,673 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=84641.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:46:06,323 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.7145, 2.8168, 2.5717, 2.8772, 2.2193, 2.4226, 2.5926, 3.2274], + device='cuda:2'), covar=tensor([0.0958, 0.1048, 0.1609, 0.1014, 0.1356, 0.1139, 0.1059, 0.0807], + device='cuda:2'), in_proj_covar=tensor([0.0112, 0.0106, 0.0105, 0.0104, 0.0092, 0.0101, 0.0098, 0.0081], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-16 05:46:08,137 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=84653.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:46:32,304 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=84689.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:46:34,322 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=84692.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:46:34,899 INFO [train.py:876] (2/4) Epoch 12, batch 4700, loss[loss=0.1069, simple_loss=0.1369, pruned_loss=0.03847, over 5575.00 frames. ], tot_loss[loss=0.1081, simple_loss=0.1382, pruned_loss=0.03903, over 1089049.99 frames. ], batch size: 43, lr: 6.68e-03, grad_scale: 16.0 +2022-11-16 05:46:35,708 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=84694.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:46:35,721 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.9506, 2.7203, 2.5670, 1.6418, 2.6949, 2.8652, 2.7848, 3.1224], + device='cuda:2'), covar=tensor([0.2321, 0.1733, 0.1579, 0.2843, 0.0973, 0.1253, 0.0690, 0.1269], + device='cuda:2'), in_proj_covar=tensor([0.0168, 0.0184, 0.0169, 0.0186, 0.0183, 0.0202, 0.0169, 0.0188], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 05:47:03,946 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.103e+01 1.427e+02 1.725e+02 2.096e+02 3.748e+02, threshold=3.451e+02, percent-clipped=1.0 +2022-11-16 05:47:06,182 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.1161, 0.9368, 0.9764, 0.7725, 1.0826, 0.9400, 0.4771, 0.7702], + device='cuda:2'), covar=tensor([0.0234, 0.0420, 0.0384, 0.0476, 0.0283, 0.0367, 0.0950, 0.0380], + device='cuda:2'), in_proj_covar=tensor([0.0015, 0.0024, 0.0017, 0.0021, 0.0017, 0.0015, 0.0023, 0.0016], + device='cuda:2'), out_proj_covar=tensor([8.4513e-05, 1.1646e-04, 8.9964e-05, 1.0473e-04, 9.1832e-05, 8.5570e-05, + 1.1320e-04, 8.6142e-05], device='cuda:2') +2022-11-16 05:47:09,329 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1417, 3.4128, 2.5824, 1.7718, 3.2170, 1.3498, 3.2621, 1.9166], + device='cuda:2'), covar=tensor([0.1757, 0.0295, 0.1310, 0.2026, 0.0354, 0.2392, 0.0314, 0.1749], + device='cuda:2'), in_proj_covar=tensor([0.0119, 0.0103, 0.0114, 0.0109, 0.0100, 0.0118, 0.0099, 0.0109], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 05:47:15,105 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=84753.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:47:25,067 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.9331, 4.2466, 3.9877, 3.5811, 2.1461, 4.1521, 2.3854, 3.6034], + device='cuda:2'), covar=tensor([0.0455, 0.0157, 0.0184, 0.0437, 0.0675, 0.0184, 0.0556, 0.0164], + device='cuda:2'), in_proj_covar=tensor([0.0194, 0.0178, 0.0180, 0.0203, 0.0192, 0.0180, 0.0189, 0.0183], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 05:47:29,931 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.8543, 3.9578, 3.9028, 3.4335, 2.0467, 4.0012, 2.3295, 3.3508], + device='cuda:2'), covar=tensor([0.0412, 0.0277, 0.0166, 0.0373, 0.0653, 0.0177, 0.0524, 0.0135], + device='cuda:2'), in_proj_covar=tensor([0.0193, 0.0178, 0.0180, 0.0203, 0.0192, 0.0180, 0.0188, 0.0182], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 05:47:42,125 INFO [train.py:876] (2/4) Epoch 12, batch 4800, loss[loss=0.1226, simple_loss=0.1559, pruned_loss=0.04468, over 5603.00 frames. ], tot_loss[loss=0.1076, simple_loss=0.1373, pruned_loss=0.03892, over 1084990.93 frames. ], batch size: 38, lr: 6.68e-03, grad_scale: 16.0 +2022-11-16 05:47:47,075 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=84800.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:47:47,584 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=84801.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:48:11,629 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.403e+01 1.631e+02 1.987e+02 2.475e+02 5.083e+02, threshold=3.974e+02, percent-clipped=5.0 +2022-11-16 05:48:24,052 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=84855.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:48:27,987 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=84861.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:48:41,228 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.7805, 2.8137, 2.1912, 2.3836, 1.6498, 2.3292, 1.5090, 2.2839], + device='cuda:2'), covar=tensor([0.1396, 0.0436, 0.1137, 0.0805, 0.2105, 0.0995, 0.2077, 0.0626], + device='cuda:2'), in_proj_covar=tensor([0.0158, 0.0143, 0.0159, 0.0150, 0.0176, 0.0168, 0.0162, 0.0160], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 05:48:41,868 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=84881.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:48:46,470 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.6412, 4.4398, 4.6065, 4.6742, 4.2035, 4.0602, 5.1509, 4.6164], + device='cuda:2'), covar=tensor([0.0457, 0.1104, 0.0408, 0.1312, 0.0446, 0.0351, 0.0685, 0.0595], + device='cuda:2'), in_proj_covar=tensor([0.0090, 0.0111, 0.0099, 0.0124, 0.0092, 0.0082, 0.0149, 0.0105], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 05:48:49,140 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=84892.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:48:49,707 INFO [train.py:876] (2/4) Epoch 12, batch 4900, loss[loss=0.1467, simple_loss=0.1659, pruned_loss=0.06369, over 5566.00 frames. ], tot_loss[loss=0.1063, simple_loss=0.1357, pruned_loss=0.03847, over 1082927.36 frames. ], batch size: 54, lr: 6.68e-03, grad_scale: 32.0 +2022-11-16 05:48:55,934 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.77 vs. limit=2.0 +2022-11-16 05:48:56,264 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=84903.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:49:19,715 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.452e+01 1.404e+02 1.712e+02 2.121e+02 6.209e+02, threshold=3.423e+02, percent-clipped=1.0 +2022-11-16 05:49:21,812 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=84940.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:49:28,959 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=84951.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:49:48,068 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.45 vs. limit=2.0 +2022-11-16 05:49:54,402 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=84989.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:49:56,372 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=84992.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:49:56,851 INFO [train.py:876] (2/4) Epoch 12, batch 5000, loss[loss=0.1, simple_loss=0.1328, pruned_loss=0.0336, over 5426.00 frames. ], tot_loss[loss=0.1069, simple_loss=0.1363, pruned_loss=0.03874, over 1087245.98 frames. ], batch size: 11, lr: 6.67e-03, grad_scale: 16.0 +2022-11-16 05:50:13,401 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=85012.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:50:29,276 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.668e+01 1.461e+02 1.751e+02 2.205e+02 3.739e+02, threshold=3.502e+02, percent-clipped=4.0 +2022-11-16 05:50:31,271 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=85040.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:50:45,612 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.2145, 1.3960, 1.1504, 0.9745, 1.3062, 1.1733, 0.8069, 1.3636], + device='cuda:2'), covar=tensor([0.0064, 0.0052, 0.0061, 0.0069, 0.0050, 0.0061, 0.0092, 0.0053], + device='cuda:2'), in_proj_covar=tensor([0.0060, 0.0057, 0.0055, 0.0060, 0.0058, 0.0053, 0.0053, 0.0050], + device='cuda:2'), out_proj_covar=tensor([5.4174e-05, 5.0243e-05, 4.8213e-05, 5.3403e-05, 5.1475e-05, 4.6139e-05, + 4.7159e-05, 4.4178e-05], device='cuda:2') +2022-11-16 05:51:06,696 INFO [train.py:876] (2/4) Epoch 12, batch 5100, loss[loss=0.1018, simple_loss=0.1304, pruned_loss=0.03656, over 5207.00 frames. ], tot_loss[loss=0.1073, simple_loss=0.1367, pruned_loss=0.03889, over 1086586.97 frames. ], batch size: 8, lr: 6.67e-03, grad_scale: 16.0 +2022-11-16 05:51:16,498 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=85107.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:51:21,645 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=85115.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:51:32,091 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.57 vs. limit=2.0 +2022-11-16 05:51:36,164 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.733e+01 1.524e+02 1.873e+02 2.260e+02 4.795e+02, threshold=3.745e+02, percent-clipped=3.0 +2022-11-16 05:51:49,422 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=85156.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:51:57,600 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=85168.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 05:51:59,241 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.8994, 4.8910, 3.7750, 2.3832, 4.5720, 2.0485, 4.5534, 2.6625], + device='cuda:2'), covar=tensor([0.1350, 0.0160, 0.0519, 0.2057, 0.0188, 0.1880, 0.0229, 0.1550], + device='cuda:2'), in_proj_covar=tensor([0.0120, 0.0103, 0.0115, 0.0110, 0.0100, 0.0119, 0.0100, 0.0110], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 05:52:03,252 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=85176.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:52:06,509 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=85181.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:52:09,060 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.4504, 2.6075, 2.7204, 2.4370, 2.7145, 2.4848, 1.1918, 2.6815], + device='cuda:2'), covar=tensor([0.0543, 0.0519, 0.0426, 0.0476, 0.0495, 0.0687, 0.3148, 0.0596], + device='cuda:2'), in_proj_covar=tensor([0.0104, 0.0088, 0.0089, 0.0081, 0.0102, 0.0090, 0.0130, 0.0108], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 05:52:14,141 INFO [train.py:876] (2/4) Epoch 12, batch 5200, loss[loss=0.08509, simple_loss=0.1239, pruned_loss=0.02314, over 5729.00 frames. ], tot_loss[loss=0.1065, simple_loss=0.1365, pruned_loss=0.03829, over 1091141.88 frames. ], batch size: 19, lr: 6.66e-03, grad_scale: 16.0 +2022-11-16 05:52:39,249 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=85229.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:52:45,062 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.605e+01 1.468e+02 1.779e+02 2.161e+02 4.129e+02, threshold=3.557e+02, percent-clipped=1.0 +2022-11-16 05:53:06,533 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.8634, 2.3909, 2.9394, 1.8701, 1.4961, 3.3495, 2.8340, 2.4525], + device='cuda:2'), covar=tensor([0.0857, 0.1384, 0.0782, 0.2706, 0.3464, 0.1487, 0.0745, 0.1303], + device='cuda:2'), in_proj_covar=tensor([0.0103, 0.0094, 0.0093, 0.0099, 0.0074, 0.0067, 0.0078, 0.0089], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-16 05:53:07,121 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.8408, 2.9734, 3.0239, 2.7797, 3.0397, 2.9175, 1.2120, 3.1159], + device='cuda:2'), covar=tensor([0.0304, 0.0329, 0.0311, 0.0357, 0.0341, 0.0390, 0.2958, 0.0350], + device='cuda:2'), in_proj_covar=tensor([0.0103, 0.0088, 0.0088, 0.0081, 0.0102, 0.0090, 0.0130, 0.0108], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 05:53:20,075 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=85289.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:53:21,444 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.4715, 4.3424, 3.8677, 3.5543, 2.0451, 4.2014, 2.3214, 3.7017], + device='cuda:2'), covar=tensor([0.0504, 0.0148, 0.0150, 0.0445, 0.0640, 0.0132, 0.0515, 0.0131], + device='cuda:2'), in_proj_covar=tensor([0.0198, 0.0182, 0.0184, 0.0207, 0.0195, 0.0183, 0.0193, 0.0186], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 05:53:22,522 INFO [train.py:876] (2/4) Epoch 12, batch 5300, loss[loss=0.1248, simple_loss=0.1461, pruned_loss=0.05178, over 5562.00 frames. ], tot_loss[loss=0.1066, simple_loss=0.1367, pruned_loss=0.03827, over 1088582.22 frames. ], batch size: 40, lr: 6.66e-03, grad_scale: 8.0 +2022-11-16 05:53:31,528 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=85307.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:53:44,848 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.9370, 5.4204, 5.6865, 5.3235, 5.9845, 5.8517, 5.0561, 5.9068], + device='cuda:2'), covar=tensor([0.0342, 0.0302, 0.0416, 0.0351, 0.0288, 0.0134, 0.0225, 0.0221], + device='cuda:2'), in_proj_covar=tensor([0.0144, 0.0152, 0.0110, 0.0143, 0.0180, 0.0108, 0.0126, 0.0154], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-16 05:53:52,734 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=85337.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:53:53,358 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.591e+01 1.461e+02 1.746e+02 2.193e+02 3.892e+02, threshold=3.493e+02, percent-clipped=1.0 +2022-11-16 05:54:11,963 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.7682, 2.2888, 2.7854, 3.7113, 3.7510, 2.7691, 2.2570, 3.5828], + device='cuda:2'), covar=tensor([0.0772, 0.3145, 0.2391, 0.2563, 0.1124, 0.3062, 0.2661, 0.0754], + device='cuda:2'), in_proj_covar=tensor([0.0252, 0.0198, 0.0190, 0.0306, 0.0224, 0.0203, 0.0190, 0.0248], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0006], + device='cuda:2') +2022-11-16 05:54:21,209 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.05 vs. limit=5.0 +2022-11-16 05:54:33,136 INFO [train.py:876] (2/4) Epoch 12, batch 5400, loss[loss=0.1064, simple_loss=0.1427, pruned_loss=0.03504, over 5714.00 frames. ], tot_loss[loss=0.1081, simple_loss=0.1377, pruned_loss=0.0392, over 1088089.96 frames. ], batch size: 19, lr: 6.66e-03, grad_scale: 8.0 +2022-11-16 05:54:57,343 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=85428.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:55:04,102 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.058e+02 1.454e+02 1.853e+02 2.296e+02 5.814e+02, threshold=3.706e+02, percent-clipped=5.0 +2022-11-16 05:55:15,944 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=85456.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:55:17,236 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.9413, 3.0250, 3.2150, 1.5769, 2.9596, 3.3494, 3.3291, 3.7369], + device='cuda:2'), covar=tensor([0.2199, 0.1390, 0.0833, 0.3021, 0.0560, 0.0879, 0.0459, 0.0670], + device='cuda:2'), in_proj_covar=tensor([0.0165, 0.0183, 0.0167, 0.0186, 0.0185, 0.0201, 0.0168, 0.0189], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 05:55:20,368 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=85463.0, num_to_drop=1, layers_to_drop={2} +2022-11-16 05:55:25,555 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=85471.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:55:32,330 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.31 vs. limit=2.0 +2022-11-16 05:55:34,543 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=85483.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:55:34,823 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.74 vs. limit=2.0 +2022-11-16 05:55:38,423 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=85489.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:55:41,218 INFO [train.py:876] (2/4) Epoch 12, batch 5500, loss[loss=0.1632, simple_loss=0.1776, pruned_loss=0.07445, over 5406.00 frames. ], tot_loss[loss=0.1086, simple_loss=0.1384, pruned_loss=0.03945, over 1085495.51 frames. ], batch size: 70, lr: 6.65e-03, grad_scale: 8.0 +2022-11-16 05:55:48,427 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=85504.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:56:06,082 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0380, 1.7373, 2.0864, 1.7825, 1.7428, 1.8599, 1.4996, 1.6850], + device='cuda:2'), covar=tensor([0.0033, 0.0063, 0.0028, 0.0046, 0.0097, 0.0073, 0.0052, 0.0054], + device='cuda:2'), in_proj_covar=tensor([0.0028, 0.0026, 0.0026, 0.0034, 0.0030, 0.0027, 0.0034, 0.0032], + device='cuda:2'), out_proj_covar=tensor([2.6162e-05, 2.4046e-05, 2.3647e-05, 3.2920e-05, 2.7621e-05, 2.5778e-05, + 3.2411e-05, 3.1144e-05], device='cuda:2') +2022-11-16 05:56:06,098 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=85530.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:56:11,500 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.321e+01 1.546e+02 1.853e+02 2.385e+02 3.916e+02, threshold=3.707e+02, percent-clipped=1.0 +2022-11-16 05:56:16,099 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=85544.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:56:43,779 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.51 vs. limit=5.0 +2022-11-16 05:56:44,772 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.7617, 2.1944, 1.7727, 1.4274, 2.1711, 2.2703, 2.2494, 2.4383], + device='cuda:2'), covar=tensor([0.1730, 0.1461, 0.1982, 0.2557, 0.0913, 0.1182, 0.0794, 0.1171], + device='cuda:2'), in_proj_covar=tensor([0.0165, 0.0183, 0.0168, 0.0186, 0.0185, 0.0202, 0.0168, 0.0190], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 05:56:47,724 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=85591.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:56:48,897 INFO [train.py:876] (2/4) Epoch 12, batch 5600, loss[loss=0.0946, simple_loss=0.1218, pruned_loss=0.03373, over 5624.00 frames. ], tot_loss[loss=0.1095, simple_loss=0.1386, pruned_loss=0.0402, over 1084311.70 frames. ], batch size: 18, lr: 6.65e-03, grad_scale: 8.0 +2022-11-16 05:56:58,732 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=85607.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:57:03,423 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.4099, 0.7819, 0.5793, 0.6926, 0.7366, 0.7505, 0.6352, 0.7511], + device='cuda:2'), covar=tensor([0.0053, 0.0034, 0.0043, 0.0033, 0.0029, 0.0037, 0.0055, 0.0026], + device='cuda:2'), in_proj_covar=tensor([0.0060, 0.0056, 0.0055, 0.0059, 0.0058, 0.0053, 0.0053, 0.0051], + device='cuda:2'), out_proj_covar=tensor([5.3704e-05, 4.9676e-05, 4.8227e-05, 5.2620e-05, 5.1080e-05, 4.6429e-05, + 4.7408e-05, 4.4549e-05], device='cuda:2') +2022-11-16 05:57:10,790 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=85625.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:57:20,010 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.221e+01 1.474e+02 1.888e+02 2.414e+02 5.206e+02, threshold=3.776e+02, percent-clipped=5.0 +2022-11-16 05:57:32,029 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=85655.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:57:48,428 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.7157, 3.7005, 3.6854, 3.3522, 1.9583, 3.7574, 2.2854, 3.1000], + device='cuda:2'), covar=tensor([0.0438, 0.0231, 0.0234, 0.0393, 0.0676, 0.0202, 0.0567, 0.0260], + device='cuda:2'), in_proj_covar=tensor([0.0193, 0.0180, 0.0181, 0.0204, 0.0193, 0.0182, 0.0189, 0.0183], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 05:57:52,351 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=85686.0, num_to_drop=1, layers_to_drop={3} +2022-11-16 05:57:56,996 INFO [train.py:876] (2/4) Epoch 12, batch 5700, loss[loss=0.1204, simple_loss=0.1518, pruned_loss=0.04446, over 5612.00 frames. ], tot_loss[loss=0.1087, simple_loss=0.1383, pruned_loss=0.03956, over 1088664.00 frames. ], batch size: 24, lr: 6.64e-03, grad_scale: 8.0 +2022-11-16 05:58:07,199 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.9427, 1.5255, 1.9710, 1.6882, 1.7105, 1.9686, 1.6219, 1.6859], + device='cuda:2'), covar=tensor([0.0038, 0.0064, 0.0046, 0.0060, 0.0078, 0.0057, 0.0050, 0.0055], + device='cuda:2'), in_proj_covar=tensor([0.0029, 0.0026, 0.0027, 0.0034, 0.0030, 0.0027, 0.0034, 0.0033], + device='cuda:2'), out_proj_covar=tensor([2.6300e-05, 2.4306e-05, 2.3963e-05, 3.3388e-05, 2.7975e-05, 2.6014e-05, + 3.2737e-05, 3.1411e-05], device='cuda:2') +2022-11-16 05:58:15,117 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.6523, 3.9199, 3.7703, 3.4846, 1.9443, 3.8648, 2.2702, 3.1790], + device='cuda:2'), covar=tensor([0.0384, 0.0169, 0.0144, 0.0301, 0.0585, 0.0152, 0.0495, 0.0193], + device='cuda:2'), in_proj_covar=tensor([0.0194, 0.0180, 0.0182, 0.0205, 0.0194, 0.0182, 0.0191, 0.0185], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 05:58:25,217 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.1461, 1.5877, 1.1901, 1.0924, 1.2597, 1.0759, 0.9884, 1.3897], + device='cuda:2'), covar=tensor([0.0085, 0.0053, 0.0071, 0.0082, 0.0071, 0.0068, 0.0106, 0.0068], + device='cuda:2'), in_proj_covar=tensor([0.0060, 0.0056, 0.0055, 0.0059, 0.0058, 0.0053, 0.0053, 0.0051], + device='cuda:2'), out_proj_covar=tensor([5.3723e-05, 4.9457e-05, 4.8109e-05, 5.2700e-05, 5.0982e-05, 4.6409e-05, + 4.7297e-05, 4.4441e-05], device='cuda:2') +2022-11-16 05:58:25,854 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.4724, 1.8454, 1.3087, 1.3075, 1.3901, 1.3243, 1.2568, 1.5771], + device='cuda:2'), covar=tensor([0.0055, 0.0043, 0.0052, 0.0061, 0.0052, 0.0044, 0.0065, 0.0053], + device='cuda:2'), in_proj_covar=tensor([0.0060, 0.0056, 0.0055, 0.0059, 0.0058, 0.0053, 0.0053, 0.0051], + device='cuda:2'), out_proj_covar=tensor([5.3705e-05, 4.9446e-05, 4.8094e-05, 5.2686e-05, 5.0963e-05, 4.6396e-05, + 4.7281e-05, 4.4428e-05], device='cuda:2') +2022-11-16 05:58:26,104 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.41 vs. limit=2.0 +2022-11-16 05:58:26,982 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.513e+01 1.496e+02 1.877e+02 2.228e+02 5.709e+02, threshold=3.754e+02, percent-clipped=3.0 +2022-11-16 05:58:38,805 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.8076, 2.5627, 3.1280, 3.8240, 3.8728, 3.0344, 2.8747, 3.7359], + device='cuda:2'), covar=tensor([0.0921, 0.3879, 0.1886, 0.3066, 0.1179, 0.2893, 0.1968, 0.0894], + device='cuda:2'), in_proj_covar=tensor([0.0253, 0.0197, 0.0186, 0.0301, 0.0224, 0.0202, 0.0186, 0.0247], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0006], + device='cuda:2') +2022-11-16 05:58:43,943 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=85763.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:58:46,783 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0664, 3.6002, 2.4138, 3.3981, 2.7630, 2.5571, 1.9613, 3.0611], + device='cuda:2'), covar=tensor([0.1587, 0.0358, 0.1234, 0.0463, 0.1270, 0.1188, 0.2016, 0.0614], + device='cuda:2'), in_proj_covar=tensor([0.0154, 0.0143, 0.0156, 0.0148, 0.0174, 0.0166, 0.0158, 0.0158], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 05:58:49,775 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=85771.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:58:58,190 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=85784.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:59:04,118 INFO [train.py:876] (2/4) Epoch 12, batch 5800, loss[loss=0.1532, simple_loss=0.1577, pruned_loss=0.07433, over 4390.00 frames. ], tot_loss[loss=0.1094, simple_loss=0.1388, pruned_loss=0.03997, over 1086390.05 frames. ], batch size: 5, lr: 6.64e-03, grad_scale: 8.0 +2022-11-16 05:59:07,017 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.31 vs. limit=2.0 +2022-11-16 05:59:16,827 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=85811.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:59:22,788 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=85819.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 05:59:35,107 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.803e+01 1.482e+02 1.820e+02 2.147e+02 4.590e+02, threshold=3.641e+02, percent-clipped=4.0 +2022-11-16 05:59:35,887 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=85839.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:00:00,449 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=85875.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:00:07,908 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=85886.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:00:12,399 INFO [train.py:876] (2/4) Epoch 12, batch 5900, loss[loss=0.1083, simple_loss=0.1398, pruned_loss=0.03839, over 5303.00 frames. ], tot_loss[loss=0.1078, simple_loss=0.137, pruned_loss=0.03931, over 1077731.33 frames. ], batch size: 79, lr: 6.64e-03, grad_scale: 8.0 +2022-11-16 06:00:38,046 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=85930.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 06:00:41,974 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=85936.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:00:43,059 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.779e+01 1.458e+02 1.851e+02 2.281e+02 4.967e+02, threshold=3.703e+02, percent-clipped=4.0 +2022-11-16 06:00:55,692 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=85957.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:01:12,071 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=85981.0, num_to_drop=1, layers_to_drop={3} +2022-11-16 06:01:18,832 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=85991.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 06:01:19,973 INFO [train.py:876] (2/4) Epoch 12, batch 6000, loss[loss=0.07657, simple_loss=0.1115, pruned_loss=0.02083, over 5535.00 frames. ], tot_loss[loss=0.1089, simple_loss=0.1379, pruned_loss=0.04, over 1077870.09 frames. ], batch size: 13, lr: 6.63e-03, grad_scale: 8.0 +2022-11-16 06:01:19,973 INFO [train.py:899] (2/4) Computing validation loss +2022-11-16 06:01:37,497 INFO [train.py:908] (2/4) Epoch 12, validation: loss=0.1738, simple_loss=0.1864, pruned_loss=0.08063, over 1530663.00 frames. +2022-11-16 06:01:37,498 INFO [train.py:909] (2/4) Maximum memory allocated so far is 4715MB +2022-11-16 06:01:43,762 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=86002.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:01:54,418 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=86018.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:02:08,197 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.233e+01 1.359e+02 1.723e+02 2.216e+02 5.600e+02, threshold=3.445e+02, percent-clipped=2.0 +2022-11-16 06:02:17,509 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=86052.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:02:24,821 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=86063.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:02:25,405 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1446, 3.0059, 2.3031, 1.7509, 2.8521, 1.2486, 2.8674, 1.8950], + device='cuda:2'), covar=tensor([0.1220, 0.0261, 0.1692, 0.1874, 0.0314, 0.2143, 0.0344, 0.1283], + device='cuda:2'), in_proj_covar=tensor([0.0123, 0.0105, 0.0116, 0.0113, 0.0102, 0.0121, 0.0102, 0.0112], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0005, 0.0005, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 06:02:29,094 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=86069.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:02:39,122 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=86084.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:02:45,104 INFO [train.py:876] (2/4) Epoch 12, batch 6100, loss[loss=0.0753, simple_loss=0.1133, pruned_loss=0.01865, over 5462.00 frames. ], tot_loss[loss=0.11, simple_loss=0.1386, pruned_loss=0.04069, over 1076960.28 frames. ], batch size: 10, lr: 6.63e-03, grad_scale: 8.0 +2022-11-16 06:02:58,465 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=86113.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:03:10,247 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=86130.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:03:11,383 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=86132.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:03:15,121 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.336e+01 1.468e+02 1.787e+02 2.256e+02 5.479e+02, threshold=3.574e+02, percent-clipped=5.0 +2022-11-16 06:03:15,901 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=86139.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:03:47,217 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=86186.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:03:47,784 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=86187.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:03:51,957 INFO [train.py:876] (2/4) Epoch 12, batch 6200, loss[loss=0.06728, simple_loss=0.1075, pruned_loss=0.01351, over 5429.00 frames. ], tot_loss[loss=0.1089, simple_loss=0.1379, pruned_loss=0.03992, over 1081549.36 frames. ], batch size: 11, lr: 6.63e-03, grad_scale: 8.0 +2022-11-16 06:04:09,307 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.70 vs. limit=5.0 +2022-11-16 06:04:17,345 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=86231.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:04:19,258 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=86234.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:04:22,083 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.231e+01 1.405e+02 1.749e+02 2.219e+02 4.004e+02, threshold=3.499e+02, percent-clipped=3.0 +2022-11-16 06:04:27,863 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=86246.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:04:43,569 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.4929, 5.0292, 4.5843, 5.0606, 4.9543, 4.2372, 4.5090, 4.3354], + device='cuda:2'), covar=tensor([0.0229, 0.0375, 0.1214, 0.0327, 0.0411, 0.0485, 0.0598, 0.0622], + device='cuda:2'), in_proj_covar=tensor([0.0132, 0.0179, 0.0277, 0.0176, 0.0224, 0.0175, 0.0192, 0.0179], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0004, 0.0002, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 06:04:51,626 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=86281.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:04:53,380 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.73 vs. limit=2.0 +2022-11-16 06:04:54,823 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=86286.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 06:04:59,683 INFO [train.py:876] (2/4) Epoch 12, batch 6300, loss[loss=0.05829, simple_loss=0.09286, pruned_loss=0.01187, over 5112.00 frames. ], tot_loss[loss=0.1086, simple_loss=0.137, pruned_loss=0.04006, over 1077411.90 frames. ], batch size: 7, lr: 6.62e-03, grad_scale: 8.0 +2022-11-16 06:05:09,337 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=86307.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:05:13,605 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=86313.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:05:24,117 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=86329.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:05:29,914 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.920e+01 1.421e+02 1.647e+02 2.112e+02 5.317e+02, threshold=3.295e+02, percent-clipped=6.0 +2022-11-16 06:05:44,665 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=86358.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:06:07,710 INFO [train.py:876] (2/4) Epoch 12, batch 6400, loss[loss=0.0897, simple_loss=0.1214, pruned_loss=0.02899, over 5721.00 frames. ], tot_loss[loss=0.109, simple_loss=0.1379, pruned_loss=0.04, over 1083610.12 frames. ], batch size: 13, lr: 6.62e-03, grad_scale: 8.0 +2022-11-16 06:06:09,722 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.29 vs. limit=2.0 +2022-11-16 06:06:15,263 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.99 vs. limit=2.0 +2022-11-16 06:06:18,206 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=86408.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:06:30,006 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=86425.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:06:38,424 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.070e+01 1.434e+02 1.773e+02 2.236e+02 3.206e+02, threshold=3.547e+02, percent-clipped=0.0 +2022-11-16 06:06:39,881 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=86440.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:06:43,176 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=86445.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:07:09,873 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.2282, 3.0023, 3.1129, 2.8448, 3.2692, 3.1538, 3.0632, 3.2386], + device='cuda:2'), covar=tensor([0.0425, 0.0451, 0.0471, 0.0457, 0.0473, 0.0266, 0.0435, 0.0504], + device='cuda:2'), in_proj_covar=tensor([0.0146, 0.0156, 0.0111, 0.0147, 0.0183, 0.0109, 0.0129, 0.0157], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-16 06:07:15,669 INFO [train.py:876] (2/4) Epoch 12, batch 6500, loss[loss=0.07648, simple_loss=0.1166, pruned_loss=0.01821, over 5705.00 frames. ], tot_loss[loss=0.1081, simple_loss=0.1374, pruned_loss=0.03938, over 1082363.77 frames. ], batch size: 12, lr: 6.61e-03, grad_scale: 8.0 +2022-11-16 06:07:21,064 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=86501.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:07:25,105 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0369, 2.1073, 2.0575, 2.1905, 1.9170, 1.7333, 1.9569, 2.2451], + device='cuda:2'), covar=tensor([0.1795, 0.1835, 0.2082, 0.1436, 0.1616, 0.1820, 0.1777, 0.0970], + device='cuda:2'), in_proj_covar=tensor([0.0112, 0.0106, 0.0104, 0.0104, 0.0091, 0.0102, 0.0097, 0.0082], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-16 06:07:25,131 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=86506.0, num_to_drop=1, layers_to_drop={2} +2022-11-16 06:07:42,124 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=86531.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:07:46,545 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.116e+01 1.482e+02 1.807e+02 2.369e+02 3.734e+02, threshold=3.614e+02, percent-clipped=1.0 +2022-11-16 06:07:55,420 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.83 vs. limit=2.0 +2022-11-16 06:08:13,934 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=86579.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:08:14,721 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=86580.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:08:14,979 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.49 vs. limit=5.0 +2022-11-16 06:08:18,598 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=86586.0, num_to_drop=1, layers_to_drop={2} +2022-11-16 06:08:23,756 INFO [train.py:876] (2/4) Epoch 12, batch 6600, loss[loss=0.1029, simple_loss=0.1317, pruned_loss=0.03698, over 5587.00 frames. ], tot_loss[loss=0.1071, simple_loss=0.1367, pruned_loss=0.03881, over 1088596.77 frames. ], batch size: 18, lr: 6.61e-03, grad_scale: 8.0 +2022-11-16 06:08:30,003 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=86602.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:08:37,140 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=86613.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:08:38,813 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.88 vs. limit=2.0 +2022-11-16 06:08:51,385 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=86634.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 06:08:54,694 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 6.903e+01 1.415e+02 1.832e+02 2.260e+02 3.608e+02, threshold=3.664e+02, percent-clipped=0.0 +2022-11-16 06:08:56,849 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=86641.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:09:08,035 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=86658.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:09:09,959 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=86661.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:09:32,612 INFO [train.py:876] (2/4) Epoch 12, batch 6700, loss[loss=0.1232, simple_loss=0.1447, pruned_loss=0.0508, over 5324.00 frames. ], tot_loss[loss=0.1076, simple_loss=0.1369, pruned_loss=0.03914, over 1081960.31 frames. ], batch size: 79, lr: 6.61e-03, grad_scale: 8.0 +2022-11-16 06:09:41,156 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=86706.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:09:42,531 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=86708.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:09:52,486 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.9658, 1.8095, 1.8808, 1.4953, 1.6354, 1.5762, 1.6436, 1.4282], + device='cuda:2'), covar=tensor([0.0029, 0.0042, 0.0074, 0.0063, 0.0078, 0.0059, 0.0044, 0.0055], + device='cuda:2'), in_proj_covar=tensor([0.0030, 0.0028, 0.0027, 0.0036, 0.0031, 0.0028, 0.0035, 0.0034], + device='cuda:2'), out_proj_covar=tensor([2.7300e-05, 2.5640e-05, 2.4632e-05, 3.4555e-05, 2.9031e-05, 2.7246e-05, + 3.4085e-05, 3.2463e-05], device='cuda:2') +2022-11-16 06:09:53,705 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=86725.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:09:58,499 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.4956, 2.4429, 3.0011, 1.7881, 1.2775, 3.2304, 2.8836, 2.2136], + device='cuda:2'), covar=tensor([0.1103, 0.1113, 0.0650, 0.3000, 0.2985, 0.1804, 0.0715, 0.1391], + device='cuda:2'), in_proj_covar=tensor([0.0109, 0.0099, 0.0098, 0.0104, 0.0078, 0.0070, 0.0081, 0.0093], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-16 06:10:02,549 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.026e+02 1.559e+02 1.954e+02 2.479e+02 4.501e+02, threshold=3.908e+02, percent-clipped=4.0 +2022-11-16 06:10:13,510 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.55 vs. limit=2.0 +2022-11-16 06:10:15,006 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=86756.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:10:25,967 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=86773.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:10:29,669 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.1750, 2.6110, 3.0624, 4.0654, 3.8649, 2.9376, 2.7065, 3.9786], + device='cuda:2'), covar=tensor([0.0653, 0.2484, 0.1944, 0.2555, 0.1208, 0.3030, 0.2011, 0.0853], + device='cuda:2'), in_proj_covar=tensor([0.0253, 0.0194, 0.0186, 0.0298, 0.0224, 0.0200, 0.0184, 0.0245], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0006], + device='cuda:2') +2022-11-16 06:10:39,702 INFO [train.py:876] (2/4) Epoch 12, batch 6800, loss[loss=0.143, simple_loss=0.1748, pruned_loss=0.05564, over 5263.00 frames. ], tot_loss[loss=0.1094, simple_loss=0.1381, pruned_loss=0.04035, over 1087893.77 frames. ], batch size: 79, lr: 6.60e-03, grad_scale: 8.0 +2022-11-16 06:10:41,679 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=86796.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:10:45,596 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=86801.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 06:10:54,177 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=86814.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:11:03,396 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.31 vs. limit=2.0 +2022-11-16 06:11:10,428 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.000e+02 1.446e+02 1.789e+02 2.436e+02 4.053e+02, threshold=3.578e+02, percent-clipped=1.0 +2022-11-16 06:11:16,405 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.6047, 1.6060, 1.8557, 1.6430, 1.0149, 1.4508, 1.1156, 1.2778], + device='cuda:2'), covar=tensor([0.0145, 0.0074, 0.0084, 0.0096, 0.0247, 0.0106, 0.0180, 0.0128], + device='cuda:2'), in_proj_covar=tensor([0.0195, 0.0180, 0.0182, 0.0205, 0.0193, 0.0182, 0.0189, 0.0185], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 06:11:29,273 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.66 vs. limit=2.0 +2022-11-16 06:11:35,475 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=86875.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:11:37,382 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=86878.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:11:47,379 INFO [train.py:876] (2/4) Epoch 12, batch 6900, loss[loss=0.08885, simple_loss=0.1205, pruned_loss=0.02858, over 5470.00 frames. ], tot_loss[loss=0.1072, simple_loss=0.1366, pruned_loss=0.03885, over 1084551.69 frames. ], batch size: 11, lr: 6.60e-03, grad_scale: 8.0 +2022-11-16 06:11:53,850 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=86902.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:12:17,107 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=86936.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:12:18,369 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.928e+01 1.457e+02 1.817e+02 2.231e+02 4.523e+02, threshold=3.633e+02, percent-clipped=5.0 +2022-11-16 06:12:19,218 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=86939.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:12:26,805 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=86950.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:12:48,371 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.72 vs. limit=2.0 +2022-11-16 06:12:55,766 INFO [train.py:876] (2/4) Epoch 12, batch 7000, loss[loss=0.1072, simple_loss=0.1434, pruned_loss=0.03555, over 5706.00 frames. ], tot_loss[loss=0.1076, simple_loss=0.137, pruned_loss=0.03907, over 1087348.08 frames. ], batch size: 34, lr: 6.60e-03, grad_scale: 8.0 +2022-11-16 06:13:02,858 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=87002.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 06:13:05,985 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.4894, 5.0334, 4.6197, 5.1316, 5.0479, 4.2235, 4.6595, 4.4872], + device='cuda:2'), covar=tensor([0.0325, 0.0404, 0.1157, 0.0369, 0.0398, 0.0505, 0.0424, 0.0601], + device='cuda:2'), in_proj_covar=tensor([0.0133, 0.0178, 0.0276, 0.0175, 0.0222, 0.0174, 0.0189, 0.0177], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0004, 0.0002, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 06:13:13,636 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.93 vs. limit=2.0 +2022-11-16 06:13:26,388 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.516e+01 1.515e+02 1.846e+02 2.332e+02 4.129e+02, threshold=3.691e+02, percent-clipped=3.0 +2022-11-16 06:13:28,421 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.6700, 3.8904, 3.9048, 3.6648, 3.8704, 3.7248, 1.3970, 3.9108], + device='cuda:2'), covar=tensor([0.0292, 0.0246, 0.0253, 0.0303, 0.0295, 0.0370, 0.3062, 0.0305], + device='cuda:2'), in_proj_covar=tensor([0.0105, 0.0088, 0.0088, 0.0082, 0.0103, 0.0090, 0.0131, 0.0108], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 06:13:38,849 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.71 vs. limit=5.0 +2022-11-16 06:13:43,355 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=87063.0, num_to_drop=1, layers_to_drop={3} +2022-11-16 06:14:03,310 INFO [train.py:876] (2/4) Epoch 12, batch 7100, loss[loss=0.08673, simple_loss=0.1157, pruned_loss=0.0289, over 5680.00 frames. ], tot_loss[loss=0.1066, simple_loss=0.1363, pruned_loss=0.03843, over 1084627.61 frames. ], batch size: 11, lr: 6.59e-03, grad_scale: 8.0 +2022-11-16 06:14:05,395 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=87096.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:14:08,684 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=87101.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 06:14:26,984 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.83 vs. limit=2.0 +2022-11-16 06:14:33,938 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.177e+01 1.551e+02 1.888e+02 2.451e+02 4.689e+02, threshold=3.775e+02, percent-clipped=4.0 +2022-11-16 06:14:37,912 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=87144.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:14:41,187 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=87149.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:14:55,688 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=87170.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:15:11,223 INFO [train.py:876] (2/4) Epoch 12, batch 7200, loss[loss=0.1159, simple_loss=0.1455, pruned_loss=0.04316, over 5541.00 frames. ], tot_loss[loss=0.1076, simple_loss=0.1369, pruned_loss=0.03909, over 1080643.37 frames. ], batch size: 46, lr: 6.59e-03, grad_scale: 8.0 +2022-11-16 06:15:36,443 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=87230.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:15:39,016 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=87234.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:15:40,328 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=87236.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:15:41,450 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.393e+01 1.550e+02 1.929e+02 2.381e+02 4.425e+02, threshold=3.859e+02, percent-clipped=3.0 +2022-11-16 06:15:52,518 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.3106, 1.6399, 1.2008, 1.3870, 1.4946, 1.4976, 1.3426, 1.6811], + device='cuda:2'), covar=tensor([0.0067, 0.0053, 0.0062, 0.0059, 0.0053, 0.0048, 0.0060, 0.0050], + device='cuda:2'), in_proj_covar=tensor([0.0061, 0.0057, 0.0057, 0.0060, 0.0059, 0.0054, 0.0054, 0.0052], + device='cuda:2'), out_proj_covar=tensor([5.5056e-05, 5.0573e-05, 4.9926e-05, 5.3525e-05, 5.2426e-05, 4.7398e-05, + 4.7818e-05, 4.5498e-05], device='cuda:2') +2022-11-16 06:15:54,342 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.9475, 3.0134, 3.1426, 2.8909, 3.0867, 2.9100, 1.3195, 3.1576], + device='cuda:2'), covar=tensor([0.0336, 0.0344, 0.0288, 0.0326, 0.0327, 0.0434, 0.2979, 0.0326], + device='cuda:2'), in_proj_covar=tensor([0.0104, 0.0088, 0.0088, 0.0082, 0.0102, 0.0090, 0.0130, 0.0107], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 06:16:42,443 INFO [train.py:876] (2/4) Epoch 13, batch 0, loss[loss=0.06752, simple_loss=0.1012, pruned_loss=0.0169, over 5660.00 frames. ], tot_loss[loss=0.06752, simple_loss=0.1012, pruned_loss=0.0169, over 5660.00 frames. ], batch size: 12, lr: 6.33e-03, grad_scale: 16.0 +2022-11-16 06:16:42,444 INFO [train.py:899] (2/4) Computing validation loss +2022-11-16 06:16:58,482 INFO [train.py:908] (2/4) Epoch 13, validation: loss=0.175, simple_loss=0.1891, pruned_loss=0.08049, over 1530663.00 frames. +2022-11-16 06:16:58,483 INFO [train.py:909] (2/4) Maximum memory allocated so far is 4715MB +2022-11-16 06:17:11,205 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=87284.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:17:16,596 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=87291.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:17:47,398 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.806e+01 1.427e+02 1.803e+02 2.265e+02 3.823e+02, threshold=3.607e+02, percent-clipped=0.0 +2022-11-16 06:18:01,509 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=87358.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 06:18:02,779 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1132, 3.8761, 2.6069, 3.6378, 3.0219, 2.7552, 2.0695, 3.3142], + device='cuda:2'), covar=tensor([0.1619, 0.0301, 0.1203, 0.0430, 0.0924, 0.1076, 0.1993, 0.0474], + device='cuda:2'), in_proj_covar=tensor([0.0155, 0.0144, 0.0158, 0.0150, 0.0174, 0.0169, 0.0159, 0.0160], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 06:18:04,464 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=87362.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:18:06,214 INFO [train.py:876] (2/4) Epoch 13, batch 100, loss[loss=0.1126, simple_loss=0.1495, pruned_loss=0.03783, over 5584.00 frames. ], tot_loss[loss=0.1071, simple_loss=0.1373, pruned_loss=0.03847, over 434373.17 frames. ], batch size: 50, lr: 6.32e-03, grad_scale: 16.0 +2022-11-16 06:18:31,353 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.9407, 1.8339, 2.4122, 1.7813, 2.5029, 1.7991, 1.6170, 1.9367], + device='cuda:2'), covar=tensor([0.0668, 0.0678, 0.0340, 0.0763, 0.0216, 0.1145, 0.0557, 0.0336], + device='cuda:2'), in_proj_covar=tensor([0.0015, 0.0024, 0.0017, 0.0021, 0.0017, 0.0015, 0.0023, 0.0016], + device='cuda:2'), out_proj_covar=tensor([8.6557e-05, 1.2017e-04, 9.1650e-05, 1.0653e-04, 9.3810e-05, 8.6139e-05, + 1.1534e-04, 8.8493e-05], device='cuda:2') +2022-11-16 06:18:43,684 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.72 vs. limit=2.0 +2022-11-16 06:18:45,474 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=87423.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:18:52,773 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.0592, 4.5375, 4.1147, 4.5901, 4.5424, 3.8030, 4.2706, 3.9630], + device='cuda:2'), covar=tensor([0.0563, 0.0542, 0.1351, 0.0368, 0.0487, 0.0519, 0.0770, 0.0633], + device='cuda:2'), in_proj_covar=tensor([0.0131, 0.0176, 0.0273, 0.0174, 0.0221, 0.0172, 0.0189, 0.0176], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0004, 0.0002, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 06:18:55,227 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.013e+01 1.498e+02 1.837e+02 2.189e+02 4.153e+02, threshold=3.674e+02, percent-clipped=6.0 +2022-11-16 06:19:10,505 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=87461.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:19:12,929 INFO [train.py:876] (2/4) Epoch 13, batch 200, loss[loss=0.1203, simple_loss=0.152, pruned_loss=0.04431, over 5620.00 frames. ], tot_loss[loss=0.1075, simple_loss=0.137, pruned_loss=0.03901, over 691078.84 frames. ], batch size: 38, lr: 6.32e-03, grad_scale: 16.0 +2022-11-16 06:19:17,078 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=87470.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:19:25,869 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.9100, 5.3211, 5.5517, 5.1198, 5.9328, 5.7480, 4.9140, 5.9158], + device='cuda:2'), covar=tensor([0.0302, 0.0302, 0.0478, 0.0298, 0.0309, 0.0164, 0.0202, 0.0187], + device='cuda:2'), in_proj_covar=tensor([0.0147, 0.0156, 0.0111, 0.0146, 0.0183, 0.0109, 0.0128, 0.0157], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-16 06:19:40,370 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=87505.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:19:47,577 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.33 vs. limit=5.0 +2022-11-16 06:19:49,630 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=87518.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:19:52,436 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=87522.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:20:00,740 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=87534.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:20:03,214 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.494e+01 1.564e+02 1.812e+02 2.322e+02 4.189e+02, threshold=3.625e+02, percent-clipped=2.0 +2022-11-16 06:20:08,626 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.4871, 1.7615, 1.4390, 1.4650, 1.5815, 1.6262, 1.4990, 1.6602], + device='cuda:2'), covar=tensor([0.0064, 0.0064, 0.0055, 0.0063, 0.0059, 0.0051, 0.0060, 0.0051], + device='cuda:2'), in_proj_covar=tensor([0.0062, 0.0057, 0.0057, 0.0060, 0.0059, 0.0054, 0.0054, 0.0052], + device='cuda:2'), out_proj_covar=tensor([5.5369e-05, 5.0272e-05, 4.9739e-05, 5.3716e-05, 5.2107e-05, 4.7294e-05, + 4.8060e-05, 4.5184e-05], device='cuda:2') +2022-11-16 06:20:21,146 INFO [train.py:876] (2/4) Epoch 13, batch 300, loss[loss=0.0862, simple_loss=0.1189, pruned_loss=0.02673, over 5439.00 frames. ], tot_loss[loss=0.1079, simple_loss=0.1373, pruned_loss=0.03925, over 842496.67 frames. ], batch size: 10, lr: 6.32e-03, grad_scale: 16.0 +2022-11-16 06:20:21,984 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=87566.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 06:20:33,024 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=87582.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:20:34,419 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=87584.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:20:35,589 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=87586.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:20:37,645 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=87589.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:20:48,681 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=87605.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:20:49,291 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.6773, 5.1946, 4.7378, 5.1771, 5.1650, 4.4800, 4.8286, 4.5532], + device='cuda:2'), covar=tensor([0.0217, 0.0410, 0.1343, 0.0339, 0.0362, 0.0418, 0.0515, 0.0608], + device='cuda:2'), in_proj_covar=tensor([0.0132, 0.0178, 0.0276, 0.0176, 0.0223, 0.0174, 0.0191, 0.0179], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0004, 0.0002, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 06:21:11,332 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.900e+01 1.347e+02 1.607e+02 1.950e+02 4.005e+02, threshold=3.214e+02, percent-clipped=2.0 +2022-11-16 06:21:16,204 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=87645.0, num_to_drop=1, layers_to_drop={3} +2022-11-16 06:21:19,742 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=87650.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:21:24,824 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=87658.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 06:21:26,159 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.5658, 1.7167, 1.5305, 1.4347, 1.6908, 1.6395, 1.5426, 1.5283], + device='cuda:2'), covar=tensor([0.0068, 0.0054, 0.0054, 0.0061, 0.0049, 0.0041, 0.0056, 0.0071], + device='cuda:2'), in_proj_covar=tensor([0.0062, 0.0057, 0.0057, 0.0061, 0.0059, 0.0055, 0.0054, 0.0052], + device='cuda:2'), out_proj_covar=tensor([5.5444e-05, 5.0532e-05, 5.0056e-05, 5.4098e-05, 5.2539e-05, 4.7514e-05, + 4.8261e-05, 4.5457e-05], device='cuda:2') +2022-11-16 06:21:29,304 INFO [train.py:876] (2/4) Epoch 13, batch 400, loss[loss=0.07791, simple_loss=0.1183, pruned_loss=0.01874, over 5596.00 frames. ], tot_loss[loss=0.1058, simple_loss=0.1361, pruned_loss=0.0378, over 940204.05 frames. ], batch size: 25, lr: 6.31e-03, grad_scale: 16.0 +2022-11-16 06:21:30,130 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=87666.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 06:21:46,742 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.44 vs. limit=2.0 +2022-11-16 06:21:57,181 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=87706.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 06:22:00,911 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.70 vs. limit=2.0 +2022-11-16 06:22:04,476 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.3194, 3.7097, 3.3276, 3.6785, 3.7218, 3.1863, 3.2998, 3.2073], + device='cuda:2'), covar=tensor([0.0960, 0.0509, 0.1446, 0.0502, 0.0555, 0.0600, 0.0775, 0.0781], + device='cuda:2'), in_proj_covar=tensor([0.0132, 0.0177, 0.0276, 0.0175, 0.0223, 0.0175, 0.0190, 0.0179], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0004, 0.0002, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 06:22:05,083 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=87718.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:22:19,049 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.955e+01 1.567e+02 1.911e+02 2.428e+02 4.922e+02, threshold=3.823e+02, percent-clipped=4.0 +2022-11-16 06:22:37,356 INFO [train.py:876] (2/4) Epoch 13, batch 500, loss[loss=0.09318, simple_loss=0.1352, pruned_loss=0.02557, over 5572.00 frames. ], tot_loss[loss=0.1085, simple_loss=0.1378, pruned_loss=0.03965, over 995401.17 frames. ], batch size: 24, lr: 6.31e-03, grad_scale: 16.0 +2022-11-16 06:23:12,961 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=87817.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:23:16,373 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.2728, 2.2736, 2.9927, 2.7132, 2.8267, 2.3241, 2.8865, 3.2768], + device='cuda:2'), covar=tensor([0.0872, 0.1463, 0.0874, 0.1511, 0.0801, 0.1456, 0.1101, 0.0855], + device='cuda:2'), in_proj_covar=tensor([0.0245, 0.0194, 0.0218, 0.0213, 0.0242, 0.0197, 0.0227, 0.0230], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 06:23:17,572 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.3052, 4.8154, 5.0092, 4.6444, 5.1106, 4.6269, 2.6037, 5.3964], + device='cuda:2'), covar=tensor([0.0159, 0.0233, 0.0260, 0.0333, 0.0322, 0.0652, 0.2406, 0.0253], + device='cuda:2'), in_proj_covar=tensor([0.0103, 0.0087, 0.0088, 0.0081, 0.0102, 0.0089, 0.0129, 0.0107], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 06:23:26,588 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.493e+01 1.445e+02 1.920e+02 2.398e+02 4.024e+02, threshold=3.840e+02, percent-clipped=2.0 +2022-11-16 06:23:42,716 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=87861.0, num_to_drop=1, layers_to_drop={3} +2022-11-16 06:23:45,211 INFO [train.py:876] (2/4) Epoch 13, batch 600, loss[loss=0.1004, simple_loss=0.1317, pruned_loss=0.03455, over 5711.00 frames. ], tot_loss[loss=0.1061, simple_loss=0.1361, pruned_loss=0.038, over 1031182.21 frames. ], batch size: 17, lr: 6.31e-03, grad_scale: 16.0 +2022-11-16 06:23:57,744 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.48 vs. limit=5.0 +2022-11-16 06:23:59,302 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=87886.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:24:21,220 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.4213, 1.6315, 1.3061, 1.3272, 1.5036, 1.3729, 1.1888, 1.5140], + device='cuda:2'), covar=tensor([0.0068, 0.0058, 0.0066, 0.0076, 0.0064, 0.0056, 0.0074, 0.0079], + device='cuda:2'), in_proj_covar=tensor([0.0061, 0.0057, 0.0057, 0.0060, 0.0059, 0.0054, 0.0053, 0.0052], + device='cuda:2'), out_proj_covar=tensor([5.4527e-05, 5.0570e-05, 4.9505e-05, 5.3407e-05, 5.2657e-05, 4.7373e-05, + 4.7440e-05, 4.5210e-05], device='cuda:2') +2022-11-16 06:24:31,962 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=87934.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:24:35,163 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.722e+01 1.443e+02 1.741e+02 2.053e+02 3.488e+02, threshold=3.481e+02, percent-clipped=0.0 +2022-11-16 06:24:35,929 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=87940.0, num_to_drop=1, layers_to_drop={2} +2022-11-16 06:24:38,964 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=6.32 vs. limit=5.0 +2022-11-16 06:24:39,196 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=87945.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:24:49,807 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=87961.0, num_to_drop=1, layers_to_drop={3} +2022-11-16 06:24:52,678 INFO [train.py:876] (2/4) Epoch 13, batch 700, loss[loss=0.09845, simple_loss=0.1337, pruned_loss=0.03158, over 5670.00 frames. ], tot_loss[loss=0.1055, simple_loss=0.1356, pruned_loss=0.03768, over 1055050.40 frames. ], batch size: 19, lr: 6.30e-03, grad_scale: 8.0 +2022-11-16 06:25:09,901 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.4074, 4.1576, 2.7613, 3.8744, 3.1440, 2.8279, 2.0798, 3.5258], + device='cuda:2'), covar=tensor([0.1459, 0.0227, 0.1109, 0.0374, 0.0927, 0.1055, 0.2039, 0.0399], + device='cuda:2'), in_proj_covar=tensor([0.0154, 0.0143, 0.0156, 0.0148, 0.0174, 0.0168, 0.0158, 0.0158], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 06:25:19,994 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.9685, 2.3630, 2.4301, 1.5273, 2.6089, 2.8624, 2.5717, 2.9612], + device='cuda:2'), covar=tensor([0.1895, 0.1751, 0.1611, 0.2767, 0.0691, 0.1288, 0.0620, 0.1016], + device='cuda:2'), in_proj_covar=tensor([0.0165, 0.0182, 0.0168, 0.0183, 0.0181, 0.0201, 0.0167, 0.0182], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 06:25:29,014 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=88018.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:25:42,925 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.823e+01 1.432e+02 1.789e+02 2.095e+02 4.590e+02, threshold=3.577e+02, percent-clipped=1.0 +2022-11-16 06:26:00,229 INFO [train.py:876] (2/4) Epoch 13, batch 800, loss[loss=0.1067, simple_loss=0.1444, pruned_loss=0.03448, over 5551.00 frames. ], tot_loss[loss=0.1042, simple_loss=0.1345, pruned_loss=0.03699, over 1068675.67 frames. ], batch size: 30, lr: 6.30e-03, grad_scale: 8.0 +2022-11-16 06:26:01,329 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=88066.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:26:24,656 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=88100.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:26:36,773 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=88117.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:26:50,583 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.13 vs. limit=2.0 +2022-11-16 06:26:51,417 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.479e+01 1.431e+02 1.753e+02 2.206e+02 3.833e+02, threshold=3.505e+02, percent-clipped=1.0 +2022-11-16 06:27:05,808 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=88161.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 06:27:05,853 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=88161.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:27:08,639 INFO [train.py:876] (2/4) Epoch 13, batch 900, loss[loss=0.1125, simple_loss=0.1328, pruned_loss=0.04609, over 4999.00 frames. ], tot_loss[loss=0.1041, simple_loss=0.1344, pruned_loss=0.0369, over 1074146.31 frames. ], batch size: 109, lr: 6.30e-03, grad_scale: 8.0 +2022-11-16 06:27:08,681 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=88165.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:27:10,727 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.0803, 0.9107, 0.9765, 0.9426, 1.1189, 1.0576, 0.5825, 0.8916], + device='cuda:2'), covar=tensor([0.0259, 0.0429, 0.0326, 0.0487, 0.0343, 0.0289, 0.0811, 0.0336], + device='cuda:2'), in_proj_covar=tensor([0.0015, 0.0023, 0.0017, 0.0020, 0.0017, 0.0015, 0.0023, 0.0016], + device='cuda:2'), out_proj_covar=tensor([8.3882e-05, 1.1595e-04, 8.8792e-05, 1.0300e-04, 9.0727e-05, 8.5130e-05, + 1.1282e-04, 8.5882e-05], device='cuda:2') +2022-11-16 06:27:38,498 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=88209.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:27:59,344 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.909e+01 1.508e+02 1.868e+02 2.272e+02 4.107e+02, threshold=3.735e+02, percent-clipped=5.0 +2022-11-16 06:28:00,104 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=88240.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:28:03,436 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=88245.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:28:14,162 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=88261.0, num_to_drop=1, layers_to_drop={2} +2022-11-16 06:28:16,638 INFO [train.py:876] (2/4) Epoch 13, batch 1000, loss[loss=0.0722, simple_loss=0.1079, pruned_loss=0.01828, over 5170.00 frames. ], tot_loss[loss=0.1045, simple_loss=0.1351, pruned_loss=0.03693, over 1083104.68 frames. ], batch size: 8, lr: 6.29e-03, grad_scale: 8.0 +2022-11-16 06:28:32,435 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=88288.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:28:34,837 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=88291.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 06:28:36,007 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=88293.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:28:45,257 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.1083, 2.5028, 2.9643, 3.8657, 3.9041, 3.2023, 2.8771, 3.9351], + device='cuda:2'), covar=tensor([0.0766, 0.2918, 0.2489, 0.3146, 0.1425, 0.2722, 0.2127, 0.0600], + device='cuda:2'), in_proj_covar=tensor([0.0258, 0.0197, 0.0189, 0.0301, 0.0227, 0.0202, 0.0190, 0.0249], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0006], + device='cuda:2') +2022-11-16 06:28:46,441 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=88309.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:29:06,586 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 6.385e+01 1.405e+02 1.701e+02 2.123e+02 3.653e+02, threshold=3.402e+02, percent-clipped=0.0 +2022-11-16 06:29:16,003 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=88352.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 06:29:24,128 INFO [train.py:876] (2/4) Epoch 13, batch 1100, loss[loss=0.1112, simple_loss=0.1347, pruned_loss=0.0438, over 5134.00 frames. ], tot_loss[loss=0.1032, simple_loss=0.1344, pruned_loss=0.03606, over 1083544.38 frames. ], batch size: 91, lr: 6.29e-03, grad_scale: 8.0 +2022-11-16 06:29:44,195 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.89 vs. limit=5.0 +2022-11-16 06:30:13,851 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.379e+01 1.472e+02 1.907e+02 2.402e+02 6.330e+02, threshold=3.813e+02, percent-clipped=8.0 +2022-11-16 06:30:22,578 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.23 vs. limit=2.0 +2022-11-16 06:30:25,730 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=88456.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:30:31,426 INFO [train.py:876] (2/4) Epoch 13, batch 1200, loss[loss=0.08935, simple_loss=0.1255, pruned_loss=0.02662, over 5318.00 frames. ], tot_loss[loss=0.1018, simple_loss=0.1329, pruned_loss=0.0354, over 1080448.18 frames. ], batch size: 9, lr: 6.28e-03, grad_scale: 8.0 +2022-11-16 06:31:16,954 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.9680, 3.1231, 3.1465, 3.0184, 3.1596, 2.9451, 1.2709, 3.2457], + device='cuda:2'), covar=tensor([0.0332, 0.0314, 0.0316, 0.0317, 0.0293, 0.0430, 0.3046, 0.0311], + device='cuda:2'), in_proj_covar=tensor([0.0106, 0.0089, 0.0088, 0.0083, 0.0103, 0.0091, 0.0131, 0.0107], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 06:31:21,164 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.717e+01 1.518e+02 1.854e+02 2.184e+02 7.084e+02, threshold=3.708e+02, percent-clipped=2.0 +2022-11-16 06:31:38,933 INFO [train.py:876] (2/4) Epoch 13, batch 1300, loss[loss=0.07648, simple_loss=0.1013, pruned_loss=0.02582, over 5462.00 frames. ], tot_loss[loss=0.1037, simple_loss=0.1343, pruned_loss=0.03655, over 1079808.29 frames. ], batch size: 11, lr: 6.28e-03, grad_scale: 8.0 +2022-11-16 06:31:43,975 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.83 vs. limit=2.0 +2022-11-16 06:32:03,297 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.38 vs. limit=5.0 +2022-11-16 06:32:07,130 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.5662, 2.3167, 2.6286, 2.0516, 1.2968, 3.2083, 2.7724, 2.3098], + device='cuda:2'), covar=tensor([0.0788, 0.0997, 0.0781, 0.2324, 0.3222, 0.2403, 0.0848, 0.1235], + device='cuda:2'), in_proj_covar=tensor([0.0109, 0.0100, 0.0100, 0.0102, 0.0076, 0.0070, 0.0081, 0.0092], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-16 06:32:28,428 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.235e+01 1.439e+02 1.728e+02 2.189e+02 4.268e+02, threshold=3.455e+02, percent-clipped=2.0 +2022-11-16 06:32:31,862 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.9760, 2.5921, 2.3670, 1.5972, 2.7497, 2.7674, 2.7280, 2.8997], + device='cuda:2'), covar=tensor([0.1814, 0.1638, 0.1806, 0.2524, 0.0835, 0.1475, 0.0541, 0.1041], + device='cuda:2'), in_proj_covar=tensor([0.0167, 0.0184, 0.0171, 0.0184, 0.0184, 0.0205, 0.0170, 0.0185], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 06:32:33,693 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=88647.0, num_to_drop=1, layers_to_drop={3} +2022-11-16 06:32:38,968 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.63 vs. limit=2.0 +2022-11-16 06:32:45,386 INFO [train.py:876] (2/4) Epoch 13, batch 1400, loss[loss=0.08228, simple_loss=0.127, pruned_loss=0.01877, over 5564.00 frames. ], tot_loss[loss=0.1048, simple_loss=0.1351, pruned_loss=0.0373, over 1082314.64 frames. ], batch size: 25, lr: 6.28e-03, grad_scale: 8.0 +2022-11-16 06:33:34,910 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.011e+02 1.374e+02 1.560e+02 2.014e+02 3.886e+02, threshold=3.121e+02, percent-clipped=4.0 +2022-11-16 06:33:40,954 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.8505, 3.9380, 3.7208, 3.5851, 3.9176, 3.7293, 1.4366, 3.9903], + device='cuda:2'), covar=tensor([0.0297, 0.0262, 0.0337, 0.0375, 0.0377, 0.0343, 0.3172, 0.0341], + device='cuda:2'), in_proj_covar=tensor([0.0108, 0.0091, 0.0090, 0.0085, 0.0105, 0.0092, 0.0134, 0.0110], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 06:33:46,719 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=88756.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:33:52,516 INFO [train.py:876] (2/4) Epoch 13, batch 1500, loss[loss=0.09374, simple_loss=0.1309, pruned_loss=0.02827, over 5699.00 frames. ], tot_loss[loss=0.1062, simple_loss=0.136, pruned_loss=0.03816, over 1084976.33 frames. ], batch size: 14, lr: 6.27e-03, grad_scale: 8.0 +2022-11-16 06:33:55,378 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.42 vs. limit=5.0 +2022-11-16 06:34:19,411 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=88804.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:34:33,414 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.4861, 2.9217, 3.5039, 1.5853, 3.0894, 3.6187, 3.4644, 3.8855], + device='cuda:2'), covar=tensor([0.1750, 0.1529, 0.0936, 0.2974, 0.0556, 0.0952, 0.0452, 0.0616], + device='cuda:2'), in_proj_covar=tensor([0.0166, 0.0183, 0.0169, 0.0183, 0.0182, 0.0202, 0.0169, 0.0183], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 06:34:37,855 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.0001, 4.7467, 4.7604, 4.9207, 4.4664, 4.2516, 5.3595, 4.7354], + device='cuda:2'), covar=tensor([0.0442, 0.1131, 0.0544, 0.1289, 0.0615, 0.0385, 0.0770, 0.0776], + device='cuda:2'), in_proj_covar=tensor([0.0088, 0.0109, 0.0097, 0.0123, 0.0091, 0.0082, 0.0148, 0.0105], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 06:34:42,719 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.033e+02 1.503e+02 1.931e+02 2.477e+02 5.840e+02, threshold=3.862e+02, percent-clipped=6.0 +2022-11-16 06:34:48,644 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.8685, 3.0886, 2.5154, 2.9826, 3.0431, 2.9330, 2.9996, 3.0862], + device='cuda:2'), covar=tensor([0.1943, 0.0912, 0.2100, 0.1017, 0.1045, 0.0716, 0.0878, 0.0740], + device='cuda:2'), in_proj_covar=tensor([0.0134, 0.0178, 0.0276, 0.0175, 0.0223, 0.0175, 0.0191, 0.0178], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0004, 0.0002, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 06:34:55,618 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.8495, 2.6065, 2.0464, 2.4923, 2.5194, 2.4235, 2.5112, 2.6130], + device='cuda:2'), covar=tensor([0.0453, 0.1177, 0.2841, 0.1301, 0.1421, 0.0975, 0.1359, 0.0928], + device='cuda:2'), in_proj_covar=tensor([0.0134, 0.0178, 0.0277, 0.0175, 0.0224, 0.0175, 0.0191, 0.0178], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0004, 0.0002, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 06:35:00,134 INFO [train.py:876] (2/4) Epoch 13, batch 1600, loss[loss=0.1035, simple_loss=0.1382, pruned_loss=0.03439, over 5643.00 frames. ], tot_loss[loss=0.1061, simple_loss=0.1361, pruned_loss=0.03809, over 1079625.09 frames. ], batch size: 32, lr: 6.27e-03, grad_scale: 8.0 +2022-11-16 06:35:49,234 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.059e+02 1.383e+02 1.790e+02 2.013e+02 5.184e+02, threshold=3.580e+02, percent-clipped=2.0 +2022-11-16 06:35:55,090 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=88947.0, num_to_drop=1, layers_to_drop={2} +2022-11-16 06:36:07,124 INFO [train.py:876] (2/4) Epoch 13, batch 1700, loss[loss=0.1245, simple_loss=0.147, pruned_loss=0.05101, over 5736.00 frames. ], tot_loss[loss=0.1047, simple_loss=0.1352, pruned_loss=0.03706, over 1084234.28 frames. ], batch size: 20, lr: 6.27e-03, grad_scale: 8.0 +2022-11-16 06:36:19,424 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.2071, 2.6144, 3.1819, 1.8400, 1.8352, 3.8511, 3.0500, 2.5005], + device='cuda:2'), covar=tensor([0.0719, 0.1128, 0.0723, 0.2865, 0.3321, 0.0706, 0.0772, 0.1401], + device='cuda:2'), in_proj_covar=tensor([0.0109, 0.0100, 0.0100, 0.0102, 0.0077, 0.0070, 0.0081, 0.0093], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-16 06:36:26,871 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=88995.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 06:36:39,627 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.84 vs. limit=2.0 +2022-11-16 06:36:41,234 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0743, 2.3187, 2.3771, 2.0983, 2.3581, 2.2842, 1.1269, 2.4457], + device='cuda:2'), covar=tensor([0.0374, 0.0396, 0.0366, 0.0458, 0.0425, 0.0430, 0.2813, 0.0404], + device='cuda:2'), in_proj_covar=tensor([0.0106, 0.0090, 0.0089, 0.0084, 0.0103, 0.0091, 0.0132, 0.0108], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 06:36:49,145 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=89027.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:36:49,186 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.5666, 1.3439, 1.5548, 1.4321, 1.7489, 1.4610, 1.2231, 1.5350], + device='cuda:2'), covar=tensor([0.1065, 0.1115, 0.1435, 0.1091, 0.0991, 0.1213, 0.2158, 0.1868], + device='cuda:2'), in_proj_covar=tensor([0.0260, 0.0198, 0.0188, 0.0301, 0.0227, 0.0202, 0.0190, 0.0250], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0006], + device='cuda:2') +2022-11-16 06:36:56,974 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.337e+01 1.392e+02 1.730e+02 2.257e+02 5.092e+02, threshold=3.461e+02, percent-clipped=3.0 +2022-11-16 06:37:15,263 INFO [train.py:876] (2/4) Epoch 13, batch 1800, loss[loss=0.08875, simple_loss=0.1135, pruned_loss=0.032, over 5178.00 frames. ], tot_loss[loss=0.1033, simple_loss=0.1339, pruned_loss=0.03629, over 1080558.20 frames. ], batch size: 7, lr: 6.26e-03, grad_scale: 8.0 +2022-11-16 06:37:18,942 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.46 vs. limit=5.0 +2022-11-16 06:37:30,534 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=89088.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:38:04,921 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.586e+01 1.379e+02 1.721e+02 2.183e+02 4.295e+02, threshold=3.442e+02, percent-clipped=5.0 +2022-11-16 06:38:23,036 INFO [train.py:876] (2/4) Epoch 13, batch 1900, loss[loss=0.08794, simple_loss=0.1139, pruned_loss=0.031, over 5445.00 frames. ], tot_loss[loss=0.1038, simple_loss=0.1342, pruned_loss=0.03673, over 1078660.28 frames. ], batch size: 11, lr: 6.26e-03, grad_scale: 8.0 +2022-11-16 06:38:25,955 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=89169.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:39:00,879 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.90 vs. limit=2.0 +2022-11-16 06:39:06,972 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=89230.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:39:12,155 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.5885, 3.2201, 3.2166, 3.0079, 1.8905, 3.3067, 2.1816, 2.9118], + device='cuda:2'), covar=tensor([0.0313, 0.0207, 0.0187, 0.0385, 0.0538, 0.0151, 0.0499, 0.0167], + device='cuda:2'), in_proj_covar=tensor([0.0196, 0.0181, 0.0186, 0.0207, 0.0197, 0.0183, 0.0194, 0.0188], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 06:39:12,533 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.915e+01 1.392e+02 1.772e+02 2.206e+02 3.328e+02, threshold=3.543e+02, percent-clipped=0.0 +2022-11-16 06:39:21,583 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.0458, 3.4867, 2.9600, 3.4281, 3.4731, 3.0017, 3.0712, 3.2726], + device='cuda:2'), covar=tensor([0.1608, 0.0760, 0.2215, 0.0790, 0.0798, 0.0789, 0.1161, 0.0761], + device='cuda:2'), in_proj_covar=tensor([0.0131, 0.0177, 0.0273, 0.0175, 0.0222, 0.0175, 0.0190, 0.0176], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0004, 0.0002, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 06:39:29,335 INFO [train.py:876] (2/4) Epoch 13, batch 2000, loss[loss=0.1145, simple_loss=0.1396, pruned_loss=0.04465, over 5574.00 frames. ], tot_loss[loss=0.106, simple_loss=0.1356, pruned_loss=0.03821, over 1082393.49 frames. ], batch size: 21, lr: 6.26e-03, grad_scale: 8.0 +2022-11-16 06:39:53,689 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.8060, 2.4378, 3.3190, 2.9201, 3.1863, 2.4892, 3.1586, 3.7069], + device='cuda:2'), covar=tensor([0.0706, 0.1454, 0.0861, 0.1517, 0.0889, 0.1525, 0.1297, 0.0857], + device='cuda:2'), in_proj_covar=tensor([0.0237, 0.0190, 0.0212, 0.0206, 0.0237, 0.0193, 0.0220, 0.0225], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 06:39:57,181 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.29 vs. limit=2.0 +2022-11-16 06:40:09,768 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.5151, 2.9741, 3.5928, 2.0276, 2.0651, 3.9491, 3.3099, 2.9377], + device='cuda:2'), covar=tensor([0.0542, 0.1003, 0.0566, 0.2967, 0.2685, 0.0949, 0.0451, 0.0786], + device='cuda:2'), in_proj_covar=tensor([0.0112, 0.0104, 0.0101, 0.0106, 0.0079, 0.0072, 0.0084, 0.0095], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-16 06:40:14,845 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.6697, 3.7189, 3.6794, 3.4710, 3.7119, 3.6089, 1.2516, 3.8847], + device='cuda:2'), covar=tensor([0.0260, 0.0247, 0.0309, 0.0327, 0.0293, 0.0318, 0.3186, 0.0247], + device='cuda:2'), in_proj_covar=tensor([0.0104, 0.0089, 0.0087, 0.0083, 0.0102, 0.0090, 0.0130, 0.0107], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 06:40:20,292 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.714e+01 1.486e+02 1.827e+02 2.274e+02 3.584e+02, threshold=3.655e+02, percent-clipped=1.0 +2022-11-16 06:40:37,212 INFO [train.py:876] (2/4) Epoch 13, batch 2100, loss[loss=0.1119, simple_loss=0.1441, pruned_loss=0.03989, over 5593.00 frames. ], tot_loss[loss=0.1067, simple_loss=0.1366, pruned_loss=0.03842, over 1084803.29 frames. ], batch size: 38, lr: 6.25e-03, grad_scale: 8.0 +2022-11-16 06:40:43,516 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=89374.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:40:46,459 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.40 vs. limit=2.0 +2022-11-16 06:40:49,286 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=89383.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:41:17,738 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=89425.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:41:24,437 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=89435.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:41:26,875 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.665e+01 1.565e+02 1.857e+02 2.395e+02 6.396e+02, threshold=3.713e+02, percent-clipped=5.0 +2022-11-16 06:41:27,069 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=89439.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 06:41:44,659 INFO [train.py:876] (2/4) Epoch 13, batch 2200, loss[loss=0.09086, simple_loss=0.1226, pruned_loss=0.02958, over 5283.00 frames. ], tot_loss[loss=0.1054, simple_loss=0.1358, pruned_loss=0.0375, over 1085045.67 frames. ], batch size: 79, lr: 6.25e-03, grad_scale: 8.0 +2022-11-16 06:41:50,731 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.83 vs. limit=2.0 +2022-11-16 06:41:58,290 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=89486.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:42:03,746 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.1627, 4.5608, 4.1627, 4.5199, 4.5754, 3.8668, 4.2049, 4.0605], + device='cuda:2'), covar=tensor([0.0487, 0.0516, 0.1455, 0.0507, 0.0494, 0.0513, 0.0688, 0.0657], + device='cuda:2'), in_proj_covar=tensor([0.0133, 0.0180, 0.0276, 0.0176, 0.0225, 0.0176, 0.0192, 0.0177], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0004, 0.0003, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 06:42:07,610 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=89500.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 06:42:14,372 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=89509.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:42:21,748 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.29 vs. limit=2.0 +2022-11-16 06:42:24,621 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=89525.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:42:33,580 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.106e+01 1.371e+02 1.691e+02 2.068e+02 3.234e+02, threshold=3.383e+02, percent-clipped=0.0 +2022-11-16 06:42:51,734 INFO [train.py:876] (2/4) Epoch 13, batch 2300, loss[loss=0.1087, simple_loss=0.1321, pruned_loss=0.04267, over 5291.00 frames. ], tot_loss[loss=0.1058, simple_loss=0.136, pruned_loss=0.0378, over 1084346.88 frames. ], batch size: 79, lr: 6.25e-03, grad_scale: 8.0 +2022-11-16 06:42:55,287 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=89570.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:42:59,204 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.1708, 2.7188, 3.8544, 3.3407, 4.1088, 2.7480, 3.7197, 4.3001], + device='cuda:2'), covar=tensor([0.0743, 0.1515, 0.0792, 0.1734, 0.0503, 0.1830, 0.1201, 0.0692], + device='cuda:2'), in_proj_covar=tensor([0.0241, 0.0191, 0.0213, 0.0211, 0.0241, 0.0197, 0.0225, 0.0227], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 06:43:41,342 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.175e+01 1.502e+02 1.727e+02 2.123e+02 1.355e+03, threshold=3.453e+02, percent-clipped=6.0 +2022-11-16 06:43:44,353 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1346, 1.6445, 2.0700, 1.8539, 1.7803, 1.8208, 2.0736, 1.5543], + device='cuda:2'), covar=tensor([0.0028, 0.0144, 0.0042, 0.0060, 0.0079, 0.0069, 0.0041, 0.0062], + device='cuda:2'), in_proj_covar=tensor([0.0031, 0.0027, 0.0028, 0.0036, 0.0032, 0.0029, 0.0035, 0.0034], + device='cuda:2'), out_proj_covar=tensor([2.8009e-05, 2.5612e-05, 2.5406e-05, 3.4423e-05, 2.9381e-05, 2.7618e-05, + 3.3883e-05, 3.2635e-05], device='cuda:2') +2022-11-16 06:44:00,257 INFO [train.py:876] (2/4) Epoch 13, batch 2400, loss[loss=0.1003, simple_loss=0.1232, pruned_loss=0.03864, over 4588.00 frames. ], tot_loss[loss=0.1062, simple_loss=0.136, pruned_loss=0.03816, over 1083404.42 frames. ], batch size: 5, lr: 6.24e-03, grad_scale: 8.0 +2022-11-16 06:44:12,343 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=89680.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:44:14,485 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=89683.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:44:24,433 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.8693, 2.2634, 2.4215, 3.0619, 3.0797, 2.4780, 2.1669, 3.0613], + device='cuda:2'), covar=tensor([0.1501, 0.2226, 0.2018, 0.1730, 0.1279, 0.2533, 0.1993, 0.1029], + device='cuda:2'), in_proj_covar=tensor([0.0257, 0.0197, 0.0187, 0.0296, 0.0227, 0.0202, 0.0189, 0.0249], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0006], + device='cuda:2') +2022-11-16 06:44:47,033 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=89730.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:44:47,660 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=89731.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:44:52,831 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.083e+02 1.552e+02 1.853e+02 2.424e+02 4.958e+02, threshold=3.705e+02, percent-clipped=7.0 +2022-11-16 06:44:54,411 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=89741.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:45:08,610 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.9185, 4.5891, 4.7673, 4.8368, 4.5593, 4.2634, 5.1455, 4.6740], + device='cuda:2'), covar=tensor([0.0365, 0.0728, 0.0440, 0.1026, 0.0382, 0.0361, 0.0687, 0.0568], + device='cuda:2'), in_proj_covar=tensor([0.0087, 0.0108, 0.0096, 0.0123, 0.0090, 0.0081, 0.0148, 0.0104], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 06:45:09,834 INFO [train.py:876] (2/4) Epoch 13, batch 2500, loss[loss=0.1057, simple_loss=0.143, pruned_loss=0.03414, over 5759.00 frames. ], tot_loss[loss=0.1057, simple_loss=0.136, pruned_loss=0.03766, over 1087842.95 frames. ], batch size: 16, lr: 6.24e-03, grad_scale: 8.0 +2022-11-16 06:45:21,167 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=89781.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:45:30,638 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=89795.0, num_to_drop=1, layers_to_drop={3} +2022-11-16 06:45:50,800 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=89825.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:45:57,168 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.96 vs. limit=2.0 +2022-11-16 06:46:00,866 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.702e+01 1.449e+02 1.693e+02 2.128e+02 5.529e+02, threshold=3.385e+02, percent-clipped=3.0 +2022-11-16 06:46:17,700 INFO [train.py:876] (2/4) Epoch 13, batch 2600, loss[loss=0.1311, simple_loss=0.1479, pruned_loss=0.05718, over 5457.00 frames. ], tot_loss[loss=0.1059, simple_loss=0.136, pruned_loss=0.0379, over 1086491.16 frames. ], batch size: 58, lr: 6.24e-03, grad_scale: 8.0 +2022-11-16 06:46:17,788 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=89865.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:46:17,864 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=89865.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:46:23,249 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=89873.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:46:36,559 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.4951, 1.2151, 1.4438, 0.9594, 1.4870, 1.4146, 0.8348, 1.3016], + device='cuda:2'), covar=tensor([0.0524, 0.0619, 0.0445, 0.0756, 0.0567, 0.0846, 0.0921, 0.0378], + device='cuda:2'), in_proj_covar=tensor([0.0017, 0.0027, 0.0019, 0.0023, 0.0019, 0.0017, 0.0025, 0.0018], + device='cuda:2'), out_proj_covar=tensor([9.4381e-05, 1.3276e-04, 1.0031e-04, 1.1536e-04, 1.0330e-04, 9.6421e-05, + 1.2587e-04, 9.6298e-05], device='cuda:2') +2022-11-16 06:46:58,596 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=89926.0, num_to_drop=1, layers_to_drop={2} +2022-11-16 06:47:04,674 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.42 vs. limit=2.0 +2022-11-16 06:47:07,227 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.464e+01 1.385e+02 1.759e+02 2.199e+02 3.359e+02, threshold=3.518e+02, percent-clipped=0.0 +2022-11-16 06:47:24,943 INFO [train.py:876] (2/4) Epoch 13, batch 2700, loss[loss=0.1201, simple_loss=0.1467, pruned_loss=0.04673, over 5704.00 frames. ], tot_loss[loss=0.1041, simple_loss=0.1348, pruned_loss=0.03674, over 1089243.29 frames. ], batch size: 36, lr: 6.23e-03, grad_scale: 16.0 +2022-11-16 06:47:25,656 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.9118, 5.7279, 5.4684, 5.8850, 5.5574, 5.2731, 6.3791, 5.7172], + device='cuda:2'), covar=tensor([0.0303, 0.0619, 0.0284, 0.1054, 0.0297, 0.0114, 0.0578, 0.0490], + device='cuda:2'), in_proj_covar=tensor([0.0089, 0.0110, 0.0098, 0.0125, 0.0091, 0.0082, 0.0151, 0.0105], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 06:47:38,710 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.6446, 4.5689, 3.4417, 2.0253, 4.2604, 1.7561, 4.1876, 2.5782], + device='cuda:2'), covar=tensor([0.1330, 0.0126, 0.0602, 0.2072, 0.0217, 0.1771, 0.0233, 0.1388], + device='cuda:2'), in_proj_covar=tensor([0.0121, 0.0105, 0.0117, 0.0112, 0.0104, 0.0120, 0.0103, 0.0110], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0005, 0.0005, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 06:47:58,912 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.00 vs. limit=2.0 +2022-11-16 06:48:12,091 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=90030.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:48:15,895 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=90036.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:48:17,698 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.688e+01 1.446e+02 1.718e+02 2.130e+02 5.119e+02, threshold=3.437e+02, percent-clipped=5.0 +2022-11-16 06:48:19,177 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.1631, 2.3698, 2.5388, 3.2501, 3.1677, 2.5620, 2.1433, 3.3531], + device='cuda:2'), covar=tensor([0.1338, 0.2066, 0.2122, 0.2064, 0.1519, 0.2862, 0.2164, 0.0954], + device='cuda:2'), in_proj_covar=tensor([0.0256, 0.0198, 0.0189, 0.0300, 0.0228, 0.0203, 0.0189, 0.0249], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0006], + device='cuda:2') +2022-11-16 06:48:35,917 INFO [train.py:876] (2/4) Epoch 13, batch 2800, loss[loss=0.1159, simple_loss=0.1443, pruned_loss=0.04376, over 5597.00 frames. ], tot_loss[loss=0.105, simple_loss=0.1353, pruned_loss=0.03734, over 1083732.70 frames. ], batch size: 22, lr: 6.23e-03, grad_scale: 16.0 +2022-11-16 06:48:41,155 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.5494, 3.3629, 3.5896, 1.6111, 3.1728, 3.6998, 3.6828, 4.1272], + device='cuda:2'), covar=tensor([0.1853, 0.1374, 0.0957, 0.3232, 0.0724, 0.0862, 0.0416, 0.0566], + device='cuda:2'), in_proj_covar=tensor([0.0165, 0.0181, 0.0171, 0.0183, 0.0182, 0.0203, 0.0168, 0.0182], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 06:48:44,258 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=90078.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:48:45,113 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.7599, 2.4960, 2.8751, 3.7691, 3.6782, 2.8872, 2.4705, 3.7239], + device='cuda:2'), covar=tensor([0.1397, 0.2592, 0.2190, 0.2300, 0.1372, 0.2876, 0.2435, 0.0963], + device='cuda:2'), in_proj_covar=tensor([0.0256, 0.0198, 0.0188, 0.0300, 0.0226, 0.0202, 0.0188, 0.0249], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0006], + device='cuda:2') +2022-11-16 06:48:45,626 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.3711, 3.9578, 4.2431, 3.9757, 4.4513, 4.1854, 4.0234, 4.4055], + device='cuda:2'), covar=tensor([0.0335, 0.0438, 0.0425, 0.0338, 0.0354, 0.0331, 0.0378, 0.0356], + device='cuda:2'), in_proj_covar=tensor([0.0148, 0.0156, 0.0112, 0.0146, 0.0185, 0.0112, 0.0130, 0.0158], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-16 06:48:46,314 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=90081.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:48:55,335 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=90095.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 06:49:15,468 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.0460, 1.5014, 1.2324, 1.3713, 1.2238, 1.7865, 1.5197, 1.2988], + device='cuda:2'), covar=tensor([0.3081, 0.1142, 0.3421, 0.2866, 0.2517, 0.0615, 0.1724, 0.2911], + device='cuda:2'), in_proj_covar=tensor([0.0109, 0.0100, 0.0099, 0.0103, 0.0075, 0.0070, 0.0080, 0.0092], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-16 06:49:18,683 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=90129.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:49:19,467 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=90130.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:49:25,056 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.858e+01 1.324e+02 1.624e+02 2.114e+02 4.134e+02, threshold=3.247e+02, percent-clipped=3.0 +2022-11-16 06:49:27,799 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=90143.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 06:49:34,364 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.2307, 2.8378, 2.7516, 1.4308, 2.7218, 3.0622, 3.0555, 3.3206], + device='cuda:2'), covar=tensor([0.1929, 0.1861, 0.1280, 0.3562, 0.0917, 0.1271, 0.0745, 0.0880], + device='cuda:2'), in_proj_covar=tensor([0.0167, 0.0182, 0.0173, 0.0184, 0.0184, 0.0204, 0.0170, 0.0183], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 06:49:43,063 INFO [train.py:876] (2/4) Epoch 13, batch 2900, loss[loss=0.1018, simple_loss=0.1305, pruned_loss=0.03654, over 5701.00 frames. ], tot_loss[loss=0.1028, simple_loss=0.134, pruned_loss=0.03579, over 1089043.65 frames. ], batch size: 19, lr: 6.23e-03, grad_scale: 16.0 +2022-11-16 06:49:43,163 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=90165.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:50:00,526 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=90191.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:50:16,487 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=90213.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:50:21,687 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=90221.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 06:50:28,903 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=90232.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:50:33,302 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.557e+01 1.381e+02 1.773e+02 2.128e+02 3.504e+02, threshold=3.546e+02, percent-clipped=3.0 +2022-11-16 06:50:33,465 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.8724, 5.4659, 3.7205, 5.0162, 4.1439, 3.7129, 3.4434, 4.7615], + device='cuda:2'), covar=tensor([0.1595, 0.0226, 0.1126, 0.0577, 0.0547, 0.0987, 0.1530, 0.0247], + device='cuda:2'), in_proj_covar=tensor([0.0155, 0.0143, 0.0156, 0.0150, 0.0172, 0.0169, 0.0159, 0.0159], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 06:50:50,742 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.8069, 4.7297, 3.5756, 2.2800, 4.3565, 2.1837, 4.3012, 2.6483], + device='cuda:2'), covar=tensor([0.1259, 0.0094, 0.0507, 0.1738, 0.0169, 0.1447, 0.0195, 0.1283], + device='cuda:2'), in_proj_covar=tensor([0.0120, 0.0104, 0.0115, 0.0111, 0.0104, 0.0118, 0.0101, 0.0109], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 06:50:51,297 INFO [train.py:876] (2/4) Epoch 13, batch 3000, loss[loss=0.09623, simple_loss=0.1329, pruned_loss=0.0298, over 5767.00 frames. ], tot_loss[loss=0.104, simple_loss=0.1346, pruned_loss=0.03672, over 1087016.92 frames. ], batch size: 16, lr: 6.22e-03, grad_scale: 16.0 +2022-11-16 06:50:51,297 INFO [train.py:899] (2/4) Computing validation loss +2022-11-16 06:51:06,004 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.5620, 1.4899, 1.3771, 1.0860, 1.4454, 1.7026, 0.8074, 1.2082], + device='cuda:2'), covar=tensor([0.0349, 0.0373, 0.0386, 0.0737, 0.0435, 0.0350, 0.0771, 0.0532], + device='cuda:2'), in_proj_covar=tensor([0.0016, 0.0026, 0.0018, 0.0022, 0.0019, 0.0017, 0.0024, 0.0017], + device='cuda:2'), out_proj_covar=tensor([9.1589e-05, 1.2868e-04, 9.7384e-05, 1.1243e-04, 1.0041e-04, 9.2803e-05, + 1.2213e-04, 9.3014e-05], device='cuda:2') +2022-11-16 06:51:07,540 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.9325, 2.8132, 2.8438, 2.6510, 2.9696, 2.9457, 2.9569, 2.9962], + device='cuda:2'), covar=tensor([0.0380, 0.0452, 0.0437, 0.0538, 0.0471, 0.0249, 0.0371, 0.0443], + device='cuda:2'), in_proj_covar=tensor([0.0148, 0.0155, 0.0111, 0.0145, 0.0183, 0.0112, 0.0129, 0.0158], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-16 06:51:08,998 INFO [train.py:908] (2/4) Epoch 13, validation: loss=0.1737, simple_loss=0.1855, pruned_loss=0.08091, over 1530663.00 frames. +2022-11-16 06:51:08,999 INFO [train.py:909] (2/4) Maximum memory allocated so far is 4715MB +2022-11-16 06:51:27,467 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=90293.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:51:29,352 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.3982, 4.2141, 3.2266, 1.9697, 3.8542, 1.5670, 3.8475, 2.1663], + device='cuda:2'), covar=tensor([0.1369, 0.0111, 0.0707, 0.1777, 0.0231, 0.1898, 0.0242, 0.1516], + device='cuda:2'), in_proj_covar=tensor([0.0121, 0.0104, 0.0116, 0.0112, 0.0104, 0.0119, 0.0102, 0.0110], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0005, 0.0005, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 06:51:49,352 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.24 vs. limit=5.0 +2022-11-16 06:51:56,870 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=90336.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:51:58,707 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.650e+01 1.477e+02 1.762e+02 2.223e+02 4.727e+02, threshold=3.524e+02, percent-clipped=4.0 +2022-11-16 06:52:06,697 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=90351.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:52:14,645 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.5605, 1.3111, 1.4957, 1.4095, 1.7432, 1.4291, 1.1620, 1.5501], + device='cuda:2'), covar=tensor([0.1246, 0.1244, 0.1505, 0.1180, 0.0962, 0.1213, 0.2245, 0.2056], + device='cuda:2'), in_proj_covar=tensor([0.0256, 0.0198, 0.0189, 0.0299, 0.0225, 0.0202, 0.0188, 0.0249], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0006], + device='cuda:2') +2022-11-16 06:52:16,338 INFO [train.py:876] (2/4) Epoch 13, batch 3100, loss[loss=0.07581, simple_loss=0.1032, pruned_loss=0.02423, over 5563.00 frames. ], tot_loss[loss=0.1037, simple_loss=0.1346, pruned_loss=0.03634, over 1085217.24 frames. ], batch size: 10, lr: 6.22e-03, grad_scale: 16.0 +2022-11-16 06:52:29,316 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=90384.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:52:47,883 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=90412.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:53:06,644 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.938e+01 1.382e+02 1.732e+02 2.119e+02 3.320e+02, threshold=3.464e+02, percent-clipped=0.0 +2022-11-16 06:53:06,847 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0402, 1.7996, 1.8209, 1.8979, 1.7079, 2.0876, 1.9058, 1.5233], + device='cuda:2'), covar=tensor([0.0043, 0.0041, 0.0049, 0.0053, 0.0095, 0.0053, 0.0041, 0.0052], + device='cuda:2'), in_proj_covar=tensor([0.0030, 0.0027, 0.0028, 0.0036, 0.0032, 0.0028, 0.0035, 0.0034], + device='cuda:2'), out_proj_covar=tensor([2.8032e-05, 2.5212e-05, 2.5254e-05, 3.4407e-05, 2.9352e-05, 2.7157e-05, + 3.3317e-05, 3.2520e-05], device='cuda:2') +2022-11-16 06:53:19,993 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=90459.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:53:23,670 INFO [train.py:876] (2/4) Epoch 13, batch 3200, loss[loss=0.1018, simple_loss=0.1348, pruned_loss=0.03441, over 5536.00 frames. ], tot_loss[loss=0.1044, simple_loss=0.1356, pruned_loss=0.03664, over 1087693.92 frames. ], batch size: 21, lr: 6.22e-03, grad_scale: 16.0 +2022-11-16 06:53:32,586 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.89 vs. limit=2.0 +2022-11-16 06:53:38,088 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=90486.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:53:44,962 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=90496.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:54:00,518 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=90520.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:54:01,095 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=90521.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 06:54:10,579 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.9299, 1.7263, 2.0049, 1.3974, 1.5505, 1.7713, 1.6297, 1.4257], + device='cuda:2'), covar=tensor([0.0043, 0.0060, 0.0025, 0.0065, 0.0113, 0.0074, 0.0040, 0.0073], + device='cuda:2'), in_proj_covar=tensor([0.0030, 0.0027, 0.0028, 0.0036, 0.0032, 0.0028, 0.0035, 0.0034], + device='cuda:2'), out_proj_covar=tensor([2.8056e-05, 2.4966e-05, 2.5095e-05, 3.4382e-05, 2.9259e-05, 2.7228e-05, + 3.3311e-05, 3.2458e-05], device='cuda:2') +2022-11-16 06:54:11,893 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.2883, 2.2781, 2.5254, 3.5011, 3.3611, 2.6330, 2.3414, 3.4930], + device='cuda:2'), covar=tensor([0.1499, 0.2342, 0.2091, 0.2116, 0.1229, 0.3141, 0.2032, 0.0890], + device='cuda:2'), in_proj_covar=tensor([0.0261, 0.0200, 0.0192, 0.0304, 0.0228, 0.0205, 0.0192, 0.0252], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0006], + device='cuda:2') +2022-11-16 06:54:13,667 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.039e+02 1.437e+02 1.897e+02 2.279e+02 5.045e+02, threshold=3.794e+02, percent-clipped=5.0 +2022-11-16 06:54:25,909 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=90557.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:54:30,907 INFO [train.py:876] (2/4) Epoch 13, batch 3300, loss[loss=0.09041, simple_loss=0.1223, pruned_loss=0.02929, over 5748.00 frames. ], tot_loss[loss=0.1047, simple_loss=0.1356, pruned_loss=0.03684, over 1085363.62 frames. ], batch size: 27, lr: 6.21e-03, grad_scale: 16.0 +2022-11-16 06:54:31,877 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.75 vs. limit=5.0 +2022-11-16 06:54:33,620 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=90569.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:54:46,742 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=90588.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:55:10,397 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.9008, 2.2306, 2.1894, 1.5827, 2.3885, 2.2778, 2.3175, 2.5297], + device='cuda:2'), covar=tensor([0.1611, 0.1380, 0.1513, 0.2401, 0.0804, 0.1298, 0.0798, 0.0947], + device='cuda:2'), in_proj_covar=tensor([0.0166, 0.0182, 0.0170, 0.0184, 0.0182, 0.0202, 0.0170, 0.0181], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 06:55:21,064 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.442e+01 1.383e+02 1.673e+02 2.134e+02 3.431e+02, threshold=3.345e+02, percent-clipped=0.0 +2022-11-16 06:55:21,744 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.59 vs. limit=2.0 +2022-11-16 06:55:38,734 INFO [train.py:876] (2/4) Epoch 13, batch 3400, loss[loss=0.111, simple_loss=0.1497, pruned_loss=0.03612, over 5759.00 frames. ], tot_loss[loss=0.1055, simple_loss=0.1359, pruned_loss=0.03754, over 1080573.88 frames. ], batch size: 21, lr: 6.21e-03, grad_scale: 16.0 +2022-11-16 06:56:07,881 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=90707.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:56:29,389 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.201e+01 1.464e+02 1.802e+02 2.100e+02 5.077e+02, threshold=3.604e+02, percent-clipped=5.0 +2022-11-16 06:56:47,178 INFO [train.py:876] (2/4) Epoch 13, batch 3500, loss[loss=0.1188, simple_loss=0.1393, pruned_loss=0.04911, over 5749.00 frames. ], tot_loss[loss=0.105, simple_loss=0.1356, pruned_loss=0.03722, over 1083985.35 frames. ], batch size: 20, lr: 6.21e-03, grad_scale: 16.0 +2022-11-16 06:57:00,840 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=90786.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:57:21,074 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=90815.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:57:32,851 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.9159, 4.7992, 3.2811, 4.4620, 3.6839, 3.2818, 2.7265, 4.0970], + device='cuda:2'), covar=tensor([0.1197, 0.0202, 0.1039, 0.0427, 0.0597, 0.0866, 0.1784, 0.0285], + device='cuda:2'), in_proj_covar=tensor([0.0155, 0.0143, 0.0156, 0.0151, 0.0172, 0.0168, 0.0160, 0.0161], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 06:57:32,885 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1420, 1.7493, 1.9667, 1.7816, 1.7208, 1.7637, 2.0180, 1.5947], + device='cuda:2'), covar=tensor([0.0038, 0.0072, 0.0056, 0.0054, 0.0077, 0.0104, 0.0045, 0.0057], + device='cuda:2'), in_proj_covar=tensor([0.0031, 0.0027, 0.0028, 0.0036, 0.0032, 0.0029, 0.0035, 0.0034], + device='cuda:2'), out_proj_covar=tensor([2.8165e-05, 2.5027e-05, 2.5252e-05, 3.4257e-05, 2.9542e-05, 2.7520e-05, + 3.3627e-05, 3.2853e-05], device='cuda:2') +2022-11-16 06:57:33,473 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=90834.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:57:36,726 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.181e+01 1.483e+02 1.758e+02 2.114e+02 3.884e+02, threshold=3.515e+02, percent-clipped=1.0 +2022-11-16 06:57:43,838 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=90849.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:57:46,111 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=90852.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:57:53,109 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.18 vs. limit=2.0 +2022-11-16 06:57:54,735 INFO [train.py:876] (2/4) Epoch 13, batch 3600, loss[loss=0.1318, simple_loss=0.1559, pruned_loss=0.05381, over 5583.00 frames. ], tot_loss[loss=0.1053, simple_loss=0.1355, pruned_loss=0.0376, over 1081870.25 frames. ], batch size: 46, lr: 6.20e-03, grad_scale: 16.0 +2022-11-16 06:58:01,448 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.6456, 3.2704, 4.2866, 3.8213, 4.7163, 3.2649, 4.3039, 4.9055], + device='cuda:2'), covar=tensor([0.0509, 0.1478, 0.0741, 0.1220, 0.0226, 0.1407, 0.1133, 0.0533], + device='cuda:2'), in_proj_covar=tensor([0.0246, 0.0195, 0.0219, 0.0213, 0.0244, 0.0198, 0.0228, 0.0233], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 06:58:02,974 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.79 vs. limit=2.0 +2022-11-16 06:58:10,073 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=90888.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:58:25,598 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=90910.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:58:27,964 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.2813, 2.6792, 3.0766, 3.8312, 4.0939, 3.1013, 2.8187, 3.9735], + device='cuda:2'), covar=tensor([0.0516, 0.3627, 0.2203, 0.3858, 0.0895, 0.3216, 0.2503, 0.0819], + device='cuda:2'), in_proj_covar=tensor([0.0262, 0.0199, 0.0192, 0.0306, 0.0228, 0.0206, 0.0193, 0.0253], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0006], + device='cuda:2') +2022-11-16 06:58:33,058 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.8809, 3.0837, 3.4666, 4.4666, 4.4287, 3.4728, 3.3695, 4.3327], + device='cuda:2'), covar=tensor([0.0338, 0.2031, 0.2152, 0.1822, 0.0873, 0.2448, 0.1655, 0.0618], + device='cuda:2'), in_proj_covar=tensor([0.0261, 0.0199, 0.0192, 0.0306, 0.0228, 0.0205, 0.0192, 0.0253], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0006], + device='cuda:2') +2022-11-16 06:58:42,751 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=90936.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:58:44,665 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.811e+01 1.461e+02 1.838e+02 2.236e+02 5.014e+02, threshold=3.676e+02, percent-clipped=2.0 +2022-11-16 06:59:02,755 INFO [train.py:876] (2/4) Epoch 13, batch 3700, loss[loss=0.2101, simple_loss=0.1956, pruned_loss=0.1123, over 2975.00 frames. ], tot_loss[loss=0.105, simple_loss=0.1352, pruned_loss=0.03736, over 1075880.53 frames. ], batch size: 284, lr: 6.20e-03, grad_scale: 16.0 +2022-11-16 06:59:08,896 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1048, 1.8746, 2.1034, 1.6292, 1.6649, 1.7465, 1.6104, 1.4536], + device='cuda:2'), covar=tensor([0.0030, 0.0053, 0.0026, 0.0077, 0.0063, 0.0095, 0.0055, 0.0060], + device='cuda:2'), in_proj_covar=tensor([0.0031, 0.0027, 0.0028, 0.0036, 0.0032, 0.0029, 0.0035, 0.0034], + device='cuda:2'), out_proj_covar=tensor([2.8289e-05, 2.5217e-05, 2.5072e-05, 3.4209e-05, 2.9822e-05, 2.7373e-05, + 3.3686e-05, 3.2784e-05], device='cuda:2') +2022-11-16 06:59:26,639 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.5536, 1.1877, 1.1935, 0.8765, 1.2910, 1.5047, 1.0055, 1.1658], + device='cuda:2'), covar=tensor([0.0262, 0.0358, 0.0357, 0.0604, 0.0502, 0.0242, 0.0597, 0.0353], + device='cuda:2'), in_proj_covar=tensor([0.0016, 0.0025, 0.0018, 0.0021, 0.0018, 0.0016, 0.0023, 0.0016], + device='cuda:2'), out_proj_covar=tensor([8.9049e-05, 1.2436e-04, 9.4598e-05, 1.0766e-04, 9.6301e-05, 8.9887e-05, + 1.1667e-04, 8.9692e-05], device='cuda:2') +2022-11-16 06:59:30,586 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=91007.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 06:59:36,424 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.5258, 4.4024, 3.3859, 1.9654, 4.1386, 1.8456, 4.1369, 2.3730], + device='cuda:2'), covar=tensor([0.1506, 0.0123, 0.0631, 0.2320, 0.0241, 0.1828, 0.0200, 0.1623], + device='cuda:2'), in_proj_covar=tensor([0.0121, 0.0104, 0.0116, 0.0112, 0.0104, 0.0120, 0.0101, 0.0110], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0005, 0.0005, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 06:59:45,017 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.93 vs. limit=2.0 +2022-11-16 06:59:52,312 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.488e+01 1.377e+02 1.670e+02 2.030e+02 3.964e+02, threshold=3.341e+02, percent-clipped=2.0 +2022-11-16 07:00:03,058 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=91055.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:00:09,532 INFO [train.py:876] (2/4) Epoch 13, batch 3800, loss[loss=0.08444, simple_loss=0.1203, pruned_loss=0.0243, over 5460.00 frames. ], tot_loss[loss=0.1045, simple_loss=0.1352, pruned_loss=0.03688, over 1080451.43 frames. ], batch size: 11, lr: 6.19e-03, grad_scale: 16.0 +2022-11-16 07:00:14,158 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.1506, 4.9294, 3.7209, 2.3789, 4.6822, 2.3291, 4.6748, 2.9288], + device='cuda:2'), covar=tensor([0.1196, 0.0142, 0.0584, 0.1996, 0.0161, 0.1591, 0.0147, 0.1421], + device='cuda:2'), in_proj_covar=tensor([0.0120, 0.0103, 0.0115, 0.0111, 0.0103, 0.0119, 0.0101, 0.0110], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 07:00:43,430 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=91115.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:00:59,671 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.615e+01 1.416e+02 1.762e+02 2.192e+02 4.990e+02, threshold=3.525e+02, percent-clipped=3.0 +2022-11-16 07:01:08,649 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=91152.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:01:15,187 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1850, 3.5676, 2.8203, 1.7245, 3.2755, 1.5014, 3.3648, 1.8729], + device='cuda:2'), covar=tensor([0.1946, 0.0329, 0.1073, 0.2663, 0.0442, 0.2580, 0.0407, 0.2219], + device='cuda:2'), in_proj_covar=tensor([0.0120, 0.0104, 0.0115, 0.0111, 0.0104, 0.0118, 0.0100, 0.0110], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 07:01:15,771 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=91163.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:01:17,059 INFO [train.py:876] (2/4) Epoch 13, batch 3900, loss[loss=0.1943, simple_loss=0.1824, pruned_loss=0.1032, over 3092.00 frames. ], tot_loss[loss=0.1049, simple_loss=0.1358, pruned_loss=0.03697, over 1085080.09 frames. ], batch size: 284, lr: 6.19e-03, grad_scale: 16.0 +2022-11-16 07:01:41,634 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=91200.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:01:44,971 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=91205.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:02:07,454 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.326e+01 1.380e+02 1.738e+02 2.230e+02 3.262e+02, threshold=3.475e+02, percent-clipped=0.0 +2022-11-16 07:02:12,504 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.91 vs. limit=5.0 +2022-11-16 07:02:25,431 INFO [train.py:876] (2/4) Epoch 13, batch 4000, loss[loss=0.08993, simple_loss=0.1203, pruned_loss=0.02978, over 4974.00 frames. ], tot_loss[loss=0.1038, simple_loss=0.1352, pruned_loss=0.03622, over 1084216.17 frames. ], batch size: 109, lr: 6.19e-03, grad_scale: 16.0 +2022-11-16 07:02:27,574 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.5293, 2.5213, 2.2345, 2.5003, 2.1629, 2.0024, 2.5120, 2.8717], + device='cuda:2'), covar=tensor([0.1425, 0.1327, 0.1804, 0.1381, 0.1533, 0.1458, 0.1199, 0.1620], + device='cuda:2'), in_proj_covar=tensor([0.0113, 0.0108, 0.0105, 0.0106, 0.0093, 0.0103, 0.0097, 0.0082], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-16 07:02:30,795 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.4562, 2.4689, 2.3319, 2.4739, 2.2281, 1.8788, 2.4903, 2.7248], + device='cuda:2'), covar=tensor([0.1437, 0.1375, 0.2054, 0.0990, 0.1439, 0.1784, 0.1402, 0.1759], + device='cuda:2'), in_proj_covar=tensor([0.0113, 0.0108, 0.0105, 0.0106, 0.0093, 0.0104, 0.0097, 0.0082], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-16 07:02:38,647 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=91285.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:03:15,062 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.467e+01 1.389e+02 1.729e+02 2.051e+02 4.497e+02, threshold=3.458e+02, percent-clipped=2.0 +2022-11-16 07:03:19,977 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=91346.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:03:33,557 INFO [train.py:876] (2/4) Epoch 13, batch 4100, loss[loss=0.09097, simple_loss=0.1287, pruned_loss=0.02662, over 5708.00 frames. ], tot_loss[loss=0.1036, simple_loss=0.135, pruned_loss=0.03611, over 1086382.79 frames. ], batch size: 19, lr: 6.18e-03, grad_scale: 16.0 +2022-11-16 07:03:42,614 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.2862, 2.1614, 1.5561, 2.0602, 2.0915, 2.0792, 2.0738, 2.2036], + device='cuda:2'), covar=tensor([0.0496, 0.1199, 0.2801, 0.1253, 0.1302, 0.0844, 0.1473, 0.0925], + device='cuda:2'), in_proj_covar=tensor([0.0130, 0.0180, 0.0274, 0.0174, 0.0222, 0.0174, 0.0189, 0.0177], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0004, 0.0002, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 07:03:58,742 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.17 vs. limit=2.0 +2022-11-16 07:04:23,286 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.983e+01 1.379e+02 1.736e+02 2.115e+02 4.817e+02, threshold=3.473e+02, percent-clipped=3.0 +2022-11-16 07:04:26,176 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.41 vs. limit=2.0 +2022-11-16 07:04:40,410 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.57 vs. limit=5.0 +2022-11-16 07:04:40,816 INFO [train.py:876] (2/4) Epoch 13, batch 4200, loss[loss=0.0688, simple_loss=0.1006, pruned_loss=0.01849, over 4631.00 frames. ], tot_loss[loss=0.1026, simple_loss=0.1339, pruned_loss=0.0356, over 1086181.23 frames. ], batch size: 5, lr: 6.18e-03, grad_scale: 16.0 +2022-11-16 07:04:42,679 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.70 vs. limit=5.0 +2022-11-16 07:04:47,361 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.9288, 3.1704, 3.6716, 4.7566, 4.6438, 3.6506, 3.4414, 4.5717], + device='cuda:2'), covar=tensor([0.0286, 0.2161, 0.1921, 0.1873, 0.0767, 0.2713, 0.1767, 0.0480], + device='cuda:2'), in_proj_covar=tensor([0.0259, 0.0197, 0.0188, 0.0302, 0.0225, 0.0204, 0.0190, 0.0252], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0006], + device='cuda:2') +2022-11-16 07:04:56,204 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.6500, 2.2712, 2.6866, 3.7017, 3.6384, 2.7317, 2.5212, 3.6353], + device='cuda:2'), covar=tensor([0.0951, 0.3439, 0.2262, 0.1791, 0.1037, 0.2727, 0.1888, 0.0732], + device='cuda:2'), in_proj_covar=tensor([0.0259, 0.0197, 0.0188, 0.0302, 0.0226, 0.0204, 0.0190, 0.0252], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0006], + device='cuda:2') +2022-11-16 07:05:07,941 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=91505.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:05:08,031 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.4503, 3.2828, 3.3369, 3.0299, 1.9517, 3.3501, 2.1770, 2.9830], + device='cuda:2'), covar=tensor([0.0493, 0.0198, 0.0214, 0.0441, 0.0642, 0.0228, 0.0574, 0.0194], + device='cuda:2'), in_proj_covar=tensor([0.0195, 0.0184, 0.0186, 0.0210, 0.0197, 0.0185, 0.0194, 0.0189], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 07:05:31,427 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.444e+01 1.459e+02 1.798e+02 2.244e+02 4.763e+02, threshold=3.595e+02, percent-clipped=2.0 +2022-11-16 07:05:33,631 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=91542.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 07:05:39,587 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.9277, 3.3605, 2.3188, 3.1681, 2.5324, 2.3582, 1.7631, 2.9291], + device='cuda:2'), covar=tensor([0.1533, 0.0353, 0.1310, 0.0511, 0.1217, 0.1177, 0.2310, 0.0559], + device='cuda:2'), in_proj_covar=tensor([0.0155, 0.0144, 0.0156, 0.0149, 0.0173, 0.0167, 0.0159, 0.0161], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 07:05:40,836 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=91553.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:05:45,336 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.8132, 2.6243, 2.7223, 2.5057, 2.8592, 2.7264, 2.7761, 2.8498], + device='cuda:2'), covar=tensor([0.0447, 0.0477, 0.0483, 0.0506, 0.0490, 0.0290, 0.0396, 0.0490], + device='cuda:2'), in_proj_covar=tensor([0.0149, 0.0156, 0.0112, 0.0146, 0.0184, 0.0112, 0.0129, 0.0156], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-16 07:05:48,494 INFO [train.py:876] (2/4) Epoch 13, batch 4300, loss[loss=0.07237, simple_loss=0.1103, pruned_loss=0.01724, over 5512.00 frames. ], tot_loss[loss=0.1029, simple_loss=0.1341, pruned_loss=0.03589, over 1082185.21 frames. ], batch size: 14, lr: 6.18e-03, grad_scale: 16.0 +2022-11-16 07:05:50,147 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.4945, 4.4272, 4.5878, 4.5786, 4.3678, 3.8737, 5.0227, 4.4912], + device='cuda:2'), covar=tensor([0.0411, 0.0791, 0.0353, 0.1042, 0.0370, 0.0381, 0.0571, 0.0573], + device='cuda:2'), in_proj_covar=tensor([0.0087, 0.0106, 0.0094, 0.0122, 0.0089, 0.0080, 0.0146, 0.0103], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 07:05:59,669 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.44 vs. limit=2.0 +2022-11-16 07:06:15,250 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=91603.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 07:06:15,918 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.6814, 2.1352, 2.0044, 1.3613, 2.0736, 2.3474, 2.1547, 2.3741], + device='cuda:2'), covar=tensor([0.1910, 0.1739, 0.2173, 0.3008, 0.1244, 0.1253, 0.1042, 0.1455], + device='cuda:2'), in_proj_covar=tensor([0.0163, 0.0180, 0.0166, 0.0182, 0.0179, 0.0198, 0.0167, 0.0179], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 07:06:39,672 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.763e+01 1.435e+02 1.703e+02 2.070e+02 3.900e+02, threshold=3.406e+02, percent-clipped=1.0 +2022-11-16 07:06:41,069 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=91641.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:06:56,956 INFO [train.py:876] (2/4) Epoch 13, batch 4400, loss[loss=0.09077, simple_loss=0.1298, pruned_loss=0.02587, over 5786.00 frames. ], tot_loss[loss=0.1019, simple_loss=0.1335, pruned_loss=0.03514, over 1089022.09 frames. ], batch size: 21, lr: 6.17e-03, grad_scale: 16.0 +2022-11-16 07:07:09,207 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.09 vs. limit=5.0 +2022-11-16 07:07:17,450 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.8655, 4.2878, 3.8712, 3.6995, 1.9847, 4.0171, 2.3335, 3.6755], + device='cuda:2'), covar=tensor([0.0408, 0.0149, 0.0221, 0.0356, 0.0725, 0.0205, 0.0595, 0.0151], + device='cuda:2'), in_proj_covar=tensor([0.0194, 0.0184, 0.0183, 0.0209, 0.0196, 0.0184, 0.0193, 0.0187], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 07:07:25,950 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.83 vs. limit=2.0 +2022-11-16 07:07:46,911 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.810e+01 1.451e+02 1.858e+02 2.310e+02 4.864e+02, threshold=3.715e+02, percent-clipped=3.0 +2022-11-16 07:08:04,755 INFO [train.py:876] (2/4) Epoch 13, batch 4500, loss[loss=0.07989, simple_loss=0.1145, pruned_loss=0.02263, over 5709.00 frames. ], tot_loss[loss=0.1032, simple_loss=0.1343, pruned_loss=0.0361, over 1088904.75 frames. ], batch size: 12, lr: 6.17e-03, grad_scale: 16.0 +2022-11-16 07:08:55,633 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.055e+01 1.330e+02 1.643e+02 2.153e+02 4.136e+02, threshold=3.287e+02, percent-clipped=1.0 +2022-11-16 07:09:00,441 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.50 vs. limit=2.0 +2022-11-16 07:09:13,846 INFO [train.py:876] (2/4) Epoch 13, batch 4600, loss[loss=0.09663, simple_loss=0.1422, pruned_loss=0.02555, over 5597.00 frames. ], tot_loss[loss=0.105, simple_loss=0.1361, pruned_loss=0.03701, over 1085936.36 frames. ], batch size: 24, lr: 6.17e-03, grad_scale: 16.0 +2022-11-16 07:09:31,316 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=91891.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:09:35,817 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=91898.0, num_to_drop=1, layers_to_drop={3} +2022-11-16 07:10:03,762 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.956e+01 1.427e+02 1.793e+02 2.306e+02 3.919e+02, threshold=3.587e+02, percent-clipped=3.0 +2022-11-16 07:10:05,220 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=91941.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:10:12,691 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=91952.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:10:14,596 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=91955.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:10:21,954 INFO [train.py:876] (2/4) Epoch 13, batch 4700, loss[loss=0.1177, simple_loss=0.1599, pruned_loss=0.0378, over 5778.00 frames. ], tot_loss[loss=0.1033, simple_loss=0.1344, pruned_loss=0.03608, over 1083116.17 frames. ], batch size: 27, lr: 6.16e-03, grad_scale: 32.0 +2022-11-16 07:10:32,708 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=91981.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 07:10:38,257 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=91989.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:10:41,713 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.4433, 3.2169, 3.2282, 2.9429, 1.8586, 3.1967, 2.1319, 2.9532], + device='cuda:2'), covar=tensor([0.0426, 0.0241, 0.0186, 0.0361, 0.0600, 0.0201, 0.0528, 0.0185], + device='cuda:2'), in_proj_covar=tensor([0.0196, 0.0184, 0.0184, 0.0209, 0.0197, 0.0185, 0.0193, 0.0188], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 07:10:57,222 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=92016.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:11:13,169 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.635e+01 1.436e+02 1.731e+02 2.246e+02 5.128e+02, threshold=3.463e+02, percent-clipped=1.0 +2022-11-16 07:11:14,682 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=92042.0, num_to_drop=1, layers_to_drop={2} +2022-11-16 07:11:29,850 INFO [train.py:876] (2/4) Epoch 13, batch 4800, loss[loss=0.1185, simple_loss=0.1344, pruned_loss=0.05136, over 5312.00 frames. ], tot_loss[loss=0.1019, simple_loss=0.1332, pruned_loss=0.03527, over 1084672.56 frames. ], batch size: 79, lr: 6.16e-03, grad_scale: 16.0 +2022-11-16 07:12:21,126 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.028e+02 1.456e+02 1.773e+02 2.176e+02 4.110e+02, threshold=3.546e+02, percent-clipped=4.0 +2022-11-16 07:12:36,042 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.98 vs. limit=2.0 +2022-11-16 07:12:37,621 INFO [train.py:876] (2/4) Epoch 13, batch 4900, loss[loss=0.07211, simple_loss=0.1084, pruned_loss=0.0179, over 5628.00 frames. ], tot_loss[loss=0.09989, simple_loss=0.1319, pruned_loss=0.03396, over 1090211.30 frames. ], batch size: 23, lr: 6.16e-03, grad_scale: 8.0 +2022-11-16 07:12:49,126 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.3460, 2.3417, 2.3642, 2.3963, 2.0809, 1.6554, 2.3739, 2.6227], + device='cuda:2'), covar=tensor([0.1308, 0.1697, 0.1597, 0.1274, 0.1465, 0.2681, 0.1268, 0.1138], + device='cuda:2'), in_proj_covar=tensor([0.0115, 0.0109, 0.0107, 0.0108, 0.0094, 0.0105, 0.0099, 0.0085], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-16 07:13:00,130 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=92198.0, num_to_drop=1, layers_to_drop={2} +2022-11-16 07:13:29,751 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.574e+01 1.436e+02 1.818e+02 2.548e+02 4.452e+02, threshold=3.637e+02, percent-clipped=5.0 +2022-11-16 07:13:33,144 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=92246.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 07:13:33,774 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=92247.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:13:45,200 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.7774, 3.4269, 3.6249, 3.3635, 3.8511, 3.7116, 3.6112, 3.7974], + device='cuda:2'), covar=tensor([0.0415, 0.0445, 0.0478, 0.0398, 0.0416, 0.0259, 0.0366, 0.0432], + device='cuda:2'), in_proj_covar=tensor([0.0148, 0.0156, 0.0113, 0.0145, 0.0184, 0.0112, 0.0129, 0.0157], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-16 07:13:45,762 INFO [train.py:876] (2/4) Epoch 13, batch 5000, loss[loss=0.1081, simple_loss=0.1487, pruned_loss=0.03374, over 5284.00 frames. ], tot_loss[loss=0.1, simple_loss=0.132, pruned_loss=0.03405, over 1089622.57 frames. ], batch size: 79, lr: 6.15e-03, grad_scale: 8.0 +2022-11-16 07:14:16,844 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=92311.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:14:34,567 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=92337.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 07:14:37,065 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.814e+01 1.442e+02 1.699e+02 2.118e+02 7.275e+02, threshold=3.398e+02, percent-clipped=6.0 +2022-11-16 07:14:50,131 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.8641, 4.9465, 3.6337, 2.0563, 4.5795, 2.0453, 4.6852, 2.6668], + device='cuda:2'), covar=tensor([0.1263, 0.0093, 0.0424, 0.2004, 0.0137, 0.1558, 0.0122, 0.1372], + device='cuda:2'), in_proj_covar=tensor([0.0121, 0.0104, 0.0115, 0.0112, 0.0102, 0.0119, 0.0101, 0.0109], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0005, 0.0005, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 07:14:54,062 INFO [train.py:876] (2/4) Epoch 13, batch 5100, loss[loss=0.07785, simple_loss=0.1145, pruned_loss=0.0206, over 5447.00 frames. ], tot_loss[loss=0.1013, simple_loss=0.1328, pruned_loss=0.03488, over 1092808.86 frames. ], batch size: 11, lr: 6.15e-03, grad_scale: 8.0 +2022-11-16 07:15:45,935 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.294e+01 1.423e+02 1.797e+02 2.276e+02 4.290e+02, threshold=3.595e+02, percent-clipped=2.0 +2022-11-16 07:16:02,830 INFO [train.py:876] (2/4) Epoch 13, batch 5200, loss[loss=0.08491, simple_loss=0.1271, pruned_loss=0.02135, over 5550.00 frames. ], tot_loss[loss=0.1014, simple_loss=0.1329, pruned_loss=0.03496, over 1091974.54 frames. ], batch size: 15, lr: 6.15e-03, grad_scale: 8.0 +2022-11-16 07:16:28,279 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.4373, 1.1354, 1.0951, 1.0302, 1.2959, 1.1948, 0.6720, 1.0142], + device='cuda:2'), covar=tensor([0.0213, 0.0345, 0.0405, 0.0449, 0.0298, 0.0395, 0.0699, 0.0412], + device='cuda:2'), in_proj_covar=tensor([0.0016, 0.0026, 0.0018, 0.0022, 0.0018, 0.0017, 0.0024, 0.0017], + device='cuda:2'), out_proj_covar=tensor([9.1381e-05, 1.2753e-04, 9.7613e-05, 1.1076e-04, 9.8101e-05, 9.2577e-05, + 1.2194e-04, 9.0820e-05], device='cuda:2') +2022-11-16 07:16:54,295 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.360e+01 1.391e+02 1.845e+02 2.323e+02 4.876e+02, threshold=3.690e+02, percent-clipped=4.0 +2022-11-16 07:16:58,340 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=92547.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:17:10,187 INFO [train.py:876] (2/4) Epoch 13, batch 5300, loss[loss=0.1004, simple_loss=0.1361, pruned_loss=0.03242, over 5589.00 frames. ], tot_loss[loss=0.1035, simple_loss=0.1344, pruned_loss=0.03635, over 1085192.54 frames. ], batch size: 43, lr: 6.14e-03, grad_scale: 8.0 +2022-11-16 07:17:30,907 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=92595.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:17:35,782 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.7964, 2.6078, 2.6588, 2.4938, 2.8604, 2.6960, 2.7409, 2.8095], + device='cuda:2'), covar=tensor([0.0465, 0.0539, 0.0590, 0.0545, 0.0468, 0.0317, 0.0467, 0.0609], + device='cuda:2'), in_proj_covar=tensor([0.0150, 0.0157, 0.0113, 0.0146, 0.0185, 0.0114, 0.0131, 0.0158], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-16 07:17:41,732 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=92611.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:17:51,954 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=92625.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:17:59,651 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=92637.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 07:18:01,602 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.4142, 4.2932, 3.2964, 1.9987, 3.9489, 1.7372, 4.0081, 2.4330], + device='cuda:2'), covar=tensor([0.1530, 0.0141, 0.0591, 0.2027, 0.0214, 0.1765, 0.0157, 0.1362], + device='cuda:2'), in_proj_covar=tensor([0.0120, 0.0104, 0.0114, 0.0111, 0.0102, 0.0118, 0.0100, 0.0108], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0005, 0.0005, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 07:18:02,113 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.870e+01 1.449e+02 1.761e+02 2.157e+02 4.829e+02, threshold=3.522e+02, percent-clipped=3.0 +2022-11-16 07:18:14,403 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=92659.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:18:16,520 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.9851, 1.8841, 1.9877, 1.7642, 1.6168, 2.0788, 1.8126, 1.6975], + device='cuda:2'), covar=tensor([0.0060, 0.0059, 0.0141, 0.0055, 0.0114, 0.0060, 0.0055, 0.0052], + device='cuda:2'), in_proj_covar=tensor([0.0031, 0.0028, 0.0029, 0.0037, 0.0032, 0.0029, 0.0036, 0.0035], + device='cuda:2'), out_proj_covar=tensor([2.8243e-05, 2.5940e-05, 2.5819e-05, 3.5822e-05, 2.9460e-05, 2.7608e-05, + 3.4436e-05, 3.3395e-05], device='cuda:2') +2022-11-16 07:18:18,359 INFO [train.py:876] (2/4) Epoch 13, batch 5400, loss[loss=0.07775, simple_loss=0.1226, pruned_loss=0.01643, over 5761.00 frames. ], tot_loss[loss=0.1026, simple_loss=0.1341, pruned_loss=0.03552, over 1087524.29 frames. ], batch size: 20, lr: 6.14e-03, grad_scale: 8.0 +2022-11-16 07:18:19,238 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.7433, 3.9286, 3.7480, 3.4573, 1.8621, 3.9767, 2.2020, 3.3023], + device='cuda:2'), covar=tensor([0.0428, 0.0193, 0.0175, 0.0409, 0.0772, 0.0177, 0.0604, 0.0189], + device='cuda:2'), in_proj_covar=tensor([0.0198, 0.0186, 0.0185, 0.0211, 0.0198, 0.0188, 0.0198, 0.0190], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 07:18:32,618 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=92685.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 07:18:33,346 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=92686.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:18:55,986 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=92720.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:19:03,920 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.9582, 3.6387, 3.8154, 3.6084, 4.0588, 3.7624, 3.6913, 4.0280], + device='cuda:2'), covar=tensor([0.0476, 0.0452, 0.0522, 0.0385, 0.0434, 0.0347, 0.0414, 0.0414], + device='cuda:2'), in_proj_covar=tensor([0.0151, 0.0158, 0.0113, 0.0147, 0.0186, 0.0115, 0.0132, 0.0159], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-16 07:19:10,733 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.348e+01 1.471e+02 1.783e+02 2.230e+02 3.628e+02, threshold=3.566e+02, percent-clipped=2.0 +2022-11-16 07:19:26,815 INFO [train.py:876] (2/4) Epoch 13, batch 5500, loss[loss=0.1357, simple_loss=0.1331, pruned_loss=0.06918, over 4212.00 frames. ], tot_loss[loss=0.1005, simple_loss=0.1323, pruned_loss=0.03435, over 1087565.52 frames. ], batch size: 181, lr: 6.14e-03, grad_scale: 8.0 +2022-11-16 07:19:38,277 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=92781.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:20:12,850 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=92831.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:20:19,051 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.2546, 2.0478, 2.7095, 1.7814, 1.3309, 2.9362, 2.4577, 2.1189], + device='cuda:2'), covar=tensor([0.1101, 0.1587, 0.0718, 0.2714, 0.3303, 0.1125, 0.1197, 0.1499], + device='cuda:2'), in_proj_covar=tensor([0.0110, 0.0100, 0.0099, 0.0102, 0.0075, 0.0071, 0.0080, 0.0092], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-16 07:20:19,502 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.251e+01 1.445e+02 1.860e+02 2.400e+02 4.307e+02, threshold=3.721e+02, percent-clipped=4.0 +2022-11-16 07:20:32,069 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.8197, 2.5235, 3.4271, 2.9586, 3.5117, 2.2896, 3.1772, 3.7464], + device='cuda:2'), covar=tensor([0.0660, 0.1443, 0.0896, 0.1493, 0.0750, 0.1550, 0.1101, 0.0763], + device='cuda:2'), in_proj_covar=tensor([0.0243, 0.0191, 0.0216, 0.0208, 0.0241, 0.0195, 0.0224, 0.0230], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 07:20:35,696 INFO [train.py:876] (2/4) Epoch 13, batch 5600, loss[loss=0.1014, simple_loss=0.1367, pruned_loss=0.03308, over 5597.00 frames. ], tot_loss[loss=0.1021, simple_loss=0.1333, pruned_loss=0.03542, over 1083645.38 frames. ], batch size: 40, lr: 6.13e-03, grad_scale: 8.0 +2022-11-16 07:20:54,528 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=92892.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:20:59,086 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=92899.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:21:20,276 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=92931.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:21:27,403 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.557e+01 1.397e+02 1.621e+02 2.063e+02 4.647e+02, threshold=3.241e+02, percent-clipped=3.0 +2022-11-16 07:21:40,279 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=92960.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:21:43,363 INFO [train.py:876] (2/4) Epoch 13, batch 5700, loss[loss=0.131, simple_loss=0.159, pruned_loss=0.05152, over 5760.00 frames. ], tot_loss[loss=0.1028, simple_loss=0.1341, pruned_loss=0.03581, over 1087351.91 frames. ], batch size: 20, lr: 6.13e-03, grad_scale: 8.0 +2022-11-16 07:21:50,436 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=92975.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:21:54,336 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=92981.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:22:02,307 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=92992.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:22:14,761 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=93010.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:22:32,122 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=93036.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:22:32,145 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=93036.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:22:32,806 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0151, 1.7929, 2.0415, 1.6570, 1.6680, 2.1742, 2.1195, 1.6956], + device='cuda:2'), covar=tensor([0.0066, 0.0085, 0.0042, 0.0071, 0.0120, 0.0048, 0.0061, 0.0058], + device='cuda:2'), in_proj_covar=tensor([0.0031, 0.0028, 0.0029, 0.0038, 0.0033, 0.0030, 0.0037, 0.0036], + device='cuda:2'), out_proj_covar=tensor([2.8784e-05, 2.6731e-05, 2.6291e-05, 3.6673e-05, 3.0343e-05, 2.8645e-05, + 3.5337e-05, 3.3945e-05], device='cuda:2') +2022-11-16 07:22:36,331 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.192e+01 1.368e+02 1.655e+02 2.036e+02 3.681e+02, threshold=3.311e+02, percent-clipped=3.0 +2022-11-16 07:22:52,169 INFO [train.py:876] (2/4) Epoch 13, batch 5800, loss[loss=0.08954, simple_loss=0.1337, pruned_loss=0.02267, over 5772.00 frames. ], tot_loss[loss=0.1015, simple_loss=0.133, pruned_loss=0.03503, over 1077736.64 frames. ], batch size: 16, lr: 6.13e-03, grad_scale: 8.0 +2022-11-16 07:22:56,539 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=93071.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:22:59,773 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=93076.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:23:05,901 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.00 vs. limit=5.0 +2022-11-16 07:23:13,884 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=93097.0, num_to_drop=1, layers_to_drop={3} +2022-11-16 07:23:17,142 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.4892, 1.3450, 1.2412, 1.0990, 1.3552, 1.5236, 0.7851, 1.1341], + device='cuda:2'), covar=tensor([0.0375, 0.0309, 0.0333, 0.0616, 0.0339, 0.0397, 0.0770, 0.0310], + device='cuda:2'), in_proj_covar=tensor([0.0016, 0.0025, 0.0018, 0.0021, 0.0017, 0.0016, 0.0023, 0.0016], + device='cuda:2'), out_proj_covar=tensor([8.9286e-05, 1.2402e-04, 9.4946e-05, 1.0746e-04, 9.5175e-05, 8.9819e-05, + 1.1771e-04, 8.8987e-05], device='cuda:2') +2022-11-16 07:23:26,243 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=93115.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:23:43,304 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.091e+01 1.350e+02 1.786e+02 2.182e+02 6.733e+02, threshold=3.572e+02, percent-clipped=3.0 +2022-11-16 07:24:00,086 INFO [train.py:876] (2/4) Epoch 13, batch 5900, loss[loss=0.1221, simple_loss=0.1645, pruned_loss=0.03985, over 5560.00 frames. ], tot_loss[loss=0.1015, simple_loss=0.1332, pruned_loss=0.03494, over 1085654.69 frames. ], batch size: 40, lr: 6.12e-03, grad_scale: 8.0 +2022-11-16 07:24:04,901 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.26 vs. limit=2.0 +2022-11-16 07:24:07,383 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=93176.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:24:14,471 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=93187.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:24:28,879 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.8062, 1.7164, 1.7616, 1.5676, 1.5394, 1.8514, 1.7352, 1.3422], + device='cuda:2'), covar=tensor([0.0047, 0.0074, 0.0064, 0.0069, 0.0075, 0.0049, 0.0053, 0.0071], + device='cuda:2'), in_proj_covar=tensor([0.0032, 0.0029, 0.0030, 0.0038, 0.0033, 0.0030, 0.0037, 0.0036], + device='cuda:2'), out_proj_covar=tensor([2.8995e-05, 2.7081e-05, 2.6685e-05, 3.6576e-05, 3.0667e-05, 2.9047e-05, + 3.5760e-05, 3.4336e-05], device='cuda:2') +2022-11-16 07:24:29,534 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.7047, 1.7594, 1.9292, 1.9996, 1.7196, 1.5010, 1.8513, 1.9373], + device='cuda:2'), covar=tensor([0.2676, 0.2901, 0.2039, 0.1731, 0.2101, 0.3354, 0.1883, 0.1018], + device='cuda:2'), in_proj_covar=tensor([0.0116, 0.0111, 0.0109, 0.0109, 0.0095, 0.0106, 0.0100, 0.0086], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-16 07:24:51,117 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.010e+02 1.444e+02 1.638e+02 1.961e+02 3.238e+02, threshold=3.277e+02, percent-clipped=0.0 +2022-11-16 07:24:54,018 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.84 vs. limit=2.0 +2022-11-16 07:24:59,465 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.8608, 5.2364, 3.3689, 4.8891, 3.9723, 3.5596, 2.8906, 4.4306], + device='cuda:2'), covar=tensor([0.1285, 0.0140, 0.0865, 0.0277, 0.0535, 0.0716, 0.1466, 0.0294], + device='cuda:2'), in_proj_covar=tensor([0.0152, 0.0141, 0.0154, 0.0146, 0.0173, 0.0165, 0.0156, 0.0157], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 07:25:00,832 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=93255.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:25:07,946 INFO [train.py:876] (2/4) Epoch 13, batch 6000, loss[loss=0.2248, simple_loss=0.209, pruned_loss=0.1203, over 3078.00 frames. ], tot_loss[loss=0.1024, simple_loss=0.1336, pruned_loss=0.0356, over 1081308.76 frames. ], batch size: 284, lr: 6.12e-03, grad_scale: 8.0 +2022-11-16 07:25:07,947 INFO [train.py:899] (2/4) Computing validation loss +2022-11-16 07:25:27,979 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.6359, 4.3476, 3.0916, 4.0089, 3.4354, 3.2470, 2.6129, 3.6255], + device='cuda:2'), covar=tensor([0.1397, 0.0151, 0.1035, 0.0282, 0.0787, 0.0735, 0.1746, 0.0485], + device='cuda:2'), in_proj_covar=tensor([0.0152, 0.0141, 0.0153, 0.0146, 0.0172, 0.0164, 0.0155, 0.0157], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 07:25:35,119 INFO [train.py:908] (2/4) Epoch 13, validation: loss=0.1768, simple_loss=0.1872, pruned_loss=0.08323, over 1530663.00 frames. +2022-11-16 07:25:35,120 INFO [train.py:909] (2/4) Maximum memory allocated so far is 4715MB +2022-11-16 07:25:45,600 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=93281.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:25:45,612 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.8860, 1.8596, 2.3166, 1.8095, 1.2695, 2.7827, 2.1409, 1.9218], + device='cuda:2'), covar=tensor([0.1268, 0.1569, 0.0947, 0.2127, 0.3162, 0.0421, 0.1601, 0.1731], + device='cuda:2'), in_proj_covar=tensor([0.0110, 0.0102, 0.0102, 0.0104, 0.0076, 0.0072, 0.0083, 0.0094], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-16 07:25:46,093 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.49 vs. limit=2.0 +2022-11-16 07:25:49,439 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=93287.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:26:09,117 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.28 vs. limit=2.0 +2022-11-16 07:26:18,408 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=93329.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:26:19,777 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=93331.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:26:26,166 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.791e+01 1.381e+02 1.698e+02 2.176e+02 5.974e+02, threshold=3.396e+02, percent-clipped=7.0 +2022-11-16 07:26:36,658 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.5543, 3.1876, 4.1349, 2.5946, 2.4210, 4.1790, 3.2826, 2.9193], + device='cuda:2'), covar=tensor([0.0675, 0.0776, 0.0292, 0.2016, 0.1690, 0.0258, 0.0435, 0.0760], + device='cuda:2'), in_proj_covar=tensor([0.0110, 0.0101, 0.0102, 0.0103, 0.0075, 0.0071, 0.0082, 0.0094], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-16 07:26:42,556 INFO [train.py:876] (2/4) Epoch 13, batch 6100, loss[loss=0.06229, simple_loss=0.1036, pruned_loss=0.01047, over 5477.00 frames. ], tot_loss[loss=0.1007, simple_loss=0.1325, pruned_loss=0.03446, over 1083153.43 frames. ], batch size: 10, lr: 6.12e-03, grad_scale: 8.0 +2022-11-16 07:26:43,310 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=93366.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:26:50,785 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=93376.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:26:59,282 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.96 vs. limit=2.0 +2022-11-16 07:27:01,547 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=93392.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 07:27:23,294 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=93424.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:27:35,455 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.736e+01 1.359e+02 1.712e+02 2.042e+02 3.975e+02, threshold=3.424e+02, percent-clipped=1.0 +2022-11-16 07:27:45,584 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.0984, 4.5204, 4.0705, 4.5365, 4.5099, 3.9007, 4.1535, 4.0070], + device='cuda:2'), covar=tensor([0.0458, 0.0498, 0.1395, 0.0492, 0.0476, 0.0496, 0.0751, 0.0600], + device='cuda:2'), in_proj_covar=tensor([0.0131, 0.0178, 0.0275, 0.0178, 0.0223, 0.0174, 0.0190, 0.0179], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0004, 0.0003, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 07:27:49,487 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.88 vs. limit=2.0 +2022-11-16 07:27:51,740 INFO [train.py:876] (2/4) Epoch 13, batch 6200, loss[loss=0.07642, simple_loss=0.1077, pruned_loss=0.02256, over 4129.00 frames. ], tot_loss[loss=0.1002, simple_loss=0.1319, pruned_loss=0.03427, over 1085248.13 frames. ], batch size: 4, lr: 6.12e-03, grad_scale: 8.0 +2022-11-16 07:27:54,987 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.6524, 3.3045, 3.4899, 3.2479, 3.7148, 3.5173, 3.3970, 3.6347], + device='cuda:2'), covar=tensor([0.0344, 0.0459, 0.0465, 0.0408, 0.0379, 0.0258, 0.0371, 0.0447], + device='cuda:2'), in_proj_covar=tensor([0.0146, 0.0154, 0.0111, 0.0143, 0.0183, 0.0113, 0.0128, 0.0155], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-16 07:27:55,650 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=93471.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:28:06,767 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=93487.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:28:38,380 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=93535.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:28:42,539 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.266e+01 1.465e+02 1.713e+02 2.165e+02 4.081e+02, threshold=3.427e+02, percent-clipped=1.0 +2022-11-16 07:28:52,489 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=93555.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:28:58,922 INFO [train.py:876] (2/4) Epoch 13, batch 6300, loss[loss=0.09118, simple_loss=0.1262, pruned_loss=0.02806, over 5766.00 frames. ], tot_loss[loss=0.1021, simple_loss=0.1332, pruned_loss=0.03548, over 1086772.37 frames. ], batch size: 20, lr: 6.11e-03, grad_scale: 8.0 +2022-11-16 07:29:13,682 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=93587.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:29:18,845 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.9201, 1.4558, 0.9966, 1.0505, 1.2209, 1.1877, 0.6722, 1.1638], + device='cuda:2'), covar=tensor([0.0072, 0.0050, 0.0078, 0.0071, 0.0060, 0.0062, 0.0105, 0.0072], + device='cuda:2'), in_proj_covar=tensor([0.0067, 0.0060, 0.0060, 0.0066, 0.0063, 0.0059, 0.0056, 0.0055], + device='cuda:2'), out_proj_covar=tensor([5.9555e-05, 5.3420e-05, 5.2195e-05, 5.8269e-05, 5.5381e-05, 5.1373e-05, + 4.9512e-05, 4.8075e-05], device='cuda:2') +2022-11-16 07:29:20,716 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.3488, 4.1941, 4.3412, 4.4925, 4.1423, 3.8070, 4.7938, 4.2635], + device='cuda:2'), covar=tensor([0.0353, 0.0714, 0.0333, 0.1027, 0.0414, 0.0325, 0.0558, 0.0583], + device='cuda:2'), in_proj_covar=tensor([0.0088, 0.0110, 0.0096, 0.0124, 0.0090, 0.0081, 0.0148, 0.0105], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 07:29:24,621 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=93603.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:29:43,334 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=93631.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:29:46,175 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=93635.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:29:49,999 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.661e+01 1.430e+02 1.750e+02 2.354e+02 5.950e+02, threshold=3.500e+02, percent-clipped=2.0 +2022-11-16 07:30:07,007 INFO [train.py:876] (2/4) Epoch 13, batch 6400, loss[loss=0.09363, simple_loss=0.1342, pruned_loss=0.02655, over 5525.00 frames. ], tot_loss[loss=0.1029, simple_loss=0.134, pruned_loss=0.03591, over 1083884.78 frames. ], batch size: 40, lr: 6.11e-03, grad_scale: 8.0 +2022-11-16 07:30:07,758 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=93666.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:30:16,122 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=93679.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:30:24,959 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=93692.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:30:32,312 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.64 vs. limit=5.0 +2022-11-16 07:30:35,964 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.5307, 4.3832, 3.4352, 1.8589, 4.0927, 1.9056, 4.0942, 2.4942], + device='cuda:2'), covar=tensor([0.1747, 0.0259, 0.0769, 0.2434, 0.0316, 0.2051, 0.0355, 0.2079], + device='cuda:2'), in_proj_covar=tensor([0.0120, 0.0105, 0.0115, 0.0110, 0.0104, 0.0119, 0.0101, 0.0109], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 07:30:39,898 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=93714.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:30:57,385 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=93740.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:30:57,985 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.087e+01 1.406e+02 1.692e+02 2.048e+02 4.065e+02, threshold=3.385e+02, percent-clipped=2.0 +2022-11-16 07:30:58,107 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.2925, 3.8666, 3.5016, 3.8243, 3.8422, 3.3101, 3.4663, 3.4614], + device='cuda:2'), covar=tensor([0.1054, 0.0491, 0.1189, 0.0534, 0.0456, 0.0518, 0.0809, 0.0642], + device='cuda:2'), in_proj_covar=tensor([0.0132, 0.0179, 0.0274, 0.0177, 0.0224, 0.0174, 0.0191, 0.0178], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0004, 0.0003, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 07:30:59,497 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.9940, 1.3516, 1.9432, 1.1316, 1.9009, 1.7509, 1.2020, 1.7108], + device='cuda:2'), covar=tensor([0.0250, 0.0675, 0.0356, 0.0777, 0.1429, 0.0729, 0.0707, 0.0352], + device='cuda:2'), in_proj_covar=tensor([0.0016, 0.0026, 0.0018, 0.0022, 0.0018, 0.0017, 0.0025, 0.0017], + device='cuda:2'), out_proj_covar=tensor([9.2433e-05, 1.2855e-04, 9.8359e-05, 1.1130e-04, 9.9624e-05, 9.3218e-05, + 1.2293e-04, 9.3361e-05], device='cuda:2') +2022-11-16 07:31:13,829 INFO [train.py:876] (2/4) Epoch 13, batch 6500, loss[loss=0.06885, simple_loss=0.1085, pruned_loss=0.01461, over 5563.00 frames. ], tot_loss[loss=0.1028, simple_loss=0.1334, pruned_loss=0.03605, over 1081717.83 frames. ], batch size: 13, lr: 6.11e-03, grad_scale: 8.0 +2022-11-16 07:31:16,520 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.49 vs. limit=2.0 +2022-11-16 07:31:16,915 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1118, 3.9294, 2.5879, 3.6849, 3.0549, 2.5404, 2.1950, 3.2826], + device='cuda:2'), covar=tensor([0.1630, 0.0306, 0.1246, 0.0407, 0.1031, 0.1248, 0.2005, 0.0540], + device='cuda:2'), in_proj_covar=tensor([0.0154, 0.0142, 0.0155, 0.0147, 0.0174, 0.0167, 0.0157, 0.0159], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 07:31:17,576 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=93770.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:31:18,264 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=93771.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:31:38,823 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.5076, 2.3493, 2.2709, 2.3283, 2.0686, 1.6604, 2.2436, 2.6642], + device='cuda:2'), covar=tensor([0.1089, 0.1333, 0.1670, 0.1363, 0.1492, 0.2583, 0.1338, 0.0963], + device='cuda:2'), in_proj_covar=tensor([0.0113, 0.0108, 0.0106, 0.0107, 0.0093, 0.0104, 0.0098, 0.0084], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-16 07:31:50,712 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=93819.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:31:59,072 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=93831.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:32:05,379 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.939e+01 1.358e+02 1.733e+02 2.139e+02 3.711e+02, threshold=3.467e+02, percent-clipped=1.0 +2022-11-16 07:32:21,364 INFO [train.py:876] (2/4) Epoch 13, batch 6600, loss[loss=0.1033, simple_loss=0.132, pruned_loss=0.03732, over 5763.00 frames. ], tot_loss[loss=0.1039, simple_loss=0.1349, pruned_loss=0.03648, over 1083754.87 frames. ], batch size: 31, lr: 6.10e-03, grad_scale: 8.0 +2022-11-16 07:32:39,618 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0119, 2.3767, 2.2005, 1.6358, 1.7919, 1.8735, 1.8340, 2.5150], + device='cuda:2'), covar=tensor([0.0078, 0.0050, 0.0053, 0.0067, 0.0062, 0.0055, 0.0045, 0.0122], + device='cuda:2'), in_proj_covar=tensor([0.0067, 0.0061, 0.0060, 0.0066, 0.0063, 0.0059, 0.0056, 0.0055], + device='cuda:2'), out_proj_covar=tensor([5.9333e-05, 5.3718e-05, 5.2506e-05, 5.8326e-05, 5.5914e-05, 5.1524e-05, + 4.9803e-05, 4.8375e-05], device='cuda:2') +2022-11-16 07:32:42,176 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.6234, 4.5716, 4.5734, 4.5986, 4.3432, 3.9960, 5.1552, 4.6293], + device='cuda:2'), covar=tensor([0.0330, 0.0722, 0.0365, 0.1028, 0.0537, 0.0329, 0.0484, 0.0438], + device='cuda:2'), in_proj_covar=tensor([0.0088, 0.0109, 0.0096, 0.0123, 0.0089, 0.0081, 0.0146, 0.0105], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 07:33:13,073 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.008e+02 1.396e+02 1.723e+02 2.017e+02 3.846e+02, threshold=3.447e+02, percent-clipped=2.0 +2022-11-16 07:33:29,297 INFO [train.py:876] (2/4) Epoch 13, batch 6700, loss[loss=0.1036, simple_loss=0.1401, pruned_loss=0.03351, over 5739.00 frames. ], tot_loss[loss=0.1016, simple_loss=0.1331, pruned_loss=0.03507, over 1084629.38 frames. ], batch size: 31, lr: 6.10e-03, grad_scale: 8.0 +2022-11-16 07:33:35,054 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.5332, 2.4187, 2.8602, 1.8821, 1.7403, 3.2780, 2.8170, 2.3483], + device='cuda:2'), covar=tensor([0.1074, 0.1366, 0.0701, 0.2460, 0.3517, 0.0594, 0.0707, 0.1566], + device='cuda:2'), in_proj_covar=tensor([0.0109, 0.0101, 0.0100, 0.0103, 0.0074, 0.0071, 0.0082, 0.0092], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-16 07:33:40,310 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=93981.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:33:42,226 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.8616, 4.8981, 3.7005, 2.3364, 4.5790, 2.0583, 4.4877, 2.8296], + device='cuda:2'), covar=tensor([0.1322, 0.0122, 0.0496, 0.1960, 0.0166, 0.1728, 0.0159, 0.1427], + device='cuda:2'), in_proj_covar=tensor([0.0118, 0.0103, 0.0114, 0.0109, 0.0102, 0.0117, 0.0099, 0.0108], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 07:34:23,829 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.687e+01 1.406e+02 1.734e+02 2.317e+02 5.958e+02, threshold=3.468e+02, percent-clipped=4.0 +2022-11-16 07:34:25,120 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=94042.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:34:28,010 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.42 vs. limit=5.0 +2022-11-16 07:34:40,854 INFO [train.py:876] (2/4) Epoch 13, batch 6800, loss[loss=0.07754, simple_loss=0.09956, pruned_loss=0.02776, over 4648.00 frames. ], tot_loss[loss=0.1015, simple_loss=0.1324, pruned_loss=0.03525, over 1076052.02 frames. ], batch size: 5, lr: 6.10e-03, grad_scale: 8.0 +2022-11-16 07:35:17,456 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.24 vs. limit=5.0 +2022-11-16 07:35:22,178 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=94126.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:35:32,243 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.571e+01 1.399e+02 1.659e+02 1.999e+02 4.068e+02, threshold=3.319e+02, percent-clipped=3.0 +2022-11-16 07:35:33,767 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.8382, 2.5320, 3.0906, 1.9025, 1.8177, 3.3951, 2.7856, 2.4403], + device='cuda:2'), covar=tensor([0.0877, 0.1328, 0.0595, 0.2584, 0.2540, 0.0944, 0.0866, 0.1097], + device='cuda:2'), in_proj_covar=tensor([0.0111, 0.0103, 0.0102, 0.0105, 0.0076, 0.0072, 0.0084, 0.0094], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-16 07:35:48,802 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.80 vs. limit=2.0 +2022-11-16 07:35:48,962 INFO [train.py:876] (2/4) Epoch 13, batch 6900, loss[loss=0.1385, simple_loss=0.1624, pruned_loss=0.05727, over 5548.00 frames. ], tot_loss[loss=0.1012, simple_loss=0.1328, pruned_loss=0.03477, over 1079447.23 frames. ], batch size: 46, lr: 6.09e-03, grad_scale: 16.0 +2022-11-16 07:35:52,291 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.1818, 4.6945, 4.9146, 4.7100, 5.1951, 5.0276, 4.5295, 5.1510], + device='cuda:2'), covar=tensor([0.0266, 0.0330, 0.0396, 0.0327, 0.0300, 0.0220, 0.0312, 0.0229], + device='cuda:2'), in_proj_covar=tensor([0.0144, 0.0153, 0.0111, 0.0144, 0.0185, 0.0111, 0.0128, 0.0154], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-16 07:36:08,594 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.92 vs. limit=2.0 +2022-11-16 07:36:23,561 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0842, 1.8277, 2.0438, 1.8160, 1.9902, 2.1055, 1.9353, 1.7909], + device='cuda:2'), covar=tensor([0.0080, 0.0083, 0.0046, 0.0061, 0.0055, 0.0088, 0.0045, 0.0051], + device='cuda:2'), in_proj_covar=tensor([0.0031, 0.0029, 0.0029, 0.0038, 0.0033, 0.0029, 0.0037, 0.0035], + device='cuda:2'), out_proj_covar=tensor([2.8879e-05, 2.6911e-05, 2.6370e-05, 3.6309e-05, 3.0625e-05, 2.8084e-05, + 3.5184e-05, 3.3174e-05], device='cuda:2') +2022-11-16 07:36:28,750 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=94223.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 07:36:40,444 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.408e+01 1.493e+02 1.868e+02 2.271e+02 3.997e+02, threshold=3.737e+02, percent-clipped=4.0 +2022-11-16 07:36:56,959 INFO [train.py:876] (2/4) Epoch 13, batch 7000, loss[loss=0.1189, simple_loss=0.1497, pruned_loss=0.04407, over 5690.00 frames. ], tot_loss[loss=0.1027, simple_loss=0.1339, pruned_loss=0.03575, over 1081230.49 frames. ], batch size: 36, lr: 6.09e-03, grad_scale: 16.0 +2022-11-16 07:37:02,681 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.0481, 2.5382, 3.2776, 3.8803, 3.8592, 3.0209, 2.9702, 3.9546], + device='cuda:2'), covar=tensor([0.0749, 0.2896, 0.1899, 0.2380, 0.1149, 0.2767, 0.1942, 0.0681], + device='cuda:2'), in_proj_covar=tensor([0.0255, 0.0191, 0.0182, 0.0293, 0.0223, 0.0199, 0.0185, 0.0246], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0006], + device='cuda:2') +2022-11-16 07:37:09,859 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=94284.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 07:37:45,800 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=94337.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:37:48,222 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.793e+01 1.390e+02 1.725e+02 2.069e+02 3.894e+02, threshold=3.450e+02, percent-clipped=1.0 +2022-11-16 07:37:52,967 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=94348.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:38:04,153 INFO [train.py:876] (2/4) Epoch 13, batch 7100, loss[loss=0.07976, simple_loss=0.1175, pruned_loss=0.02099, over 5651.00 frames. ], tot_loss[loss=0.1016, simple_loss=0.1328, pruned_loss=0.03524, over 1078655.75 frames. ], batch size: 32, lr: 6.09e-03, grad_scale: 16.0 +2022-11-16 07:38:10,556 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.29 vs. limit=5.0 +2022-11-16 07:38:11,929 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.21 vs. limit=2.0 +2022-11-16 07:38:22,710 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.5931, 4.3333, 2.9778, 4.1514, 3.4214, 3.0702, 2.4438, 3.7471], + device='cuda:2'), covar=tensor([0.1244, 0.0217, 0.0946, 0.0355, 0.0647, 0.0960, 0.1639, 0.0328], + device='cuda:2'), in_proj_covar=tensor([0.0155, 0.0142, 0.0155, 0.0148, 0.0173, 0.0167, 0.0158, 0.0157], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 07:38:27,459 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.09 vs. limit=2.0 +2022-11-16 07:38:29,571 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.6818, 2.1901, 2.4208, 3.0397, 3.0072, 2.4090, 2.1328, 2.9793], + device='cuda:2'), covar=tensor([0.2342, 0.2162, 0.1830, 0.1311, 0.1161, 0.2695, 0.1856, 0.1464], + device='cuda:2'), in_proj_covar=tensor([0.0259, 0.0193, 0.0185, 0.0298, 0.0225, 0.0200, 0.0188, 0.0249], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0006], + device='cuda:2') +2022-11-16 07:38:34,091 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=94409.0, num_to_drop=1, layers_to_drop={3} +2022-11-16 07:38:45,568 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=94426.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:38:46,654 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.68 vs. limit=2.0 +2022-11-16 07:38:56,307 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.683e+01 1.440e+02 1.714e+02 2.140e+02 3.966e+02, threshold=3.428e+02, percent-clipped=2.0 +2022-11-16 07:39:12,057 INFO [train.py:876] (2/4) Epoch 13, batch 7200, loss[loss=0.09052, simple_loss=0.1224, pruned_loss=0.02934, over 5561.00 frames. ], tot_loss[loss=0.101, simple_loss=0.1321, pruned_loss=0.03493, over 1076417.57 frames. ], batch size: 16, lr: 6.08e-03, grad_scale: 16.0 +2022-11-16 07:39:15,940 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.85 vs. limit=2.0 +2022-11-16 07:39:18,295 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=94474.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:39:54,697 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.8254, 4.8439, 3.1744, 4.6813, 3.8424, 3.4132, 2.9606, 4.2875], + device='cuda:2'), covar=tensor([0.1499, 0.0298, 0.1191, 0.0457, 0.0572, 0.0956, 0.1779, 0.0335], + device='cuda:2'), in_proj_covar=tensor([0.0153, 0.0142, 0.0155, 0.0147, 0.0173, 0.0166, 0.0157, 0.0157], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 07:40:43,969 INFO [train.py:876] (2/4) Epoch 14, batch 0, loss[loss=0.08793, simple_loss=0.1326, pruned_loss=0.02163, over 5479.00 frames. ], tot_loss[loss=0.08793, simple_loss=0.1326, pruned_loss=0.02163, over 5479.00 frames. ], batch size: 11, lr: 5.86e-03, grad_scale: 16.0 +2022-11-16 07:40:43,970 INFO [train.py:899] (2/4) Computing validation loss +2022-11-16 07:41:00,499 INFO [train.py:908] (2/4) Epoch 14, validation: loss=0.1755, simple_loss=0.1868, pruned_loss=0.08205, over 1530663.00 frames. +2022-11-16 07:41:00,500 INFO [train.py:909] (2/4) Maximum memory allocated so far is 4715MB +2022-11-16 07:41:03,039 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.829e+01 1.398e+02 1.682e+02 2.138e+02 4.621e+02, threshold=3.364e+02, percent-clipped=3.0 +2022-11-16 07:41:07,838 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.4494, 4.3557, 4.1023, 3.9252, 4.4639, 4.3213, 1.8157, 4.6855], + device='cuda:2'), covar=tensor([0.0219, 0.0283, 0.0325, 0.0367, 0.0280, 0.0273, 0.2843, 0.0232], + device='cuda:2'), in_proj_covar=tensor([0.0104, 0.0089, 0.0087, 0.0082, 0.0102, 0.0089, 0.0129, 0.0108], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 07:41:21,933 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.8550, 1.7780, 1.8190, 1.6327, 1.7519, 1.7778, 1.6993, 1.8262], + device='cuda:2'), covar=tensor([0.0070, 0.0076, 0.0053, 0.0061, 0.0057, 0.0045, 0.0057, 0.0060], + device='cuda:2'), in_proj_covar=tensor([0.0066, 0.0061, 0.0060, 0.0066, 0.0062, 0.0059, 0.0056, 0.0055], + device='cuda:2'), out_proj_covar=tensor([5.8783e-05, 5.3559e-05, 5.2854e-05, 5.8010e-05, 5.5047e-05, 5.1085e-05, + 5.0011e-05, 4.7984e-05], device='cuda:2') +2022-11-16 07:41:29,028 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=94579.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 07:42:08,085 INFO [train.py:876] (2/4) Epoch 14, batch 100, loss[loss=0.1061, simple_loss=0.1474, pruned_loss=0.03244, over 5596.00 frames. ], tot_loss[loss=0.09869, simple_loss=0.1316, pruned_loss=0.03289, over 433718.81 frames. ], batch size: 38, lr: 5.86e-03, grad_scale: 16.0 +2022-11-16 07:42:08,194 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=94637.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:42:10,679 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.173e+01 1.465e+02 1.762e+02 2.317e+02 5.551e+02, threshold=3.525e+02, percent-clipped=6.0 +2022-11-16 07:42:38,840 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.3945, 2.9989, 2.9586, 2.8526, 1.8102, 2.9777, 2.0962, 2.7039], + device='cuda:2'), covar=tensor([0.0348, 0.0183, 0.0164, 0.0275, 0.0516, 0.0200, 0.0418, 0.0171], + device='cuda:2'), in_proj_covar=tensor([0.0197, 0.0185, 0.0182, 0.0209, 0.0197, 0.0186, 0.0194, 0.0188], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 07:42:40,596 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=94685.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:42:52,949 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=94704.0, num_to_drop=1, layers_to_drop={3} +2022-11-16 07:42:56,043 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.61 vs. limit=2.0 +2022-11-16 07:43:06,854 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.4789, 1.7508, 1.4600, 1.2295, 1.4988, 1.9084, 1.8746, 1.8554], + device='cuda:2'), covar=tensor([0.1699, 0.1206, 0.2262, 0.2655, 0.1534, 0.1150, 0.1136, 0.1478], + device='cuda:2'), in_proj_covar=tensor([0.0164, 0.0178, 0.0166, 0.0180, 0.0181, 0.0201, 0.0168, 0.0181], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 07:43:08,168 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.8616, 2.9082, 2.7313, 3.0305, 2.3609, 2.5359, 2.8685, 3.4806], + device='cuda:2'), covar=tensor([0.1064, 0.1104, 0.1623, 0.0882, 0.1406, 0.0921, 0.1162, 0.1705], + device='cuda:2'), in_proj_covar=tensor([0.0115, 0.0110, 0.0108, 0.0109, 0.0094, 0.0105, 0.0099, 0.0085], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-16 07:43:08,227 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.0385, 2.5001, 3.6167, 3.1816, 3.9024, 2.3308, 3.2229, 3.9180], + device='cuda:2'), covar=tensor([0.0690, 0.1763, 0.0857, 0.1781, 0.0631, 0.1917, 0.1585, 0.0924], + device='cuda:2'), in_proj_covar=tensor([0.0243, 0.0190, 0.0212, 0.0209, 0.0238, 0.0192, 0.0223, 0.0227], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 07:43:15,766 INFO [train.py:876] (2/4) Epoch 14, batch 200, loss[loss=0.09479, simple_loss=0.1337, pruned_loss=0.02795, over 5665.00 frames. ], tot_loss[loss=0.09859, simple_loss=0.1315, pruned_loss=0.03286, over 693294.39 frames. ], batch size: 36, lr: 5.85e-03, grad_scale: 16.0 +2022-11-16 07:43:18,295 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.022e+02 1.376e+02 1.680e+02 2.123e+02 3.782e+02, threshold=3.359e+02, percent-clipped=1.0 +2022-11-16 07:44:08,203 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.2861, 4.7366, 4.2889, 4.6885, 4.6684, 3.9530, 4.3827, 4.1104], + device='cuda:2'), covar=tensor([0.0381, 0.0463, 0.1276, 0.0572, 0.0501, 0.0565, 0.0888, 0.1057], + device='cuda:2'), in_proj_covar=tensor([0.0133, 0.0179, 0.0275, 0.0177, 0.0222, 0.0174, 0.0191, 0.0180], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0004, 0.0002, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 07:44:22,890 INFO [train.py:876] (2/4) Epoch 14, batch 300, loss[loss=0.1255, simple_loss=0.155, pruned_loss=0.04796, over 5558.00 frames. ], tot_loss[loss=0.1015, simple_loss=0.1334, pruned_loss=0.03481, over 847917.82 frames. ], batch size: 46, lr: 5.85e-03, grad_scale: 16.0 +2022-11-16 07:44:25,438 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.663e+01 1.544e+02 1.893e+02 2.592e+02 6.103e+02, threshold=3.786e+02, percent-clipped=6.0 +2022-11-16 07:44:50,715 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=94879.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 07:45:22,538 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=94927.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 07:45:29,280 INFO [train.py:876] (2/4) Epoch 14, batch 400, loss[loss=0.1047, simple_loss=0.135, pruned_loss=0.03717, over 5740.00 frames. ], tot_loss[loss=0.1025, simple_loss=0.1344, pruned_loss=0.03534, over 946360.94 frames. ], batch size: 27, lr: 5.85e-03, grad_scale: 16.0 +2022-11-16 07:45:32,607 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.318e+01 1.380e+02 1.703e+02 1.935e+02 3.356e+02, threshold=3.406e+02, percent-clipped=0.0 +2022-11-16 07:45:55,888 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.0979, 0.8825, 0.9114, 0.9105, 1.1011, 1.0079, 0.6318, 0.8665], + device='cuda:2'), covar=tensor([0.0287, 0.0461, 0.0406, 0.0455, 0.0425, 0.0407, 0.0914, 0.0426], + device='cuda:2'), in_proj_covar=tensor([0.0017, 0.0026, 0.0018, 0.0022, 0.0018, 0.0017, 0.0025, 0.0017], + device='cuda:2'), out_proj_covar=tensor([9.3196e-05, 1.3110e-04, 9.9573e-05, 1.1318e-04, 1.0017e-04, 9.3971e-05, + 1.2432e-04, 9.4068e-05], device='cuda:2') +2022-11-16 07:46:19,819 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=95004.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:46:40,793 INFO [train.py:876] (2/4) Epoch 14, batch 500, loss[loss=0.112, simple_loss=0.1411, pruned_loss=0.04146, over 5565.00 frames. ], tot_loss[loss=0.1024, simple_loss=0.1336, pruned_loss=0.03556, over 994768.40 frames. ], batch size: 40, lr: 5.84e-03, grad_scale: 16.0 +2022-11-16 07:46:43,327 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.308e+01 1.428e+02 1.816e+02 2.349e+02 3.391e+02, threshold=3.632e+02, percent-clipped=0.0 +2022-11-16 07:46:51,602 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=95052.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:47:26,159 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=95105.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 07:47:48,216 INFO [train.py:876] (2/4) Epoch 14, batch 600, loss[loss=0.1179, simple_loss=0.138, pruned_loss=0.04893, over 4117.00 frames. ], tot_loss[loss=0.1006, simple_loss=0.1329, pruned_loss=0.03422, over 1033698.91 frames. ], batch size: 181, lr: 5.84e-03, grad_scale: 16.0 +2022-11-16 07:47:50,760 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 6.356e+01 1.449e+02 1.769e+02 2.281e+02 4.546e+02, threshold=3.538e+02, percent-clipped=1.0 +2022-11-16 07:48:06,186 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.5173, 1.5527, 1.7532, 1.8112, 1.5896, 1.3668, 1.5221, 1.5502], + device='cuda:2'), covar=tensor([0.2747, 0.2729, 0.2520, 0.2078, 0.2197, 0.3142, 0.2375, 0.1438], + device='cuda:2'), in_proj_covar=tensor([0.0114, 0.0108, 0.0107, 0.0109, 0.0094, 0.0104, 0.0099, 0.0085], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-16 07:48:07,458 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=95166.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 07:48:56,206 INFO [train.py:876] (2/4) Epoch 14, batch 700, loss[loss=0.1199, simple_loss=0.1474, pruned_loss=0.04616, over 5600.00 frames. ], tot_loss[loss=0.1018, simple_loss=0.134, pruned_loss=0.03482, over 1054372.66 frames. ], batch size: 22, lr: 5.84e-03, grad_scale: 16.0 +2022-11-16 07:48:58,830 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.646e+01 1.509e+02 1.874e+02 2.495e+02 6.608e+02, threshold=3.748e+02, percent-clipped=12.0 +2022-11-16 07:49:11,318 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=95260.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:49:16,513 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=95268.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:49:52,363 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=95321.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:49:58,228 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=95329.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:50:03,504 INFO [train.py:876] (2/4) Epoch 14, batch 800, loss[loss=0.1363, simple_loss=0.1567, pruned_loss=0.05796, over 5383.00 frames. ], tot_loss[loss=0.1019, simple_loss=0.134, pruned_loss=0.03494, over 1069135.85 frames. ], batch size: 70, lr: 5.83e-03, grad_scale: 16.0 +2022-11-16 07:50:04,894 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=95339.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:50:06,027 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.478e+01 1.483e+02 1.769e+02 2.212e+02 4.574e+02, threshold=3.537e+02, percent-clipped=3.0 +2022-11-16 07:50:46,447 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=95400.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:51:11,133 INFO [train.py:876] (2/4) Epoch 14, batch 900, loss[loss=0.07969, simple_loss=0.1139, pruned_loss=0.02273, over 5548.00 frames. ], tot_loss[loss=0.1003, simple_loss=0.1328, pruned_loss=0.03383, over 1081520.87 frames. ], batch size: 14, lr: 5.83e-03, grad_scale: 16.0 +2022-11-16 07:51:13,912 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.380e+01 1.449e+02 1.681e+02 2.078e+02 5.193e+02, threshold=3.361e+02, percent-clipped=2.0 +2022-11-16 07:51:26,943 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=95461.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 07:51:27,805 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.47 vs. limit=5.0 +2022-11-16 07:51:50,807 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.56 vs. limit=2.0 +2022-11-16 07:52:17,327 INFO [train.py:876] (2/4) Epoch 14, batch 1000, loss[loss=0.09622, simple_loss=0.1312, pruned_loss=0.03063, over 5780.00 frames. ], tot_loss[loss=0.1013, simple_loss=0.1332, pruned_loss=0.0347, over 1083803.35 frames. ], batch size: 26, lr: 5.83e-03, grad_scale: 16.0 +2022-11-16 07:52:19,863 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.883e+01 1.427e+02 1.771e+02 2.173e+02 4.557e+02, threshold=3.542e+02, percent-clipped=6.0 +2022-11-16 07:52:40,445 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.8204, 2.8996, 3.0115, 2.7048, 2.9974, 2.8110, 1.3035, 2.9960], + device='cuda:2'), covar=tensor([0.0315, 0.0327, 0.0316, 0.0325, 0.0312, 0.0440, 0.2845, 0.0341], + device='cuda:2'), in_proj_covar=tensor([0.0105, 0.0091, 0.0088, 0.0082, 0.0101, 0.0090, 0.0132, 0.0109], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 07:52:50,358 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.5274, 1.5610, 1.4894, 1.1955, 1.3461, 1.4239, 1.2057, 0.7637], + device='cuda:2'), covar=tensor([0.0037, 0.0038, 0.0035, 0.0064, 0.0062, 0.0061, 0.0053, 0.0079], + device='cuda:2'), in_proj_covar=tensor([0.0032, 0.0029, 0.0029, 0.0038, 0.0033, 0.0030, 0.0037, 0.0036], + device='cuda:2'), out_proj_covar=tensor([2.9055e-05, 2.7522e-05, 2.6464e-05, 3.6402e-05, 3.0931e-05, 2.8879e-05, + 3.5224e-05, 3.3898e-05], device='cuda:2') +2022-11-16 07:52:53,033 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.8753, 2.4120, 3.4230, 3.0985, 3.6333, 2.3118, 3.1496, 3.7910], + device='cuda:2'), covar=tensor([0.0597, 0.1579, 0.1069, 0.1526, 0.0639, 0.1717, 0.1276, 0.0813], + device='cuda:2'), in_proj_covar=tensor([0.0245, 0.0194, 0.0215, 0.0211, 0.0241, 0.0195, 0.0224, 0.0231], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 07:53:11,119 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=95616.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:53:11,846 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=95617.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:53:16,360 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=95624.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:53:21,502 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.3860, 1.4806, 1.4936, 1.4852, 1.5740, 1.6147, 1.4886, 1.6061], + device='cuda:2'), covar=tensor([0.0088, 0.0067, 0.0064, 0.0068, 0.0064, 0.0056, 0.0067, 0.0071], + device='cuda:2'), in_proj_covar=tensor([0.0065, 0.0060, 0.0059, 0.0064, 0.0062, 0.0058, 0.0056, 0.0055], + device='cuda:2'), out_proj_covar=tensor([5.7585e-05, 5.3048e-05, 5.1588e-05, 5.6933e-05, 5.4676e-05, 5.0150e-05, + 4.9922e-05, 4.7658e-05], device='cuda:2') +2022-11-16 07:53:24,674 INFO [train.py:876] (2/4) Epoch 14, batch 1100, loss[loss=0.07253, simple_loss=0.1075, pruned_loss=0.01877, over 5105.00 frames. ], tot_loss[loss=0.1007, simple_loss=0.133, pruned_loss=0.03417, over 1089611.14 frames. ], batch size: 7, lr: 5.83e-03, grad_scale: 16.0 +2022-11-16 07:53:27,209 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.869e+01 1.386e+02 1.674e+02 2.201e+02 3.601e+02, threshold=3.349e+02, percent-clipped=2.0 +2022-11-16 07:53:28,727 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.4446, 1.6968, 1.5548, 1.2403, 1.3743, 1.8662, 1.8538, 1.8803], + device='cuda:2'), covar=tensor([0.1580, 0.1249, 0.2130, 0.2767, 0.1632, 0.1325, 0.1117, 0.1422], + device='cuda:2'), in_proj_covar=tensor([0.0164, 0.0179, 0.0170, 0.0183, 0.0186, 0.0204, 0.0170, 0.0183], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 07:53:38,692 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0042, 1.5152, 1.9275, 1.6726, 1.6080, 1.6970, 1.6124, 1.4854], + device='cuda:2'), covar=tensor([0.0057, 0.0098, 0.0052, 0.0066, 0.0133, 0.0100, 0.0051, 0.0061], + device='cuda:2'), in_proj_covar=tensor([0.0032, 0.0029, 0.0029, 0.0038, 0.0033, 0.0030, 0.0037, 0.0036], + device='cuda:2'), out_proj_covar=tensor([2.8907e-05, 2.7363e-05, 2.6311e-05, 3.6209e-05, 3.0846e-05, 2.8828e-05, + 3.5135e-05, 3.3912e-05], device='cuda:2') +2022-11-16 07:53:53,106 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=95678.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:54:03,924 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=95695.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:54:27,034 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.5050, 1.5208, 1.4656, 1.3095, 1.3836, 1.3885, 1.1106, 0.7445], + device='cuda:2'), covar=tensor([0.0036, 0.0038, 0.0058, 0.0058, 0.0056, 0.0048, 0.0063, 0.0084], + device='cuda:2'), in_proj_covar=tensor([0.0031, 0.0029, 0.0029, 0.0038, 0.0033, 0.0030, 0.0036, 0.0035], + device='cuda:2'), out_proj_covar=tensor([2.8474e-05, 2.6985e-05, 2.6027e-05, 3.5988e-05, 3.0453e-05, 2.8484e-05, + 3.4678e-05, 3.3385e-05], device='cuda:2') +2022-11-16 07:54:32,053 INFO [train.py:876] (2/4) Epoch 14, batch 1200, loss[loss=0.05912, simple_loss=0.08817, pruned_loss=0.01504, over 5052.00 frames. ], tot_loss[loss=0.1012, simple_loss=0.1334, pruned_loss=0.0345, over 1087102.68 frames. ], batch size: 7, lr: 5.82e-03, grad_scale: 16.0 +2022-11-16 07:54:34,545 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.521e+01 1.381e+02 1.760e+02 2.068e+02 4.246e+02, threshold=3.521e+02, percent-clipped=4.0 +2022-11-16 07:54:47,842 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=95761.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 07:55:20,110 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=95809.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 07:55:22,110 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=95812.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 07:55:38,701 INFO [train.py:876] (2/4) Epoch 14, batch 1300, loss[loss=0.1052, simple_loss=0.1392, pruned_loss=0.0356, over 5601.00 frames. ], tot_loss[loss=0.09987, simple_loss=0.1323, pruned_loss=0.03373, over 1085267.19 frames. ], batch size: 24, lr: 5.82e-03, grad_scale: 16.0 +2022-11-16 07:55:41,905 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.562e+01 1.314e+02 1.675e+02 2.012e+02 3.727e+02, threshold=3.350e+02, percent-clipped=1.0 +2022-11-16 07:55:45,973 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=95847.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:56:02,938 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=95873.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 07:56:27,142 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=95908.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 07:56:32,268 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=95916.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:56:37,636 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=95924.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:56:46,584 INFO [train.py:876] (2/4) Epoch 14, batch 1400, loss[loss=0.09461, simple_loss=0.119, pruned_loss=0.03509, over 5444.00 frames. ], tot_loss[loss=0.09899, simple_loss=0.1312, pruned_loss=0.03337, over 1078702.99 frames. ], batch size: 11, lr: 5.82e-03, grad_scale: 16.0 +2022-11-16 07:56:49,506 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.691e+01 1.448e+02 1.696e+02 2.137e+02 4.589e+02, threshold=3.392e+02, percent-clipped=2.0 +2022-11-16 07:57:04,976 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=95964.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:57:10,153 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=95972.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:57:10,815 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=95973.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:57:15,499 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=95980.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:57:25,806 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=95995.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:57:53,117 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.91 vs. limit=2.0 +2022-11-16 07:57:54,061 INFO [train.py:876] (2/4) Epoch 14, batch 1500, loss[loss=0.09552, simple_loss=0.1217, pruned_loss=0.03465, over 5709.00 frames. ], tot_loss[loss=0.1012, simple_loss=0.1326, pruned_loss=0.03488, over 1078397.99 frames. ], batch size: 17, lr: 5.81e-03, grad_scale: 16.0 +2022-11-16 07:57:56,681 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.019e+02 1.465e+02 1.739e+02 2.113e+02 3.912e+02, threshold=3.478e+02, percent-clipped=1.0 +2022-11-16 07:57:56,884 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=96041.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:57:58,099 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=96043.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:58:20,656 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.6032, 1.3395, 1.4802, 1.1301, 1.5597, 1.7737, 0.8860, 1.3630], + device='cuda:2'), covar=tensor([0.0313, 0.0437, 0.0303, 0.0533, 0.0533, 0.0352, 0.0624, 0.0281], + device='cuda:2'), in_proj_covar=tensor([0.0017, 0.0026, 0.0019, 0.0022, 0.0018, 0.0017, 0.0025, 0.0017], + device='cuda:2'), out_proj_covar=tensor([9.3676e-05, 1.3089e-04, 1.0070e-04, 1.1291e-04, 1.0131e-04, 9.4933e-05, + 1.2504e-04, 9.4069e-05], device='cuda:2') +2022-11-16 07:58:24,064 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=96080.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:58:55,853 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1863, 3.6041, 2.8218, 1.7268, 3.4469, 1.4614, 3.3750, 1.8993], + device='cuda:2'), covar=tensor([0.1517, 0.0187, 0.0778, 0.1965, 0.0243, 0.1989, 0.0291, 0.1540], + device='cuda:2'), in_proj_covar=tensor([0.0116, 0.0103, 0.0113, 0.0109, 0.0101, 0.0117, 0.0098, 0.0106], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 07:59:02,198 INFO [train.py:876] (2/4) Epoch 14, batch 1600, loss[loss=0.1711, simple_loss=0.1823, pruned_loss=0.07994, over 5583.00 frames. ], tot_loss[loss=0.1022, simple_loss=0.1337, pruned_loss=0.03538, over 1080100.39 frames. ], batch size: 50, lr: 5.81e-03, grad_scale: 16.0 +2022-11-16 07:59:04,693 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.966e+01 1.432e+02 1.726e+02 2.177e+02 5.569e+02, threshold=3.453e+02, percent-clipped=4.0 +2022-11-16 07:59:04,901 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=96141.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:59:06,482 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.98 vs. limit=5.0 +2022-11-16 07:59:21,833 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=96165.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 07:59:23,751 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=96168.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 07:59:26,394 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.4914, 2.4495, 2.2408, 2.5140, 2.1399, 2.0132, 2.3915, 2.8183], + device='cuda:2'), covar=tensor([0.1116, 0.1591, 0.1873, 0.1427, 0.1447, 0.1384, 0.1336, 0.1011], + device='cuda:2'), in_proj_covar=tensor([0.0116, 0.0109, 0.0108, 0.0109, 0.0094, 0.0104, 0.0099, 0.0085], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-16 07:59:32,064 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.55 vs. limit=5.0 +2022-11-16 07:59:46,891 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=96203.0, num_to_drop=1, layers_to_drop={3} +2022-11-16 07:59:48,686 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.6400, 2.5264, 2.8345, 3.5873, 3.4682, 2.6278, 2.2614, 3.6394], + device='cuda:2'), covar=tensor([0.0923, 0.2493, 0.1656, 0.2296, 0.1100, 0.2575, 0.2059, 0.0673], + device='cuda:2'), in_proj_covar=tensor([0.0258, 0.0194, 0.0184, 0.0296, 0.0224, 0.0199, 0.0188, 0.0246], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0006], + device='cuda:2') +2022-11-16 07:59:52,752 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0693, 3.4927, 2.6927, 1.6850, 3.3312, 1.3494, 3.2506, 1.6807], + device='cuda:2'), covar=tensor([0.1682, 0.0211, 0.1025, 0.2028, 0.0295, 0.2174, 0.0362, 0.1693], + device='cuda:2'), in_proj_covar=tensor([0.0116, 0.0103, 0.0113, 0.0110, 0.0101, 0.0117, 0.0098, 0.0107], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 08:00:03,459 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=96226.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:00:07,528 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.2591, 2.6588, 3.2543, 1.7315, 2.9422, 3.4649, 3.2821, 3.6411], + device='cuda:2'), covar=tensor([0.1762, 0.1705, 0.0863, 0.2736, 0.0794, 0.0679, 0.0576, 0.0681], + device='cuda:2'), in_proj_covar=tensor([0.0163, 0.0179, 0.0169, 0.0182, 0.0185, 0.0202, 0.0171, 0.0183], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 08:00:10,807 INFO [train.py:876] (2/4) Epoch 14, batch 1700, loss[loss=0.08778, simple_loss=0.1246, pruned_loss=0.02549, over 5542.00 frames. ], tot_loss[loss=0.1016, simple_loss=0.1334, pruned_loss=0.03484, over 1088692.40 frames. ], batch size: 21, lr: 5.81e-03, grad_scale: 16.0 +2022-11-16 08:00:14,150 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.983e+01 1.431e+02 1.731e+02 2.193e+02 6.139e+02, threshold=3.462e+02, percent-clipped=4.0 +2022-11-16 08:00:36,052 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=96273.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:00:59,391 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.6169, 4.4438, 4.7275, 4.7362, 4.3634, 4.3545, 5.1379, 4.7635], + device='cuda:2'), covar=tensor([0.0411, 0.0937, 0.0388, 0.1017, 0.0521, 0.0320, 0.0578, 0.0509], + device='cuda:2'), in_proj_covar=tensor([0.0091, 0.0111, 0.0097, 0.0123, 0.0091, 0.0082, 0.0148, 0.0108], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 08:01:07,790 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=96321.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:01:10,176 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=96324.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:01:18,260 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=96336.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:01:18,846 INFO [train.py:876] (2/4) Epoch 14, batch 1800, loss[loss=0.0838, simple_loss=0.1157, pruned_loss=0.02597, over 5697.00 frames. ], tot_loss[loss=0.1013, simple_loss=0.133, pruned_loss=0.0348, over 1087321.85 frames. ], batch size: 12, lr: 5.80e-03, grad_scale: 16.0 +2022-11-16 08:01:22,028 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.395e+01 1.406e+02 1.732e+02 2.199e+02 6.902e+02, threshold=3.464e+02, percent-clipped=4.0 +2022-11-16 08:01:50,749 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=96385.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:01:54,347 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=96390.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:02:04,305 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.2113, 1.5732, 1.2954, 1.3559, 1.5752, 1.3687, 1.2933, 1.6178], + device='cuda:2'), covar=tensor([0.0106, 0.0051, 0.0073, 0.0084, 0.0068, 0.0063, 0.0082, 0.0061], + device='cuda:2'), in_proj_covar=tensor([0.0066, 0.0061, 0.0060, 0.0066, 0.0064, 0.0059, 0.0057, 0.0056], + device='cuda:2'), out_proj_covar=tensor([5.8827e-05, 5.3968e-05, 5.2493e-05, 5.8176e-05, 5.6159e-05, 5.1117e-05, + 5.0454e-05, 4.9088e-05], device='cuda:2') +2022-11-16 08:02:25,309 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=96436.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:02:25,905 INFO [train.py:876] (2/4) Epoch 14, batch 1900, loss[loss=0.06433, simple_loss=0.09956, pruned_loss=0.01455, over 5332.00 frames. ], tot_loss[loss=0.09966, simple_loss=0.1314, pruned_loss=0.03396, over 1091166.33 frames. ], batch size: 9, lr: 5.80e-03, grad_scale: 16.0 +2022-11-16 08:02:29,415 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.308e+01 1.441e+02 1.720e+02 2.105e+02 7.193e+02, threshold=3.439e+02, percent-clipped=2.0 +2022-11-16 08:02:35,456 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=96451.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:02:46,379 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=96468.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 08:02:56,872 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.23 vs. limit=5.0 +2022-11-16 08:03:10,714 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=96503.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:03:19,147 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=96516.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 08:03:22,352 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=96521.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:03:32,723 INFO [train.py:876] (2/4) Epoch 14, batch 2000, loss[loss=0.07346, simple_loss=0.1114, pruned_loss=0.01778, over 5670.00 frames. ], tot_loss[loss=0.09887, simple_loss=0.1308, pruned_loss=0.03348, over 1088764.69 frames. ], batch size: 11, lr: 5.80e-03, grad_scale: 16.0 +2022-11-16 08:03:36,667 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.185e+01 1.342e+02 1.750e+02 2.213e+02 4.524e+02, threshold=3.499e+02, percent-clipped=3.0 +2022-11-16 08:03:43,303 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=96551.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:04:15,942 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.24 vs. limit=2.0 +2022-11-16 08:04:40,268 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=96636.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:04:40,791 INFO [train.py:876] (2/4) Epoch 14, batch 2100, loss[loss=0.1239, simple_loss=0.1451, pruned_loss=0.05137, over 5705.00 frames. ], tot_loss[loss=0.1007, simple_loss=0.132, pruned_loss=0.03472, over 1078174.86 frames. ], batch size: 34, lr: 5.80e-03, grad_scale: 8.0 +2022-11-16 08:04:45,310 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.473e+01 1.415e+02 1.802e+02 2.261e+02 4.449e+02, threshold=3.604e+02, percent-clipped=2.0 +2022-11-16 08:05:10,359 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=96680.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:05:12,896 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=96684.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:05:48,004 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=96736.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:05:48,603 INFO [train.py:876] (2/4) Epoch 14, batch 2200, loss[loss=0.09976, simple_loss=0.1333, pruned_loss=0.03308, over 5282.00 frames. ], tot_loss[loss=0.1, simple_loss=0.1318, pruned_loss=0.03411, over 1074880.49 frames. ], batch size: 79, lr: 5.79e-03, grad_scale: 8.0 +2022-11-16 08:05:52,487 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.562e+01 1.449e+02 1.748e+02 2.179e+02 3.480e+02, threshold=3.495e+02, percent-clipped=0.0 +2022-11-16 08:05:53,384 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.67 vs. limit=2.0 +2022-11-16 08:05:55,273 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=96746.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:06:15,829 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.9335, 1.3063, 1.1663, 1.2118, 1.2101, 1.6981, 1.4133, 1.3895], + device='cuda:2'), covar=tensor([0.3588, 0.1052, 0.3445, 0.2920, 0.2186, 0.0636, 0.2291, 0.2390], + device='cuda:2'), in_proj_covar=tensor([0.0114, 0.0104, 0.0105, 0.0105, 0.0076, 0.0072, 0.0085, 0.0096], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-16 08:06:15,855 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=96776.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:06:20,964 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=96784.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:06:46,717 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=96821.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:06:46,749 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.0091, 2.9472, 2.7116, 2.9810, 2.5529, 2.9040, 3.1866, 3.4693], + device='cuda:2'), covar=tensor([0.0875, 0.1446, 0.1441, 0.1354, 0.1332, 0.0801, 0.0870, 0.0952], + device='cuda:2'), in_proj_covar=tensor([0.0116, 0.0109, 0.0107, 0.0109, 0.0094, 0.0104, 0.0098, 0.0085], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-16 08:06:55,851 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.8694, 5.0067, 3.7769, 2.2095, 4.6908, 1.9514, 4.6857, 2.7247], + device='cuda:2'), covar=tensor([0.1174, 0.0093, 0.0417, 0.1773, 0.0135, 0.1633, 0.0131, 0.1376], + device='cuda:2'), in_proj_covar=tensor([0.0117, 0.0103, 0.0114, 0.0110, 0.0101, 0.0117, 0.0099, 0.0108], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0005, 0.0005, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 08:06:56,979 INFO [train.py:876] (2/4) Epoch 14, batch 2300, loss[loss=0.08912, simple_loss=0.1218, pruned_loss=0.0282, over 5500.00 frames. ], tot_loss[loss=0.09877, simple_loss=0.1306, pruned_loss=0.03346, over 1078233.56 frames. ], batch size: 12, lr: 5.79e-03, grad_scale: 4.0 +2022-11-16 08:06:57,149 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=96837.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:07:01,480 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.903e+01 1.452e+02 1.773e+02 2.447e+02 7.176e+02, threshold=3.545e+02, percent-clipped=7.0 +2022-11-16 08:07:14,931 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.39 vs. limit=5.0 +2022-11-16 08:07:18,438 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=96869.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:07:33,324 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.92 vs. limit=2.0 +2022-11-16 08:07:57,174 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.5706, 1.8508, 1.7775, 1.2781, 1.7330, 2.1096, 2.1345, 2.1931], + device='cuda:2'), covar=tensor([0.1922, 0.1547, 0.1921, 0.2723, 0.1432, 0.1266, 0.0935, 0.1281], + device='cuda:2'), in_proj_covar=tensor([0.0162, 0.0177, 0.0166, 0.0180, 0.0183, 0.0202, 0.0169, 0.0181], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 08:08:04,682 INFO [train.py:876] (2/4) Epoch 14, batch 2400, loss[loss=0.1272, simple_loss=0.1562, pruned_loss=0.0491, over 5560.00 frames. ], tot_loss[loss=0.09897, simple_loss=0.1315, pruned_loss=0.03325, over 1084401.68 frames. ], batch size: 54, lr: 5.79e-03, grad_scale: 8.0 +2022-11-16 08:08:09,599 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.192e+01 1.391e+02 1.679e+02 2.056e+02 5.016e+02, threshold=3.358e+02, percent-clipped=6.0 +2022-11-16 08:08:28,660 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1858, 2.5752, 2.6709, 1.5227, 2.6967, 3.0697, 2.9356, 3.3621], + device='cuda:2'), covar=tensor([0.1755, 0.1827, 0.1054, 0.2817, 0.0824, 0.0944, 0.0634, 0.0845], + device='cuda:2'), in_proj_covar=tensor([0.0162, 0.0178, 0.0167, 0.0180, 0.0184, 0.0202, 0.0169, 0.0181], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 08:08:34,219 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=96980.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:08:48,045 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=97000.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:09:03,258 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.1936, 2.6640, 3.1752, 3.8750, 3.9757, 3.1334, 2.7578, 4.1109], + device='cuda:2'), covar=tensor([0.0753, 0.2418, 0.1953, 0.3249, 0.1615, 0.2587, 0.1872, 0.0805], + device='cuda:2'), in_proj_covar=tensor([0.0265, 0.0199, 0.0189, 0.0298, 0.0229, 0.0203, 0.0191, 0.0251], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0006], + device='cuda:2') +2022-11-16 08:09:06,579 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=97028.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:09:08,764 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.92 vs. limit=5.0 +2022-11-16 08:09:12,710 INFO [train.py:876] (2/4) Epoch 14, batch 2500, loss[loss=0.1181, simple_loss=0.1509, pruned_loss=0.04266, over 5532.00 frames. ], tot_loss[loss=0.1005, simple_loss=0.1325, pruned_loss=0.03428, over 1084437.69 frames. ], batch size: 49, lr: 5.78e-03, grad_scale: 8.0 +2022-11-16 08:09:17,249 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.842e+01 1.491e+02 1.726e+02 2.085e+02 3.787e+02, threshold=3.452e+02, percent-clipped=1.0 +2022-11-16 08:09:18,707 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=97046.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:09:29,249 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=97061.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:09:51,076 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=97094.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:09:59,243 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.76 vs. limit=5.0 +2022-11-16 08:10:12,115 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.0326, 2.3937, 2.9375, 3.7498, 3.9355, 2.9850, 2.7217, 3.8103], + device='cuda:2'), covar=tensor([0.0574, 0.2544, 0.2050, 0.3107, 0.0888, 0.2384, 0.2063, 0.0852], + device='cuda:2'), in_proj_covar=tensor([0.0264, 0.0198, 0.0188, 0.0296, 0.0228, 0.0201, 0.0190, 0.0250], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0006], + device='cuda:2') +2022-11-16 08:10:16,527 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=97132.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:10:19,667 INFO [train.py:876] (2/4) Epoch 14, batch 2600, loss[loss=0.1054, simple_loss=0.1464, pruned_loss=0.0322, over 5716.00 frames. ], tot_loss[loss=0.09987, simple_loss=0.132, pruned_loss=0.03387, over 1084710.96 frames. ], batch size: 12, lr: 5.78e-03, grad_scale: 8.0 +2022-11-16 08:10:22,485 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.62 vs. limit=5.0 +2022-11-16 08:10:25,003 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.432e+01 1.490e+02 1.874e+02 2.362e+02 5.488e+02, threshold=3.748e+02, percent-clipped=3.0 +2022-11-16 08:10:36,712 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.77 vs. limit=2.0 +2022-11-16 08:10:54,884 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.5581, 3.4682, 3.4802, 3.1782, 1.8858, 3.5863, 2.2530, 3.1213], + device='cuda:2'), covar=tensor([0.0421, 0.0274, 0.0230, 0.0409, 0.0667, 0.0236, 0.0518, 0.0222], + device='cuda:2'), in_proj_covar=tensor([0.0196, 0.0185, 0.0182, 0.0209, 0.0197, 0.0185, 0.0194, 0.0188], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 08:11:27,617 INFO [train.py:876] (2/4) Epoch 14, batch 2700, loss[loss=0.1345, simple_loss=0.1536, pruned_loss=0.05764, over 5525.00 frames. ], tot_loss[loss=0.09881, simple_loss=0.1312, pruned_loss=0.03319, over 1083872.15 frames. ], batch size: 54, lr: 5.78e-03, grad_scale: 8.0 +2022-11-16 08:11:32,090 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.527e+01 1.417e+02 1.707e+02 2.038e+02 4.656e+02, threshold=3.414e+02, percent-clipped=3.0 +2022-11-16 08:11:43,176 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=97260.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:11:55,778 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.58 vs. limit=2.0 +2022-11-16 08:12:24,373 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=97321.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:12:34,935 INFO [train.py:876] (2/4) Epoch 14, batch 2800, loss[loss=0.07885, simple_loss=0.123, pruned_loss=0.01737, over 5706.00 frames. ], tot_loss[loss=0.0986, simple_loss=0.1311, pruned_loss=0.03304, over 1086937.82 frames. ], batch size: 17, lr: 5.77e-03, grad_scale: 8.0 +2022-11-16 08:12:38,288 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=97342.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:12:39,406 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.283e+01 1.377e+02 1.617e+02 1.956e+02 4.684e+02, threshold=3.233e+02, percent-clipped=2.0 +2022-11-16 08:12:41,607 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=97347.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:12:47,491 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=97356.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:12:59,840 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=97374.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:13:05,483 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.7817, 4.2407, 3.7955, 3.6233, 2.0697, 4.0583, 2.4123, 3.5165], + device='cuda:2'), covar=tensor([0.0485, 0.0163, 0.0229, 0.0340, 0.0719, 0.0162, 0.0605, 0.0161], + device='cuda:2'), in_proj_covar=tensor([0.0197, 0.0185, 0.0184, 0.0210, 0.0197, 0.0186, 0.0195, 0.0188], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 08:13:09,440 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.29 vs. limit=5.0 +2022-11-16 08:13:13,819 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.43 vs. limit=5.0 +2022-11-16 08:13:17,025 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.3227, 3.9779, 3.0435, 1.9325, 3.5733, 1.5801, 3.5074, 2.0524], + device='cuda:2'), covar=tensor([0.1529, 0.0186, 0.0868, 0.1870, 0.0297, 0.1833, 0.0399, 0.1468], + device='cuda:2'), in_proj_covar=tensor([0.0118, 0.0103, 0.0115, 0.0111, 0.0102, 0.0118, 0.0100, 0.0108], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0005, 0.0005, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 08:13:19,936 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=97403.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 08:13:23,119 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=97408.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:13:24,356 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.1926, 3.8183, 4.0215, 3.6650, 4.2347, 3.9373, 3.8466, 4.2228], + device='cuda:2'), covar=tensor([0.0386, 0.0454, 0.0487, 0.0450, 0.0426, 0.0351, 0.0391, 0.0389], + device='cuda:2'), in_proj_covar=tensor([0.0151, 0.0162, 0.0114, 0.0151, 0.0192, 0.0117, 0.0133, 0.0160], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0004, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-16 08:13:39,502 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=97432.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:13:41,861 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=97435.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:13:42,455 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0904, 3.5785, 2.4601, 3.3202, 2.6975, 2.4534, 1.9058, 3.0716], + device='cuda:2'), covar=tensor([0.1413, 0.0276, 0.1166, 0.0507, 0.1276, 0.1205, 0.2095, 0.0522], + device='cuda:2'), in_proj_covar=tensor([0.0154, 0.0141, 0.0154, 0.0148, 0.0173, 0.0167, 0.0157, 0.0157], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 08:13:43,021 INFO [train.py:876] (2/4) Epoch 14, batch 2900, loss[loss=0.1414, simple_loss=0.1624, pruned_loss=0.06014, over 5549.00 frames. ], tot_loss[loss=0.09714, simple_loss=0.1301, pruned_loss=0.03209, over 1089264.19 frames. ], batch size: 46, lr: 5.77e-03, grad_scale: 8.0 +2022-11-16 08:13:47,950 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.266e+01 1.378e+02 1.705e+02 2.124e+02 3.777e+02, threshold=3.411e+02, percent-clipped=2.0 +2022-11-16 08:14:12,676 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=97480.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:14:29,268 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.5561, 1.6570, 1.6249, 1.4960, 1.6491, 1.6266, 1.4405, 1.4555], + device='cuda:2'), covar=tensor([0.0077, 0.0053, 0.0050, 0.0064, 0.0058, 0.0051, 0.0070, 0.0062], + device='cuda:2'), in_proj_covar=tensor([0.0067, 0.0061, 0.0060, 0.0066, 0.0064, 0.0060, 0.0058, 0.0057], + device='cuda:2'), out_proj_covar=tensor([5.9517e-05, 5.4381e-05, 5.2719e-05, 5.7713e-05, 5.6170e-05, 5.1709e-05, + 5.1274e-05, 4.9369e-05], device='cuda:2') +2022-11-16 08:14:51,002 INFO [train.py:876] (2/4) Epoch 14, batch 3000, loss[loss=0.1005, simple_loss=0.1357, pruned_loss=0.03266, over 5719.00 frames. ], tot_loss[loss=0.09879, simple_loss=0.1311, pruned_loss=0.03324, over 1087701.03 frames. ], batch size: 28, lr: 5.77e-03, grad_scale: 8.0 +2022-11-16 08:14:51,002 INFO [train.py:899] (2/4) Computing validation loss +2022-11-16 08:14:56,178 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.6911, 2.7931, 2.6347, 2.7379, 2.4268, 2.0930, 2.6203, 3.0312], + device='cuda:2'), covar=tensor([0.1196, 0.1280, 0.1440, 0.0917, 0.1177, 0.2355, 0.1173, 0.1429], + device='cuda:2'), in_proj_covar=tensor([0.0115, 0.0108, 0.0106, 0.0108, 0.0093, 0.0103, 0.0097, 0.0085], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-16 08:15:04,178 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.3721, 2.0314, 2.4243, 2.0818, 1.4936, 2.2648, 2.1890, 1.8798], + device='cuda:2'), covar=tensor([0.0056, 0.0060, 0.0032, 0.0055, 0.0188, 0.0081, 0.0039, 0.0042], + device='cuda:2'), in_proj_covar=tensor([0.0032, 0.0029, 0.0029, 0.0038, 0.0034, 0.0030, 0.0037, 0.0036], + device='cuda:2'), out_proj_covar=tensor([2.9130e-05, 2.7469e-05, 2.6392e-05, 3.6499e-05, 3.1499e-05, 2.9003e-05, + 3.5050e-05, 3.4367e-05], device='cuda:2') +2022-11-16 08:15:08,553 INFO [train.py:908] (2/4) Epoch 14, validation: loss=0.178, simple_loss=0.188, pruned_loss=0.08395, over 1530663.00 frames. +2022-11-16 08:15:08,554 INFO [train.py:909] (2/4) Maximum memory allocated so far is 4715MB +2022-11-16 08:15:12,972 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.365e+01 1.436e+02 1.776e+02 2.242e+02 5.969e+02, threshold=3.553e+02, percent-clipped=3.0 +2022-11-16 08:15:20,367 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=97555.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:15:28,790 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.9151, 2.5776, 3.5527, 3.1804, 3.7689, 2.5141, 3.2574, 3.9643], + device='cuda:2'), covar=tensor([0.0922, 0.1448, 0.0823, 0.1419, 0.0712, 0.1494, 0.1535, 0.0893], + device='cuda:2'), in_proj_covar=tensor([0.0245, 0.0193, 0.0216, 0.0212, 0.0241, 0.0195, 0.0226, 0.0230], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 08:15:38,641 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.82 vs. limit=2.0 +2022-11-16 08:15:45,206 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.7295, 4.7914, 3.2397, 4.7380, 3.7246, 3.2598, 2.6436, 4.1845], + device='cuda:2'), covar=tensor([0.1262, 0.0196, 0.0971, 0.0264, 0.0685, 0.0909, 0.1846, 0.0264], + device='cuda:2'), in_proj_covar=tensor([0.0154, 0.0142, 0.0155, 0.0148, 0.0174, 0.0167, 0.0158, 0.0158], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 08:15:46,549 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.4456, 4.4860, 3.3450, 2.0464, 4.1239, 1.7364, 4.2588, 2.3702], + device='cuda:2'), covar=tensor([0.1354, 0.0140, 0.0716, 0.1744, 0.0219, 0.1690, 0.0191, 0.1353], + device='cuda:2'), in_proj_covar=tensor([0.0118, 0.0103, 0.0115, 0.0111, 0.0102, 0.0118, 0.0100, 0.0109], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0005, 0.0005, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 08:16:01,568 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=97616.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:16:01,662 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=97616.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:16:15,591 INFO [train.py:876] (2/4) Epoch 14, batch 3100, loss[loss=0.08517, simple_loss=0.1161, pruned_loss=0.02713, over 5596.00 frames. ], tot_loss[loss=0.09882, simple_loss=0.1317, pruned_loss=0.03297, over 1096263.04 frames. ], batch size: 23, lr: 5.77e-03, grad_scale: 8.0 +2022-11-16 08:16:20,445 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.977e+01 1.456e+02 1.784e+02 2.207e+02 3.883e+02, threshold=3.567e+02, percent-clipped=1.0 +2022-11-16 08:16:28,852 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=97656.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:16:30,326 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.7107, 1.3982, 1.9843, 1.4422, 1.8943, 1.9805, 1.4049, 1.6530], + device='cuda:2'), covar=tensor([0.0441, 0.0566, 0.0174, 0.0827, 0.0968, 0.0509, 0.0474, 0.0250], + device='cuda:2'), in_proj_covar=tensor([0.0016, 0.0026, 0.0018, 0.0022, 0.0018, 0.0017, 0.0025, 0.0017], + device='cuda:2'), out_proj_covar=tensor([9.2353e-05, 1.2998e-04, 9.9008e-05, 1.1191e-04, 1.0016e-04, 9.3488e-05, + 1.2335e-04, 9.3451e-05], device='cuda:2') +2022-11-16 08:16:41,524 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=97674.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 08:16:57,841 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=97698.0, num_to_drop=1, layers_to_drop={3} +2022-11-16 08:17:01,176 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=97703.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:17:01,770 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=97704.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:17:19,799 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=97730.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:17:23,169 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=97735.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 08:17:24,291 INFO [train.py:876] (2/4) Epoch 14, batch 3200, loss[loss=0.07056, simple_loss=0.09244, pruned_loss=0.02434, over 5427.00 frames. ], tot_loss[loss=0.09939, simple_loss=0.1322, pruned_loss=0.0333, over 1093826.91 frames. ], batch size: 9, lr: 5.76e-03, grad_scale: 8.0 +2022-11-16 08:17:29,198 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.777e+01 1.455e+02 1.714e+02 2.116e+02 4.590e+02, threshold=3.428e+02, percent-clipped=2.0 +2022-11-16 08:17:45,028 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.15 vs. limit=2.0 +2022-11-16 08:17:54,265 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.5325, 2.4259, 2.5166, 2.4717, 2.5292, 2.3314, 2.6728, 2.5542], + device='cuda:2'), covar=tensor([0.0500, 0.0909, 0.0592, 0.1222, 0.0639, 0.0555, 0.0949, 0.0931], + device='cuda:2'), in_proj_covar=tensor([0.0090, 0.0113, 0.0097, 0.0125, 0.0091, 0.0084, 0.0148, 0.0109], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 08:18:32,099 INFO [train.py:876] (2/4) Epoch 14, batch 3300, loss[loss=0.1053, simple_loss=0.143, pruned_loss=0.03375, over 5541.00 frames. ], tot_loss[loss=0.0996, simple_loss=0.132, pruned_loss=0.03361, over 1088958.35 frames. ], batch size: 15, lr: 5.76e-03, grad_scale: 8.0 +2022-11-16 08:18:36,477 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.779e+01 1.482e+02 1.736e+02 2.155e+02 3.992e+02, threshold=3.473e+02, percent-clipped=3.0 +2022-11-16 08:18:55,360 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.90 vs. limit=2.0 +2022-11-16 08:19:05,931 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.3627, 4.2797, 3.3202, 1.9585, 3.9631, 1.7306, 3.8363, 2.1972], + device='cuda:2'), covar=tensor([0.1464, 0.0134, 0.0498, 0.1898, 0.0210, 0.1788, 0.0204, 0.1579], + device='cuda:2'), in_proj_covar=tensor([0.0119, 0.0104, 0.0115, 0.0112, 0.0103, 0.0119, 0.0100, 0.0110], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0005, 0.0005, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 08:19:06,980 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=97888.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:19:09,005 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.3095, 0.8479, 1.0687, 0.9680, 1.1399, 1.0862, 0.6850, 0.9218], + device='cuda:2'), covar=tensor([0.0414, 0.0437, 0.0357, 0.0566, 0.0326, 0.0398, 0.0837, 0.0350], + device='cuda:2'), in_proj_covar=tensor([0.0016, 0.0026, 0.0018, 0.0022, 0.0018, 0.0017, 0.0025, 0.0017], + device='cuda:2'), out_proj_covar=tensor([9.2675e-05, 1.3020e-04, 9.9570e-05, 1.1244e-04, 9.9911e-05, 9.4062e-05, + 1.2377e-04, 9.3594e-05], device='cuda:2') +2022-11-16 08:19:22,252 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=97911.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:19:25,495 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=97916.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:19:27,227 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.9303, 2.3784, 3.5164, 3.2143, 3.7064, 2.2662, 3.2597, 3.8664], + device='cuda:2'), covar=tensor([0.0607, 0.1382, 0.0830, 0.1239, 0.0593, 0.1615, 0.1026, 0.0659], + device='cuda:2'), in_proj_covar=tensor([0.0242, 0.0191, 0.0214, 0.0210, 0.0237, 0.0194, 0.0222, 0.0227], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 08:19:39,524 INFO [train.py:876] (2/4) Epoch 14, batch 3400, loss[loss=0.09139, simple_loss=0.1295, pruned_loss=0.02662, over 5779.00 frames. ], tot_loss[loss=0.09931, simple_loss=0.1318, pruned_loss=0.03342, over 1087265.96 frames. ], batch size: 21, lr: 5.76e-03, grad_scale: 8.0 +2022-11-16 08:19:44,337 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.552e+01 1.387e+02 1.696e+02 2.106e+02 3.635e+02, threshold=3.392e+02, percent-clipped=1.0 +2022-11-16 08:19:47,857 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=97949.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:19:48,514 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.3533, 1.7660, 1.4665, 1.4507, 1.6362, 1.6675, 1.3551, 1.6787], + device='cuda:2'), covar=tensor([0.0076, 0.0060, 0.0073, 0.0068, 0.0071, 0.0052, 0.0078, 0.0068], + device='cuda:2'), in_proj_covar=tensor([0.0067, 0.0063, 0.0061, 0.0066, 0.0064, 0.0060, 0.0058, 0.0057], + device='cuda:2'), out_proj_covar=tensor([5.9897e-05, 5.5469e-05, 5.3332e-05, 5.8398e-05, 5.6834e-05, 5.2358e-05, + 5.1676e-05, 4.9937e-05], device='cuda:2') +2022-11-16 08:19:58,064 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=97964.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:20:21,518 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=97998.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:20:25,087 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=98003.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:20:43,496 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=98030.0, num_to_drop=1, layers_to_drop={3} +2022-11-16 08:20:43,533 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=98030.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:20:47,942 INFO [train.py:876] (2/4) Epoch 14, batch 3500, loss[loss=0.08855, simple_loss=0.1286, pruned_loss=0.02425, over 5750.00 frames. ], tot_loss[loss=0.09977, simple_loss=0.1319, pruned_loss=0.0338, over 1090391.81 frames. ], batch size: 13, lr: 5.75e-03, grad_scale: 8.0 +2022-11-16 08:20:52,493 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.923e+01 1.343e+02 1.705e+02 2.352e+02 4.621e+02, threshold=3.411e+02, percent-clipped=6.0 +2022-11-16 08:20:53,902 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=98046.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:20:57,585 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=98051.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:21:10,182 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.8421, 2.9318, 3.3693, 1.7808, 3.1956, 3.5258, 3.4506, 3.8436], + device='cuda:2'), covar=tensor([0.1846, 0.1708, 0.0753, 0.3056, 0.0958, 0.0753, 0.0623, 0.0774], + device='cuda:2'), in_proj_covar=tensor([0.0164, 0.0178, 0.0167, 0.0180, 0.0186, 0.0205, 0.0170, 0.0183], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 08:21:15,969 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=98078.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:21:35,054 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=98106.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:21:55,983 INFO [train.py:876] (2/4) Epoch 14, batch 3600, loss[loss=0.1722, simple_loss=0.1707, pruned_loss=0.08687, over 3111.00 frames. ], tot_loss[loss=0.09953, simple_loss=0.1315, pruned_loss=0.03378, over 1087374.50 frames. ], batch size: 284, lr: 5.75e-03, grad_scale: 8.0 +2022-11-16 08:21:59,350 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.73 vs. limit=2.0 +2022-11-16 08:22:00,947 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.265e+01 1.372e+02 1.706e+02 2.197e+02 4.106e+02, threshold=3.412e+02, percent-clipped=3.0 +2022-11-16 08:22:06,640 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.61 vs. limit=2.0 +2022-11-16 08:22:16,979 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=98167.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:22:22,648 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.19 vs. limit=5.0 +2022-11-16 08:22:47,082 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=98211.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:23:04,728 INFO [train.py:876] (2/4) Epoch 14, batch 3700, loss[loss=0.1025, simple_loss=0.1455, pruned_loss=0.0298, over 5671.00 frames. ], tot_loss[loss=0.09982, simple_loss=0.1315, pruned_loss=0.03408, over 1081625.19 frames. ], batch size: 29, lr: 5.75e-03, grad_scale: 8.0 +2022-11-16 08:23:09,249 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.991e+01 1.391e+02 1.713e+02 2.053e+02 4.916e+02, threshold=3.427e+02, percent-clipped=4.0 +2022-11-16 08:23:09,346 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=98244.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:23:15,428 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.7983, 3.7070, 3.9071, 3.4972, 3.9185, 3.6582, 1.5211, 4.0561], + device='cuda:2'), covar=tensor([0.0272, 0.0276, 0.0231, 0.0386, 0.0286, 0.0461, 0.3112, 0.0272], + device='cuda:2'), in_proj_covar=tensor([0.0106, 0.0090, 0.0089, 0.0084, 0.0105, 0.0091, 0.0132, 0.0110], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 08:23:19,339 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=98259.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:23:54,162 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.7167, 4.4761, 4.6066, 4.6045, 4.3670, 4.3389, 5.0030, 4.4237], + device='cuda:2'), covar=tensor([0.0370, 0.0664, 0.0411, 0.0988, 0.0456, 0.0274, 0.0520, 0.0675], + device='cuda:2'), in_proj_covar=tensor([0.0089, 0.0110, 0.0096, 0.0123, 0.0090, 0.0082, 0.0146, 0.0108], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 08:23:59,208 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.97 vs. limit=2.0 +2022-11-16 08:24:08,801 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=98330.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 08:24:13,989 INFO [train.py:876] (2/4) Epoch 14, batch 3800, loss[loss=0.1495, simple_loss=0.1582, pruned_loss=0.07041, over 5361.00 frames. ], tot_loss[loss=0.09893, simple_loss=0.1313, pruned_loss=0.0333, over 1084703.02 frames. ], batch size: 70, lr: 5.74e-03, grad_scale: 4.0 +2022-11-16 08:24:19,594 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.532e+01 1.388e+02 1.687e+02 2.091e+02 4.683e+02, threshold=3.374e+02, percent-clipped=2.0 +2022-11-16 08:24:31,022 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.47 vs. limit=5.0 +2022-11-16 08:24:42,604 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=98378.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 08:25:04,761 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=98410.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:25:22,860 INFO [train.py:876] (2/4) Epoch 14, batch 3900, loss[loss=0.122, simple_loss=0.1496, pruned_loss=0.04719, over 5772.00 frames. ], tot_loss[loss=0.09911, simple_loss=0.1315, pruned_loss=0.03334, over 1086171.29 frames. ], batch size: 21, lr: 5.74e-03, grad_scale: 4.0 +2022-11-16 08:25:27,986 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.450e+01 1.480e+02 1.725e+02 2.158e+02 4.236e+02, threshold=3.450e+02, percent-clipped=3.0 +2022-11-16 08:25:33,047 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.5042, 4.8534, 2.9858, 4.5648, 3.6597, 3.1492, 2.7021, 4.2001], + device='cuda:2'), covar=tensor([0.1302, 0.0201, 0.1070, 0.0345, 0.0745, 0.1033, 0.1631, 0.0309], + device='cuda:2'), in_proj_covar=tensor([0.0152, 0.0139, 0.0152, 0.0146, 0.0172, 0.0165, 0.0154, 0.0155], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 08:25:39,862 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=98462.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:25:45,712 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=98471.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:26:23,356 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.8459, 2.1964, 2.4480, 3.2099, 3.1175, 2.3408, 2.0791, 3.2447], + device='cuda:2'), covar=tensor([0.1578, 0.2553, 0.2128, 0.1575, 0.1153, 0.2995, 0.2234, 0.0811], + device='cuda:2'), in_proj_covar=tensor([0.0259, 0.0198, 0.0189, 0.0297, 0.0226, 0.0201, 0.0188, 0.0248], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0006], + device='cuda:2') +2022-11-16 08:26:30,067 INFO [train.py:876] (2/4) Epoch 14, batch 4000, loss[loss=0.1034, simple_loss=0.1339, pruned_loss=0.03648, over 5523.00 frames. ], tot_loss[loss=0.09957, simple_loss=0.1321, pruned_loss=0.03353, over 1086698.48 frames. ], batch size: 40, lr: 5.74e-03, grad_scale: 8.0 +2022-11-16 08:26:34,211 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=98543.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:26:34,794 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=98544.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:26:35,247 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.016e+02 1.413e+02 1.702e+02 2.140e+02 3.638e+02, threshold=3.404e+02, percent-clipped=2.0 +2022-11-16 08:27:01,223 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.8068, 1.4073, 1.8683, 1.1010, 1.8072, 1.8119, 1.1835, 1.4016], + device='cuda:2'), covar=tensor([0.0677, 0.0631, 0.0317, 0.0913, 0.0802, 0.0277, 0.1148, 0.0537], + device='cuda:2'), in_proj_covar=tensor([0.0016, 0.0026, 0.0018, 0.0021, 0.0018, 0.0017, 0.0024, 0.0017], + device='cuda:2'), out_proj_covar=tensor([9.1604e-05, 1.2839e-04, 9.8543e-05, 1.1069e-04, 9.8953e-05, 9.3096e-05, + 1.2159e-04, 9.2305e-05], device='cuda:2') +2022-11-16 08:27:07,307 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=98592.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:27:15,820 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=98604.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:27:37,138 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.58 vs. limit=2.0 +2022-11-16 08:27:37,423 INFO [train.py:876] (2/4) Epoch 14, batch 4100, loss[loss=0.08876, simple_loss=0.117, pruned_loss=0.03027, over 5120.00 frames. ], tot_loss[loss=0.1002, simple_loss=0.1322, pruned_loss=0.03407, over 1080341.23 frames. ], batch size: 91, lr: 5.74e-03, grad_scale: 8.0 +2022-11-16 08:27:39,458 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.6824, 5.1656, 5.4178, 4.9265, 5.7332, 5.4378, 4.7804, 5.6977], + device='cuda:2'), covar=tensor([0.0321, 0.0350, 0.0389, 0.0482, 0.0341, 0.0297, 0.0314, 0.0271], + device='cuda:2'), in_proj_covar=tensor([0.0147, 0.0156, 0.0110, 0.0147, 0.0187, 0.0115, 0.0131, 0.0157], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-16 08:27:42,138 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.76 vs. limit=2.0 +2022-11-16 08:27:42,931 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.009e+01 1.414e+02 1.742e+02 2.183e+02 4.032e+02, threshold=3.484e+02, percent-clipped=2.0 +2022-11-16 08:27:48,719 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.32 vs. limit=2.0 +2022-11-16 08:28:45,156 INFO [train.py:876] (2/4) Epoch 14, batch 4200, loss[loss=0.07925, simple_loss=0.1107, pruned_loss=0.0239, over 5736.00 frames. ], tot_loss[loss=0.09937, simple_loss=0.1319, pruned_loss=0.03341, over 1089909.80 frames. ], batch size: 27, lr: 5.73e-03, grad_scale: 8.0 +2022-11-16 08:28:46,734 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.6806, 2.3197, 2.6715, 3.6392, 3.6395, 2.7782, 2.2348, 3.6746], + device='cuda:2'), covar=tensor([0.1051, 0.2686, 0.2277, 0.2114, 0.1178, 0.3204, 0.2248, 0.0686], + device='cuda:2'), in_proj_covar=tensor([0.0257, 0.0198, 0.0187, 0.0296, 0.0223, 0.0200, 0.0187, 0.0248], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0006], + device='cuda:2') +2022-11-16 08:28:50,365 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.013e+02 1.341e+02 1.638e+02 2.138e+02 3.541e+02, threshold=3.276e+02, percent-clipped=2.0 +2022-11-16 08:28:51,166 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.7186, 4.6897, 3.5344, 1.8865, 4.2805, 2.1145, 4.2588, 2.3427], + device='cuda:2'), covar=tensor([0.1338, 0.0113, 0.0517, 0.2240, 0.0201, 0.1579, 0.0188, 0.1543], + device='cuda:2'), in_proj_covar=tensor([0.0117, 0.0103, 0.0115, 0.0112, 0.0103, 0.0119, 0.0100, 0.0110], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0005, 0.0005, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 08:28:54,531 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.49 vs. limit=2.0 +2022-11-16 08:28:55,075 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.7924, 2.3145, 2.2413, 1.4201, 2.7832, 2.6714, 2.5765, 2.8654], + device='cuda:2'), covar=tensor([0.1911, 0.1814, 0.1501, 0.2985, 0.0675, 0.1434, 0.0626, 0.1072], + device='cuda:2'), in_proj_covar=tensor([0.0161, 0.0176, 0.0167, 0.0179, 0.0184, 0.0203, 0.0168, 0.0181], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 08:29:01,486 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=98762.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:29:04,385 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=98766.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:29:18,894 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.6801, 2.6888, 3.4823, 4.1166, 4.3594, 3.5055, 3.0571, 4.3084], + device='cuda:2'), covar=tensor([0.0353, 0.2465, 0.1991, 0.3034, 0.0827, 0.2580, 0.1818, 0.0658], + device='cuda:2'), in_proj_covar=tensor([0.0258, 0.0199, 0.0188, 0.0298, 0.0224, 0.0201, 0.0188, 0.0249], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0006], + device='cuda:2') +2022-11-16 08:29:34,477 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=98810.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:29:53,231 INFO [train.py:876] (2/4) Epoch 14, batch 4300, loss[loss=0.1133, simple_loss=0.1543, pruned_loss=0.0362, over 5715.00 frames. ], tot_loss[loss=0.09862, simple_loss=0.1318, pruned_loss=0.03269, over 1095341.31 frames. ], batch size: 34, lr: 5.73e-03, grad_scale: 8.0 +2022-11-16 08:29:58,781 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.664e+01 1.356e+02 1.673e+02 1.998e+02 3.650e+02, threshold=3.347e+02, percent-clipped=3.0 +2022-11-16 08:30:08,019 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.6394, 2.1260, 2.2844, 2.8083, 2.8944, 2.2830, 1.9902, 2.9503], + device='cuda:2'), covar=tensor([0.1962, 0.2266, 0.1922, 0.1883, 0.1454, 0.2863, 0.2198, 0.1164], + device='cuda:2'), in_proj_covar=tensor([0.0259, 0.0200, 0.0188, 0.0296, 0.0225, 0.0201, 0.0188, 0.0250], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0006], + device='cuda:2') +2022-11-16 08:30:22,875 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.67 vs. limit=2.0 +2022-11-16 08:30:28,755 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=98890.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:30:32,317 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=98895.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:30:34,859 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=98899.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:31:00,054 INFO [train.py:876] (2/4) Epoch 14, batch 4400, loss[loss=0.09059, simple_loss=0.1116, pruned_loss=0.0348, over 5022.00 frames. ], tot_loss[loss=0.09962, simple_loss=0.1321, pruned_loss=0.03356, over 1089944.43 frames. ], batch size: 109, lr: 5.73e-03, grad_scale: 8.0 +2022-11-16 08:31:05,615 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.027e+01 1.520e+02 1.757e+02 2.071e+02 5.109e+02, threshold=3.514e+02, percent-clipped=4.0 +2022-11-16 08:31:09,746 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=98951.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:31:13,487 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=98956.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:31:23,420 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=98971.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:31:35,052 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.1354, 4.4637, 4.1635, 4.4953, 4.5336, 3.8215, 3.9447, 3.9669], + device='cuda:2'), covar=tensor([0.0487, 0.0486, 0.1407, 0.0479, 0.0411, 0.0515, 0.0792, 0.0646], + device='cuda:2'), in_proj_covar=tensor([0.0136, 0.0185, 0.0276, 0.0178, 0.0225, 0.0176, 0.0192, 0.0182], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0004, 0.0002, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 08:32:04,711 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=99032.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:32:07,922 INFO [train.py:876] (2/4) Epoch 14, batch 4500, loss[loss=0.08726, simple_loss=0.1165, pruned_loss=0.02903, over 5615.00 frames. ], tot_loss[loss=0.09908, simple_loss=0.1314, pruned_loss=0.03338, over 1092329.01 frames. ], batch size: 23, lr: 5.72e-03, grad_scale: 8.0 +2022-11-16 08:32:13,106 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.350e+01 1.443e+02 1.649e+02 2.186e+02 4.322e+02, threshold=3.298e+02, percent-clipped=3.0 +2022-11-16 08:32:17,538 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=99051.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:32:18,102 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.6822, 4.3254, 4.4692, 4.1830, 4.7909, 4.4537, 4.2427, 4.7593], + device='cuda:2'), covar=tensor([0.0640, 0.0907, 0.0686, 0.1044, 0.0713, 0.0700, 0.0637, 0.0804], + device='cuda:2'), in_proj_covar=tensor([0.0148, 0.0156, 0.0109, 0.0146, 0.0188, 0.0115, 0.0131, 0.0156], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-16 08:32:25,580 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.87 vs. limit=2.0 +2022-11-16 08:32:27,874 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=99066.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:32:53,910 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.8003, 2.8801, 2.4595, 3.0152, 2.2684, 2.7470, 2.8403, 2.9684], + device='cuda:2'), covar=tensor([0.1290, 0.1133, 0.2024, 0.1173, 0.1769, 0.1067, 0.1440, 0.2400], + device='cuda:2'), in_proj_covar=tensor([0.0117, 0.0109, 0.0107, 0.0109, 0.0095, 0.0106, 0.0099, 0.0086], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-11-16 08:32:58,461 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=99112.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:32:59,478 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.5330, 1.3003, 1.4596, 1.1307, 1.6116, 1.8601, 0.9413, 1.2938], + device='cuda:2'), covar=tensor([0.0524, 0.0738, 0.0480, 0.0849, 0.0753, 0.0440, 0.1166, 0.0632], + device='cuda:2'), in_proj_covar=tensor([0.0017, 0.0026, 0.0019, 0.0022, 0.0018, 0.0017, 0.0025, 0.0017], + device='cuda:2'), out_proj_covar=tensor([9.4506e-05, 1.3248e-04, 1.0116e-04, 1.1348e-04, 1.0182e-04, 9.5894e-05, + 1.2458e-04, 9.4927e-05], device='cuda:2') +2022-11-16 08:33:00,364 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=99114.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:33:16,014 INFO [train.py:876] (2/4) Epoch 14, batch 4600, loss[loss=0.1339, simple_loss=0.1557, pruned_loss=0.05605, over 5577.00 frames. ], tot_loss[loss=0.09887, simple_loss=0.1306, pruned_loss=0.03358, over 1083512.70 frames. ], batch size: 46, lr: 5.72e-03, grad_scale: 8.0 +2022-11-16 08:33:21,170 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.692e+01 1.360e+02 1.705e+02 2.390e+02 5.580e+02, threshold=3.409e+02, percent-clipped=5.0 +2022-11-16 08:33:56,046 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.1325, 2.6702, 3.7237, 3.0884, 4.1129, 2.7851, 3.6513, 4.0138], + device='cuda:2'), covar=tensor([0.0789, 0.1574, 0.0923, 0.1807, 0.0505, 0.1513, 0.1192, 0.1088], + device='cuda:2'), in_proj_covar=tensor([0.0243, 0.0193, 0.0216, 0.0214, 0.0242, 0.0196, 0.0224, 0.0233], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 08:33:57,238 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=99199.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:34:17,418 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=99229.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:34:22,861 INFO [train.py:876] (2/4) Epoch 14, batch 4700, loss[loss=0.06536, simple_loss=0.1046, pruned_loss=0.01306, over 5227.00 frames. ], tot_loss[loss=0.09831, simple_loss=0.1304, pruned_loss=0.03311, over 1084873.15 frames. ], batch size: 8, lr: 5.72e-03, grad_scale: 8.0 +2022-11-16 08:34:28,049 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.829e+01 1.395e+02 1.659e+02 2.125e+02 3.836e+02, threshold=3.317e+02, percent-clipped=3.0 +2022-11-16 08:34:28,801 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=99246.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:34:29,430 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=99247.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:34:31,949 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.44 vs. limit=5.0 +2022-11-16 08:34:32,132 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=99251.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:34:38,845 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.1010, 2.7286, 3.2375, 3.8990, 3.8412, 3.0330, 2.6244, 3.9558], + device='cuda:2'), covar=tensor([0.0532, 0.2256, 0.1636, 0.2920, 0.1064, 0.2557, 0.2002, 0.0574], + device='cuda:2'), in_proj_covar=tensor([0.0262, 0.0201, 0.0187, 0.0300, 0.0227, 0.0202, 0.0189, 0.0251], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0006], + device='cuda:2') +2022-11-16 08:34:43,506 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=99268.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:34:58,192 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=99290.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:35:21,414 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=99324.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:35:23,245 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=99327.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:35:24,664 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=99329.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:35:29,932 INFO [train.py:876] (2/4) Epoch 14, batch 4800, loss[loss=0.1159, simple_loss=0.149, pruned_loss=0.0414, over 5699.00 frames. ], tot_loss[loss=0.09867, simple_loss=0.1302, pruned_loss=0.03359, over 1081199.67 frames. ], batch size: 19, lr: 5.72e-03, grad_scale: 8.0 +2022-11-16 08:35:35,131 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.609e+01 1.410e+02 1.721e+02 2.085e+02 4.264e+02, threshold=3.442e+02, percent-clipped=4.0 +2022-11-16 08:35:52,621 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.1296, 4.5532, 4.8791, 4.4538, 5.1546, 5.0094, 4.4456, 5.0982], + device='cuda:2'), covar=tensor([0.0326, 0.0400, 0.0447, 0.0368, 0.0336, 0.0207, 0.0322, 0.0306], + device='cuda:2'), in_proj_covar=tensor([0.0150, 0.0157, 0.0110, 0.0147, 0.0189, 0.0116, 0.0131, 0.0158], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-16 08:36:02,043 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=99385.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:36:04,077 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.53 vs. limit=2.0 +2022-11-16 08:36:17,341 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=99407.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:36:25,862 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=99420.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:36:26,511 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.3194, 2.1998, 2.9259, 1.9318, 1.3307, 2.9572, 2.6043, 2.2624], + device='cuda:2'), covar=tensor([0.1204, 0.1850, 0.0687, 0.2887, 0.5553, 0.1730, 0.1858, 0.2015], + device='cuda:2'), in_proj_covar=tensor([0.0117, 0.0109, 0.0107, 0.0107, 0.0081, 0.0074, 0.0089, 0.0099], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-16 08:36:27,230 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.4154, 1.6069, 2.0812, 1.3352, 1.9389, 2.0804, 1.4741, 1.7290], + device='cuda:2'), covar=tensor([0.1732, 0.1216, 0.0763, 0.0726, 0.2056, 0.0907, 0.1649, 0.0751], + device='cuda:2'), in_proj_covar=tensor([0.0017, 0.0026, 0.0019, 0.0022, 0.0018, 0.0017, 0.0025, 0.0017], + device='cuda:2'), out_proj_covar=tensor([9.5215e-05, 1.3258e-04, 1.0088e-04, 1.1372e-04, 1.0187e-04, 9.5910e-05, + 1.2474e-04, 9.5337e-05], device='cuda:2') +2022-11-16 08:36:33,074 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.1882, 4.6498, 4.3105, 4.6564, 4.7018, 3.9760, 4.3107, 4.1685], + device='cuda:2'), covar=tensor([0.0454, 0.0519, 0.1339, 0.0572, 0.0533, 0.0610, 0.0745, 0.0622], + device='cuda:2'), in_proj_covar=tensor([0.0134, 0.0184, 0.0276, 0.0178, 0.0225, 0.0175, 0.0191, 0.0181], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0004, 0.0002, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 08:36:37,295 INFO [train.py:876] (2/4) Epoch 14, batch 4900, loss[loss=0.08555, simple_loss=0.1177, pruned_loss=0.02672, over 5533.00 frames. ], tot_loss[loss=0.09709, simple_loss=0.1296, pruned_loss=0.03227, over 1086315.87 frames. ], batch size: 16, lr: 5.71e-03, grad_scale: 8.0 +2022-11-16 08:36:43,022 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.258e+01 1.532e+02 1.838e+02 2.274e+02 5.384e+02, threshold=3.676e+02, percent-clipped=5.0 +2022-11-16 08:36:58,588 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=99468.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 08:37:00,247 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.47 vs. limit=2.0 +2022-11-16 08:37:07,070 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=99481.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:37:32,840 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.92 vs. limit=5.0 +2022-11-16 08:37:36,160 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.02 vs. limit=5.0 +2022-11-16 08:37:39,780 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=99529.0, num_to_drop=1, layers_to_drop={3} +2022-11-16 08:37:44,760 INFO [train.py:876] (2/4) Epoch 14, batch 5000, loss[loss=0.07011, simple_loss=0.098, pruned_loss=0.02111, over 5183.00 frames. ], tot_loss[loss=0.09654, simple_loss=0.1292, pruned_loss=0.03196, over 1088550.11 frames. ], batch size: 8, lr: 5.71e-03, grad_scale: 8.0 +2022-11-16 08:37:48,983 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.9860, 4.7850, 3.6504, 2.1658, 4.5212, 2.0669, 4.5351, 2.5462], + device='cuda:2'), covar=tensor([0.1253, 0.0160, 0.0604, 0.2174, 0.0207, 0.1691, 0.0185, 0.1558], + device='cuda:2'), in_proj_covar=tensor([0.0117, 0.0104, 0.0116, 0.0111, 0.0103, 0.0118, 0.0099, 0.0109], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0005, 0.0005, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 08:37:50,174 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.179e+01 1.461e+02 1.855e+02 2.293e+02 4.970e+02, threshold=3.710e+02, percent-clipped=2.0 +2022-11-16 08:37:51,246 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=99546.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:37:54,697 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=99551.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:38:06,369 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=99568.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:38:11,688 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.6651, 2.4366, 2.8388, 3.6592, 3.5934, 2.6765, 2.4747, 3.5959], + device='cuda:2'), covar=tensor([0.1080, 0.2848, 0.1810, 0.1653, 0.1117, 0.2982, 0.1968, 0.0991], + device='cuda:2'), in_proj_covar=tensor([0.0260, 0.0199, 0.0187, 0.0298, 0.0226, 0.0199, 0.0190, 0.0251], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0006], + device='cuda:2') +2022-11-16 08:38:17,352 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=99585.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:38:19,747 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.56 vs. limit=2.0 +2022-11-16 08:38:20,015 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=99589.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:38:20,065 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.6850, 4.2648, 3.9007, 3.6019, 2.1457, 4.2073, 2.2980, 3.6067], + device='cuda:2'), covar=tensor([0.0556, 0.0309, 0.0207, 0.0412, 0.0806, 0.0188, 0.0700, 0.0233], + device='cuda:2'), in_proj_covar=tensor([0.0198, 0.0186, 0.0183, 0.0212, 0.0199, 0.0186, 0.0196, 0.0190], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 08:38:23,147 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=99594.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:38:26,737 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=99599.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:38:44,696 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=99624.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:38:46,674 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=99627.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:38:48,061 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=99629.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:38:51,789 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.3160, 2.1585, 2.8764, 1.8881, 1.4616, 3.1946, 2.5749, 2.2421], + device='cuda:2'), covar=tensor([0.0907, 0.1491, 0.0620, 0.2753, 0.2010, 0.1343, 0.1982, 0.1429], + device='cuda:2'), in_proj_covar=tensor([0.0117, 0.0109, 0.0107, 0.0107, 0.0080, 0.0074, 0.0090, 0.0099], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-16 08:38:52,867 INFO [train.py:876] (2/4) Epoch 14, batch 5100, loss[loss=0.09123, simple_loss=0.1275, pruned_loss=0.02747, over 5393.00 frames. ], tot_loss[loss=0.09777, simple_loss=0.1302, pruned_loss=0.03266, over 1079388.80 frames. ], batch size: 11, lr: 5.71e-03, grad_scale: 8.0 +2022-11-16 08:38:58,066 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.215e+01 1.466e+02 1.636e+02 2.038e+02 3.411e+02, threshold=3.271e+02, percent-clipped=0.0 +2022-11-16 08:39:01,527 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=99650.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:39:03,489 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.0955, 2.5223, 2.9664, 3.8825, 3.8663, 2.9939, 2.5675, 3.8731], + device='cuda:2'), covar=tensor([0.0677, 0.2609, 0.2219, 0.3003, 0.1046, 0.2912, 0.2362, 0.0903], + device='cuda:2'), in_proj_covar=tensor([0.0260, 0.0199, 0.0187, 0.0298, 0.0226, 0.0198, 0.0189, 0.0251], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0006], + device='cuda:2') +2022-11-16 08:39:19,180 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=99675.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:39:22,552 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=99680.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:39:22,617 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=99680.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:39:26,476 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.7804, 1.9479, 1.8009, 1.2649, 1.8760, 2.2799, 2.1336, 2.2012], + device='cuda:2'), covar=tensor([0.1785, 0.1654, 0.2260, 0.2760, 0.1267, 0.1074, 0.1241, 0.1467], + device='cuda:2'), in_proj_covar=tensor([0.0161, 0.0176, 0.0167, 0.0180, 0.0186, 0.0205, 0.0171, 0.0181], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 08:39:37,703 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.47 vs. limit=2.0 +2022-11-16 08:39:40,064 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=99707.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:40:00,850 INFO [train.py:876] (2/4) Epoch 14, batch 5200, loss[loss=0.09622, simple_loss=0.1351, pruned_loss=0.02867, over 5674.00 frames. ], tot_loss[loss=0.09872, simple_loss=0.1313, pruned_loss=0.03306, over 1087461.54 frames. ], batch size: 36, lr: 5.70e-03, grad_scale: 8.0 +2022-11-16 08:40:03,569 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=99741.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:40:05,955 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.036e+02 1.374e+02 1.799e+02 2.301e+02 6.123e+02, threshold=3.597e+02, percent-clipped=6.0 +2022-11-16 08:40:06,764 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.2164, 3.6095, 2.7773, 1.8483, 3.3559, 1.4409, 3.3948, 1.9660], + device='cuda:2'), covar=tensor([0.1364, 0.0182, 0.0923, 0.1787, 0.0270, 0.1907, 0.0297, 0.1371], + device='cuda:2'), in_proj_covar=tensor([0.0118, 0.0105, 0.0116, 0.0111, 0.0103, 0.0119, 0.0100, 0.0109], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0005, 0.0005, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 08:40:07,370 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.3460, 3.9639, 4.1581, 3.9337, 4.3948, 4.2606, 4.0204, 4.4308], + device='cuda:2'), covar=tensor([0.0409, 0.0412, 0.0481, 0.0377, 0.0401, 0.0251, 0.0349, 0.0301], + device='cuda:2'), in_proj_covar=tensor([0.0152, 0.0160, 0.0113, 0.0149, 0.0193, 0.0118, 0.0133, 0.0160], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0004, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-16 08:40:12,566 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=99755.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:40:26,777 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=99776.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:40:59,383 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=99824.0, num_to_drop=1, layers_to_drop={2} +2022-11-16 08:41:08,867 INFO [train.py:876] (2/4) Epoch 14, batch 5300, loss[loss=0.09309, simple_loss=0.1236, pruned_loss=0.03129, over 5564.00 frames. ], tot_loss[loss=0.09832, simple_loss=0.131, pruned_loss=0.03283, over 1087274.13 frames. ], batch size: 16, lr: 5.70e-03, grad_scale: 8.0 +2022-11-16 08:41:14,413 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.304e+01 1.298e+02 1.618e+02 1.971e+02 5.007e+02, threshold=3.235e+02, percent-clipped=2.0 +2022-11-16 08:41:40,973 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=99885.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:42:07,328 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=99924.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:42:07,377 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=99924.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:42:13,192 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=99933.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:42:16,094 INFO [train.py:876] (2/4) Epoch 14, batch 5400, loss[loss=0.09999, simple_loss=0.15, pruned_loss=0.02499, over 5566.00 frames. ], tot_loss[loss=0.09747, simple_loss=0.1303, pruned_loss=0.03233, over 1087254.92 frames. ], batch size: 25, lr: 5.70e-03, grad_scale: 8.0 +2022-11-16 08:42:21,975 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.513e+01 1.478e+02 1.709e+02 2.137e+02 3.244e+02, threshold=3.418e+02, percent-clipped=1.0 +2022-11-16 08:42:22,091 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=99945.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:42:25,156 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.2919, 4.2767, 4.1818, 4.0786, 4.0829, 3.9435, 1.8535, 4.3903], + device='cuda:2'), covar=tensor([0.0208, 0.0330, 0.0252, 0.0299, 0.0298, 0.0431, 0.2802, 0.0289], + device='cuda:2'), in_proj_covar=tensor([0.0105, 0.0090, 0.0088, 0.0082, 0.0103, 0.0090, 0.0131, 0.0109], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 08:42:40,286 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=99972.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:42:45,541 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=99980.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:43:08,158 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=100007.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 08:43:17,577 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.9297, 4.1869, 4.0131, 3.6919, 2.1274, 4.1684, 2.4107, 3.7087], + device='cuda:2'), covar=tensor([0.0433, 0.0238, 0.0171, 0.0357, 0.0688, 0.0170, 0.0557, 0.0162], + device='cuda:2'), in_proj_covar=tensor([0.0197, 0.0187, 0.0184, 0.0211, 0.0201, 0.0187, 0.0196, 0.0190], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 08:43:21,945 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=100028.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:43:27,264 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=100036.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:43:27,831 INFO [train.py:876] (2/4) Epoch 14, batch 5500, loss[loss=0.1168, simple_loss=0.1436, pruned_loss=0.04501, over 5395.00 frames. ], tot_loss[loss=0.0984, simple_loss=0.1306, pruned_loss=0.03307, over 1081270.50 frames. ], batch size: 70, lr: 5.70e-03, grad_scale: 8.0 +2022-11-16 08:43:32,933 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.703e+01 1.432e+02 1.744e+02 2.287e+02 4.720e+02, threshold=3.489e+02, percent-clipped=3.0 +2022-11-16 08:43:39,607 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.0017, 2.5679, 3.5439, 3.2360, 3.8643, 2.4105, 3.3595, 3.9655], + device='cuda:2'), covar=tensor([0.0555, 0.1563, 0.0952, 0.1478, 0.0572, 0.1809, 0.1338, 0.0912], + device='cuda:2'), in_proj_covar=tensor([0.0244, 0.0193, 0.0216, 0.0212, 0.0242, 0.0197, 0.0224, 0.0231], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 08:43:48,240 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.94 vs. limit=2.0 +2022-11-16 08:43:49,310 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=100068.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 08:43:54,519 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=100076.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:44:11,061 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.9459, 2.3760, 2.2627, 1.5068, 2.4659, 2.6494, 2.4502, 2.6741], + device='cuda:2'), covar=tensor([0.1975, 0.1645, 0.2148, 0.2911, 0.0928, 0.1241, 0.0858, 0.1182], + device='cuda:2'), in_proj_covar=tensor([0.0161, 0.0178, 0.0169, 0.0181, 0.0186, 0.0205, 0.0172, 0.0182], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 08:44:16,868 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.5967, 3.6214, 3.6859, 3.5112, 3.5963, 3.6113, 1.4744, 3.8301], + device='cuda:2'), covar=tensor([0.0293, 0.0470, 0.0337, 0.0296, 0.0354, 0.0352, 0.3066, 0.0316], + device='cuda:2'), in_proj_covar=tensor([0.0104, 0.0090, 0.0087, 0.0081, 0.0102, 0.0090, 0.0129, 0.0108], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 08:44:27,260 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=100124.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:44:27,340 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=100124.0, num_to_drop=1, layers_to_drop={2} +2022-11-16 08:44:35,527 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.33 vs. limit=5.0 +2022-11-16 08:44:35,693 INFO [train.py:876] (2/4) Epoch 14, batch 5600, loss[loss=0.06441, simple_loss=0.103, pruned_loss=0.0129, over 5728.00 frames. ], tot_loss[loss=0.09796, simple_loss=0.1304, pruned_loss=0.03276, over 1089240.49 frames. ], batch size: 12, lr: 5.69e-03, grad_scale: 8.0 +2022-11-16 08:44:38,282 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.79 vs. limit=5.0 +2022-11-16 08:44:40,975 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.553e+01 1.416e+02 1.691e+02 1.981e+02 4.410e+02, threshold=3.382e+02, percent-clipped=1.0 +2022-11-16 08:44:59,700 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=100172.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 08:45:34,534 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=100224.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:45:43,186 INFO [train.py:876] (2/4) Epoch 14, batch 5700, loss[loss=0.08918, simple_loss=0.1176, pruned_loss=0.03037, over 5791.00 frames. ], tot_loss[loss=0.09733, simple_loss=0.1294, pruned_loss=0.03262, over 1085366.08 frames. ], batch size: 21, lr: 5.69e-03, grad_scale: 8.0 +2022-11-16 08:45:43,982 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=100238.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:45:48,358 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.517e+01 1.364e+02 1.751e+02 2.063e+02 4.628e+02, threshold=3.502e+02, percent-clipped=3.0 +2022-11-16 08:45:48,541 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=100245.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:46:06,336 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=100272.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:46:08,816 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.3727, 3.1694, 3.5624, 1.9710, 3.3716, 3.6683, 3.5940, 3.9964], + device='cuda:2'), covar=tensor([0.2028, 0.1783, 0.0691, 0.2912, 0.0805, 0.0776, 0.0534, 0.0672], + device='cuda:2'), in_proj_covar=tensor([0.0160, 0.0177, 0.0167, 0.0180, 0.0184, 0.0203, 0.0171, 0.0180], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 08:46:21,249 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=100293.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:46:25,380 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=100299.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:46:50,216 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=100336.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:46:50,740 INFO [train.py:876] (2/4) Epoch 14, batch 5800, loss[loss=0.07393, simple_loss=0.1094, pruned_loss=0.01925, over 5006.00 frames. ], tot_loss[loss=0.09679, simple_loss=0.1292, pruned_loss=0.03219, over 1085424.11 frames. ], batch size: 5, lr: 5.69e-03, grad_scale: 16.0 +2022-11-16 08:46:56,199 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.798e+01 1.394e+02 1.721e+02 2.262e+02 4.141e+02, threshold=3.442e+02, percent-clipped=1.0 +2022-11-16 08:47:07,934 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=100363.0, num_to_drop=1, layers_to_drop={2} +2022-11-16 08:47:19,901 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.2678, 1.4392, 1.2449, 0.9423, 1.2624, 1.6484, 1.6400, 1.5009], + device='cuda:2'), covar=tensor([0.1214, 0.0906, 0.1838, 0.2633, 0.1366, 0.1112, 0.1143, 0.1297], + device='cuda:2'), in_proj_covar=tensor([0.0160, 0.0178, 0.0168, 0.0180, 0.0183, 0.0203, 0.0171, 0.0181], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 08:47:21,653 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=100384.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:47:34,805 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=100402.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:47:57,689 INFO [train.py:876] (2/4) Epoch 14, batch 5900, loss[loss=0.09764, simple_loss=0.1368, pruned_loss=0.02922, over 5638.00 frames. ], tot_loss[loss=0.09744, simple_loss=0.1299, pruned_loss=0.0325, over 1090115.43 frames. ], batch size: 32, lr: 5.68e-03, grad_scale: 16.0 +2022-11-16 08:47:58,423 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.2694, 3.7753, 3.4266, 3.7993, 3.7977, 3.2177, 3.4022, 3.3454], + device='cuda:2'), covar=tensor([0.1215, 0.0504, 0.1399, 0.0436, 0.0446, 0.0480, 0.0646, 0.0585], + device='cuda:2'), in_proj_covar=tensor([0.0135, 0.0183, 0.0275, 0.0177, 0.0224, 0.0176, 0.0190, 0.0180], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0004, 0.0002, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 08:48:01,105 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.6482, 3.0320, 3.3083, 2.8956, 1.9589, 3.2173, 2.2138, 2.8407], + device='cuda:2'), covar=tensor([0.0321, 0.0238, 0.0170, 0.0357, 0.0555, 0.0248, 0.0518, 0.0195], + device='cuda:2'), in_proj_covar=tensor([0.0197, 0.0189, 0.0184, 0.0211, 0.0200, 0.0188, 0.0197, 0.0190], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 08:48:03,429 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.325e+01 1.325e+02 1.657e+02 2.050e+02 5.165e+02, threshold=3.313e+02, percent-clipped=3.0 +2022-11-16 08:48:16,286 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=100463.0, num_to_drop=1, layers_to_drop={2} +2022-11-16 08:48:24,103 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.6495, 2.8327, 3.4945, 4.0716, 4.4453, 3.6816, 3.4868, 4.3904], + device='cuda:2'), covar=tensor([0.0447, 0.2787, 0.1429, 0.4345, 0.0776, 0.2241, 0.1671, 0.0740], + device='cuda:2'), in_proj_covar=tensor([0.0258, 0.0198, 0.0184, 0.0296, 0.0224, 0.0197, 0.0186, 0.0247], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0006], + device='cuda:2') +2022-11-16 08:48:55,056 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.4173, 1.6573, 1.4361, 1.1252, 1.5277, 1.9120, 1.8187, 1.8108], + device='cuda:2'), covar=tensor([0.1592, 0.1202, 0.2128, 0.2644, 0.1540, 0.1270, 0.1019, 0.1383], + device='cuda:2'), in_proj_covar=tensor([0.0161, 0.0177, 0.0167, 0.0179, 0.0183, 0.0202, 0.0170, 0.0180], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 08:49:05,461 INFO [train.py:876] (2/4) Epoch 14, batch 6000, loss[loss=0.09188, simple_loss=0.1306, pruned_loss=0.02659, over 5568.00 frames. ], tot_loss[loss=0.09762, simple_loss=0.1298, pruned_loss=0.03272, over 1083292.79 frames. ], batch size: 18, lr: 5.68e-03, grad_scale: 16.0 +2022-11-16 08:49:05,462 INFO [train.py:899] (2/4) Computing validation loss +2022-11-16 08:49:19,224 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.5927, 4.0022, 4.4911, 4.1136, 4.1403, 4.3665, 4.7047, 4.6057], + device='cuda:2'), covar=tensor([0.0276, 0.1575, 0.0378, 0.1276, 0.0394, 0.0230, 0.0648, 0.0408], + device='cuda:2'), in_proj_covar=tensor([0.0091, 0.0111, 0.0098, 0.0125, 0.0091, 0.0083, 0.0149, 0.0107], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 08:49:23,826 INFO [train.py:908] (2/4) Epoch 14, validation: loss=0.1801, simple_loss=0.1888, pruned_loss=0.08568, over 1530663.00 frames. +2022-11-16 08:49:23,826 INFO [train.py:909] (2/4) Maximum memory allocated so far is 4715MB +2022-11-16 08:49:29,376 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.974e+01 1.386e+02 1.665e+02 1.958e+02 3.486e+02, threshold=3.330e+02, percent-clipped=1.0 +2022-11-16 08:49:36,171 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.46 vs. limit=5.0 +2022-11-16 08:49:43,925 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.30 vs. limit=2.0 +2022-11-16 08:50:01,491 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=100594.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:50:30,384 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.5145, 4.2504, 3.2710, 2.0659, 4.1228, 1.8296, 3.7612, 2.4131], + device='cuda:2'), covar=tensor([0.1285, 0.0151, 0.0903, 0.1941, 0.0188, 0.1632, 0.0299, 0.1272], + device='cuda:2'), in_proj_covar=tensor([0.0116, 0.0104, 0.0115, 0.0111, 0.0102, 0.0118, 0.0100, 0.0108], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0005, 0.0005, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 08:50:30,879 INFO [train.py:876] (2/4) Epoch 14, batch 6100, loss[loss=0.1156, simple_loss=0.1371, pruned_loss=0.04705, over 5578.00 frames. ], tot_loss[loss=0.09728, simple_loss=0.1294, pruned_loss=0.03261, over 1085899.64 frames. ], batch size: 24, lr: 5.68e-03, grad_scale: 16.0 +2022-11-16 08:50:36,073 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.272e+01 1.427e+02 1.651e+02 1.977e+02 4.323e+02, threshold=3.302e+02, percent-clipped=4.0 +2022-11-16 08:50:48,647 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=100663.0, num_to_drop=1, layers_to_drop={2} +2022-11-16 08:51:20,682 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=100711.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 08:51:34,126 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=100730.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 08:51:38,475 INFO [train.py:876] (2/4) Epoch 14, batch 6200, loss[loss=0.1177, simple_loss=0.148, pruned_loss=0.04367, over 5480.00 frames. ], tot_loss[loss=0.09773, simple_loss=0.1295, pruned_loss=0.033, over 1078983.53 frames. ], batch size: 58, lr: 5.68e-03, grad_scale: 16.0 +2022-11-16 08:51:43,693 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.597e+01 1.377e+02 1.629e+02 2.100e+02 5.485e+02, threshold=3.258e+02, percent-clipped=3.0 +2022-11-16 08:51:49,706 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.0816, 2.5356, 2.9111, 3.8820, 3.8974, 2.8813, 2.7921, 3.9079], + device='cuda:2'), covar=tensor([0.0594, 0.2594, 0.2293, 0.2941, 0.1236, 0.3702, 0.1985, 0.0719], + device='cuda:2'), in_proj_covar=tensor([0.0260, 0.0197, 0.0186, 0.0297, 0.0225, 0.0200, 0.0188, 0.0252], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0006], + device='cuda:2') +2022-11-16 08:51:52,264 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=100758.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 08:51:59,723 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.9810, 4.4016, 4.8394, 4.5054, 5.0449, 4.8770, 4.3654, 5.0084], + device='cuda:2'), covar=tensor([0.0357, 0.0424, 0.0400, 0.0317, 0.0341, 0.0236, 0.0322, 0.0287], + device='cuda:2'), in_proj_covar=tensor([0.0151, 0.0160, 0.0113, 0.0149, 0.0191, 0.0117, 0.0132, 0.0161], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0004, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-16 08:52:15,303 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=100791.0, num_to_drop=1, layers_to_drop={3} +2022-11-16 08:52:46,741 INFO [train.py:876] (2/4) Epoch 14, batch 6300, loss[loss=0.06968, simple_loss=0.105, pruned_loss=0.01719, over 5478.00 frames. ], tot_loss[loss=0.0971, simple_loss=0.129, pruned_loss=0.0326, over 1077127.27 frames. ], batch size: 10, lr: 5.67e-03, grad_scale: 16.0 +2022-11-16 08:52:51,883 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.472e+01 1.427e+02 1.681e+02 2.021e+02 4.003e+02, threshold=3.363e+02, percent-clipped=4.0 +2022-11-16 08:52:56,175 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.60 vs. limit=2.0 +2022-11-16 08:53:24,988 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=100894.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:53:53,680 INFO [train.py:876] (2/4) Epoch 14, batch 6400, loss[loss=0.1134, simple_loss=0.1483, pruned_loss=0.03928, over 5678.00 frames. ], tot_loss[loss=0.0975, simple_loss=0.1299, pruned_loss=0.03253, over 1078326.54 frames. ], batch size: 19, lr: 5.67e-03, grad_scale: 16.0 +2022-11-16 08:53:57,518 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=100942.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:53:59,455 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.356e+01 1.445e+02 1.710e+02 2.169e+02 3.491e+02, threshold=3.419e+02, percent-clipped=2.0 +2022-11-16 08:55:01,411 INFO [train.py:876] (2/4) Epoch 14, batch 6500, loss[loss=0.1163, simple_loss=0.1368, pruned_loss=0.04789, over 5014.00 frames. ], tot_loss[loss=0.09656, simple_loss=0.1296, pruned_loss=0.03175, over 1086313.69 frames. ], batch size: 109, lr: 5.67e-03, grad_scale: 16.0 +2022-11-16 08:55:06,943 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.989e+01 1.437e+02 1.725e+02 2.064e+02 3.698e+02, threshold=3.449e+02, percent-clipped=2.0 +2022-11-16 08:55:16,247 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=101058.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:55:34,982 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=101086.0, num_to_drop=1, layers_to_drop={3} +2022-11-16 08:55:36,578 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.46 vs. limit=2.0 +2022-11-16 08:55:48,583 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=101106.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:55:49,326 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=101107.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:56:00,398 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.8209, 1.8033, 2.0162, 1.7445, 1.7079, 1.4474, 1.8675, 1.7196], + device='cuda:2'), covar=tensor([0.0085, 0.0094, 0.0054, 0.0070, 0.0134, 0.0192, 0.0060, 0.0081], + device='cuda:2'), in_proj_covar=tensor([0.0034, 0.0030, 0.0031, 0.0040, 0.0035, 0.0031, 0.0039, 0.0038], + device='cuda:2'), out_proj_covar=tensor([3.1398e-05, 2.8365e-05, 2.7688e-05, 3.7781e-05, 3.2625e-05, 2.9669e-05, + 3.7658e-05, 3.6592e-05], device='cuda:2') +2022-11-16 08:56:09,261 INFO [train.py:876] (2/4) Epoch 14, batch 6600, loss[loss=0.07514, simple_loss=0.1096, pruned_loss=0.02032, over 5715.00 frames. ], tot_loss[loss=0.09768, simple_loss=0.1302, pruned_loss=0.03256, over 1085036.28 frames. ], batch size: 15, lr: 5.66e-03, grad_scale: 16.0 +2022-11-16 08:56:14,436 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 6.470e+01 1.360e+02 1.590e+02 2.159e+02 4.243e+02, threshold=3.180e+02, percent-clipped=1.0 +2022-11-16 08:56:31,087 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=101168.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 08:56:37,368 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.67 vs. limit=2.0 +2022-11-16 08:57:01,154 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1285, 1.7891, 2.0244, 1.6133, 1.8657, 1.9716, 2.0848, 1.9531], + device='cuda:2'), covar=tensor([0.0065, 0.0083, 0.0052, 0.0067, 0.0065, 0.0053, 0.0056, 0.0059], + device='cuda:2'), in_proj_covar=tensor([0.0069, 0.0063, 0.0062, 0.0068, 0.0066, 0.0062, 0.0060, 0.0057], + device='cuda:2'), out_proj_covar=tensor([6.1203e-05, 5.5508e-05, 5.4125e-05, 5.9639e-05, 5.8192e-05, 5.3360e-05, + 5.2701e-05, 4.9374e-05], device='cuda:2') +2022-11-16 08:57:05,359 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.94 vs. limit=2.0 +2022-11-16 08:57:11,057 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.95 vs. limit=5.0 +2022-11-16 08:57:17,780 INFO [train.py:876] (2/4) Epoch 14, batch 6700, loss[loss=0.07881, simple_loss=0.1093, pruned_loss=0.02414, over 5517.00 frames. ], tot_loss[loss=0.09639, simple_loss=0.1291, pruned_loss=0.03183, over 1086006.19 frames. ], batch size: 13, lr: 5.66e-03, grad_scale: 16.0 +2022-11-16 08:57:22,870 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.036e+01 1.360e+02 1.742e+02 2.134e+02 3.328e+02, threshold=3.484e+02, percent-clipped=2.0 +2022-11-16 08:57:25,610 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.1750, 3.0119, 2.9501, 3.0876, 2.9012, 2.6983, 3.4258, 3.1297], + device='cuda:2'), covar=tensor([0.0569, 0.1013, 0.0621, 0.1289, 0.0744, 0.0534, 0.0831, 0.0782], + device='cuda:2'), in_proj_covar=tensor([0.0092, 0.0114, 0.0100, 0.0127, 0.0093, 0.0084, 0.0150, 0.0109], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 08:57:51,863 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.1071, 4.3198, 4.0963, 3.7064, 2.2115, 4.4570, 2.6013, 3.9458], + device='cuda:2'), covar=tensor([0.0381, 0.0201, 0.0155, 0.0353, 0.0707, 0.0149, 0.0521, 0.0145], + device='cuda:2'), in_proj_covar=tensor([0.0198, 0.0189, 0.0185, 0.0213, 0.0200, 0.0189, 0.0197, 0.0190], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 08:58:25,805 INFO [train.py:876] (2/4) Epoch 14, batch 6800, loss[loss=0.07213, simple_loss=0.1139, pruned_loss=0.01518, over 5740.00 frames. ], tot_loss[loss=0.09635, simple_loss=0.1286, pruned_loss=0.03204, over 1084666.54 frames. ], batch size: 14, lr: 5.66e-03, grad_scale: 16.0 +2022-11-16 08:58:30,945 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.618e+01 1.430e+02 1.640e+02 2.057e+02 3.965e+02, threshold=3.281e+02, percent-clipped=2.0 +2022-11-16 08:58:32,809 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.29 vs. limit=2.0 +2022-11-16 08:58:58,317 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=101386.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 08:59:00,275 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.19 vs. limit=2.0 +2022-11-16 08:59:30,814 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=101434.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 08:59:32,735 INFO [train.py:876] (2/4) Epoch 14, batch 6900, loss[loss=0.0734, simple_loss=0.1182, pruned_loss=0.0143, over 5532.00 frames. ], tot_loss[loss=0.09784, simple_loss=0.1301, pruned_loss=0.03278, over 1083626.00 frames. ], batch size: 14, lr: 5.66e-03, grad_scale: 16.0 +2022-11-16 08:59:39,123 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.093e+01 1.364e+02 1.752e+02 2.207e+02 5.252e+02, threshold=3.504e+02, percent-clipped=3.0 +2022-11-16 08:59:50,954 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=101463.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:00:40,658 INFO [train.py:876] (2/4) Epoch 14, batch 7000, loss[loss=0.04607, simple_loss=0.08241, pruned_loss=0.004859, over 5092.00 frames. ], tot_loss[loss=0.09593, simple_loss=0.129, pruned_loss=0.03145, over 1083122.09 frames. ], batch size: 7, lr: 5.65e-03, grad_scale: 16.0 +2022-11-16 09:00:47,041 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.705e+01 1.319e+02 1.556e+02 2.148e+02 4.018e+02, threshold=3.112e+02, percent-clipped=2.0 +2022-11-16 09:01:13,466 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.2846, 4.4137, 3.4105, 1.8840, 4.0649, 1.5676, 3.8669, 2.3433], + device='cuda:2'), covar=tensor([0.1635, 0.0160, 0.0593, 0.1961, 0.0268, 0.2105, 0.0360, 0.1554], + device='cuda:2'), in_proj_covar=tensor([0.0116, 0.0104, 0.0114, 0.0109, 0.0102, 0.0119, 0.0100, 0.0108], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 09:01:48,513 INFO [train.py:876] (2/4) Epoch 14, batch 7100, loss[loss=0.09697, simple_loss=0.1267, pruned_loss=0.03363, over 5078.00 frames. ], tot_loss[loss=0.09717, simple_loss=0.13, pruned_loss=0.03217, over 1077506.21 frames. ], batch size: 91, lr: 5.65e-03, grad_scale: 8.0 +2022-11-16 09:01:49,371 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.2460, 2.6912, 3.7844, 3.5829, 4.1011, 2.7658, 3.7204, 4.1689], + device='cuda:2'), covar=tensor([0.0859, 0.1584, 0.0827, 0.1291, 0.0744, 0.1644, 0.1185, 0.0742], + device='cuda:2'), in_proj_covar=tensor([0.0248, 0.0197, 0.0218, 0.0213, 0.0245, 0.0201, 0.0227, 0.0233], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 09:01:54,699 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.279e+01 1.412e+02 1.777e+02 2.341e+02 5.678e+02, threshold=3.553e+02, percent-clipped=7.0 +2022-11-16 09:02:17,348 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.16 vs. limit=2.0 +2022-11-16 09:02:21,014 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=101684.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:02:39,080 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.45 vs. limit=5.0 +2022-11-16 09:02:50,101 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1594, 3.4971, 2.6157, 1.7518, 3.3005, 1.4443, 3.1867, 1.7128], + device='cuda:2'), covar=tensor([0.1703, 0.0308, 0.1069, 0.2088, 0.0358, 0.2227, 0.0399, 0.1882], + device='cuda:2'), in_proj_covar=tensor([0.0115, 0.0104, 0.0113, 0.0109, 0.0101, 0.0119, 0.0100, 0.0108], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 09:02:57,249 INFO [train.py:876] (2/4) Epoch 14, batch 7200, loss[loss=0.061, simple_loss=0.0981, pruned_loss=0.01195, over 5380.00 frames. ], tot_loss[loss=0.0965, simple_loss=0.1292, pruned_loss=0.03189, over 1082087.07 frames. ], batch size: 9, lr: 5.65e-03, grad_scale: 8.0 +2022-11-16 09:03:03,318 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=101745.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:03:03,779 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.181e+01 1.351e+02 1.693e+02 2.156e+02 4.493e+02, threshold=3.386e+02, percent-clipped=3.0 +2022-11-16 09:03:15,172 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=101763.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:04:28,278 INFO [train.py:876] (2/4) Epoch 15, batch 0, loss[loss=0.07584, simple_loss=0.1114, pruned_loss=0.02013, over 5724.00 frames. ], tot_loss[loss=0.07584, simple_loss=0.1114, pruned_loss=0.02013, over 5724.00 frames. ], batch size: 11, lr: 5.45e-03, grad_scale: 8.0 +2022-11-16 09:04:28,278 INFO [train.py:899] (2/4) Computing validation loss +2022-11-16 09:04:44,422 INFO [train.py:908] (2/4) Epoch 15, validation: loss=0.1798, simple_loss=0.1892, pruned_loss=0.08518, over 1530663.00 frames. +2022-11-16 09:04:44,422 INFO [train.py:909] (2/4) Maximum memory allocated so far is 4715MB +2022-11-16 09:04:45,742 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=101811.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:05:09,641 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.429e+01 1.523e+02 1.865e+02 2.134e+02 5.248e+02, threshold=3.731e+02, percent-clipped=3.0 +2022-11-16 09:05:28,442 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.2886, 2.2083, 2.7948, 1.9413, 1.4649, 3.1686, 2.6071, 2.2986], + device='cuda:2'), covar=tensor([0.0977, 0.1129, 0.0724, 0.2181, 0.2376, 0.0544, 0.0862, 0.1175], + device='cuda:2'), in_proj_covar=tensor([0.0114, 0.0104, 0.0105, 0.0105, 0.0079, 0.0073, 0.0087, 0.0097], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-16 09:05:39,553 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.13 vs. limit=2.0 +2022-11-16 09:05:45,725 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.5482, 3.3630, 3.6113, 1.9219, 3.3926, 3.6977, 3.5941, 4.1373], + device='cuda:2'), covar=tensor([0.2135, 0.1363, 0.0628, 0.2708, 0.0584, 0.0629, 0.0735, 0.0571], + device='cuda:2'), in_proj_covar=tensor([0.0163, 0.0178, 0.0167, 0.0181, 0.0184, 0.0204, 0.0173, 0.0183], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 09:05:51,985 INFO [train.py:876] (2/4) Epoch 15, batch 100, loss[loss=0.08112, simple_loss=0.1165, pruned_loss=0.02286, over 5604.00 frames. ], tot_loss[loss=0.09521, simple_loss=0.1291, pruned_loss=0.03066, over 437474.84 frames. ], batch size: 23, lr: 5.45e-03, grad_scale: 8.0 +2022-11-16 09:05:59,285 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=101920.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 09:06:16,937 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.051e+01 1.432e+02 1.657e+02 2.152e+02 4.167e+02, threshold=3.314e+02, percent-clipped=2.0 +2022-11-16 09:06:40,784 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=101981.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 09:06:44,617 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=101987.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:06:46,965 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.0219, 3.2370, 3.8350, 4.6631, 4.7506, 3.7678, 3.3468, 4.6869], + device='cuda:2'), covar=tensor([0.0342, 0.2667, 0.1720, 0.1940, 0.0834, 0.2401, 0.1854, 0.0685], + device='cuda:2'), in_proj_covar=tensor([0.0257, 0.0192, 0.0184, 0.0290, 0.0224, 0.0195, 0.0184, 0.0247], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0006], + device='cuda:2') +2022-11-16 09:06:52,424 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1775, 3.8093, 2.6985, 3.5968, 3.0938, 2.7439, 2.2033, 3.3002], + device='cuda:2'), covar=tensor([0.1546, 0.0354, 0.1097, 0.0420, 0.0914, 0.1158, 0.1850, 0.0458], + device='cuda:2'), in_proj_covar=tensor([0.0153, 0.0140, 0.0151, 0.0144, 0.0169, 0.0163, 0.0155, 0.0155], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 09:07:00,223 INFO [train.py:876] (2/4) Epoch 15, batch 200, loss[loss=0.1365, simple_loss=0.1534, pruned_loss=0.05983, over 4719.00 frames. ], tot_loss[loss=0.1006, simple_loss=0.1323, pruned_loss=0.03448, over 686306.99 frames. ], batch size: 135, lr: 5.45e-03, grad_scale: 8.0 +2022-11-16 09:07:04,232 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.7659, 4.1562, 3.7102, 4.1571, 4.0511, 3.5640, 3.7288, 3.6260], + device='cuda:2'), covar=tensor([0.0586, 0.0442, 0.1343, 0.0412, 0.0524, 0.0484, 0.0626, 0.0566], + device='cuda:2'), in_proj_covar=tensor([0.0137, 0.0185, 0.0279, 0.0178, 0.0227, 0.0178, 0.0192, 0.0181], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0004, 0.0002, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 09:07:10,465 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.81 vs. limit=2.0 +2022-11-16 09:07:20,903 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=102040.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:07:24,774 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.824e+01 1.391e+02 1.728e+02 2.198e+02 6.457e+02, threshold=3.456e+02, percent-clipped=4.0 +2022-11-16 09:07:26,254 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=102048.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:08:07,559 INFO [train.py:876] (2/4) Epoch 15, batch 300, loss[loss=0.1133, simple_loss=0.1379, pruned_loss=0.04433, over 5813.00 frames. ], tot_loss[loss=0.09949, simple_loss=0.1317, pruned_loss=0.03366, over 846700.29 frames. ], batch size: 21, lr: 5.45e-03, grad_scale: 8.0 +2022-11-16 09:08:09,352 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.2636, 2.7175, 3.0017, 2.6545, 1.6679, 2.8454, 2.0214, 2.3430], + device='cuda:2'), covar=tensor([0.0380, 0.0245, 0.0165, 0.0336, 0.0578, 0.0215, 0.0494, 0.0235], + device='cuda:2'), in_proj_covar=tensor([0.0196, 0.0187, 0.0184, 0.0211, 0.0199, 0.0187, 0.0196, 0.0189], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 09:08:19,826 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.25 vs. limit=2.0 +2022-11-16 09:08:32,102 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.719e+01 1.477e+02 1.712e+02 2.125e+02 3.835e+02, threshold=3.423e+02, percent-clipped=2.0 +2022-11-16 09:09:15,302 INFO [train.py:876] (2/4) Epoch 15, batch 400, loss[loss=0.08967, simple_loss=0.1271, pruned_loss=0.0261, over 5794.00 frames. ], tot_loss[loss=0.09877, simple_loss=0.1312, pruned_loss=0.03315, over 933255.20 frames. ], batch size: 22, lr: 5.44e-03, grad_scale: 8.0 +2022-11-16 09:09:40,371 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.002e+02 1.441e+02 1.640e+02 2.170e+02 4.348e+02, threshold=3.279e+02, percent-clipped=4.0 +2022-11-16 09:09:44,069 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.44 vs. limit=2.0 +2022-11-16 09:10:00,705 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=102276.0, num_to_drop=1, layers_to_drop={2} +2022-11-16 09:10:22,874 INFO [train.py:876] (2/4) Epoch 15, batch 500, loss[loss=0.1081, simple_loss=0.1494, pruned_loss=0.03346, over 5649.00 frames. ], tot_loss[loss=0.09683, simple_loss=0.1299, pruned_loss=0.03187, over 993094.04 frames. ], batch size: 38, lr: 5.44e-03, grad_scale: 8.0 +2022-11-16 09:10:44,313 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=102340.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:10:46,607 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=102343.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:10:48,482 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.403e+01 1.330e+02 1.651e+02 2.070e+02 4.075e+02, threshold=3.302e+02, percent-clipped=1.0 +2022-11-16 09:10:55,780 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.4945, 2.3707, 2.8982, 2.0081, 1.5843, 3.3441, 2.6896, 2.4004], + device='cuda:2'), covar=tensor([0.1143, 0.1661, 0.0796, 0.2663, 0.3113, 0.0489, 0.1202, 0.1574], + device='cuda:2'), in_proj_covar=tensor([0.0115, 0.0105, 0.0106, 0.0105, 0.0079, 0.0074, 0.0087, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-16 09:10:58,345 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.4308, 4.8183, 4.4022, 4.9315, 4.8550, 4.2813, 4.5806, 4.3409], + device='cuda:2'), covar=tensor([0.0332, 0.0642, 0.1416, 0.0440, 0.0463, 0.0624, 0.0762, 0.0836], + device='cuda:2'), in_proj_covar=tensor([0.0137, 0.0185, 0.0281, 0.0179, 0.0228, 0.0178, 0.0193, 0.0181], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0004, 0.0002, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 09:11:06,889 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.59 vs. limit=2.0 +2022-11-16 09:11:07,041 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.91 vs. limit=2.0 +2022-11-16 09:11:11,257 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.1574, 3.2771, 2.9886, 3.3166, 3.2929, 2.9410, 2.9280, 3.1145], + device='cuda:2'), covar=tensor([0.0961, 0.0620, 0.1303, 0.0465, 0.0544, 0.0526, 0.0866, 0.0598], + device='cuda:2'), in_proj_covar=tensor([0.0138, 0.0186, 0.0281, 0.0180, 0.0228, 0.0179, 0.0194, 0.0182], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0004, 0.0002, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 09:11:17,216 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=102388.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:11:17,259 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.9393, 4.3656, 4.0581, 4.4178, 4.3857, 3.9042, 4.0847, 3.9728], + device='cuda:2'), covar=tensor([0.0545, 0.0459, 0.1166, 0.0419, 0.0432, 0.0474, 0.0518, 0.0467], + device='cuda:2'), in_proj_covar=tensor([0.0138, 0.0186, 0.0281, 0.0180, 0.0228, 0.0179, 0.0194, 0.0182], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0004, 0.0002, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 09:11:31,322 INFO [train.py:876] (2/4) Epoch 15, batch 600, loss[loss=0.08961, simple_loss=0.1293, pruned_loss=0.02495, over 5523.00 frames. ], tot_loss[loss=0.09869, simple_loss=0.1321, pruned_loss=0.03264, over 1028564.60 frames. ], batch size: 17, lr: 5.44e-03, grad_scale: 8.0 +2022-11-16 09:11:56,870 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.008e+02 1.396e+02 1.798e+02 2.195e+02 6.197e+02, threshold=3.596e+02, percent-clipped=7.0 +2022-11-16 09:11:56,991 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.0677, 5.5890, 5.1080, 5.5066, 5.6275, 4.8666, 5.2126, 4.9541], + device='cuda:2'), covar=tensor([0.0193, 0.0469, 0.1159, 0.0482, 0.0325, 0.0480, 0.0395, 0.0833], + device='cuda:2'), in_proj_covar=tensor([0.0136, 0.0185, 0.0278, 0.0178, 0.0226, 0.0177, 0.0192, 0.0180], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0004, 0.0002, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 09:11:58,375 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.4998, 0.8160, 0.7613, 0.6725, 0.7019, 0.6977, 0.6609, 0.7791], + device='cuda:2'), covar=tensor([0.0073, 0.0036, 0.0053, 0.0044, 0.0044, 0.0055, 0.0070, 0.0038], + device='cuda:2'), in_proj_covar=tensor([0.0070, 0.0064, 0.0064, 0.0069, 0.0067, 0.0062, 0.0060, 0.0058], + device='cuda:2'), out_proj_covar=tensor([6.2407e-05, 5.6248e-05, 5.5792e-05, 6.0557e-05, 5.8952e-05, 5.4160e-05, + 5.3309e-05, 5.0797e-05], device='cuda:2') +2022-11-16 09:12:10,628 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.9057, 5.3270, 5.6056, 5.2127, 5.9385, 5.7619, 4.9174, 5.8821], + device='cuda:2'), covar=tensor([0.0262, 0.0338, 0.0465, 0.0346, 0.0260, 0.0193, 0.0230, 0.0223], + device='cuda:2'), in_proj_covar=tensor([0.0152, 0.0162, 0.0114, 0.0150, 0.0192, 0.0119, 0.0133, 0.0162], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0004, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-16 09:12:39,189 INFO [train.py:876] (2/4) Epoch 15, batch 700, loss[loss=0.1482, simple_loss=0.1726, pruned_loss=0.06184, over 5463.00 frames. ], tot_loss[loss=0.09823, simple_loss=0.1315, pruned_loss=0.03248, over 1047051.75 frames. ], batch size: 53, lr: 5.44e-03, grad_scale: 8.0 +2022-11-16 09:13:04,053 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.068e+01 1.444e+02 1.826e+02 2.249e+02 4.884e+02, threshold=3.652e+02, percent-clipped=1.0 +2022-11-16 09:13:06,099 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.3764, 4.1953, 4.2667, 4.2646, 3.8581, 3.8034, 4.7494, 4.1700], + device='cuda:2'), covar=tensor([0.0410, 0.0830, 0.0480, 0.1467, 0.0491, 0.0403, 0.0651, 0.0726], + device='cuda:2'), in_proj_covar=tensor([0.0091, 0.0112, 0.0098, 0.0126, 0.0092, 0.0083, 0.0149, 0.0108], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 09:13:08,029 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.7872, 4.7879, 4.7423, 4.6839, 4.7386, 4.3743, 2.2582, 4.8352], + device='cuda:2'), covar=tensor([0.0152, 0.0250, 0.0184, 0.0146, 0.0216, 0.0418, 0.2384, 0.0220], + device='cuda:2'), in_proj_covar=tensor([0.0105, 0.0090, 0.0090, 0.0083, 0.0103, 0.0092, 0.0131, 0.0110], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 09:13:24,171 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=102576.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 09:13:35,228 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.23 vs. limit=5.0 +2022-11-16 09:13:37,896 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.34 vs. limit=2.0 +2022-11-16 09:13:45,255 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.54 vs. limit=5.0 +2022-11-16 09:13:46,905 INFO [train.py:876] (2/4) Epoch 15, batch 800, loss[loss=0.1119, simple_loss=0.1312, pruned_loss=0.04626, over 4769.00 frames. ], tot_loss[loss=0.09818, simple_loss=0.1314, pruned_loss=0.03248, over 1056250.64 frames. ], batch size: 136, lr: 5.43e-03, grad_scale: 8.0 +2022-11-16 09:13:57,851 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=102624.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 09:14:03,655 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.59 vs. limit=2.0 +2022-11-16 09:14:11,161 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=102643.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:14:13,462 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.304e+01 1.433e+02 1.767e+02 2.256e+02 4.472e+02, threshold=3.533e+02, percent-clipped=3.0 +2022-11-16 09:14:45,027 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=102691.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:14:46,004 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.60 vs. limit=2.0 +2022-11-16 09:14:56,845 INFO [train.py:876] (2/4) Epoch 15, batch 900, loss[loss=0.1509, simple_loss=0.1584, pruned_loss=0.07173, over 4191.00 frames. ], tot_loss[loss=0.09839, simple_loss=0.131, pruned_loss=0.03289, over 1064942.22 frames. ], batch size: 181, lr: 5.43e-03, grad_scale: 8.0 +2022-11-16 09:15:21,995 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.688e+01 1.409e+02 1.812e+02 2.361e+02 4.444e+02, threshold=3.625e+02, percent-clipped=1.0 +2022-11-16 09:15:34,951 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([5.4083, 4.8654, 5.1596, 4.7585, 5.4152, 5.1824, 4.7240, 5.3764], + device='cuda:2'), covar=tensor([0.0352, 0.0391, 0.0466, 0.0342, 0.0385, 0.0269, 0.0303, 0.0279], + device='cuda:2'), in_proj_covar=tensor([0.0154, 0.0162, 0.0114, 0.0150, 0.0194, 0.0120, 0.0133, 0.0163], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0004, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-16 09:15:46,335 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.75 vs. limit=2.0 +2022-11-16 09:15:46,664 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.6693, 2.0225, 2.5612, 3.6170, 3.3992, 2.5431, 2.2442, 3.5451], + device='cuda:2'), covar=tensor([0.0813, 0.2849, 0.2191, 0.1984, 0.1500, 0.3088, 0.2513, 0.0854], + device='cuda:2'), in_proj_covar=tensor([0.0257, 0.0194, 0.0185, 0.0292, 0.0226, 0.0198, 0.0186, 0.0250], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0006], + device='cuda:2') +2022-11-16 09:15:52,808 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.60 vs. limit=5.0 +2022-11-16 09:16:04,876 INFO [train.py:876] (2/4) Epoch 15, batch 1000, loss[loss=0.09214, simple_loss=0.1167, pruned_loss=0.03376, over 5735.00 frames. ], tot_loss[loss=0.0961, simple_loss=0.1295, pruned_loss=0.03138, over 1077207.01 frames. ], batch size: 13, lr: 5.43e-03, grad_scale: 8.0 +2022-11-16 09:16:29,880 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.467e+01 1.525e+02 1.767e+02 2.154e+02 7.760e+02, threshold=3.535e+02, percent-clipped=4.0 +2022-11-16 09:16:33,627 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=102851.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:16:57,595 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=102887.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:16:59,021 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.6369, 3.6752, 3.5891, 3.3543, 1.9307, 3.6339, 2.2774, 3.1564], + device='cuda:2'), covar=tensor([0.0504, 0.0308, 0.0202, 0.0402, 0.0704, 0.0219, 0.0581, 0.0216], + device='cuda:2'), in_proj_covar=tensor([0.0197, 0.0188, 0.0186, 0.0210, 0.0199, 0.0188, 0.0196, 0.0189], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 09:17:12,903 INFO [train.py:876] (2/4) Epoch 15, batch 1100, loss[loss=0.08931, simple_loss=0.1251, pruned_loss=0.02674, over 5785.00 frames. ], tot_loss[loss=0.09725, simple_loss=0.1301, pruned_loss=0.03222, over 1085345.62 frames. ], batch size: 16, lr: 5.42e-03, grad_scale: 8.0 +2022-11-16 09:17:14,975 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=102912.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:17:25,649 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.6480, 2.7346, 2.4145, 2.6966, 2.7275, 2.5105, 2.3627, 2.5848], + device='cuda:2'), covar=tensor([0.0403, 0.0683, 0.1452, 0.0600, 0.0614, 0.0541, 0.1274, 0.0806], + device='cuda:2'), in_proj_covar=tensor([0.0134, 0.0184, 0.0276, 0.0176, 0.0221, 0.0176, 0.0189, 0.0179], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0004, 0.0002, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 09:17:37,837 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 6.870e+01 1.387e+02 1.709e+02 2.035e+02 6.034e+02, threshold=3.418e+02, percent-clipped=1.0 +2022-11-16 09:17:38,740 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=102948.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:17:50,944 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0297, 2.0354, 2.1159, 1.6670, 1.9525, 2.1896, 2.0197, 2.4284], + device='cuda:2'), covar=tensor([0.0086, 0.0090, 0.0057, 0.0071, 0.0077, 0.0049, 0.0051, 0.0108], + device='cuda:2'), in_proj_covar=tensor([0.0070, 0.0064, 0.0064, 0.0069, 0.0067, 0.0063, 0.0061, 0.0059], + device='cuda:2'), out_proj_covar=tensor([6.2521e-05, 5.6017e-05, 5.5535e-05, 6.0784e-05, 5.9319e-05, 5.4504e-05, + 5.3789e-05, 5.1123e-05], device='cuda:2') +2022-11-16 09:18:04,784 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.3674, 2.8301, 3.2507, 4.0970, 4.2342, 3.0469, 2.7467, 4.1954], + device='cuda:2'), covar=tensor([0.0621, 0.2882, 0.1818, 0.2206, 0.0987, 0.3147, 0.2233, 0.0558], + device='cuda:2'), in_proj_covar=tensor([0.0262, 0.0195, 0.0188, 0.0298, 0.0229, 0.0201, 0.0190, 0.0253], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0006], + device='cuda:2') +2022-11-16 09:18:19,907 INFO [train.py:876] (2/4) Epoch 15, batch 1200, loss[loss=0.09524, simple_loss=0.1345, pruned_loss=0.02799, over 5759.00 frames. ], tot_loss[loss=0.09561, simple_loss=0.1287, pruned_loss=0.03127, over 1089145.10 frames. ], batch size: 21, lr: 5.42e-03, grad_scale: 8.0 +2022-11-16 09:18:36,623 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.81 vs. limit=5.0 +2022-11-16 09:18:45,658 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.196e+01 1.283e+02 1.545e+02 2.048e+02 3.817e+02, threshold=3.089e+02, percent-clipped=2.0 +2022-11-16 09:18:56,430 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0193, 1.6707, 2.0191, 1.7544, 1.8408, 2.3372, 1.8902, 1.8157], + device='cuda:2'), covar=tensor([0.0038, 0.0095, 0.0040, 0.0076, 0.0096, 0.0039, 0.0047, 0.0059], + device='cuda:2'), in_proj_covar=tensor([0.0035, 0.0031, 0.0032, 0.0040, 0.0035, 0.0032, 0.0040, 0.0039], + device='cuda:2'), out_proj_covar=tensor([3.2314e-05, 2.8693e-05, 2.8433e-05, 3.8080e-05, 3.2739e-05, 3.0254e-05, + 3.7968e-05, 3.7087e-05], device='cuda:2') +2022-11-16 09:19:03,706 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.5438, 2.8291, 2.5215, 2.8636, 2.4963, 2.2748, 2.5527, 3.0320], + device='cuda:2'), covar=tensor([0.1422, 0.1215, 0.1639, 0.1279, 0.1267, 0.1533, 0.1324, 0.1826], + device='cuda:2'), in_proj_covar=tensor([0.0118, 0.0111, 0.0109, 0.0112, 0.0096, 0.0107, 0.0100, 0.0088], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 09:19:06,450 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.81 vs. limit=2.0 +2022-11-16 09:19:20,304 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=103098.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:19:27,358 INFO [train.py:876] (2/4) Epoch 15, batch 1300, loss[loss=0.09973, simple_loss=0.1221, pruned_loss=0.03866, over 4075.00 frames. ], tot_loss[loss=0.09523, simple_loss=0.1282, pruned_loss=0.03112, over 1080343.60 frames. ], batch size: 181, lr: 5.42e-03, grad_scale: 8.0 +2022-11-16 09:19:30,078 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.26 vs. limit=2.0 +2022-11-16 09:19:31,936 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.95 vs. limit=2.0 +2022-11-16 09:19:41,979 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.9844, 2.0187, 2.5934, 1.7928, 1.4769, 2.7129, 2.3668, 2.1104], + device='cuda:2'), covar=tensor([0.1217, 0.1736, 0.1000, 0.2634, 0.2890, 0.1592, 0.0950, 0.1600], + device='cuda:2'), in_proj_covar=tensor([0.0114, 0.0106, 0.0106, 0.0105, 0.0079, 0.0074, 0.0087, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-16 09:19:43,280 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=103132.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:19:53,331 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.362e+01 1.342e+02 1.572e+02 1.992e+02 4.136e+02, threshold=3.143e+02, percent-clipped=6.0 +2022-11-16 09:20:01,534 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=103159.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:20:09,049 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.18 vs. limit=2.0 +2022-11-16 09:20:17,588 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.1340, 4.1478, 4.0616, 4.0438, 4.1943, 4.0038, 1.7565, 4.1858], + device='cuda:2'), covar=tensor([0.0297, 0.0337, 0.0366, 0.0317, 0.0298, 0.0349, 0.3217, 0.0342], + device='cuda:2'), in_proj_covar=tensor([0.0105, 0.0090, 0.0090, 0.0082, 0.0102, 0.0091, 0.0131, 0.0110], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 09:20:24,593 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.92 vs. limit=2.0 +2022-11-16 09:20:24,864 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=103193.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:20:34,608 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=103207.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:20:35,873 INFO [train.py:876] (2/4) Epoch 15, batch 1400, loss[loss=0.07803, simple_loss=0.1212, pruned_loss=0.01741, over 5750.00 frames. ], tot_loss[loss=0.09531, simple_loss=0.1284, pruned_loss=0.03109, over 1085248.81 frames. ], batch size: 27, lr: 5.42e-03, grad_scale: 8.0 +2022-11-16 09:20:37,265 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.7063, 1.2673, 1.3006, 1.0684, 1.5778, 1.8248, 1.0350, 1.4885], + device='cuda:2'), covar=tensor([0.0378, 0.0496, 0.0484, 0.0680, 0.0829, 0.0191, 0.0904, 0.0311], + device='cuda:2'), in_proj_covar=tensor([0.0017, 0.0027, 0.0019, 0.0023, 0.0019, 0.0018, 0.0026, 0.0019], + device='cuda:2'), out_proj_covar=tensor([9.8492e-05, 1.3743e-04, 1.0422e-04, 1.1835e-04, 1.0508e-04, 1.0003e-04, + 1.3083e-04, 1.0136e-04], device='cuda:2') +2022-11-16 09:20:37,476 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.21 vs. limit=2.0 +2022-11-16 09:20:59,048 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=103243.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:21:00,992 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.9140, 3.7871, 3.8934, 3.9271, 3.6588, 3.5583, 4.3380, 3.8052], + device='cuda:2'), covar=tensor([0.0480, 0.0952, 0.0532, 0.1221, 0.0541, 0.0371, 0.0669, 0.0818], + device='cuda:2'), in_proj_covar=tensor([0.0091, 0.0113, 0.0099, 0.0125, 0.0092, 0.0083, 0.0149, 0.0109], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 09:21:01,540 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 5.466e+01 1.279e+02 1.600e+02 2.005e+02 3.383e+02, threshold=3.199e+02, percent-clipped=1.0 +2022-11-16 09:21:13,295 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.82 vs. limit=2.0 +2022-11-16 09:21:42,723 INFO [train.py:876] (2/4) Epoch 15, batch 1500, loss[loss=0.08857, simple_loss=0.1345, pruned_loss=0.02135, over 5589.00 frames. ], tot_loss[loss=0.09505, simple_loss=0.1288, pruned_loss=0.03066, over 1092610.02 frames. ], batch size: 22, lr: 5.41e-03, grad_scale: 8.0 +2022-11-16 09:22:08,706 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.936e+01 1.307e+02 1.656e+02 2.050e+02 4.827e+02, threshold=3.313e+02, percent-clipped=2.0 +2022-11-16 09:22:14,254 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1384, 3.5349, 2.4387, 3.2309, 2.6230, 2.5559, 2.0173, 2.9928], + device='cuda:2'), covar=tensor([0.1331, 0.0343, 0.1246, 0.0485, 0.1216, 0.1176, 0.1957, 0.0548], + device='cuda:2'), in_proj_covar=tensor([0.0151, 0.0141, 0.0150, 0.0143, 0.0170, 0.0163, 0.0155, 0.0155], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 09:22:14,863 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1074, 3.7470, 2.8829, 1.8079, 3.4623, 1.3777, 3.4666, 2.0107], + device='cuda:2'), covar=tensor([0.1642, 0.0190, 0.0840, 0.1827, 0.0281, 0.1975, 0.0278, 0.1454], + device='cuda:2'), in_proj_covar=tensor([0.0116, 0.0104, 0.0113, 0.0110, 0.0102, 0.0119, 0.0101, 0.0108], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0005, 0.0005, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 09:22:18,233 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.7693, 3.6052, 3.6472, 3.5618, 3.6594, 3.6212, 1.5240, 3.7754], + device='cuda:2'), covar=tensor([0.0294, 0.0439, 0.0363, 0.0310, 0.0333, 0.0357, 0.3105, 0.0379], + device='cuda:2'), in_proj_covar=tensor([0.0105, 0.0089, 0.0089, 0.0083, 0.0102, 0.0091, 0.0131, 0.0110], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 09:22:21,970 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=103366.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 09:22:51,380 INFO [train.py:876] (2/4) Epoch 15, batch 1600, loss[loss=0.08841, simple_loss=0.1121, pruned_loss=0.03236, over 4737.00 frames. ], tot_loss[loss=0.09598, simple_loss=0.1293, pruned_loss=0.03134, over 1079032.78 frames. ], batch size: 135, lr: 5.41e-03, grad_scale: 8.0 +2022-11-16 09:23:03,577 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=103427.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 09:23:17,106 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.055e+01 1.366e+02 1.646e+02 2.008e+02 3.608e+02, threshold=3.293e+02, percent-clipped=2.0 +2022-11-16 09:23:17,262 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.2811, 4.1290, 2.6182, 3.8247, 3.1560, 2.7017, 2.2788, 3.3217], + device='cuda:2'), covar=tensor([0.1554, 0.0273, 0.1345, 0.0437, 0.0889, 0.1326, 0.2051, 0.0565], + device='cuda:2'), in_proj_covar=tensor([0.0153, 0.0143, 0.0152, 0.0145, 0.0172, 0.0165, 0.0157, 0.0157], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 09:23:21,828 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=103454.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:23:44,581 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=103488.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:23:57,692 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=103507.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:23:58,295 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.9673, 4.0294, 4.0974, 3.8098, 3.9401, 3.8030, 1.6129, 3.9815], + device='cuda:2'), covar=tensor([0.0474, 0.0615, 0.0531, 0.0596, 0.0772, 0.0959, 0.4206, 0.0712], + device='cuda:2'), in_proj_covar=tensor([0.0105, 0.0090, 0.0090, 0.0083, 0.0102, 0.0091, 0.0132, 0.0110], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 09:23:58,832 INFO [train.py:876] (2/4) Epoch 15, batch 1700, loss[loss=0.08277, simple_loss=0.1248, pruned_loss=0.02039, over 5741.00 frames. ], tot_loss[loss=0.09578, simple_loss=0.129, pruned_loss=0.03127, over 1076522.46 frames. ], batch size: 14, lr: 5.41e-03, grad_scale: 8.0 +2022-11-16 09:24:21,638 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=103543.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:24:24,448 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.609e+01 1.355e+02 1.661e+02 2.024e+02 3.979e+02, threshold=3.323e+02, percent-clipped=4.0 +2022-11-16 09:24:30,121 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=103555.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:24:43,168 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1163, 2.3946, 2.1452, 1.6046, 2.7077, 2.7116, 2.4511, 2.8211], + device='cuda:2'), covar=tensor([0.1586, 0.1535, 0.1904, 0.2553, 0.0781, 0.1053, 0.0727, 0.0977], + device='cuda:2'), in_proj_covar=tensor([0.0161, 0.0178, 0.0167, 0.0181, 0.0186, 0.0203, 0.0174, 0.0182], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 09:24:48,710 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.8069, 4.7637, 3.1247, 4.4060, 3.4948, 3.0168, 2.5716, 3.9362], + device='cuda:2'), covar=tensor([0.1172, 0.0255, 0.0993, 0.0431, 0.0717, 0.1018, 0.1734, 0.0363], + device='cuda:2'), in_proj_covar=tensor([0.0152, 0.0142, 0.0151, 0.0145, 0.0171, 0.0163, 0.0157, 0.0156], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 09:24:53,834 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=103591.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:25:06,720 INFO [train.py:876] (2/4) Epoch 15, batch 1800, loss[loss=0.1156, simple_loss=0.1422, pruned_loss=0.04453, over 5577.00 frames. ], tot_loss[loss=0.09592, simple_loss=0.1288, pruned_loss=0.03153, over 1079089.75 frames. ], batch size: 54, lr: 5.41e-03, grad_scale: 8.0 +2022-11-16 09:25:27,856 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.7543, 2.8072, 2.2215, 3.0220, 2.2627, 2.3854, 2.5980, 3.0596], + device='cuda:2'), covar=tensor([0.0999, 0.1273, 0.1840, 0.1047, 0.1276, 0.1130, 0.1206, 0.3077], + device='cuda:2'), in_proj_covar=tensor([0.0118, 0.0110, 0.0109, 0.0112, 0.0096, 0.0106, 0.0100, 0.0088], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 09:25:31,584 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.004e+01 1.509e+02 1.805e+02 2.366e+02 4.290e+02, threshold=3.611e+02, percent-clipped=5.0 +2022-11-16 09:26:13,047 INFO [train.py:876] (2/4) Epoch 15, batch 1900, loss[loss=0.1026, simple_loss=0.1261, pruned_loss=0.0395, over 5704.00 frames. ], tot_loss[loss=0.09688, simple_loss=0.1298, pruned_loss=0.03201, over 1082287.95 frames. ], batch size: 19, lr: 5.40e-03, grad_scale: 8.0 +2022-11-16 09:26:22,465 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=103722.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 09:26:32,249 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.1282, 2.3514, 2.5801, 3.2618, 3.1195, 2.3677, 2.2398, 3.3732], + device='cuda:2'), covar=tensor([0.2005, 0.2376, 0.2115, 0.2181, 0.1675, 0.3340, 0.2303, 0.1631], + device='cuda:2'), in_proj_covar=tensor([0.0261, 0.0192, 0.0185, 0.0295, 0.0226, 0.0199, 0.0188, 0.0249], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0006], + device='cuda:2') +2022-11-16 09:26:34,272 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=103740.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:26:35,055 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.72 vs. limit=2.0 +2022-11-16 09:26:38,966 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.053e+01 1.311e+02 1.680e+02 2.035e+02 3.370e+02, threshold=3.360e+02, percent-clipped=0.0 +2022-11-16 09:26:39,512 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.83 vs. limit=5.0 +2022-11-16 09:26:43,675 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=103754.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:26:48,831 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.7048, 2.4512, 2.6790, 3.6106, 3.4605, 2.7384, 2.6865, 3.6781], + device='cuda:2'), covar=tensor([0.0904, 0.2335, 0.2097, 0.2462, 0.1310, 0.3004, 0.1987, 0.0696], + device='cuda:2'), in_proj_covar=tensor([0.0260, 0.0192, 0.0185, 0.0295, 0.0226, 0.0199, 0.0188, 0.0249], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0006], + device='cuda:2') +2022-11-16 09:26:49,486 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.8346, 2.2832, 3.4630, 2.8958, 3.5372, 2.2299, 3.1177, 3.6584], + device='cuda:2'), covar=tensor([0.0791, 0.1609, 0.1001, 0.1866, 0.0641, 0.1992, 0.1552, 0.1025], + device='cuda:2'), in_proj_covar=tensor([0.0247, 0.0191, 0.0216, 0.0209, 0.0240, 0.0198, 0.0225, 0.0229], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 09:26:50,733 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=103765.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:27:05,365 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.0497, 2.5574, 2.9452, 3.8944, 3.8774, 3.0486, 2.7583, 3.9542], + device='cuda:2'), covar=tensor([0.0785, 0.2730, 0.2175, 0.2635, 0.1137, 0.2859, 0.2174, 0.0895], + device='cuda:2'), in_proj_covar=tensor([0.0260, 0.0191, 0.0185, 0.0294, 0.0225, 0.0198, 0.0187, 0.0248], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0006], + device='cuda:2') +2022-11-16 09:27:06,536 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=103788.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:27:15,377 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=103801.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:27:15,903 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=103802.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:27:20,741 INFO [train.py:876] (2/4) Epoch 15, batch 2000, loss[loss=0.06678, simple_loss=0.1025, pruned_loss=0.01554, over 5188.00 frames. ], tot_loss[loss=0.09572, simple_loss=0.1287, pruned_loss=0.03136, over 1087426.79 frames. ], batch size: 8, lr: 5.40e-03, grad_scale: 8.0 +2022-11-16 09:27:24,416 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.54 vs. limit=2.0 +2022-11-16 09:27:24,833 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.6037, 1.1062, 1.0484, 0.9328, 1.0213, 1.2717, 1.1931, 1.2428], + device='cuda:2'), covar=tensor([0.4452, 0.1057, 0.3459, 0.2900, 0.2039, 0.0647, 0.2247, 0.1795], + device='cuda:2'), in_proj_covar=tensor([0.0113, 0.0105, 0.0106, 0.0104, 0.0079, 0.0073, 0.0087, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-11-16 09:27:32,516 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=103826.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:27:39,664 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=103836.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:27:47,518 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.204e+01 1.423e+02 1.683e+02 2.188e+02 4.061e+02, threshold=3.366e+02, percent-clipped=3.0 +2022-11-16 09:28:06,739 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=103876.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:28:29,291 INFO [train.py:876] (2/4) Epoch 15, batch 2100, loss[loss=0.1149, simple_loss=0.1338, pruned_loss=0.04799, over 4109.00 frames. ], tot_loss[loss=0.0971, simple_loss=0.1297, pruned_loss=0.03223, over 1083162.56 frames. ], batch size: 181, lr: 5.40e-03, grad_scale: 8.0 +2022-11-16 09:28:38,353 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.7972, 2.8468, 2.4619, 2.9721, 2.2788, 2.4458, 2.7320, 3.2395], + device='cuda:2'), covar=tensor([0.1058, 0.1172, 0.2002, 0.1276, 0.1387, 0.0985, 0.1474, 0.1958], + device='cuda:2'), in_proj_covar=tensor([0.0119, 0.0111, 0.0110, 0.0113, 0.0097, 0.0108, 0.0102, 0.0089], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 09:28:43,421 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.7884, 4.6817, 4.8442, 4.8906, 4.3092, 4.0733, 5.2667, 4.6312], + device='cuda:2'), covar=tensor([0.0383, 0.0835, 0.0311, 0.1092, 0.0532, 0.0373, 0.0674, 0.0616], + device='cuda:2'), in_proj_covar=tensor([0.0092, 0.0114, 0.0099, 0.0126, 0.0093, 0.0083, 0.0150, 0.0110], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 09:28:48,744 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=103937.0, num_to_drop=1, layers_to_drop={3} +2022-11-16 09:28:55,840 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 6.938e+01 1.249e+02 1.692e+02 2.000e+02 3.700e+02, threshold=3.385e+02, percent-clipped=3.0 +2022-11-16 09:29:03,474 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.2468, 1.5349, 1.1035, 1.1794, 1.3163, 1.3640, 0.9143, 1.4867], + device='cuda:2'), covar=tensor([0.0090, 0.0050, 0.0092, 0.0079, 0.0076, 0.0077, 0.0122, 0.0068], + device='cuda:2'), in_proj_covar=tensor([0.0070, 0.0064, 0.0064, 0.0069, 0.0068, 0.0063, 0.0060, 0.0059], + device='cuda:2'), out_proj_covar=tensor([6.2550e-05, 5.6518e-05, 5.5690e-05, 6.0194e-05, 5.9732e-05, 5.4466e-05, + 5.3251e-05, 5.1339e-05], device='cuda:2') +2022-11-16 09:29:08,276 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.29 vs. limit=5.0 +2022-11-16 09:29:20,823 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.5319, 4.4595, 3.1205, 4.3179, 3.4825, 3.1155, 2.5206, 3.6738], + device='cuda:2'), covar=tensor([0.1405, 0.0301, 0.1014, 0.0400, 0.0744, 0.0942, 0.1933, 0.0498], + device='cuda:2'), in_proj_covar=tensor([0.0150, 0.0141, 0.0150, 0.0144, 0.0170, 0.0162, 0.0154, 0.0155], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 09:29:24,781 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=103990.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:29:29,414 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=103997.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:29:37,376 INFO [train.py:876] (2/4) Epoch 15, batch 2200, loss[loss=0.09805, simple_loss=0.1285, pruned_loss=0.03382, over 5632.00 frames. ], tot_loss[loss=0.09769, simple_loss=0.1302, pruned_loss=0.0326, over 1080512.10 frames. ], batch size: 32, lr: 5.40e-03, grad_scale: 8.0 +2022-11-16 09:29:46,236 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=104022.0, num_to_drop=1, layers_to_drop={2} +2022-11-16 09:30:02,947 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0444, 2.9498, 3.1488, 1.5532, 3.0093, 3.3689, 3.4308, 3.6963], + device='cuda:2'), covar=tensor([0.2146, 0.1625, 0.1645, 0.3388, 0.0889, 0.1088, 0.0616, 0.0840], + device='cuda:2'), in_proj_covar=tensor([0.0164, 0.0180, 0.0168, 0.0184, 0.0189, 0.0208, 0.0176, 0.0186], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 09:30:04,014 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.673e+01 1.367e+02 1.685e+02 2.131e+02 5.334e+02, threshold=3.371e+02, percent-clipped=2.0 +2022-11-16 09:30:04,864 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=104049.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:30:06,209 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=104051.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:30:10,741 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=104058.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:30:15,355 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.4487, 1.7021, 2.1742, 1.1441, 2.3709, 1.9088, 1.7250, 1.4892], + device='cuda:2'), covar=tensor([0.1629, 0.0665, 0.0757, 0.0962, 0.1264, 0.0854, 0.0447, 0.0896], + device='cuda:2'), in_proj_covar=tensor([0.0017, 0.0027, 0.0019, 0.0022, 0.0018, 0.0017, 0.0025, 0.0018], + device='cuda:2'), out_proj_covar=tensor([9.5576e-05, 1.3394e-04, 1.0176e-04, 1.1498e-04, 1.0257e-04, 9.7698e-05, + 1.2784e-04, 9.8388e-05], device='cuda:2') +2022-11-16 09:30:18,835 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=104070.0, num_to_drop=1, layers_to_drop={0} +2022-11-16 09:30:36,202 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=104096.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:30:39,479 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.8073, 2.4505, 2.7932, 3.6937, 3.5283, 2.8522, 2.7790, 3.7108], + device='cuda:2'), covar=tensor([0.0859, 0.2353, 0.2342, 0.2702, 0.1348, 0.3108, 0.1898, 0.0882], + device='cuda:2'), in_proj_covar=tensor([0.0258, 0.0191, 0.0183, 0.0295, 0.0225, 0.0198, 0.0188, 0.0247], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0006], + device='cuda:2') +2022-11-16 09:30:45,074 INFO [train.py:876] (2/4) Epoch 15, batch 2300, loss[loss=0.08868, simple_loss=0.1167, pruned_loss=0.03033, over 4967.00 frames. ], tot_loss[loss=0.09455, simple_loss=0.1278, pruned_loss=0.03063, over 1084975.14 frames. ], batch size: 109, lr: 5.39e-03, grad_scale: 8.0 +2022-11-16 09:30:45,932 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=104110.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:30:52,940 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=104121.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:31:10,965 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.069e+01 1.436e+02 1.695e+02 2.146e+02 3.837e+02, threshold=3.391e+02, percent-clipped=3.0 +2022-11-16 09:31:18,496 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.8112, 2.6079, 2.6464, 2.4121, 2.8589, 2.6909, 2.7661, 2.8309], + device='cuda:2'), covar=tensor([0.0484, 0.0514, 0.0588, 0.0600, 0.0514, 0.0338, 0.0430, 0.0604], + device='cuda:2'), in_proj_covar=tensor([0.0152, 0.0161, 0.0116, 0.0151, 0.0194, 0.0120, 0.0133, 0.0161], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0004, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-16 09:31:52,456 INFO [train.py:876] (2/4) Epoch 15, batch 2400, loss[loss=0.1022, simple_loss=0.1286, pruned_loss=0.03793, over 5595.00 frames. ], tot_loss[loss=0.09625, simple_loss=0.129, pruned_loss=0.03174, over 1087143.62 frames. ], batch size: 23, lr: 5.39e-03, grad_scale: 8.0 +2022-11-16 09:31:56,136 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.9735, 2.9826, 2.7566, 3.0662, 2.5647, 2.7346, 2.9918, 3.1626], + device='cuda:2'), covar=tensor([0.1123, 0.0897, 0.1394, 0.0733, 0.1265, 0.0789, 0.0980, 0.2807], + device='cuda:2'), in_proj_covar=tensor([0.0120, 0.0111, 0.0109, 0.0113, 0.0097, 0.0108, 0.0101, 0.0089], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 09:32:02,918 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.4448, 4.0063, 4.1708, 3.9830, 4.6037, 4.2529, 4.0799, 4.4649], + device='cuda:2'), covar=tensor([0.0641, 0.0902, 0.0795, 0.1026, 0.0644, 0.0658, 0.0668, 0.0888], + device='cuda:2'), in_proj_covar=tensor([0.0152, 0.0161, 0.0115, 0.0150, 0.0193, 0.0119, 0.0132, 0.0161], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0004, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-11-16 09:32:02,949 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.6749, 3.6346, 3.8204, 3.5264, 3.7234, 3.7070, 1.5256, 3.8204], + device='cuda:2'), covar=tensor([0.0358, 0.0478, 0.0318, 0.0370, 0.0382, 0.0398, 0.3308, 0.0427], + device='cuda:2'), in_proj_covar=tensor([0.0105, 0.0089, 0.0090, 0.0082, 0.0102, 0.0091, 0.0131, 0.0109], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 09:32:08,119 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.00 vs. limit=2.0 +2022-11-16 09:32:08,476 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=104232.0, num_to_drop=1, layers_to_drop={2} +2022-11-16 09:32:19,522 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.767e+01 1.368e+02 1.673e+02 2.290e+02 7.384e+02, threshold=3.347e+02, percent-clipped=4.0 +2022-11-16 09:32:56,471 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.9511, 2.4381, 2.5487, 3.3003, 3.2103, 2.5708, 2.2961, 3.2162], + device='cuda:2'), covar=tensor([0.1706, 0.2270, 0.1828, 0.2365, 0.1396, 0.2664, 0.2076, 0.1441], + device='cuda:2'), in_proj_covar=tensor([0.0261, 0.0193, 0.0185, 0.0296, 0.0227, 0.0199, 0.0188, 0.0250], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0006], + device='cuda:2') +2022-11-16 09:33:00,207 INFO [train.py:876] (2/4) Epoch 15, batch 2500, loss[loss=0.0869, simple_loss=0.1212, pruned_loss=0.02631, over 5320.00 frames. ], tot_loss[loss=0.09716, simple_loss=0.1302, pruned_loss=0.03208, over 1087089.77 frames. ], batch size: 79, lr: 5.39e-03, grad_scale: 8.0 +2022-11-16 09:33:25,645 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=104346.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:33:27,458 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.221e+01 1.438e+02 1.846e+02 2.292e+02 4.407e+02, threshold=3.693e+02, percent-clipped=4.0 +2022-11-16 09:33:30,201 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=104353.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:33:59,807 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=104396.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:34:06,226 INFO [zipformer.py:623] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=104405.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:34:08,816 INFO [train.py:876] (2/4) Epoch 15, batch 2600, loss[loss=0.07705, simple_loss=0.1059, pruned_loss=0.02412, over 5743.00 frames. ], tot_loss[loss=0.09652, simple_loss=0.1297, pruned_loss=0.03168, over 1086138.07 frames. ], batch size: 13, lr: 5.39e-03, grad_scale: 8.0 +2022-11-16 09:34:16,651 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=104421.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:34:18,948 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.13 vs. limit=2.0 +2022-11-16 09:34:20,844 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.3221, 4.1856, 4.3890, 4.3869, 4.0251, 3.8011, 4.7639, 4.2702], + device='cuda:2'), covar=tensor([0.0430, 0.0872, 0.0395, 0.1138, 0.0485, 0.0355, 0.0673, 0.0749], + device='cuda:2'), in_proj_covar=tensor([0.0092, 0.0114, 0.0100, 0.0127, 0.0093, 0.0084, 0.0150, 0.0111], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 09:34:32,569 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=104444.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:34:35,748 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.383e+01 1.354e+02 1.630e+02 1.866e+02 3.463e+02, threshold=3.260e+02, percent-clipped=0.0 +2022-11-16 09:34:49,468 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=104469.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:35:16,364 INFO [train.py:876] (2/4) Epoch 15, batch 2700, loss[loss=0.1071, simple_loss=0.138, pruned_loss=0.03803, over 5713.00 frames. ], tot_loss[loss=0.09515, simple_loss=0.1287, pruned_loss=0.03081, over 1092183.16 frames. ], batch size: 31, lr: 5.38e-03, grad_scale: 8.0 +2022-11-16 09:35:31,703 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=104532.0, num_to_drop=1, layers_to_drop={1} +2022-11-16 09:35:42,734 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.521e+01 1.320e+02 1.626e+02 1.993e+02 3.646e+02, threshold=3.252e+02, percent-clipped=1.0 +2022-11-16 09:36:04,049 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=104580.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:36:17,433 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.8262, 3.8547, 3.8760, 3.6832, 3.7775, 3.7987, 1.5919, 3.9669], + device='cuda:2'), covar=tensor([0.0362, 0.0372, 0.0517, 0.0493, 0.0496, 0.0476, 0.3355, 0.0379], + device='cuda:2'), in_proj_covar=tensor([0.0105, 0.0089, 0.0089, 0.0082, 0.0102, 0.0090, 0.0131, 0.0109], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 09:36:24,513 INFO [train.py:876] (2/4) Epoch 15, batch 2800, loss[loss=0.07931, simple_loss=0.1231, pruned_loss=0.01777, over 5597.00 frames. ], tot_loss[loss=0.09702, simple_loss=0.1305, pruned_loss=0.03176, over 1094575.48 frames. ], batch size: 22, lr: 5.38e-03, grad_scale: 8.0 +2022-11-16 09:36:37,419 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.9558, 1.4357, 0.9556, 1.1892, 1.3139, 1.1986, 0.7460, 1.3275], + device='cuda:2'), covar=tensor([0.0083, 0.0049, 0.0079, 0.0068, 0.0076, 0.0066, 0.0110, 0.0065], + device='cuda:2'), in_proj_covar=tensor([0.0070, 0.0065, 0.0064, 0.0068, 0.0067, 0.0063, 0.0060, 0.0060], + device='cuda:2'), out_proj_covar=tensor([6.2162e-05, 5.6946e-05, 5.5822e-05, 6.0016e-05, 5.9621e-05, 5.4300e-05, + 5.3289e-05, 5.1775e-05], device='cuda:2') +2022-11-16 09:36:49,221 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=104646.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:36:50,326 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.92 vs. limit=2.0 +2022-11-16 09:36:51,003 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.819e+01 1.524e+02 1.785e+02 2.265e+02 4.174e+02, threshold=3.570e+02, percent-clipped=2.0 +2022-11-16 09:36:54,181 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=104653.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:36:57,626 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.15 vs. limit=2.0 +2022-11-16 09:37:12,335 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.97 vs. limit=2.0 +2022-11-16 09:37:18,721 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.6938, 2.2892, 2.5956, 3.5286, 3.5303, 2.6158, 2.4063, 3.5478], + device='cuda:2'), covar=tensor([0.1068, 0.2777, 0.2445, 0.2734, 0.1373, 0.3513, 0.2502, 0.1040], + device='cuda:2'), in_proj_covar=tensor([0.0260, 0.0194, 0.0186, 0.0297, 0.0226, 0.0199, 0.0190, 0.0251], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0006], + device='cuda:2') +2022-11-16 09:37:21,820 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=104694.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:37:26,310 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=104701.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:37:29,418 INFO [zipformer.py:623] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=104705.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:37:32,269 INFO [train.py:876] (2/4) Epoch 15, batch 2900, loss[loss=0.1118, simple_loss=0.1484, pruned_loss=0.0376, over 5811.00 frames. ], tot_loss[loss=0.09675, simple_loss=0.1302, pruned_loss=0.03164, over 1092678.37 frames. ], batch size: 22, lr: 5.38e-03, grad_scale: 8.0 +2022-11-16 09:37:59,213 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.093e+01 1.400e+02 1.739e+02 2.219e+02 5.401e+02, threshold=3.478e+02, percent-clipped=7.0 +2022-11-16 09:38:02,044 INFO [zipformer.py:623] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=104753.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:38:20,911 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.6606, 2.0484, 2.5985, 2.3274, 2.6281, 2.9047, 2.2979, 2.0189], + device='cuda:2'), covar=tensor([0.0568, 0.0443, 0.0322, 0.0371, 0.1608, 0.0471, 0.0241, 0.0770], + device='cuda:2'), in_proj_covar=tensor([0.0017, 0.0027, 0.0019, 0.0023, 0.0019, 0.0018, 0.0026, 0.0018], + device='cuda:2'), out_proj_covar=tensor([9.7999e-05, 1.3712e-04, 1.0468e-04, 1.1800e-04, 1.0565e-04, 9.9335e-05, + 1.3050e-04, 1.0089e-04], device='cuda:2') +2022-11-16 09:38:27,909 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.5879, 4.7303, 3.5961, 2.1675, 4.3783, 1.7954, 4.2640, 2.4705], + device='cuda:2'), covar=tensor([0.1398, 0.0102, 0.0506, 0.1936, 0.0196, 0.1827, 0.0233, 0.1428], + device='cuda:2'), in_proj_covar=tensor([0.0116, 0.0105, 0.0114, 0.0109, 0.0102, 0.0119, 0.0100, 0.0106], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0005, 0.0005, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 09:38:33,928 INFO [scaling.py:664] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.34 vs. limit=5.0 +2022-11-16 09:38:40,271 INFO [train.py:876] (2/4) Epoch 15, batch 3000, loss[loss=0.1317, simple_loss=0.1527, pruned_loss=0.0553, over 5343.00 frames. ], tot_loss[loss=0.09704, simple_loss=0.1302, pruned_loss=0.03196, over 1088539.75 frames. ], batch size: 70, lr: 5.38e-03, grad_scale: 8.0 +2022-11-16 09:38:40,271 INFO [train.py:899] (2/4) Computing validation loss +2022-11-16 09:38:52,398 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.4339, 2.8797, 2.7907, 1.7205, 3.2761, 3.2254, 3.2307, 3.5851], + device='cuda:2'), covar=tensor([0.1661, 0.1468, 0.1283, 0.2907, 0.0547, 0.0870, 0.0549, 0.0636], + device='cuda:2'), in_proj_covar=tensor([0.0162, 0.0178, 0.0166, 0.0181, 0.0186, 0.0205, 0.0175, 0.0182], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 09:38:58,034 INFO [train.py:908] (2/4) Epoch 15, validation: loss=0.1809, simple_loss=0.1888, pruned_loss=0.08654, over 1530663.00 frames. +2022-11-16 09:38:58,034 INFO [train.py:909] (2/4) Maximum memory allocated so far is 4715MB +2022-11-16 09:39:01,206 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.3761, 2.9872, 3.0173, 2.7796, 1.7621, 3.0005, 2.0851, 2.8621], + device='cuda:2'), covar=tensor([0.0366, 0.0187, 0.0160, 0.0270, 0.0546, 0.0196, 0.0488, 0.0181], + device='cuda:2'), in_proj_covar=tensor([0.0195, 0.0187, 0.0183, 0.0208, 0.0198, 0.0186, 0.0195, 0.0187], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-11-16 09:39:17,586 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.4548, 1.6057, 1.7409, 1.8092, 1.5944, 1.4631, 1.5566, 1.5566], + device='cuda:2'), covar=tensor([0.2752, 0.2384, 0.2077, 0.1668, 0.1888, 0.2713, 0.2175, 0.1204], + device='cuda:2'), in_proj_covar=tensor([0.0120, 0.0110, 0.0108, 0.0111, 0.0096, 0.0107, 0.0100, 0.0088], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 09:39:17,612 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.7995, 1.2501, 0.8138, 0.9027, 1.0043, 0.9940, 0.6130, 1.1793], + device='cuda:2'), covar=tensor([0.0117, 0.0056, 0.0088, 0.0076, 0.0085, 0.0085, 0.0126, 0.0067], + device='cuda:2'), in_proj_covar=tensor([0.0069, 0.0064, 0.0064, 0.0068, 0.0067, 0.0062, 0.0060, 0.0059], + device='cuda:2'), out_proj_covar=tensor([6.1778e-05, 5.6618e-05, 5.5546e-05, 5.9429e-05, 5.9114e-05, 5.3917e-05, + 5.2848e-05, 5.1173e-05], device='cuda:2') +2022-11-16 09:39:25,316 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.210e+01 1.336e+02 1.693e+02 2.076e+02 4.663e+02, threshold=3.385e+02, percent-clipped=3.0 +2022-11-16 09:39:38,180 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.28 vs. limit=2.0 +2022-11-16 09:40:06,263 INFO [train.py:876] (2/4) Epoch 15, batch 3100, loss[loss=0.08238, simple_loss=0.1211, pruned_loss=0.02183, over 5583.00 frames. ], tot_loss[loss=0.09621, simple_loss=0.1293, pruned_loss=0.03158, over 1083510.26 frames. ], batch size: 23, lr: 5.37e-03, grad_scale: 8.0 +2022-11-16 09:40:33,544 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 6.354e+01 1.349e+02 1.627e+02 2.090e+02 3.507e+02, threshold=3.255e+02, percent-clipped=1.0 +2022-11-16 09:41:18,932 INFO [train.py:876] (2/4) Epoch 15, batch 3200, loss[loss=0.09378, simple_loss=0.1293, pruned_loss=0.02915, over 5136.00 frames. ], tot_loss[loss=0.09603, simple_loss=0.1297, pruned_loss=0.03119, over 1086714.15 frames. ], batch size: 91, lr: 5.37e-03, grad_scale: 8.0 +2022-11-16 09:41:46,950 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.649e+01 1.417e+02 1.645e+02 2.110e+02 3.664e+02, threshold=3.291e+02, percent-clipped=2.0 +2022-11-16 09:42:17,051 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.6039, 1.8846, 2.4523, 2.2300, 2.4433, 1.7031, 2.2960, 2.6056], + device='cuda:2'), covar=tensor([0.0832, 0.1241, 0.0902, 0.0954, 0.1077, 0.1469, 0.1132, 0.0776], + device='cuda:2'), in_proj_covar=tensor([0.0252, 0.0194, 0.0222, 0.0215, 0.0246, 0.0201, 0.0230, 0.0235], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 09:42:28,186 INFO [train.py:876] (2/4) Epoch 15, batch 3300, loss[loss=0.07455, simple_loss=0.1089, pruned_loss=0.02011, over 5737.00 frames. ], tot_loss[loss=0.0961, simple_loss=0.1293, pruned_loss=0.03144, over 1086621.09 frames. ], batch size: 13, lr: 5.37e-03, grad_scale: 8.0 +2022-11-16 09:42:38,183 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.95 vs. limit=2.0 +2022-11-16 09:42:48,728 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([0.8610, 1.0785, 0.7906, 0.9214, 0.9438, 1.0405, 0.7087, 1.1403], + device='cuda:2'), covar=tensor([0.0112, 0.0061, 0.0104, 0.0070, 0.0094, 0.0079, 0.0118, 0.0071], + device='cuda:2'), in_proj_covar=tensor([0.0069, 0.0064, 0.0063, 0.0067, 0.0066, 0.0061, 0.0059, 0.0058], + device='cuda:2'), out_proj_covar=tensor([6.1076e-05, 5.6459e-05, 5.4543e-05, 5.8868e-05, 5.8212e-05, 5.2979e-05, + 5.2132e-05, 5.0735e-05], device='cuda:2') +2022-11-16 09:42:56,052 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.723e+01 1.318e+02 1.706e+02 2.251e+02 4.750e+02, threshold=3.412e+02, percent-clipped=5.0 +2022-11-16 09:43:26,323 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.55 vs. limit=2.0 +2022-11-16 09:43:37,382 INFO [train.py:876] (2/4) Epoch 15, batch 3400, loss[loss=0.09781, simple_loss=0.1311, pruned_loss=0.03227, over 5194.00 frames. ], tot_loss[loss=0.097, simple_loss=0.13, pruned_loss=0.032, over 1083885.99 frames. ], batch size: 91, lr: 5.37e-03, grad_scale: 8.0 +2022-11-16 09:44:05,015 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.327e+01 1.478e+02 1.801e+02 2.310e+02 1.078e+03, threshold=3.602e+02, percent-clipped=8.0 +2022-11-16 09:44:19,066 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.0841, 2.6656, 3.2724, 1.8617, 2.9373, 3.4994, 3.2230, 3.4561], + device='cuda:2'), covar=tensor([0.2541, 0.1859, 0.0848, 0.3366, 0.0879, 0.1170, 0.0940, 0.0947], + device='cuda:2'), in_proj_covar=tensor([0.0161, 0.0176, 0.0164, 0.0178, 0.0184, 0.0202, 0.0171, 0.0178], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 09:44:46,640 INFO [train.py:876] (2/4) Epoch 15, batch 3500, loss[loss=0.0498, simple_loss=0.08613, pruned_loss=0.00674, over 4684.00 frames. ], tot_loss[loss=0.09585, simple_loss=0.1288, pruned_loss=0.03145, over 1081443.34 frames. ], batch size: 5, lr: 5.36e-03, grad_scale: 8.0 +2022-11-16 09:44:59,212 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.1519, 1.8481, 1.9989, 2.3370, 2.5282, 1.9780, 1.6675, 2.3838], + device='cuda:2'), covar=tensor([0.2429, 0.2109, 0.2001, 0.1264, 0.1300, 0.2603, 0.2198, 0.2117], + device='cuda:2'), in_proj_covar=tensor([0.0258, 0.0192, 0.0184, 0.0294, 0.0225, 0.0197, 0.0189, 0.0249], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0006], + device='cuda:2') +2022-11-16 09:45:13,771 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([2.5627, 4.3312, 2.9522, 4.1620, 3.2814, 3.0320, 2.2526, 3.5331], + device='cuda:2'), covar=tensor([0.1273, 0.0303, 0.1076, 0.0404, 0.0836, 0.1021, 0.1991, 0.0479], + device='cuda:2'), in_proj_covar=tensor([0.0150, 0.0141, 0.0150, 0.0144, 0.0169, 0.0161, 0.0154, 0.0154], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-11-16 09:45:15,063 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.540e+01 1.261e+02 1.523e+02 1.855e+02 3.568e+02, threshold=3.045e+02, percent-clipped=0.0 +2022-11-16 09:45:20,634 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([4.1644, 4.0295, 4.0836, 4.1053, 3.8198, 3.4530, 4.5063, 3.9060], + device='cuda:2'), covar=tensor([0.0347, 0.0940, 0.0464, 0.1176, 0.0431, 0.0476, 0.0675, 0.0755], + device='cuda:2'), in_proj_covar=tensor([0.0092, 0.0114, 0.0099, 0.0127, 0.0092, 0.0085, 0.0149, 0.0110], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 09:45:22,699 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([3.6221, 3.7414, 3.0597, 3.6702, 3.6004, 3.3844, 3.6504, 3.5850], + device='cuda:2'), covar=tensor([0.0846, 0.0916, 0.2346, 0.1073, 0.1203, 0.0714, 0.1026, 0.0712], + device='cuda:2'), in_proj_covar=tensor([0.0137, 0.0185, 0.0276, 0.0178, 0.0219, 0.0177, 0.0192, 0.0179], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0004, 0.0002, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-11-16 09:45:42,592 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=105388.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:45:57,212 INFO [train.py:876] (2/4) Epoch 15, batch 3600, loss[loss=0.1358, simple_loss=0.1622, pruned_loss=0.05467, over 5461.00 frames. ], tot_loss[loss=0.09488, simple_loss=0.1284, pruned_loss=0.03067, over 1086193.39 frames. ], batch size: 53, lr: 5.36e-03, grad_scale: 8.0 +2022-11-16 09:46:15,448 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.83 vs. limit=2.0 +2022-11-16 09:46:25,419 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.909e+01 1.343e+02 1.539e+02 1.882e+02 3.139e+02, threshold=3.078e+02, percent-clipped=1.0 +2022-11-16 09:46:26,030 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=105449.0, num_to_drop=1, layers_to_drop={3} +2022-11-16 09:46:52,070 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=105486.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:47:03,236 INFO [zipformer.py:623] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=105502.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:47:07,851 INFO [train.py:876] (2/4) Epoch 15, batch 3700, loss[loss=0.06704, simple_loss=0.09812, pruned_loss=0.01799, over 5564.00 frames. ], tot_loss[loss=0.095, simple_loss=0.1281, pruned_loss=0.03095, over 1088702.94 frames. ], batch size: 14, lr: 5.36e-03, grad_scale: 8.0 +2022-11-16 09:47:08,371 INFO [scaling.py:664] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.27 vs. limit=2.0 +2022-11-16 09:47:34,344 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=105547.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:47:35,432 INFO [optim.py:343] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.168e+01 1.329e+02 1.592e+02 1.987e+02 4.072e+02, threshold=3.183e+02, percent-clipped=1.0 +2022-11-16 09:47:45,377 INFO [zipformer.py:623] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=105563.0, num_to_drop=0, layers_to_drop=set() +2022-11-16 09:48:04,003 INFO [zipformer.py:1411] (2/4) attn_weights_entropy = tensor([1.7361, 1.9593, 1.8035, 1.4208, 1.7796, 2.2496, 2.1017, 2.3703], + device='cuda:2'), covar=tensor([0.1850, 0.1932, 0.2104, 0.2823, 0.1396, 0.1249, 0.0941, 0.1313], + device='cuda:2'), in_proj_covar=tensor([0.0161, 0.0176, 0.0165, 0.0178, 0.0183, 0.0203, 0.0172, 0.0179], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2')