{ "best_metric": 22.042796384901646, "best_model_checkpoint": "BA_Model_V2/checkpoint-3000", "epoch": 17.985611510791365, "eval_steps": 1000, "global_step": 5000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.36, "grad_norm": 8.64929485321045, "learning_rate": 1.9600000000000003e-06, "loss": 1.2156, "step": 100 }, { "epoch": 0.72, "grad_norm": 3.87185001373291, "learning_rate": 3.96e-06, "loss": 0.3891, "step": 200 }, { "epoch": 1.08, "grad_norm": 3.1216392517089844, "learning_rate": 5.9600000000000005e-06, "loss": 0.2699, "step": 300 }, { "epoch": 1.44, "grad_norm": 6.549575328826904, "learning_rate": 7.960000000000002e-06, "loss": 0.2151, "step": 400 }, { "epoch": 1.8, "grad_norm": 2.978250026702881, "learning_rate": 9.960000000000001e-06, "loss": 0.2213, "step": 500 }, { "epoch": 2.16, "grad_norm": 8.912956237792969, "learning_rate": 9.782222222222222e-06, "loss": 0.1881, "step": 600 }, { "epoch": 2.52, "grad_norm": 2.4266467094421387, "learning_rate": 9.56e-06, "loss": 0.1264, "step": 700 }, { "epoch": 2.88, "grad_norm": 2.5170629024505615, "learning_rate": 9.33777777777778e-06, "loss": 0.1319, "step": 800 }, { "epoch": 3.24, "grad_norm": 1.9321565628051758, "learning_rate": 9.115555555555556e-06, "loss": 0.0872, "step": 900 }, { "epoch": 3.6, "grad_norm": 2.928337335586548, "learning_rate": 8.893333333333333e-06, "loss": 0.0687, "step": 1000 }, { "epoch": 3.6, "eval_cer": 12.433448002116961, "eval_loss": 0.37324240803718567, "eval_runtime": 946.036, "eval_samples_per_second": 1.568, "eval_steps_per_second": 0.197, "eval_wer": 22.268740031897927, "step": 1000 }, { "epoch": 3.96, "grad_norm": 2.3376998901367188, "learning_rate": 8.671111111111113e-06, "loss": 0.0724, "step": 1100 }, { "epoch": 4.32, "grad_norm": 1.9221140146255493, "learning_rate": 8.448888888888889e-06, "loss": 0.0423, "step": 1200 }, { "epoch": 4.68, "grad_norm": 1.7310447692871094, "learning_rate": 8.226666666666667e-06, "loss": 0.0423, "step": 1300 }, { "epoch": 5.04, "grad_norm": 1.6635299921035767, "learning_rate": 8.004444444444445e-06, "loss": 0.0425, "step": 1400 }, { "epoch": 5.4, "grad_norm": 1.2995408773422241, "learning_rate": 7.782222222222223e-06, "loss": 0.0272, "step": 1500 }, { "epoch": 5.76, "grad_norm": 2.0113301277160645, "learning_rate": 7.5600000000000005e-06, "loss": 0.0307, "step": 1600 }, { "epoch": 6.12, "grad_norm": 1.9505223035812378, "learning_rate": 7.337777777777778e-06, "loss": 0.0244, "step": 1700 }, { "epoch": 6.47, "grad_norm": 1.1860969066619873, "learning_rate": 7.115555555555557e-06, "loss": 0.018, "step": 1800 }, { "epoch": 6.83, "grad_norm": 0.8492623567581177, "learning_rate": 6.893333333333334e-06, "loss": 0.0196, "step": 1900 }, { "epoch": 7.19, "grad_norm": 1.2874751091003418, "learning_rate": 6.671111111111112e-06, "loss": 0.0151, "step": 2000 }, { "epoch": 7.19, "eval_cer": 12.361471288700715, "eval_loss": 0.4464028477668762, "eval_runtime": 940.6773, "eval_samples_per_second": 1.577, "eval_steps_per_second": 0.198, "eval_wer": 22.610978203083466, "step": 2000 }, { "epoch": 7.55, "grad_norm": 0.9422910809516907, "learning_rate": 6.448888888888889e-06, "loss": 0.0117, "step": 2100 }, { "epoch": 7.91, "grad_norm": 0.8238167762756348, "learning_rate": 6.2266666666666675e-06, "loss": 0.0119, "step": 2200 }, { "epoch": 8.27, "grad_norm": 0.7173901796340942, "learning_rate": 6.004444444444445e-06, "loss": 0.0094, "step": 2300 }, { "epoch": 8.63, "grad_norm": 0.7112721800804138, "learning_rate": 
5.782222222222222e-06, "loss": 0.0103, "step": 2400 }, { "epoch": 8.99, "grad_norm": 0.7713698148727417, "learning_rate": 5.560000000000001e-06, "loss": 0.0087, "step": 2500 }, { "epoch": 9.35, "grad_norm": 0.798090934753418, "learning_rate": 5.337777777777779e-06, "loss": 0.0045, "step": 2600 }, { "epoch": 9.71, "grad_norm": 0.2931395471096039, "learning_rate": 5.115555555555556e-06, "loss": 0.0062, "step": 2700 }, { "epoch": 10.07, "grad_norm": 0.447310209274292, "learning_rate": 4.893333333333334e-06, "loss": 0.0044, "step": 2800 }, { "epoch": 10.43, "grad_norm": 0.07779108732938766, "learning_rate": 4.6711111111111115e-06, "loss": 0.0045, "step": 2900 }, { "epoch": 10.79, "grad_norm": 0.874796986579895, "learning_rate": 4.448888888888889e-06, "loss": 0.0039, "step": 3000 }, { "epoch": 10.79, "eval_cer": 12.351944958983859, "eval_loss": 0.4558783173561096, "eval_runtime": 949.821, "eval_samples_per_second": 1.561, "eval_steps_per_second": 0.196, "eval_wer": 22.042796384901646, "step": 3000 }, { "epoch": 11.15, "grad_norm": 0.058852240443229675, "learning_rate": 4.226666666666667e-06, "loss": 0.0029, "step": 3100 }, { "epoch": 11.51, "grad_norm": 0.043637294322252274, "learning_rate": 4.004444444444445e-06, "loss": 0.002, "step": 3200 }, { "epoch": 11.87, "grad_norm": 0.03788485378026962, "learning_rate": 3.782222222222223e-06, "loss": 0.0028, "step": 3300 }, { "epoch": 12.23, "grad_norm": 0.0388493537902832, "learning_rate": 3.5600000000000002e-06, "loss": 0.0021, "step": 3400 }, { "epoch": 12.59, "grad_norm": 0.36381232738494873, "learning_rate": 3.337777777777778e-06, "loss": 0.0014, "step": 3500 }, { "epoch": 12.95, "grad_norm": 0.0187509935349226, "learning_rate": 3.1155555555555555e-06, "loss": 0.0015, "step": 3600 }, { "epoch": 13.31, "grad_norm": 0.017583340406417847, "learning_rate": 2.8933333333333337e-06, "loss": 0.0016, "step": 3700 }, { "epoch": 13.67, "grad_norm": 0.019498176872730255, "learning_rate": 2.6711111111111116e-06, "loss": 0.0012, "step": 3800 }, { "epoch": 14.03, "grad_norm": 0.019207799807190895, "learning_rate": 2.448888888888889e-06, "loss": 0.0008, "step": 3900 }, { "epoch": 14.39, "grad_norm": 0.016761431470513344, "learning_rate": 2.226666666666667e-06, "loss": 0.0008, "step": 4000 }, { "epoch": 14.39, "eval_cer": 12.13548557819529, "eval_loss": 0.5066311359405518, "eval_runtime": 938.9637, "eval_samples_per_second": 1.579, "eval_steps_per_second": 0.198, "eval_wer": 22.155768208399788, "step": 4000 }, { "epoch": 14.75, "grad_norm": 0.014156104065477848, "learning_rate": 2.0044444444444446e-06, "loss": 0.0009, "step": 4100 }, { "epoch": 15.11, "grad_norm": 0.01545019168406725, "learning_rate": 1.7822222222222225e-06, "loss": 0.0008, "step": 4200 }, { "epoch": 15.47, "grad_norm": 0.011677294038236141, "learning_rate": 1.56e-06, "loss": 0.0006, "step": 4300 }, { "epoch": 15.83, "grad_norm": 0.018884949386119843, "learning_rate": 1.337777777777778e-06, "loss": 0.0006, "step": 4400 }, { "epoch": 16.19, "grad_norm": 0.013283302076160908, "learning_rate": 1.1155555555555558e-06, "loss": 0.0007, "step": 4500 }, { "epoch": 16.55, "grad_norm": 0.016208523884415627, "learning_rate": 8.933333333333334e-07, "loss": 0.0007, "step": 4600 }, { "epoch": 16.91, "grad_norm": 0.017564095556735992, "learning_rate": 6.711111111111111e-07, "loss": 0.0005, "step": 4700 }, { "epoch": 17.27, "grad_norm": 0.011613059788942337, "learning_rate": 4.488888888888889e-07, "loss": 0.0005, "step": 4800 }, { "epoch": 17.63, "grad_norm": 0.014883192256093025, "learning_rate": 
2.266666666666667e-07, "loss": 0.0006, "step": 4900 }, { "epoch": 17.99, "grad_norm": 0.009823828004300594, "learning_rate": 4.444444444444445e-09, "loss": 0.0005, "step": 5000 }, { "epoch": 17.99, "eval_cer": 12.508070918232336, "eval_loss": 0.5289328098297119, "eval_runtime": 942.463, "eval_samples_per_second": 1.574, "eval_steps_per_second": 0.197, "eval_wer": 22.760499734183945, "step": 5000 } ], "logging_steps": 100, "max_steps": 5000, "num_input_tokens_seen": 0, "num_train_epochs": 18, "save_steps": 1000, "total_flos": 1.696010243125248e+20, "train_batch_size": 8, "trial_name": null, "trial_params": null }