{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.2776,
  "eval_steps": 1000,
  "global_step": 5000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.005,
      "grad_norm": 88.54288482666016,
      "learning_rate": 4.2000000000000006e-07,
      "loss": 2.5968,
      "step": 25
    },
    {
      "epoch": 0.01,
      "grad_norm": 71.25262451171875,
      "learning_rate": 9.200000000000001e-07,
      "loss": 2.3534,
      "step": 50
    },
    {
      "epoch": 0.015,
      "grad_norm": 80.52667999267578,
      "learning_rate": 1.42e-06,
      "loss": 1.9642,
      "step": 75
    },
    {
      "epoch": 0.02,
      "grad_norm": 57.064151763916016,
      "learning_rate": 1.9200000000000003e-06,
      "loss": 1.6321,
      "step": 100
    },
    {
      "epoch": 0.025,
      "grad_norm": 50.983245849609375,
      "learning_rate": 2.42e-06,
      "loss": 1.5842,
      "step": 125
    },
    {
      "epoch": 0.03,
      "grad_norm": 52.22635269165039,
      "learning_rate": 2.92e-06,
      "loss": 1.4106,
      "step": 150
    },
    {
      "epoch": 0.035,
      "grad_norm": 53.9218864440918,
      "learning_rate": 3.4200000000000007e-06,
      "loss": 1.417,
      "step": 175
    },
    {
      "epoch": 0.04,
      "grad_norm": 48.82598876953125,
      "learning_rate": 3.920000000000001e-06,
      "loss": 1.4496,
      "step": 200
    },
    {
      "epoch": 0.045,
      "grad_norm": 63.08296585083008,
      "learning_rate": 4.42e-06,
      "loss": 1.3191,
      "step": 225
    },
    {
      "epoch": 0.05,
      "grad_norm": 41.98011016845703,
      "learning_rate": 4.92e-06,
      "loss": 1.0756,
      "step": 250
    },
    {
      "epoch": 0.055,
      "grad_norm": 53.777217864990234,
      "learning_rate": 5.420000000000001e-06,
      "loss": 1.2538,
      "step": 275
    },
    {
      "epoch": 0.06,
      "grad_norm": 57.49655532836914,
      "learning_rate": 5.92e-06,
      "loss": 1.2225,
      "step": 300
    },
    {
      "epoch": 0.065,
      "grad_norm": 52.660003662109375,
      "learning_rate": 6.4000000000000006e-06,
      "loss": 1.2031,
      "step": 325
    },
    {
      "epoch": 0.07,
      "grad_norm": 45.03782272338867,
      "learning_rate": 6.9e-06,
      "loss": 1.2953,
      "step": 350
    },
    {
      "epoch": 0.075,
      "grad_norm": 49.226261138916016,
      "learning_rate": 7.4e-06,
      "loss": 1.0642,
      "step": 375
    },
    {
      "epoch": 0.08,
      "grad_norm": 33.70058822631836,
      "learning_rate": 7.9e-06,
      "loss": 1.0962,
      "step": 400
    },
    {
      "epoch": 0.085,
      "grad_norm": 47.71772766113281,
      "learning_rate": 8.400000000000001e-06,
      "loss": 1.3042,
      "step": 425
    },
    {
      "epoch": 0.09,
      "grad_norm": 55.22367477416992,
      "learning_rate": 8.900000000000001e-06,
      "loss": 1.3794,
      "step": 450
    },
    {
      "epoch": 0.095,
      "grad_norm": 62.34690475463867,
      "learning_rate": 9.4e-06,
      "loss": 1.1516,
      "step": 475
    },
    {
      "epoch": 0.1,
      "grad_norm": 84.5876693725586,
      "learning_rate": 9.9e-06,
      "loss": 1.2527,
      "step": 500
    },
    {
      "epoch": 0.105,
      "grad_norm": 53.2620735168457,
      "learning_rate": 9.955555555555556e-06,
      "loss": 1.1349,
      "step": 525
    },
    {
      "epoch": 0.11,
      "grad_norm": 54.91654586791992,
      "learning_rate": 9.9e-06,
      "loss": 1.2107,
      "step": 550
    },
    {
      "epoch": 0.115,
      "grad_norm": 43.851173400878906,
      "learning_rate": 9.844444444444446e-06,
      "loss": 1.4065,
      "step": 575
    },
    {
      "epoch": 0.12,
      "grad_norm": 48.8751335144043,
      "learning_rate": 9.78888888888889e-06,
      "loss": 1.2255,
      "step": 600
    },
    {
      "epoch": 0.125,
      "grad_norm": 58.15187454223633,
      "learning_rate": 9.733333333333334e-06,
      "loss": 1.1948,
      "step": 625
    },
    {
      "epoch": 0.13,
      "grad_norm": 53.069602966308594,
      "learning_rate": 9.677777777777778e-06,
      "loss": 1.2878,
      "step": 650
    },
    {
      "epoch": 0.135,
      "grad_norm": 45.524715423583984,
      "learning_rate": 9.622222222222222e-06,
      "loss": 1.1325,
      "step": 675
    },
    {
      "epoch": 0.14,
      "grad_norm": 26.86398696899414,
      "learning_rate": 9.566666666666668e-06,
      "loss": 1.0762,
      "step": 700
    },
    {
      "epoch": 0.145,
      "grad_norm": 48.63008499145508,
      "learning_rate": 9.511111111111112e-06,
      "loss": 1.279,
      "step": 725
    },
    {
      "epoch": 0.15,
      "grad_norm": 50.58299255371094,
      "learning_rate": 9.455555555555557e-06,
      "loss": 1.0627,
      "step": 750
    },
    {
      "epoch": 0.155,
      "grad_norm": 32.71992492675781,
      "learning_rate": 9.4e-06,
      "loss": 1.0438,
      "step": 775
    },
    {
      "epoch": 0.16,
      "grad_norm": 42.12641143798828,
      "learning_rate": 9.344444444444446e-06,
      "loss": 1.1204,
      "step": 800
    },
    {
      "epoch": 0.165,
      "grad_norm": 44.01963806152344,
      "learning_rate": 9.28888888888889e-06,
      "loss": 0.908,
      "step": 825
    },
    {
      "epoch": 0.17,
      "grad_norm": 39.93177795410156,
      "learning_rate": 9.233333333333334e-06,
      "loss": 1.158,
      "step": 850
    },
    {
      "epoch": 0.175,
      "grad_norm": 38.166259765625,
      "learning_rate": 9.17777777777778e-06,
      "loss": 1.2021,
      "step": 875
    },
    {
      "epoch": 0.18,
      "grad_norm": 40.696678161621094,
      "learning_rate": 9.122222222222223e-06,
      "loss": 1.0054,
      "step": 900
    },
    {
      "epoch": 0.185,
      "grad_norm": 48.79116439819336,
      "learning_rate": 9.066666666666667e-06,
      "loss": 0.9587,
      "step": 925
    },
    {
      "epoch": 0.19,
      "grad_norm": 39.57270431518555,
      "learning_rate": 9.011111111111111e-06,
      "loss": 1.1084,
      "step": 950
    },
    {
      "epoch": 0.195,
      "grad_norm": 54.647216796875,
      "learning_rate": 8.955555555555555e-06,
      "loss": 1.0416,
      "step": 975
    },
    {
      "epoch": 0.2,
      "grad_norm": 42.69215774536133,
      "learning_rate": 8.900000000000001e-06,
      "loss": 0.97,
      "step": 1000
    },
    {
      "epoch": 0.2,
      "eval_cer": 38.476190476190474,
      "eval_loss": 0.7355929613113403,
      "eval_model_preparation_time": 0.0121,
      "eval_runtime": 116.5648,
      "eval_samples_per_second": 2.574,
      "eval_steps_per_second": 1.287,
      "eval_wer": 38.17307692307692,
      "step": 1000
    },
    {
      "epoch": 0.205,
      "grad_norm": 53.78874206542969,
      "learning_rate": 8.844444444444445e-06,
      "loss": 1.2455,
      "step": 1025
    },
    {
      "epoch": 0.21,
      "grad_norm": 34.4659423828125,
      "learning_rate": 8.788888888888891e-06,
      "loss": 1.0515,
      "step": 1050
    },
    {
      "epoch": 0.215,
      "grad_norm": 51.5352668762207,
      "learning_rate": 8.733333333333333e-06,
      "loss": 0.9412,
      "step": 1075
    },
    {
      "epoch": 0.22,
      "grad_norm": 41.004310607910156,
      "learning_rate": 8.677777777777779e-06,
      "loss": 1.0831,
      "step": 1100
    },
    {
      "epoch": 0.225,
      "grad_norm": 43.79686737060547,
      "learning_rate": 8.622222222222223e-06,
      "loss": 0.8895,
      "step": 1125
    },
    {
      "epoch": 0.23,
      "grad_norm": 32.330204010009766,
      "learning_rate": 8.566666666666667e-06,
      "loss": 0.8984,
      "step": 1150
    },
    {
      "epoch": 0.235,
      "grad_norm": 43.38914108276367,
      "learning_rate": 8.511111111111113e-06,
      "loss": 1.1215,
      "step": 1175
    },
    {
      "epoch": 0.24,
      "grad_norm": 62.52465057373047,
      "learning_rate": 8.455555555555555e-06,
      "loss": 0.9912,
      "step": 1200
    },
    {
      "epoch": 0.245,
      "grad_norm": 39.1291618347168,
      "learning_rate": 8.400000000000001e-06,
      "loss": 1.0422,
      "step": 1225
    },
    {
      "epoch": 0.25,
      "grad_norm": 53.39470291137695,
      "learning_rate": 8.344444444444445e-06,
      "loss": 1.1502,
      "step": 1250
    },
    {
      "epoch": 0.255,
      "grad_norm": 42.28481674194336,
      "learning_rate": 8.288888888888889e-06,
      "loss": 1.112,
      "step": 1275
    },
    {
      "epoch": 0.26,
      "grad_norm": 37.374542236328125,
      "learning_rate": 8.233333333333335e-06,
      "loss": 1.0152,
      "step": 1300
    },
    {
      "epoch": 0.265,
      "grad_norm": 52.94354248046875,
      "learning_rate": 8.177777777777779e-06,
      "loss": 0.9147,
      "step": 1325
    },
    {
      "epoch": 0.27,
      "grad_norm": 46.23354721069336,
      "learning_rate": 8.122222222222223e-06,
      "loss": 0.8429,
      "step": 1350
    },
    {
      "epoch": 0.275,
      "grad_norm": 40.089210510253906,
      "learning_rate": 8.066666666666667e-06,
      "loss": 0.8026,
      "step": 1375
    },
    {
      "epoch": 0.28,
      "grad_norm": 42.95212173461914,
      "learning_rate": 8.011111111111113e-06,
      "loss": 0.7804,
      "step": 1400
    },
    {
      "epoch": 0.285,
      "grad_norm": 56.216678619384766,
      "learning_rate": 7.955555555555557e-06,
      "loss": 0.8045,
      "step": 1425
    },
    {
      "epoch": 0.29,
      "grad_norm": 45.01803207397461,
      "learning_rate": 7.902222222222223e-06,
      "loss": 0.9807,
      "step": 1450
    },
    {
      "epoch": 0.295,
      "grad_norm": 63.81135940551758,
      "learning_rate": 7.846666666666667e-06,
      "loss": 0.9447,
      "step": 1475
    },
    {
      "epoch": 0.3,
      "grad_norm": 38.958457946777344,
      "learning_rate": 7.791111111111111e-06,
      "loss": 1.0144,
      "step": 1500
    },
    {
      "epoch": 0.305,
      "grad_norm": 48.87187957763672,
      "learning_rate": 7.735555555555557e-06,
      "loss": 1.0261,
      "step": 1525
    },
    {
      "epoch": 0.31,
      "grad_norm": 34.80329513549805,
      "learning_rate": 7.680000000000001e-06,
      "loss": 0.8152,
      "step": 1550
    },
    {
      "epoch": 0.315,
      "grad_norm": 45.12413024902344,
      "learning_rate": 7.624444444444445e-06,
      "loss": 0.8943,
      "step": 1575
    },
    {
      "epoch": 0.32,
      "grad_norm": 38.103729248046875,
      "learning_rate": 7.56888888888889e-06,
      "loss": 1.085,
      "step": 1600
    },
    {
      "epoch": 0.325,
      "grad_norm": 38.715492248535156,
      "learning_rate": 7.513333333333334e-06,
      "loss": 0.9853,
      "step": 1625
    },
    {
      "epoch": 0.33,
      "grad_norm": 42.01364517211914,
      "learning_rate": 7.457777777777778e-06,
      "loss": 0.8214,
      "step": 1650
    },
    {
      "epoch": 0.335,
      "grad_norm": 37.475799560546875,
      "learning_rate": 7.402222222222223e-06,
      "loss": 0.7706,
      "step": 1675
    },
    {
      "epoch": 0.34,
      "grad_norm": 40.387210845947266,
      "learning_rate": 7.346666666666668e-06,
      "loss": 1.081,
      "step": 1700
    },
    {
      "epoch": 0.345,
      "grad_norm": 21.215946197509766,
      "learning_rate": 7.291111111111112e-06,
      "loss": 0.8047,
      "step": 1725
    },
    {
      "epoch": 0.35,
      "grad_norm": 21.454147338867188,
      "learning_rate": 7.235555555555556e-06,
      "loss": 0.836,
      "step": 1750
    },
    {
      "epoch": 0.355,
      "grad_norm": 46.05310821533203,
      "learning_rate": 7.180000000000001e-06,
      "loss": 0.8883,
      "step": 1775
    },
    {
      "epoch": 0.36,
      "grad_norm": 61.988643646240234,
      "learning_rate": 7.124444444444445e-06,
      "loss": 0.8254,
      "step": 1800
    },
    {
      "epoch": 1.0038,
      "grad_norm": 34.00222396850586,
      "learning_rate": 7.06888888888889e-06,
      "loss": 0.5202,
      "step": 1825
    },
    {
      "epoch": 1.0088,
      "grad_norm": 23.966150283813477,
      "learning_rate": 7.0133333333333345e-06,
      "loss": 0.362,
      "step": 1850
    },
    {
      "epoch": 1.0138,
      "grad_norm": 19.050518035888672,
      "learning_rate": 6.9577777777777785e-06,
      "loss": 0.2891,
      "step": 1875
    },
    {
      "epoch": 1.0188,
      "grad_norm": 34.59785079956055,
      "learning_rate": 6.902222222222223e-06,
      "loss": 0.3438,
      "step": 1900
    },
    {
      "epoch": 1.0238,
      "grad_norm": 29.40850257873535,
      "learning_rate": 6.846666666666667e-06,
      "loss": 0.2822,
      "step": 1925
    },
    {
      "epoch": 1.0288,
      "grad_norm": 15.476716041564941,
      "learning_rate": 6.7911111111111115e-06,
      "loss": 0.4843,
      "step": 1950
    },
    {
      "epoch": 1.0338,
      "grad_norm": 33.912174224853516,
      "learning_rate": 6.735555555555556e-06,
      "loss": 0.3741,
      "step": 1975
    },
    {
      "epoch": 1.0388,
      "grad_norm": 30.411020278930664,
      "learning_rate": 6.680000000000001e-06,
      "loss": 0.3044,
      "step": 2000
    },
    {
      "epoch": 1.0388,
      "eval_cer": 23.904761904761905,
      "eval_loss": 0.309874564409256,
      "eval_model_preparation_time": 0.0121,
      "eval_runtime": 119.6464,
      "eval_samples_per_second": 2.507,
      "eval_steps_per_second": 1.254,
      "eval_wer": 23.46153846153846,
      "step": 2000
    },
    {
      "epoch": 1.0438,
      "grad_norm": 19.959854125976562,
      "learning_rate": 6.6244444444444445e-06,
      "loss": 0.2255,
      "step": 2025
    },
    {
      "epoch": 1.0488,
      "grad_norm": 20.751602172851562,
      "learning_rate": 6.568888888888889e-06,
      "loss": 0.3341,
      "step": 2050
    },
    {
      "epoch": 1.0538,
      "grad_norm": 24.52460479736328,
      "learning_rate": 6.513333333333333e-06,
      "loss": 0.4861,
      "step": 2075
    },
    {
      "epoch": 1.0588,
      "grad_norm": 5.740904331207275,
      "learning_rate": 6.457777777777778e-06,
      "loss": 0.4165,
      "step": 2100
    },
    {
      "epoch": 1.0638,
      "grad_norm": 24.452116012573242,
      "learning_rate": 6.402222222222223e-06,
      "loss": 0.4478,
      "step": 2125
    },
    {
      "epoch": 1.0688,
      "grad_norm": 29.230716705322266,
      "learning_rate": 6.346666666666668e-06,
      "loss": 0.4387,
      "step": 2150
    },
    {
      "epoch": 1.0738,
      "grad_norm": 41.07571792602539,
      "learning_rate": 6.291111111111111e-06,
      "loss": 0.2466,
      "step": 2175
    },
    {
      "epoch": 1.0788,
      "grad_norm": 19.89525032043457,
      "learning_rate": 6.235555555555556e-06,
      "loss": 0.3156,
      "step": 2200
    },
    {
      "epoch": 1.0838,
      "grad_norm": 33.628971099853516,
      "learning_rate": 6.18e-06,
      "loss": 0.3624,
      "step": 2225
    },
    {
      "epoch": 1.0888,
      "grad_norm": 23.34870147705078,
      "learning_rate": 6.124444444444445e-06,
      "loss": 0.4263,
      "step": 2250
    },
    {
      "epoch": 1.0937999999999999,
      "grad_norm": 30.75408172607422,
      "learning_rate": 6.06888888888889e-06,
      "loss": 0.4153,
      "step": 2275
    },
    {
      "epoch": 1.0988,
      "grad_norm": 7.229944705963135,
      "learning_rate": 6.013333333333335e-06,
      "loss": 0.3068,
      "step": 2300
    },
    {
      "epoch": 1.1038000000000001,
      "grad_norm": 37.87436294555664,
      "learning_rate": 5.957777777777778e-06,
      "loss": 0.3049,
      "step": 2325
    },
    {
      "epoch": 1.1088,
      "grad_norm": 33.34481430053711,
      "learning_rate": 5.902222222222223e-06,
      "loss": 0.2759,
      "step": 2350
    },
    {
      "epoch": 1.1138,
      "grad_norm": 24.211904525756836,
      "learning_rate": 5.846666666666667e-06,
      "loss": 0.35,
      "step": 2375
    },
    {
      "epoch": 1.1188,
      "grad_norm": 41.01383590698242,
      "learning_rate": 5.791111111111112e-06,
      "loss": 0.3992,
      "step": 2400
    },
    {
      "epoch": 1.1238,
      "grad_norm": 6.543262004852295,
      "learning_rate": 5.735555555555557e-06,
      "loss": 0.3356,
      "step": 2425
    },
    {
      "epoch": 1.1288,
      "grad_norm": 34.344913482666016,
      "learning_rate": 5.68e-06,
      "loss": 0.3364,
      "step": 2450
    },
    {
      "epoch": 1.1338,
      "grad_norm": 9.35561752319336,
      "learning_rate": 5.624444444444445e-06,
      "loss": 0.3606,
      "step": 2475
    },
    {
      "epoch": 1.1388,
      "grad_norm": 21.762096405029297,
      "learning_rate": 5.56888888888889e-06,
      "loss": 0.3437,
      "step": 2500
    },
    {
      "epoch": 1.1438,
      "grad_norm": 29.119796752929688,
      "learning_rate": 5.513333333333334e-06,
      "loss": 0.428,
      "step": 2525
    },
    {
      "epoch": 1.1488,
      "grad_norm": 46.66371536254883,
      "learning_rate": 5.4577777777777785e-06,
      "loss": 0.4566,
      "step": 2550
    },
    {
      "epoch": 1.1538,
      "grad_norm": 15.0108642578125,
      "learning_rate": 5.402222222222223e-06,
      "loss": 0.3162,
      "step": 2575
    },
    {
      "epoch": 1.1588,
      "grad_norm": 60.40862274169922,
      "learning_rate": 5.346666666666667e-06,
      "loss": 0.3696,
      "step": 2600
    },
    {
      "epoch": 1.1638,
      "grad_norm": 26.4654598236084,
      "learning_rate": 5.2911111111111115e-06,
      "loss": 0.2831,
      "step": 2625
    },
    {
      "epoch": 1.1688,
      "grad_norm": 23.651691436767578,
      "learning_rate": 5.235555555555556e-06,
      "loss": 0.4484,
      "step": 2650
    },
    {
      "epoch": 1.1738,
      "grad_norm": 33.675167083740234,
      "learning_rate": 5.18e-06,
      "loss": 0.3062,
      "step": 2675
    },
    {
      "epoch": 1.1788,
      "grad_norm": 27.336896896362305,
      "learning_rate": 5.124444444444445e-06,
      "loss": 0.3738,
      "step": 2700
    },
    {
      "epoch": 1.1838,
      "grad_norm": 17.323768615722656,
      "learning_rate": 5.06888888888889e-06,
      "loss": 0.3182,
      "step": 2725
    },
    {
      "epoch": 1.1888,
      "grad_norm": 18.657014846801758,
      "learning_rate": 5.013333333333333e-06,
      "loss": 0.3589,
      "step": 2750
    },
    {
      "epoch": 1.1938,
      "grad_norm": 39.54949188232422,
      "learning_rate": 4.957777777777778e-06,
      "loss": 0.4187,
      "step": 2775
    },
    {
      "epoch": 1.1988,
      "grad_norm": 57.646602630615234,
      "learning_rate": 4.902222222222222e-06,
      "loss": 0.3374,
      "step": 2800
    },
    {
      "epoch": 1.2038,
      "grad_norm": 24.029294967651367,
      "learning_rate": 4.846666666666667e-06,
      "loss": 0.2373,
      "step": 2825
    },
    {
      "epoch": 1.2088,
      "grad_norm": 34.673709869384766,
      "learning_rate": 4.791111111111111e-06,
      "loss": 0.3274,
      "step": 2850
    },
    {
      "epoch": 1.2138,
      "grad_norm": 40.28670120239258,
      "learning_rate": 4.735555555555556e-06,
      "loss": 0.3105,
      "step": 2875
    },
    {
      "epoch": 1.2187999999999999,
      "grad_norm": 22.120824813842773,
      "learning_rate": 4.680000000000001e-06,
      "loss": 0.2678,
      "step": 2900
    },
    {
      "epoch": 1.2238,
      "grad_norm": 21.942718505859375,
      "learning_rate": 4.624444444444445e-06,
      "loss": 0.5025,
      "step": 2925
    },
    {
      "epoch": 1.2288000000000001,
      "grad_norm": 31.645906448364258,
      "learning_rate": 4.568888888888889e-06,
      "loss": 0.3753,
      "step": 2950
    },
    {
      "epoch": 1.2338,
      "grad_norm": 19.00498390197754,
      "learning_rate": 4.513333333333333e-06,
      "loss": 0.3421,
      "step": 2975
    },
    {
      "epoch": 1.2388,
      "grad_norm": 42.73635482788086,
      "learning_rate": 4.457777777777778e-06,
      "loss": 0.3108,
      "step": 3000
    },
    {
      "epoch": 1.2388,
      "eval_cer": 7.7142857142857135,
      "eval_loss": 0.11533673852682114,
      "eval_model_preparation_time": 0.0121,
      "eval_runtime": 120.7728,
      "eval_samples_per_second": 2.484,
      "eval_steps_per_second": 1.242,
      "eval_wer": 7.5,
      "step": 3000
    },
    {
      "epoch": 1.2438,
      "grad_norm": 20.023395538330078,
      "learning_rate": 4.402222222222223e-06,
      "loss": 0.3144,
      "step": 3025
    },
    {
      "epoch": 1.2488,
      "grad_norm": 32.98371505737305,
      "learning_rate": 4.346666666666667e-06,
      "loss": 0.2618,
      "step": 3050
    },
    {
      "epoch": 1.2538,
      "grad_norm": 42.568119049072266,
      "learning_rate": 4.291111111111112e-06,
      "loss": 0.331,
      "step": 3075
    },
    {
      "epoch": 1.2588,
      "grad_norm": 0.5769469738006592,
      "learning_rate": 4.235555555555556e-06,
      "loss": 0.3391,
      "step": 3100
    },
    {
      "epoch": 1.2638,
      "grad_norm": 20.518579483032227,
      "learning_rate": 4.18e-06,
      "loss": 0.2234,
      "step": 3125
    },
    {
      "epoch": 1.2688,
      "grad_norm": 27.69402313232422,
      "learning_rate": 4.124444444444445e-06,
      "loss": 0.295,
      "step": 3150
    },
    {
      "epoch": 1.2738,
      "grad_norm": 31.121999740600586,
      "learning_rate": 4.0688888888888896e-06,
      "loss": 0.3753,
      "step": 3175
    },
    {
      "epoch": 1.2788,
      "grad_norm": 14.77844524383545,
      "learning_rate": 4.013333333333334e-06,
      "loss": 0.3706,
      "step": 3200
    },
    {
      "epoch": 1.2838,
      "grad_norm": 27.002138137817383,
      "learning_rate": 3.9577777777777785e-06,
      "loss": 0.2957,
      "step": 3225
    },
    {
      "epoch": 1.2888,
      "grad_norm": 13.426039695739746,
      "learning_rate": 3.9022222222222225e-06,
      "loss": 0.2902,
      "step": 3250
    },
    {
      "epoch": 1.2938,
      "grad_norm": 41.75296401977539,
      "learning_rate": 3.8466666666666665e-06,
      "loss": 0.2924,
      "step": 3275
    },
    {
      "epoch": 1.2988,
      "grad_norm": 52.886409759521484,
      "learning_rate": 3.7911111111111114e-06,
      "loss": 0.2519,
      "step": 3300
    },
    {
      "epoch": 1.3038,
      "grad_norm": 33.73487854003906,
      "learning_rate": 3.7355555555555555e-06,
      "loss": 0.2966,
      "step": 3325
    },
    {
      "epoch": 1.3088,
      "grad_norm": 27.994157791137695,
      "learning_rate": 3.6800000000000003e-06,
      "loss": 0.2887,
      "step": 3350
    },
    {
      "epoch": 1.3138,
      "grad_norm": 32.397579193115234,
      "learning_rate": 3.624444444444445e-06,
      "loss": 0.2368,
      "step": 3375
    },
    {
      "epoch": 1.3188,
      "grad_norm": 25.96181869506836,
      "learning_rate": 3.568888888888889e-06,
      "loss": 0.2911,
      "step": 3400
    },
    {
      "epoch": 1.3237999999999999,
      "grad_norm": 7.705018520355225,
      "learning_rate": 3.5133333333333337e-06,
      "loss": 0.2647,
      "step": 3425
    },
    {
      "epoch": 1.3288,
      "grad_norm": 28.499221801757812,
      "learning_rate": 3.457777777777778e-06,
      "loss": 0.4149,
      "step": 3450
    },
    {
      "epoch": 1.3338,
      "grad_norm": 40.06334686279297,
      "learning_rate": 3.4022222222222222e-06,
      "loss": 0.2952,
      "step": 3475
    },
    {
      "epoch": 1.3388,
      "grad_norm": 1.474075198173523,
      "learning_rate": 3.346666666666667e-06,
      "loss": 0.207,
      "step": 3500
    },
    {
      "epoch": 1.3437999999999999,
      "grad_norm": 27.121145248413086,
      "learning_rate": 3.2911111111111116e-06,
      "loss": 0.3432,
      "step": 3525
    },
    {
      "epoch": 1.3488,
      "grad_norm": 22.38213539123535,
      "learning_rate": 3.2355555555555556e-06,
      "loss": 0.3522,
      "step": 3550
    },
    {
      "epoch": 1.3538000000000001,
      "grad_norm": 42.13764190673828,
      "learning_rate": 3.1800000000000005e-06,
      "loss": 0.3729,
      "step": 3575
    },
    {
      "epoch": 1.3588,
      "grad_norm": 27.436649322509766,
      "learning_rate": 3.124444444444445e-06,
      "loss": 0.1482,
      "step": 3600
    },
    {
      "epoch": 2.0026,
      "grad_norm": 16.43301010131836,
      "learning_rate": 3.068888888888889e-06,
      "loss": 0.1802,
      "step": 3625
    },
    {
      "epoch": 2.0076,
      "grad_norm": 11.994711875915527,
      "learning_rate": 3.013333333333334e-06,
      "loss": 0.0922,
      "step": 3650
    },
    {
      "epoch": 2.0126,
      "grad_norm": 7.826560020446777,
      "learning_rate": 2.957777777777778e-06,
      "loss": 0.0312,
      "step": 3675
    },
    {
      "epoch": 2.0176,
      "grad_norm": 6.457350730895996,
      "learning_rate": 2.9022222222222223e-06,
      "loss": 0.0343,
      "step": 3700
    },
    {
      "epoch": 2.0226,
      "grad_norm": 1.624599575996399,
      "learning_rate": 2.8466666666666672e-06,
      "loss": 0.0751,
      "step": 3725
    },
    {
      "epoch": 2.0276,
      "grad_norm": 4.553808212280273,
      "learning_rate": 2.7911111111111113e-06,
      "loss": 0.0905,
      "step": 3750
    },
    {
      "epoch": 2.0326,
      "grad_norm": 4.33929967880249,
      "learning_rate": 2.7355555555555557e-06,
      "loss": 0.079,
      "step": 3775
    },
    {
      "epoch": 2.0376,
      "grad_norm": 11.390565872192383,
      "learning_rate": 2.68e-06,
      "loss": 0.064,
      "step": 3800
    },
    {
      "epoch": 2.0426,
      "grad_norm": 0.5454270243644714,
      "learning_rate": 2.6244444444444446e-06,
      "loss": 0.0351,
      "step": 3825
    },
    {
      "epoch": 2.0476,
      "grad_norm": 25.50840950012207,
      "learning_rate": 2.568888888888889e-06,
      "loss": 0.0726,
      "step": 3850
    },
    {
      "epoch": 2.0526,
      "grad_norm": 5.380075931549072,
      "learning_rate": 2.5133333333333336e-06,
      "loss": 0.0405,
      "step": 3875
    },
    {
      "epoch": 2.0576,
      "grad_norm": 1.0126858949661255,
      "learning_rate": 2.457777777777778e-06,
      "loss": 0.0858,
      "step": 3900
    },
    {
      "epoch": 2.0626,
      "grad_norm": 1.7211323976516724,
      "learning_rate": 2.4022222222222225e-06,
      "loss": 0.0431,
      "step": 3925
    },
    {
      "epoch": 2.0676,
      "grad_norm": 2.049405574798584,
      "learning_rate": 2.346666666666667e-06,
      "loss": 0.0127,
      "step": 3950
    },
    {
      "epoch": 2.0726,
      "grad_norm": 1.535748839378357,
      "learning_rate": 2.2911111111111114e-06,
      "loss": 0.0559,
      "step": 3975
    },
    {
      "epoch": 2.0776,
      "grad_norm": 19.566328048706055,
      "learning_rate": 2.235555555555556e-06,
      "loss": 0.0544,
      "step": 4000
    },
    {
      "epoch": 2.0776,
      "eval_cer": 2.2857142857142856,
      "eval_loss": 0.029534637928009033,
      "eval_model_preparation_time": 0.0121,
      "eval_runtime": 119.7419,
      "eval_samples_per_second": 2.505,
      "eval_steps_per_second": 1.253,
      "eval_wer": 2.307692307692308,
      "step": 4000
    },
    {
      "epoch": 2.0826000000000002,
      "grad_norm": 0.4915294945240021,
      "learning_rate": 2.1800000000000003e-06,
      "loss": 0.0556,
      "step": 4025
    },
    {
      "epoch": 2.0876,
      "grad_norm": 13.67375659942627,
      "learning_rate": 2.1244444444444443e-06,
      "loss": 0.0377,
      "step": 4050
    },
    {
      "epoch": 2.0926,
      "grad_norm": 0.06471225619316101,
      "learning_rate": 2.0688888888888892e-06,
      "loss": 0.0535,
      "step": 4075
    },
    {
      "epoch": 2.0976,
      "grad_norm": 0.1591552495956421,
      "learning_rate": 2.0133333333333337e-06,
      "loss": 0.037,
      "step": 4100
    },
    {
      "epoch": 2.1026,
      "grad_norm": 1.6957018375396729,
      "learning_rate": 1.9577777777777777e-06,
      "loss": 0.0548,
      "step": 4125
    },
    {
      "epoch": 2.1076,
      "grad_norm": 7.906589508056641,
      "learning_rate": 1.9022222222222222e-06,
      "loss": 0.066,
      "step": 4150
    },
    {
      "epoch": 2.1126,
      "grad_norm": 0.4227987825870514,
      "learning_rate": 1.8466666666666668e-06,
      "loss": 0.0687,
      "step": 4175
    },
    {
      "epoch": 2.1176,
      "grad_norm": 0.3206275999546051,
      "learning_rate": 1.7911111111111113e-06,
      "loss": 0.0263,
      "step": 4200
    },
    {
      "epoch": 2.1226,
      "grad_norm": 0.16232123970985413,
      "learning_rate": 1.7355555555555555e-06,
      "loss": 0.0257,
      "step": 4225
    },
    {
      "epoch": 2.1276,
      "grad_norm": 21.80668830871582,
      "learning_rate": 1.6800000000000002e-06,
      "loss": 0.0346,
      "step": 4250
    },
    {
      "epoch": 2.1326,
      "grad_norm": 3.021885871887207,
      "learning_rate": 1.6244444444444447e-06,
      "loss": 0.0715,
      "step": 4275
    },
    {
      "epoch": 2.1376,
      "grad_norm": 9.51052188873291,
      "learning_rate": 1.568888888888889e-06,
      "loss": 0.0686,
      "step": 4300
    },
    {
      "epoch": 2.1426,
      "grad_norm": 7.770538806915283,
      "learning_rate": 1.5133333333333334e-06,
      "loss": 0.0681,
      "step": 4325
    },
    {
      "epoch": 2.1476,
      "grad_norm": 24.541362762451172,
      "learning_rate": 1.457777777777778e-06,
      "loss": 0.0589,
      "step": 4350
    },
    {
      "epoch": 2.1526,
      "grad_norm": 0.2108163684606552,
      "learning_rate": 1.4022222222222223e-06,
      "loss": 0.0407,
      "step": 4375
    },
    {
      "epoch": 2.1576,
      "grad_norm": 39.73221206665039,
      "learning_rate": 1.3466666666666668e-06,
      "loss": 0.0895,
      "step": 4400
    },
    {
      "epoch": 2.1626,
      "grad_norm": 4.3332438468933105,
      "learning_rate": 1.2911111111111112e-06,
      "loss": 0.0645,
      "step": 4425
    },
    {
      "epoch": 2.1676,
      "grad_norm": 1.142255425453186,
      "learning_rate": 1.2355555555555557e-06,
      "loss": 0.0592,
      "step": 4450
    },
    {
      "epoch": 2.1726,
      "grad_norm": 0.046843066811561584,
      "learning_rate": 1.1800000000000001e-06,
      "loss": 0.034,
      "step": 4475
    },
    {
      "epoch": 2.1776,
      "grad_norm": 14.69985580444336,
      "learning_rate": 1.1244444444444446e-06,
      "loss": 0.0704,
      "step": 4500
    },
    {
      "epoch": 2.1826,
      "grad_norm": 11.02341079711914,
      "learning_rate": 1.068888888888889e-06,
      "loss": 0.0621,
      "step": 4525
    },
    {
      "epoch": 2.1875999999999998,
      "grad_norm": 0.05967080965638161,
      "learning_rate": 1.0133333333333333e-06,
      "loss": 0.0331,
      "step": 4550
    },
    {
      "epoch": 2.1926,
      "grad_norm": 9.536999702453613,
      "learning_rate": 9.57777777777778e-07,
      "loss": 0.0762,
      "step": 4575
    },
    {
      "epoch": 2.1976,
      "grad_norm": 3.740366220474243,
      "learning_rate": 9.022222222222222e-07,
      "loss": 0.0346,
      "step": 4600
    },
    {
      "epoch": 2.2026,
      "grad_norm": 1.1851881742477417,
      "learning_rate": 8.466666666666668e-07,
      "loss": 0.0502,
      "step": 4625
    },
    {
      "epoch": 2.2076000000000002,
      "grad_norm": 23.711048126220703,
      "learning_rate": 7.911111111111111e-07,
      "loss": 0.0661,
      "step": 4650
    },
    {
      "epoch": 2.2126,
      "grad_norm": 1.0020112991333008,
      "learning_rate": 7.355555555555556e-07,
      "loss": 0.0252,
      "step": 4675
    },
    {
      "epoch": 2.2176,
      "grad_norm": 21.021774291992188,
      "learning_rate": 6.800000000000001e-07,
      "loss": 0.0415,
      "step": 4700
    },
    {
      "epoch": 2.2226,
      "grad_norm": 0.8725367784500122,
      "learning_rate": 6.244444444444445e-07,
      "loss": 0.0293,
      "step": 4725
    },
    {
      "epoch": 2.2276,
      "grad_norm": 0.3742901086807251,
      "learning_rate": 5.68888888888889e-07,
      "loss": 0.0146,
      "step": 4750
    },
    {
      "epoch": 2.2326,
      "grad_norm": 12.225566864013672,
      "learning_rate": 5.133333333333334e-07,
      "loss": 0.0775,
      "step": 4775
    },
    {
      "epoch": 2.2376,
      "grad_norm": 0.5857837200164795,
      "learning_rate": 4.5777777777777784e-07,
      "loss": 0.0335,
      "step": 4800
    },
    {
      "epoch": 2.2426,
      "grad_norm": 2.0213797092437744,
      "learning_rate": 4.0222222222222224e-07,
      "loss": 0.06,
      "step": 4825
    },
    {
      "epoch": 2.2476,
      "grad_norm": 0.22115977108478546,
      "learning_rate": 3.466666666666667e-07,
      "loss": 0.0437,
      "step": 4850
    },
    {
      "epoch": 2.2526,
      "grad_norm": 11.783333778381348,
      "learning_rate": 2.9111111111111116e-07,
      "loss": 0.0454,
      "step": 4875
    },
    {
      "epoch": 2.2576,
      "grad_norm": 2.67048978805542,
      "learning_rate": 2.3555555555555556e-07,
      "loss": 0.093,
      "step": 4900
    },
    {
      "epoch": 2.2626,
      "grad_norm": 0.16820040345191956,
      "learning_rate": 1.8e-07,
      "loss": 0.1193,
      "step": 4925
    },
    {
      "epoch": 2.2676,
      "grad_norm": 1.518930435180664,
      "learning_rate": 1.2444444444444446e-07,
      "loss": 0.0493,
      "step": 4950
    },
    {
      "epoch": 2.2726,
      "grad_norm": 0.10118613392114639,
      "learning_rate": 6.888888888888889e-08,
      "loss": 0.0538,
      "step": 4975
    },
    {
      "epoch": 2.2776,
      "grad_norm": 0.5119066834449768,
      "learning_rate": 1.3333333333333334e-08,
      "loss": 0.0678,
      "step": 5000
    },
    {
      "epoch": 2.2776,
      "eval_cer": 0.9523809523809524,
      "eval_loss": 0.014132725074887276,
      "eval_model_preparation_time": 0.0121,
      "eval_runtime": 119.856,
      "eval_samples_per_second": 2.503,
      "eval_steps_per_second": 1.252,
      "eval_wer": 0.9615384615384616,
      "step": 5000
    },
    {
      "epoch": 2.2776,
      "step": 5000,
      "total_flos": 1.02060490752e+19,
      "train_loss": 0.5477467903137208,
      "train_runtime": 5409.143,
      "train_samples_per_second": 1.849,
      "train_steps_per_second": 0.924
    }
  ],
  "logging_steps": 25,
  "max_steps": 5000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 9223372036854775807,
  "save_steps": 1000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.02060490752e+19,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}