|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 453.84615384615387, |
|
"eval_steps": 100, |
|
"global_step": 29500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 1.5384615384615383, |
|
"grad_norm": 47.11796188354492, |
|
"learning_rate": 9.990900000000001e-06, |
|
"loss": 3.6644, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.5384615384615383, |
|
"eval_loss": 2.4919605255126953, |
|
"eval_runtime": 12.5517, |
|
"eval_samples_per_second": 10.437, |
|
"eval_steps_per_second": 1.354, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 3.076923076923077, |
|
"grad_norm": 31.28130531311035, |
|
"learning_rate": 9.980900000000001e-06, |
|
"loss": 2.2347, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 3.076923076923077, |
|
"eval_loss": 2.156316041946411, |
|
"eval_runtime": 11.1792, |
|
"eval_samples_per_second": 11.718, |
|
"eval_steps_per_second": 1.521, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 4.615384615384615, |
|
"grad_norm": 39.616390228271484, |
|
"learning_rate": 9.970900000000001e-06, |
|
"loss": 2.0254, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 4.615384615384615, |
|
"eval_loss": 2.024153709411621, |
|
"eval_runtime": 11.1934, |
|
"eval_samples_per_second": 11.703, |
|
"eval_steps_per_second": 1.519, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 6.153846153846154, |
|
"grad_norm": 28.285825729370117, |
|
"learning_rate": 9.960900000000001e-06, |
|
"loss": 1.9361, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 6.153846153846154, |
|
"eval_loss": 1.9094743728637695, |
|
"eval_runtime": 11.3855, |
|
"eval_samples_per_second": 11.506, |
|
"eval_steps_per_second": 1.493, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 7.6923076923076925, |
|
"grad_norm": 34.14302062988281, |
|
"learning_rate": 9.950900000000002e-06, |
|
"loss": 1.8531, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 7.6923076923076925, |
|
"eval_loss": 1.8729331493377686, |
|
"eval_runtime": 11.2935, |
|
"eval_samples_per_second": 11.6, |
|
"eval_steps_per_second": 1.505, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 9.23076923076923, |
|
"grad_norm": 39.09531784057617, |
|
"learning_rate": 9.940900000000002e-06, |
|
"loss": 1.7669, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 9.23076923076923, |
|
"eval_loss": 1.831756830215454, |
|
"eval_runtime": 11.0535, |
|
"eval_samples_per_second": 11.851, |
|
"eval_steps_per_second": 1.538, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 10.76923076923077, |
|
"grad_norm": 93.24444580078125, |
|
"learning_rate": 9.930900000000002e-06, |
|
"loss": 1.7518, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 10.76923076923077, |
|
"eval_loss": 1.7832175493240356, |
|
"eval_runtime": 11.1684, |
|
"eval_samples_per_second": 11.729, |
|
"eval_steps_per_second": 1.522, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 12.307692307692308, |
|
"grad_norm": 32.21013641357422, |
|
"learning_rate": 9.920900000000002e-06, |
|
"loss": 1.7149, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 12.307692307692308, |
|
"eval_loss": 1.7581098079681396, |
|
"eval_runtime": 11.101, |
|
"eval_samples_per_second": 11.801, |
|
"eval_steps_per_second": 1.531, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 13.846153846153847, |
|
"grad_norm": 59.90657043457031, |
|
"learning_rate": 9.9109e-06, |
|
"loss": 1.6734, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 13.846153846153847, |
|
"eval_loss": 1.7163844108581543, |
|
"eval_runtime": 11.1167, |
|
"eval_samples_per_second": 11.784, |
|
"eval_steps_per_second": 1.529, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 15.384615384615385, |
|
"grad_norm": 20.61592674255371, |
|
"learning_rate": 9.9009e-06, |
|
"loss": 1.6612, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 15.384615384615385, |
|
"eval_loss": 1.6949567794799805, |
|
"eval_runtime": 11.0663, |
|
"eval_samples_per_second": 11.838, |
|
"eval_steps_per_second": 1.536, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 16.923076923076923, |
|
"grad_norm": 17.60099220275879, |
|
"learning_rate": 9.8909e-06, |
|
"loss": 1.6199, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 16.923076923076923, |
|
"eval_loss": 1.6769332885742188, |
|
"eval_runtime": 11.0531, |
|
"eval_samples_per_second": 11.852, |
|
"eval_steps_per_second": 1.538, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 18.46153846153846, |
|
"grad_norm": 20.802692413330078, |
|
"learning_rate": 9.8809e-06, |
|
"loss": 1.6008, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 18.46153846153846, |
|
"eval_loss": 1.6524990797042847, |
|
"eval_runtime": 11.0831, |
|
"eval_samples_per_second": 11.82, |
|
"eval_steps_per_second": 1.534, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"grad_norm": 21.809823989868164, |
|
"learning_rate": 9.8709e-06, |
|
"loss": 1.5812, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_loss": 1.6428295373916626, |
|
"eval_runtime": 11.1093, |
|
"eval_samples_per_second": 11.792, |
|
"eval_steps_per_second": 1.53, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 21.53846153846154, |
|
"grad_norm": 46.8908576965332, |
|
"learning_rate": 9.8609e-06, |
|
"loss": 1.5419, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 21.53846153846154, |
|
"eval_loss": 1.6006404161453247, |
|
"eval_runtime": 11.2393, |
|
"eval_samples_per_second": 11.655, |
|
"eval_steps_per_second": 1.513, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 23.076923076923077, |
|
"grad_norm": 27.15238380432129, |
|
"learning_rate": 9.8509e-06, |
|
"loss": 1.5374, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 23.076923076923077, |
|
"eval_loss": 1.5862094163894653, |
|
"eval_runtime": 11.1815, |
|
"eval_samples_per_second": 11.716, |
|
"eval_steps_per_second": 1.52, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 24.615384615384617, |
|
"grad_norm": 40.26778030395508, |
|
"learning_rate": 9.840900000000001e-06, |
|
"loss": 1.4923, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 24.615384615384617, |
|
"eval_loss": 1.576373815536499, |
|
"eval_runtime": 11.1215, |
|
"eval_samples_per_second": 11.779, |
|
"eval_steps_per_second": 1.529, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 26.153846153846153, |
|
"grad_norm": 35.266971588134766, |
|
"learning_rate": 9.830900000000001e-06, |
|
"loss": 1.4989, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 26.153846153846153, |
|
"eval_loss": 1.5671430826187134, |
|
"eval_runtime": 11.1873, |
|
"eval_samples_per_second": 11.71, |
|
"eval_steps_per_second": 1.52, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 27.692307692307693, |
|
"grad_norm": 26.813480377197266, |
|
"learning_rate": 9.820900000000001e-06, |
|
"loss": 1.4711, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 27.692307692307693, |
|
"eval_loss": 1.522908329963684, |
|
"eval_runtime": 11.2106, |
|
"eval_samples_per_second": 11.685, |
|
"eval_steps_per_second": 1.516, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 29.23076923076923, |
|
"grad_norm": 24.576723098754883, |
|
"learning_rate": 9.810900000000001e-06, |
|
"loss": 1.4421, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 29.23076923076923, |
|
"eval_loss": 1.5039104223251343, |
|
"eval_runtime": 11.257, |
|
"eval_samples_per_second": 11.637, |
|
"eval_steps_per_second": 1.51, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 30.76923076923077, |
|
"grad_norm": 28.480438232421875, |
|
"learning_rate": 9.800900000000001e-06, |
|
"loss": 1.4347, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 30.76923076923077, |
|
"eval_loss": 1.5123459100723267, |
|
"eval_runtime": 11.187, |
|
"eval_samples_per_second": 11.71, |
|
"eval_steps_per_second": 1.52, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 32.30769230769231, |
|
"grad_norm": 56.582088470458984, |
|
"learning_rate": 9.790900000000001e-06, |
|
"loss": 1.4212, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 32.30769230769231, |
|
"eval_loss": 1.481844425201416, |
|
"eval_runtime": 11.2075, |
|
"eval_samples_per_second": 11.689, |
|
"eval_steps_per_second": 1.517, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 33.84615384615385, |
|
"grad_norm": 38.5254020690918, |
|
"learning_rate": 9.780900000000002e-06, |
|
"loss": 1.3908, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 33.84615384615385, |
|
"eval_loss": 1.4529048204421997, |
|
"eval_runtime": 11.197, |
|
"eval_samples_per_second": 11.7, |
|
"eval_steps_per_second": 1.518, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 35.38461538461539, |
|
"grad_norm": 44.74857711791992, |
|
"learning_rate": 9.770900000000002e-06, |
|
"loss": 1.3734, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 35.38461538461539, |
|
"eval_loss": 1.4617427587509155, |
|
"eval_runtime": 11.1235, |
|
"eval_samples_per_second": 11.777, |
|
"eval_steps_per_second": 1.528, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 36.92307692307692, |
|
"grad_norm": 28.926782608032227, |
|
"learning_rate": 9.760900000000002e-06, |
|
"loss": 1.365, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 36.92307692307692, |
|
"eval_loss": 1.4297789335250854, |
|
"eval_runtime": 11.3007, |
|
"eval_samples_per_second": 11.592, |
|
"eval_steps_per_second": 1.504, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 38.46153846153846, |
|
"grad_norm": 27.750213623046875, |
|
"learning_rate": 9.7509e-06, |
|
"loss": 1.3306, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 38.46153846153846, |
|
"eval_loss": 1.4345914125442505, |
|
"eval_runtime": 11.2795, |
|
"eval_samples_per_second": 11.614, |
|
"eval_steps_per_second": 1.507, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"grad_norm": 35.288352966308594, |
|
"learning_rate": 9.7409e-06, |
|
"loss": 1.3677, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"eval_loss": 1.447089433670044, |
|
"eval_runtime": 11.1366, |
|
"eval_samples_per_second": 11.763, |
|
"eval_steps_per_second": 1.527, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 41.53846153846154, |
|
"grad_norm": 65.49736022949219, |
|
"learning_rate": 9.7309e-06, |
|
"loss": 1.3453, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 41.53846153846154, |
|
"eval_loss": 1.405300259590149, |
|
"eval_runtime": 11.1549, |
|
"eval_samples_per_second": 11.744, |
|
"eval_steps_per_second": 1.524, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 43.07692307692308, |
|
"grad_norm": 24.333518981933594, |
|
"learning_rate": 9.7209e-06, |
|
"loss": 1.3206, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 43.07692307692308, |
|
"eval_loss": 1.4218717813491821, |
|
"eval_runtime": 11.3113, |
|
"eval_samples_per_second": 11.581, |
|
"eval_steps_per_second": 1.503, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 44.61538461538461, |
|
"grad_norm": 45.50777816772461, |
|
"learning_rate": 9.7109e-06, |
|
"loss": 1.3363, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 44.61538461538461, |
|
"eval_loss": 1.4220006465911865, |
|
"eval_runtime": 11.156, |
|
"eval_samples_per_second": 11.743, |
|
"eval_steps_per_second": 1.524, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 46.15384615384615, |
|
"grad_norm": 25.898344039916992, |
|
"learning_rate": 9.7009e-06, |
|
"loss": 1.2995, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 46.15384615384615, |
|
"eval_loss": 1.3942431211471558, |
|
"eval_runtime": 11.2282, |
|
"eval_samples_per_second": 11.667, |
|
"eval_steps_per_second": 1.514, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 47.69230769230769, |
|
"grad_norm": 56.0889892578125, |
|
"learning_rate": 9.6909e-06, |
|
"loss": 1.2994, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 47.69230769230769, |
|
"eval_loss": 1.3970586061477661, |
|
"eval_runtime": 11.1682, |
|
"eval_samples_per_second": 11.73, |
|
"eval_steps_per_second": 1.522, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 49.23076923076923, |
|
"grad_norm": 58.10311508178711, |
|
"learning_rate": 9.6809e-06, |
|
"loss": 1.2761, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 49.23076923076923, |
|
"eval_loss": 1.390371561050415, |
|
"eval_runtime": 11.2406, |
|
"eval_samples_per_second": 11.654, |
|
"eval_steps_per_second": 1.512, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 50.76923076923077, |
|
"grad_norm": 17.050870895385742, |
|
"learning_rate": 9.670900000000001e-06, |
|
"loss": 1.2712, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 50.76923076923077, |
|
"eval_loss": 1.3936753273010254, |
|
"eval_runtime": 11.2364, |
|
"eval_samples_per_second": 11.659, |
|
"eval_steps_per_second": 1.513, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 52.30769230769231, |
|
"grad_norm": 232.21804809570312, |
|
"learning_rate": 9.660900000000001e-06, |
|
"loss": 1.262, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 52.30769230769231, |
|
"eval_loss": 1.4037091732025146, |
|
"eval_runtime": 11.2231, |
|
"eval_samples_per_second": 11.672, |
|
"eval_steps_per_second": 1.515, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 53.84615384615385, |
|
"grad_norm": 35.11832046508789, |
|
"learning_rate": 9.650900000000001e-06, |
|
"loss": 1.2788, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 53.84615384615385, |
|
"eval_loss": 1.3545280694961548, |
|
"eval_runtime": 11.1609, |
|
"eval_samples_per_second": 11.737, |
|
"eval_steps_per_second": 1.523, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 55.38461538461539, |
|
"grad_norm": 27.077022552490234, |
|
"learning_rate": 9.640900000000001e-06, |
|
"loss": 1.2711, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 55.38461538461539, |
|
"eval_loss": 1.3528518676757812, |
|
"eval_runtime": 11.2852, |
|
"eval_samples_per_second": 11.608, |
|
"eval_steps_per_second": 1.506, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 56.92307692307692, |
|
"grad_norm": 46.97712326049805, |
|
"learning_rate": 9.630900000000001e-06, |
|
"loss": 1.2492, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 56.92307692307692, |
|
"eval_loss": 1.3534098863601685, |
|
"eval_runtime": 11.2368, |
|
"eval_samples_per_second": 11.658, |
|
"eval_steps_per_second": 1.513, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 58.46153846153846, |
|
"grad_norm": 42.06857681274414, |
|
"learning_rate": 9.620900000000001e-06, |
|
"loss": 1.2506, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 58.46153846153846, |
|
"eval_loss": 1.3613977432250977, |
|
"eval_runtime": 11.2206, |
|
"eval_samples_per_second": 11.675, |
|
"eval_steps_per_second": 1.515, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 60.0, |
|
"grad_norm": 19.298952102661133, |
|
"learning_rate": 9.610900000000001e-06, |
|
"loss": 1.2201, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 60.0, |
|
"eval_loss": 1.3586474657058716, |
|
"eval_runtime": 11.2045, |
|
"eval_samples_per_second": 11.692, |
|
"eval_steps_per_second": 1.517, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 61.53846153846154, |
|
"grad_norm": 30.0198974609375, |
|
"learning_rate": 9.600900000000002e-06, |
|
"loss": 1.2086, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 61.53846153846154, |
|
"eval_loss": 1.3304755687713623, |
|
"eval_runtime": 11.193, |
|
"eval_samples_per_second": 11.704, |
|
"eval_steps_per_second": 1.519, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 63.07692307692308, |
|
"grad_norm": 37.59902572631836, |
|
"learning_rate": 9.5909e-06, |
|
"loss": 1.2375, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 63.07692307692308, |
|
"eval_loss": 1.331407904624939, |
|
"eval_runtime": 10.6714, |
|
"eval_samples_per_second": 12.276, |
|
"eval_steps_per_second": 1.593, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 64.61538461538461, |
|
"grad_norm": 36.82079315185547, |
|
"learning_rate": 9.5809e-06, |
|
"loss": 1.2148, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 64.61538461538461, |
|
"eval_loss": 1.3441548347473145, |
|
"eval_runtime": 10.7472, |
|
"eval_samples_per_second": 12.189, |
|
"eval_steps_per_second": 1.582, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 66.15384615384616, |
|
"grad_norm": 30.974130630493164, |
|
"learning_rate": 9.5709e-06, |
|
"loss": 1.197, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 66.15384615384616, |
|
"eval_loss": 1.34512197971344, |
|
"eval_runtime": 10.7398, |
|
"eval_samples_per_second": 12.198, |
|
"eval_steps_per_second": 1.583, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 67.6923076923077, |
|
"grad_norm": 20.45345115661621, |
|
"learning_rate": 9.5609e-06, |
|
"loss": 1.2361, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 67.6923076923077, |
|
"eval_loss": 1.3371080160140991, |
|
"eval_runtime": 10.654, |
|
"eval_samples_per_second": 12.296, |
|
"eval_steps_per_second": 1.596, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 69.23076923076923, |
|
"grad_norm": 19.758630752563477, |
|
"learning_rate": 9.5509e-06, |
|
"loss": 1.2001, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 69.23076923076923, |
|
"eval_loss": 1.3270760774612427, |
|
"eval_runtime": 10.6928, |
|
"eval_samples_per_second": 12.251, |
|
"eval_steps_per_second": 1.59, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 70.76923076923077, |
|
"grad_norm": 45.2899055480957, |
|
"learning_rate": 9.5409e-06, |
|
"loss": 1.192, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 70.76923076923077, |
|
"eval_loss": 1.3184590339660645, |
|
"eval_runtime": 10.7154, |
|
"eval_samples_per_second": 12.225, |
|
"eval_steps_per_second": 1.586, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 72.3076923076923, |
|
"grad_norm": 45.60734939575195, |
|
"learning_rate": 9.5309e-06, |
|
"loss": 1.2081, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 72.3076923076923, |
|
"eval_loss": 1.3107666969299316, |
|
"eval_runtime": 10.7029, |
|
"eval_samples_per_second": 12.24, |
|
"eval_steps_per_second": 1.588, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 73.84615384615384, |
|
"grad_norm": 26.859603881835938, |
|
"learning_rate": 9.5209e-06, |
|
"loss": 1.1729, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 73.84615384615384, |
|
"eval_loss": 1.310062289237976, |
|
"eval_runtime": 10.7544, |
|
"eval_samples_per_second": 12.181, |
|
"eval_steps_per_second": 1.581, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 75.38461538461539, |
|
"grad_norm": 19.90914535522461, |
|
"learning_rate": 9.5109e-06, |
|
"loss": 1.1899, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 75.38461538461539, |
|
"eval_loss": 1.3038017749786377, |
|
"eval_runtime": 10.6532, |
|
"eval_samples_per_second": 12.297, |
|
"eval_steps_per_second": 1.596, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 76.92307692307692, |
|
"grad_norm": 22.629934310913086, |
|
"learning_rate": 9.5009e-06, |
|
"loss": 1.1875, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 76.92307692307692, |
|
"eval_loss": 1.2785409688949585, |
|
"eval_runtime": 10.6388, |
|
"eval_samples_per_second": 12.313, |
|
"eval_steps_per_second": 1.598, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 78.46153846153847, |
|
"grad_norm": 45.462059020996094, |
|
"learning_rate": 9.490900000000001e-06, |
|
"loss": 1.1717, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 78.46153846153847, |
|
"eval_loss": 1.278078317642212, |
|
"eval_runtime": 10.9666, |
|
"eval_samples_per_second": 11.945, |
|
"eval_steps_per_second": 1.55, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 80.0, |
|
"grad_norm": 34.85255432128906, |
|
"learning_rate": 9.480900000000001e-06, |
|
"loss": 1.1657, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 80.0, |
|
"eval_loss": 1.2711185216903687, |
|
"eval_runtime": 11.0211, |
|
"eval_samples_per_second": 11.886, |
|
"eval_steps_per_second": 1.543, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 81.53846153846153, |
|
"grad_norm": 19.078449249267578, |
|
"learning_rate": 9.470900000000001e-06, |
|
"loss": 1.1814, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 81.53846153846153, |
|
"eval_loss": 1.2781996726989746, |
|
"eval_runtime": 11.0663, |
|
"eval_samples_per_second": 11.838, |
|
"eval_steps_per_second": 1.536, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 83.07692307692308, |
|
"grad_norm": 31.05898094177246, |
|
"learning_rate": 9.460900000000001e-06, |
|
"loss": 1.1452, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 83.07692307692308, |
|
"eval_loss": 1.2848775386810303, |
|
"eval_runtime": 11.01, |
|
"eval_samples_per_second": 11.898, |
|
"eval_steps_per_second": 1.544, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 84.61538461538461, |
|
"grad_norm": 28.712461471557617, |
|
"learning_rate": 9.450900000000001e-06, |
|
"loss": 1.1465, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 84.61538461538461, |
|
"eval_loss": 1.2928494215011597, |
|
"eval_runtime": 10.9253, |
|
"eval_samples_per_second": 11.991, |
|
"eval_steps_per_second": 1.556, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 86.15384615384616, |
|
"grad_norm": 19.871828079223633, |
|
"learning_rate": 9.440900000000001e-06, |
|
"loss": 1.1736, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 86.15384615384616, |
|
"eval_loss": 1.2648124694824219, |
|
"eval_runtime": 11.0314, |
|
"eval_samples_per_second": 11.875, |
|
"eval_steps_per_second": 1.541, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 87.6923076923077, |
|
"grad_norm": 22.47665023803711, |
|
"learning_rate": 9.4309e-06, |
|
"loss": 1.1184, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 87.6923076923077, |
|
"eval_loss": 1.2936598062515259, |
|
"eval_runtime": 10.9322, |
|
"eval_samples_per_second": 11.983, |
|
"eval_steps_per_second": 1.555, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 89.23076923076923, |
|
"grad_norm": 38.79877471923828, |
|
"learning_rate": 9.4209e-06, |
|
"loss": 1.1616, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 89.23076923076923, |
|
"eval_loss": 1.2650004625320435, |
|
"eval_runtime": 10.9434, |
|
"eval_samples_per_second": 11.971, |
|
"eval_steps_per_second": 1.553, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 90.76923076923077, |
|
"grad_norm": 40.851097106933594, |
|
"learning_rate": 9.4109e-06, |
|
"loss": 1.1469, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 90.76923076923077, |
|
"eval_loss": 1.252148151397705, |
|
"eval_runtime": 10.9867, |
|
"eval_samples_per_second": 11.923, |
|
"eval_steps_per_second": 1.547, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 92.3076923076923, |
|
"grad_norm": 21.35544204711914, |
|
"learning_rate": 9.4009e-06, |
|
"loss": 1.1489, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 92.3076923076923, |
|
"eval_loss": 1.259344220161438, |
|
"eval_runtime": 10.9649, |
|
"eval_samples_per_second": 11.947, |
|
"eval_steps_per_second": 1.55, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 93.84615384615384, |
|
"grad_norm": 33.265132904052734, |
|
"learning_rate": 9.3909e-06, |
|
"loss": 1.1315, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 93.84615384615384, |
|
"eval_loss": 1.252693772315979, |
|
"eval_runtime": 10.9852, |
|
"eval_samples_per_second": 11.925, |
|
"eval_steps_per_second": 1.548, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 95.38461538461539, |
|
"grad_norm": 23.43667221069336, |
|
"learning_rate": 9.381e-06, |
|
"loss": 1.119, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 95.38461538461539, |
|
"eval_loss": 1.254772424697876, |
|
"eval_runtime": 11.0937, |
|
"eval_samples_per_second": 11.808, |
|
"eval_steps_per_second": 1.532, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 96.92307692307692, |
|
"grad_norm": 51.93602752685547, |
|
"learning_rate": 9.371e-06, |
|
"loss": 1.1333, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 96.92307692307692, |
|
"eval_loss": 1.249408483505249, |
|
"eval_runtime": 11.0388, |
|
"eval_samples_per_second": 11.867, |
|
"eval_steps_per_second": 1.54, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 98.46153846153847, |
|
"grad_norm": 23.473421096801758, |
|
"learning_rate": 9.361e-06, |
|
"loss": 1.1164, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 98.46153846153847, |
|
"eval_loss": 1.2438522577285767, |
|
"eval_runtime": 11.1581, |
|
"eval_samples_per_second": 11.74, |
|
"eval_steps_per_second": 1.524, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 100.0, |
|
"grad_norm": 24.228403091430664, |
|
"learning_rate": 9.351e-06, |
|
"loss": 1.1333, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 100.0, |
|
"eval_loss": 1.248632788658142, |
|
"eval_runtime": 10.9675, |
|
"eval_samples_per_second": 11.944, |
|
"eval_steps_per_second": 1.55, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 101.53846153846153, |
|
"grad_norm": 18.29631996154785, |
|
"learning_rate": 9.341000000000001e-06, |
|
"loss": 1.1082, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 101.53846153846153, |
|
"eval_loss": 1.2509865760803223, |
|
"eval_runtime": 11.0693, |
|
"eval_samples_per_second": 11.834, |
|
"eval_steps_per_second": 1.536, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 103.07692307692308, |
|
"grad_norm": 42.855491638183594, |
|
"learning_rate": 9.331000000000001e-06, |
|
"loss": 1.1178, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 103.07692307692308, |
|
"eval_loss": 1.2890292406082153, |
|
"eval_runtime": 11.0366, |
|
"eval_samples_per_second": 11.87, |
|
"eval_steps_per_second": 1.54, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 104.61538461538461, |
|
"grad_norm": 46.675655364990234, |
|
"learning_rate": 9.321000000000001e-06, |
|
"loss": 1.1106, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 104.61538461538461, |
|
"eval_loss": 1.2719863653182983, |
|
"eval_runtime": 11.1266, |
|
"eval_samples_per_second": 11.774, |
|
"eval_steps_per_second": 1.528, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 106.15384615384616, |
|
"grad_norm": 26.414846420288086, |
|
"learning_rate": 9.311000000000001e-06, |
|
"loss": 1.1216, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 106.15384615384616, |
|
"eval_loss": 1.2423394918441772, |
|
"eval_runtime": 11.0197, |
|
"eval_samples_per_second": 11.888, |
|
"eval_steps_per_second": 1.543, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 107.6923076923077, |
|
"grad_norm": 31.4022274017334, |
|
"learning_rate": 9.301000000000001e-06, |
|
"loss": 1.1052, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 107.6923076923077, |
|
"eval_loss": 1.2372961044311523, |
|
"eval_runtime": 11.1127, |
|
"eval_samples_per_second": 11.788, |
|
"eval_steps_per_second": 1.53, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 109.23076923076923, |
|
"grad_norm": 23.16703987121582, |
|
"learning_rate": 9.291000000000001e-06, |
|
"loss": 1.0911, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 109.23076923076923, |
|
"eval_loss": 1.2309863567352295, |
|
"eval_runtime": 10.9572, |
|
"eval_samples_per_second": 11.956, |
|
"eval_steps_per_second": 1.551, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 110.76923076923077, |
|
"grad_norm": 21.648773193359375, |
|
"learning_rate": 9.281000000000001e-06, |
|
"loss": 1.0956, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 110.76923076923077, |
|
"eval_loss": 1.2261079549789429, |
|
"eval_runtime": 10.9351, |
|
"eval_samples_per_second": 11.98, |
|
"eval_steps_per_second": 1.555, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 112.3076923076923, |
|
"grad_norm": 24.5791072845459, |
|
"learning_rate": 9.271000000000002e-06, |
|
"loss": 1.0751, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 112.3076923076923, |
|
"eval_loss": 1.2161471843719482, |
|
"eval_runtime": 11.1052, |
|
"eval_samples_per_second": 11.796, |
|
"eval_steps_per_second": 1.531, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 113.84615384615384, |
|
"grad_norm": 35.867801666259766, |
|
"learning_rate": 9.261000000000002e-06, |
|
"loss": 1.086, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 113.84615384615384, |
|
"eval_loss": 1.2092362642288208, |
|
"eval_runtime": 11.2048, |
|
"eval_samples_per_second": 11.691, |
|
"eval_steps_per_second": 1.517, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 115.38461538461539, |
|
"grad_norm": 67.91041564941406, |
|
"learning_rate": 9.251000000000002e-06, |
|
"loss": 1.092, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 115.38461538461539, |
|
"eval_loss": 1.241829514503479, |
|
"eval_runtime": 10.9937, |
|
"eval_samples_per_second": 11.916, |
|
"eval_steps_per_second": 1.546, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 116.92307692307692, |
|
"grad_norm": 128.73751831054688, |
|
"learning_rate": 9.241000000000002e-06, |
|
"loss": 1.0764, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 116.92307692307692, |
|
"eval_loss": 1.2462713718414307, |
|
"eval_runtime": 10.9625, |
|
"eval_samples_per_second": 11.95, |
|
"eval_steps_per_second": 1.551, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 118.46153846153847, |
|
"grad_norm": 86.5144271850586, |
|
"learning_rate": 9.231000000000002e-06, |
|
"loss": 1.0643, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 118.46153846153847, |
|
"eval_loss": 1.2187525033950806, |
|
"eval_runtime": 10.9492, |
|
"eval_samples_per_second": 11.964, |
|
"eval_steps_per_second": 1.553, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 120.0, |
|
"grad_norm": 19.4710750579834, |
|
"learning_rate": 9.221e-06, |
|
"loss": 1.0966, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 120.0, |
|
"eval_loss": 1.2282384634017944, |
|
"eval_runtime": 10.9078, |
|
"eval_samples_per_second": 12.01, |
|
"eval_steps_per_second": 1.559, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 121.53846153846153, |
|
"grad_norm": 37.673744201660156, |
|
"learning_rate": 9.211e-06, |
|
"loss": 1.0632, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 121.53846153846153, |
|
"eval_loss": 1.2206230163574219, |
|
"eval_runtime": 11.0018, |
|
"eval_samples_per_second": 11.907, |
|
"eval_steps_per_second": 1.545, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 123.07692307692308, |
|
"grad_norm": 25.10326385498047, |
|
"learning_rate": 9.201e-06, |
|
"loss": 1.0873, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 123.07692307692308, |
|
"eval_loss": 1.2137339115142822, |
|
"eval_runtime": 10.9232, |
|
"eval_samples_per_second": 11.993, |
|
"eval_steps_per_second": 1.556, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 124.61538461538461, |
|
"grad_norm": 32.02176284790039, |
|
"learning_rate": 9.191e-06, |
|
"loss": 1.0568, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 124.61538461538461, |
|
"eval_loss": 1.2065187692642212, |
|
"eval_runtime": 10.9614, |
|
"eval_samples_per_second": 11.951, |
|
"eval_steps_per_second": 1.551, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 126.15384615384616, |
|
"grad_norm": 19.97406005859375, |
|
"learning_rate": 9.181e-06, |
|
"loss": 1.065, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 126.15384615384616, |
|
"eval_loss": 1.2094841003417969, |
|
"eval_runtime": 11.0274, |
|
"eval_samples_per_second": 11.879, |
|
"eval_steps_per_second": 1.542, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 127.6923076923077, |
|
"grad_norm": 31.624399185180664, |
|
"learning_rate": 9.171e-06, |
|
"loss": 1.0805, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 127.6923076923077, |
|
"eval_loss": 1.2149733304977417, |
|
"eval_runtime": 11.2014, |
|
"eval_samples_per_second": 11.695, |
|
"eval_steps_per_second": 1.518, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 129.23076923076923, |
|
"grad_norm": 29.24848747253418, |
|
"learning_rate": 9.161000000000001e-06, |
|
"loss": 1.0463, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 129.23076923076923, |
|
"eval_loss": 1.2077099084854126, |
|
"eval_runtime": 10.9432, |
|
"eval_samples_per_second": 11.971, |
|
"eval_steps_per_second": 1.553, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 130.76923076923077, |
|
"grad_norm": 27.14787483215332, |
|
"learning_rate": 9.151000000000001e-06, |
|
"loss": 1.0607, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 130.76923076923077, |
|
"eval_loss": 1.2046644687652588, |
|
"eval_runtime": 11.001, |
|
"eval_samples_per_second": 11.908, |
|
"eval_steps_per_second": 1.545, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 132.30769230769232, |
|
"grad_norm": 32.416194915771484, |
|
"learning_rate": 9.141000000000001e-06, |
|
"loss": 1.0365, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 132.30769230769232, |
|
"eval_loss": 1.195080041885376, |
|
"eval_runtime": 11.0205, |
|
"eval_samples_per_second": 11.887, |
|
"eval_steps_per_second": 1.543, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 133.84615384615384, |
|
"grad_norm": 58.863582611083984, |
|
"learning_rate": 9.131000000000001e-06, |
|
"loss": 1.0564, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 133.84615384615384, |
|
"eval_loss": 1.1966980695724487, |
|
"eval_runtime": 10.9043, |
|
"eval_samples_per_second": 12.014, |
|
"eval_steps_per_second": 1.559, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 135.3846153846154, |
|
"grad_norm": 26.08232307434082, |
|
"learning_rate": 9.121000000000001e-06, |
|
"loss": 1.0507, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 135.3846153846154, |
|
"eval_loss": 1.201367735862732, |
|
"eval_runtime": 10.9714, |
|
"eval_samples_per_second": 11.94, |
|
"eval_steps_per_second": 1.549, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 136.92307692307693, |
|
"grad_norm": 59.368019104003906, |
|
"learning_rate": 9.111000000000001e-06, |
|
"loss": 1.0508, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 136.92307692307693, |
|
"eval_loss": 1.208795428276062, |
|
"eval_runtime": 11.0529, |
|
"eval_samples_per_second": 11.852, |
|
"eval_steps_per_second": 1.538, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 138.46153846153845, |
|
"grad_norm": 43.07460021972656, |
|
"learning_rate": 9.101000000000001e-06, |
|
"loss": 1.0359, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 138.46153846153845, |
|
"eval_loss": 1.1782392263412476, |
|
"eval_runtime": 11.1519, |
|
"eval_samples_per_second": 11.747, |
|
"eval_steps_per_second": 1.524, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 140.0, |
|
"grad_norm": 22.39567756652832, |
|
"learning_rate": 9.091000000000002e-06, |
|
"loss": 1.0584, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 140.0, |
|
"eval_loss": 1.1873483657836914, |
|
"eval_runtime": 10.8732, |
|
"eval_samples_per_second": 12.048, |
|
"eval_steps_per_second": 1.563, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 141.53846153846155, |
|
"grad_norm": 22.813621520996094, |
|
"learning_rate": 9.081000000000002e-06, |
|
"loss": 1.0354, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 141.53846153846155, |
|
"eval_loss": 1.1731117963790894, |
|
"eval_runtime": 11.0847, |
|
"eval_samples_per_second": 11.818, |
|
"eval_steps_per_second": 1.534, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 143.07692307692307, |
|
"grad_norm": 31.44073486328125, |
|
"learning_rate": 9.071000000000002e-06, |
|
"loss": 1.0457, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 143.07692307692307, |
|
"eval_loss": 1.1962807178497314, |
|
"eval_runtime": 11.0668, |
|
"eval_samples_per_second": 11.837, |
|
"eval_steps_per_second": 1.536, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 144.6153846153846, |
|
"grad_norm": 18.18711280822754, |
|
"learning_rate": 9.061e-06, |
|
"loss": 1.0481, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 144.6153846153846, |
|
"eval_loss": 1.185339093208313, |
|
"eval_runtime": 10.9916, |
|
"eval_samples_per_second": 11.918, |
|
"eval_steps_per_second": 1.547, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 146.15384615384616, |
|
"grad_norm": 38.05665969848633, |
|
"learning_rate": 9.051e-06, |
|
"loss": 1.0391, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 146.15384615384616, |
|
"eval_loss": 1.1856777667999268, |
|
"eval_runtime": 11.0391, |
|
"eval_samples_per_second": 11.867, |
|
"eval_steps_per_second": 1.54, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 147.69230769230768, |
|
"grad_norm": 19.963260650634766, |
|
"learning_rate": 9.041e-06, |
|
"loss": 1.0322, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 147.69230769230768, |
|
"eval_loss": 1.1843148469924927, |
|
"eval_runtime": 11.0382, |
|
"eval_samples_per_second": 11.868, |
|
"eval_steps_per_second": 1.54, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 149.23076923076923, |
|
"grad_norm": 28.58108901977539, |
|
"learning_rate": 9.031e-06, |
|
"loss": 1.0369, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 149.23076923076923, |
|
"eval_loss": 1.1637182235717773, |
|
"eval_runtime": 11.2766, |
|
"eval_samples_per_second": 11.617, |
|
"eval_steps_per_second": 1.508, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 150.76923076923077, |
|
"grad_norm": 64.3956527709961, |
|
"learning_rate": 9.021e-06, |
|
"loss": 1.0519, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 150.76923076923077, |
|
"eval_loss": 1.1796802282333374, |
|
"eval_runtime": 11.3418, |
|
"eval_samples_per_second": 11.55, |
|
"eval_steps_per_second": 1.499, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 152.30769230769232, |
|
"grad_norm": 18.857580184936523, |
|
"learning_rate": 9.011e-06, |
|
"loss": 1.0272, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 152.30769230769232, |
|
"eval_loss": 1.1634279489517212, |
|
"eval_runtime": 11.6749, |
|
"eval_samples_per_second": 11.221, |
|
"eval_steps_per_second": 1.456, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 153.84615384615384, |
|
"grad_norm": 42.541683197021484, |
|
"learning_rate": 9.001e-06, |
|
"loss": 1.0287, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 153.84615384615384, |
|
"eval_loss": 1.1698088645935059, |
|
"eval_runtime": 11.068, |
|
"eval_samples_per_second": 11.836, |
|
"eval_steps_per_second": 1.536, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 155.3846153846154, |
|
"grad_norm": 30.52286720275879, |
|
"learning_rate": 8.991e-06, |
|
"loss": 1.0237, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 155.3846153846154, |
|
"eval_loss": 1.178311824798584, |
|
"eval_runtime": 11.1107, |
|
"eval_samples_per_second": 11.79, |
|
"eval_steps_per_second": 1.53, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 156.92307692307693, |
|
"grad_norm": 32.60612487792969, |
|
"learning_rate": 8.981000000000001e-06, |
|
"loss": 1.0362, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 156.92307692307693, |
|
"eval_loss": 1.157893180847168, |
|
"eval_runtime": 11.0484, |
|
"eval_samples_per_second": 11.857, |
|
"eval_steps_per_second": 1.539, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 158.46153846153845, |
|
"grad_norm": 26.15647315979004, |
|
"learning_rate": 8.971000000000001e-06, |
|
"loss": 0.998, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 158.46153846153845, |
|
"eval_loss": 1.1724605560302734, |
|
"eval_runtime": 11.2323, |
|
"eval_samples_per_second": 11.663, |
|
"eval_steps_per_second": 1.513, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 160.0, |
|
"grad_norm": 45.76993942260742, |
|
"learning_rate": 8.961000000000001e-06, |
|
"loss": 1.0432, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 160.0, |
|
"eval_loss": 1.1688075065612793, |
|
"eval_runtime": 11.0984, |
|
"eval_samples_per_second": 11.803, |
|
"eval_steps_per_second": 1.532, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 161.53846153846155, |
|
"grad_norm": 40.384578704833984, |
|
"learning_rate": 8.951000000000001e-06, |
|
"loss": 1.0421, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 161.53846153846155, |
|
"eval_loss": 1.1621845960617065, |
|
"eval_runtime": 11.1624, |
|
"eval_samples_per_second": 11.736, |
|
"eval_steps_per_second": 1.523, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 163.07692307692307, |
|
"grad_norm": 22.886058807373047, |
|
"learning_rate": 8.941000000000001e-06, |
|
"loss": 0.9912, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 163.07692307692307, |
|
"eval_loss": 1.1695055961608887, |
|
"eval_runtime": 10.9842, |
|
"eval_samples_per_second": 11.926, |
|
"eval_steps_per_second": 1.548, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 164.6153846153846, |
|
"grad_norm": 26.940736770629883, |
|
"learning_rate": 8.931000000000001e-06, |
|
"loss": 1.011, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 164.6153846153846, |
|
"eval_loss": 1.1458157300949097, |
|
"eval_runtime": 11.025, |
|
"eval_samples_per_second": 11.882, |
|
"eval_steps_per_second": 1.542, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 166.15384615384616, |
|
"grad_norm": 26.5013484954834, |
|
"learning_rate": 8.921000000000001e-06, |
|
"loss": 0.9876, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 166.15384615384616, |
|
"eval_loss": 1.1569631099700928, |
|
"eval_runtime": 11.101, |
|
"eval_samples_per_second": 11.801, |
|
"eval_steps_per_second": 1.531, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 167.69230769230768, |
|
"grad_norm": 29.859987258911133, |
|
"learning_rate": 8.911000000000002e-06, |
|
"loss": 1.0374, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 167.69230769230768, |
|
"eval_loss": 1.149316668510437, |
|
"eval_runtime": 11.1242, |
|
"eval_samples_per_second": 11.776, |
|
"eval_steps_per_second": 1.528, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 169.23076923076923, |
|
"grad_norm": 27.777738571166992, |
|
"learning_rate": 8.901e-06, |
|
"loss": 0.985, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 169.23076923076923, |
|
"eval_loss": 1.1608328819274902, |
|
"eval_runtime": 11.2215, |
|
"eval_samples_per_second": 11.674, |
|
"eval_steps_per_second": 1.515, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 170.76923076923077, |
|
"grad_norm": 39.21344757080078, |
|
"learning_rate": 8.891e-06, |
|
"loss": 1.0049, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 170.76923076923077, |
|
"eval_loss": 1.1642228364944458, |
|
"eval_runtime": 11.1947, |
|
"eval_samples_per_second": 11.702, |
|
"eval_steps_per_second": 1.519, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 172.30769230769232, |
|
"grad_norm": 29.880149841308594, |
|
"learning_rate": 8.881e-06, |
|
"loss": 0.9843, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 172.30769230769232, |
|
"eval_loss": 1.1574000120162964, |
|
"eval_runtime": 11.2634, |
|
"eval_samples_per_second": 11.631, |
|
"eval_steps_per_second": 1.509, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 173.84615384615384, |
|
"grad_norm": 63.53031539916992, |
|
"learning_rate": 8.871e-06, |
|
"loss": 1.0354, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 173.84615384615384, |
|
"eval_loss": 1.1575734615325928, |
|
"eval_runtime": 11.0265, |
|
"eval_samples_per_second": 11.88, |
|
"eval_steps_per_second": 1.542, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 175.3846153846154, |
|
"grad_norm": 26.937786102294922, |
|
"learning_rate": 8.861e-06, |
|
"loss": 0.9964, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 175.3846153846154, |
|
"eval_loss": 1.1552445888519287, |
|
"eval_runtime": 11.0325, |
|
"eval_samples_per_second": 11.874, |
|
"eval_steps_per_second": 1.541, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 176.92307692307693, |
|
"grad_norm": 87.28536987304688, |
|
"learning_rate": 8.851e-06, |
|
"loss": 0.9932, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 176.92307692307693, |
|
"eval_loss": 1.1411677598953247, |
|
"eval_runtime": 11.1527, |
|
"eval_samples_per_second": 11.746, |
|
"eval_steps_per_second": 1.524, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 178.46153846153845, |
|
"grad_norm": 25.903568267822266, |
|
"learning_rate": 8.841e-06, |
|
"loss": 0.9768, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 178.46153846153845, |
|
"eval_loss": 1.1635726690292358, |
|
"eval_runtime": 11.123, |
|
"eval_samples_per_second": 11.777, |
|
"eval_steps_per_second": 1.528, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 180.0, |
|
"grad_norm": 24.315654754638672, |
|
"learning_rate": 8.831e-06, |
|
"loss": 0.9984, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 180.0, |
|
"eval_loss": 1.1710366010665894, |
|
"eval_runtime": 11.1375, |
|
"eval_samples_per_second": 11.762, |
|
"eval_steps_per_second": 1.526, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 181.53846153846155, |
|
"grad_norm": 60.17182540893555, |
|
"learning_rate": 8.821e-06, |
|
"loss": 0.9703, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 181.53846153846155, |
|
"eval_loss": 1.1556936502456665, |
|
"eval_runtime": 11.0387, |
|
"eval_samples_per_second": 11.867, |
|
"eval_steps_per_second": 1.54, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 183.07692307692307, |
|
"grad_norm": 32.373477935791016, |
|
"learning_rate": 8.811000000000001e-06, |
|
"loss": 0.9996, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 183.07692307692307, |
|
"eval_loss": 1.146790623664856, |
|
"eval_runtime": 11.1805, |
|
"eval_samples_per_second": 11.717, |
|
"eval_steps_per_second": 1.521, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 184.6153846153846, |
|
"grad_norm": 42.578575134277344, |
|
"learning_rate": 8.801000000000001e-06, |
|
"loss": 0.9795, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 184.6153846153846, |
|
"eval_loss": 1.144544005393982, |
|
"eval_runtime": 11.0704, |
|
"eval_samples_per_second": 11.833, |
|
"eval_steps_per_second": 1.536, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 186.15384615384616, |
|
"grad_norm": 22.79789161682129, |
|
"learning_rate": 8.791000000000001e-06, |
|
"loss": 0.9905, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 186.15384615384616, |
|
"eval_loss": 1.1581685543060303, |
|
"eval_runtime": 11.272, |
|
"eval_samples_per_second": 11.622, |
|
"eval_steps_per_second": 1.508, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 187.69230769230768, |
|
"grad_norm": 52.061012268066406, |
|
"learning_rate": 8.781200000000002e-06, |
|
"loss": 0.9817, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 187.69230769230768, |
|
"eval_loss": 1.159809947013855, |
|
"eval_runtime": 11.2021, |
|
"eval_samples_per_second": 11.694, |
|
"eval_steps_per_second": 1.518, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 189.23076923076923, |
|
"grad_norm": 35.95882034301758, |
|
"learning_rate": 8.7712e-06, |
|
"loss": 1.0071, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 189.23076923076923, |
|
"eval_loss": 1.1944890022277832, |
|
"eval_runtime": 10.9263, |
|
"eval_samples_per_second": 11.989, |
|
"eval_steps_per_second": 1.556, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 190.76923076923077, |
|
"grad_norm": 213.48587036132812, |
|
"learning_rate": 8.7612e-06, |
|
"loss": 0.9997, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 190.76923076923077, |
|
"eval_loss": 1.191455602645874, |
|
"eval_runtime": 10.917, |
|
"eval_samples_per_second": 12.0, |
|
"eval_steps_per_second": 1.557, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 192.30769230769232, |
|
"grad_norm": 19.97510528564453, |
|
"learning_rate": 8.7512e-06, |
|
"loss": 1.001, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 192.30769230769232, |
|
"eval_loss": 1.167776346206665, |
|
"eval_runtime": 11.033, |
|
"eval_samples_per_second": 11.873, |
|
"eval_steps_per_second": 1.541, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 193.84615384615384, |
|
"grad_norm": 30.815828323364258, |
|
"learning_rate": 8.7412e-06, |
|
"loss": 0.9719, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 193.84615384615384, |
|
"eval_loss": 1.150451898574829, |
|
"eval_runtime": 11.2471, |
|
"eval_samples_per_second": 11.647, |
|
"eval_steps_per_second": 1.512, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 195.3846153846154, |
|
"grad_norm": 40.32701110839844, |
|
"learning_rate": 8.7312e-06, |
|
"loss": 0.9658, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 195.3846153846154, |
|
"eval_loss": 1.1517494916915894, |
|
"eval_runtime": 11.1893, |
|
"eval_samples_per_second": 11.708, |
|
"eval_steps_per_second": 1.519, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 196.92307692307693, |
|
"grad_norm": 42.11077117919922, |
|
"learning_rate": 8.7212e-06, |
|
"loss": 0.9744, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 196.92307692307693, |
|
"eval_loss": 1.1507395505905151, |
|
"eval_runtime": 11.2196, |
|
"eval_samples_per_second": 11.676, |
|
"eval_steps_per_second": 1.515, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 198.46153846153845, |
|
"grad_norm": 20.991779327392578, |
|
"learning_rate": 8.7112e-06, |
|
"loss": 0.9695, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 198.46153846153845, |
|
"eval_loss": 1.1557880640029907, |
|
"eval_runtime": 11.2158, |
|
"eval_samples_per_second": 11.68, |
|
"eval_steps_per_second": 1.516, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 200.0, |
|
"grad_norm": 22.79688835144043, |
|
"learning_rate": 8.7012e-06, |
|
"loss": 0.9652, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 200.0, |
|
"eval_loss": 1.1554670333862305, |
|
"eval_runtime": 11.1246, |
|
"eval_samples_per_second": 11.776, |
|
"eval_steps_per_second": 1.528, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 201.53846153846155, |
|
"grad_norm": 180.34512329101562, |
|
"learning_rate": 8.6912e-06, |
|
"loss": 0.9664, |
|
"step": 13100 |
|
}, |
|
{ |
|
"epoch": 201.53846153846155, |
|
"eval_loss": 1.1403967142105103, |
|
"eval_runtime": 10.9895, |
|
"eval_samples_per_second": 11.921, |
|
"eval_steps_per_second": 1.547, |
|
"step": 13100 |
|
}, |
|
{ |
|
"epoch": 203.07692307692307, |
|
"grad_norm": 31.583358764648438, |
|
"learning_rate": 8.6812e-06, |
|
"loss": 0.9672, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 203.07692307692307, |
|
"eval_loss": 1.1569470167160034, |
|
"eval_runtime": 11.2695, |
|
"eval_samples_per_second": 11.624, |
|
"eval_steps_per_second": 1.508, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 204.6153846153846, |
|
"grad_norm": 34.05722427368164, |
|
"learning_rate": 8.671200000000001e-06, |
|
"loss": 0.9531, |
|
"step": 13300 |
|
}, |
|
{ |
|
"epoch": 204.6153846153846, |
|
"eval_loss": 1.1408321857452393, |
|
"eval_runtime": 11.1074, |
|
"eval_samples_per_second": 11.794, |
|
"eval_steps_per_second": 1.531, |
|
"step": 13300 |
|
}, |
|
{ |
|
"epoch": 206.15384615384616, |
|
"grad_norm": 26.748388290405273, |
|
"learning_rate": 8.661200000000001e-06, |
|
"loss": 0.9484, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 206.15384615384616, |
|
"eval_loss": 1.151693344116211, |
|
"eval_runtime": 10.9585, |
|
"eval_samples_per_second": 11.954, |
|
"eval_steps_per_second": 1.551, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 207.69230769230768, |
|
"grad_norm": 19.531770706176758, |
|
"learning_rate": 8.651200000000001e-06, |
|
"loss": 0.971, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 207.69230769230768, |
|
"eval_loss": 1.138724684715271, |
|
"eval_runtime": 11.0296, |
|
"eval_samples_per_second": 11.877, |
|
"eval_steps_per_second": 1.541, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 209.23076923076923, |
|
"grad_norm": 23.87537384033203, |
|
"learning_rate": 8.641200000000001e-06, |
|
"loss": 0.944, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 209.23076923076923, |
|
"eval_loss": 1.1402664184570312, |
|
"eval_runtime": 11.1505, |
|
"eval_samples_per_second": 11.748, |
|
"eval_steps_per_second": 1.525, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 210.76923076923077, |
|
"grad_norm": 25.069852828979492, |
|
"learning_rate": 8.631200000000001e-06, |
|
"loss": 0.9581, |
|
"step": 13700 |
|
}, |
|
{ |
|
"epoch": 210.76923076923077, |
|
"eval_loss": 1.1348073482513428, |
|
"eval_runtime": 11.063, |
|
"eval_samples_per_second": 11.841, |
|
"eval_steps_per_second": 1.537, |
|
"step": 13700 |
|
}, |
|
{ |
|
"epoch": 212.30769230769232, |
|
"grad_norm": 23.794719696044922, |
|
"learning_rate": 8.621200000000001e-06, |
|
"loss": 0.957, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 212.30769230769232, |
|
"eval_loss": 1.143198013305664, |
|
"eval_runtime": 11.1422, |
|
"eval_samples_per_second": 11.757, |
|
"eval_steps_per_second": 1.526, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 213.84615384615384, |
|
"grad_norm": 26.059829711914062, |
|
"learning_rate": 8.611200000000002e-06, |
|
"loss": 0.9554, |
|
"step": 13900 |
|
}, |
|
{ |
|
"epoch": 213.84615384615384, |
|
"eval_loss": 1.1333541870117188, |
|
"eval_runtime": 10.948, |
|
"eval_samples_per_second": 11.966, |
|
"eval_steps_per_second": 1.553, |
|
"step": 13900 |
|
}, |
|
{ |
|
"epoch": 215.3846153846154, |
|
"grad_norm": 49.8937873840332, |
|
"learning_rate": 8.6012e-06, |
|
"loss": 0.9607, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 215.3846153846154, |
|
"eval_loss": 1.1584446430206299, |
|
"eval_runtime": 11.2488, |
|
"eval_samples_per_second": 11.646, |
|
"eval_steps_per_second": 1.511, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 216.92307692307693, |
|
"grad_norm": 17.9267520904541, |
|
"learning_rate": 8.5912e-06, |
|
"loss": 0.9444, |
|
"step": 14100 |
|
}, |
|
{ |
|
"epoch": 216.92307692307693, |
|
"eval_loss": 1.1573532819747925, |
|
"eval_runtime": 11.143, |
|
"eval_samples_per_second": 11.756, |
|
"eval_steps_per_second": 1.526, |
|
"step": 14100 |
|
}, |
|
{ |
|
"epoch": 218.46153846153845, |
|
"grad_norm": 27.38156509399414, |
|
"learning_rate": 8.5812e-06, |
|
"loss": 0.928, |
|
"step": 14200 |
|
}, |
|
{ |
|
"epoch": 218.46153846153845, |
|
"eval_loss": 1.1540145874023438, |
|
"eval_runtime": 11.0475, |
|
"eval_samples_per_second": 11.858, |
|
"eval_steps_per_second": 1.539, |
|
"step": 14200 |
|
}, |
|
{ |
|
"epoch": 220.0, |
|
"grad_norm": 42.785037994384766, |
|
"learning_rate": 8.5712e-06, |
|
"loss": 0.9548, |
|
"step": 14300 |
|
}, |
|
{ |
|
"epoch": 220.0, |
|
"eval_loss": 1.1379021406173706, |
|
"eval_runtime": 10.9412, |
|
"eval_samples_per_second": 11.973, |
|
"eval_steps_per_second": 1.554, |
|
"step": 14300 |
|
}, |
|
{ |
|
"epoch": 221.53846153846155, |
|
"grad_norm": 39.50480270385742, |
|
"learning_rate": 8.5612e-06, |
|
"loss": 0.9583, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 221.53846153846155, |
|
"eval_loss": 1.1666078567504883, |
|
"eval_runtime": 11.2066, |
|
"eval_samples_per_second": 11.69, |
|
"eval_steps_per_second": 1.517, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 223.07692307692307, |
|
"grad_norm": 15.560932159423828, |
|
"learning_rate": 8.5512e-06, |
|
"loss": 0.9306, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 223.07692307692307, |
|
"eval_loss": 1.151904582977295, |
|
"eval_runtime": 11.2376, |
|
"eval_samples_per_second": 11.657, |
|
"eval_steps_per_second": 1.513, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 224.6153846153846, |
|
"grad_norm": 36.12020492553711, |
|
"learning_rate": 8.541400000000001e-06, |
|
"loss": 0.9668, |
|
"step": 14600 |
|
}, |
|
{ |
|
"epoch": 224.6153846153846, |
|
"eval_loss": 1.139450192451477, |
|
"eval_runtime": 11.1643, |
|
"eval_samples_per_second": 11.734, |
|
"eval_steps_per_second": 1.523, |
|
"step": 14600 |
|
}, |
|
{ |
|
"epoch": 226.15384615384616, |
|
"grad_norm": 31.29511070251465, |
|
"learning_rate": 8.531400000000001e-06, |
|
"loss": 0.9646, |
|
"step": 14700 |
|
}, |
|
{ |
|
"epoch": 226.15384615384616, |
|
"eval_loss": 1.1311490535736084, |
|
"eval_runtime": 11.0, |
|
"eval_samples_per_second": 11.909, |
|
"eval_steps_per_second": 1.545, |
|
"step": 14700 |
|
}, |
|
{ |
|
"epoch": 227.69230769230768, |
|
"grad_norm": 22.08748435974121, |
|
"learning_rate": 8.521400000000001e-06, |
|
"loss": 0.922, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 227.69230769230768, |
|
"eval_loss": 1.1504499912261963, |
|
"eval_runtime": 11.135, |
|
"eval_samples_per_second": 11.765, |
|
"eval_steps_per_second": 1.527, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 229.23076923076923, |
|
"grad_norm": 26.33457374572754, |
|
"learning_rate": 8.511400000000001e-06, |
|
"loss": 0.9306, |
|
"step": 14900 |
|
}, |
|
{ |
|
"epoch": 229.23076923076923, |
|
"eval_loss": 1.136217713356018, |
|
"eval_runtime": 11.1465, |
|
"eval_samples_per_second": 11.753, |
|
"eval_steps_per_second": 1.525, |
|
"step": 14900 |
|
}, |
|
{ |
|
"epoch": 230.76923076923077, |
|
"grad_norm": 49.193206787109375, |
|
"learning_rate": 8.501400000000001e-06, |
|
"loss": 0.938, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 230.76923076923077, |
|
"eval_loss": 1.1409271955490112, |
|
"eval_runtime": 11.0858, |
|
"eval_samples_per_second": 11.817, |
|
"eval_steps_per_second": 1.533, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 232.30769230769232, |
|
"grad_norm": 22.36850357055664, |
|
"learning_rate": 8.491400000000001e-06, |
|
"loss": 0.9218, |
|
"step": 15100 |
|
}, |
|
{ |
|
"epoch": 232.30769230769232, |
|
"eval_loss": 1.131103515625, |
|
"eval_runtime": 11.0691, |
|
"eval_samples_per_second": 11.835, |
|
"eval_steps_per_second": 1.536, |
|
"step": 15100 |
|
}, |
|
{ |
|
"epoch": 233.84615384615384, |
|
"grad_norm": 26.34011459350586, |
|
"learning_rate": 8.481400000000002e-06, |
|
"loss": 0.9617, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 233.84615384615384, |
|
"eval_loss": 1.1415542364120483, |
|
"eval_runtime": 10.9887, |
|
"eval_samples_per_second": 11.921, |
|
"eval_steps_per_second": 1.547, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 235.3846153846154, |
|
"grad_norm": 29.583358764648438, |
|
"learning_rate": 8.4714e-06, |
|
"loss": 0.9272, |
|
"step": 15300 |
|
}, |
|
{ |
|
"epoch": 235.3846153846154, |
|
"eval_loss": 1.144914150238037, |
|
"eval_runtime": 11.2646, |
|
"eval_samples_per_second": 11.629, |
|
"eval_steps_per_second": 1.509, |
|
"step": 15300 |
|
}, |
|
{ |
|
"epoch": 236.92307692307693, |
|
"grad_norm": 31.824247360229492, |
|
"learning_rate": 8.4614e-06, |
|
"loss": 0.9207, |
|
"step": 15400 |
|
}, |
|
{ |
|
"epoch": 236.92307692307693, |
|
"eval_loss": 1.1387474536895752, |
|
"eval_runtime": 11.0721, |
|
"eval_samples_per_second": 11.832, |
|
"eval_steps_per_second": 1.535, |
|
"step": 15400 |
|
}, |
|
{ |
|
"epoch": 238.46153846153845, |
|
"grad_norm": 41.94277572631836, |
|
"learning_rate": 8.4514e-06, |
|
"loss": 0.9454, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 238.46153846153845, |
|
"eval_loss": 1.1316168308258057, |
|
"eval_runtime": 11.1831, |
|
"eval_samples_per_second": 11.714, |
|
"eval_steps_per_second": 1.52, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 240.0, |
|
"grad_norm": 21.150598526000977, |
|
"learning_rate": 8.4414e-06, |
|
"loss": 0.9249, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 240.0, |
|
"eval_loss": 1.1368097066879272, |
|
"eval_runtime": 11.0887, |
|
"eval_samples_per_second": 11.814, |
|
"eval_steps_per_second": 1.533, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 241.53846153846155, |
|
"grad_norm": 47.432212829589844, |
|
"learning_rate": 8.4314e-06, |
|
"loss": 0.9212, |
|
"step": 15700 |
|
}, |
|
{ |
|
"epoch": 241.53846153846155, |
|
"eval_loss": 1.125348448753357, |
|
"eval_runtime": 11.2434, |
|
"eval_samples_per_second": 11.651, |
|
"eval_steps_per_second": 1.512, |
|
"step": 15700 |
|
}, |
|
{ |
|
"epoch": 243.07692307692307, |
|
"grad_norm": 28.406036376953125, |
|
"learning_rate": 8.4214e-06, |
|
"loss": 0.9272, |
|
"step": 15800 |
|
}, |
|
{ |
|
"epoch": 243.07692307692307, |
|
"eval_loss": 1.1328097581863403, |
|
"eval_runtime": 11.1097, |
|
"eval_samples_per_second": 11.791, |
|
"eval_steps_per_second": 1.53, |
|
"step": 15800 |
|
}, |
|
{ |
|
"epoch": 244.6153846153846, |
|
"grad_norm": 53.369564056396484, |
|
"learning_rate": 8.4114e-06, |
|
"loss": 0.9174, |
|
"step": 15900 |
|
}, |
|
{ |
|
"epoch": 244.6153846153846, |
|
"eval_loss": 1.1235861778259277, |
|
"eval_runtime": 10.9581, |
|
"eval_samples_per_second": 11.955, |
|
"eval_steps_per_second": 1.551, |
|
"step": 15900 |
|
}, |
|
{ |
|
"epoch": 246.15384615384616, |
|
"grad_norm": 31.435935974121094, |
|
"learning_rate": 8.4014e-06, |
|
"loss": 0.9041, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 246.15384615384616, |
|
"eval_loss": 1.1266686916351318, |
|
"eval_runtime": 11.1276, |
|
"eval_samples_per_second": 11.773, |
|
"eval_steps_per_second": 1.528, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 247.69230769230768, |
|
"grad_norm": 32.799991607666016, |
|
"learning_rate": 8.3914e-06, |
|
"loss": 0.9062, |
|
"step": 16100 |
|
}, |
|
{ |
|
"epoch": 247.69230769230768, |
|
"eval_loss": 1.1481693983078003, |
|
"eval_runtime": 11.0435, |
|
"eval_samples_per_second": 11.862, |
|
"eval_steps_per_second": 1.539, |
|
"step": 16100 |
|
}, |
|
{ |
|
"epoch": 249.23076923076923, |
|
"grad_norm": 36.49935531616211, |
|
"learning_rate": 8.3814e-06, |
|
"loss": 0.9163, |
|
"step": 16200 |
|
}, |
|
{ |
|
"epoch": 249.23076923076923, |
|
"eval_loss": 1.139769434928894, |
|
"eval_runtime": 11.1922, |
|
"eval_samples_per_second": 11.705, |
|
"eval_steps_per_second": 1.519, |
|
"step": 16200 |
|
}, |
|
{ |
|
"epoch": 250.76923076923077, |
|
"grad_norm": 35.4781379699707, |
|
"learning_rate": 8.371400000000001e-06, |
|
"loss": 0.9219, |
|
"step": 16300 |
|
}, |
|
{ |
|
"epoch": 250.76923076923077, |
|
"eval_loss": 1.1498539447784424, |
|
"eval_runtime": 11.1805, |
|
"eval_samples_per_second": 11.717, |
|
"eval_steps_per_second": 1.52, |
|
"step": 16300 |
|
}, |
|
{ |
|
"epoch": 252.30769230769232, |
|
"grad_norm": 17.645612716674805, |
|
"learning_rate": 8.361400000000001e-06, |
|
"loss": 0.9278, |
|
"step": 16400 |
|
}, |
|
{ |
|
"epoch": 252.30769230769232, |
|
"eval_loss": 1.1338902711868286, |
|
"eval_runtime": 11.1146, |
|
"eval_samples_per_second": 11.786, |
|
"eval_steps_per_second": 1.53, |
|
"step": 16400 |
|
}, |
|
{ |
|
"epoch": 253.84615384615384, |
|
"grad_norm": 32.81660079956055, |
|
"learning_rate": 8.351400000000001e-06, |
|
"loss": 0.9108, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 253.84615384615384, |
|
"eval_loss": 1.1279574632644653, |
|
"eval_runtime": 11.1151, |
|
"eval_samples_per_second": 11.786, |
|
"eval_steps_per_second": 1.529, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 255.3846153846154, |
|
"grad_norm": 22.3878116607666, |
|
"learning_rate": 8.341400000000001e-06, |
|
"loss": 0.9011, |
|
"step": 16600 |
|
}, |
|
{ |
|
"epoch": 255.3846153846154, |
|
"eval_loss": 1.1570419073104858, |
|
"eval_runtime": 11.0784, |
|
"eval_samples_per_second": 11.825, |
|
"eval_steps_per_second": 1.535, |
|
"step": 16600 |
|
}, |
|
{ |
|
"epoch": 256.9230769230769, |
|
"grad_norm": 29.845205307006836, |
|
"learning_rate": 8.331400000000001e-06, |
|
"loss": 0.9314, |
|
"step": 16700 |
|
}, |
|
{ |
|
"epoch": 256.9230769230769, |
|
"eval_loss": 1.1365561485290527, |
|
"eval_runtime": 11.2405, |
|
"eval_samples_per_second": 11.654, |
|
"eval_steps_per_second": 1.512, |
|
"step": 16700 |
|
}, |
|
{ |
|
"epoch": 258.46153846153845, |
|
"grad_norm": 21.02674102783203, |
|
"learning_rate": 8.321400000000001e-06, |
|
"loss": 0.9021, |
|
"step": 16800 |
|
}, |
|
{ |
|
"epoch": 258.46153846153845, |
|
"eval_loss": 1.164974570274353, |
|
"eval_runtime": 11.0643, |
|
"eval_samples_per_second": 11.84, |
|
"eval_steps_per_second": 1.536, |
|
"step": 16800 |
|
}, |
|
{ |
|
"epoch": 260.0, |
|
"grad_norm": 22.380117416381836, |
|
"learning_rate": 8.3114e-06, |
|
"loss": 0.912, |
|
"step": 16900 |
|
}, |
|
{ |
|
"epoch": 260.0, |
|
"eval_loss": 1.1483317613601685, |
|
"eval_runtime": 11.1852, |
|
"eval_samples_per_second": 11.712, |
|
"eval_steps_per_second": 1.52, |
|
"step": 16900 |
|
}, |
|
{ |
|
"epoch": 261.53846153846155, |
|
"grad_norm": 39.20146560668945, |
|
"learning_rate": 8.3014e-06, |
|
"loss": 0.9165, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 261.53846153846155, |
|
"eval_loss": 1.159449577331543, |
|
"eval_runtime": 11.4058, |
|
"eval_samples_per_second": 11.485, |
|
"eval_steps_per_second": 1.49, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 263.0769230769231, |
|
"grad_norm": 46.305389404296875, |
|
"learning_rate": 8.2914e-06, |
|
"loss": 0.916, |
|
"step": 17100 |
|
}, |
|
{ |
|
"epoch": 263.0769230769231, |
|
"eval_loss": 1.146033525466919, |
|
"eval_runtime": 11.3638, |
|
"eval_samples_per_second": 11.528, |
|
"eval_steps_per_second": 1.496, |
|
"step": 17100 |
|
}, |
|
{ |
|
"epoch": 264.61538461538464, |
|
"grad_norm": 33.07489776611328, |
|
"learning_rate": 8.2814e-06, |
|
"loss": 0.9147, |
|
"step": 17200 |
|
}, |
|
{ |
|
"epoch": 264.61538461538464, |
|
"eval_loss": 1.143062710762024, |
|
"eval_runtime": 11.3544, |
|
"eval_samples_per_second": 11.537, |
|
"eval_steps_per_second": 1.497, |
|
"step": 17200 |
|
}, |
|
{ |
|
"epoch": 266.15384615384613, |
|
"grad_norm": 35.233131408691406, |
|
"learning_rate": 8.2714e-06, |
|
"loss": 0.9151, |
|
"step": 17300 |
|
}, |
|
{ |
|
"epoch": 266.15384615384613, |
|
"eval_loss": 1.126172661781311, |
|
"eval_runtime": 11.1185, |
|
"eval_samples_per_second": 11.782, |
|
"eval_steps_per_second": 1.529, |
|
"step": 17300 |
|
}, |
|
{ |
|
"epoch": 267.6923076923077, |
|
"grad_norm": 32.72975540161133, |
|
"learning_rate": 8.2614e-06, |
|
"loss": 0.8881, |
|
"step": 17400 |
|
}, |
|
{ |
|
"epoch": 267.6923076923077, |
|
"eval_loss": 1.1455607414245605, |
|
"eval_runtime": 11.0568, |
|
"eval_samples_per_second": 11.848, |
|
"eval_steps_per_second": 1.538, |
|
"step": 17400 |
|
}, |
|
{ |
|
"epoch": 269.2307692307692, |
|
"grad_norm": 16.41983985900879, |
|
"learning_rate": 8.2514e-06, |
|
"loss": 0.9027, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 269.2307692307692, |
|
"eval_loss": 1.1283539533615112, |
|
"eval_runtime": 11.3233, |
|
"eval_samples_per_second": 11.569, |
|
"eval_steps_per_second": 1.501, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 270.7692307692308, |
|
"grad_norm": 20.32726287841797, |
|
"learning_rate": 8.2414e-06, |
|
"loss": 0.9391, |
|
"step": 17600 |
|
}, |
|
{ |
|
"epoch": 270.7692307692308, |
|
"eval_loss": 1.124210000038147, |
|
"eval_runtime": 11.3345, |
|
"eval_samples_per_second": 11.558, |
|
"eval_steps_per_second": 1.5, |
|
"step": 17600 |
|
}, |
|
{ |
|
"epoch": 272.3076923076923, |
|
"grad_norm": 23.14797019958496, |
|
"learning_rate": 8.2314e-06, |
|
"loss": 0.8899, |
|
"step": 17700 |
|
}, |
|
{ |
|
"epoch": 272.3076923076923, |
|
"eval_loss": 1.1297597885131836, |
|
"eval_runtime": 11.1272, |
|
"eval_samples_per_second": 11.773, |
|
"eval_steps_per_second": 1.528, |
|
"step": 17700 |
|
}, |
|
{ |
|
"epoch": 273.84615384615387, |
|
"grad_norm": 18.778406143188477, |
|
"learning_rate": 8.2214e-06, |
|
"loss": 0.9074, |
|
"step": 17800 |
|
}, |
|
{ |
|
"epoch": 273.84615384615387, |
|
"eval_loss": 1.135562777519226, |
|
"eval_runtime": 11.2964, |
|
"eval_samples_per_second": 11.597, |
|
"eval_steps_per_second": 1.505, |
|
"step": 17800 |
|
}, |
|
{ |
|
"epoch": 275.38461538461536, |
|
"grad_norm": 27.574323654174805, |
|
"learning_rate": 8.2114e-06, |
|
"loss": 0.8931, |
|
"step": 17900 |
|
}, |
|
{ |
|
"epoch": 275.38461538461536, |
|
"eval_loss": 1.1423242092132568, |
|
"eval_runtime": 11.0992, |
|
"eval_samples_per_second": 11.803, |
|
"eval_steps_per_second": 1.532, |
|
"step": 17900 |
|
}, |
|
{ |
|
"epoch": 276.9230769230769, |
|
"grad_norm": 26.559467315673828, |
|
"learning_rate": 8.2014e-06, |
|
"loss": 0.8913, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 276.9230769230769, |
|
"eval_loss": 1.1252741813659668, |
|
"eval_runtime": 11.2765, |
|
"eval_samples_per_second": 11.617, |
|
"eval_steps_per_second": 1.508, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 278.46153846153845, |
|
"grad_norm": 24.442596435546875, |
|
"learning_rate": 8.191400000000001e-06, |
|
"loss": 0.8993, |
|
"step": 18100 |
|
}, |
|
{ |
|
"epoch": 278.46153846153845, |
|
"eval_loss": 1.1197646856307983, |
|
"eval_runtime": 11.0479, |
|
"eval_samples_per_second": 11.857, |
|
"eval_steps_per_second": 1.539, |
|
"step": 18100 |
|
}, |
|
{ |
|
"epoch": 280.0, |
|
"grad_norm": 42.99458694458008, |
|
"learning_rate": 8.181400000000001e-06, |
|
"loss": 0.8925, |
|
"step": 18200 |
|
}, |
|
{ |
|
"epoch": 280.0, |
|
"eval_loss": 1.129381775856018, |
|
"eval_runtime": 11.1979, |
|
"eval_samples_per_second": 11.699, |
|
"eval_steps_per_second": 1.518, |
|
"step": 18200 |
|
}, |
|
{ |
|
"epoch": 281.53846153846155, |
|
"grad_norm": 38.08549118041992, |
|
"learning_rate": 8.171400000000001e-06, |
|
"loss": 0.8699, |
|
"step": 18300 |
|
}, |
|
{ |
|
"epoch": 281.53846153846155, |
|
"eval_loss": 1.1298307180404663, |
|
"eval_runtime": 11.1097, |
|
"eval_samples_per_second": 11.792, |
|
"eval_steps_per_second": 1.53, |
|
"step": 18300 |
|
}, |
|
{ |
|
"epoch": 283.0769230769231, |
|
"grad_norm": 49.90501022338867, |
|
"learning_rate": 8.161400000000001e-06, |
|
"loss": 0.9207, |
|
"step": 18400 |
|
}, |
|
{ |
|
"epoch": 283.0769230769231, |
|
"eval_loss": 1.1229758262634277, |
|
"eval_runtime": 11.284, |
|
"eval_samples_per_second": 11.609, |
|
"eval_steps_per_second": 1.507, |
|
"step": 18400 |
|
}, |
|
{ |
|
"epoch": 284.61538461538464, |
|
"grad_norm": 37.63615036010742, |
|
"learning_rate": 8.1514e-06, |
|
"loss": 0.9061, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 284.61538461538464, |
|
"eval_loss": 1.1395354270935059, |
|
"eval_runtime": 11.2087, |
|
"eval_samples_per_second": 11.687, |
|
"eval_steps_per_second": 1.517, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 286.15384615384613, |
|
"grad_norm": 17.88991355895996, |
|
"learning_rate": 8.1414e-06, |
|
"loss": 0.8664, |
|
"step": 18600 |
|
}, |
|
{ |
|
"epoch": 286.15384615384613, |
|
"eval_loss": 1.1339645385742188, |
|
"eval_runtime": 11.317, |
|
"eval_samples_per_second": 11.576, |
|
"eval_steps_per_second": 1.502, |
|
"step": 18600 |
|
}, |
|
{ |
|
"epoch": 287.6923076923077, |
|
"grad_norm": 33.13370895385742, |
|
"learning_rate": 8.1314e-06, |
|
"loss": 0.8759, |
|
"step": 18700 |
|
}, |
|
{ |
|
"epoch": 287.6923076923077, |
|
"eval_loss": 1.1445599794387817, |
|
"eval_runtime": 11.0472, |
|
"eval_samples_per_second": 11.858, |
|
"eval_steps_per_second": 1.539, |
|
"step": 18700 |
|
}, |
|
{ |
|
"epoch": 289.2307692307692, |
|
"grad_norm": 22.776575088500977, |
|
"learning_rate": 8.1214e-06, |
|
"loss": 0.8889, |
|
"step": 18800 |
|
}, |
|
{ |
|
"epoch": 289.2307692307692, |
|
"eval_loss": 1.1401453018188477, |
|
"eval_runtime": 11.2523, |
|
"eval_samples_per_second": 11.642, |
|
"eval_steps_per_second": 1.511, |
|
"step": 18800 |
|
}, |
|
{ |
|
"epoch": 290.7692307692308, |
|
"grad_norm": 19.893653869628906, |
|
"learning_rate": 8.1114e-06, |
|
"loss": 0.8945, |
|
"step": 18900 |
|
}, |
|
{ |
|
"epoch": 290.7692307692308, |
|
"eval_loss": 1.1185678243637085, |
|
"eval_runtime": 10.9814, |
|
"eval_samples_per_second": 11.929, |
|
"eval_steps_per_second": 1.548, |
|
"step": 18900 |
|
}, |
|
{ |
|
"epoch": 292.3076923076923, |
|
"grad_norm": 35.09921646118164, |
|
"learning_rate": 8.1015e-06, |
|
"loss": 0.8821, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 292.3076923076923, |
|
"eval_loss": 1.1313185691833496, |
|
"eval_runtime": 11.2698, |
|
"eval_samples_per_second": 11.624, |
|
"eval_steps_per_second": 1.508, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 293.84615384615387, |
|
"grad_norm": 36.43528366088867, |
|
"learning_rate": 8.0915e-06, |
|
"loss": 0.8794, |
|
"step": 19100 |
|
}, |
|
{ |
|
"epoch": 293.84615384615387, |
|
"eval_loss": 1.1506413221359253, |
|
"eval_runtime": 10.9821, |
|
"eval_samples_per_second": 11.928, |
|
"eval_steps_per_second": 1.548, |
|
"step": 19100 |
|
}, |
|
{ |
|
"epoch": 295.38461538461536, |
|
"grad_norm": 24.73018455505371, |
|
"learning_rate": 8.0815e-06, |
|
"loss": 0.8856, |
|
"step": 19200 |
|
}, |
|
{ |
|
"epoch": 295.38461538461536, |
|
"eval_loss": 1.1280685663223267, |
|
"eval_runtime": 11.2539, |
|
"eval_samples_per_second": 11.64, |
|
"eval_steps_per_second": 1.511, |
|
"step": 19200 |
|
}, |
|
{ |
|
"epoch": 296.9230769230769, |
|
"grad_norm": 50.01460647583008, |
|
"learning_rate": 8.0715e-06, |
|
"loss": 0.8532, |
|
"step": 19300 |
|
}, |
|
{ |
|
"epoch": 296.9230769230769, |
|
"eval_loss": 1.124637484550476, |
|
"eval_runtime": 11.1106, |
|
"eval_samples_per_second": 11.791, |
|
"eval_steps_per_second": 1.53, |
|
"step": 19300 |
|
}, |
|
{ |
|
"epoch": 298.46153846153845, |
|
"grad_norm": 38.828033447265625, |
|
"learning_rate": 8.0615e-06, |
|
"loss": 0.8425, |
|
"step": 19400 |
|
}, |
|
{ |
|
"epoch": 298.46153846153845, |
|
"eval_loss": 1.1076736450195312, |
|
"eval_runtime": 11.3645, |
|
"eval_samples_per_second": 11.527, |
|
"eval_steps_per_second": 1.496, |
|
"step": 19400 |
|
}, |
|
{ |
|
"epoch": 300.0, |
|
"grad_norm": 18.97437858581543, |
|
"learning_rate": 8.0515e-06, |
|
"loss": 0.8934, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 300.0, |
|
"eval_loss": 1.110048532485962, |
|
"eval_runtime": 11.1778, |
|
"eval_samples_per_second": 11.72, |
|
"eval_steps_per_second": 1.521, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 301.53846153846155, |
|
"grad_norm": 49.44120407104492, |
|
"learning_rate": 8.041500000000001e-06, |
|
"loss": 0.8694, |
|
"step": 19600 |
|
}, |
|
{ |
|
"epoch": 301.53846153846155, |
|
"eval_loss": 1.1411337852478027, |
|
"eval_runtime": 11.4024, |
|
"eval_samples_per_second": 11.489, |
|
"eval_steps_per_second": 1.491, |
|
"step": 19600 |
|
}, |
|
{ |
|
"epoch": 303.0769230769231, |
|
"grad_norm": 32.77436828613281, |
|
"learning_rate": 8.031500000000001e-06, |
|
"loss": 0.8876, |
|
"step": 19700 |
|
}, |
|
{ |
|
"epoch": 303.0769230769231, |
|
"eval_loss": 1.1274378299713135, |
|
"eval_runtime": 11.3743, |
|
"eval_samples_per_second": 11.517, |
|
"eval_steps_per_second": 1.495, |
|
"step": 19700 |
|
}, |
|
{ |
|
"epoch": 304.61538461538464, |
|
"grad_norm": 33.37275695800781, |
|
"learning_rate": 8.021500000000001e-06, |
|
"loss": 0.839, |
|
"step": 19800 |
|
}, |
|
{ |
|
"epoch": 304.61538461538464, |
|
"eval_loss": 1.1159642934799194, |
|
"eval_runtime": 11.2947, |
|
"eval_samples_per_second": 11.598, |
|
"eval_steps_per_second": 1.505, |
|
"step": 19800 |
|
}, |
|
{ |
|
"epoch": 306.15384615384613, |
|
"grad_norm": 15.981072425842285, |
|
"learning_rate": 8.011500000000001e-06, |
|
"loss": 0.8796, |
|
"step": 19900 |
|
}, |
|
{ |
|
"epoch": 306.15384615384613, |
|
"eval_loss": 1.1134443283081055, |
|
"eval_runtime": 11.0923, |
|
"eval_samples_per_second": 11.81, |
|
"eval_steps_per_second": 1.533, |
|
"step": 19900 |
|
}, |
|
{ |
|
"epoch": 307.6923076923077, |
|
"grad_norm": 33.69294738769531, |
|
"learning_rate": 8.001500000000001e-06, |
|
"loss": 0.8757, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 307.6923076923077, |
|
"eval_loss": 1.1174206733703613, |
|
"eval_runtime": 11.1824, |
|
"eval_samples_per_second": 11.715, |
|
"eval_steps_per_second": 1.52, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 309.2307692307692, |
|
"grad_norm": 39.100887298583984, |
|
"learning_rate": 7.991500000000001e-06, |
|
"loss": 0.9037, |
|
"step": 20100 |
|
}, |
|
{ |
|
"epoch": 309.2307692307692, |
|
"eval_loss": 1.1444469690322876, |
|
"eval_runtime": 11.2517, |
|
"eval_samples_per_second": 11.643, |
|
"eval_steps_per_second": 1.511, |
|
"step": 20100 |
|
}, |
|
{ |
|
"epoch": 310.7692307692308, |
|
"grad_norm": 34.204410552978516, |
|
"learning_rate": 7.981500000000001e-06, |
|
"loss": 0.8714, |
|
"step": 20200 |
|
}, |
|
{ |
|
"epoch": 310.7692307692308, |
|
"eval_loss": 1.113229751586914, |
|
"eval_runtime": 11.1634, |
|
"eval_samples_per_second": 11.735, |
|
"eval_steps_per_second": 1.523, |
|
"step": 20200 |
|
}, |
|
{ |
|
"epoch": 312.3076923076923, |
|
"grad_norm": 34.093692779541016, |
|
"learning_rate": 7.971500000000002e-06, |
|
"loss": 0.8952, |
|
"step": 20300 |
|
}, |
|
{ |
|
"epoch": 312.3076923076923, |
|
"eval_loss": 1.132067084312439, |
|
"eval_runtime": 11.0613, |
|
"eval_samples_per_second": 11.843, |
|
"eval_steps_per_second": 1.537, |
|
"step": 20300 |
|
}, |
|
{ |
|
"epoch": 313.84615384615387, |
|
"grad_norm": 33.44735336303711, |
|
"learning_rate": 7.961500000000002e-06, |
|
"loss": 0.8545, |
|
"step": 20400 |
|
}, |
|
{ |
|
"epoch": 313.84615384615387, |
|
"eval_loss": 1.102783441543579, |
|
"eval_runtime": 10.9986, |
|
"eval_samples_per_second": 11.911, |
|
"eval_steps_per_second": 1.546, |
|
"step": 20400 |
|
}, |
|
{ |
|
"epoch": 315.38461538461536, |
|
"grad_norm": 21.25609588623047, |
|
"learning_rate": 7.9516e-06, |
|
"loss": 0.896, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 315.38461538461536, |
|
"eval_loss": 1.1276897192001343, |
|
"eval_runtime": 11.0387, |
|
"eval_samples_per_second": 11.867, |
|
"eval_steps_per_second": 1.54, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 316.9230769230769, |
|
"grad_norm": 42.066017150878906, |
|
"learning_rate": 7.9416e-06, |
|
"loss": 0.8545, |
|
"step": 20600 |
|
}, |
|
{ |
|
"epoch": 316.9230769230769, |
|
"eval_loss": 1.1395957469940186, |
|
"eval_runtime": 11.2855, |
|
"eval_samples_per_second": 11.608, |
|
"eval_steps_per_second": 1.506, |
|
"step": 20600 |
|
}, |
|
{ |
|
"epoch": 318.46153846153845, |
|
"grad_norm": 22.6724796295166, |
|
"learning_rate": 7.9316e-06, |
|
"loss": 0.8838, |
|
"step": 20700 |
|
}, |
|
{ |
|
"epoch": 318.46153846153845, |
|
"eval_loss": 1.1254605054855347, |
|
"eval_runtime": 11.2226, |
|
"eval_samples_per_second": 11.673, |
|
"eval_steps_per_second": 1.515, |
|
"step": 20700 |
|
}, |
|
{ |
|
"epoch": 320.0, |
|
"grad_norm": 31.141693115234375, |
|
"learning_rate": 7.9216e-06, |
|
"loss": 0.8704, |
|
"step": 20800 |
|
}, |
|
{ |
|
"epoch": 320.0, |
|
"eval_loss": 1.1148459911346436, |
|
"eval_runtime": 11.3548, |
|
"eval_samples_per_second": 11.537, |
|
"eval_steps_per_second": 1.497, |
|
"step": 20800 |
|
}, |
|
{ |
|
"epoch": 321.53846153846155, |
|
"grad_norm": 53.709205627441406, |
|
"learning_rate": 7.9116e-06, |
|
"loss": 0.8571, |
|
"step": 20900 |
|
}, |
|
{ |
|
"epoch": 321.53846153846155, |
|
"eval_loss": 1.1235041618347168, |
|
"eval_runtime": 11.3363, |
|
"eval_samples_per_second": 11.556, |
|
"eval_steps_per_second": 1.5, |
|
"step": 20900 |
|
}, |
|
{ |
|
"epoch": 323.0769230769231, |
|
"grad_norm": 36.34547805786133, |
|
"learning_rate": 7.9016e-06, |
|
"loss": 0.8749, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 323.0769230769231, |
|
"eval_loss": 1.1102306842803955, |
|
"eval_runtime": 11.2154, |
|
"eval_samples_per_second": 11.68, |
|
"eval_steps_per_second": 1.516, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 324.61538461538464, |
|
"grad_norm": 19.974227905273438, |
|
"learning_rate": 7.8916e-06, |
|
"loss": 0.8572, |
|
"step": 21100 |
|
}, |
|
{ |
|
"epoch": 324.61538461538464, |
|
"eval_loss": 1.1081122159957886, |
|
"eval_runtime": 11.2322, |
|
"eval_samples_per_second": 11.663, |
|
"eval_steps_per_second": 1.513, |
|
"step": 21100 |
|
}, |
|
{ |
|
"epoch": 326.15384615384613, |
|
"grad_norm": 31.686569213867188, |
|
"learning_rate": 7.881600000000001e-06, |
|
"loss": 0.8689, |
|
"step": 21200 |
|
}, |
|
{ |
|
"epoch": 326.15384615384613, |
|
"eval_loss": 1.1191128492355347, |
|
"eval_runtime": 11.4469, |
|
"eval_samples_per_second": 11.444, |
|
"eval_steps_per_second": 1.485, |
|
"step": 21200 |
|
}, |
|
{ |
|
"epoch": 327.6923076923077, |
|
"grad_norm": 35.97456359863281, |
|
"learning_rate": 7.871600000000001e-06, |
|
"loss": 0.8421, |
|
"step": 21300 |
|
}, |
|
{ |
|
"epoch": 327.6923076923077, |
|
"eval_loss": 1.1023869514465332, |
|
"eval_runtime": 11.2157, |
|
"eval_samples_per_second": 11.68, |
|
"eval_steps_per_second": 1.516, |
|
"step": 21300 |
|
}, |
|
{ |
|
"epoch": 329.2307692307692, |
|
"grad_norm": 34.61111068725586, |
|
"learning_rate": 7.861600000000001e-06, |
|
"loss": 0.8546, |
|
"step": 21400 |
|
}, |
|
{ |
|
"epoch": 329.2307692307692, |
|
"eval_loss": 1.1294784545898438, |
|
"eval_runtime": 11.1536, |
|
"eval_samples_per_second": 11.745, |
|
"eval_steps_per_second": 1.524, |
|
"step": 21400 |
|
}, |
|
{ |
|
"epoch": 330.7692307692308, |
|
"grad_norm": 21.682897567749023, |
|
"learning_rate": 7.8516e-06, |
|
"loss": 0.828, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 330.7692307692308, |
|
"eval_loss": 1.101345181465149, |
|
"eval_runtime": 11.0361, |
|
"eval_samples_per_second": 11.87, |
|
"eval_steps_per_second": 1.54, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 332.3076923076923, |
|
"grad_norm": 31.514686584472656, |
|
"learning_rate": 7.8416e-06, |
|
"loss": 0.8762, |
|
"step": 21600 |
|
}, |
|
{ |
|
"epoch": 332.3076923076923, |
|
"eval_loss": 1.1125372648239136, |
|
"eval_runtime": 11.2868, |
|
"eval_samples_per_second": 11.607, |
|
"eval_steps_per_second": 1.506, |
|
"step": 21600 |
|
}, |
|
{ |
|
"epoch": 333.84615384615387, |
|
"grad_norm": 34.57918167114258, |
|
"learning_rate": 7.8316e-06, |
|
"loss": 0.8368, |
|
"step": 21700 |
|
}, |
|
{ |
|
"epoch": 333.84615384615387, |
|
"eval_loss": 1.1226669549942017, |
|
"eval_runtime": 11.4486, |
|
"eval_samples_per_second": 11.442, |
|
"eval_steps_per_second": 1.485, |
|
"step": 21700 |
|
}, |
|
{ |
|
"epoch": 335.38461538461536, |
|
"grad_norm": 57.26470947265625, |
|
"learning_rate": 7.8216e-06, |
|
"loss": 0.8675, |
|
"step": 21800 |
|
}, |
|
{ |
|
"epoch": 335.38461538461536, |
|
"eval_loss": 1.1287323236465454, |
|
"eval_runtime": 11.197, |
|
"eval_samples_per_second": 11.7, |
|
"eval_steps_per_second": 1.518, |
|
"step": 21800 |
|
}, |
|
{ |
|
"epoch": 336.9230769230769, |
|
"grad_norm": 26.583576202392578, |
|
"learning_rate": 7.8116e-06, |
|
"loss": 0.8555, |
|
"step": 21900 |
|
}, |
|
{ |
|
"epoch": 336.9230769230769, |
|
"eval_loss": 1.1035945415496826, |
|
"eval_runtime": 11.2619, |
|
"eval_samples_per_second": 11.632, |
|
"eval_steps_per_second": 1.51, |
|
"step": 21900 |
|
}, |
|
{ |
|
"epoch": 338.46153846153845, |
|
"grad_norm": 44.196590423583984, |
|
"learning_rate": 7.8016e-06, |
|
"loss": 0.8601, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 338.46153846153845, |
|
"eval_loss": 1.09674870967865, |
|
"eval_runtime": 11.2173, |
|
"eval_samples_per_second": 11.678, |
|
"eval_steps_per_second": 1.516, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 340.0, |
|
"grad_norm": 34.388145446777344, |
|
"learning_rate": 7.7916e-06, |
|
"loss": 0.8347, |
|
"step": 22100 |
|
}, |
|
{ |
|
"epoch": 340.0, |
|
"eval_loss": 1.100284218788147, |
|
"eval_runtime": 11.2123, |
|
"eval_samples_per_second": 11.684, |
|
"eval_steps_per_second": 1.516, |
|
"step": 22100 |
|
}, |
|
{ |
|
"epoch": 341.53846153846155, |
|
"grad_norm": 33.55098342895508, |
|
"learning_rate": 7.7816e-06, |
|
"loss": 0.8429, |
|
"step": 22200 |
|
}, |
|
{ |
|
"epoch": 341.53846153846155, |
|
"eval_loss": 1.1070573329925537, |
|
"eval_runtime": 11.1389, |
|
"eval_samples_per_second": 11.761, |
|
"eval_steps_per_second": 1.526, |
|
"step": 22200 |
|
}, |
|
{ |
|
"epoch": 343.0769230769231, |
|
"grad_norm": 21.367860794067383, |
|
"learning_rate": 7.7716e-06, |
|
"loss": 0.827, |
|
"step": 22300 |
|
}, |
|
{ |
|
"epoch": 343.0769230769231, |
|
"eval_loss": 1.1184412240982056, |
|
"eval_runtime": 11.2313, |
|
"eval_samples_per_second": 11.664, |
|
"eval_steps_per_second": 1.514, |
|
"step": 22300 |
|
}, |
|
{ |
|
"epoch": 344.61538461538464, |
|
"grad_norm": 32.62434387207031, |
|
"learning_rate": 7.7616e-06, |
|
"loss": 0.8566, |
|
"step": 22400 |
|
}, |
|
{ |
|
"epoch": 344.61538461538464, |
|
"eval_loss": 1.1258198022842407, |
|
"eval_runtime": 11.1788, |
|
"eval_samples_per_second": 11.719, |
|
"eval_steps_per_second": 1.521, |
|
"step": 22400 |
|
}, |
|
{ |
|
"epoch": 346.15384615384613, |
|
"grad_norm": 44.411659240722656, |
|
"learning_rate": 7.7516e-06, |
|
"loss": 0.8666, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 346.15384615384613, |
|
"eval_loss": 1.121845006942749, |
|
"eval_runtime": 11.1251, |
|
"eval_samples_per_second": 11.775, |
|
"eval_steps_per_second": 1.528, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 347.6923076923077, |
|
"grad_norm": 41.10401153564453, |
|
"learning_rate": 7.7416e-06, |
|
"loss": 0.8354, |
|
"step": 22600 |
|
}, |
|
{ |
|
"epoch": 347.6923076923077, |
|
"eval_loss": 1.1154077053070068, |
|
"eval_runtime": 11.161, |
|
"eval_samples_per_second": 11.737, |
|
"eval_steps_per_second": 1.523, |
|
"step": 22600 |
|
}, |
|
{ |
|
"epoch": 349.2307692307692, |
|
"grad_norm": 31.414745330810547, |
|
"learning_rate": 7.7316e-06, |
|
"loss": 0.8321, |
|
"step": 22700 |
|
}, |
|
{ |
|
"epoch": 349.2307692307692, |
|
"eval_loss": 1.1233044862747192, |
|
"eval_runtime": 11.0203, |
|
"eval_samples_per_second": 11.887, |
|
"eval_steps_per_second": 1.543, |
|
"step": 22700 |
|
}, |
|
{ |
|
"epoch": 350.7692307692308, |
|
"grad_norm": 37.39424133300781, |
|
"learning_rate": 7.7216e-06, |
|
"loss": 0.8384, |
|
"step": 22800 |
|
}, |
|
{ |
|
"epoch": 350.7692307692308, |
|
"eval_loss": 1.1059685945510864, |
|
"eval_runtime": 10.9825, |
|
"eval_samples_per_second": 11.928, |
|
"eval_steps_per_second": 1.548, |
|
"step": 22800 |
|
}, |
|
{ |
|
"epoch": 352.3076923076923, |
|
"grad_norm": 24.75067901611328, |
|
"learning_rate": 7.7116e-06, |
|
"loss": 0.8216, |
|
"step": 22900 |
|
}, |
|
{ |
|
"epoch": 352.3076923076923, |
|
"eval_loss": 1.1147040128707886, |
|
"eval_runtime": 11.1066, |
|
"eval_samples_per_second": 11.795, |
|
"eval_steps_per_second": 1.531, |
|
"step": 22900 |
|
}, |
|
{ |
|
"epoch": 353.84615384615387, |
|
"grad_norm": 21.775754928588867, |
|
"learning_rate": 7.701600000000001e-06, |
|
"loss": 0.8562, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 353.84615384615387, |
|
"eval_loss": 1.1105334758758545, |
|
"eval_runtime": 11.0277, |
|
"eval_samples_per_second": 11.879, |
|
"eval_steps_per_second": 1.542, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 355.38461538461536, |
|
"grad_norm": 33.22850799560547, |
|
"learning_rate": 7.6916e-06, |
|
"loss": 0.813, |
|
"step": 23100 |
|
}, |
|
{ |
|
"epoch": 355.38461538461536, |
|
"eval_loss": 1.113241195678711, |
|
"eval_runtime": 10.9954, |
|
"eval_samples_per_second": 11.914, |
|
"eval_steps_per_second": 1.546, |
|
"step": 23100 |
|
}, |
|
{ |
|
"epoch": 356.9230769230769, |
|
"grad_norm": 31.402652740478516, |
|
"learning_rate": 7.6816e-06, |
|
"loss": 0.8527, |
|
"step": 23200 |
|
}, |
|
{ |
|
"epoch": 356.9230769230769, |
|
"eval_loss": 1.1040674448013306, |
|
"eval_runtime": 11.0415, |
|
"eval_samples_per_second": 11.864, |
|
"eval_steps_per_second": 1.54, |
|
"step": 23200 |
|
}, |
|
{ |
|
"epoch": 358.46153846153845, |
|
"grad_norm": 106.14508819580078, |
|
"learning_rate": 7.6716e-06, |
|
"loss": 0.8223, |
|
"step": 23300 |
|
}, |
|
{ |
|
"epoch": 358.46153846153845, |
|
"eval_loss": 1.1121721267700195, |
|
"eval_runtime": 11.1579, |
|
"eval_samples_per_second": 11.741, |
|
"eval_steps_per_second": 1.524, |
|
"step": 23300 |
|
}, |
|
{ |
|
"epoch": 360.0, |
|
"grad_norm": 32.76395797729492, |
|
"learning_rate": 7.6616e-06, |
|
"loss": 0.8199, |
|
"step": 23400 |
|
}, |
|
{ |
|
"epoch": 360.0, |
|
"eval_loss": 1.109994649887085, |
|
"eval_runtime": 11.0068, |
|
"eval_samples_per_second": 11.902, |
|
"eval_steps_per_second": 1.545, |
|
"step": 23400 |
|
}, |
|
{ |
|
"epoch": 361.53846153846155, |
|
"grad_norm": 44.64813232421875, |
|
"learning_rate": 7.6516e-06, |
|
"loss": 0.8566, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 361.53846153846155, |
|
"eval_loss": 1.1383651494979858, |
|
"eval_runtime": 11.1866, |
|
"eval_samples_per_second": 11.71, |
|
"eval_steps_per_second": 1.52, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 363.0769230769231, |
|
"grad_norm": 20.096393585205078, |
|
"learning_rate": 7.6416e-06, |
|
"loss": 0.795, |
|
"step": 23600 |
|
}, |
|
{ |
|
"epoch": 363.0769230769231, |
|
"eval_loss": 1.1200674772262573, |
|
"eval_runtime": 11.289, |
|
"eval_samples_per_second": 11.604, |
|
"eval_steps_per_second": 1.506, |
|
"step": 23600 |
|
}, |
|
{ |
|
"epoch": 364.61538461538464, |
|
"grad_norm": 30.219837188720703, |
|
"learning_rate": 7.6316e-06, |
|
"loss": 0.8413, |
|
"step": 23700 |
|
}, |
|
{ |
|
"epoch": 364.61538461538464, |
|
"eval_loss": 1.111639380455017, |
|
"eval_runtime": 11.2547, |
|
"eval_samples_per_second": 11.64, |
|
"eval_steps_per_second": 1.51, |
|
"step": 23700 |
|
}, |
|
{ |
|
"epoch": 366.15384615384613, |
|
"grad_norm": 54.91815185546875, |
|
"learning_rate": 7.6216e-06, |
|
"loss": 0.8284, |
|
"step": 23800 |
|
}, |
|
{ |
|
"epoch": 366.15384615384613, |
|
"eval_loss": 1.1330946683883667, |
|
"eval_runtime": 11.3054, |
|
"eval_samples_per_second": 11.587, |
|
"eval_steps_per_second": 1.504, |
|
"step": 23800 |
|
}, |
|
{ |
|
"epoch": 367.6923076923077, |
|
"grad_norm": 40.97207260131836, |
|
"learning_rate": 7.6116e-06, |
|
"loss": 0.8387, |
|
"step": 23900 |
|
}, |
|
{ |
|
"epoch": 367.6923076923077, |
|
"eval_loss": 1.1550222635269165, |
|
"eval_runtime": 11.2372, |
|
"eval_samples_per_second": 11.658, |
|
"eval_steps_per_second": 1.513, |
|
"step": 23900 |
|
}, |
|
{ |
|
"epoch": 369.2307692307692, |
|
"grad_norm": 36.580745697021484, |
|
"learning_rate": 7.6017e-06, |
|
"loss": 0.829, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 369.2307692307692, |
|
"eval_loss": 1.1261051893234253, |
|
"eval_runtime": 11.1834, |
|
"eval_samples_per_second": 11.714, |
|
"eval_steps_per_second": 1.52, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 370.7692307692308, |
|
"grad_norm": 46.304649353027344, |
|
"learning_rate": 7.5917000000000005e-06, |
|
"loss": 0.8316, |
|
"step": 24100 |
|
}, |
|
{ |
|
"epoch": 370.7692307692308, |
|
"eval_loss": 1.1410084962844849, |
|
"eval_runtime": 11.3511, |
|
"eval_samples_per_second": 11.541, |
|
"eval_steps_per_second": 1.498, |
|
"step": 24100 |
|
}, |
|
{ |
|
"epoch": 372.3076923076923, |
|
"grad_norm": 32.472293853759766, |
|
"learning_rate": 7.581700000000001e-06, |
|
"loss": 0.8309, |
|
"step": 24200 |
|
}, |
|
{ |
|
"epoch": 372.3076923076923, |
|
"eval_loss": 1.1283127069473267, |
|
"eval_runtime": 11.1949, |
|
"eval_samples_per_second": 11.702, |
|
"eval_steps_per_second": 1.519, |
|
"step": 24200 |
|
}, |
|
{ |
|
"epoch": 373.84615384615387, |
|
"grad_norm": 24.16025161743164, |
|
"learning_rate": 7.571700000000001e-06, |
|
"loss": 0.8154, |
|
"step": 24300 |
|
}, |
|
{ |
|
"epoch": 373.84615384615387, |
|
"eval_loss": 1.1386806964874268, |
|
"eval_runtime": 11.1089, |
|
"eval_samples_per_second": 11.792, |
|
"eval_steps_per_second": 1.53, |
|
"step": 24300 |
|
}, |
|
{ |
|
"epoch": 375.38461538461536, |
|
"grad_norm": 16.86387825012207, |
|
"learning_rate": 7.561700000000001e-06, |
|
"loss": 0.8221, |
|
"step": 24400 |
|
}, |
|
{ |
|
"epoch": 375.38461538461536, |
|
"eval_loss": 1.09730863571167, |
|
"eval_runtime": 11.1299, |
|
"eval_samples_per_second": 11.77, |
|
"eval_steps_per_second": 1.527, |
|
"step": 24400 |
|
}, |
|
{ |
|
"epoch": 376.9230769230769, |
|
"grad_norm": 34.62860107421875, |
|
"learning_rate": 7.551700000000001e-06, |
|
"loss": 0.8106, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 376.9230769230769, |
|
"eval_loss": 1.1233881711959839, |
|
"eval_runtime": 11.0352, |
|
"eval_samples_per_second": 11.871, |
|
"eval_steps_per_second": 1.541, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 378.46153846153845, |
|
"grad_norm": 16.957393646240234, |
|
"learning_rate": 7.541700000000001e-06, |
|
"loss": 0.8362, |
|
"step": 24600 |
|
}, |
|
{ |
|
"epoch": 378.46153846153845, |
|
"eval_loss": 1.1209690570831299, |
|
"eval_runtime": 11.0555, |
|
"eval_samples_per_second": 11.849, |
|
"eval_steps_per_second": 1.538, |
|
"step": 24600 |
|
}, |
|
{ |
|
"epoch": 380.0, |
|
"grad_norm": 21.519577026367188, |
|
"learning_rate": 7.531700000000001e-06, |
|
"loss": 0.8139, |
|
"step": 24700 |
|
}, |
|
{ |
|
"epoch": 380.0, |
|
"eval_loss": 1.1106966733932495, |
|
"eval_runtime": 11.0492, |
|
"eval_samples_per_second": 11.856, |
|
"eval_steps_per_second": 1.539, |
|
"step": 24700 |
|
}, |
|
{ |
|
"epoch": 381.53846153846155, |
|
"grad_norm": 26.86697769165039, |
|
"learning_rate": 7.5217e-06, |
|
"loss": 0.8036, |
|
"step": 24800 |
|
}, |
|
{ |
|
"epoch": 381.53846153846155, |
|
"eval_loss": 1.1153205633163452, |
|
"eval_runtime": 11.1284, |
|
"eval_samples_per_second": 11.772, |
|
"eval_steps_per_second": 1.528, |
|
"step": 24800 |
|
}, |
|
{ |
|
"epoch": 383.0769230769231, |
|
"grad_norm": 21.1047420501709, |
|
"learning_rate": 7.5117000000000004e-06, |
|
"loss": 0.8115, |
|
"step": 24900 |
|
}, |
|
{ |
|
"epoch": 383.0769230769231, |
|
"eval_loss": 1.116546630859375, |
|
"eval_runtime": 11.4261, |
|
"eval_samples_per_second": 11.465, |
|
"eval_steps_per_second": 1.488, |
|
"step": 24900 |
|
}, |
|
{ |
|
"epoch": 384.61538461538464, |
|
"grad_norm": 40.87736892700195, |
|
"learning_rate": 7.5017000000000005e-06, |
|
"loss": 0.8146, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 384.61538461538464, |
|
"eval_loss": 1.1162028312683105, |
|
"eval_runtime": 11.3423, |
|
"eval_samples_per_second": 11.55, |
|
"eval_steps_per_second": 1.499, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 386.15384615384613, |
|
"grad_norm": 19.28094482421875, |
|
"learning_rate": 7.491700000000001e-06, |
|
"loss": 0.8114, |
|
"step": 25100 |
|
}, |
|
{ |
|
"epoch": 386.15384615384613, |
|
"eval_loss": 1.106558918952942, |
|
"eval_runtime": 11.0618, |
|
"eval_samples_per_second": 11.843, |
|
"eval_steps_per_second": 1.537, |
|
"step": 25100 |
|
}, |
|
{ |
|
"epoch": 387.6923076923077, |
|
"grad_norm": 19.689420700073242, |
|
"learning_rate": 7.481700000000001e-06, |
|
"loss": 0.8076, |
|
"step": 25200 |
|
}, |
|
{ |
|
"epoch": 387.6923076923077, |
|
"eval_loss": 1.1141376495361328, |
|
"eval_runtime": 11.0491, |
|
"eval_samples_per_second": 11.856, |
|
"eval_steps_per_second": 1.539, |
|
"step": 25200 |
|
}, |
|
{ |
|
"epoch": 389.2307692307692, |
|
"grad_norm": 22.76107406616211, |
|
"learning_rate": 7.471700000000001e-06, |
|
"loss": 0.8077, |
|
"step": 25300 |
|
}, |
|
{ |
|
"epoch": 389.2307692307692, |
|
"eval_loss": 1.1186425685882568, |
|
"eval_runtime": 11.1798, |
|
"eval_samples_per_second": 11.718, |
|
"eval_steps_per_second": 1.521, |
|
"step": 25300 |
|
}, |
|
{ |
|
"epoch": 390.7692307692308, |
|
"grad_norm": 40.23360061645508, |
|
"learning_rate": 7.461700000000001e-06, |
|
"loss": 0.7924, |
|
"step": 25400 |
|
}, |
|
{ |
|
"epoch": 390.7692307692308, |
|
"eval_loss": 1.1189498901367188, |
|
"eval_runtime": 11.1824, |
|
"eval_samples_per_second": 11.715, |
|
"eval_steps_per_second": 1.52, |
|
"step": 25400 |
|
}, |
|
{ |
|
"epoch": 392.3076923076923, |
|
"grad_norm": 32.873207092285156, |
|
"learning_rate": 7.451700000000001e-06, |
|
"loss": 0.8335, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 392.3076923076923, |
|
"eval_loss": 1.1451479196548462, |
|
"eval_runtime": 11.1543, |
|
"eval_samples_per_second": 11.744, |
|
"eval_steps_per_second": 1.524, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 393.84615384615387, |
|
"grad_norm": 27.307552337646484, |
|
"learning_rate": 7.4417e-06, |
|
"loss": 0.7926, |
|
"step": 25600 |
|
}, |
|
{ |
|
"epoch": 393.84615384615387, |
|
"eval_loss": 1.1189228296279907, |
|
"eval_runtime": 11.0875, |
|
"eval_samples_per_second": 11.815, |
|
"eval_steps_per_second": 1.533, |
|
"step": 25600 |
|
}, |
|
{ |
|
"epoch": 395.38461538461536, |
|
"grad_norm": 22.984905242919922, |
|
"learning_rate": 7.4317e-06, |
|
"loss": 0.8039, |
|
"step": 25700 |
|
}, |
|
{ |
|
"epoch": 395.38461538461536, |
|
"eval_loss": 1.1381311416625977, |
|
"eval_runtime": 11.1226, |
|
"eval_samples_per_second": 11.778, |
|
"eval_steps_per_second": 1.528, |
|
"step": 25700 |
|
}, |
|
{ |
|
"epoch": 396.9230769230769, |
|
"grad_norm": 43.75572967529297, |
|
"learning_rate": 7.4217000000000004e-06, |
|
"loss": 0.8426, |
|
"step": 25800 |
|
}, |
|
{ |
|
"epoch": 396.9230769230769, |
|
"eval_loss": 1.1005278825759888, |
|
"eval_runtime": 10.9018, |
|
"eval_samples_per_second": 12.016, |
|
"eval_steps_per_second": 1.559, |
|
"step": 25800 |
|
}, |
|
{ |
|
"epoch": 398.46153846153845, |
|
"grad_norm": 45.551212310791016, |
|
"learning_rate": 7.4117000000000005e-06, |
|
"loss": 0.7918, |
|
"step": 25900 |
|
}, |
|
{ |
|
"epoch": 398.46153846153845, |
|
"eval_loss": 1.1017777919769287, |
|
"eval_runtime": 10.8997, |
|
"eval_samples_per_second": 12.019, |
|
"eval_steps_per_second": 1.56, |
|
"step": 25900 |
|
}, |
|
{ |
|
"epoch": 400.0, |
|
"grad_norm": 27.237998962402344, |
|
"learning_rate": 7.401700000000001e-06, |
|
"loss": 0.825, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 400.0, |
|
"eval_loss": 1.1006879806518555, |
|
"eval_runtime": 10.8272, |
|
"eval_samples_per_second": 12.099, |
|
"eval_steps_per_second": 1.57, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 401.53846153846155, |
|
"grad_norm": 20.51366424560547, |
|
"learning_rate": 7.391700000000001e-06, |
|
"loss": 0.8069, |
|
"step": 26100 |
|
}, |
|
{ |
|
"epoch": 401.53846153846155, |
|
"eval_loss": 1.121657371520996, |
|
"eval_runtime": 10.956, |
|
"eval_samples_per_second": 11.957, |
|
"eval_steps_per_second": 1.552, |
|
"step": 26100 |
|
}, |
|
{ |
|
"epoch": 403.0769230769231, |
|
"grad_norm": 38.978363037109375, |
|
"learning_rate": 7.381700000000001e-06, |
|
"loss": 0.8105, |
|
"step": 26200 |
|
}, |
|
{ |
|
"epoch": 403.0769230769231, |
|
"eval_loss": 1.1141672134399414, |
|
"eval_runtime": 10.8817, |
|
"eval_samples_per_second": 12.039, |
|
"eval_steps_per_second": 1.562, |
|
"step": 26200 |
|
}, |
|
{ |
|
"epoch": 404.61538461538464, |
|
"grad_norm": 40.20176315307617, |
|
"learning_rate": 7.371700000000001e-06, |
|
"loss": 0.7953, |
|
"step": 26300 |
|
}, |
|
{ |
|
"epoch": 404.61538461538464, |
|
"eval_loss": 1.1042087078094482, |
|
"eval_runtime": 10.8826, |
|
"eval_samples_per_second": 12.038, |
|
"eval_steps_per_second": 1.562, |
|
"step": 26300 |
|
}, |
|
{ |
|
"epoch": 406.15384615384613, |
|
"grad_norm": 27.35833168029785, |
|
"learning_rate": 7.3617e-06, |
|
"loss": 0.8075, |
|
"step": 26400 |
|
}, |
|
{ |
|
"epoch": 406.15384615384613, |
|
"eval_loss": 1.1115680932998657, |
|
"eval_runtime": 10.9465, |
|
"eval_samples_per_second": 11.967, |
|
"eval_steps_per_second": 1.553, |
|
"step": 26400 |
|
}, |
|
{ |
|
"epoch": 407.6923076923077, |
|
"grad_norm": 32.054351806640625, |
|
"learning_rate": 7.3517e-06, |
|
"loss": 0.7958, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 407.6923076923077, |
|
"eval_loss": 1.1209728717803955, |
|
"eval_runtime": 10.8226, |
|
"eval_samples_per_second": 12.104, |
|
"eval_steps_per_second": 1.571, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 409.2307692307692, |
|
"grad_norm": 37.53184509277344, |
|
"learning_rate": 7.3417e-06, |
|
"loss": 0.7859, |
|
"step": 26600 |
|
}, |
|
{ |
|
"epoch": 409.2307692307692, |
|
"eval_loss": 1.0920478105545044, |
|
"eval_runtime": 10.8488, |
|
"eval_samples_per_second": 12.075, |
|
"eval_steps_per_second": 1.567, |
|
"step": 26600 |
|
}, |
|
{ |
|
"epoch": 410.7692307692308, |
|
"grad_norm": 39.949039459228516, |
|
"learning_rate": 7.3317000000000005e-06, |
|
"loss": 0.8, |
|
"step": 26700 |
|
}, |
|
{ |
|
"epoch": 410.7692307692308, |
|
"eval_loss": 1.0909216403961182, |
|
"eval_runtime": 10.9975, |
|
"eval_samples_per_second": 11.912, |
|
"eval_steps_per_second": 1.546, |
|
"step": 26700 |
|
}, |
|
{ |
|
"epoch": 412.3076923076923, |
|
"grad_norm": 28.89598846435547, |
|
"learning_rate": 7.3217000000000006e-06, |
|
"loss": 0.8168, |
|
"step": 26800 |
|
}, |
|
{ |
|
"epoch": 412.3076923076923, |
|
"eval_loss": 1.108217477798462, |
|
"eval_runtime": 10.9623, |
|
"eval_samples_per_second": 11.95, |
|
"eval_steps_per_second": 1.551, |
|
"step": 26800 |
|
}, |
|
{ |
|
"epoch": 413.84615384615387, |
|
"grad_norm": 33.56279754638672, |
|
"learning_rate": 7.311700000000001e-06, |
|
"loss": 0.7854, |
|
"step": 26900 |
|
}, |
|
{ |
|
"epoch": 413.84615384615387, |
|
"eval_loss": 1.1230065822601318, |
|
"eval_runtime": 10.8811, |
|
"eval_samples_per_second": 12.039, |
|
"eval_steps_per_second": 1.562, |
|
"step": 26900 |
|
}, |
|
{ |
|
"epoch": 415.38461538461536, |
|
"grad_norm": 26.578338623046875, |
|
"learning_rate": 7.301700000000001e-06, |
|
"loss": 0.7954, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 415.38461538461536, |
|
"eval_loss": 1.1114639043807983, |
|
"eval_runtime": 10.9172, |
|
"eval_samples_per_second": 11.999, |
|
"eval_steps_per_second": 1.557, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 416.9230769230769, |
|
"grad_norm": 31.6333065032959, |
|
"learning_rate": 7.291700000000001e-06, |
|
"loss": 0.8057, |
|
"step": 27100 |
|
}, |
|
{ |
|
"epoch": 416.9230769230769, |
|
"eval_loss": 1.1136469841003418, |
|
"eval_runtime": 10.835, |
|
"eval_samples_per_second": 12.09, |
|
"eval_steps_per_second": 1.569, |
|
"step": 27100 |
|
}, |
|
{ |
|
"epoch": 418.46153846153845, |
|
"grad_norm": 17.80211067199707, |
|
"learning_rate": 7.2817e-06, |
|
"loss": 0.8074, |
|
"step": 27200 |
|
}, |
|
{ |
|
"epoch": 418.46153846153845, |
|
"eval_loss": 1.0823259353637695, |
|
"eval_runtime": 11.1165, |
|
"eval_samples_per_second": 11.784, |
|
"eval_steps_per_second": 1.529, |
|
"step": 27200 |
|
}, |
|
{ |
|
"epoch": 420.0, |
|
"grad_norm": 50.79414367675781, |
|
"learning_rate": 7.2717e-06, |
|
"loss": 0.778, |
|
"step": 27300 |
|
}, |
|
{ |
|
"epoch": 420.0, |
|
"eval_loss": 1.0840650796890259, |
|
"eval_runtime": 10.9207, |
|
"eval_samples_per_second": 11.996, |
|
"eval_steps_per_second": 1.557, |
|
"step": 27300 |
|
}, |
|
{ |
|
"epoch": 421.53846153846155, |
|
"grad_norm": 27.64950942993164, |
|
"learning_rate": 7.2617e-06, |
|
"loss": 0.79, |
|
"step": 27400 |
|
}, |
|
{ |
|
"epoch": 421.53846153846155, |
|
"eval_loss": 1.1120820045471191, |
|
"eval_runtime": 10.8896, |
|
"eval_samples_per_second": 12.03, |
|
"eval_steps_per_second": 1.561, |
|
"step": 27400 |
|
}, |
|
{ |
|
"epoch": 423.0769230769231, |
|
"grad_norm": 28.115442276000977, |
|
"learning_rate": 7.2517e-06, |
|
"loss": 0.7927, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 423.0769230769231, |
|
"eval_loss": 1.0890986919403076, |
|
"eval_runtime": 10.8766, |
|
"eval_samples_per_second": 12.044, |
|
"eval_steps_per_second": 1.563, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 424.61538461538464, |
|
"grad_norm": 56.913578033447266, |
|
"learning_rate": 7.2417000000000005e-06, |
|
"loss": 0.7843, |
|
"step": 27600 |
|
}, |
|
{ |
|
"epoch": 424.61538461538464, |
|
"eval_loss": 1.0650087594985962, |
|
"eval_runtime": 10.8169, |
|
"eval_samples_per_second": 12.111, |
|
"eval_steps_per_second": 1.572, |
|
"step": 27600 |
|
}, |
|
{ |
|
"epoch": 426.15384615384613, |
|
"grad_norm": 28.091888427734375, |
|
"learning_rate": 7.231800000000001e-06, |
|
"loss": 0.8113, |
|
"step": 27700 |
|
}, |
|
{ |
|
"epoch": 426.15384615384613, |
|
"eval_loss": 1.103638768196106, |
|
"eval_runtime": 10.8897, |
|
"eval_samples_per_second": 12.03, |
|
"eval_steps_per_second": 1.561, |
|
"step": 27700 |
|
}, |
|
{ |
|
"epoch": 427.6923076923077, |
|
"grad_norm": 21.665067672729492, |
|
"learning_rate": 7.221800000000001e-06, |
|
"loss": 0.7803, |
|
"step": 27800 |
|
}, |
|
{ |
|
"epoch": 427.6923076923077, |
|
"eval_loss": 1.1119697093963623, |
|
"eval_runtime": 10.8141, |
|
"eval_samples_per_second": 12.114, |
|
"eval_steps_per_second": 1.572, |
|
"step": 27800 |
|
}, |
|
{ |
|
"epoch": 429.2307692307692, |
|
"grad_norm": 65.23661804199219, |
|
"learning_rate": 7.211800000000001e-06, |
|
"loss": 0.7963, |
|
"step": 27900 |
|
}, |
|
{ |
|
"epoch": 429.2307692307692, |
|
"eval_loss": 1.1217514276504517, |
|
"eval_runtime": 10.8793, |
|
"eval_samples_per_second": 12.041, |
|
"eval_steps_per_second": 1.563, |
|
"step": 27900 |
|
}, |
|
{ |
|
"epoch": 430.7692307692308, |
|
"grad_norm": 43.18936538696289, |
|
"learning_rate": 7.201800000000001e-06, |
|
"loss": 0.7976, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 430.7692307692308, |
|
"eval_loss": 1.1384552717208862, |
|
"eval_runtime": 10.8325, |
|
"eval_samples_per_second": 12.093, |
|
"eval_steps_per_second": 1.569, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 432.3076923076923, |
|
"grad_norm": 33.29544448852539, |
|
"learning_rate": 7.191800000000001e-06, |
|
"loss": 0.803, |
|
"step": 28100 |
|
}, |
|
{ |
|
"epoch": 432.3076923076923, |
|
"eval_loss": 1.1162676811218262, |
|
"eval_runtime": 10.8215, |
|
"eval_samples_per_second": 12.106, |
|
"eval_steps_per_second": 1.571, |
|
"step": 28100 |
|
}, |
|
{ |
|
"epoch": 433.84615384615387, |
|
"grad_norm": 22.86684799194336, |
|
"learning_rate": 7.181800000000001e-06, |
|
"loss": 0.8037, |
|
"step": 28200 |
|
}, |
|
{ |
|
"epoch": 433.84615384615387, |
|
"eval_loss": 1.1036708354949951, |
|
"eval_runtime": 10.8309, |
|
"eval_samples_per_second": 12.095, |
|
"eval_steps_per_second": 1.57, |
|
"step": 28200 |
|
}, |
|
{ |
|
"epoch": 435.38461538461536, |
|
"grad_norm": 21.229990005493164, |
|
"learning_rate": 7.171800000000001e-06, |
|
"loss": 0.7713, |
|
"step": 28300 |
|
}, |
|
{ |
|
"epoch": 435.38461538461536, |
|
"eval_loss": 1.0910826921463013, |
|
"eval_runtime": 10.8187, |
|
"eval_samples_per_second": 12.109, |
|
"eval_steps_per_second": 1.571, |
|
"step": 28300 |
|
}, |
|
{ |
|
"epoch": 436.9230769230769, |
|
"grad_norm": 33.30841064453125, |
|
"learning_rate": 7.161800000000001e-06, |
|
"loss": 0.7956, |
|
"step": 28400 |
|
}, |
|
{ |
|
"epoch": 436.9230769230769, |
|
"eval_loss": 1.0862958431243896, |
|
"eval_runtime": 10.8992, |
|
"eval_samples_per_second": 12.019, |
|
"eval_steps_per_second": 1.56, |
|
"step": 28400 |
|
}, |
|
{ |
|
"epoch": 438.46153846153845, |
|
"grad_norm": 24.01555633544922, |
|
"learning_rate": 7.151800000000001e-06, |
|
"loss": 0.7883, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 438.46153846153845, |
|
"eval_loss": 1.1144325733184814, |
|
"eval_runtime": 11.0935, |
|
"eval_samples_per_second": 11.809, |
|
"eval_steps_per_second": 1.532, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 440.0, |
|
"grad_norm": 43.260921478271484, |
|
"learning_rate": 7.141800000000001e-06, |
|
"loss": 0.7885, |
|
"step": 28600 |
|
}, |
|
{ |
|
"epoch": 440.0, |
|
"eval_loss": 1.0987013578414917, |
|
"eval_runtime": 10.803, |
|
"eval_samples_per_second": 12.126, |
|
"eval_steps_per_second": 1.574, |
|
"step": 28600 |
|
}, |
|
{ |
|
"epoch": 441.53846153846155, |
|
"grad_norm": 24.575450897216797, |
|
"learning_rate": 7.131800000000001e-06, |
|
"loss": 0.8052, |
|
"step": 28700 |
|
}, |
|
{ |
|
"epoch": 441.53846153846155, |
|
"eval_loss": 1.1236560344696045, |
|
"eval_runtime": 10.7671, |
|
"eval_samples_per_second": 12.167, |
|
"eval_steps_per_second": 1.579, |
|
"step": 28700 |
|
}, |
|
{ |
|
"epoch": 443.0769230769231, |
|
"grad_norm": 58.26254653930664, |
|
"learning_rate": 7.121800000000001e-06, |
|
"loss": 0.7856, |
|
"step": 28800 |
|
}, |
|
{ |
|
"epoch": 443.0769230769231, |
|
"eval_loss": 1.1264104843139648, |
|
"eval_runtime": 11.0135, |
|
"eval_samples_per_second": 11.894, |
|
"eval_steps_per_second": 1.544, |
|
"step": 28800 |
|
}, |
|
{ |
|
"epoch": 444.61538461538464, |
|
"grad_norm": 41.40846633911133, |
|
"learning_rate": 7.111800000000001e-06, |
|
"loss": 0.7923, |
|
"step": 28900 |
|
}, |
|
{ |
|
"epoch": 444.61538461538464, |
|
"eval_loss": 1.1042475700378418, |
|
"eval_runtime": 10.8929, |
|
"eval_samples_per_second": 12.026, |
|
"eval_steps_per_second": 1.561, |
|
"step": 28900 |
|
}, |
|
{ |
|
"epoch": 446.15384615384613, |
|
"grad_norm": 42.24570846557617, |
|
"learning_rate": 7.101800000000001e-06, |
|
"loss": 0.8239, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 446.15384615384613, |
|
"eval_loss": 1.0964312553405762, |
|
"eval_runtime": 11.2262, |
|
"eval_samples_per_second": 11.669, |
|
"eval_steps_per_second": 1.514, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 447.6923076923077, |
|
"grad_norm": 29.23143768310547, |
|
"learning_rate": 7.091800000000001e-06, |
|
"loss": 0.7698, |
|
"step": 29100 |
|
}, |
|
{ |
|
"epoch": 447.6923076923077, |
|
"eval_loss": 1.0975351333618164, |
|
"eval_runtime": 10.963, |
|
"eval_samples_per_second": 11.949, |
|
"eval_steps_per_second": 1.551, |
|
"step": 29100 |
|
}, |
|
{ |
|
"epoch": 449.2307692307692, |
|
"grad_norm": 50.939388275146484, |
|
"learning_rate": 7.0818000000000005e-06, |
|
"loss": 0.8042, |
|
"step": 29200 |
|
}, |
|
{ |
|
"epoch": 449.2307692307692, |
|
"eval_loss": 1.095157504081726, |
|
"eval_runtime": 10.842, |
|
"eval_samples_per_second": 12.083, |
|
"eval_steps_per_second": 1.568, |
|
"step": 29200 |
|
}, |
|
{ |
|
"epoch": 450.7692307692308, |
|
"grad_norm": 14.69772720336914, |
|
"learning_rate": 7.071800000000001e-06, |
|
"loss": 0.7854, |
|
"step": 29300 |
|
}, |
|
{ |
|
"epoch": 450.7692307692308, |
|
"eval_loss": 1.0933393239974976, |
|
"eval_runtime": 10.8659, |
|
"eval_samples_per_second": 12.056, |
|
"eval_steps_per_second": 1.565, |
|
"step": 29300 |
|
}, |
|
{ |
|
"epoch": 452.3076923076923, |
|
"grad_norm": 25.141313552856445, |
|
"learning_rate": 7.061800000000001e-06, |
|
"loss": 0.7794, |
|
"step": 29400 |
|
}, |
|
{ |
|
"epoch": 452.3076923076923, |
|
"eval_loss": 1.1168276071548462, |
|
"eval_runtime": 10.8764, |
|
"eval_samples_per_second": 12.044, |
|
"eval_steps_per_second": 1.563, |
|
"step": 29400 |
|
}, |
|
{ |
|
"epoch": 453.84615384615387, |
|
"grad_norm": 19.738088607788086, |
|
"learning_rate": 7.051800000000001e-06, |
|
"loss": 0.7703, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 453.84615384615387, |
|
"eval_loss": 1.1066123247146606, |
|
"eval_runtime": 10.8416, |
|
"eval_samples_per_second": 12.083, |
|
"eval_steps_per_second": 1.568, |
|
"step": 29500 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 100000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1539, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.1293756588781568e+20, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|