|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 169.23076923076923, |
|
"eval_steps": 100, |
|
"global_step": 11000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 1.5384615384615383, |
|
"grad_norm": 47.11796188354492, |
|
"learning_rate": 9.990900000000001e-06, |
|
"loss": 3.6644, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.5384615384615383, |
|
"eval_loss": 2.4919605255126953, |
|
"eval_runtime": 12.5517, |
|
"eval_samples_per_second": 10.437, |
|
"eval_steps_per_second": 1.354, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 3.076923076923077, |
|
"grad_norm": 31.28130531311035, |
|
"learning_rate": 9.980900000000001e-06, |
|
"loss": 2.2347, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 3.076923076923077, |
|
"eval_loss": 2.156316041946411, |
|
"eval_runtime": 11.1792, |
|
"eval_samples_per_second": 11.718, |
|
"eval_steps_per_second": 1.521, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 4.615384615384615, |
|
"grad_norm": 39.616390228271484, |
|
"learning_rate": 9.970900000000001e-06, |
|
"loss": 2.0254, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 4.615384615384615, |
|
"eval_loss": 2.024153709411621, |
|
"eval_runtime": 11.1934, |
|
"eval_samples_per_second": 11.703, |
|
"eval_steps_per_second": 1.519, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 6.153846153846154, |
|
"grad_norm": 28.285825729370117, |
|
"learning_rate": 9.960900000000001e-06, |
|
"loss": 1.9361, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 6.153846153846154, |
|
"eval_loss": 1.9094743728637695, |
|
"eval_runtime": 11.3855, |
|
"eval_samples_per_second": 11.506, |
|
"eval_steps_per_second": 1.493, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 7.6923076923076925, |
|
"grad_norm": 34.14302062988281, |
|
"learning_rate": 9.950900000000002e-06, |
|
"loss": 1.8531, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 7.6923076923076925, |
|
"eval_loss": 1.8729331493377686, |
|
"eval_runtime": 11.2935, |
|
"eval_samples_per_second": 11.6, |
|
"eval_steps_per_second": 1.505, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 9.23076923076923, |
|
"grad_norm": 39.09531784057617, |
|
"learning_rate": 9.940900000000002e-06, |
|
"loss": 1.7669, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 9.23076923076923, |
|
"eval_loss": 1.831756830215454, |
|
"eval_runtime": 11.0535, |
|
"eval_samples_per_second": 11.851, |
|
"eval_steps_per_second": 1.538, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 10.76923076923077, |
|
"grad_norm": 93.24444580078125, |
|
"learning_rate": 9.930900000000002e-06, |
|
"loss": 1.7518, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 10.76923076923077, |
|
"eval_loss": 1.7832175493240356, |
|
"eval_runtime": 11.1684, |
|
"eval_samples_per_second": 11.729, |
|
"eval_steps_per_second": 1.522, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 12.307692307692308, |
|
"grad_norm": 32.21013641357422, |
|
"learning_rate": 9.920900000000002e-06, |
|
"loss": 1.7149, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 12.307692307692308, |
|
"eval_loss": 1.7581098079681396, |
|
"eval_runtime": 11.101, |
|
"eval_samples_per_second": 11.801, |
|
"eval_steps_per_second": 1.531, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 13.846153846153847, |
|
"grad_norm": 59.90657043457031, |
|
"learning_rate": 9.9109e-06, |
|
"loss": 1.6734, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 13.846153846153847, |
|
"eval_loss": 1.7163844108581543, |
|
"eval_runtime": 11.1167, |
|
"eval_samples_per_second": 11.784, |
|
"eval_steps_per_second": 1.529, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 15.384615384615385, |
|
"grad_norm": 20.61592674255371, |
|
"learning_rate": 9.9009e-06, |
|
"loss": 1.6612, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 15.384615384615385, |
|
"eval_loss": 1.6949567794799805, |
|
"eval_runtime": 11.0663, |
|
"eval_samples_per_second": 11.838, |
|
"eval_steps_per_second": 1.536, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 16.923076923076923, |
|
"grad_norm": 17.60099220275879, |
|
"learning_rate": 9.8909e-06, |
|
"loss": 1.6199, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 16.923076923076923, |
|
"eval_loss": 1.6769332885742188, |
|
"eval_runtime": 11.0531, |
|
"eval_samples_per_second": 11.852, |
|
"eval_steps_per_second": 1.538, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 18.46153846153846, |
|
"grad_norm": 20.802692413330078, |
|
"learning_rate": 9.8809e-06, |
|
"loss": 1.6008, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 18.46153846153846, |
|
"eval_loss": 1.6524990797042847, |
|
"eval_runtime": 11.0831, |
|
"eval_samples_per_second": 11.82, |
|
"eval_steps_per_second": 1.534, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"grad_norm": 21.809823989868164, |
|
"learning_rate": 9.8709e-06, |
|
"loss": 1.5812, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_loss": 1.6428295373916626, |
|
"eval_runtime": 11.1093, |
|
"eval_samples_per_second": 11.792, |
|
"eval_steps_per_second": 1.53, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 21.53846153846154, |
|
"grad_norm": 46.8908576965332, |
|
"learning_rate": 9.8609e-06, |
|
"loss": 1.5419, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 21.53846153846154, |
|
"eval_loss": 1.6006404161453247, |
|
"eval_runtime": 11.2393, |
|
"eval_samples_per_second": 11.655, |
|
"eval_steps_per_second": 1.513, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 23.076923076923077, |
|
"grad_norm": 27.15238380432129, |
|
"learning_rate": 9.8509e-06, |
|
"loss": 1.5374, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 23.076923076923077, |
|
"eval_loss": 1.5862094163894653, |
|
"eval_runtime": 11.1815, |
|
"eval_samples_per_second": 11.716, |
|
"eval_steps_per_second": 1.52, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 24.615384615384617, |
|
"grad_norm": 40.26778030395508, |
|
"learning_rate": 9.840900000000001e-06, |
|
"loss": 1.4923, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 24.615384615384617, |
|
"eval_loss": 1.576373815536499, |
|
"eval_runtime": 11.1215, |
|
"eval_samples_per_second": 11.779, |
|
"eval_steps_per_second": 1.529, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 26.153846153846153, |
|
"grad_norm": 35.266971588134766, |
|
"learning_rate": 9.830900000000001e-06, |
|
"loss": 1.4989, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 26.153846153846153, |
|
"eval_loss": 1.5671430826187134, |
|
"eval_runtime": 11.1873, |
|
"eval_samples_per_second": 11.71, |
|
"eval_steps_per_second": 1.52, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 27.692307692307693, |
|
"grad_norm": 26.813480377197266, |
|
"learning_rate": 9.820900000000001e-06, |
|
"loss": 1.4711, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 27.692307692307693, |
|
"eval_loss": 1.522908329963684, |
|
"eval_runtime": 11.2106, |
|
"eval_samples_per_second": 11.685, |
|
"eval_steps_per_second": 1.516, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 29.23076923076923, |
|
"grad_norm": 24.576723098754883, |
|
"learning_rate": 9.810900000000001e-06, |
|
"loss": 1.4421, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 29.23076923076923, |
|
"eval_loss": 1.5039104223251343, |
|
"eval_runtime": 11.257, |
|
"eval_samples_per_second": 11.637, |
|
"eval_steps_per_second": 1.51, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 30.76923076923077, |
|
"grad_norm": 28.480438232421875, |
|
"learning_rate": 9.800900000000001e-06, |
|
"loss": 1.4347, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 30.76923076923077, |
|
"eval_loss": 1.5123459100723267, |
|
"eval_runtime": 11.187, |
|
"eval_samples_per_second": 11.71, |
|
"eval_steps_per_second": 1.52, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 32.30769230769231, |
|
"grad_norm": 56.582088470458984, |
|
"learning_rate": 9.790900000000001e-06, |
|
"loss": 1.4212, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 32.30769230769231, |
|
"eval_loss": 1.481844425201416, |
|
"eval_runtime": 11.2075, |
|
"eval_samples_per_second": 11.689, |
|
"eval_steps_per_second": 1.517, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 33.84615384615385, |
|
"grad_norm": 38.5254020690918, |
|
"learning_rate": 9.780900000000002e-06, |
|
"loss": 1.3908, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 33.84615384615385, |
|
"eval_loss": 1.4529048204421997, |
|
"eval_runtime": 11.197, |
|
"eval_samples_per_second": 11.7, |
|
"eval_steps_per_second": 1.518, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 35.38461538461539, |
|
"grad_norm": 44.74857711791992, |
|
"learning_rate": 9.770900000000002e-06, |
|
"loss": 1.3734, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 35.38461538461539, |
|
"eval_loss": 1.4617427587509155, |
|
"eval_runtime": 11.1235, |
|
"eval_samples_per_second": 11.777, |
|
"eval_steps_per_second": 1.528, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 36.92307692307692, |
|
"grad_norm": 28.926782608032227, |
|
"learning_rate": 9.760900000000002e-06, |
|
"loss": 1.365, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 36.92307692307692, |
|
"eval_loss": 1.4297789335250854, |
|
"eval_runtime": 11.3007, |
|
"eval_samples_per_second": 11.592, |
|
"eval_steps_per_second": 1.504, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 38.46153846153846, |
|
"grad_norm": 27.750213623046875, |
|
"learning_rate": 9.7509e-06, |
|
"loss": 1.3306, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 38.46153846153846, |
|
"eval_loss": 1.4345914125442505, |
|
"eval_runtime": 11.2795, |
|
"eval_samples_per_second": 11.614, |
|
"eval_steps_per_second": 1.507, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"grad_norm": 35.288352966308594, |
|
"learning_rate": 9.7409e-06, |
|
"loss": 1.3677, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"eval_loss": 1.447089433670044, |
|
"eval_runtime": 11.1366, |
|
"eval_samples_per_second": 11.763, |
|
"eval_steps_per_second": 1.527, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 41.53846153846154, |
|
"grad_norm": 65.49736022949219, |
|
"learning_rate": 9.7309e-06, |
|
"loss": 1.3453, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 41.53846153846154, |
|
"eval_loss": 1.405300259590149, |
|
"eval_runtime": 11.1549, |
|
"eval_samples_per_second": 11.744, |
|
"eval_steps_per_second": 1.524, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 43.07692307692308, |
|
"grad_norm": 24.333518981933594, |
|
"learning_rate": 9.7209e-06, |
|
"loss": 1.3206, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 43.07692307692308, |
|
"eval_loss": 1.4218717813491821, |
|
"eval_runtime": 11.3113, |
|
"eval_samples_per_second": 11.581, |
|
"eval_steps_per_second": 1.503, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 44.61538461538461, |
|
"grad_norm": 45.50777816772461, |
|
"learning_rate": 9.7109e-06, |
|
"loss": 1.3363, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 44.61538461538461, |
|
"eval_loss": 1.4220006465911865, |
|
"eval_runtime": 11.156, |
|
"eval_samples_per_second": 11.743, |
|
"eval_steps_per_second": 1.524, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 46.15384615384615, |
|
"grad_norm": 25.898344039916992, |
|
"learning_rate": 9.7009e-06, |
|
"loss": 1.2995, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 46.15384615384615, |
|
"eval_loss": 1.3942431211471558, |
|
"eval_runtime": 11.2282, |
|
"eval_samples_per_second": 11.667, |
|
"eval_steps_per_second": 1.514, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 47.69230769230769, |
|
"grad_norm": 56.0889892578125, |
|
"learning_rate": 9.6909e-06, |
|
"loss": 1.2994, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 47.69230769230769, |
|
"eval_loss": 1.3970586061477661, |
|
"eval_runtime": 11.1682, |
|
"eval_samples_per_second": 11.73, |
|
"eval_steps_per_second": 1.522, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 49.23076923076923, |
|
"grad_norm": 58.10311508178711, |
|
"learning_rate": 9.6809e-06, |
|
"loss": 1.2761, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 49.23076923076923, |
|
"eval_loss": 1.390371561050415, |
|
"eval_runtime": 11.2406, |
|
"eval_samples_per_second": 11.654, |
|
"eval_steps_per_second": 1.512, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 50.76923076923077, |
|
"grad_norm": 17.050870895385742, |
|
"learning_rate": 9.670900000000001e-06, |
|
"loss": 1.2712, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 50.76923076923077, |
|
"eval_loss": 1.3936753273010254, |
|
"eval_runtime": 11.2364, |
|
"eval_samples_per_second": 11.659, |
|
"eval_steps_per_second": 1.513, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 52.30769230769231, |
|
"grad_norm": 232.21804809570312, |
|
"learning_rate": 9.660900000000001e-06, |
|
"loss": 1.262, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 52.30769230769231, |
|
"eval_loss": 1.4037091732025146, |
|
"eval_runtime": 11.2231, |
|
"eval_samples_per_second": 11.672, |
|
"eval_steps_per_second": 1.515, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 53.84615384615385, |
|
"grad_norm": 35.11832046508789, |
|
"learning_rate": 9.650900000000001e-06, |
|
"loss": 1.2788, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 53.84615384615385, |
|
"eval_loss": 1.3545280694961548, |
|
"eval_runtime": 11.1609, |
|
"eval_samples_per_second": 11.737, |
|
"eval_steps_per_second": 1.523, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 55.38461538461539, |
|
"grad_norm": 27.077022552490234, |
|
"learning_rate": 9.640900000000001e-06, |
|
"loss": 1.2711, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 55.38461538461539, |
|
"eval_loss": 1.3528518676757812, |
|
"eval_runtime": 11.2852, |
|
"eval_samples_per_second": 11.608, |
|
"eval_steps_per_second": 1.506, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 56.92307692307692, |
|
"grad_norm": 46.97712326049805, |
|
"learning_rate": 9.630900000000001e-06, |
|
"loss": 1.2492, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 56.92307692307692, |
|
"eval_loss": 1.3534098863601685, |
|
"eval_runtime": 11.2368, |
|
"eval_samples_per_second": 11.658, |
|
"eval_steps_per_second": 1.513, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 58.46153846153846, |
|
"grad_norm": 42.06857681274414, |
|
"learning_rate": 9.620900000000001e-06, |
|
"loss": 1.2506, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 58.46153846153846, |
|
"eval_loss": 1.3613977432250977, |
|
"eval_runtime": 11.2206, |
|
"eval_samples_per_second": 11.675, |
|
"eval_steps_per_second": 1.515, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 60.0, |
|
"grad_norm": 19.298952102661133, |
|
"learning_rate": 9.610900000000001e-06, |
|
"loss": 1.2201, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 60.0, |
|
"eval_loss": 1.3586474657058716, |
|
"eval_runtime": 11.2045, |
|
"eval_samples_per_second": 11.692, |
|
"eval_steps_per_second": 1.517, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 61.53846153846154, |
|
"grad_norm": 30.0198974609375, |
|
"learning_rate": 9.600900000000002e-06, |
|
"loss": 1.2086, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 61.53846153846154, |
|
"eval_loss": 1.3304755687713623, |
|
"eval_runtime": 11.193, |
|
"eval_samples_per_second": 11.704, |
|
"eval_steps_per_second": 1.519, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 63.07692307692308, |
|
"grad_norm": 37.59902572631836, |
|
"learning_rate": 9.5909e-06, |
|
"loss": 1.2375, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 63.07692307692308, |
|
"eval_loss": 1.331407904624939, |
|
"eval_runtime": 10.6714, |
|
"eval_samples_per_second": 12.276, |
|
"eval_steps_per_second": 1.593, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 64.61538461538461, |
|
"grad_norm": 36.82079315185547, |
|
"learning_rate": 9.5809e-06, |
|
"loss": 1.2148, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 64.61538461538461, |
|
"eval_loss": 1.3441548347473145, |
|
"eval_runtime": 10.7472, |
|
"eval_samples_per_second": 12.189, |
|
"eval_steps_per_second": 1.582, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 66.15384615384616, |
|
"grad_norm": 30.974130630493164, |
|
"learning_rate": 9.5709e-06, |
|
"loss": 1.197, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 66.15384615384616, |
|
"eval_loss": 1.34512197971344, |
|
"eval_runtime": 10.7398, |
|
"eval_samples_per_second": 12.198, |
|
"eval_steps_per_second": 1.583, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 67.6923076923077, |
|
"grad_norm": 20.45345115661621, |
|
"learning_rate": 9.5609e-06, |
|
"loss": 1.2361, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 67.6923076923077, |
|
"eval_loss": 1.3371080160140991, |
|
"eval_runtime": 10.654, |
|
"eval_samples_per_second": 12.296, |
|
"eval_steps_per_second": 1.596, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 69.23076923076923, |
|
"grad_norm": 19.758630752563477, |
|
"learning_rate": 9.5509e-06, |
|
"loss": 1.2001, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 69.23076923076923, |
|
"eval_loss": 1.3270760774612427, |
|
"eval_runtime": 10.6928, |
|
"eval_samples_per_second": 12.251, |
|
"eval_steps_per_second": 1.59, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 70.76923076923077, |
|
"grad_norm": 45.2899055480957, |
|
"learning_rate": 9.5409e-06, |
|
"loss": 1.192, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 70.76923076923077, |
|
"eval_loss": 1.3184590339660645, |
|
"eval_runtime": 10.7154, |
|
"eval_samples_per_second": 12.225, |
|
"eval_steps_per_second": 1.586, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 72.3076923076923, |
|
"grad_norm": 45.60734939575195, |
|
"learning_rate": 9.5309e-06, |
|
"loss": 1.2081, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 72.3076923076923, |
|
"eval_loss": 1.3107666969299316, |
|
"eval_runtime": 10.7029, |
|
"eval_samples_per_second": 12.24, |
|
"eval_steps_per_second": 1.588, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 73.84615384615384, |
|
"grad_norm": 26.859603881835938, |
|
"learning_rate": 9.5209e-06, |
|
"loss": 1.1729, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 73.84615384615384, |
|
"eval_loss": 1.310062289237976, |
|
"eval_runtime": 10.7544, |
|
"eval_samples_per_second": 12.181, |
|
"eval_steps_per_second": 1.581, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 75.38461538461539, |
|
"grad_norm": 19.90914535522461, |
|
"learning_rate": 9.5109e-06, |
|
"loss": 1.1899, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 75.38461538461539, |
|
"eval_loss": 1.3038017749786377, |
|
"eval_runtime": 10.6532, |
|
"eval_samples_per_second": 12.297, |
|
"eval_steps_per_second": 1.596, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 76.92307692307692, |
|
"grad_norm": 22.629934310913086, |
|
"learning_rate": 9.5009e-06, |
|
"loss": 1.1875, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 76.92307692307692, |
|
"eval_loss": 1.2785409688949585, |
|
"eval_runtime": 10.6388, |
|
"eval_samples_per_second": 12.313, |
|
"eval_steps_per_second": 1.598, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 78.46153846153847, |
|
"grad_norm": 45.462059020996094, |
|
"learning_rate": 9.490900000000001e-06, |
|
"loss": 1.1717, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 78.46153846153847, |
|
"eval_loss": 1.278078317642212, |
|
"eval_runtime": 10.9666, |
|
"eval_samples_per_second": 11.945, |
|
"eval_steps_per_second": 1.55, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 80.0, |
|
"grad_norm": 34.85255432128906, |
|
"learning_rate": 9.480900000000001e-06, |
|
"loss": 1.1657, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 80.0, |
|
"eval_loss": 1.2711185216903687, |
|
"eval_runtime": 11.0211, |
|
"eval_samples_per_second": 11.886, |
|
"eval_steps_per_second": 1.543, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 81.53846153846153, |
|
"grad_norm": 19.078449249267578, |
|
"learning_rate": 9.470900000000001e-06, |
|
"loss": 1.1814, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 81.53846153846153, |
|
"eval_loss": 1.2781996726989746, |
|
"eval_runtime": 11.0663, |
|
"eval_samples_per_second": 11.838, |
|
"eval_steps_per_second": 1.536, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 83.07692307692308, |
|
"grad_norm": 31.05898094177246, |
|
"learning_rate": 9.460900000000001e-06, |
|
"loss": 1.1452, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 83.07692307692308, |
|
"eval_loss": 1.2848775386810303, |
|
"eval_runtime": 11.01, |
|
"eval_samples_per_second": 11.898, |
|
"eval_steps_per_second": 1.544, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 84.61538461538461, |
|
"grad_norm": 28.712461471557617, |
|
"learning_rate": 9.450900000000001e-06, |
|
"loss": 1.1465, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 84.61538461538461, |
|
"eval_loss": 1.2928494215011597, |
|
"eval_runtime": 10.9253, |
|
"eval_samples_per_second": 11.991, |
|
"eval_steps_per_second": 1.556, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 86.15384615384616, |
|
"grad_norm": 19.871828079223633, |
|
"learning_rate": 9.440900000000001e-06, |
|
"loss": 1.1736, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 86.15384615384616, |
|
"eval_loss": 1.2648124694824219, |
|
"eval_runtime": 11.0314, |
|
"eval_samples_per_second": 11.875, |
|
"eval_steps_per_second": 1.541, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 87.6923076923077, |
|
"grad_norm": 22.47665023803711, |
|
"learning_rate": 9.4309e-06, |
|
"loss": 1.1184, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 87.6923076923077, |
|
"eval_loss": 1.2936598062515259, |
|
"eval_runtime": 10.9322, |
|
"eval_samples_per_second": 11.983, |
|
"eval_steps_per_second": 1.555, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 89.23076923076923, |
|
"grad_norm": 38.79877471923828, |
|
"learning_rate": 9.4209e-06, |
|
"loss": 1.1616, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 89.23076923076923, |
|
"eval_loss": 1.2650004625320435, |
|
"eval_runtime": 10.9434, |
|
"eval_samples_per_second": 11.971, |
|
"eval_steps_per_second": 1.553, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 90.76923076923077, |
|
"grad_norm": 40.851097106933594, |
|
"learning_rate": 9.4109e-06, |
|
"loss": 1.1469, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 90.76923076923077, |
|
"eval_loss": 1.252148151397705, |
|
"eval_runtime": 10.9867, |
|
"eval_samples_per_second": 11.923, |
|
"eval_steps_per_second": 1.547, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 92.3076923076923, |
|
"grad_norm": 21.35544204711914, |
|
"learning_rate": 9.4009e-06, |
|
"loss": 1.1489, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 92.3076923076923, |
|
"eval_loss": 1.259344220161438, |
|
"eval_runtime": 10.9649, |
|
"eval_samples_per_second": 11.947, |
|
"eval_steps_per_second": 1.55, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 93.84615384615384, |
|
"grad_norm": 33.265132904052734, |
|
"learning_rate": 9.3909e-06, |
|
"loss": 1.1315, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 93.84615384615384, |
|
"eval_loss": 1.252693772315979, |
|
"eval_runtime": 10.9852, |
|
"eval_samples_per_second": 11.925, |
|
"eval_steps_per_second": 1.548, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 95.38461538461539, |
|
"grad_norm": 23.43667221069336, |
|
"learning_rate": 9.381e-06, |
|
"loss": 1.119, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 95.38461538461539, |
|
"eval_loss": 1.254772424697876, |
|
"eval_runtime": 11.0937, |
|
"eval_samples_per_second": 11.808, |
|
"eval_steps_per_second": 1.532, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 96.92307692307692, |
|
"grad_norm": 51.93602752685547, |
|
"learning_rate": 9.371e-06, |
|
"loss": 1.1333, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 96.92307692307692, |
|
"eval_loss": 1.249408483505249, |
|
"eval_runtime": 11.0388, |
|
"eval_samples_per_second": 11.867, |
|
"eval_steps_per_second": 1.54, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 98.46153846153847, |
|
"grad_norm": 23.473421096801758, |
|
"learning_rate": 9.361e-06, |
|
"loss": 1.1164, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 98.46153846153847, |
|
"eval_loss": 1.2438522577285767, |
|
"eval_runtime": 11.1581, |
|
"eval_samples_per_second": 11.74, |
|
"eval_steps_per_second": 1.524, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 100.0, |
|
"grad_norm": 24.228403091430664, |
|
"learning_rate": 9.351e-06, |
|
"loss": 1.1333, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 100.0, |
|
"eval_loss": 1.248632788658142, |
|
"eval_runtime": 10.9675, |
|
"eval_samples_per_second": 11.944, |
|
"eval_steps_per_second": 1.55, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 101.53846153846153, |
|
"grad_norm": 18.29631996154785, |
|
"learning_rate": 9.341000000000001e-06, |
|
"loss": 1.1082, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 101.53846153846153, |
|
"eval_loss": 1.2509865760803223, |
|
"eval_runtime": 11.0693, |
|
"eval_samples_per_second": 11.834, |
|
"eval_steps_per_second": 1.536, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 103.07692307692308, |
|
"grad_norm": 42.855491638183594, |
|
"learning_rate": 9.331000000000001e-06, |
|
"loss": 1.1178, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 103.07692307692308, |
|
"eval_loss": 1.2890292406082153, |
|
"eval_runtime": 11.0366, |
|
"eval_samples_per_second": 11.87, |
|
"eval_steps_per_second": 1.54, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 104.61538461538461, |
|
"grad_norm": 46.675655364990234, |
|
"learning_rate": 9.321000000000001e-06, |
|
"loss": 1.1106, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 104.61538461538461, |
|
"eval_loss": 1.2719863653182983, |
|
"eval_runtime": 11.1266, |
|
"eval_samples_per_second": 11.774, |
|
"eval_steps_per_second": 1.528, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 106.15384615384616, |
|
"grad_norm": 26.414846420288086, |
|
"learning_rate": 9.311000000000001e-06, |
|
"loss": 1.1216, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 106.15384615384616, |
|
"eval_loss": 1.2423394918441772, |
|
"eval_runtime": 11.0197, |
|
"eval_samples_per_second": 11.888, |
|
"eval_steps_per_second": 1.543, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 107.6923076923077, |
|
"grad_norm": 31.4022274017334, |
|
"learning_rate": 9.301000000000001e-06, |
|
"loss": 1.1052, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 107.6923076923077, |
|
"eval_loss": 1.2372961044311523, |
|
"eval_runtime": 11.1127, |
|
"eval_samples_per_second": 11.788, |
|
"eval_steps_per_second": 1.53, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 109.23076923076923, |
|
"grad_norm": 23.16703987121582, |
|
"learning_rate": 9.291000000000001e-06, |
|
"loss": 1.0911, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 109.23076923076923, |
|
"eval_loss": 1.2309863567352295, |
|
"eval_runtime": 10.9572, |
|
"eval_samples_per_second": 11.956, |
|
"eval_steps_per_second": 1.551, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 110.76923076923077, |
|
"grad_norm": 21.648773193359375, |
|
"learning_rate": 9.281000000000001e-06, |
|
"loss": 1.0956, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 110.76923076923077, |
|
"eval_loss": 1.2261079549789429, |
|
"eval_runtime": 10.9351, |
|
"eval_samples_per_second": 11.98, |
|
"eval_steps_per_second": 1.555, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 112.3076923076923, |
|
"grad_norm": 24.5791072845459, |
|
"learning_rate": 9.271000000000002e-06, |
|
"loss": 1.0751, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 112.3076923076923, |
|
"eval_loss": 1.2161471843719482, |
|
"eval_runtime": 11.1052, |
|
"eval_samples_per_second": 11.796, |
|
"eval_steps_per_second": 1.531, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 113.84615384615384, |
|
"grad_norm": 35.867801666259766, |
|
"learning_rate": 9.261000000000002e-06, |
|
"loss": 1.086, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 113.84615384615384, |
|
"eval_loss": 1.2092362642288208, |
|
"eval_runtime": 11.2048, |
|
"eval_samples_per_second": 11.691, |
|
"eval_steps_per_second": 1.517, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 115.38461538461539, |
|
"grad_norm": 67.91041564941406, |
|
"learning_rate": 9.251000000000002e-06, |
|
"loss": 1.092, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 115.38461538461539, |
|
"eval_loss": 1.241829514503479, |
|
"eval_runtime": 10.9937, |
|
"eval_samples_per_second": 11.916, |
|
"eval_steps_per_second": 1.546, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 116.92307692307692, |
|
"grad_norm": 128.73751831054688, |
|
"learning_rate": 9.241000000000002e-06, |
|
"loss": 1.0764, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 116.92307692307692, |
|
"eval_loss": 1.2462713718414307, |
|
"eval_runtime": 10.9625, |
|
"eval_samples_per_second": 11.95, |
|
"eval_steps_per_second": 1.551, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 118.46153846153847, |
|
"grad_norm": 86.5144271850586, |
|
"learning_rate": 9.231000000000002e-06, |
|
"loss": 1.0643, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 118.46153846153847, |
|
"eval_loss": 1.2187525033950806, |
|
"eval_runtime": 10.9492, |
|
"eval_samples_per_second": 11.964, |
|
"eval_steps_per_second": 1.553, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 120.0, |
|
"grad_norm": 19.4710750579834, |
|
"learning_rate": 9.221e-06, |
|
"loss": 1.0966, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 120.0, |
|
"eval_loss": 1.2282384634017944, |
|
"eval_runtime": 10.9078, |
|
"eval_samples_per_second": 12.01, |
|
"eval_steps_per_second": 1.559, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 121.53846153846153, |
|
"grad_norm": 37.673744201660156, |
|
"learning_rate": 9.211e-06, |
|
"loss": 1.0632, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 121.53846153846153, |
|
"eval_loss": 1.2206230163574219, |
|
"eval_runtime": 11.0018, |
|
"eval_samples_per_second": 11.907, |
|
"eval_steps_per_second": 1.545, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 123.07692307692308, |
|
"grad_norm": 25.10326385498047, |
|
"learning_rate": 9.201e-06, |
|
"loss": 1.0873, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 123.07692307692308, |
|
"eval_loss": 1.2137339115142822, |
|
"eval_runtime": 10.9232, |
|
"eval_samples_per_second": 11.993, |
|
"eval_steps_per_second": 1.556, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 124.61538461538461, |
|
"grad_norm": 32.02176284790039, |
|
"learning_rate": 9.191e-06, |
|
"loss": 1.0568, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 124.61538461538461, |
|
"eval_loss": 1.2065187692642212, |
|
"eval_runtime": 10.9614, |
|
"eval_samples_per_second": 11.951, |
|
"eval_steps_per_second": 1.551, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 126.15384615384616, |
|
"grad_norm": 19.97406005859375, |
|
"learning_rate": 9.181e-06, |
|
"loss": 1.065, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 126.15384615384616, |
|
"eval_loss": 1.2094841003417969, |
|
"eval_runtime": 11.0274, |
|
"eval_samples_per_second": 11.879, |
|
"eval_steps_per_second": 1.542, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 127.6923076923077, |
|
"grad_norm": 31.624399185180664, |
|
"learning_rate": 9.171e-06, |
|
"loss": 1.0805, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 127.6923076923077, |
|
"eval_loss": 1.2149733304977417, |
|
"eval_runtime": 11.2014, |
|
"eval_samples_per_second": 11.695, |
|
"eval_steps_per_second": 1.518, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 129.23076923076923, |
|
"grad_norm": 29.24848747253418, |
|
"learning_rate": 9.161000000000001e-06, |
|
"loss": 1.0463, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 129.23076923076923, |
|
"eval_loss": 1.2077099084854126, |
|
"eval_runtime": 10.9432, |
|
"eval_samples_per_second": 11.971, |
|
"eval_steps_per_second": 1.553, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 130.76923076923077, |
|
"grad_norm": 27.14787483215332, |
|
"learning_rate": 9.151000000000001e-06, |
|
"loss": 1.0607, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 130.76923076923077, |
|
"eval_loss": 1.2046644687652588, |
|
"eval_runtime": 11.001, |
|
"eval_samples_per_second": 11.908, |
|
"eval_steps_per_second": 1.545, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 132.30769230769232, |
|
"grad_norm": 32.416194915771484, |
|
"learning_rate": 9.141000000000001e-06, |
|
"loss": 1.0365, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 132.30769230769232, |
|
"eval_loss": 1.195080041885376, |
|
"eval_runtime": 11.0205, |
|
"eval_samples_per_second": 11.887, |
|
"eval_steps_per_second": 1.543, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 133.84615384615384, |
|
"grad_norm": 58.863582611083984, |
|
"learning_rate": 9.131000000000001e-06, |
|
"loss": 1.0564, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 133.84615384615384, |
|
"eval_loss": 1.1966980695724487, |
|
"eval_runtime": 10.9043, |
|
"eval_samples_per_second": 12.014, |
|
"eval_steps_per_second": 1.559, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 135.3846153846154, |
|
"grad_norm": 26.08232307434082, |
|
"learning_rate": 9.121000000000001e-06, |
|
"loss": 1.0507, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 135.3846153846154, |
|
"eval_loss": 1.201367735862732, |
|
"eval_runtime": 10.9714, |
|
"eval_samples_per_second": 11.94, |
|
"eval_steps_per_second": 1.549, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 136.92307692307693, |
|
"grad_norm": 59.368019104003906, |
|
"learning_rate": 9.111000000000001e-06, |
|
"loss": 1.0508, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 136.92307692307693, |
|
"eval_loss": 1.208795428276062, |
|
"eval_runtime": 11.0529, |
|
"eval_samples_per_second": 11.852, |
|
"eval_steps_per_second": 1.538, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 138.46153846153845, |
|
"grad_norm": 43.07460021972656, |
|
"learning_rate": 9.101000000000001e-06, |
|
"loss": 1.0359, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 138.46153846153845, |
|
"eval_loss": 1.1782392263412476, |
|
"eval_runtime": 11.1519, |
|
"eval_samples_per_second": 11.747, |
|
"eval_steps_per_second": 1.524, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 140.0, |
|
"grad_norm": 22.39567756652832, |
|
"learning_rate": 9.091000000000002e-06, |
|
"loss": 1.0584, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 140.0, |
|
"eval_loss": 1.1873483657836914, |
|
"eval_runtime": 10.8732, |
|
"eval_samples_per_second": 12.048, |
|
"eval_steps_per_second": 1.563, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 141.53846153846155, |
|
"grad_norm": 22.813621520996094, |
|
"learning_rate": 9.081000000000002e-06, |
|
"loss": 1.0354, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 141.53846153846155, |
|
"eval_loss": 1.1731117963790894, |
|
"eval_runtime": 11.0847, |
|
"eval_samples_per_second": 11.818, |
|
"eval_steps_per_second": 1.534, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 143.07692307692307, |
|
"grad_norm": 31.44073486328125, |
|
"learning_rate": 9.071000000000002e-06, |
|
"loss": 1.0457, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 143.07692307692307, |
|
"eval_loss": 1.1962807178497314, |
|
"eval_runtime": 11.0668, |
|
"eval_samples_per_second": 11.837, |
|
"eval_steps_per_second": 1.536, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 144.6153846153846, |
|
"grad_norm": 18.18711280822754, |
|
"learning_rate": 9.061e-06, |
|
"loss": 1.0481, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 144.6153846153846, |
|
"eval_loss": 1.185339093208313, |
|
"eval_runtime": 10.9916, |
|
"eval_samples_per_second": 11.918, |
|
"eval_steps_per_second": 1.547, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 146.15384615384616, |
|
"grad_norm": 38.05665969848633, |
|
"learning_rate": 9.051e-06, |
|
"loss": 1.0391, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 146.15384615384616, |
|
"eval_loss": 1.1856777667999268, |
|
"eval_runtime": 11.0391, |
|
"eval_samples_per_second": 11.867, |
|
"eval_steps_per_second": 1.54, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 147.69230769230768, |
|
"grad_norm": 19.963260650634766, |
|
"learning_rate": 9.041e-06, |
|
"loss": 1.0322, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 147.69230769230768, |
|
"eval_loss": 1.1843148469924927, |
|
"eval_runtime": 11.0382, |
|
"eval_samples_per_second": 11.868, |
|
"eval_steps_per_second": 1.54, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 149.23076923076923, |
|
"grad_norm": 28.58108901977539, |
|
"learning_rate": 9.031e-06, |
|
"loss": 1.0369, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 149.23076923076923, |
|
"eval_loss": 1.1637182235717773, |
|
"eval_runtime": 11.2766, |
|
"eval_samples_per_second": 11.617, |
|
"eval_steps_per_second": 1.508, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 150.76923076923077, |
|
"grad_norm": 64.3956527709961, |
|
"learning_rate": 9.021e-06, |
|
"loss": 1.0519, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 150.76923076923077, |
|
"eval_loss": 1.1796802282333374, |
|
"eval_runtime": 11.3418, |
|
"eval_samples_per_second": 11.55, |
|
"eval_steps_per_second": 1.499, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 152.30769230769232, |
|
"grad_norm": 18.857580184936523, |
|
"learning_rate": 9.011e-06, |
|
"loss": 1.0272, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 152.30769230769232, |
|
"eval_loss": 1.1634279489517212, |
|
"eval_runtime": 11.6749, |
|
"eval_samples_per_second": 11.221, |
|
"eval_steps_per_second": 1.456, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 153.84615384615384, |
|
"grad_norm": 42.541683197021484, |
|
"learning_rate": 9.001e-06, |
|
"loss": 1.0287, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 153.84615384615384, |
|
"eval_loss": 1.1698088645935059, |
|
"eval_runtime": 11.068, |
|
"eval_samples_per_second": 11.836, |
|
"eval_steps_per_second": 1.536, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 155.3846153846154, |
|
"grad_norm": 30.52286720275879, |
|
"learning_rate": 8.991e-06, |
|
"loss": 1.0237, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 155.3846153846154, |
|
"eval_loss": 1.178311824798584, |
|
"eval_runtime": 11.1107, |
|
"eval_samples_per_second": 11.79, |
|
"eval_steps_per_second": 1.53, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 156.92307692307693, |
|
"grad_norm": 32.60612487792969, |
|
"learning_rate": 8.981000000000001e-06, |
|
"loss": 1.0362, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 156.92307692307693, |
|
"eval_loss": 1.157893180847168, |
|
"eval_runtime": 11.0484, |
|
"eval_samples_per_second": 11.857, |
|
"eval_steps_per_second": 1.539, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 158.46153846153845, |
|
"grad_norm": 26.15647315979004, |
|
"learning_rate": 8.971000000000001e-06, |
|
"loss": 0.998, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 158.46153846153845, |
|
"eval_loss": 1.1724605560302734, |
|
"eval_runtime": 11.2323, |
|
"eval_samples_per_second": 11.663, |
|
"eval_steps_per_second": 1.513, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 160.0, |
|
"grad_norm": 45.76993942260742, |
|
"learning_rate": 8.961000000000001e-06, |
|
"loss": 1.0432, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 160.0, |
|
"eval_loss": 1.1688075065612793, |
|
"eval_runtime": 11.0984, |
|
"eval_samples_per_second": 11.803, |
|
"eval_steps_per_second": 1.532, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 161.53846153846155, |
|
"grad_norm": 40.384578704833984, |
|
"learning_rate": 8.951000000000001e-06, |
|
"loss": 1.0421, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 161.53846153846155, |
|
"eval_loss": 1.1621845960617065, |
|
"eval_runtime": 11.1624, |
|
"eval_samples_per_second": 11.736, |
|
"eval_steps_per_second": 1.523, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 163.07692307692307, |
|
"grad_norm": 22.886058807373047, |
|
"learning_rate": 8.941000000000001e-06, |
|
"loss": 0.9912, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 163.07692307692307, |
|
"eval_loss": 1.1695055961608887, |
|
"eval_runtime": 10.9842, |
|
"eval_samples_per_second": 11.926, |
|
"eval_steps_per_second": 1.548, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 164.6153846153846, |
|
"grad_norm": 26.940736770629883, |
|
"learning_rate": 8.931000000000001e-06, |
|
"loss": 1.011, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 164.6153846153846, |
|
"eval_loss": 1.1458157300949097, |
|
"eval_runtime": 11.025, |
|
"eval_samples_per_second": 11.882, |
|
"eval_steps_per_second": 1.542, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 166.15384615384616, |
|
"grad_norm": 26.5013484954834, |
|
"learning_rate": 8.921000000000001e-06, |
|
"loss": 0.9876, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 166.15384615384616, |
|
"eval_loss": 1.1569631099700928, |
|
"eval_runtime": 11.101, |
|
"eval_samples_per_second": 11.801, |
|
"eval_steps_per_second": 1.531, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 167.69230769230768, |
|
"grad_norm": 29.859987258911133, |
|
"learning_rate": 8.911000000000002e-06, |
|
"loss": 1.0374, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 167.69230769230768, |
|
"eval_loss": 1.149316668510437, |
|
"eval_runtime": 11.1242, |
|
"eval_samples_per_second": 11.776, |
|
"eval_steps_per_second": 1.528, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 169.23076923076923, |
|
"grad_norm": 27.777738571166992, |
|
"learning_rate": 8.901e-06, |
|
"loss": 0.985, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 169.23076923076923, |
|
"eval_loss": 1.1608328819274902, |
|
"eval_runtime": 11.2215, |
|
"eval_samples_per_second": 11.674, |
|
"eval_steps_per_second": 1.515, |
|
"step": 11000 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 100000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1539, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4.222258013933568e+19, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|