diff --git "a/checkpoint-29000/trainer_state.json" "b/checkpoint-29000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-29000/trainer_state.json" @@ -0,0 +1,4383 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 446.15384615384613, + "eval_steps": 100, + "global_step": 29000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 1.5384615384615383, + "grad_norm": 47.11796188354492, + "learning_rate": 9.990900000000001e-06, + "loss": 3.6644, + "step": 100 + }, + { + "epoch": 1.5384615384615383, + "eval_loss": 2.4919605255126953, + "eval_runtime": 12.5517, + "eval_samples_per_second": 10.437, + "eval_steps_per_second": 1.354, + "step": 100 + }, + { + "epoch": 3.076923076923077, + "grad_norm": 31.28130531311035, + "learning_rate": 9.980900000000001e-06, + "loss": 2.2347, + "step": 200 + }, + { + "epoch": 3.076923076923077, + "eval_loss": 2.156316041946411, + "eval_runtime": 11.1792, + "eval_samples_per_second": 11.718, + "eval_steps_per_second": 1.521, + "step": 200 + }, + { + "epoch": 4.615384615384615, + "grad_norm": 39.616390228271484, + "learning_rate": 9.970900000000001e-06, + "loss": 2.0254, + "step": 300 + }, + { + "epoch": 4.615384615384615, + "eval_loss": 2.024153709411621, + "eval_runtime": 11.1934, + "eval_samples_per_second": 11.703, + "eval_steps_per_second": 1.519, + "step": 300 + }, + { + "epoch": 6.153846153846154, + "grad_norm": 28.285825729370117, + "learning_rate": 9.960900000000001e-06, + "loss": 1.9361, + "step": 400 + }, + { + "epoch": 6.153846153846154, + "eval_loss": 1.9094743728637695, + "eval_runtime": 11.3855, + "eval_samples_per_second": 11.506, + "eval_steps_per_second": 1.493, + "step": 400 + }, + { + "epoch": 7.6923076923076925, + "grad_norm": 34.14302062988281, + "learning_rate": 9.950900000000002e-06, + "loss": 1.8531, + "step": 500 + }, + { + "epoch": 7.6923076923076925, + "eval_loss": 1.8729331493377686, + "eval_runtime": 11.2935, + "eval_samples_per_second": 11.6, + "eval_steps_per_second": 1.505, + "step": 500 + }, + { + "epoch": 9.23076923076923, + "grad_norm": 39.09531784057617, + "learning_rate": 9.940900000000002e-06, + "loss": 1.7669, + "step": 600 + }, + { + "epoch": 9.23076923076923, + "eval_loss": 1.831756830215454, + "eval_runtime": 11.0535, + "eval_samples_per_second": 11.851, + "eval_steps_per_second": 1.538, + "step": 600 + }, + { + "epoch": 10.76923076923077, + "grad_norm": 93.24444580078125, + "learning_rate": 9.930900000000002e-06, + "loss": 1.7518, + "step": 700 + }, + { + "epoch": 10.76923076923077, + "eval_loss": 1.7832175493240356, + "eval_runtime": 11.1684, + "eval_samples_per_second": 11.729, + "eval_steps_per_second": 1.522, + "step": 700 + }, + { + "epoch": 12.307692307692308, + "grad_norm": 32.21013641357422, + "learning_rate": 9.920900000000002e-06, + "loss": 1.7149, + "step": 800 + }, + { + "epoch": 12.307692307692308, + "eval_loss": 1.7581098079681396, + "eval_runtime": 11.101, + "eval_samples_per_second": 11.801, + "eval_steps_per_second": 1.531, + "step": 800 + }, + { + "epoch": 13.846153846153847, + "grad_norm": 59.90657043457031, + "learning_rate": 9.9109e-06, + "loss": 1.6734, + "step": 900 + }, + { + "epoch": 13.846153846153847, + "eval_loss": 1.7163844108581543, + "eval_runtime": 11.1167, + "eval_samples_per_second": 11.784, + "eval_steps_per_second": 1.529, + "step": 900 + }, + { + "epoch": 15.384615384615385, + "grad_norm": 20.61592674255371, + "learning_rate": 9.9009e-06, + "loss": 1.6612, + "step": 1000 + }, + { + "epoch": 15.384615384615385, + "eval_loss": 1.6949567794799805, + "eval_runtime": 11.0663, + "eval_samples_per_second": 11.838, + "eval_steps_per_second": 1.536, + "step": 1000 + }, + { + "epoch": 16.923076923076923, + "grad_norm": 17.60099220275879, + "learning_rate": 9.8909e-06, + "loss": 1.6199, + "step": 1100 + }, + { + "epoch": 16.923076923076923, + "eval_loss": 1.6769332885742188, + "eval_runtime": 11.0531, + "eval_samples_per_second": 11.852, + "eval_steps_per_second": 1.538, + "step": 1100 + }, + { + "epoch": 18.46153846153846, + "grad_norm": 20.802692413330078, + "learning_rate": 9.8809e-06, + "loss": 1.6008, + "step": 1200 + }, + { + "epoch": 18.46153846153846, + "eval_loss": 1.6524990797042847, + "eval_runtime": 11.0831, + "eval_samples_per_second": 11.82, + "eval_steps_per_second": 1.534, + "step": 1200 + }, + { + "epoch": 20.0, + "grad_norm": 21.809823989868164, + "learning_rate": 9.8709e-06, + "loss": 1.5812, + "step": 1300 + }, + { + "epoch": 20.0, + "eval_loss": 1.6428295373916626, + "eval_runtime": 11.1093, + "eval_samples_per_second": 11.792, + "eval_steps_per_second": 1.53, + "step": 1300 + }, + { + "epoch": 21.53846153846154, + "grad_norm": 46.8908576965332, + "learning_rate": 9.8609e-06, + "loss": 1.5419, + "step": 1400 + }, + { + "epoch": 21.53846153846154, + "eval_loss": 1.6006404161453247, + "eval_runtime": 11.2393, + "eval_samples_per_second": 11.655, + "eval_steps_per_second": 1.513, + "step": 1400 + }, + { + "epoch": 23.076923076923077, + "grad_norm": 27.15238380432129, + "learning_rate": 9.8509e-06, + "loss": 1.5374, + "step": 1500 + }, + { + "epoch": 23.076923076923077, + "eval_loss": 1.5862094163894653, + "eval_runtime": 11.1815, + "eval_samples_per_second": 11.716, + "eval_steps_per_second": 1.52, + "step": 1500 + }, + { + "epoch": 24.615384615384617, + "grad_norm": 40.26778030395508, + "learning_rate": 9.840900000000001e-06, + "loss": 1.4923, + "step": 1600 + }, + { + "epoch": 24.615384615384617, + "eval_loss": 1.576373815536499, + "eval_runtime": 11.1215, + "eval_samples_per_second": 11.779, + "eval_steps_per_second": 1.529, + "step": 1600 + }, + { + "epoch": 26.153846153846153, + "grad_norm": 35.266971588134766, + "learning_rate": 9.830900000000001e-06, + "loss": 1.4989, + "step": 1700 + }, + { + "epoch": 26.153846153846153, + "eval_loss": 1.5671430826187134, + "eval_runtime": 11.1873, + "eval_samples_per_second": 11.71, + "eval_steps_per_second": 1.52, + "step": 1700 + }, + { + "epoch": 27.692307692307693, + "grad_norm": 26.813480377197266, + "learning_rate": 9.820900000000001e-06, + "loss": 1.4711, + "step": 1800 + }, + { + "epoch": 27.692307692307693, + "eval_loss": 1.522908329963684, + "eval_runtime": 11.2106, + "eval_samples_per_second": 11.685, + "eval_steps_per_second": 1.516, + "step": 1800 + }, + { + "epoch": 29.23076923076923, + "grad_norm": 24.576723098754883, + "learning_rate": 9.810900000000001e-06, + "loss": 1.4421, + "step": 1900 + }, + { + "epoch": 29.23076923076923, + "eval_loss": 1.5039104223251343, + "eval_runtime": 11.257, + "eval_samples_per_second": 11.637, + "eval_steps_per_second": 1.51, + "step": 1900 + }, + { + "epoch": 30.76923076923077, + "grad_norm": 28.480438232421875, + "learning_rate": 9.800900000000001e-06, + "loss": 1.4347, + "step": 2000 + }, + { + "epoch": 30.76923076923077, + "eval_loss": 1.5123459100723267, + "eval_runtime": 11.187, + "eval_samples_per_second": 11.71, + "eval_steps_per_second": 1.52, + "step": 2000 + }, + { + "epoch": 32.30769230769231, + "grad_norm": 56.582088470458984, + "learning_rate": 9.790900000000001e-06, + "loss": 1.4212, + "step": 2100 + }, + { + "epoch": 32.30769230769231, + "eval_loss": 1.481844425201416, + "eval_runtime": 11.2075, + "eval_samples_per_second": 11.689, + "eval_steps_per_second": 1.517, + "step": 2100 + }, + { + "epoch": 33.84615384615385, + "grad_norm": 38.5254020690918, + "learning_rate": 9.780900000000002e-06, + "loss": 1.3908, + "step": 2200 + }, + { + "epoch": 33.84615384615385, + "eval_loss": 1.4529048204421997, + "eval_runtime": 11.197, + "eval_samples_per_second": 11.7, + "eval_steps_per_second": 1.518, + "step": 2200 + }, + { + "epoch": 35.38461538461539, + "grad_norm": 44.74857711791992, + "learning_rate": 9.770900000000002e-06, + "loss": 1.3734, + "step": 2300 + }, + { + "epoch": 35.38461538461539, + "eval_loss": 1.4617427587509155, + "eval_runtime": 11.1235, + "eval_samples_per_second": 11.777, + "eval_steps_per_second": 1.528, + "step": 2300 + }, + { + "epoch": 36.92307692307692, + "grad_norm": 28.926782608032227, + "learning_rate": 9.760900000000002e-06, + "loss": 1.365, + "step": 2400 + }, + { + "epoch": 36.92307692307692, + "eval_loss": 1.4297789335250854, + "eval_runtime": 11.3007, + "eval_samples_per_second": 11.592, + "eval_steps_per_second": 1.504, + "step": 2400 + }, + { + "epoch": 38.46153846153846, + "grad_norm": 27.750213623046875, + "learning_rate": 9.7509e-06, + "loss": 1.3306, + "step": 2500 + }, + { + "epoch": 38.46153846153846, + "eval_loss": 1.4345914125442505, + "eval_runtime": 11.2795, + "eval_samples_per_second": 11.614, + "eval_steps_per_second": 1.507, + "step": 2500 + }, + { + "epoch": 40.0, + "grad_norm": 35.288352966308594, + "learning_rate": 9.7409e-06, + "loss": 1.3677, + "step": 2600 + }, + { + "epoch": 40.0, + "eval_loss": 1.447089433670044, + "eval_runtime": 11.1366, + "eval_samples_per_second": 11.763, + "eval_steps_per_second": 1.527, + "step": 2600 + }, + { + "epoch": 41.53846153846154, + "grad_norm": 65.49736022949219, + "learning_rate": 9.7309e-06, + "loss": 1.3453, + "step": 2700 + }, + { + "epoch": 41.53846153846154, + "eval_loss": 1.405300259590149, + "eval_runtime": 11.1549, + "eval_samples_per_second": 11.744, + "eval_steps_per_second": 1.524, + "step": 2700 + }, + { + "epoch": 43.07692307692308, + "grad_norm": 24.333518981933594, + "learning_rate": 9.7209e-06, + "loss": 1.3206, + "step": 2800 + }, + { + "epoch": 43.07692307692308, + "eval_loss": 1.4218717813491821, + "eval_runtime": 11.3113, + "eval_samples_per_second": 11.581, + "eval_steps_per_second": 1.503, + "step": 2800 + }, + { + "epoch": 44.61538461538461, + "grad_norm": 45.50777816772461, + "learning_rate": 9.7109e-06, + "loss": 1.3363, + "step": 2900 + }, + { + "epoch": 44.61538461538461, + "eval_loss": 1.4220006465911865, + "eval_runtime": 11.156, + "eval_samples_per_second": 11.743, + "eval_steps_per_second": 1.524, + "step": 2900 + }, + { + "epoch": 46.15384615384615, + "grad_norm": 25.898344039916992, + "learning_rate": 9.7009e-06, + "loss": 1.2995, + "step": 3000 + }, + { + "epoch": 46.15384615384615, + "eval_loss": 1.3942431211471558, + "eval_runtime": 11.2282, + "eval_samples_per_second": 11.667, + "eval_steps_per_second": 1.514, + "step": 3000 + }, + { + "epoch": 47.69230769230769, + "grad_norm": 56.0889892578125, + "learning_rate": 9.6909e-06, + "loss": 1.2994, + "step": 3100 + }, + { + "epoch": 47.69230769230769, + "eval_loss": 1.3970586061477661, + "eval_runtime": 11.1682, + "eval_samples_per_second": 11.73, + "eval_steps_per_second": 1.522, + "step": 3100 + }, + { + "epoch": 49.23076923076923, + "grad_norm": 58.10311508178711, + "learning_rate": 9.6809e-06, + "loss": 1.2761, + "step": 3200 + }, + { + "epoch": 49.23076923076923, + "eval_loss": 1.390371561050415, + "eval_runtime": 11.2406, + "eval_samples_per_second": 11.654, + "eval_steps_per_second": 1.512, + "step": 3200 + }, + { + "epoch": 50.76923076923077, + "grad_norm": 17.050870895385742, + "learning_rate": 9.670900000000001e-06, + "loss": 1.2712, + "step": 3300 + }, + { + "epoch": 50.76923076923077, + "eval_loss": 1.3936753273010254, + "eval_runtime": 11.2364, + "eval_samples_per_second": 11.659, + "eval_steps_per_second": 1.513, + "step": 3300 + }, + { + "epoch": 52.30769230769231, + "grad_norm": 232.21804809570312, + "learning_rate": 9.660900000000001e-06, + "loss": 1.262, + "step": 3400 + }, + { + "epoch": 52.30769230769231, + "eval_loss": 1.4037091732025146, + "eval_runtime": 11.2231, + "eval_samples_per_second": 11.672, + "eval_steps_per_second": 1.515, + "step": 3400 + }, + { + "epoch": 53.84615384615385, + "grad_norm": 35.11832046508789, + "learning_rate": 9.650900000000001e-06, + "loss": 1.2788, + "step": 3500 + }, + { + "epoch": 53.84615384615385, + "eval_loss": 1.3545280694961548, + "eval_runtime": 11.1609, + "eval_samples_per_second": 11.737, + "eval_steps_per_second": 1.523, + "step": 3500 + }, + { + "epoch": 55.38461538461539, + "grad_norm": 27.077022552490234, + "learning_rate": 9.640900000000001e-06, + "loss": 1.2711, + "step": 3600 + }, + { + "epoch": 55.38461538461539, + "eval_loss": 1.3528518676757812, + "eval_runtime": 11.2852, + "eval_samples_per_second": 11.608, + "eval_steps_per_second": 1.506, + "step": 3600 + }, + { + "epoch": 56.92307692307692, + "grad_norm": 46.97712326049805, + "learning_rate": 9.630900000000001e-06, + "loss": 1.2492, + "step": 3700 + }, + { + "epoch": 56.92307692307692, + "eval_loss": 1.3534098863601685, + "eval_runtime": 11.2368, + "eval_samples_per_second": 11.658, + "eval_steps_per_second": 1.513, + "step": 3700 + }, + { + "epoch": 58.46153846153846, + "grad_norm": 42.06857681274414, + "learning_rate": 9.620900000000001e-06, + "loss": 1.2506, + "step": 3800 + }, + { + "epoch": 58.46153846153846, + "eval_loss": 1.3613977432250977, + "eval_runtime": 11.2206, + "eval_samples_per_second": 11.675, + "eval_steps_per_second": 1.515, + "step": 3800 + }, + { + "epoch": 60.0, + "grad_norm": 19.298952102661133, + "learning_rate": 9.610900000000001e-06, + "loss": 1.2201, + "step": 3900 + }, + { + "epoch": 60.0, + "eval_loss": 1.3586474657058716, + "eval_runtime": 11.2045, + "eval_samples_per_second": 11.692, + "eval_steps_per_second": 1.517, + "step": 3900 + }, + { + "epoch": 61.53846153846154, + "grad_norm": 30.0198974609375, + "learning_rate": 9.600900000000002e-06, + "loss": 1.2086, + "step": 4000 + }, + { + "epoch": 61.53846153846154, + "eval_loss": 1.3304755687713623, + "eval_runtime": 11.193, + "eval_samples_per_second": 11.704, + "eval_steps_per_second": 1.519, + "step": 4000 + }, + { + "epoch": 63.07692307692308, + "grad_norm": 37.59902572631836, + "learning_rate": 9.5909e-06, + "loss": 1.2375, + "step": 4100 + }, + { + "epoch": 63.07692307692308, + "eval_loss": 1.331407904624939, + "eval_runtime": 10.6714, + "eval_samples_per_second": 12.276, + "eval_steps_per_second": 1.593, + "step": 4100 + }, + { + "epoch": 64.61538461538461, + "grad_norm": 36.82079315185547, + "learning_rate": 9.5809e-06, + "loss": 1.2148, + "step": 4200 + }, + { + "epoch": 64.61538461538461, + "eval_loss": 1.3441548347473145, + "eval_runtime": 10.7472, + "eval_samples_per_second": 12.189, + "eval_steps_per_second": 1.582, + "step": 4200 + }, + { + "epoch": 66.15384615384616, + "grad_norm": 30.974130630493164, + "learning_rate": 9.5709e-06, + "loss": 1.197, + "step": 4300 + }, + { + "epoch": 66.15384615384616, + "eval_loss": 1.34512197971344, + "eval_runtime": 10.7398, + "eval_samples_per_second": 12.198, + "eval_steps_per_second": 1.583, + "step": 4300 + }, + { + "epoch": 67.6923076923077, + "grad_norm": 20.45345115661621, + "learning_rate": 9.5609e-06, + "loss": 1.2361, + "step": 4400 + }, + { + "epoch": 67.6923076923077, + "eval_loss": 1.3371080160140991, + "eval_runtime": 10.654, + "eval_samples_per_second": 12.296, + "eval_steps_per_second": 1.596, + "step": 4400 + }, + { + "epoch": 69.23076923076923, + "grad_norm": 19.758630752563477, + "learning_rate": 9.5509e-06, + "loss": 1.2001, + "step": 4500 + }, + { + "epoch": 69.23076923076923, + "eval_loss": 1.3270760774612427, + "eval_runtime": 10.6928, + "eval_samples_per_second": 12.251, + "eval_steps_per_second": 1.59, + "step": 4500 + }, + { + "epoch": 70.76923076923077, + "grad_norm": 45.2899055480957, + "learning_rate": 9.5409e-06, + "loss": 1.192, + "step": 4600 + }, + { + "epoch": 70.76923076923077, + "eval_loss": 1.3184590339660645, + "eval_runtime": 10.7154, + "eval_samples_per_second": 12.225, + "eval_steps_per_second": 1.586, + "step": 4600 + }, + { + "epoch": 72.3076923076923, + "grad_norm": 45.60734939575195, + "learning_rate": 9.5309e-06, + "loss": 1.2081, + "step": 4700 + }, + { + "epoch": 72.3076923076923, + "eval_loss": 1.3107666969299316, + "eval_runtime": 10.7029, + "eval_samples_per_second": 12.24, + "eval_steps_per_second": 1.588, + "step": 4700 + }, + { + "epoch": 73.84615384615384, + "grad_norm": 26.859603881835938, + "learning_rate": 9.5209e-06, + "loss": 1.1729, + "step": 4800 + }, + { + "epoch": 73.84615384615384, + "eval_loss": 1.310062289237976, + "eval_runtime": 10.7544, + "eval_samples_per_second": 12.181, + "eval_steps_per_second": 1.581, + "step": 4800 + }, + { + "epoch": 75.38461538461539, + "grad_norm": 19.90914535522461, + "learning_rate": 9.5109e-06, + "loss": 1.1899, + "step": 4900 + }, + { + "epoch": 75.38461538461539, + "eval_loss": 1.3038017749786377, + "eval_runtime": 10.6532, + "eval_samples_per_second": 12.297, + "eval_steps_per_second": 1.596, + "step": 4900 + }, + { + "epoch": 76.92307692307692, + "grad_norm": 22.629934310913086, + "learning_rate": 9.5009e-06, + "loss": 1.1875, + "step": 5000 + }, + { + "epoch": 76.92307692307692, + "eval_loss": 1.2785409688949585, + "eval_runtime": 10.6388, + "eval_samples_per_second": 12.313, + "eval_steps_per_second": 1.598, + "step": 5000 + }, + { + "epoch": 78.46153846153847, + "grad_norm": 45.462059020996094, + "learning_rate": 9.490900000000001e-06, + "loss": 1.1717, + "step": 5100 + }, + { + "epoch": 78.46153846153847, + "eval_loss": 1.278078317642212, + "eval_runtime": 10.9666, + "eval_samples_per_second": 11.945, + "eval_steps_per_second": 1.55, + "step": 5100 + }, + { + "epoch": 80.0, + "grad_norm": 34.85255432128906, + "learning_rate": 9.480900000000001e-06, + "loss": 1.1657, + "step": 5200 + }, + { + "epoch": 80.0, + "eval_loss": 1.2711185216903687, + "eval_runtime": 11.0211, + "eval_samples_per_second": 11.886, + "eval_steps_per_second": 1.543, + "step": 5200 + }, + { + "epoch": 81.53846153846153, + "grad_norm": 19.078449249267578, + "learning_rate": 9.470900000000001e-06, + "loss": 1.1814, + "step": 5300 + }, + { + "epoch": 81.53846153846153, + "eval_loss": 1.2781996726989746, + "eval_runtime": 11.0663, + "eval_samples_per_second": 11.838, + "eval_steps_per_second": 1.536, + "step": 5300 + }, + { + "epoch": 83.07692307692308, + "grad_norm": 31.05898094177246, + "learning_rate": 9.460900000000001e-06, + "loss": 1.1452, + "step": 5400 + }, + { + "epoch": 83.07692307692308, + "eval_loss": 1.2848775386810303, + "eval_runtime": 11.01, + "eval_samples_per_second": 11.898, + "eval_steps_per_second": 1.544, + "step": 5400 + }, + { + "epoch": 84.61538461538461, + "grad_norm": 28.712461471557617, + "learning_rate": 9.450900000000001e-06, + "loss": 1.1465, + "step": 5500 + }, + { + "epoch": 84.61538461538461, + "eval_loss": 1.2928494215011597, + "eval_runtime": 10.9253, + "eval_samples_per_second": 11.991, + "eval_steps_per_second": 1.556, + "step": 5500 + }, + { + "epoch": 86.15384615384616, + "grad_norm": 19.871828079223633, + "learning_rate": 9.440900000000001e-06, + "loss": 1.1736, + "step": 5600 + }, + { + "epoch": 86.15384615384616, + "eval_loss": 1.2648124694824219, + "eval_runtime": 11.0314, + "eval_samples_per_second": 11.875, + "eval_steps_per_second": 1.541, + "step": 5600 + }, + { + "epoch": 87.6923076923077, + "grad_norm": 22.47665023803711, + "learning_rate": 9.4309e-06, + "loss": 1.1184, + "step": 5700 + }, + { + "epoch": 87.6923076923077, + "eval_loss": 1.2936598062515259, + "eval_runtime": 10.9322, + "eval_samples_per_second": 11.983, + "eval_steps_per_second": 1.555, + "step": 5700 + }, + { + "epoch": 89.23076923076923, + "grad_norm": 38.79877471923828, + "learning_rate": 9.4209e-06, + "loss": 1.1616, + "step": 5800 + }, + { + "epoch": 89.23076923076923, + "eval_loss": 1.2650004625320435, + "eval_runtime": 10.9434, + "eval_samples_per_second": 11.971, + "eval_steps_per_second": 1.553, + "step": 5800 + }, + { + "epoch": 90.76923076923077, + "grad_norm": 40.851097106933594, + "learning_rate": 9.4109e-06, + "loss": 1.1469, + "step": 5900 + }, + { + "epoch": 90.76923076923077, + "eval_loss": 1.252148151397705, + "eval_runtime": 10.9867, + "eval_samples_per_second": 11.923, + "eval_steps_per_second": 1.547, + "step": 5900 + }, + { + "epoch": 92.3076923076923, + "grad_norm": 21.35544204711914, + "learning_rate": 9.4009e-06, + "loss": 1.1489, + "step": 6000 + }, + { + "epoch": 92.3076923076923, + "eval_loss": 1.259344220161438, + "eval_runtime": 10.9649, + "eval_samples_per_second": 11.947, + "eval_steps_per_second": 1.55, + "step": 6000 + }, + { + "epoch": 93.84615384615384, + "grad_norm": 33.265132904052734, + "learning_rate": 9.3909e-06, + "loss": 1.1315, + "step": 6100 + }, + { + "epoch": 93.84615384615384, + "eval_loss": 1.252693772315979, + "eval_runtime": 10.9852, + "eval_samples_per_second": 11.925, + "eval_steps_per_second": 1.548, + "step": 6100 + }, + { + "epoch": 95.38461538461539, + "grad_norm": 23.43667221069336, + "learning_rate": 9.381e-06, + "loss": 1.119, + "step": 6200 + }, + { + "epoch": 95.38461538461539, + "eval_loss": 1.254772424697876, + "eval_runtime": 11.0937, + "eval_samples_per_second": 11.808, + "eval_steps_per_second": 1.532, + "step": 6200 + }, + { + "epoch": 96.92307692307692, + "grad_norm": 51.93602752685547, + "learning_rate": 9.371e-06, + "loss": 1.1333, + "step": 6300 + }, + { + "epoch": 96.92307692307692, + "eval_loss": 1.249408483505249, + "eval_runtime": 11.0388, + "eval_samples_per_second": 11.867, + "eval_steps_per_second": 1.54, + "step": 6300 + }, + { + "epoch": 98.46153846153847, + "grad_norm": 23.473421096801758, + "learning_rate": 9.361e-06, + "loss": 1.1164, + "step": 6400 + }, + { + "epoch": 98.46153846153847, + "eval_loss": 1.2438522577285767, + "eval_runtime": 11.1581, + "eval_samples_per_second": 11.74, + "eval_steps_per_second": 1.524, + "step": 6400 + }, + { + "epoch": 100.0, + "grad_norm": 24.228403091430664, + "learning_rate": 9.351e-06, + "loss": 1.1333, + "step": 6500 + }, + { + "epoch": 100.0, + "eval_loss": 1.248632788658142, + "eval_runtime": 10.9675, + "eval_samples_per_second": 11.944, + "eval_steps_per_second": 1.55, + "step": 6500 + }, + { + "epoch": 101.53846153846153, + "grad_norm": 18.29631996154785, + "learning_rate": 9.341000000000001e-06, + "loss": 1.1082, + "step": 6600 + }, + { + "epoch": 101.53846153846153, + "eval_loss": 1.2509865760803223, + "eval_runtime": 11.0693, + "eval_samples_per_second": 11.834, + "eval_steps_per_second": 1.536, + "step": 6600 + }, + { + "epoch": 103.07692307692308, + "grad_norm": 42.855491638183594, + "learning_rate": 9.331000000000001e-06, + "loss": 1.1178, + "step": 6700 + }, + { + "epoch": 103.07692307692308, + "eval_loss": 1.2890292406082153, + "eval_runtime": 11.0366, + "eval_samples_per_second": 11.87, + "eval_steps_per_second": 1.54, + "step": 6700 + }, + { + "epoch": 104.61538461538461, + "grad_norm": 46.675655364990234, + "learning_rate": 9.321000000000001e-06, + "loss": 1.1106, + "step": 6800 + }, + { + "epoch": 104.61538461538461, + "eval_loss": 1.2719863653182983, + "eval_runtime": 11.1266, + "eval_samples_per_second": 11.774, + "eval_steps_per_second": 1.528, + "step": 6800 + }, + { + "epoch": 106.15384615384616, + "grad_norm": 26.414846420288086, + "learning_rate": 9.311000000000001e-06, + "loss": 1.1216, + "step": 6900 + }, + { + "epoch": 106.15384615384616, + "eval_loss": 1.2423394918441772, + "eval_runtime": 11.0197, + "eval_samples_per_second": 11.888, + "eval_steps_per_second": 1.543, + "step": 6900 + }, + { + "epoch": 107.6923076923077, + "grad_norm": 31.4022274017334, + "learning_rate": 9.301000000000001e-06, + "loss": 1.1052, + "step": 7000 + }, + { + "epoch": 107.6923076923077, + "eval_loss": 1.2372961044311523, + "eval_runtime": 11.1127, + "eval_samples_per_second": 11.788, + "eval_steps_per_second": 1.53, + "step": 7000 + }, + { + "epoch": 109.23076923076923, + "grad_norm": 23.16703987121582, + "learning_rate": 9.291000000000001e-06, + "loss": 1.0911, + "step": 7100 + }, + { + "epoch": 109.23076923076923, + "eval_loss": 1.2309863567352295, + "eval_runtime": 10.9572, + "eval_samples_per_second": 11.956, + "eval_steps_per_second": 1.551, + "step": 7100 + }, + { + "epoch": 110.76923076923077, + "grad_norm": 21.648773193359375, + "learning_rate": 9.281000000000001e-06, + "loss": 1.0956, + "step": 7200 + }, + { + "epoch": 110.76923076923077, + "eval_loss": 1.2261079549789429, + "eval_runtime": 10.9351, + "eval_samples_per_second": 11.98, + "eval_steps_per_second": 1.555, + "step": 7200 + }, + { + "epoch": 112.3076923076923, + "grad_norm": 24.5791072845459, + "learning_rate": 9.271000000000002e-06, + "loss": 1.0751, + "step": 7300 + }, + { + "epoch": 112.3076923076923, + "eval_loss": 1.2161471843719482, + "eval_runtime": 11.1052, + "eval_samples_per_second": 11.796, + "eval_steps_per_second": 1.531, + "step": 7300 + }, + { + "epoch": 113.84615384615384, + "grad_norm": 35.867801666259766, + "learning_rate": 9.261000000000002e-06, + "loss": 1.086, + "step": 7400 + }, + { + "epoch": 113.84615384615384, + "eval_loss": 1.2092362642288208, + "eval_runtime": 11.2048, + "eval_samples_per_second": 11.691, + "eval_steps_per_second": 1.517, + "step": 7400 + }, + { + "epoch": 115.38461538461539, + "grad_norm": 67.91041564941406, + "learning_rate": 9.251000000000002e-06, + "loss": 1.092, + "step": 7500 + }, + { + "epoch": 115.38461538461539, + "eval_loss": 1.241829514503479, + "eval_runtime": 10.9937, + "eval_samples_per_second": 11.916, + "eval_steps_per_second": 1.546, + "step": 7500 + }, + { + "epoch": 116.92307692307692, + "grad_norm": 128.73751831054688, + "learning_rate": 9.241000000000002e-06, + "loss": 1.0764, + "step": 7600 + }, + { + "epoch": 116.92307692307692, + "eval_loss": 1.2462713718414307, + "eval_runtime": 10.9625, + "eval_samples_per_second": 11.95, + "eval_steps_per_second": 1.551, + "step": 7600 + }, + { + "epoch": 118.46153846153847, + "grad_norm": 86.5144271850586, + "learning_rate": 9.231000000000002e-06, + "loss": 1.0643, + "step": 7700 + }, + { + "epoch": 118.46153846153847, + "eval_loss": 1.2187525033950806, + "eval_runtime": 10.9492, + "eval_samples_per_second": 11.964, + "eval_steps_per_second": 1.553, + "step": 7700 + }, + { + "epoch": 120.0, + "grad_norm": 19.4710750579834, + "learning_rate": 9.221e-06, + "loss": 1.0966, + "step": 7800 + }, + { + "epoch": 120.0, + "eval_loss": 1.2282384634017944, + "eval_runtime": 10.9078, + "eval_samples_per_second": 12.01, + "eval_steps_per_second": 1.559, + "step": 7800 + }, + { + "epoch": 121.53846153846153, + "grad_norm": 37.673744201660156, + "learning_rate": 9.211e-06, + "loss": 1.0632, + "step": 7900 + }, + { + "epoch": 121.53846153846153, + "eval_loss": 1.2206230163574219, + "eval_runtime": 11.0018, + "eval_samples_per_second": 11.907, + "eval_steps_per_second": 1.545, + "step": 7900 + }, + { + "epoch": 123.07692307692308, + "grad_norm": 25.10326385498047, + "learning_rate": 9.201e-06, + "loss": 1.0873, + "step": 8000 + }, + { + "epoch": 123.07692307692308, + "eval_loss": 1.2137339115142822, + "eval_runtime": 10.9232, + "eval_samples_per_second": 11.993, + "eval_steps_per_second": 1.556, + "step": 8000 + }, + { + "epoch": 124.61538461538461, + "grad_norm": 32.02176284790039, + "learning_rate": 9.191e-06, + "loss": 1.0568, + "step": 8100 + }, + { + "epoch": 124.61538461538461, + "eval_loss": 1.2065187692642212, + "eval_runtime": 10.9614, + "eval_samples_per_second": 11.951, + "eval_steps_per_second": 1.551, + "step": 8100 + }, + { + "epoch": 126.15384615384616, + "grad_norm": 19.97406005859375, + "learning_rate": 9.181e-06, + "loss": 1.065, + "step": 8200 + }, + { + "epoch": 126.15384615384616, + "eval_loss": 1.2094841003417969, + "eval_runtime": 11.0274, + "eval_samples_per_second": 11.879, + "eval_steps_per_second": 1.542, + "step": 8200 + }, + { + "epoch": 127.6923076923077, + "grad_norm": 31.624399185180664, + "learning_rate": 9.171e-06, + "loss": 1.0805, + "step": 8300 + }, + { + "epoch": 127.6923076923077, + "eval_loss": 1.2149733304977417, + "eval_runtime": 11.2014, + "eval_samples_per_second": 11.695, + "eval_steps_per_second": 1.518, + "step": 8300 + }, + { + "epoch": 129.23076923076923, + "grad_norm": 29.24848747253418, + "learning_rate": 9.161000000000001e-06, + "loss": 1.0463, + "step": 8400 + }, + { + "epoch": 129.23076923076923, + "eval_loss": 1.2077099084854126, + "eval_runtime": 10.9432, + "eval_samples_per_second": 11.971, + "eval_steps_per_second": 1.553, + "step": 8400 + }, + { + "epoch": 130.76923076923077, + "grad_norm": 27.14787483215332, + "learning_rate": 9.151000000000001e-06, + "loss": 1.0607, + "step": 8500 + }, + { + "epoch": 130.76923076923077, + "eval_loss": 1.2046644687652588, + "eval_runtime": 11.001, + "eval_samples_per_second": 11.908, + "eval_steps_per_second": 1.545, + "step": 8500 + }, + { + "epoch": 132.30769230769232, + "grad_norm": 32.416194915771484, + "learning_rate": 9.141000000000001e-06, + "loss": 1.0365, + "step": 8600 + }, + { + "epoch": 132.30769230769232, + "eval_loss": 1.195080041885376, + "eval_runtime": 11.0205, + "eval_samples_per_second": 11.887, + "eval_steps_per_second": 1.543, + "step": 8600 + }, + { + "epoch": 133.84615384615384, + "grad_norm": 58.863582611083984, + "learning_rate": 9.131000000000001e-06, + "loss": 1.0564, + "step": 8700 + }, + { + "epoch": 133.84615384615384, + "eval_loss": 1.1966980695724487, + "eval_runtime": 10.9043, + "eval_samples_per_second": 12.014, + "eval_steps_per_second": 1.559, + "step": 8700 + }, + { + "epoch": 135.3846153846154, + "grad_norm": 26.08232307434082, + "learning_rate": 9.121000000000001e-06, + "loss": 1.0507, + "step": 8800 + }, + { + "epoch": 135.3846153846154, + "eval_loss": 1.201367735862732, + "eval_runtime": 10.9714, + "eval_samples_per_second": 11.94, + "eval_steps_per_second": 1.549, + "step": 8800 + }, + { + "epoch": 136.92307692307693, + "grad_norm": 59.368019104003906, + "learning_rate": 9.111000000000001e-06, + "loss": 1.0508, + "step": 8900 + }, + { + "epoch": 136.92307692307693, + "eval_loss": 1.208795428276062, + "eval_runtime": 11.0529, + "eval_samples_per_second": 11.852, + "eval_steps_per_second": 1.538, + "step": 8900 + }, + { + "epoch": 138.46153846153845, + "grad_norm": 43.07460021972656, + "learning_rate": 9.101000000000001e-06, + "loss": 1.0359, + "step": 9000 + }, + { + "epoch": 138.46153846153845, + "eval_loss": 1.1782392263412476, + "eval_runtime": 11.1519, + "eval_samples_per_second": 11.747, + "eval_steps_per_second": 1.524, + "step": 9000 + }, + { + "epoch": 140.0, + "grad_norm": 22.39567756652832, + "learning_rate": 9.091000000000002e-06, + "loss": 1.0584, + "step": 9100 + }, + { + "epoch": 140.0, + "eval_loss": 1.1873483657836914, + "eval_runtime": 10.8732, + "eval_samples_per_second": 12.048, + "eval_steps_per_second": 1.563, + "step": 9100 + }, + { + "epoch": 141.53846153846155, + "grad_norm": 22.813621520996094, + "learning_rate": 9.081000000000002e-06, + "loss": 1.0354, + "step": 9200 + }, + { + "epoch": 141.53846153846155, + "eval_loss": 1.1731117963790894, + "eval_runtime": 11.0847, + "eval_samples_per_second": 11.818, + "eval_steps_per_second": 1.534, + "step": 9200 + }, + { + "epoch": 143.07692307692307, + "grad_norm": 31.44073486328125, + "learning_rate": 9.071000000000002e-06, + "loss": 1.0457, + "step": 9300 + }, + { + "epoch": 143.07692307692307, + "eval_loss": 1.1962807178497314, + "eval_runtime": 11.0668, + "eval_samples_per_second": 11.837, + "eval_steps_per_second": 1.536, + "step": 9300 + }, + { + "epoch": 144.6153846153846, + "grad_norm": 18.18711280822754, + "learning_rate": 9.061e-06, + "loss": 1.0481, + "step": 9400 + }, + { + "epoch": 144.6153846153846, + "eval_loss": 1.185339093208313, + "eval_runtime": 10.9916, + "eval_samples_per_second": 11.918, + "eval_steps_per_second": 1.547, + "step": 9400 + }, + { + "epoch": 146.15384615384616, + "grad_norm": 38.05665969848633, + "learning_rate": 9.051e-06, + "loss": 1.0391, + "step": 9500 + }, + { + "epoch": 146.15384615384616, + "eval_loss": 1.1856777667999268, + "eval_runtime": 11.0391, + "eval_samples_per_second": 11.867, + "eval_steps_per_second": 1.54, + "step": 9500 + }, + { + "epoch": 147.69230769230768, + "grad_norm": 19.963260650634766, + "learning_rate": 9.041e-06, + "loss": 1.0322, + "step": 9600 + }, + { + "epoch": 147.69230769230768, + "eval_loss": 1.1843148469924927, + "eval_runtime": 11.0382, + "eval_samples_per_second": 11.868, + "eval_steps_per_second": 1.54, + "step": 9600 + }, + { + "epoch": 149.23076923076923, + "grad_norm": 28.58108901977539, + "learning_rate": 9.031e-06, + "loss": 1.0369, + "step": 9700 + }, + { + "epoch": 149.23076923076923, + "eval_loss": 1.1637182235717773, + "eval_runtime": 11.2766, + "eval_samples_per_second": 11.617, + "eval_steps_per_second": 1.508, + "step": 9700 + }, + { + "epoch": 150.76923076923077, + "grad_norm": 64.3956527709961, + "learning_rate": 9.021e-06, + "loss": 1.0519, + "step": 9800 + }, + { + "epoch": 150.76923076923077, + "eval_loss": 1.1796802282333374, + "eval_runtime": 11.3418, + "eval_samples_per_second": 11.55, + "eval_steps_per_second": 1.499, + "step": 9800 + }, + { + "epoch": 152.30769230769232, + "grad_norm": 18.857580184936523, + "learning_rate": 9.011e-06, + "loss": 1.0272, + "step": 9900 + }, + { + "epoch": 152.30769230769232, + "eval_loss": 1.1634279489517212, + "eval_runtime": 11.6749, + "eval_samples_per_second": 11.221, + "eval_steps_per_second": 1.456, + "step": 9900 + }, + { + "epoch": 153.84615384615384, + "grad_norm": 42.541683197021484, + "learning_rate": 9.001e-06, + "loss": 1.0287, + "step": 10000 + }, + { + "epoch": 153.84615384615384, + "eval_loss": 1.1698088645935059, + "eval_runtime": 11.068, + "eval_samples_per_second": 11.836, + "eval_steps_per_second": 1.536, + "step": 10000 + }, + { + "epoch": 155.3846153846154, + "grad_norm": 30.52286720275879, + "learning_rate": 8.991e-06, + "loss": 1.0237, + "step": 10100 + }, + { + "epoch": 155.3846153846154, + "eval_loss": 1.178311824798584, + "eval_runtime": 11.1107, + "eval_samples_per_second": 11.79, + "eval_steps_per_second": 1.53, + "step": 10100 + }, + { + "epoch": 156.92307692307693, + "grad_norm": 32.60612487792969, + "learning_rate": 8.981000000000001e-06, + "loss": 1.0362, + "step": 10200 + }, + { + "epoch": 156.92307692307693, + "eval_loss": 1.157893180847168, + "eval_runtime": 11.0484, + "eval_samples_per_second": 11.857, + "eval_steps_per_second": 1.539, + "step": 10200 + }, + { + "epoch": 158.46153846153845, + "grad_norm": 26.15647315979004, + "learning_rate": 8.971000000000001e-06, + "loss": 0.998, + "step": 10300 + }, + { + "epoch": 158.46153846153845, + "eval_loss": 1.1724605560302734, + "eval_runtime": 11.2323, + "eval_samples_per_second": 11.663, + "eval_steps_per_second": 1.513, + "step": 10300 + }, + { + "epoch": 160.0, + "grad_norm": 45.76993942260742, + "learning_rate": 8.961000000000001e-06, + "loss": 1.0432, + "step": 10400 + }, + { + "epoch": 160.0, + "eval_loss": 1.1688075065612793, + "eval_runtime": 11.0984, + "eval_samples_per_second": 11.803, + "eval_steps_per_second": 1.532, + "step": 10400 + }, + { + "epoch": 161.53846153846155, + "grad_norm": 40.384578704833984, + "learning_rate": 8.951000000000001e-06, + "loss": 1.0421, + "step": 10500 + }, + { + "epoch": 161.53846153846155, + "eval_loss": 1.1621845960617065, + "eval_runtime": 11.1624, + "eval_samples_per_second": 11.736, + "eval_steps_per_second": 1.523, + "step": 10500 + }, + { + "epoch": 163.07692307692307, + "grad_norm": 22.886058807373047, + "learning_rate": 8.941000000000001e-06, + "loss": 0.9912, + "step": 10600 + }, + { + "epoch": 163.07692307692307, + "eval_loss": 1.1695055961608887, + "eval_runtime": 10.9842, + "eval_samples_per_second": 11.926, + "eval_steps_per_second": 1.548, + "step": 10600 + }, + { + "epoch": 164.6153846153846, + "grad_norm": 26.940736770629883, + "learning_rate": 8.931000000000001e-06, + "loss": 1.011, + "step": 10700 + }, + { + "epoch": 164.6153846153846, + "eval_loss": 1.1458157300949097, + "eval_runtime": 11.025, + "eval_samples_per_second": 11.882, + "eval_steps_per_second": 1.542, + "step": 10700 + }, + { + "epoch": 166.15384615384616, + "grad_norm": 26.5013484954834, + "learning_rate": 8.921000000000001e-06, + "loss": 0.9876, + "step": 10800 + }, + { + "epoch": 166.15384615384616, + "eval_loss": 1.1569631099700928, + "eval_runtime": 11.101, + "eval_samples_per_second": 11.801, + "eval_steps_per_second": 1.531, + "step": 10800 + }, + { + "epoch": 167.69230769230768, + "grad_norm": 29.859987258911133, + "learning_rate": 8.911000000000002e-06, + "loss": 1.0374, + "step": 10900 + }, + { + "epoch": 167.69230769230768, + "eval_loss": 1.149316668510437, + "eval_runtime": 11.1242, + "eval_samples_per_second": 11.776, + "eval_steps_per_second": 1.528, + "step": 10900 + }, + { + "epoch": 169.23076923076923, + "grad_norm": 27.777738571166992, + "learning_rate": 8.901e-06, + "loss": 0.985, + "step": 11000 + }, + { + "epoch": 169.23076923076923, + "eval_loss": 1.1608328819274902, + "eval_runtime": 11.2215, + "eval_samples_per_second": 11.674, + "eval_steps_per_second": 1.515, + "step": 11000 + }, + { + "epoch": 170.76923076923077, + "grad_norm": 39.21344757080078, + "learning_rate": 8.891e-06, + "loss": 1.0049, + "step": 11100 + }, + { + "epoch": 170.76923076923077, + "eval_loss": 1.1642228364944458, + "eval_runtime": 11.1947, + "eval_samples_per_second": 11.702, + "eval_steps_per_second": 1.519, + "step": 11100 + }, + { + "epoch": 172.30769230769232, + "grad_norm": 29.880149841308594, + "learning_rate": 8.881e-06, + "loss": 0.9843, + "step": 11200 + }, + { + "epoch": 172.30769230769232, + "eval_loss": 1.1574000120162964, + "eval_runtime": 11.2634, + "eval_samples_per_second": 11.631, + "eval_steps_per_second": 1.509, + "step": 11200 + }, + { + "epoch": 173.84615384615384, + "grad_norm": 63.53031539916992, + "learning_rate": 8.871e-06, + "loss": 1.0354, + "step": 11300 + }, + { + "epoch": 173.84615384615384, + "eval_loss": 1.1575734615325928, + "eval_runtime": 11.0265, + "eval_samples_per_second": 11.88, + "eval_steps_per_second": 1.542, + "step": 11300 + }, + { + "epoch": 175.3846153846154, + "grad_norm": 26.937786102294922, + "learning_rate": 8.861e-06, + "loss": 0.9964, + "step": 11400 + }, + { + "epoch": 175.3846153846154, + "eval_loss": 1.1552445888519287, + "eval_runtime": 11.0325, + "eval_samples_per_second": 11.874, + "eval_steps_per_second": 1.541, + "step": 11400 + }, + { + "epoch": 176.92307692307693, + "grad_norm": 87.28536987304688, + "learning_rate": 8.851e-06, + "loss": 0.9932, + "step": 11500 + }, + { + "epoch": 176.92307692307693, + "eval_loss": 1.1411677598953247, + "eval_runtime": 11.1527, + "eval_samples_per_second": 11.746, + "eval_steps_per_second": 1.524, + "step": 11500 + }, + { + "epoch": 178.46153846153845, + "grad_norm": 25.903568267822266, + "learning_rate": 8.841e-06, + "loss": 0.9768, + "step": 11600 + }, + { + "epoch": 178.46153846153845, + "eval_loss": 1.1635726690292358, + "eval_runtime": 11.123, + "eval_samples_per_second": 11.777, + "eval_steps_per_second": 1.528, + "step": 11600 + }, + { + "epoch": 180.0, + "grad_norm": 24.315654754638672, + "learning_rate": 8.831e-06, + "loss": 0.9984, + "step": 11700 + }, + { + "epoch": 180.0, + "eval_loss": 1.1710366010665894, + "eval_runtime": 11.1375, + "eval_samples_per_second": 11.762, + "eval_steps_per_second": 1.526, + "step": 11700 + }, + { + "epoch": 181.53846153846155, + "grad_norm": 60.17182540893555, + "learning_rate": 8.821e-06, + "loss": 0.9703, + "step": 11800 + }, + { + "epoch": 181.53846153846155, + "eval_loss": 1.1556936502456665, + "eval_runtime": 11.0387, + "eval_samples_per_second": 11.867, + "eval_steps_per_second": 1.54, + "step": 11800 + }, + { + "epoch": 183.07692307692307, + "grad_norm": 32.373477935791016, + "learning_rate": 8.811000000000001e-06, + "loss": 0.9996, + "step": 11900 + }, + { + "epoch": 183.07692307692307, + "eval_loss": 1.146790623664856, + "eval_runtime": 11.1805, + "eval_samples_per_second": 11.717, + "eval_steps_per_second": 1.521, + "step": 11900 + }, + { + "epoch": 184.6153846153846, + "grad_norm": 42.578575134277344, + "learning_rate": 8.801000000000001e-06, + "loss": 0.9795, + "step": 12000 + }, + { + "epoch": 184.6153846153846, + "eval_loss": 1.144544005393982, + "eval_runtime": 11.0704, + "eval_samples_per_second": 11.833, + "eval_steps_per_second": 1.536, + "step": 12000 + }, + { + "epoch": 186.15384615384616, + "grad_norm": 22.79789161682129, + "learning_rate": 8.791000000000001e-06, + "loss": 0.9905, + "step": 12100 + }, + { + "epoch": 186.15384615384616, + "eval_loss": 1.1581685543060303, + "eval_runtime": 11.272, + "eval_samples_per_second": 11.622, + "eval_steps_per_second": 1.508, + "step": 12100 + }, + { + "epoch": 187.69230769230768, + "grad_norm": 52.061012268066406, + "learning_rate": 8.781200000000002e-06, + "loss": 0.9817, + "step": 12200 + }, + { + "epoch": 187.69230769230768, + "eval_loss": 1.159809947013855, + "eval_runtime": 11.2021, + "eval_samples_per_second": 11.694, + "eval_steps_per_second": 1.518, + "step": 12200 + }, + { + "epoch": 189.23076923076923, + "grad_norm": 35.95882034301758, + "learning_rate": 8.7712e-06, + "loss": 1.0071, + "step": 12300 + }, + { + "epoch": 189.23076923076923, + "eval_loss": 1.1944890022277832, + "eval_runtime": 10.9263, + "eval_samples_per_second": 11.989, + "eval_steps_per_second": 1.556, + "step": 12300 + }, + { + "epoch": 190.76923076923077, + "grad_norm": 213.48587036132812, + "learning_rate": 8.7612e-06, + "loss": 0.9997, + "step": 12400 + }, + { + "epoch": 190.76923076923077, + "eval_loss": 1.191455602645874, + "eval_runtime": 10.917, + "eval_samples_per_second": 12.0, + "eval_steps_per_second": 1.557, + "step": 12400 + }, + { + "epoch": 192.30769230769232, + "grad_norm": 19.97510528564453, + "learning_rate": 8.7512e-06, + "loss": 1.001, + "step": 12500 + }, + { + "epoch": 192.30769230769232, + "eval_loss": 1.167776346206665, + "eval_runtime": 11.033, + "eval_samples_per_second": 11.873, + "eval_steps_per_second": 1.541, + "step": 12500 + }, + { + "epoch": 193.84615384615384, + "grad_norm": 30.815828323364258, + "learning_rate": 8.7412e-06, + "loss": 0.9719, + "step": 12600 + }, + { + "epoch": 193.84615384615384, + "eval_loss": 1.150451898574829, + "eval_runtime": 11.2471, + "eval_samples_per_second": 11.647, + "eval_steps_per_second": 1.512, + "step": 12600 + }, + { + "epoch": 195.3846153846154, + "grad_norm": 40.32701110839844, + "learning_rate": 8.7312e-06, + "loss": 0.9658, + "step": 12700 + }, + { + "epoch": 195.3846153846154, + "eval_loss": 1.1517494916915894, + "eval_runtime": 11.1893, + "eval_samples_per_second": 11.708, + "eval_steps_per_second": 1.519, + "step": 12700 + }, + { + "epoch": 196.92307692307693, + "grad_norm": 42.11077117919922, + "learning_rate": 8.7212e-06, + "loss": 0.9744, + "step": 12800 + }, + { + "epoch": 196.92307692307693, + "eval_loss": 1.1507395505905151, + "eval_runtime": 11.2196, + "eval_samples_per_second": 11.676, + "eval_steps_per_second": 1.515, + "step": 12800 + }, + { + "epoch": 198.46153846153845, + "grad_norm": 20.991779327392578, + "learning_rate": 8.7112e-06, + "loss": 0.9695, + "step": 12900 + }, + { + "epoch": 198.46153846153845, + "eval_loss": 1.1557880640029907, + "eval_runtime": 11.2158, + "eval_samples_per_second": 11.68, + "eval_steps_per_second": 1.516, + "step": 12900 + }, + { + "epoch": 200.0, + "grad_norm": 22.79688835144043, + "learning_rate": 8.7012e-06, + "loss": 0.9652, + "step": 13000 + }, + { + "epoch": 200.0, + "eval_loss": 1.1554670333862305, + "eval_runtime": 11.1246, + "eval_samples_per_second": 11.776, + "eval_steps_per_second": 1.528, + "step": 13000 + }, + { + "epoch": 201.53846153846155, + "grad_norm": 180.34512329101562, + "learning_rate": 8.6912e-06, + "loss": 0.9664, + "step": 13100 + }, + { + "epoch": 201.53846153846155, + "eval_loss": 1.1403967142105103, + "eval_runtime": 10.9895, + "eval_samples_per_second": 11.921, + "eval_steps_per_second": 1.547, + "step": 13100 + }, + { + "epoch": 203.07692307692307, + "grad_norm": 31.583358764648438, + "learning_rate": 8.6812e-06, + "loss": 0.9672, + "step": 13200 + }, + { + "epoch": 203.07692307692307, + "eval_loss": 1.1569470167160034, + "eval_runtime": 11.2695, + "eval_samples_per_second": 11.624, + "eval_steps_per_second": 1.508, + "step": 13200 + }, + { + "epoch": 204.6153846153846, + "grad_norm": 34.05722427368164, + "learning_rate": 8.671200000000001e-06, + "loss": 0.9531, + "step": 13300 + }, + { + "epoch": 204.6153846153846, + "eval_loss": 1.1408321857452393, + "eval_runtime": 11.1074, + "eval_samples_per_second": 11.794, + "eval_steps_per_second": 1.531, + "step": 13300 + }, + { + "epoch": 206.15384615384616, + "grad_norm": 26.748388290405273, + "learning_rate": 8.661200000000001e-06, + "loss": 0.9484, + "step": 13400 + }, + { + "epoch": 206.15384615384616, + "eval_loss": 1.151693344116211, + "eval_runtime": 10.9585, + "eval_samples_per_second": 11.954, + "eval_steps_per_second": 1.551, + "step": 13400 + }, + { + "epoch": 207.69230769230768, + "grad_norm": 19.531770706176758, + "learning_rate": 8.651200000000001e-06, + "loss": 0.971, + "step": 13500 + }, + { + "epoch": 207.69230769230768, + "eval_loss": 1.138724684715271, + "eval_runtime": 11.0296, + "eval_samples_per_second": 11.877, + "eval_steps_per_second": 1.541, + "step": 13500 + }, + { + "epoch": 209.23076923076923, + "grad_norm": 23.87537384033203, + "learning_rate": 8.641200000000001e-06, + "loss": 0.944, + "step": 13600 + }, + { + "epoch": 209.23076923076923, + "eval_loss": 1.1402664184570312, + "eval_runtime": 11.1505, + "eval_samples_per_second": 11.748, + "eval_steps_per_second": 1.525, + "step": 13600 + }, + { + "epoch": 210.76923076923077, + "grad_norm": 25.069852828979492, + "learning_rate": 8.631200000000001e-06, + "loss": 0.9581, + "step": 13700 + }, + { + "epoch": 210.76923076923077, + "eval_loss": 1.1348073482513428, + "eval_runtime": 11.063, + "eval_samples_per_second": 11.841, + "eval_steps_per_second": 1.537, + "step": 13700 + }, + { + "epoch": 212.30769230769232, + "grad_norm": 23.794719696044922, + "learning_rate": 8.621200000000001e-06, + "loss": 0.957, + "step": 13800 + }, + { + "epoch": 212.30769230769232, + "eval_loss": 1.143198013305664, + "eval_runtime": 11.1422, + "eval_samples_per_second": 11.757, + "eval_steps_per_second": 1.526, + "step": 13800 + }, + { + "epoch": 213.84615384615384, + "grad_norm": 26.059829711914062, + "learning_rate": 8.611200000000002e-06, + "loss": 0.9554, + "step": 13900 + }, + { + "epoch": 213.84615384615384, + "eval_loss": 1.1333541870117188, + "eval_runtime": 10.948, + "eval_samples_per_second": 11.966, + "eval_steps_per_second": 1.553, + "step": 13900 + }, + { + "epoch": 215.3846153846154, + "grad_norm": 49.8937873840332, + "learning_rate": 8.6012e-06, + "loss": 0.9607, + "step": 14000 + }, + { + "epoch": 215.3846153846154, + "eval_loss": 1.1584446430206299, + "eval_runtime": 11.2488, + "eval_samples_per_second": 11.646, + "eval_steps_per_second": 1.511, + "step": 14000 + }, + { + "epoch": 216.92307692307693, + "grad_norm": 17.9267520904541, + "learning_rate": 8.5912e-06, + "loss": 0.9444, + "step": 14100 + }, + { + "epoch": 216.92307692307693, + "eval_loss": 1.1573532819747925, + "eval_runtime": 11.143, + "eval_samples_per_second": 11.756, + "eval_steps_per_second": 1.526, + "step": 14100 + }, + { + "epoch": 218.46153846153845, + "grad_norm": 27.38156509399414, + "learning_rate": 8.5812e-06, + "loss": 0.928, + "step": 14200 + }, + { + "epoch": 218.46153846153845, + "eval_loss": 1.1540145874023438, + "eval_runtime": 11.0475, + "eval_samples_per_second": 11.858, + "eval_steps_per_second": 1.539, + "step": 14200 + }, + { + "epoch": 220.0, + "grad_norm": 42.785037994384766, + "learning_rate": 8.5712e-06, + "loss": 0.9548, + "step": 14300 + }, + { + "epoch": 220.0, + "eval_loss": 1.1379021406173706, + "eval_runtime": 10.9412, + "eval_samples_per_second": 11.973, + "eval_steps_per_second": 1.554, + "step": 14300 + }, + { + "epoch": 221.53846153846155, + "grad_norm": 39.50480270385742, + "learning_rate": 8.5612e-06, + "loss": 0.9583, + "step": 14400 + }, + { + "epoch": 221.53846153846155, + "eval_loss": 1.1666078567504883, + "eval_runtime": 11.2066, + "eval_samples_per_second": 11.69, + "eval_steps_per_second": 1.517, + "step": 14400 + }, + { + "epoch": 223.07692307692307, + "grad_norm": 15.560932159423828, + "learning_rate": 8.5512e-06, + "loss": 0.9306, + "step": 14500 + }, + { + "epoch": 223.07692307692307, + "eval_loss": 1.151904582977295, + "eval_runtime": 11.2376, + "eval_samples_per_second": 11.657, + "eval_steps_per_second": 1.513, + "step": 14500 + }, + { + "epoch": 224.6153846153846, + "grad_norm": 36.12020492553711, + "learning_rate": 8.541400000000001e-06, + "loss": 0.9668, + "step": 14600 + }, + { + "epoch": 224.6153846153846, + "eval_loss": 1.139450192451477, + "eval_runtime": 11.1643, + "eval_samples_per_second": 11.734, + "eval_steps_per_second": 1.523, + "step": 14600 + }, + { + "epoch": 226.15384615384616, + "grad_norm": 31.29511070251465, + "learning_rate": 8.531400000000001e-06, + "loss": 0.9646, + "step": 14700 + }, + { + "epoch": 226.15384615384616, + "eval_loss": 1.1311490535736084, + "eval_runtime": 11.0, + "eval_samples_per_second": 11.909, + "eval_steps_per_second": 1.545, + "step": 14700 + }, + { + "epoch": 227.69230769230768, + "grad_norm": 22.08748435974121, + "learning_rate": 8.521400000000001e-06, + "loss": 0.922, + "step": 14800 + }, + { + "epoch": 227.69230769230768, + "eval_loss": 1.1504499912261963, + "eval_runtime": 11.135, + "eval_samples_per_second": 11.765, + "eval_steps_per_second": 1.527, + "step": 14800 + }, + { + "epoch": 229.23076923076923, + "grad_norm": 26.33457374572754, + "learning_rate": 8.511400000000001e-06, + "loss": 0.9306, + "step": 14900 + }, + { + "epoch": 229.23076923076923, + "eval_loss": 1.136217713356018, + "eval_runtime": 11.1465, + "eval_samples_per_second": 11.753, + "eval_steps_per_second": 1.525, + "step": 14900 + }, + { + "epoch": 230.76923076923077, + "grad_norm": 49.193206787109375, + "learning_rate": 8.501400000000001e-06, + "loss": 0.938, + "step": 15000 + }, + { + "epoch": 230.76923076923077, + "eval_loss": 1.1409271955490112, + "eval_runtime": 11.0858, + "eval_samples_per_second": 11.817, + "eval_steps_per_second": 1.533, + "step": 15000 + }, + { + "epoch": 232.30769230769232, + "grad_norm": 22.36850357055664, + "learning_rate": 8.491400000000001e-06, + "loss": 0.9218, + "step": 15100 + }, + { + "epoch": 232.30769230769232, + "eval_loss": 1.131103515625, + "eval_runtime": 11.0691, + "eval_samples_per_second": 11.835, + "eval_steps_per_second": 1.536, + "step": 15100 + }, + { + "epoch": 233.84615384615384, + "grad_norm": 26.34011459350586, + "learning_rate": 8.481400000000002e-06, + "loss": 0.9617, + "step": 15200 + }, + { + "epoch": 233.84615384615384, + "eval_loss": 1.1415542364120483, + "eval_runtime": 10.9887, + "eval_samples_per_second": 11.921, + "eval_steps_per_second": 1.547, + "step": 15200 + }, + { + "epoch": 235.3846153846154, + "grad_norm": 29.583358764648438, + "learning_rate": 8.4714e-06, + "loss": 0.9272, + "step": 15300 + }, + { + "epoch": 235.3846153846154, + "eval_loss": 1.144914150238037, + "eval_runtime": 11.2646, + "eval_samples_per_second": 11.629, + "eval_steps_per_second": 1.509, + "step": 15300 + }, + { + "epoch": 236.92307692307693, + "grad_norm": 31.824247360229492, + "learning_rate": 8.4614e-06, + "loss": 0.9207, + "step": 15400 + }, + { + "epoch": 236.92307692307693, + "eval_loss": 1.1387474536895752, + "eval_runtime": 11.0721, + "eval_samples_per_second": 11.832, + "eval_steps_per_second": 1.535, + "step": 15400 + }, + { + "epoch": 238.46153846153845, + "grad_norm": 41.94277572631836, + "learning_rate": 8.4514e-06, + "loss": 0.9454, + "step": 15500 + }, + { + "epoch": 238.46153846153845, + "eval_loss": 1.1316168308258057, + "eval_runtime": 11.1831, + "eval_samples_per_second": 11.714, + "eval_steps_per_second": 1.52, + "step": 15500 + }, + { + "epoch": 240.0, + "grad_norm": 21.150598526000977, + "learning_rate": 8.4414e-06, + "loss": 0.9249, + "step": 15600 + }, + { + "epoch": 240.0, + "eval_loss": 1.1368097066879272, + "eval_runtime": 11.0887, + "eval_samples_per_second": 11.814, + "eval_steps_per_second": 1.533, + "step": 15600 + }, + { + "epoch": 241.53846153846155, + "grad_norm": 47.432212829589844, + "learning_rate": 8.4314e-06, + "loss": 0.9212, + "step": 15700 + }, + { + "epoch": 241.53846153846155, + "eval_loss": 1.125348448753357, + "eval_runtime": 11.2434, + "eval_samples_per_second": 11.651, + "eval_steps_per_second": 1.512, + "step": 15700 + }, + { + "epoch": 243.07692307692307, + "grad_norm": 28.406036376953125, + "learning_rate": 8.4214e-06, + "loss": 0.9272, + "step": 15800 + }, + { + "epoch": 243.07692307692307, + "eval_loss": 1.1328097581863403, + "eval_runtime": 11.1097, + "eval_samples_per_second": 11.791, + "eval_steps_per_second": 1.53, + "step": 15800 + }, + { + "epoch": 244.6153846153846, + "grad_norm": 53.369564056396484, + "learning_rate": 8.4114e-06, + "loss": 0.9174, + "step": 15900 + }, + { + "epoch": 244.6153846153846, + "eval_loss": 1.1235861778259277, + "eval_runtime": 10.9581, + "eval_samples_per_second": 11.955, + "eval_steps_per_second": 1.551, + "step": 15900 + }, + { + "epoch": 246.15384615384616, + "grad_norm": 31.435935974121094, + "learning_rate": 8.4014e-06, + "loss": 0.9041, + "step": 16000 + }, + { + "epoch": 246.15384615384616, + "eval_loss": 1.1266686916351318, + "eval_runtime": 11.1276, + "eval_samples_per_second": 11.773, + "eval_steps_per_second": 1.528, + "step": 16000 + }, + { + "epoch": 247.69230769230768, + "grad_norm": 32.799991607666016, + "learning_rate": 8.3914e-06, + "loss": 0.9062, + "step": 16100 + }, + { + "epoch": 247.69230769230768, + "eval_loss": 1.1481693983078003, + "eval_runtime": 11.0435, + "eval_samples_per_second": 11.862, + "eval_steps_per_second": 1.539, + "step": 16100 + }, + { + "epoch": 249.23076923076923, + "grad_norm": 36.49935531616211, + "learning_rate": 8.3814e-06, + "loss": 0.9163, + "step": 16200 + }, + { + "epoch": 249.23076923076923, + "eval_loss": 1.139769434928894, + "eval_runtime": 11.1922, + "eval_samples_per_second": 11.705, + "eval_steps_per_second": 1.519, + "step": 16200 + }, + { + "epoch": 250.76923076923077, + "grad_norm": 35.4781379699707, + "learning_rate": 8.371400000000001e-06, + "loss": 0.9219, + "step": 16300 + }, + { + "epoch": 250.76923076923077, + "eval_loss": 1.1498539447784424, + "eval_runtime": 11.1805, + "eval_samples_per_second": 11.717, + "eval_steps_per_second": 1.52, + "step": 16300 + }, + { + "epoch": 252.30769230769232, + "grad_norm": 17.645612716674805, + "learning_rate": 8.361400000000001e-06, + "loss": 0.9278, + "step": 16400 + }, + { + "epoch": 252.30769230769232, + "eval_loss": 1.1338902711868286, + "eval_runtime": 11.1146, + "eval_samples_per_second": 11.786, + "eval_steps_per_second": 1.53, + "step": 16400 + }, + { + "epoch": 253.84615384615384, + "grad_norm": 32.81660079956055, + "learning_rate": 8.351400000000001e-06, + "loss": 0.9108, + "step": 16500 + }, + { + "epoch": 253.84615384615384, + "eval_loss": 1.1279574632644653, + "eval_runtime": 11.1151, + "eval_samples_per_second": 11.786, + "eval_steps_per_second": 1.529, + "step": 16500 + }, + { + "epoch": 255.3846153846154, + "grad_norm": 22.3878116607666, + "learning_rate": 8.341400000000001e-06, + "loss": 0.9011, + "step": 16600 + }, + { + "epoch": 255.3846153846154, + "eval_loss": 1.1570419073104858, + "eval_runtime": 11.0784, + "eval_samples_per_second": 11.825, + "eval_steps_per_second": 1.535, + "step": 16600 + }, + { + "epoch": 256.9230769230769, + "grad_norm": 29.845205307006836, + "learning_rate": 8.331400000000001e-06, + "loss": 0.9314, + "step": 16700 + }, + { + "epoch": 256.9230769230769, + "eval_loss": 1.1365561485290527, + "eval_runtime": 11.2405, + "eval_samples_per_second": 11.654, + "eval_steps_per_second": 1.512, + "step": 16700 + }, + { + "epoch": 258.46153846153845, + "grad_norm": 21.02674102783203, + "learning_rate": 8.321400000000001e-06, + "loss": 0.9021, + "step": 16800 + }, + { + "epoch": 258.46153846153845, + "eval_loss": 1.164974570274353, + "eval_runtime": 11.0643, + "eval_samples_per_second": 11.84, + "eval_steps_per_second": 1.536, + "step": 16800 + }, + { + "epoch": 260.0, + "grad_norm": 22.380117416381836, + "learning_rate": 8.3114e-06, + "loss": 0.912, + "step": 16900 + }, + { + "epoch": 260.0, + "eval_loss": 1.1483317613601685, + "eval_runtime": 11.1852, + "eval_samples_per_second": 11.712, + "eval_steps_per_second": 1.52, + "step": 16900 + }, + { + "epoch": 261.53846153846155, + "grad_norm": 39.20146560668945, + "learning_rate": 8.3014e-06, + "loss": 0.9165, + "step": 17000 + }, + { + "epoch": 261.53846153846155, + "eval_loss": 1.159449577331543, + "eval_runtime": 11.4058, + "eval_samples_per_second": 11.485, + "eval_steps_per_second": 1.49, + "step": 17000 + }, + { + "epoch": 263.0769230769231, + "grad_norm": 46.305389404296875, + "learning_rate": 8.2914e-06, + "loss": 0.916, + "step": 17100 + }, + { + "epoch": 263.0769230769231, + "eval_loss": 1.146033525466919, + "eval_runtime": 11.3638, + "eval_samples_per_second": 11.528, + "eval_steps_per_second": 1.496, + "step": 17100 + }, + { + "epoch": 264.61538461538464, + "grad_norm": 33.07489776611328, + "learning_rate": 8.2814e-06, + "loss": 0.9147, + "step": 17200 + }, + { + "epoch": 264.61538461538464, + "eval_loss": 1.143062710762024, + "eval_runtime": 11.3544, + "eval_samples_per_second": 11.537, + "eval_steps_per_second": 1.497, + "step": 17200 + }, + { + "epoch": 266.15384615384613, + "grad_norm": 35.233131408691406, + "learning_rate": 8.2714e-06, + "loss": 0.9151, + "step": 17300 + }, + { + "epoch": 266.15384615384613, + "eval_loss": 1.126172661781311, + "eval_runtime": 11.1185, + "eval_samples_per_second": 11.782, + "eval_steps_per_second": 1.529, + "step": 17300 + }, + { + "epoch": 267.6923076923077, + "grad_norm": 32.72975540161133, + "learning_rate": 8.2614e-06, + "loss": 0.8881, + "step": 17400 + }, + { + "epoch": 267.6923076923077, + "eval_loss": 1.1455607414245605, + "eval_runtime": 11.0568, + "eval_samples_per_second": 11.848, + "eval_steps_per_second": 1.538, + "step": 17400 + }, + { + "epoch": 269.2307692307692, + "grad_norm": 16.41983985900879, + "learning_rate": 8.2514e-06, + "loss": 0.9027, + "step": 17500 + }, + { + "epoch": 269.2307692307692, + "eval_loss": 1.1283539533615112, + "eval_runtime": 11.3233, + "eval_samples_per_second": 11.569, + "eval_steps_per_second": 1.501, + "step": 17500 + }, + { + "epoch": 270.7692307692308, + "grad_norm": 20.32726287841797, + "learning_rate": 8.2414e-06, + "loss": 0.9391, + "step": 17600 + }, + { + "epoch": 270.7692307692308, + "eval_loss": 1.124210000038147, + "eval_runtime": 11.3345, + "eval_samples_per_second": 11.558, + "eval_steps_per_second": 1.5, + "step": 17600 + }, + { + "epoch": 272.3076923076923, + "grad_norm": 23.14797019958496, + "learning_rate": 8.2314e-06, + "loss": 0.8899, + "step": 17700 + }, + { + "epoch": 272.3076923076923, + "eval_loss": 1.1297597885131836, + "eval_runtime": 11.1272, + "eval_samples_per_second": 11.773, + "eval_steps_per_second": 1.528, + "step": 17700 + }, + { + "epoch": 273.84615384615387, + "grad_norm": 18.778406143188477, + "learning_rate": 8.2214e-06, + "loss": 0.9074, + "step": 17800 + }, + { + "epoch": 273.84615384615387, + "eval_loss": 1.135562777519226, + "eval_runtime": 11.2964, + "eval_samples_per_second": 11.597, + "eval_steps_per_second": 1.505, + "step": 17800 + }, + { + "epoch": 275.38461538461536, + "grad_norm": 27.574323654174805, + "learning_rate": 8.2114e-06, + "loss": 0.8931, + "step": 17900 + }, + { + "epoch": 275.38461538461536, + "eval_loss": 1.1423242092132568, + "eval_runtime": 11.0992, + "eval_samples_per_second": 11.803, + "eval_steps_per_second": 1.532, + "step": 17900 + }, + { + "epoch": 276.9230769230769, + "grad_norm": 26.559467315673828, + "learning_rate": 8.2014e-06, + "loss": 0.8913, + "step": 18000 + }, + { + "epoch": 276.9230769230769, + "eval_loss": 1.1252741813659668, + "eval_runtime": 11.2765, + "eval_samples_per_second": 11.617, + "eval_steps_per_second": 1.508, + "step": 18000 + }, + { + "epoch": 278.46153846153845, + "grad_norm": 24.442596435546875, + "learning_rate": 8.191400000000001e-06, + "loss": 0.8993, + "step": 18100 + }, + { + "epoch": 278.46153846153845, + "eval_loss": 1.1197646856307983, + "eval_runtime": 11.0479, + "eval_samples_per_second": 11.857, + "eval_steps_per_second": 1.539, + "step": 18100 + }, + { + "epoch": 280.0, + "grad_norm": 42.99458694458008, + "learning_rate": 8.181400000000001e-06, + "loss": 0.8925, + "step": 18200 + }, + { + "epoch": 280.0, + "eval_loss": 1.129381775856018, + "eval_runtime": 11.1979, + "eval_samples_per_second": 11.699, + "eval_steps_per_second": 1.518, + "step": 18200 + }, + { + "epoch": 281.53846153846155, + "grad_norm": 38.08549118041992, + "learning_rate": 8.171400000000001e-06, + "loss": 0.8699, + "step": 18300 + }, + { + "epoch": 281.53846153846155, + "eval_loss": 1.1298307180404663, + "eval_runtime": 11.1097, + "eval_samples_per_second": 11.792, + "eval_steps_per_second": 1.53, + "step": 18300 + }, + { + "epoch": 283.0769230769231, + "grad_norm": 49.90501022338867, + "learning_rate": 8.161400000000001e-06, + "loss": 0.9207, + "step": 18400 + }, + { + "epoch": 283.0769230769231, + "eval_loss": 1.1229758262634277, + "eval_runtime": 11.284, + "eval_samples_per_second": 11.609, + "eval_steps_per_second": 1.507, + "step": 18400 + }, + { + "epoch": 284.61538461538464, + "grad_norm": 37.63615036010742, + "learning_rate": 8.1514e-06, + "loss": 0.9061, + "step": 18500 + }, + { + "epoch": 284.61538461538464, + "eval_loss": 1.1395354270935059, + "eval_runtime": 11.2087, + "eval_samples_per_second": 11.687, + "eval_steps_per_second": 1.517, + "step": 18500 + }, + { + "epoch": 286.15384615384613, + "grad_norm": 17.88991355895996, + "learning_rate": 8.1414e-06, + "loss": 0.8664, + "step": 18600 + }, + { + "epoch": 286.15384615384613, + "eval_loss": 1.1339645385742188, + "eval_runtime": 11.317, + "eval_samples_per_second": 11.576, + "eval_steps_per_second": 1.502, + "step": 18600 + }, + { + "epoch": 287.6923076923077, + "grad_norm": 33.13370895385742, + "learning_rate": 8.1314e-06, + "loss": 0.8759, + "step": 18700 + }, + { + "epoch": 287.6923076923077, + "eval_loss": 1.1445599794387817, + "eval_runtime": 11.0472, + "eval_samples_per_second": 11.858, + "eval_steps_per_second": 1.539, + "step": 18700 + }, + { + "epoch": 289.2307692307692, + "grad_norm": 22.776575088500977, + "learning_rate": 8.1214e-06, + "loss": 0.8889, + "step": 18800 + }, + { + "epoch": 289.2307692307692, + "eval_loss": 1.1401453018188477, + "eval_runtime": 11.2523, + "eval_samples_per_second": 11.642, + "eval_steps_per_second": 1.511, + "step": 18800 + }, + { + "epoch": 290.7692307692308, + "grad_norm": 19.893653869628906, + "learning_rate": 8.1114e-06, + "loss": 0.8945, + "step": 18900 + }, + { + "epoch": 290.7692307692308, + "eval_loss": 1.1185678243637085, + "eval_runtime": 10.9814, + "eval_samples_per_second": 11.929, + "eval_steps_per_second": 1.548, + "step": 18900 + }, + { + "epoch": 292.3076923076923, + "grad_norm": 35.09921646118164, + "learning_rate": 8.1015e-06, + "loss": 0.8821, + "step": 19000 + }, + { + "epoch": 292.3076923076923, + "eval_loss": 1.1313185691833496, + "eval_runtime": 11.2698, + "eval_samples_per_second": 11.624, + "eval_steps_per_second": 1.508, + "step": 19000 + }, + { + "epoch": 293.84615384615387, + "grad_norm": 36.43528366088867, + "learning_rate": 8.0915e-06, + "loss": 0.8794, + "step": 19100 + }, + { + "epoch": 293.84615384615387, + "eval_loss": 1.1506413221359253, + "eval_runtime": 10.9821, + "eval_samples_per_second": 11.928, + "eval_steps_per_second": 1.548, + "step": 19100 + }, + { + "epoch": 295.38461538461536, + "grad_norm": 24.73018455505371, + "learning_rate": 8.0815e-06, + "loss": 0.8856, + "step": 19200 + }, + { + "epoch": 295.38461538461536, + "eval_loss": 1.1280685663223267, + "eval_runtime": 11.2539, + "eval_samples_per_second": 11.64, + "eval_steps_per_second": 1.511, + "step": 19200 + }, + { + "epoch": 296.9230769230769, + "grad_norm": 50.01460647583008, + "learning_rate": 8.0715e-06, + "loss": 0.8532, + "step": 19300 + }, + { + "epoch": 296.9230769230769, + "eval_loss": 1.124637484550476, + "eval_runtime": 11.1106, + "eval_samples_per_second": 11.791, + "eval_steps_per_second": 1.53, + "step": 19300 + }, + { + "epoch": 298.46153846153845, + "grad_norm": 38.828033447265625, + "learning_rate": 8.0615e-06, + "loss": 0.8425, + "step": 19400 + }, + { + "epoch": 298.46153846153845, + "eval_loss": 1.1076736450195312, + "eval_runtime": 11.3645, + "eval_samples_per_second": 11.527, + "eval_steps_per_second": 1.496, + "step": 19400 + }, + { + "epoch": 300.0, + "grad_norm": 18.97437858581543, + "learning_rate": 8.0515e-06, + "loss": 0.8934, + "step": 19500 + }, + { + "epoch": 300.0, + "eval_loss": 1.110048532485962, + "eval_runtime": 11.1778, + "eval_samples_per_second": 11.72, + "eval_steps_per_second": 1.521, + "step": 19500 + }, + { + "epoch": 301.53846153846155, + "grad_norm": 49.44120407104492, + "learning_rate": 8.041500000000001e-06, + "loss": 0.8694, + "step": 19600 + }, + { + "epoch": 301.53846153846155, + "eval_loss": 1.1411337852478027, + "eval_runtime": 11.4024, + "eval_samples_per_second": 11.489, + "eval_steps_per_second": 1.491, + "step": 19600 + }, + { + "epoch": 303.0769230769231, + "grad_norm": 32.77436828613281, + "learning_rate": 8.031500000000001e-06, + "loss": 0.8876, + "step": 19700 + }, + { + "epoch": 303.0769230769231, + "eval_loss": 1.1274378299713135, + "eval_runtime": 11.3743, + "eval_samples_per_second": 11.517, + "eval_steps_per_second": 1.495, + "step": 19700 + }, + { + "epoch": 304.61538461538464, + "grad_norm": 33.37275695800781, + "learning_rate": 8.021500000000001e-06, + "loss": 0.839, + "step": 19800 + }, + { + "epoch": 304.61538461538464, + "eval_loss": 1.1159642934799194, + "eval_runtime": 11.2947, + "eval_samples_per_second": 11.598, + "eval_steps_per_second": 1.505, + "step": 19800 + }, + { + "epoch": 306.15384615384613, + "grad_norm": 15.981072425842285, + "learning_rate": 8.011500000000001e-06, + "loss": 0.8796, + "step": 19900 + }, + { + "epoch": 306.15384615384613, + "eval_loss": 1.1134443283081055, + "eval_runtime": 11.0923, + "eval_samples_per_second": 11.81, + "eval_steps_per_second": 1.533, + "step": 19900 + }, + { + "epoch": 307.6923076923077, + "grad_norm": 33.69294738769531, + "learning_rate": 8.001500000000001e-06, + "loss": 0.8757, + "step": 20000 + }, + { + "epoch": 307.6923076923077, + "eval_loss": 1.1174206733703613, + "eval_runtime": 11.1824, + "eval_samples_per_second": 11.715, + "eval_steps_per_second": 1.52, + "step": 20000 + }, + { + "epoch": 309.2307692307692, + "grad_norm": 39.100887298583984, + "learning_rate": 7.991500000000001e-06, + "loss": 0.9037, + "step": 20100 + }, + { + "epoch": 309.2307692307692, + "eval_loss": 1.1444469690322876, + "eval_runtime": 11.2517, + "eval_samples_per_second": 11.643, + "eval_steps_per_second": 1.511, + "step": 20100 + }, + { + "epoch": 310.7692307692308, + "grad_norm": 34.204410552978516, + "learning_rate": 7.981500000000001e-06, + "loss": 0.8714, + "step": 20200 + }, + { + "epoch": 310.7692307692308, + "eval_loss": 1.113229751586914, + "eval_runtime": 11.1634, + "eval_samples_per_second": 11.735, + "eval_steps_per_second": 1.523, + "step": 20200 + }, + { + "epoch": 312.3076923076923, + "grad_norm": 34.093692779541016, + "learning_rate": 7.971500000000002e-06, + "loss": 0.8952, + "step": 20300 + }, + { + "epoch": 312.3076923076923, + "eval_loss": 1.132067084312439, + "eval_runtime": 11.0613, + "eval_samples_per_second": 11.843, + "eval_steps_per_second": 1.537, + "step": 20300 + }, + { + "epoch": 313.84615384615387, + "grad_norm": 33.44735336303711, + "learning_rate": 7.961500000000002e-06, + "loss": 0.8545, + "step": 20400 + }, + { + "epoch": 313.84615384615387, + "eval_loss": 1.102783441543579, + "eval_runtime": 10.9986, + "eval_samples_per_second": 11.911, + "eval_steps_per_second": 1.546, + "step": 20400 + }, + { + "epoch": 315.38461538461536, + "grad_norm": 21.25609588623047, + "learning_rate": 7.9516e-06, + "loss": 0.896, + "step": 20500 + }, + { + "epoch": 315.38461538461536, + "eval_loss": 1.1276897192001343, + "eval_runtime": 11.0387, + "eval_samples_per_second": 11.867, + "eval_steps_per_second": 1.54, + "step": 20500 + }, + { + "epoch": 316.9230769230769, + "grad_norm": 42.066017150878906, + "learning_rate": 7.9416e-06, + "loss": 0.8545, + "step": 20600 + }, + { + "epoch": 316.9230769230769, + "eval_loss": 1.1395957469940186, + "eval_runtime": 11.2855, + "eval_samples_per_second": 11.608, + "eval_steps_per_second": 1.506, + "step": 20600 + }, + { + "epoch": 318.46153846153845, + "grad_norm": 22.6724796295166, + "learning_rate": 7.9316e-06, + "loss": 0.8838, + "step": 20700 + }, + { + "epoch": 318.46153846153845, + "eval_loss": 1.1254605054855347, + "eval_runtime": 11.2226, + "eval_samples_per_second": 11.673, + "eval_steps_per_second": 1.515, + "step": 20700 + }, + { + "epoch": 320.0, + "grad_norm": 31.141693115234375, + "learning_rate": 7.9216e-06, + "loss": 0.8704, + "step": 20800 + }, + { + "epoch": 320.0, + "eval_loss": 1.1148459911346436, + "eval_runtime": 11.3548, + "eval_samples_per_second": 11.537, + "eval_steps_per_second": 1.497, + "step": 20800 + }, + { + "epoch": 321.53846153846155, + "grad_norm": 53.709205627441406, + "learning_rate": 7.9116e-06, + "loss": 0.8571, + "step": 20900 + }, + { + "epoch": 321.53846153846155, + "eval_loss": 1.1235041618347168, + "eval_runtime": 11.3363, + "eval_samples_per_second": 11.556, + "eval_steps_per_second": 1.5, + "step": 20900 + }, + { + "epoch": 323.0769230769231, + "grad_norm": 36.34547805786133, + "learning_rate": 7.9016e-06, + "loss": 0.8749, + "step": 21000 + }, + { + "epoch": 323.0769230769231, + "eval_loss": 1.1102306842803955, + "eval_runtime": 11.2154, + "eval_samples_per_second": 11.68, + "eval_steps_per_second": 1.516, + "step": 21000 + }, + { + "epoch": 324.61538461538464, + "grad_norm": 19.974227905273438, + "learning_rate": 7.8916e-06, + "loss": 0.8572, + "step": 21100 + }, + { + "epoch": 324.61538461538464, + "eval_loss": 1.1081122159957886, + "eval_runtime": 11.2322, + "eval_samples_per_second": 11.663, + "eval_steps_per_second": 1.513, + "step": 21100 + }, + { + "epoch": 326.15384615384613, + "grad_norm": 31.686569213867188, + "learning_rate": 7.881600000000001e-06, + "loss": 0.8689, + "step": 21200 + }, + { + "epoch": 326.15384615384613, + "eval_loss": 1.1191128492355347, + "eval_runtime": 11.4469, + "eval_samples_per_second": 11.444, + "eval_steps_per_second": 1.485, + "step": 21200 + }, + { + "epoch": 327.6923076923077, + "grad_norm": 35.97456359863281, + "learning_rate": 7.871600000000001e-06, + "loss": 0.8421, + "step": 21300 + }, + { + "epoch": 327.6923076923077, + "eval_loss": 1.1023869514465332, + "eval_runtime": 11.2157, + "eval_samples_per_second": 11.68, + "eval_steps_per_second": 1.516, + "step": 21300 + }, + { + "epoch": 329.2307692307692, + "grad_norm": 34.61111068725586, + "learning_rate": 7.861600000000001e-06, + "loss": 0.8546, + "step": 21400 + }, + { + "epoch": 329.2307692307692, + "eval_loss": 1.1294784545898438, + "eval_runtime": 11.1536, + "eval_samples_per_second": 11.745, + "eval_steps_per_second": 1.524, + "step": 21400 + }, + { + "epoch": 330.7692307692308, + "grad_norm": 21.682897567749023, + "learning_rate": 7.8516e-06, + "loss": 0.828, + "step": 21500 + }, + { + "epoch": 330.7692307692308, + "eval_loss": 1.101345181465149, + "eval_runtime": 11.0361, + "eval_samples_per_second": 11.87, + "eval_steps_per_second": 1.54, + "step": 21500 + }, + { + "epoch": 332.3076923076923, + "grad_norm": 31.514686584472656, + "learning_rate": 7.8416e-06, + "loss": 0.8762, + "step": 21600 + }, + { + "epoch": 332.3076923076923, + "eval_loss": 1.1125372648239136, + "eval_runtime": 11.2868, + "eval_samples_per_second": 11.607, + "eval_steps_per_second": 1.506, + "step": 21600 + }, + { + "epoch": 333.84615384615387, + "grad_norm": 34.57918167114258, + "learning_rate": 7.8316e-06, + "loss": 0.8368, + "step": 21700 + }, + { + "epoch": 333.84615384615387, + "eval_loss": 1.1226669549942017, + "eval_runtime": 11.4486, + "eval_samples_per_second": 11.442, + "eval_steps_per_second": 1.485, + "step": 21700 + }, + { + "epoch": 335.38461538461536, + "grad_norm": 57.26470947265625, + "learning_rate": 7.8216e-06, + "loss": 0.8675, + "step": 21800 + }, + { + "epoch": 335.38461538461536, + "eval_loss": 1.1287323236465454, + "eval_runtime": 11.197, + "eval_samples_per_second": 11.7, + "eval_steps_per_second": 1.518, + "step": 21800 + }, + { + "epoch": 336.9230769230769, + "grad_norm": 26.583576202392578, + "learning_rate": 7.8116e-06, + "loss": 0.8555, + "step": 21900 + }, + { + "epoch": 336.9230769230769, + "eval_loss": 1.1035945415496826, + "eval_runtime": 11.2619, + "eval_samples_per_second": 11.632, + "eval_steps_per_second": 1.51, + "step": 21900 + }, + { + "epoch": 338.46153846153845, + "grad_norm": 44.196590423583984, + "learning_rate": 7.8016e-06, + "loss": 0.8601, + "step": 22000 + }, + { + "epoch": 338.46153846153845, + "eval_loss": 1.09674870967865, + "eval_runtime": 11.2173, + "eval_samples_per_second": 11.678, + "eval_steps_per_second": 1.516, + "step": 22000 + }, + { + "epoch": 340.0, + "grad_norm": 34.388145446777344, + "learning_rate": 7.7916e-06, + "loss": 0.8347, + "step": 22100 + }, + { + "epoch": 340.0, + "eval_loss": 1.100284218788147, + "eval_runtime": 11.2123, + "eval_samples_per_second": 11.684, + "eval_steps_per_second": 1.516, + "step": 22100 + }, + { + "epoch": 341.53846153846155, + "grad_norm": 33.55098342895508, + "learning_rate": 7.7816e-06, + "loss": 0.8429, + "step": 22200 + }, + { + "epoch": 341.53846153846155, + "eval_loss": 1.1070573329925537, + "eval_runtime": 11.1389, + "eval_samples_per_second": 11.761, + "eval_steps_per_second": 1.526, + "step": 22200 + }, + { + "epoch": 343.0769230769231, + "grad_norm": 21.367860794067383, + "learning_rate": 7.7716e-06, + "loss": 0.827, + "step": 22300 + }, + { + "epoch": 343.0769230769231, + "eval_loss": 1.1184412240982056, + "eval_runtime": 11.2313, + "eval_samples_per_second": 11.664, + "eval_steps_per_second": 1.514, + "step": 22300 + }, + { + "epoch": 344.61538461538464, + "grad_norm": 32.62434387207031, + "learning_rate": 7.7616e-06, + "loss": 0.8566, + "step": 22400 + }, + { + "epoch": 344.61538461538464, + "eval_loss": 1.1258198022842407, + "eval_runtime": 11.1788, + "eval_samples_per_second": 11.719, + "eval_steps_per_second": 1.521, + "step": 22400 + }, + { + "epoch": 346.15384615384613, + "grad_norm": 44.411659240722656, + "learning_rate": 7.7516e-06, + "loss": 0.8666, + "step": 22500 + }, + { + "epoch": 346.15384615384613, + "eval_loss": 1.121845006942749, + "eval_runtime": 11.1251, + "eval_samples_per_second": 11.775, + "eval_steps_per_second": 1.528, + "step": 22500 + }, + { + "epoch": 347.6923076923077, + "grad_norm": 41.10401153564453, + "learning_rate": 7.7416e-06, + "loss": 0.8354, + "step": 22600 + }, + { + "epoch": 347.6923076923077, + "eval_loss": 1.1154077053070068, + "eval_runtime": 11.161, + "eval_samples_per_second": 11.737, + "eval_steps_per_second": 1.523, + "step": 22600 + }, + { + "epoch": 349.2307692307692, + "grad_norm": 31.414745330810547, + "learning_rate": 7.7316e-06, + "loss": 0.8321, + "step": 22700 + }, + { + "epoch": 349.2307692307692, + "eval_loss": 1.1233044862747192, + "eval_runtime": 11.0203, + "eval_samples_per_second": 11.887, + "eval_steps_per_second": 1.543, + "step": 22700 + }, + { + "epoch": 350.7692307692308, + "grad_norm": 37.39424133300781, + "learning_rate": 7.7216e-06, + "loss": 0.8384, + "step": 22800 + }, + { + "epoch": 350.7692307692308, + "eval_loss": 1.1059685945510864, + "eval_runtime": 10.9825, + "eval_samples_per_second": 11.928, + "eval_steps_per_second": 1.548, + "step": 22800 + }, + { + "epoch": 352.3076923076923, + "grad_norm": 24.75067901611328, + "learning_rate": 7.7116e-06, + "loss": 0.8216, + "step": 22900 + }, + { + "epoch": 352.3076923076923, + "eval_loss": 1.1147040128707886, + "eval_runtime": 11.1066, + "eval_samples_per_second": 11.795, + "eval_steps_per_second": 1.531, + "step": 22900 + }, + { + "epoch": 353.84615384615387, + "grad_norm": 21.775754928588867, + "learning_rate": 7.701600000000001e-06, + "loss": 0.8562, + "step": 23000 + }, + { + "epoch": 353.84615384615387, + "eval_loss": 1.1105334758758545, + "eval_runtime": 11.0277, + "eval_samples_per_second": 11.879, + "eval_steps_per_second": 1.542, + "step": 23000 + }, + { + "epoch": 355.38461538461536, + "grad_norm": 33.22850799560547, + "learning_rate": 7.6916e-06, + "loss": 0.813, + "step": 23100 + }, + { + "epoch": 355.38461538461536, + "eval_loss": 1.113241195678711, + "eval_runtime": 10.9954, + "eval_samples_per_second": 11.914, + "eval_steps_per_second": 1.546, + "step": 23100 + }, + { + "epoch": 356.9230769230769, + "grad_norm": 31.402652740478516, + "learning_rate": 7.6816e-06, + "loss": 0.8527, + "step": 23200 + }, + { + "epoch": 356.9230769230769, + "eval_loss": 1.1040674448013306, + "eval_runtime": 11.0415, + "eval_samples_per_second": 11.864, + "eval_steps_per_second": 1.54, + "step": 23200 + }, + { + "epoch": 358.46153846153845, + "grad_norm": 106.14508819580078, + "learning_rate": 7.6716e-06, + "loss": 0.8223, + "step": 23300 + }, + { + "epoch": 358.46153846153845, + "eval_loss": 1.1121721267700195, + "eval_runtime": 11.1579, + "eval_samples_per_second": 11.741, + "eval_steps_per_second": 1.524, + "step": 23300 + }, + { + "epoch": 360.0, + "grad_norm": 32.76395797729492, + "learning_rate": 7.6616e-06, + "loss": 0.8199, + "step": 23400 + }, + { + "epoch": 360.0, + "eval_loss": 1.109994649887085, + "eval_runtime": 11.0068, + "eval_samples_per_second": 11.902, + "eval_steps_per_second": 1.545, + "step": 23400 + }, + { + "epoch": 361.53846153846155, + "grad_norm": 44.64813232421875, + "learning_rate": 7.6516e-06, + "loss": 0.8566, + "step": 23500 + }, + { + "epoch": 361.53846153846155, + "eval_loss": 1.1383651494979858, + "eval_runtime": 11.1866, + "eval_samples_per_second": 11.71, + "eval_steps_per_second": 1.52, + "step": 23500 + }, + { + "epoch": 363.0769230769231, + "grad_norm": 20.096393585205078, + "learning_rate": 7.6416e-06, + "loss": 0.795, + "step": 23600 + }, + { + "epoch": 363.0769230769231, + "eval_loss": 1.1200674772262573, + "eval_runtime": 11.289, + "eval_samples_per_second": 11.604, + "eval_steps_per_second": 1.506, + "step": 23600 + }, + { + "epoch": 364.61538461538464, + "grad_norm": 30.219837188720703, + "learning_rate": 7.6316e-06, + "loss": 0.8413, + "step": 23700 + }, + { + "epoch": 364.61538461538464, + "eval_loss": 1.111639380455017, + "eval_runtime": 11.2547, + "eval_samples_per_second": 11.64, + "eval_steps_per_second": 1.51, + "step": 23700 + }, + { + "epoch": 366.15384615384613, + "grad_norm": 54.91815185546875, + "learning_rate": 7.6216e-06, + "loss": 0.8284, + "step": 23800 + }, + { + "epoch": 366.15384615384613, + "eval_loss": 1.1330946683883667, + "eval_runtime": 11.3054, + "eval_samples_per_second": 11.587, + "eval_steps_per_second": 1.504, + "step": 23800 + }, + { + "epoch": 367.6923076923077, + "grad_norm": 40.97207260131836, + "learning_rate": 7.6116e-06, + "loss": 0.8387, + "step": 23900 + }, + { + "epoch": 367.6923076923077, + "eval_loss": 1.1550222635269165, + "eval_runtime": 11.2372, + "eval_samples_per_second": 11.658, + "eval_steps_per_second": 1.513, + "step": 23900 + }, + { + "epoch": 369.2307692307692, + "grad_norm": 36.580745697021484, + "learning_rate": 7.6017e-06, + "loss": 0.829, + "step": 24000 + }, + { + "epoch": 369.2307692307692, + "eval_loss": 1.1261051893234253, + "eval_runtime": 11.1834, + "eval_samples_per_second": 11.714, + "eval_steps_per_second": 1.52, + "step": 24000 + }, + { + "epoch": 370.7692307692308, + "grad_norm": 46.304649353027344, + "learning_rate": 7.5917000000000005e-06, + "loss": 0.8316, + "step": 24100 + }, + { + "epoch": 370.7692307692308, + "eval_loss": 1.1410084962844849, + "eval_runtime": 11.3511, + "eval_samples_per_second": 11.541, + "eval_steps_per_second": 1.498, + "step": 24100 + }, + { + "epoch": 372.3076923076923, + "grad_norm": 32.472293853759766, + "learning_rate": 7.581700000000001e-06, + "loss": 0.8309, + "step": 24200 + }, + { + "epoch": 372.3076923076923, + "eval_loss": 1.1283127069473267, + "eval_runtime": 11.1949, + "eval_samples_per_second": 11.702, + "eval_steps_per_second": 1.519, + "step": 24200 + }, + { + "epoch": 373.84615384615387, + "grad_norm": 24.16025161743164, + "learning_rate": 7.571700000000001e-06, + "loss": 0.8154, + "step": 24300 + }, + { + "epoch": 373.84615384615387, + "eval_loss": 1.1386806964874268, + "eval_runtime": 11.1089, + "eval_samples_per_second": 11.792, + "eval_steps_per_second": 1.53, + "step": 24300 + }, + { + "epoch": 375.38461538461536, + "grad_norm": 16.86387825012207, + "learning_rate": 7.561700000000001e-06, + "loss": 0.8221, + "step": 24400 + }, + { + "epoch": 375.38461538461536, + "eval_loss": 1.09730863571167, + "eval_runtime": 11.1299, + "eval_samples_per_second": 11.77, + "eval_steps_per_second": 1.527, + "step": 24400 + }, + { + "epoch": 376.9230769230769, + "grad_norm": 34.62860107421875, + "learning_rate": 7.551700000000001e-06, + "loss": 0.8106, + "step": 24500 + }, + { + "epoch": 376.9230769230769, + "eval_loss": 1.1233881711959839, + "eval_runtime": 11.0352, + "eval_samples_per_second": 11.871, + "eval_steps_per_second": 1.541, + "step": 24500 + }, + { + "epoch": 378.46153846153845, + "grad_norm": 16.957393646240234, + "learning_rate": 7.541700000000001e-06, + "loss": 0.8362, + "step": 24600 + }, + { + "epoch": 378.46153846153845, + "eval_loss": 1.1209690570831299, + "eval_runtime": 11.0555, + "eval_samples_per_second": 11.849, + "eval_steps_per_second": 1.538, + "step": 24600 + }, + { + "epoch": 380.0, + "grad_norm": 21.519577026367188, + "learning_rate": 7.531700000000001e-06, + "loss": 0.8139, + "step": 24700 + }, + { + "epoch": 380.0, + "eval_loss": 1.1106966733932495, + "eval_runtime": 11.0492, + "eval_samples_per_second": 11.856, + "eval_steps_per_second": 1.539, + "step": 24700 + }, + { + "epoch": 381.53846153846155, + "grad_norm": 26.86697769165039, + "learning_rate": 7.5217e-06, + "loss": 0.8036, + "step": 24800 + }, + { + "epoch": 381.53846153846155, + "eval_loss": 1.1153205633163452, + "eval_runtime": 11.1284, + "eval_samples_per_second": 11.772, + "eval_steps_per_second": 1.528, + "step": 24800 + }, + { + "epoch": 383.0769230769231, + "grad_norm": 21.1047420501709, + "learning_rate": 7.5117000000000004e-06, + "loss": 0.8115, + "step": 24900 + }, + { + "epoch": 383.0769230769231, + "eval_loss": 1.116546630859375, + "eval_runtime": 11.4261, + "eval_samples_per_second": 11.465, + "eval_steps_per_second": 1.488, + "step": 24900 + }, + { + "epoch": 384.61538461538464, + "grad_norm": 40.87736892700195, + "learning_rate": 7.5017000000000005e-06, + "loss": 0.8146, + "step": 25000 + }, + { + "epoch": 384.61538461538464, + "eval_loss": 1.1162028312683105, + "eval_runtime": 11.3423, + "eval_samples_per_second": 11.55, + "eval_steps_per_second": 1.499, + "step": 25000 + }, + { + "epoch": 386.15384615384613, + "grad_norm": 19.28094482421875, + "learning_rate": 7.491700000000001e-06, + "loss": 0.8114, + "step": 25100 + }, + { + "epoch": 386.15384615384613, + "eval_loss": 1.106558918952942, + "eval_runtime": 11.0618, + "eval_samples_per_second": 11.843, + "eval_steps_per_second": 1.537, + "step": 25100 + }, + { + "epoch": 387.6923076923077, + "grad_norm": 19.689420700073242, + "learning_rate": 7.481700000000001e-06, + "loss": 0.8076, + "step": 25200 + }, + { + "epoch": 387.6923076923077, + "eval_loss": 1.1141376495361328, + "eval_runtime": 11.0491, + "eval_samples_per_second": 11.856, + "eval_steps_per_second": 1.539, + "step": 25200 + }, + { + "epoch": 389.2307692307692, + "grad_norm": 22.76107406616211, + "learning_rate": 7.471700000000001e-06, + "loss": 0.8077, + "step": 25300 + }, + { + "epoch": 389.2307692307692, + "eval_loss": 1.1186425685882568, + "eval_runtime": 11.1798, + "eval_samples_per_second": 11.718, + "eval_steps_per_second": 1.521, + "step": 25300 + }, + { + "epoch": 390.7692307692308, + "grad_norm": 40.23360061645508, + "learning_rate": 7.461700000000001e-06, + "loss": 0.7924, + "step": 25400 + }, + { + "epoch": 390.7692307692308, + "eval_loss": 1.1189498901367188, + "eval_runtime": 11.1824, + "eval_samples_per_second": 11.715, + "eval_steps_per_second": 1.52, + "step": 25400 + }, + { + "epoch": 392.3076923076923, + "grad_norm": 32.873207092285156, + "learning_rate": 7.451700000000001e-06, + "loss": 0.8335, + "step": 25500 + }, + { + "epoch": 392.3076923076923, + "eval_loss": 1.1451479196548462, + "eval_runtime": 11.1543, + "eval_samples_per_second": 11.744, + "eval_steps_per_second": 1.524, + "step": 25500 + }, + { + "epoch": 393.84615384615387, + "grad_norm": 27.307552337646484, + "learning_rate": 7.4417e-06, + "loss": 0.7926, + "step": 25600 + }, + { + "epoch": 393.84615384615387, + "eval_loss": 1.1189228296279907, + "eval_runtime": 11.0875, + "eval_samples_per_second": 11.815, + "eval_steps_per_second": 1.533, + "step": 25600 + }, + { + "epoch": 395.38461538461536, + "grad_norm": 22.984905242919922, + "learning_rate": 7.4317e-06, + "loss": 0.8039, + "step": 25700 + }, + { + "epoch": 395.38461538461536, + "eval_loss": 1.1381311416625977, + "eval_runtime": 11.1226, + "eval_samples_per_second": 11.778, + "eval_steps_per_second": 1.528, + "step": 25700 + }, + { + "epoch": 396.9230769230769, + "grad_norm": 43.75572967529297, + "learning_rate": 7.4217000000000004e-06, + "loss": 0.8426, + "step": 25800 + }, + { + "epoch": 396.9230769230769, + "eval_loss": 1.1005278825759888, + "eval_runtime": 10.9018, + "eval_samples_per_second": 12.016, + "eval_steps_per_second": 1.559, + "step": 25800 + }, + { + "epoch": 398.46153846153845, + "grad_norm": 45.551212310791016, + "learning_rate": 7.4117000000000005e-06, + "loss": 0.7918, + "step": 25900 + }, + { + "epoch": 398.46153846153845, + "eval_loss": 1.1017777919769287, + "eval_runtime": 10.8997, + "eval_samples_per_second": 12.019, + "eval_steps_per_second": 1.56, + "step": 25900 + }, + { + "epoch": 400.0, + "grad_norm": 27.237998962402344, + "learning_rate": 7.401700000000001e-06, + "loss": 0.825, + "step": 26000 + }, + { + "epoch": 400.0, + "eval_loss": 1.1006879806518555, + "eval_runtime": 10.8272, + "eval_samples_per_second": 12.099, + "eval_steps_per_second": 1.57, + "step": 26000 + }, + { + "epoch": 401.53846153846155, + "grad_norm": 20.51366424560547, + "learning_rate": 7.391700000000001e-06, + "loss": 0.8069, + "step": 26100 + }, + { + "epoch": 401.53846153846155, + "eval_loss": 1.121657371520996, + "eval_runtime": 10.956, + "eval_samples_per_second": 11.957, + "eval_steps_per_second": 1.552, + "step": 26100 + }, + { + "epoch": 403.0769230769231, + "grad_norm": 38.978363037109375, + "learning_rate": 7.381700000000001e-06, + "loss": 0.8105, + "step": 26200 + }, + { + "epoch": 403.0769230769231, + "eval_loss": 1.1141672134399414, + "eval_runtime": 10.8817, + "eval_samples_per_second": 12.039, + "eval_steps_per_second": 1.562, + "step": 26200 + }, + { + "epoch": 404.61538461538464, + "grad_norm": 40.20176315307617, + "learning_rate": 7.371700000000001e-06, + "loss": 0.7953, + "step": 26300 + }, + { + "epoch": 404.61538461538464, + "eval_loss": 1.1042087078094482, + "eval_runtime": 10.8826, + "eval_samples_per_second": 12.038, + "eval_steps_per_second": 1.562, + "step": 26300 + }, + { + "epoch": 406.15384615384613, + "grad_norm": 27.35833168029785, + "learning_rate": 7.3617e-06, + "loss": 0.8075, + "step": 26400 + }, + { + "epoch": 406.15384615384613, + "eval_loss": 1.1115680932998657, + "eval_runtime": 10.9465, + "eval_samples_per_second": 11.967, + "eval_steps_per_second": 1.553, + "step": 26400 + }, + { + "epoch": 407.6923076923077, + "grad_norm": 32.054351806640625, + "learning_rate": 7.3517e-06, + "loss": 0.7958, + "step": 26500 + }, + { + "epoch": 407.6923076923077, + "eval_loss": 1.1209728717803955, + "eval_runtime": 10.8226, + "eval_samples_per_second": 12.104, + "eval_steps_per_second": 1.571, + "step": 26500 + }, + { + "epoch": 409.2307692307692, + "grad_norm": 37.53184509277344, + "learning_rate": 7.3417e-06, + "loss": 0.7859, + "step": 26600 + }, + { + "epoch": 409.2307692307692, + "eval_loss": 1.0920478105545044, + "eval_runtime": 10.8488, + "eval_samples_per_second": 12.075, + "eval_steps_per_second": 1.567, + "step": 26600 + }, + { + "epoch": 410.7692307692308, + "grad_norm": 39.949039459228516, + "learning_rate": 7.3317000000000005e-06, + "loss": 0.8, + "step": 26700 + }, + { + "epoch": 410.7692307692308, + "eval_loss": 1.0909216403961182, + "eval_runtime": 10.9975, + "eval_samples_per_second": 11.912, + "eval_steps_per_second": 1.546, + "step": 26700 + }, + { + "epoch": 412.3076923076923, + "grad_norm": 28.89598846435547, + "learning_rate": 7.3217000000000006e-06, + "loss": 0.8168, + "step": 26800 + }, + { + "epoch": 412.3076923076923, + "eval_loss": 1.108217477798462, + "eval_runtime": 10.9623, + "eval_samples_per_second": 11.95, + "eval_steps_per_second": 1.551, + "step": 26800 + }, + { + "epoch": 413.84615384615387, + "grad_norm": 33.56279754638672, + "learning_rate": 7.311700000000001e-06, + "loss": 0.7854, + "step": 26900 + }, + { + "epoch": 413.84615384615387, + "eval_loss": 1.1230065822601318, + "eval_runtime": 10.8811, + "eval_samples_per_second": 12.039, + "eval_steps_per_second": 1.562, + "step": 26900 + }, + { + "epoch": 415.38461538461536, + "grad_norm": 26.578338623046875, + "learning_rate": 7.301700000000001e-06, + "loss": 0.7954, + "step": 27000 + }, + { + "epoch": 415.38461538461536, + "eval_loss": 1.1114639043807983, + "eval_runtime": 10.9172, + "eval_samples_per_second": 11.999, + "eval_steps_per_second": 1.557, + "step": 27000 + }, + { + "epoch": 416.9230769230769, + "grad_norm": 31.6333065032959, + "learning_rate": 7.291700000000001e-06, + "loss": 0.8057, + "step": 27100 + }, + { + "epoch": 416.9230769230769, + "eval_loss": 1.1136469841003418, + "eval_runtime": 10.835, + "eval_samples_per_second": 12.09, + "eval_steps_per_second": 1.569, + "step": 27100 + }, + { + "epoch": 418.46153846153845, + "grad_norm": 17.80211067199707, + "learning_rate": 7.2817e-06, + "loss": 0.8074, + "step": 27200 + }, + { + "epoch": 418.46153846153845, + "eval_loss": 1.0823259353637695, + "eval_runtime": 11.1165, + "eval_samples_per_second": 11.784, + "eval_steps_per_second": 1.529, + "step": 27200 + }, + { + "epoch": 420.0, + "grad_norm": 50.79414367675781, + "learning_rate": 7.2717e-06, + "loss": 0.778, + "step": 27300 + }, + { + "epoch": 420.0, + "eval_loss": 1.0840650796890259, + "eval_runtime": 10.9207, + "eval_samples_per_second": 11.996, + "eval_steps_per_second": 1.557, + "step": 27300 + }, + { + "epoch": 421.53846153846155, + "grad_norm": 27.64950942993164, + "learning_rate": 7.2617e-06, + "loss": 0.79, + "step": 27400 + }, + { + "epoch": 421.53846153846155, + "eval_loss": 1.1120820045471191, + "eval_runtime": 10.8896, + "eval_samples_per_second": 12.03, + "eval_steps_per_second": 1.561, + "step": 27400 + }, + { + "epoch": 423.0769230769231, + "grad_norm": 28.115442276000977, + "learning_rate": 7.2517e-06, + "loss": 0.7927, + "step": 27500 + }, + { + "epoch": 423.0769230769231, + "eval_loss": 1.0890986919403076, + "eval_runtime": 10.8766, + "eval_samples_per_second": 12.044, + "eval_steps_per_second": 1.563, + "step": 27500 + }, + { + "epoch": 424.61538461538464, + "grad_norm": 56.913578033447266, + "learning_rate": 7.2417000000000005e-06, + "loss": 0.7843, + "step": 27600 + }, + { + "epoch": 424.61538461538464, + "eval_loss": 1.0650087594985962, + "eval_runtime": 10.8169, + "eval_samples_per_second": 12.111, + "eval_steps_per_second": 1.572, + "step": 27600 + }, + { + "epoch": 426.15384615384613, + "grad_norm": 28.091888427734375, + "learning_rate": 7.231800000000001e-06, + "loss": 0.8113, + "step": 27700 + }, + { + "epoch": 426.15384615384613, + "eval_loss": 1.103638768196106, + "eval_runtime": 10.8897, + "eval_samples_per_second": 12.03, + "eval_steps_per_second": 1.561, + "step": 27700 + }, + { + "epoch": 427.6923076923077, + "grad_norm": 21.665067672729492, + "learning_rate": 7.221800000000001e-06, + "loss": 0.7803, + "step": 27800 + }, + { + "epoch": 427.6923076923077, + "eval_loss": 1.1119697093963623, + "eval_runtime": 10.8141, + "eval_samples_per_second": 12.114, + "eval_steps_per_second": 1.572, + "step": 27800 + }, + { + "epoch": 429.2307692307692, + "grad_norm": 65.23661804199219, + "learning_rate": 7.211800000000001e-06, + "loss": 0.7963, + "step": 27900 + }, + { + "epoch": 429.2307692307692, + "eval_loss": 1.1217514276504517, + "eval_runtime": 10.8793, + "eval_samples_per_second": 12.041, + "eval_steps_per_second": 1.563, + "step": 27900 + }, + { + "epoch": 430.7692307692308, + "grad_norm": 43.18936538696289, + "learning_rate": 7.201800000000001e-06, + "loss": 0.7976, + "step": 28000 + }, + { + "epoch": 430.7692307692308, + "eval_loss": 1.1384552717208862, + "eval_runtime": 10.8325, + "eval_samples_per_second": 12.093, + "eval_steps_per_second": 1.569, + "step": 28000 + }, + { + "epoch": 432.3076923076923, + "grad_norm": 33.29544448852539, + "learning_rate": 7.191800000000001e-06, + "loss": 0.803, + "step": 28100 + }, + { + "epoch": 432.3076923076923, + "eval_loss": 1.1162676811218262, + "eval_runtime": 10.8215, + "eval_samples_per_second": 12.106, + "eval_steps_per_second": 1.571, + "step": 28100 + }, + { + "epoch": 433.84615384615387, + "grad_norm": 22.86684799194336, + "learning_rate": 7.181800000000001e-06, + "loss": 0.8037, + "step": 28200 + }, + { + "epoch": 433.84615384615387, + "eval_loss": 1.1036708354949951, + "eval_runtime": 10.8309, + "eval_samples_per_second": 12.095, + "eval_steps_per_second": 1.57, + "step": 28200 + }, + { + "epoch": 435.38461538461536, + "grad_norm": 21.229990005493164, + "learning_rate": 7.171800000000001e-06, + "loss": 0.7713, + "step": 28300 + }, + { + "epoch": 435.38461538461536, + "eval_loss": 1.0910826921463013, + "eval_runtime": 10.8187, + "eval_samples_per_second": 12.109, + "eval_steps_per_second": 1.571, + "step": 28300 + }, + { + "epoch": 436.9230769230769, + "grad_norm": 33.30841064453125, + "learning_rate": 7.161800000000001e-06, + "loss": 0.7956, + "step": 28400 + }, + { + "epoch": 436.9230769230769, + "eval_loss": 1.0862958431243896, + "eval_runtime": 10.8992, + "eval_samples_per_second": 12.019, + "eval_steps_per_second": 1.56, + "step": 28400 + }, + { + "epoch": 438.46153846153845, + "grad_norm": 24.01555633544922, + "learning_rate": 7.151800000000001e-06, + "loss": 0.7883, + "step": 28500 + }, + { + "epoch": 438.46153846153845, + "eval_loss": 1.1144325733184814, + "eval_runtime": 11.0935, + "eval_samples_per_second": 11.809, + "eval_steps_per_second": 1.532, + "step": 28500 + }, + { + "epoch": 440.0, + "grad_norm": 43.260921478271484, + "learning_rate": 7.141800000000001e-06, + "loss": 0.7885, + "step": 28600 + }, + { + "epoch": 440.0, + "eval_loss": 1.0987013578414917, + "eval_runtime": 10.803, + "eval_samples_per_second": 12.126, + "eval_steps_per_second": 1.574, + "step": 28600 + }, + { + "epoch": 441.53846153846155, + "grad_norm": 24.575450897216797, + "learning_rate": 7.131800000000001e-06, + "loss": 0.8052, + "step": 28700 + }, + { + "epoch": 441.53846153846155, + "eval_loss": 1.1236560344696045, + "eval_runtime": 10.7671, + "eval_samples_per_second": 12.167, + "eval_steps_per_second": 1.579, + "step": 28700 + }, + { + "epoch": 443.0769230769231, + "grad_norm": 58.26254653930664, + "learning_rate": 7.121800000000001e-06, + "loss": 0.7856, + "step": 28800 + }, + { + "epoch": 443.0769230769231, + "eval_loss": 1.1264104843139648, + "eval_runtime": 11.0135, + "eval_samples_per_second": 11.894, + "eval_steps_per_second": 1.544, + "step": 28800 + }, + { + "epoch": 444.61538461538464, + "grad_norm": 41.40846633911133, + "learning_rate": 7.111800000000001e-06, + "loss": 0.7923, + "step": 28900 + }, + { + "epoch": 444.61538461538464, + "eval_loss": 1.1042475700378418, + "eval_runtime": 10.8929, + "eval_samples_per_second": 12.026, + "eval_steps_per_second": 1.561, + "step": 28900 + }, + { + "epoch": 446.15384615384613, + "grad_norm": 42.24570846557617, + "learning_rate": 7.101800000000001e-06, + "loss": 0.8239, + "step": 29000 + }, + { + "epoch": 446.15384615384613, + "eval_loss": 1.0964312553405762, + "eval_runtime": 11.2262, + "eval_samples_per_second": 11.669, + "eval_steps_per_second": 1.514, + "step": 29000 + } + ], + "logging_steps": 100, + "max_steps": 100000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1539, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.1102635005677568e+20, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}