diff --git "a/checkpoint-53000/trainer_state.json" "b/checkpoint-53000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-53000/trainer_state.json" @@ -0,0 +1,7983 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 815.3846153846154, + "eval_steps": 100, + "global_step": 53000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 1.5384615384615383, + "grad_norm": 47.11796188354492, + "learning_rate": 9.990900000000001e-06, + "loss": 3.6644, + "step": 100 + }, + { + "epoch": 1.5384615384615383, + "eval_loss": 2.4919605255126953, + "eval_runtime": 12.5517, + "eval_samples_per_second": 10.437, + "eval_steps_per_second": 1.354, + "step": 100 + }, + { + "epoch": 3.076923076923077, + "grad_norm": 31.28130531311035, + "learning_rate": 9.980900000000001e-06, + "loss": 2.2347, + "step": 200 + }, + { + "epoch": 3.076923076923077, + "eval_loss": 2.156316041946411, + "eval_runtime": 11.1792, + "eval_samples_per_second": 11.718, + "eval_steps_per_second": 1.521, + "step": 200 + }, + { + "epoch": 4.615384615384615, + "grad_norm": 39.616390228271484, + "learning_rate": 9.970900000000001e-06, + "loss": 2.0254, + "step": 300 + }, + { + "epoch": 4.615384615384615, + "eval_loss": 2.024153709411621, + "eval_runtime": 11.1934, + "eval_samples_per_second": 11.703, + "eval_steps_per_second": 1.519, + "step": 300 + }, + { + "epoch": 6.153846153846154, + "grad_norm": 28.285825729370117, + "learning_rate": 9.960900000000001e-06, + "loss": 1.9361, + "step": 400 + }, + { + "epoch": 6.153846153846154, + "eval_loss": 1.9094743728637695, + "eval_runtime": 11.3855, + "eval_samples_per_second": 11.506, + "eval_steps_per_second": 1.493, + "step": 400 + }, + { + "epoch": 7.6923076923076925, + "grad_norm": 34.14302062988281, + "learning_rate": 9.950900000000002e-06, + "loss": 1.8531, + "step": 500 + }, + { + "epoch": 7.6923076923076925, + "eval_loss": 1.8729331493377686, + "eval_runtime": 11.2935, + "eval_samples_per_second": 11.6, + "eval_steps_per_second": 1.505, + "step": 500 + }, + { + "epoch": 9.23076923076923, + "grad_norm": 39.09531784057617, + "learning_rate": 9.940900000000002e-06, + "loss": 1.7669, + "step": 600 + }, + { + "epoch": 9.23076923076923, + "eval_loss": 1.831756830215454, + "eval_runtime": 11.0535, + "eval_samples_per_second": 11.851, + "eval_steps_per_second": 1.538, + "step": 600 + }, + { + "epoch": 10.76923076923077, + "grad_norm": 93.24444580078125, + "learning_rate": 9.930900000000002e-06, + "loss": 1.7518, + "step": 700 + }, + { + "epoch": 10.76923076923077, + "eval_loss": 1.7832175493240356, + "eval_runtime": 11.1684, + "eval_samples_per_second": 11.729, + "eval_steps_per_second": 1.522, + "step": 700 + }, + { + "epoch": 12.307692307692308, + "grad_norm": 32.21013641357422, + "learning_rate": 9.920900000000002e-06, + "loss": 1.7149, + "step": 800 + }, + { + "epoch": 12.307692307692308, + "eval_loss": 1.7581098079681396, + "eval_runtime": 11.101, + "eval_samples_per_second": 11.801, + "eval_steps_per_second": 1.531, + "step": 800 + }, + { + "epoch": 13.846153846153847, + "grad_norm": 59.90657043457031, + "learning_rate": 9.9109e-06, + "loss": 1.6734, + "step": 900 + }, + { + "epoch": 13.846153846153847, + "eval_loss": 1.7163844108581543, + "eval_runtime": 11.1167, + "eval_samples_per_second": 11.784, + "eval_steps_per_second": 1.529, + "step": 900 + }, + { + "epoch": 15.384615384615385, + "grad_norm": 20.61592674255371, + "learning_rate": 9.9009e-06, + "loss": 1.6612, + "step": 1000 + }, + { + "epoch": 15.384615384615385, + "eval_loss": 1.6949567794799805, + "eval_runtime": 11.0663, + "eval_samples_per_second": 11.838, + "eval_steps_per_second": 1.536, + "step": 1000 + }, + { + "epoch": 16.923076923076923, + "grad_norm": 17.60099220275879, + "learning_rate": 9.8909e-06, + "loss": 1.6199, + "step": 1100 + }, + { + "epoch": 16.923076923076923, + "eval_loss": 1.6769332885742188, + "eval_runtime": 11.0531, + "eval_samples_per_second": 11.852, + "eval_steps_per_second": 1.538, + "step": 1100 + }, + { + "epoch": 18.46153846153846, + "grad_norm": 20.802692413330078, + "learning_rate": 9.8809e-06, + "loss": 1.6008, + "step": 1200 + }, + { + "epoch": 18.46153846153846, + "eval_loss": 1.6524990797042847, + "eval_runtime": 11.0831, + "eval_samples_per_second": 11.82, + "eval_steps_per_second": 1.534, + "step": 1200 + }, + { + "epoch": 20.0, + "grad_norm": 21.809823989868164, + "learning_rate": 9.8709e-06, + "loss": 1.5812, + "step": 1300 + }, + { + "epoch": 20.0, + "eval_loss": 1.6428295373916626, + "eval_runtime": 11.1093, + "eval_samples_per_second": 11.792, + "eval_steps_per_second": 1.53, + "step": 1300 + }, + { + "epoch": 21.53846153846154, + "grad_norm": 46.8908576965332, + "learning_rate": 9.8609e-06, + "loss": 1.5419, + "step": 1400 + }, + { + "epoch": 21.53846153846154, + "eval_loss": 1.6006404161453247, + "eval_runtime": 11.2393, + "eval_samples_per_second": 11.655, + "eval_steps_per_second": 1.513, + "step": 1400 + }, + { + "epoch": 23.076923076923077, + "grad_norm": 27.15238380432129, + "learning_rate": 9.8509e-06, + "loss": 1.5374, + "step": 1500 + }, + { + "epoch": 23.076923076923077, + "eval_loss": 1.5862094163894653, + "eval_runtime": 11.1815, + "eval_samples_per_second": 11.716, + "eval_steps_per_second": 1.52, + "step": 1500 + }, + { + "epoch": 24.615384615384617, + "grad_norm": 40.26778030395508, + "learning_rate": 9.840900000000001e-06, + "loss": 1.4923, + "step": 1600 + }, + { + "epoch": 24.615384615384617, + "eval_loss": 1.576373815536499, + "eval_runtime": 11.1215, + "eval_samples_per_second": 11.779, + "eval_steps_per_second": 1.529, + "step": 1600 + }, + { + "epoch": 26.153846153846153, + "grad_norm": 35.266971588134766, + "learning_rate": 9.830900000000001e-06, + "loss": 1.4989, + "step": 1700 + }, + { + "epoch": 26.153846153846153, + "eval_loss": 1.5671430826187134, + "eval_runtime": 11.1873, + "eval_samples_per_second": 11.71, + "eval_steps_per_second": 1.52, + "step": 1700 + }, + { + "epoch": 27.692307692307693, + "grad_norm": 26.813480377197266, + "learning_rate": 9.820900000000001e-06, + "loss": 1.4711, + "step": 1800 + }, + { + "epoch": 27.692307692307693, + "eval_loss": 1.522908329963684, + "eval_runtime": 11.2106, + "eval_samples_per_second": 11.685, + "eval_steps_per_second": 1.516, + "step": 1800 + }, + { + "epoch": 29.23076923076923, + "grad_norm": 24.576723098754883, + "learning_rate": 9.810900000000001e-06, + "loss": 1.4421, + "step": 1900 + }, + { + "epoch": 29.23076923076923, + "eval_loss": 1.5039104223251343, + "eval_runtime": 11.257, + "eval_samples_per_second": 11.637, + "eval_steps_per_second": 1.51, + "step": 1900 + }, + { + "epoch": 30.76923076923077, + "grad_norm": 28.480438232421875, + "learning_rate": 9.800900000000001e-06, + "loss": 1.4347, + "step": 2000 + }, + { + "epoch": 30.76923076923077, + "eval_loss": 1.5123459100723267, + "eval_runtime": 11.187, + "eval_samples_per_second": 11.71, + "eval_steps_per_second": 1.52, + "step": 2000 + }, + { + "epoch": 32.30769230769231, + "grad_norm": 56.582088470458984, + "learning_rate": 9.790900000000001e-06, + "loss": 1.4212, + "step": 2100 + }, + { + "epoch": 32.30769230769231, + "eval_loss": 1.481844425201416, + "eval_runtime": 11.2075, + "eval_samples_per_second": 11.689, + "eval_steps_per_second": 1.517, + "step": 2100 + }, + { + "epoch": 33.84615384615385, + "grad_norm": 38.5254020690918, + "learning_rate": 9.780900000000002e-06, + "loss": 1.3908, + "step": 2200 + }, + { + "epoch": 33.84615384615385, + "eval_loss": 1.4529048204421997, + "eval_runtime": 11.197, + "eval_samples_per_second": 11.7, + "eval_steps_per_second": 1.518, + "step": 2200 + }, + { + "epoch": 35.38461538461539, + "grad_norm": 44.74857711791992, + "learning_rate": 9.770900000000002e-06, + "loss": 1.3734, + "step": 2300 + }, + { + "epoch": 35.38461538461539, + "eval_loss": 1.4617427587509155, + "eval_runtime": 11.1235, + "eval_samples_per_second": 11.777, + "eval_steps_per_second": 1.528, + "step": 2300 + }, + { + "epoch": 36.92307692307692, + "grad_norm": 28.926782608032227, + "learning_rate": 9.760900000000002e-06, + "loss": 1.365, + "step": 2400 + }, + { + "epoch": 36.92307692307692, + "eval_loss": 1.4297789335250854, + "eval_runtime": 11.3007, + "eval_samples_per_second": 11.592, + "eval_steps_per_second": 1.504, + "step": 2400 + }, + { + "epoch": 38.46153846153846, + "grad_norm": 27.750213623046875, + "learning_rate": 9.7509e-06, + "loss": 1.3306, + "step": 2500 + }, + { + "epoch": 38.46153846153846, + "eval_loss": 1.4345914125442505, + "eval_runtime": 11.2795, + "eval_samples_per_second": 11.614, + "eval_steps_per_second": 1.507, + "step": 2500 + }, + { + "epoch": 40.0, + "grad_norm": 35.288352966308594, + "learning_rate": 9.7409e-06, + "loss": 1.3677, + "step": 2600 + }, + { + "epoch": 40.0, + "eval_loss": 1.447089433670044, + "eval_runtime": 11.1366, + "eval_samples_per_second": 11.763, + "eval_steps_per_second": 1.527, + "step": 2600 + }, + { + "epoch": 41.53846153846154, + "grad_norm": 65.49736022949219, + "learning_rate": 9.7309e-06, + "loss": 1.3453, + "step": 2700 + }, + { + "epoch": 41.53846153846154, + "eval_loss": 1.405300259590149, + "eval_runtime": 11.1549, + "eval_samples_per_second": 11.744, + "eval_steps_per_second": 1.524, + "step": 2700 + }, + { + "epoch": 43.07692307692308, + "grad_norm": 24.333518981933594, + "learning_rate": 9.7209e-06, + "loss": 1.3206, + "step": 2800 + }, + { + "epoch": 43.07692307692308, + "eval_loss": 1.4218717813491821, + "eval_runtime": 11.3113, + "eval_samples_per_second": 11.581, + "eval_steps_per_second": 1.503, + "step": 2800 + }, + { + "epoch": 44.61538461538461, + "grad_norm": 45.50777816772461, + "learning_rate": 9.7109e-06, + "loss": 1.3363, + "step": 2900 + }, + { + "epoch": 44.61538461538461, + "eval_loss": 1.4220006465911865, + "eval_runtime": 11.156, + "eval_samples_per_second": 11.743, + "eval_steps_per_second": 1.524, + "step": 2900 + }, + { + "epoch": 46.15384615384615, + "grad_norm": 25.898344039916992, + "learning_rate": 9.7009e-06, + "loss": 1.2995, + "step": 3000 + }, + { + "epoch": 46.15384615384615, + "eval_loss": 1.3942431211471558, + "eval_runtime": 11.2282, + "eval_samples_per_second": 11.667, + "eval_steps_per_second": 1.514, + "step": 3000 + }, + { + "epoch": 47.69230769230769, + "grad_norm": 56.0889892578125, + "learning_rate": 9.6909e-06, + "loss": 1.2994, + "step": 3100 + }, + { + "epoch": 47.69230769230769, + "eval_loss": 1.3970586061477661, + "eval_runtime": 11.1682, + "eval_samples_per_second": 11.73, + "eval_steps_per_second": 1.522, + "step": 3100 + }, + { + "epoch": 49.23076923076923, + "grad_norm": 58.10311508178711, + "learning_rate": 9.6809e-06, + "loss": 1.2761, + "step": 3200 + }, + { + "epoch": 49.23076923076923, + "eval_loss": 1.390371561050415, + "eval_runtime": 11.2406, + "eval_samples_per_second": 11.654, + "eval_steps_per_second": 1.512, + "step": 3200 + }, + { + "epoch": 50.76923076923077, + "grad_norm": 17.050870895385742, + "learning_rate": 9.670900000000001e-06, + "loss": 1.2712, + "step": 3300 + }, + { + "epoch": 50.76923076923077, + "eval_loss": 1.3936753273010254, + "eval_runtime": 11.2364, + "eval_samples_per_second": 11.659, + "eval_steps_per_second": 1.513, + "step": 3300 + }, + { + "epoch": 52.30769230769231, + "grad_norm": 232.21804809570312, + "learning_rate": 9.660900000000001e-06, + "loss": 1.262, + "step": 3400 + }, + { + "epoch": 52.30769230769231, + "eval_loss": 1.4037091732025146, + "eval_runtime": 11.2231, + "eval_samples_per_second": 11.672, + "eval_steps_per_second": 1.515, + "step": 3400 + }, + { + "epoch": 53.84615384615385, + "grad_norm": 35.11832046508789, + "learning_rate": 9.650900000000001e-06, + "loss": 1.2788, + "step": 3500 + }, + { + "epoch": 53.84615384615385, + "eval_loss": 1.3545280694961548, + "eval_runtime": 11.1609, + "eval_samples_per_second": 11.737, + "eval_steps_per_second": 1.523, + "step": 3500 + }, + { + "epoch": 55.38461538461539, + "grad_norm": 27.077022552490234, + "learning_rate": 9.640900000000001e-06, + "loss": 1.2711, + "step": 3600 + }, + { + "epoch": 55.38461538461539, + "eval_loss": 1.3528518676757812, + "eval_runtime": 11.2852, + "eval_samples_per_second": 11.608, + "eval_steps_per_second": 1.506, + "step": 3600 + }, + { + "epoch": 56.92307692307692, + "grad_norm": 46.97712326049805, + "learning_rate": 9.630900000000001e-06, + "loss": 1.2492, + "step": 3700 + }, + { + "epoch": 56.92307692307692, + "eval_loss": 1.3534098863601685, + "eval_runtime": 11.2368, + "eval_samples_per_second": 11.658, + "eval_steps_per_second": 1.513, + "step": 3700 + }, + { + "epoch": 58.46153846153846, + "grad_norm": 42.06857681274414, + "learning_rate": 9.620900000000001e-06, + "loss": 1.2506, + "step": 3800 + }, + { + "epoch": 58.46153846153846, + "eval_loss": 1.3613977432250977, + "eval_runtime": 11.2206, + "eval_samples_per_second": 11.675, + "eval_steps_per_second": 1.515, + "step": 3800 + }, + { + "epoch": 60.0, + "grad_norm": 19.298952102661133, + "learning_rate": 9.610900000000001e-06, + "loss": 1.2201, + "step": 3900 + }, + { + "epoch": 60.0, + "eval_loss": 1.3586474657058716, + "eval_runtime": 11.2045, + "eval_samples_per_second": 11.692, + "eval_steps_per_second": 1.517, + "step": 3900 + }, + { + "epoch": 61.53846153846154, + "grad_norm": 30.0198974609375, + "learning_rate": 9.600900000000002e-06, + "loss": 1.2086, + "step": 4000 + }, + { + "epoch": 61.53846153846154, + "eval_loss": 1.3304755687713623, + "eval_runtime": 11.193, + "eval_samples_per_second": 11.704, + "eval_steps_per_second": 1.519, + "step": 4000 + }, + { + "epoch": 63.07692307692308, + "grad_norm": 37.59902572631836, + "learning_rate": 9.5909e-06, + "loss": 1.2375, + "step": 4100 + }, + { + "epoch": 63.07692307692308, + "eval_loss": 1.331407904624939, + "eval_runtime": 10.6714, + "eval_samples_per_second": 12.276, + "eval_steps_per_second": 1.593, + "step": 4100 + }, + { + "epoch": 64.61538461538461, + "grad_norm": 36.82079315185547, + "learning_rate": 9.5809e-06, + "loss": 1.2148, + "step": 4200 + }, + { + "epoch": 64.61538461538461, + "eval_loss": 1.3441548347473145, + "eval_runtime": 10.7472, + "eval_samples_per_second": 12.189, + "eval_steps_per_second": 1.582, + "step": 4200 + }, + { + "epoch": 66.15384615384616, + "grad_norm": 30.974130630493164, + "learning_rate": 9.5709e-06, + "loss": 1.197, + "step": 4300 + }, + { + "epoch": 66.15384615384616, + "eval_loss": 1.34512197971344, + "eval_runtime": 10.7398, + "eval_samples_per_second": 12.198, + "eval_steps_per_second": 1.583, + "step": 4300 + }, + { + "epoch": 67.6923076923077, + "grad_norm": 20.45345115661621, + "learning_rate": 9.5609e-06, + "loss": 1.2361, + "step": 4400 + }, + { + "epoch": 67.6923076923077, + "eval_loss": 1.3371080160140991, + "eval_runtime": 10.654, + "eval_samples_per_second": 12.296, + "eval_steps_per_second": 1.596, + "step": 4400 + }, + { + "epoch": 69.23076923076923, + "grad_norm": 19.758630752563477, + "learning_rate": 9.5509e-06, + "loss": 1.2001, + "step": 4500 + }, + { + "epoch": 69.23076923076923, + "eval_loss": 1.3270760774612427, + "eval_runtime": 10.6928, + "eval_samples_per_second": 12.251, + "eval_steps_per_second": 1.59, + "step": 4500 + }, + { + "epoch": 70.76923076923077, + "grad_norm": 45.2899055480957, + "learning_rate": 9.5409e-06, + "loss": 1.192, + "step": 4600 + }, + { + "epoch": 70.76923076923077, + "eval_loss": 1.3184590339660645, + "eval_runtime": 10.7154, + "eval_samples_per_second": 12.225, + "eval_steps_per_second": 1.586, + "step": 4600 + }, + { + "epoch": 72.3076923076923, + "grad_norm": 45.60734939575195, + "learning_rate": 9.5309e-06, + "loss": 1.2081, + "step": 4700 + }, + { + "epoch": 72.3076923076923, + "eval_loss": 1.3107666969299316, + "eval_runtime": 10.7029, + "eval_samples_per_second": 12.24, + "eval_steps_per_second": 1.588, + "step": 4700 + }, + { + "epoch": 73.84615384615384, + "grad_norm": 26.859603881835938, + "learning_rate": 9.5209e-06, + "loss": 1.1729, + "step": 4800 + }, + { + "epoch": 73.84615384615384, + "eval_loss": 1.310062289237976, + "eval_runtime": 10.7544, + "eval_samples_per_second": 12.181, + "eval_steps_per_second": 1.581, + "step": 4800 + }, + { + "epoch": 75.38461538461539, + "grad_norm": 19.90914535522461, + "learning_rate": 9.5109e-06, + "loss": 1.1899, + "step": 4900 + }, + { + "epoch": 75.38461538461539, + "eval_loss": 1.3038017749786377, + "eval_runtime": 10.6532, + "eval_samples_per_second": 12.297, + "eval_steps_per_second": 1.596, + "step": 4900 + }, + { + "epoch": 76.92307692307692, + "grad_norm": 22.629934310913086, + "learning_rate": 9.5009e-06, + "loss": 1.1875, + "step": 5000 + }, + { + "epoch": 76.92307692307692, + "eval_loss": 1.2785409688949585, + "eval_runtime": 10.6388, + "eval_samples_per_second": 12.313, + "eval_steps_per_second": 1.598, + "step": 5000 + }, + { + "epoch": 78.46153846153847, + "grad_norm": 45.462059020996094, + "learning_rate": 9.490900000000001e-06, + "loss": 1.1717, + "step": 5100 + }, + { + "epoch": 78.46153846153847, + "eval_loss": 1.278078317642212, + "eval_runtime": 10.9666, + "eval_samples_per_second": 11.945, + "eval_steps_per_second": 1.55, + "step": 5100 + }, + { + "epoch": 80.0, + "grad_norm": 34.85255432128906, + "learning_rate": 9.480900000000001e-06, + "loss": 1.1657, + "step": 5200 + }, + { + "epoch": 80.0, + "eval_loss": 1.2711185216903687, + "eval_runtime": 11.0211, + "eval_samples_per_second": 11.886, + "eval_steps_per_second": 1.543, + "step": 5200 + }, + { + "epoch": 81.53846153846153, + "grad_norm": 19.078449249267578, + "learning_rate": 9.470900000000001e-06, + "loss": 1.1814, + "step": 5300 + }, + { + "epoch": 81.53846153846153, + "eval_loss": 1.2781996726989746, + "eval_runtime": 11.0663, + "eval_samples_per_second": 11.838, + "eval_steps_per_second": 1.536, + "step": 5300 + }, + { + "epoch": 83.07692307692308, + "grad_norm": 31.05898094177246, + "learning_rate": 9.460900000000001e-06, + "loss": 1.1452, + "step": 5400 + }, + { + "epoch": 83.07692307692308, + "eval_loss": 1.2848775386810303, + "eval_runtime": 11.01, + "eval_samples_per_second": 11.898, + "eval_steps_per_second": 1.544, + "step": 5400 + }, + { + "epoch": 84.61538461538461, + "grad_norm": 28.712461471557617, + "learning_rate": 9.450900000000001e-06, + "loss": 1.1465, + "step": 5500 + }, + { + "epoch": 84.61538461538461, + "eval_loss": 1.2928494215011597, + "eval_runtime": 10.9253, + "eval_samples_per_second": 11.991, + "eval_steps_per_second": 1.556, + "step": 5500 + }, + { + "epoch": 86.15384615384616, + "grad_norm": 19.871828079223633, + "learning_rate": 9.440900000000001e-06, + "loss": 1.1736, + "step": 5600 + }, + { + "epoch": 86.15384615384616, + "eval_loss": 1.2648124694824219, + "eval_runtime": 11.0314, + "eval_samples_per_second": 11.875, + "eval_steps_per_second": 1.541, + "step": 5600 + }, + { + "epoch": 87.6923076923077, + "grad_norm": 22.47665023803711, + "learning_rate": 9.4309e-06, + "loss": 1.1184, + "step": 5700 + }, + { + "epoch": 87.6923076923077, + "eval_loss": 1.2936598062515259, + "eval_runtime": 10.9322, + "eval_samples_per_second": 11.983, + "eval_steps_per_second": 1.555, + "step": 5700 + }, + { + "epoch": 89.23076923076923, + "grad_norm": 38.79877471923828, + "learning_rate": 9.4209e-06, + "loss": 1.1616, + "step": 5800 + }, + { + "epoch": 89.23076923076923, + "eval_loss": 1.2650004625320435, + "eval_runtime": 10.9434, + "eval_samples_per_second": 11.971, + "eval_steps_per_second": 1.553, + "step": 5800 + }, + { + "epoch": 90.76923076923077, + "grad_norm": 40.851097106933594, + "learning_rate": 9.4109e-06, + "loss": 1.1469, + "step": 5900 + }, + { + "epoch": 90.76923076923077, + "eval_loss": 1.252148151397705, + "eval_runtime": 10.9867, + "eval_samples_per_second": 11.923, + "eval_steps_per_second": 1.547, + "step": 5900 + }, + { + "epoch": 92.3076923076923, + "grad_norm": 21.35544204711914, + "learning_rate": 9.4009e-06, + "loss": 1.1489, + "step": 6000 + }, + { + "epoch": 92.3076923076923, + "eval_loss": 1.259344220161438, + "eval_runtime": 10.9649, + "eval_samples_per_second": 11.947, + "eval_steps_per_second": 1.55, + "step": 6000 + }, + { + "epoch": 93.84615384615384, + "grad_norm": 33.265132904052734, + "learning_rate": 9.3909e-06, + "loss": 1.1315, + "step": 6100 + }, + { + "epoch": 93.84615384615384, + "eval_loss": 1.252693772315979, + "eval_runtime": 10.9852, + "eval_samples_per_second": 11.925, + "eval_steps_per_second": 1.548, + "step": 6100 + }, + { + "epoch": 95.38461538461539, + "grad_norm": 23.43667221069336, + "learning_rate": 9.381e-06, + "loss": 1.119, + "step": 6200 + }, + { + "epoch": 95.38461538461539, + "eval_loss": 1.254772424697876, + "eval_runtime": 11.0937, + "eval_samples_per_second": 11.808, + "eval_steps_per_second": 1.532, + "step": 6200 + }, + { + "epoch": 96.92307692307692, + "grad_norm": 51.93602752685547, + "learning_rate": 9.371e-06, + "loss": 1.1333, + "step": 6300 + }, + { + "epoch": 96.92307692307692, + "eval_loss": 1.249408483505249, + "eval_runtime": 11.0388, + "eval_samples_per_second": 11.867, + "eval_steps_per_second": 1.54, + "step": 6300 + }, + { + "epoch": 98.46153846153847, + "grad_norm": 23.473421096801758, + "learning_rate": 9.361e-06, + "loss": 1.1164, + "step": 6400 + }, + { + "epoch": 98.46153846153847, + "eval_loss": 1.2438522577285767, + "eval_runtime": 11.1581, + "eval_samples_per_second": 11.74, + "eval_steps_per_second": 1.524, + "step": 6400 + }, + { + "epoch": 100.0, + "grad_norm": 24.228403091430664, + "learning_rate": 9.351e-06, + "loss": 1.1333, + "step": 6500 + }, + { + "epoch": 100.0, + "eval_loss": 1.248632788658142, + "eval_runtime": 10.9675, + "eval_samples_per_second": 11.944, + "eval_steps_per_second": 1.55, + "step": 6500 + }, + { + "epoch": 101.53846153846153, + "grad_norm": 18.29631996154785, + "learning_rate": 9.341000000000001e-06, + "loss": 1.1082, + "step": 6600 + }, + { + "epoch": 101.53846153846153, + "eval_loss": 1.2509865760803223, + "eval_runtime": 11.0693, + "eval_samples_per_second": 11.834, + "eval_steps_per_second": 1.536, + "step": 6600 + }, + { + "epoch": 103.07692307692308, + "grad_norm": 42.855491638183594, + "learning_rate": 9.331000000000001e-06, + "loss": 1.1178, + "step": 6700 + }, + { + "epoch": 103.07692307692308, + "eval_loss": 1.2890292406082153, + "eval_runtime": 11.0366, + "eval_samples_per_second": 11.87, + "eval_steps_per_second": 1.54, + "step": 6700 + }, + { + "epoch": 104.61538461538461, + "grad_norm": 46.675655364990234, + "learning_rate": 9.321000000000001e-06, + "loss": 1.1106, + "step": 6800 + }, + { + "epoch": 104.61538461538461, + "eval_loss": 1.2719863653182983, + "eval_runtime": 11.1266, + "eval_samples_per_second": 11.774, + "eval_steps_per_second": 1.528, + "step": 6800 + }, + { + "epoch": 106.15384615384616, + "grad_norm": 26.414846420288086, + "learning_rate": 9.311000000000001e-06, + "loss": 1.1216, + "step": 6900 + }, + { + "epoch": 106.15384615384616, + "eval_loss": 1.2423394918441772, + "eval_runtime": 11.0197, + "eval_samples_per_second": 11.888, + "eval_steps_per_second": 1.543, + "step": 6900 + }, + { + "epoch": 107.6923076923077, + "grad_norm": 31.4022274017334, + "learning_rate": 9.301000000000001e-06, + "loss": 1.1052, + "step": 7000 + }, + { + "epoch": 107.6923076923077, + "eval_loss": 1.2372961044311523, + "eval_runtime": 11.1127, + "eval_samples_per_second": 11.788, + "eval_steps_per_second": 1.53, + "step": 7000 + }, + { + "epoch": 109.23076923076923, + "grad_norm": 23.16703987121582, + "learning_rate": 9.291000000000001e-06, + "loss": 1.0911, + "step": 7100 + }, + { + "epoch": 109.23076923076923, + "eval_loss": 1.2309863567352295, + "eval_runtime": 10.9572, + "eval_samples_per_second": 11.956, + "eval_steps_per_second": 1.551, + "step": 7100 + }, + { + "epoch": 110.76923076923077, + "grad_norm": 21.648773193359375, + "learning_rate": 9.281000000000001e-06, + "loss": 1.0956, + "step": 7200 + }, + { + "epoch": 110.76923076923077, + "eval_loss": 1.2261079549789429, + "eval_runtime": 10.9351, + "eval_samples_per_second": 11.98, + "eval_steps_per_second": 1.555, + "step": 7200 + }, + { + "epoch": 112.3076923076923, + "grad_norm": 24.5791072845459, + "learning_rate": 9.271000000000002e-06, + "loss": 1.0751, + "step": 7300 + }, + { + "epoch": 112.3076923076923, + "eval_loss": 1.2161471843719482, + "eval_runtime": 11.1052, + "eval_samples_per_second": 11.796, + "eval_steps_per_second": 1.531, + "step": 7300 + }, + { + "epoch": 113.84615384615384, + "grad_norm": 35.867801666259766, + "learning_rate": 9.261000000000002e-06, + "loss": 1.086, + "step": 7400 + }, + { + "epoch": 113.84615384615384, + "eval_loss": 1.2092362642288208, + "eval_runtime": 11.2048, + "eval_samples_per_second": 11.691, + "eval_steps_per_second": 1.517, + "step": 7400 + }, + { + "epoch": 115.38461538461539, + "grad_norm": 67.91041564941406, + "learning_rate": 9.251000000000002e-06, + "loss": 1.092, + "step": 7500 + }, + { + "epoch": 115.38461538461539, + "eval_loss": 1.241829514503479, + "eval_runtime": 10.9937, + "eval_samples_per_second": 11.916, + "eval_steps_per_second": 1.546, + "step": 7500 + }, + { + "epoch": 116.92307692307692, + "grad_norm": 128.73751831054688, + "learning_rate": 9.241000000000002e-06, + "loss": 1.0764, + "step": 7600 + }, + { + "epoch": 116.92307692307692, + "eval_loss": 1.2462713718414307, + "eval_runtime": 10.9625, + "eval_samples_per_second": 11.95, + "eval_steps_per_second": 1.551, + "step": 7600 + }, + { + "epoch": 118.46153846153847, + "grad_norm": 86.5144271850586, + "learning_rate": 9.231000000000002e-06, + "loss": 1.0643, + "step": 7700 + }, + { + "epoch": 118.46153846153847, + "eval_loss": 1.2187525033950806, + "eval_runtime": 10.9492, + "eval_samples_per_second": 11.964, + "eval_steps_per_second": 1.553, + "step": 7700 + }, + { + "epoch": 120.0, + "grad_norm": 19.4710750579834, + "learning_rate": 9.221e-06, + "loss": 1.0966, + "step": 7800 + }, + { + "epoch": 120.0, + "eval_loss": 1.2282384634017944, + "eval_runtime": 10.9078, + "eval_samples_per_second": 12.01, + "eval_steps_per_second": 1.559, + "step": 7800 + }, + { + "epoch": 121.53846153846153, + "grad_norm": 37.673744201660156, + "learning_rate": 9.211e-06, + "loss": 1.0632, + "step": 7900 + }, + { + "epoch": 121.53846153846153, + "eval_loss": 1.2206230163574219, + "eval_runtime": 11.0018, + "eval_samples_per_second": 11.907, + "eval_steps_per_second": 1.545, + "step": 7900 + }, + { + "epoch": 123.07692307692308, + "grad_norm": 25.10326385498047, + "learning_rate": 9.201e-06, + "loss": 1.0873, + "step": 8000 + }, + { + "epoch": 123.07692307692308, + "eval_loss": 1.2137339115142822, + "eval_runtime": 10.9232, + "eval_samples_per_second": 11.993, + "eval_steps_per_second": 1.556, + "step": 8000 + }, + { + "epoch": 124.61538461538461, + "grad_norm": 32.02176284790039, + "learning_rate": 9.191e-06, + "loss": 1.0568, + "step": 8100 + }, + { + "epoch": 124.61538461538461, + "eval_loss": 1.2065187692642212, + "eval_runtime": 10.9614, + "eval_samples_per_second": 11.951, + "eval_steps_per_second": 1.551, + "step": 8100 + }, + { + "epoch": 126.15384615384616, + "grad_norm": 19.97406005859375, + "learning_rate": 9.181e-06, + "loss": 1.065, + "step": 8200 + }, + { + "epoch": 126.15384615384616, + "eval_loss": 1.2094841003417969, + "eval_runtime": 11.0274, + "eval_samples_per_second": 11.879, + "eval_steps_per_second": 1.542, + "step": 8200 + }, + { + "epoch": 127.6923076923077, + "grad_norm": 31.624399185180664, + "learning_rate": 9.171e-06, + "loss": 1.0805, + "step": 8300 + }, + { + "epoch": 127.6923076923077, + "eval_loss": 1.2149733304977417, + "eval_runtime": 11.2014, + "eval_samples_per_second": 11.695, + "eval_steps_per_second": 1.518, + "step": 8300 + }, + { + "epoch": 129.23076923076923, + "grad_norm": 29.24848747253418, + "learning_rate": 9.161000000000001e-06, + "loss": 1.0463, + "step": 8400 + }, + { + "epoch": 129.23076923076923, + "eval_loss": 1.2077099084854126, + "eval_runtime": 10.9432, + "eval_samples_per_second": 11.971, + "eval_steps_per_second": 1.553, + "step": 8400 + }, + { + "epoch": 130.76923076923077, + "grad_norm": 27.14787483215332, + "learning_rate": 9.151000000000001e-06, + "loss": 1.0607, + "step": 8500 + }, + { + "epoch": 130.76923076923077, + "eval_loss": 1.2046644687652588, + "eval_runtime": 11.001, + "eval_samples_per_second": 11.908, + "eval_steps_per_second": 1.545, + "step": 8500 + }, + { + "epoch": 132.30769230769232, + "grad_norm": 32.416194915771484, + "learning_rate": 9.141000000000001e-06, + "loss": 1.0365, + "step": 8600 + }, + { + "epoch": 132.30769230769232, + "eval_loss": 1.195080041885376, + "eval_runtime": 11.0205, + "eval_samples_per_second": 11.887, + "eval_steps_per_second": 1.543, + "step": 8600 + }, + { + "epoch": 133.84615384615384, + "grad_norm": 58.863582611083984, + "learning_rate": 9.131000000000001e-06, + "loss": 1.0564, + "step": 8700 + }, + { + "epoch": 133.84615384615384, + "eval_loss": 1.1966980695724487, + "eval_runtime": 10.9043, + "eval_samples_per_second": 12.014, + "eval_steps_per_second": 1.559, + "step": 8700 + }, + { + "epoch": 135.3846153846154, + "grad_norm": 26.08232307434082, + "learning_rate": 9.121000000000001e-06, + "loss": 1.0507, + "step": 8800 + }, + { + "epoch": 135.3846153846154, + "eval_loss": 1.201367735862732, + "eval_runtime": 10.9714, + "eval_samples_per_second": 11.94, + "eval_steps_per_second": 1.549, + "step": 8800 + }, + { + "epoch": 136.92307692307693, + "grad_norm": 59.368019104003906, + "learning_rate": 9.111000000000001e-06, + "loss": 1.0508, + "step": 8900 + }, + { + "epoch": 136.92307692307693, + "eval_loss": 1.208795428276062, + "eval_runtime": 11.0529, + "eval_samples_per_second": 11.852, + "eval_steps_per_second": 1.538, + "step": 8900 + }, + { + "epoch": 138.46153846153845, + "grad_norm": 43.07460021972656, + "learning_rate": 9.101000000000001e-06, + "loss": 1.0359, + "step": 9000 + }, + { + "epoch": 138.46153846153845, + "eval_loss": 1.1782392263412476, + "eval_runtime": 11.1519, + "eval_samples_per_second": 11.747, + "eval_steps_per_second": 1.524, + "step": 9000 + }, + { + "epoch": 140.0, + "grad_norm": 22.39567756652832, + "learning_rate": 9.091000000000002e-06, + "loss": 1.0584, + "step": 9100 + }, + { + "epoch": 140.0, + "eval_loss": 1.1873483657836914, + "eval_runtime": 10.8732, + "eval_samples_per_second": 12.048, + "eval_steps_per_second": 1.563, + "step": 9100 + }, + { + "epoch": 141.53846153846155, + "grad_norm": 22.813621520996094, + "learning_rate": 9.081000000000002e-06, + "loss": 1.0354, + "step": 9200 + }, + { + "epoch": 141.53846153846155, + "eval_loss": 1.1731117963790894, + "eval_runtime": 11.0847, + "eval_samples_per_second": 11.818, + "eval_steps_per_second": 1.534, + "step": 9200 + }, + { + "epoch": 143.07692307692307, + "grad_norm": 31.44073486328125, + "learning_rate": 9.071000000000002e-06, + "loss": 1.0457, + "step": 9300 + }, + { + "epoch": 143.07692307692307, + "eval_loss": 1.1962807178497314, + "eval_runtime": 11.0668, + "eval_samples_per_second": 11.837, + "eval_steps_per_second": 1.536, + "step": 9300 + }, + { + "epoch": 144.6153846153846, + "grad_norm": 18.18711280822754, + "learning_rate": 9.061e-06, + "loss": 1.0481, + "step": 9400 + }, + { + "epoch": 144.6153846153846, + "eval_loss": 1.185339093208313, + "eval_runtime": 10.9916, + "eval_samples_per_second": 11.918, + "eval_steps_per_second": 1.547, + "step": 9400 + }, + { + "epoch": 146.15384615384616, + "grad_norm": 38.05665969848633, + "learning_rate": 9.051e-06, + "loss": 1.0391, + "step": 9500 + }, + { + "epoch": 146.15384615384616, + "eval_loss": 1.1856777667999268, + "eval_runtime": 11.0391, + "eval_samples_per_second": 11.867, + "eval_steps_per_second": 1.54, + "step": 9500 + }, + { + "epoch": 147.69230769230768, + "grad_norm": 19.963260650634766, + "learning_rate": 9.041e-06, + "loss": 1.0322, + "step": 9600 + }, + { + "epoch": 147.69230769230768, + "eval_loss": 1.1843148469924927, + "eval_runtime": 11.0382, + "eval_samples_per_second": 11.868, + "eval_steps_per_second": 1.54, + "step": 9600 + }, + { + "epoch": 149.23076923076923, + "grad_norm": 28.58108901977539, + "learning_rate": 9.031e-06, + "loss": 1.0369, + "step": 9700 + }, + { + "epoch": 149.23076923076923, + "eval_loss": 1.1637182235717773, + "eval_runtime": 11.2766, + "eval_samples_per_second": 11.617, + "eval_steps_per_second": 1.508, + "step": 9700 + }, + { + "epoch": 150.76923076923077, + "grad_norm": 64.3956527709961, + "learning_rate": 9.021e-06, + "loss": 1.0519, + "step": 9800 + }, + { + "epoch": 150.76923076923077, + "eval_loss": 1.1796802282333374, + "eval_runtime": 11.3418, + "eval_samples_per_second": 11.55, + "eval_steps_per_second": 1.499, + "step": 9800 + }, + { + "epoch": 152.30769230769232, + "grad_norm": 18.857580184936523, + "learning_rate": 9.011e-06, + "loss": 1.0272, + "step": 9900 + }, + { + "epoch": 152.30769230769232, + "eval_loss": 1.1634279489517212, + "eval_runtime": 11.6749, + "eval_samples_per_second": 11.221, + "eval_steps_per_second": 1.456, + "step": 9900 + }, + { + "epoch": 153.84615384615384, + "grad_norm": 42.541683197021484, + "learning_rate": 9.001e-06, + "loss": 1.0287, + "step": 10000 + }, + { + "epoch": 153.84615384615384, + "eval_loss": 1.1698088645935059, + "eval_runtime": 11.068, + "eval_samples_per_second": 11.836, + "eval_steps_per_second": 1.536, + "step": 10000 + }, + { + "epoch": 155.3846153846154, + "grad_norm": 30.52286720275879, + "learning_rate": 8.991e-06, + "loss": 1.0237, + "step": 10100 + }, + { + "epoch": 155.3846153846154, + "eval_loss": 1.178311824798584, + "eval_runtime": 11.1107, + "eval_samples_per_second": 11.79, + "eval_steps_per_second": 1.53, + "step": 10100 + }, + { + "epoch": 156.92307692307693, + "grad_norm": 32.60612487792969, + "learning_rate": 8.981000000000001e-06, + "loss": 1.0362, + "step": 10200 + }, + { + "epoch": 156.92307692307693, + "eval_loss": 1.157893180847168, + "eval_runtime": 11.0484, + "eval_samples_per_second": 11.857, + "eval_steps_per_second": 1.539, + "step": 10200 + }, + { + "epoch": 158.46153846153845, + "grad_norm": 26.15647315979004, + "learning_rate": 8.971000000000001e-06, + "loss": 0.998, + "step": 10300 + }, + { + "epoch": 158.46153846153845, + "eval_loss": 1.1724605560302734, + "eval_runtime": 11.2323, + "eval_samples_per_second": 11.663, + "eval_steps_per_second": 1.513, + "step": 10300 + }, + { + "epoch": 160.0, + "grad_norm": 45.76993942260742, + "learning_rate": 8.961000000000001e-06, + "loss": 1.0432, + "step": 10400 + }, + { + "epoch": 160.0, + "eval_loss": 1.1688075065612793, + "eval_runtime": 11.0984, + "eval_samples_per_second": 11.803, + "eval_steps_per_second": 1.532, + "step": 10400 + }, + { + "epoch": 161.53846153846155, + "grad_norm": 40.384578704833984, + "learning_rate": 8.951000000000001e-06, + "loss": 1.0421, + "step": 10500 + }, + { + "epoch": 161.53846153846155, + "eval_loss": 1.1621845960617065, + "eval_runtime": 11.1624, + "eval_samples_per_second": 11.736, + "eval_steps_per_second": 1.523, + "step": 10500 + }, + { + "epoch": 163.07692307692307, + "grad_norm": 22.886058807373047, + "learning_rate": 8.941000000000001e-06, + "loss": 0.9912, + "step": 10600 + }, + { + "epoch": 163.07692307692307, + "eval_loss": 1.1695055961608887, + "eval_runtime": 10.9842, + "eval_samples_per_second": 11.926, + "eval_steps_per_second": 1.548, + "step": 10600 + }, + { + "epoch": 164.6153846153846, + "grad_norm": 26.940736770629883, + "learning_rate": 8.931000000000001e-06, + "loss": 1.011, + "step": 10700 + }, + { + "epoch": 164.6153846153846, + "eval_loss": 1.1458157300949097, + "eval_runtime": 11.025, + "eval_samples_per_second": 11.882, + "eval_steps_per_second": 1.542, + "step": 10700 + }, + { + "epoch": 166.15384615384616, + "grad_norm": 26.5013484954834, + "learning_rate": 8.921000000000001e-06, + "loss": 0.9876, + "step": 10800 + }, + { + "epoch": 166.15384615384616, + "eval_loss": 1.1569631099700928, + "eval_runtime": 11.101, + "eval_samples_per_second": 11.801, + "eval_steps_per_second": 1.531, + "step": 10800 + }, + { + "epoch": 167.69230769230768, + "grad_norm": 29.859987258911133, + "learning_rate": 8.911000000000002e-06, + "loss": 1.0374, + "step": 10900 + }, + { + "epoch": 167.69230769230768, + "eval_loss": 1.149316668510437, + "eval_runtime": 11.1242, + "eval_samples_per_second": 11.776, + "eval_steps_per_second": 1.528, + "step": 10900 + }, + { + "epoch": 169.23076923076923, + "grad_norm": 27.777738571166992, + "learning_rate": 8.901e-06, + "loss": 0.985, + "step": 11000 + }, + { + "epoch": 169.23076923076923, + "eval_loss": 1.1608328819274902, + "eval_runtime": 11.2215, + "eval_samples_per_second": 11.674, + "eval_steps_per_second": 1.515, + "step": 11000 + }, + { + "epoch": 170.76923076923077, + "grad_norm": 39.21344757080078, + "learning_rate": 8.891e-06, + "loss": 1.0049, + "step": 11100 + }, + { + "epoch": 170.76923076923077, + "eval_loss": 1.1642228364944458, + "eval_runtime": 11.1947, + "eval_samples_per_second": 11.702, + "eval_steps_per_second": 1.519, + "step": 11100 + }, + { + "epoch": 172.30769230769232, + "grad_norm": 29.880149841308594, + "learning_rate": 8.881e-06, + "loss": 0.9843, + "step": 11200 + }, + { + "epoch": 172.30769230769232, + "eval_loss": 1.1574000120162964, + "eval_runtime": 11.2634, + "eval_samples_per_second": 11.631, + "eval_steps_per_second": 1.509, + "step": 11200 + }, + { + "epoch": 173.84615384615384, + "grad_norm": 63.53031539916992, + "learning_rate": 8.871e-06, + "loss": 1.0354, + "step": 11300 + }, + { + "epoch": 173.84615384615384, + "eval_loss": 1.1575734615325928, + "eval_runtime": 11.0265, + "eval_samples_per_second": 11.88, + "eval_steps_per_second": 1.542, + "step": 11300 + }, + { + "epoch": 175.3846153846154, + "grad_norm": 26.937786102294922, + "learning_rate": 8.861e-06, + "loss": 0.9964, + "step": 11400 + }, + { + "epoch": 175.3846153846154, + "eval_loss": 1.1552445888519287, + "eval_runtime": 11.0325, + "eval_samples_per_second": 11.874, + "eval_steps_per_second": 1.541, + "step": 11400 + }, + { + "epoch": 176.92307692307693, + "grad_norm": 87.28536987304688, + "learning_rate": 8.851e-06, + "loss": 0.9932, + "step": 11500 + }, + { + "epoch": 176.92307692307693, + "eval_loss": 1.1411677598953247, + "eval_runtime": 11.1527, + "eval_samples_per_second": 11.746, + "eval_steps_per_second": 1.524, + "step": 11500 + }, + { + "epoch": 178.46153846153845, + "grad_norm": 25.903568267822266, + "learning_rate": 8.841e-06, + "loss": 0.9768, + "step": 11600 + }, + { + "epoch": 178.46153846153845, + "eval_loss": 1.1635726690292358, + "eval_runtime": 11.123, + "eval_samples_per_second": 11.777, + "eval_steps_per_second": 1.528, + "step": 11600 + }, + { + "epoch": 180.0, + "grad_norm": 24.315654754638672, + "learning_rate": 8.831e-06, + "loss": 0.9984, + "step": 11700 + }, + { + "epoch": 180.0, + "eval_loss": 1.1710366010665894, + "eval_runtime": 11.1375, + "eval_samples_per_second": 11.762, + "eval_steps_per_second": 1.526, + "step": 11700 + }, + { + "epoch": 181.53846153846155, + "grad_norm": 60.17182540893555, + "learning_rate": 8.821e-06, + "loss": 0.9703, + "step": 11800 + }, + { + "epoch": 181.53846153846155, + "eval_loss": 1.1556936502456665, + "eval_runtime": 11.0387, + "eval_samples_per_second": 11.867, + "eval_steps_per_second": 1.54, + "step": 11800 + }, + { + "epoch": 183.07692307692307, + "grad_norm": 32.373477935791016, + "learning_rate": 8.811000000000001e-06, + "loss": 0.9996, + "step": 11900 + }, + { + "epoch": 183.07692307692307, + "eval_loss": 1.146790623664856, + "eval_runtime": 11.1805, + "eval_samples_per_second": 11.717, + "eval_steps_per_second": 1.521, + "step": 11900 + }, + { + "epoch": 184.6153846153846, + "grad_norm": 42.578575134277344, + "learning_rate": 8.801000000000001e-06, + "loss": 0.9795, + "step": 12000 + }, + { + "epoch": 184.6153846153846, + "eval_loss": 1.144544005393982, + "eval_runtime": 11.0704, + "eval_samples_per_second": 11.833, + "eval_steps_per_second": 1.536, + "step": 12000 + }, + { + "epoch": 186.15384615384616, + "grad_norm": 22.79789161682129, + "learning_rate": 8.791000000000001e-06, + "loss": 0.9905, + "step": 12100 + }, + { + "epoch": 186.15384615384616, + "eval_loss": 1.1581685543060303, + "eval_runtime": 11.272, + "eval_samples_per_second": 11.622, + "eval_steps_per_second": 1.508, + "step": 12100 + }, + { + "epoch": 187.69230769230768, + "grad_norm": 52.061012268066406, + "learning_rate": 8.781200000000002e-06, + "loss": 0.9817, + "step": 12200 + }, + { + "epoch": 187.69230769230768, + "eval_loss": 1.159809947013855, + "eval_runtime": 11.2021, + "eval_samples_per_second": 11.694, + "eval_steps_per_second": 1.518, + "step": 12200 + }, + { + "epoch": 189.23076923076923, + "grad_norm": 35.95882034301758, + "learning_rate": 8.7712e-06, + "loss": 1.0071, + "step": 12300 + }, + { + "epoch": 189.23076923076923, + "eval_loss": 1.1944890022277832, + "eval_runtime": 10.9263, + "eval_samples_per_second": 11.989, + "eval_steps_per_second": 1.556, + "step": 12300 + }, + { + "epoch": 190.76923076923077, + "grad_norm": 213.48587036132812, + "learning_rate": 8.7612e-06, + "loss": 0.9997, + "step": 12400 + }, + { + "epoch": 190.76923076923077, + "eval_loss": 1.191455602645874, + "eval_runtime": 10.917, + "eval_samples_per_second": 12.0, + "eval_steps_per_second": 1.557, + "step": 12400 + }, + { + "epoch": 192.30769230769232, + "grad_norm": 19.97510528564453, + "learning_rate": 8.7512e-06, + "loss": 1.001, + "step": 12500 + }, + { + "epoch": 192.30769230769232, + "eval_loss": 1.167776346206665, + "eval_runtime": 11.033, + "eval_samples_per_second": 11.873, + "eval_steps_per_second": 1.541, + "step": 12500 + }, + { + "epoch": 193.84615384615384, + "grad_norm": 30.815828323364258, + "learning_rate": 8.7412e-06, + "loss": 0.9719, + "step": 12600 + }, + { + "epoch": 193.84615384615384, + "eval_loss": 1.150451898574829, + "eval_runtime": 11.2471, + "eval_samples_per_second": 11.647, + "eval_steps_per_second": 1.512, + "step": 12600 + }, + { + "epoch": 195.3846153846154, + "grad_norm": 40.32701110839844, + "learning_rate": 8.7312e-06, + "loss": 0.9658, + "step": 12700 + }, + { + "epoch": 195.3846153846154, + "eval_loss": 1.1517494916915894, + "eval_runtime": 11.1893, + "eval_samples_per_second": 11.708, + "eval_steps_per_second": 1.519, + "step": 12700 + }, + { + "epoch": 196.92307692307693, + "grad_norm": 42.11077117919922, + "learning_rate": 8.7212e-06, + "loss": 0.9744, + "step": 12800 + }, + { + "epoch": 196.92307692307693, + "eval_loss": 1.1507395505905151, + "eval_runtime": 11.2196, + "eval_samples_per_second": 11.676, + "eval_steps_per_second": 1.515, + "step": 12800 + }, + { + "epoch": 198.46153846153845, + "grad_norm": 20.991779327392578, + "learning_rate": 8.7112e-06, + "loss": 0.9695, + "step": 12900 + }, + { + "epoch": 198.46153846153845, + "eval_loss": 1.1557880640029907, + "eval_runtime": 11.2158, + "eval_samples_per_second": 11.68, + "eval_steps_per_second": 1.516, + "step": 12900 + }, + { + "epoch": 200.0, + "grad_norm": 22.79688835144043, + "learning_rate": 8.7012e-06, + "loss": 0.9652, + "step": 13000 + }, + { + "epoch": 200.0, + "eval_loss": 1.1554670333862305, + "eval_runtime": 11.1246, + "eval_samples_per_second": 11.776, + "eval_steps_per_second": 1.528, + "step": 13000 + }, + { + "epoch": 201.53846153846155, + "grad_norm": 180.34512329101562, + "learning_rate": 8.6912e-06, + "loss": 0.9664, + "step": 13100 + }, + { + "epoch": 201.53846153846155, + "eval_loss": 1.1403967142105103, + "eval_runtime": 10.9895, + "eval_samples_per_second": 11.921, + "eval_steps_per_second": 1.547, + "step": 13100 + }, + { + "epoch": 203.07692307692307, + "grad_norm": 31.583358764648438, + "learning_rate": 8.6812e-06, + "loss": 0.9672, + "step": 13200 + }, + { + "epoch": 203.07692307692307, + "eval_loss": 1.1569470167160034, + "eval_runtime": 11.2695, + "eval_samples_per_second": 11.624, + "eval_steps_per_second": 1.508, + "step": 13200 + }, + { + "epoch": 204.6153846153846, + "grad_norm": 34.05722427368164, + "learning_rate": 8.671200000000001e-06, + "loss": 0.9531, + "step": 13300 + }, + { + "epoch": 204.6153846153846, + "eval_loss": 1.1408321857452393, + "eval_runtime": 11.1074, + "eval_samples_per_second": 11.794, + "eval_steps_per_second": 1.531, + "step": 13300 + }, + { + "epoch": 206.15384615384616, + "grad_norm": 26.748388290405273, + "learning_rate": 8.661200000000001e-06, + "loss": 0.9484, + "step": 13400 + }, + { + "epoch": 206.15384615384616, + "eval_loss": 1.151693344116211, + "eval_runtime": 10.9585, + "eval_samples_per_second": 11.954, + "eval_steps_per_second": 1.551, + "step": 13400 + }, + { + "epoch": 207.69230769230768, + "grad_norm": 19.531770706176758, + "learning_rate": 8.651200000000001e-06, + "loss": 0.971, + "step": 13500 + }, + { + "epoch": 207.69230769230768, + "eval_loss": 1.138724684715271, + "eval_runtime": 11.0296, + "eval_samples_per_second": 11.877, + "eval_steps_per_second": 1.541, + "step": 13500 + }, + { + "epoch": 209.23076923076923, + "grad_norm": 23.87537384033203, + "learning_rate": 8.641200000000001e-06, + "loss": 0.944, + "step": 13600 + }, + { + "epoch": 209.23076923076923, + "eval_loss": 1.1402664184570312, + "eval_runtime": 11.1505, + "eval_samples_per_second": 11.748, + "eval_steps_per_second": 1.525, + "step": 13600 + }, + { + "epoch": 210.76923076923077, + "grad_norm": 25.069852828979492, + "learning_rate": 8.631200000000001e-06, + "loss": 0.9581, + "step": 13700 + }, + { + "epoch": 210.76923076923077, + "eval_loss": 1.1348073482513428, + "eval_runtime": 11.063, + "eval_samples_per_second": 11.841, + "eval_steps_per_second": 1.537, + "step": 13700 + }, + { + "epoch": 212.30769230769232, + "grad_norm": 23.794719696044922, + "learning_rate": 8.621200000000001e-06, + "loss": 0.957, + "step": 13800 + }, + { + "epoch": 212.30769230769232, + "eval_loss": 1.143198013305664, + "eval_runtime": 11.1422, + "eval_samples_per_second": 11.757, + "eval_steps_per_second": 1.526, + "step": 13800 + }, + { + "epoch": 213.84615384615384, + "grad_norm": 26.059829711914062, + "learning_rate": 8.611200000000002e-06, + "loss": 0.9554, + "step": 13900 + }, + { + "epoch": 213.84615384615384, + "eval_loss": 1.1333541870117188, + "eval_runtime": 10.948, + "eval_samples_per_second": 11.966, + "eval_steps_per_second": 1.553, + "step": 13900 + }, + { + "epoch": 215.3846153846154, + "grad_norm": 49.8937873840332, + "learning_rate": 8.6012e-06, + "loss": 0.9607, + "step": 14000 + }, + { + "epoch": 215.3846153846154, + "eval_loss": 1.1584446430206299, + "eval_runtime": 11.2488, + "eval_samples_per_second": 11.646, + "eval_steps_per_second": 1.511, + "step": 14000 + }, + { + "epoch": 216.92307692307693, + "grad_norm": 17.9267520904541, + "learning_rate": 8.5912e-06, + "loss": 0.9444, + "step": 14100 + }, + { + "epoch": 216.92307692307693, + "eval_loss": 1.1573532819747925, + "eval_runtime": 11.143, + "eval_samples_per_second": 11.756, + "eval_steps_per_second": 1.526, + "step": 14100 + }, + { + "epoch": 218.46153846153845, + "grad_norm": 27.38156509399414, + "learning_rate": 8.5812e-06, + "loss": 0.928, + "step": 14200 + }, + { + "epoch": 218.46153846153845, + "eval_loss": 1.1540145874023438, + "eval_runtime": 11.0475, + "eval_samples_per_second": 11.858, + "eval_steps_per_second": 1.539, + "step": 14200 + }, + { + "epoch": 220.0, + "grad_norm": 42.785037994384766, + "learning_rate": 8.5712e-06, + "loss": 0.9548, + "step": 14300 + }, + { + "epoch": 220.0, + "eval_loss": 1.1379021406173706, + "eval_runtime": 10.9412, + "eval_samples_per_second": 11.973, + "eval_steps_per_second": 1.554, + "step": 14300 + }, + { + "epoch": 221.53846153846155, + "grad_norm": 39.50480270385742, + "learning_rate": 8.5612e-06, + "loss": 0.9583, + "step": 14400 + }, + { + "epoch": 221.53846153846155, + "eval_loss": 1.1666078567504883, + "eval_runtime": 11.2066, + "eval_samples_per_second": 11.69, + "eval_steps_per_second": 1.517, + "step": 14400 + }, + { + "epoch": 223.07692307692307, + "grad_norm": 15.560932159423828, + "learning_rate": 8.5512e-06, + "loss": 0.9306, + "step": 14500 + }, + { + "epoch": 223.07692307692307, + "eval_loss": 1.151904582977295, + "eval_runtime": 11.2376, + "eval_samples_per_second": 11.657, + "eval_steps_per_second": 1.513, + "step": 14500 + }, + { + "epoch": 224.6153846153846, + "grad_norm": 36.12020492553711, + "learning_rate": 8.541400000000001e-06, + "loss": 0.9668, + "step": 14600 + }, + { + "epoch": 224.6153846153846, + "eval_loss": 1.139450192451477, + "eval_runtime": 11.1643, + "eval_samples_per_second": 11.734, + "eval_steps_per_second": 1.523, + "step": 14600 + }, + { + "epoch": 226.15384615384616, + "grad_norm": 31.29511070251465, + "learning_rate": 8.531400000000001e-06, + "loss": 0.9646, + "step": 14700 + }, + { + "epoch": 226.15384615384616, + "eval_loss": 1.1311490535736084, + "eval_runtime": 11.0, + "eval_samples_per_second": 11.909, + "eval_steps_per_second": 1.545, + "step": 14700 + }, + { + "epoch": 227.69230769230768, + "grad_norm": 22.08748435974121, + "learning_rate": 8.521400000000001e-06, + "loss": 0.922, + "step": 14800 + }, + { + "epoch": 227.69230769230768, + "eval_loss": 1.1504499912261963, + "eval_runtime": 11.135, + "eval_samples_per_second": 11.765, + "eval_steps_per_second": 1.527, + "step": 14800 + }, + { + "epoch": 229.23076923076923, + "grad_norm": 26.33457374572754, + "learning_rate": 8.511400000000001e-06, + "loss": 0.9306, + "step": 14900 + }, + { + "epoch": 229.23076923076923, + "eval_loss": 1.136217713356018, + "eval_runtime": 11.1465, + "eval_samples_per_second": 11.753, + "eval_steps_per_second": 1.525, + "step": 14900 + }, + { + "epoch": 230.76923076923077, + "grad_norm": 49.193206787109375, + "learning_rate": 8.501400000000001e-06, + "loss": 0.938, + "step": 15000 + }, + { + "epoch": 230.76923076923077, + "eval_loss": 1.1409271955490112, + "eval_runtime": 11.0858, + "eval_samples_per_second": 11.817, + "eval_steps_per_second": 1.533, + "step": 15000 + }, + { + "epoch": 232.30769230769232, + "grad_norm": 22.36850357055664, + "learning_rate": 8.491400000000001e-06, + "loss": 0.9218, + "step": 15100 + }, + { + "epoch": 232.30769230769232, + "eval_loss": 1.131103515625, + "eval_runtime": 11.0691, + "eval_samples_per_second": 11.835, + "eval_steps_per_second": 1.536, + "step": 15100 + }, + { + "epoch": 233.84615384615384, + "grad_norm": 26.34011459350586, + "learning_rate": 8.481400000000002e-06, + "loss": 0.9617, + "step": 15200 + }, + { + "epoch": 233.84615384615384, + "eval_loss": 1.1415542364120483, + "eval_runtime": 10.9887, + "eval_samples_per_second": 11.921, + "eval_steps_per_second": 1.547, + "step": 15200 + }, + { + "epoch": 235.3846153846154, + "grad_norm": 29.583358764648438, + "learning_rate": 8.4714e-06, + "loss": 0.9272, + "step": 15300 + }, + { + "epoch": 235.3846153846154, + "eval_loss": 1.144914150238037, + "eval_runtime": 11.2646, + "eval_samples_per_second": 11.629, + "eval_steps_per_second": 1.509, + "step": 15300 + }, + { + "epoch": 236.92307692307693, + "grad_norm": 31.824247360229492, + "learning_rate": 8.4614e-06, + "loss": 0.9207, + "step": 15400 + }, + { + "epoch": 236.92307692307693, + "eval_loss": 1.1387474536895752, + "eval_runtime": 11.0721, + "eval_samples_per_second": 11.832, + "eval_steps_per_second": 1.535, + "step": 15400 + }, + { + "epoch": 238.46153846153845, + "grad_norm": 41.94277572631836, + "learning_rate": 8.4514e-06, + "loss": 0.9454, + "step": 15500 + }, + { + "epoch": 238.46153846153845, + "eval_loss": 1.1316168308258057, + "eval_runtime": 11.1831, + "eval_samples_per_second": 11.714, + "eval_steps_per_second": 1.52, + "step": 15500 + }, + { + "epoch": 240.0, + "grad_norm": 21.150598526000977, + "learning_rate": 8.4414e-06, + "loss": 0.9249, + "step": 15600 + }, + { + "epoch": 240.0, + "eval_loss": 1.1368097066879272, + "eval_runtime": 11.0887, + "eval_samples_per_second": 11.814, + "eval_steps_per_second": 1.533, + "step": 15600 + }, + { + "epoch": 241.53846153846155, + "grad_norm": 47.432212829589844, + "learning_rate": 8.4314e-06, + "loss": 0.9212, + "step": 15700 + }, + { + "epoch": 241.53846153846155, + "eval_loss": 1.125348448753357, + "eval_runtime": 11.2434, + "eval_samples_per_second": 11.651, + "eval_steps_per_second": 1.512, + "step": 15700 + }, + { + "epoch": 243.07692307692307, + "grad_norm": 28.406036376953125, + "learning_rate": 8.4214e-06, + "loss": 0.9272, + "step": 15800 + }, + { + "epoch": 243.07692307692307, + "eval_loss": 1.1328097581863403, + "eval_runtime": 11.1097, + "eval_samples_per_second": 11.791, + "eval_steps_per_second": 1.53, + "step": 15800 + }, + { + "epoch": 244.6153846153846, + "grad_norm": 53.369564056396484, + "learning_rate": 8.4114e-06, + "loss": 0.9174, + "step": 15900 + }, + { + "epoch": 244.6153846153846, + "eval_loss": 1.1235861778259277, + "eval_runtime": 10.9581, + "eval_samples_per_second": 11.955, + "eval_steps_per_second": 1.551, + "step": 15900 + }, + { + "epoch": 246.15384615384616, + "grad_norm": 31.435935974121094, + "learning_rate": 8.4014e-06, + "loss": 0.9041, + "step": 16000 + }, + { + "epoch": 246.15384615384616, + "eval_loss": 1.1266686916351318, + "eval_runtime": 11.1276, + "eval_samples_per_second": 11.773, + "eval_steps_per_second": 1.528, + "step": 16000 + }, + { + "epoch": 247.69230769230768, + "grad_norm": 32.799991607666016, + "learning_rate": 8.3914e-06, + "loss": 0.9062, + "step": 16100 + }, + { + "epoch": 247.69230769230768, + "eval_loss": 1.1481693983078003, + "eval_runtime": 11.0435, + "eval_samples_per_second": 11.862, + "eval_steps_per_second": 1.539, + "step": 16100 + }, + { + "epoch": 249.23076923076923, + "grad_norm": 36.49935531616211, + "learning_rate": 8.3814e-06, + "loss": 0.9163, + "step": 16200 + }, + { + "epoch": 249.23076923076923, + "eval_loss": 1.139769434928894, + "eval_runtime": 11.1922, + "eval_samples_per_second": 11.705, + "eval_steps_per_second": 1.519, + "step": 16200 + }, + { + "epoch": 250.76923076923077, + "grad_norm": 35.4781379699707, + "learning_rate": 8.371400000000001e-06, + "loss": 0.9219, + "step": 16300 + }, + { + "epoch": 250.76923076923077, + "eval_loss": 1.1498539447784424, + "eval_runtime": 11.1805, + "eval_samples_per_second": 11.717, + "eval_steps_per_second": 1.52, + "step": 16300 + }, + { + "epoch": 252.30769230769232, + "grad_norm": 17.645612716674805, + "learning_rate": 8.361400000000001e-06, + "loss": 0.9278, + "step": 16400 + }, + { + "epoch": 252.30769230769232, + "eval_loss": 1.1338902711868286, + "eval_runtime": 11.1146, + "eval_samples_per_second": 11.786, + "eval_steps_per_second": 1.53, + "step": 16400 + }, + { + "epoch": 253.84615384615384, + "grad_norm": 32.81660079956055, + "learning_rate": 8.351400000000001e-06, + "loss": 0.9108, + "step": 16500 + }, + { + "epoch": 253.84615384615384, + "eval_loss": 1.1279574632644653, + "eval_runtime": 11.1151, + "eval_samples_per_second": 11.786, + "eval_steps_per_second": 1.529, + "step": 16500 + }, + { + "epoch": 255.3846153846154, + "grad_norm": 22.3878116607666, + "learning_rate": 8.341400000000001e-06, + "loss": 0.9011, + "step": 16600 + }, + { + "epoch": 255.3846153846154, + "eval_loss": 1.1570419073104858, + "eval_runtime": 11.0784, + "eval_samples_per_second": 11.825, + "eval_steps_per_second": 1.535, + "step": 16600 + }, + { + "epoch": 256.9230769230769, + "grad_norm": 29.845205307006836, + "learning_rate": 8.331400000000001e-06, + "loss": 0.9314, + "step": 16700 + }, + { + "epoch": 256.9230769230769, + "eval_loss": 1.1365561485290527, + "eval_runtime": 11.2405, + "eval_samples_per_second": 11.654, + "eval_steps_per_second": 1.512, + "step": 16700 + }, + { + "epoch": 258.46153846153845, + "grad_norm": 21.02674102783203, + "learning_rate": 8.321400000000001e-06, + "loss": 0.9021, + "step": 16800 + }, + { + "epoch": 258.46153846153845, + "eval_loss": 1.164974570274353, + "eval_runtime": 11.0643, + "eval_samples_per_second": 11.84, + "eval_steps_per_second": 1.536, + "step": 16800 + }, + { + "epoch": 260.0, + "grad_norm": 22.380117416381836, + "learning_rate": 8.3114e-06, + "loss": 0.912, + "step": 16900 + }, + { + "epoch": 260.0, + "eval_loss": 1.1483317613601685, + "eval_runtime": 11.1852, + "eval_samples_per_second": 11.712, + "eval_steps_per_second": 1.52, + "step": 16900 + }, + { + "epoch": 261.53846153846155, + "grad_norm": 39.20146560668945, + "learning_rate": 8.3014e-06, + "loss": 0.9165, + "step": 17000 + }, + { + "epoch": 261.53846153846155, + "eval_loss": 1.159449577331543, + "eval_runtime": 11.4058, + "eval_samples_per_second": 11.485, + "eval_steps_per_second": 1.49, + "step": 17000 + }, + { + "epoch": 263.0769230769231, + "grad_norm": 46.305389404296875, + "learning_rate": 8.2914e-06, + "loss": 0.916, + "step": 17100 + }, + { + "epoch": 263.0769230769231, + "eval_loss": 1.146033525466919, + "eval_runtime": 11.3638, + "eval_samples_per_second": 11.528, + "eval_steps_per_second": 1.496, + "step": 17100 + }, + { + "epoch": 264.61538461538464, + "grad_norm": 33.07489776611328, + "learning_rate": 8.2814e-06, + "loss": 0.9147, + "step": 17200 + }, + { + "epoch": 264.61538461538464, + "eval_loss": 1.143062710762024, + "eval_runtime": 11.3544, + "eval_samples_per_second": 11.537, + "eval_steps_per_second": 1.497, + "step": 17200 + }, + { + "epoch": 266.15384615384613, + "grad_norm": 35.233131408691406, + "learning_rate": 8.2714e-06, + "loss": 0.9151, + "step": 17300 + }, + { + "epoch": 266.15384615384613, + "eval_loss": 1.126172661781311, + "eval_runtime": 11.1185, + "eval_samples_per_second": 11.782, + "eval_steps_per_second": 1.529, + "step": 17300 + }, + { + "epoch": 267.6923076923077, + "grad_norm": 32.72975540161133, + "learning_rate": 8.2614e-06, + "loss": 0.8881, + "step": 17400 + }, + { + "epoch": 267.6923076923077, + "eval_loss": 1.1455607414245605, + "eval_runtime": 11.0568, + "eval_samples_per_second": 11.848, + "eval_steps_per_second": 1.538, + "step": 17400 + }, + { + "epoch": 269.2307692307692, + "grad_norm": 16.41983985900879, + "learning_rate": 8.2514e-06, + "loss": 0.9027, + "step": 17500 + }, + { + "epoch": 269.2307692307692, + "eval_loss": 1.1283539533615112, + "eval_runtime": 11.3233, + "eval_samples_per_second": 11.569, + "eval_steps_per_second": 1.501, + "step": 17500 + }, + { + "epoch": 270.7692307692308, + "grad_norm": 20.32726287841797, + "learning_rate": 8.2414e-06, + "loss": 0.9391, + "step": 17600 + }, + { + "epoch": 270.7692307692308, + "eval_loss": 1.124210000038147, + "eval_runtime": 11.3345, + "eval_samples_per_second": 11.558, + "eval_steps_per_second": 1.5, + "step": 17600 + }, + { + "epoch": 272.3076923076923, + "grad_norm": 23.14797019958496, + "learning_rate": 8.2314e-06, + "loss": 0.8899, + "step": 17700 + }, + { + "epoch": 272.3076923076923, + "eval_loss": 1.1297597885131836, + "eval_runtime": 11.1272, + "eval_samples_per_second": 11.773, + "eval_steps_per_second": 1.528, + "step": 17700 + }, + { + "epoch": 273.84615384615387, + "grad_norm": 18.778406143188477, + "learning_rate": 8.2214e-06, + "loss": 0.9074, + "step": 17800 + }, + { + "epoch": 273.84615384615387, + "eval_loss": 1.135562777519226, + "eval_runtime": 11.2964, + "eval_samples_per_second": 11.597, + "eval_steps_per_second": 1.505, + "step": 17800 + }, + { + "epoch": 275.38461538461536, + "grad_norm": 27.574323654174805, + "learning_rate": 8.2114e-06, + "loss": 0.8931, + "step": 17900 + }, + { + "epoch": 275.38461538461536, + "eval_loss": 1.1423242092132568, + "eval_runtime": 11.0992, + "eval_samples_per_second": 11.803, + "eval_steps_per_second": 1.532, + "step": 17900 + }, + { + "epoch": 276.9230769230769, + "grad_norm": 26.559467315673828, + "learning_rate": 8.2014e-06, + "loss": 0.8913, + "step": 18000 + }, + { + "epoch": 276.9230769230769, + "eval_loss": 1.1252741813659668, + "eval_runtime": 11.2765, + "eval_samples_per_second": 11.617, + "eval_steps_per_second": 1.508, + "step": 18000 + }, + { + "epoch": 278.46153846153845, + "grad_norm": 24.442596435546875, + "learning_rate": 8.191400000000001e-06, + "loss": 0.8993, + "step": 18100 + }, + { + "epoch": 278.46153846153845, + "eval_loss": 1.1197646856307983, + "eval_runtime": 11.0479, + "eval_samples_per_second": 11.857, + "eval_steps_per_second": 1.539, + "step": 18100 + }, + { + "epoch": 280.0, + "grad_norm": 42.99458694458008, + "learning_rate": 8.181400000000001e-06, + "loss": 0.8925, + "step": 18200 + }, + { + "epoch": 280.0, + "eval_loss": 1.129381775856018, + "eval_runtime": 11.1979, + "eval_samples_per_second": 11.699, + "eval_steps_per_second": 1.518, + "step": 18200 + }, + { + "epoch": 281.53846153846155, + "grad_norm": 38.08549118041992, + "learning_rate": 8.171400000000001e-06, + "loss": 0.8699, + "step": 18300 + }, + { + "epoch": 281.53846153846155, + "eval_loss": 1.1298307180404663, + "eval_runtime": 11.1097, + "eval_samples_per_second": 11.792, + "eval_steps_per_second": 1.53, + "step": 18300 + }, + { + "epoch": 283.0769230769231, + "grad_norm": 49.90501022338867, + "learning_rate": 8.161400000000001e-06, + "loss": 0.9207, + "step": 18400 + }, + { + "epoch": 283.0769230769231, + "eval_loss": 1.1229758262634277, + "eval_runtime": 11.284, + "eval_samples_per_second": 11.609, + "eval_steps_per_second": 1.507, + "step": 18400 + }, + { + "epoch": 284.61538461538464, + "grad_norm": 37.63615036010742, + "learning_rate": 8.1514e-06, + "loss": 0.9061, + "step": 18500 + }, + { + "epoch": 284.61538461538464, + "eval_loss": 1.1395354270935059, + "eval_runtime": 11.2087, + "eval_samples_per_second": 11.687, + "eval_steps_per_second": 1.517, + "step": 18500 + }, + { + "epoch": 286.15384615384613, + "grad_norm": 17.88991355895996, + "learning_rate": 8.1414e-06, + "loss": 0.8664, + "step": 18600 + }, + { + "epoch": 286.15384615384613, + "eval_loss": 1.1339645385742188, + "eval_runtime": 11.317, + "eval_samples_per_second": 11.576, + "eval_steps_per_second": 1.502, + "step": 18600 + }, + { + "epoch": 287.6923076923077, + "grad_norm": 33.13370895385742, + "learning_rate": 8.1314e-06, + "loss": 0.8759, + "step": 18700 + }, + { + "epoch": 287.6923076923077, + "eval_loss": 1.1445599794387817, + "eval_runtime": 11.0472, + "eval_samples_per_second": 11.858, + "eval_steps_per_second": 1.539, + "step": 18700 + }, + { + "epoch": 289.2307692307692, + "grad_norm": 22.776575088500977, + "learning_rate": 8.1214e-06, + "loss": 0.8889, + "step": 18800 + }, + { + "epoch": 289.2307692307692, + "eval_loss": 1.1401453018188477, + "eval_runtime": 11.2523, + "eval_samples_per_second": 11.642, + "eval_steps_per_second": 1.511, + "step": 18800 + }, + { + "epoch": 290.7692307692308, + "grad_norm": 19.893653869628906, + "learning_rate": 8.1114e-06, + "loss": 0.8945, + "step": 18900 + }, + { + "epoch": 290.7692307692308, + "eval_loss": 1.1185678243637085, + "eval_runtime": 10.9814, + "eval_samples_per_second": 11.929, + "eval_steps_per_second": 1.548, + "step": 18900 + }, + { + "epoch": 292.3076923076923, + "grad_norm": 35.09921646118164, + "learning_rate": 8.1015e-06, + "loss": 0.8821, + "step": 19000 + }, + { + "epoch": 292.3076923076923, + "eval_loss": 1.1313185691833496, + "eval_runtime": 11.2698, + "eval_samples_per_second": 11.624, + "eval_steps_per_second": 1.508, + "step": 19000 + }, + { + "epoch": 293.84615384615387, + "grad_norm": 36.43528366088867, + "learning_rate": 8.0915e-06, + "loss": 0.8794, + "step": 19100 + }, + { + "epoch": 293.84615384615387, + "eval_loss": 1.1506413221359253, + "eval_runtime": 10.9821, + "eval_samples_per_second": 11.928, + "eval_steps_per_second": 1.548, + "step": 19100 + }, + { + "epoch": 295.38461538461536, + "grad_norm": 24.73018455505371, + "learning_rate": 8.0815e-06, + "loss": 0.8856, + "step": 19200 + }, + { + "epoch": 295.38461538461536, + "eval_loss": 1.1280685663223267, + "eval_runtime": 11.2539, + "eval_samples_per_second": 11.64, + "eval_steps_per_second": 1.511, + "step": 19200 + }, + { + "epoch": 296.9230769230769, + "grad_norm": 50.01460647583008, + "learning_rate": 8.0715e-06, + "loss": 0.8532, + "step": 19300 + }, + { + "epoch": 296.9230769230769, + "eval_loss": 1.124637484550476, + "eval_runtime": 11.1106, + "eval_samples_per_second": 11.791, + "eval_steps_per_second": 1.53, + "step": 19300 + }, + { + "epoch": 298.46153846153845, + "grad_norm": 38.828033447265625, + "learning_rate": 8.0615e-06, + "loss": 0.8425, + "step": 19400 + }, + { + "epoch": 298.46153846153845, + "eval_loss": 1.1076736450195312, + "eval_runtime": 11.3645, + "eval_samples_per_second": 11.527, + "eval_steps_per_second": 1.496, + "step": 19400 + }, + { + "epoch": 300.0, + "grad_norm": 18.97437858581543, + "learning_rate": 8.0515e-06, + "loss": 0.8934, + "step": 19500 + }, + { + "epoch": 300.0, + "eval_loss": 1.110048532485962, + "eval_runtime": 11.1778, + "eval_samples_per_second": 11.72, + "eval_steps_per_second": 1.521, + "step": 19500 + }, + { + "epoch": 301.53846153846155, + "grad_norm": 49.44120407104492, + "learning_rate": 8.041500000000001e-06, + "loss": 0.8694, + "step": 19600 + }, + { + "epoch": 301.53846153846155, + "eval_loss": 1.1411337852478027, + "eval_runtime": 11.4024, + "eval_samples_per_second": 11.489, + "eval_steps_per_second": 1.491, + "step": 19600 + }, + { + "epoch": 303.0769230769231, + "grad_norm": 32.77436828613281, + "learning_rate": 8.031500000000001e-06, + "loss": 0.8876, + "step": 19700 + }, + { + "epoch": 303.0769230769231, + "eval_loss": 1.1274378299713135, + "eval_runtime": 11.3743, + "eval_samples_per_second": 11.517, + "eval_steps_per_second": 1.495, + "step": 19700 + }, + { + "epoch": 304.61538461538464, + "grad_norm": 33.37275695800781, + "learning_rate": 8.021500000000001e-06, + "loss": 0.839, + "step": 19800 + }, + { + "epoch": 304.61538461538464, + "eval_loss": 1.1159642934799194, + "eval_runtime": 11.2947, + "eval_samples_per_second": 11.598, + "eval_steps_per_second": 1.505, + "step": 19800 + }, + { + "epoch": 306.15384615384613, + "grad_norm": 15.981072425842285, + "learning_rate": 8.011500000000001e-06, + "loss": 0.8796, + "step": 19900 + }, + { + "epoch": 306.15384615384613, + "eval_loss": 1.1134443283081055, + "eval_runtime": 11.0923, + "eval_samples_per_second": 11.81, + "eval_steps_per_second": 1.533, + "step": 19900 + }, + { + "epoch": 307.6923076923077, + "grad_norm": 33.69294738769531, + "learning_rate": 8.001500000000001e-06, + "loss": 0.8757, + "step": 20000 + }, + { + "epoch": 307.6923076923077, + "eval_loss": 1.1174206733703613, + "eval_runtime": 11.1824, + "eval_samples_per_second": 11.715, + "eval_steps_per_second": 1.52, + "step": 20000 + }, + { + "epoch": 309.2307692307692, + "grad_norm": 39.100887298583984, + "learning_rate": 7.991500000000001e-06, + "loss": 0.9037, + "step": 20100 + }, + { + "epoch": 309.2307692307692, + "eval_loss": 1.1444469690322876, + "eval_runtime": 11.2517, + "eval_samples_per_second": 11.643, + "eval_steps_per_second": 1.511, + "step": 20100 + }, + { + "epoch": 310.7692307692308, + "grad_norm": 34.204410552978516, + "learning_rate": 7.981500000000001e-06, + "loss": 0.8714, + "step": 20200 + }, + { + "epoch": 310.7692307692308, + "eval_loss": 1.113229751586914, + "eval_runtime": 11.1634, + "eval_samples_per_second": 11.735, + "eval_steps_per_second": 1.523, + "step": 20200 + }, + { + "epoch": 312.3076923076923, + "grad_norm": 34.093692779541016, + "learning_rate": 7.971500000000002e-06, + "loss": 0.8952, + "step": 20300 + }, + { + "epoch": 312.3076923076923, + "eval_loss": 1.132067084312439, + "eval_runtime": 11.0613, + "eval_samples_per_second": 11.843, + "eval_steps_per_second": 1.537, + "step": 20300 + }, + { + "epoch": 313.84615384615387, + "grad_norm": 33.44735336303711, + "learning_rate": 7.961500000000002e-06, + "loss": 0.8545, + "step": 20400 + }, + { + "epoch": 313.84615384615387, + "eval_loss": 1.102783441543579, + "eval_runtime": 10.9986, + "eval_samples_per_second": 11.911, + "eval_steps_per_second": 1.546, + "step": 20400 + }, + { + "epoch": 315.38461538461536, + "grad_norm": 21.25609588623047, + "learning_rate": 7.9516e-06, + "loss": 0.896, + "step": 20500 + }, + { + "epoch": 315.38461538461536, + "eval_loss": 1.1276897192001343, + "eval_runtime": 11.0387, + "eval_samples_per_second": 11.867, + "eval_steps_per_second": 1.54, + "step": 20500 + }, + { + "epoch": 316.9230769230769, + "grad_norm": 42.066017150878906, + "learning_rate": 7.9416e-06, + "loss": 0.8545, + "step": 20600 + }, + { + "epoch": 316.9230769230769, + "eval_loss": 1.1395957469940186, + "eval_runtime": 11.2855, + "eval_samples_per_second": 11.608, + "eval_steps_per_second": 1.506, + "step": 20600 + }, + { + "epoch": 318.46153846153845, + "grad_norm": 22.6724796295166, + "learning_rate": 7.9316e-06, + "loss": 0.8838, + "step": 20700 + }, + { + "epoch": 318.46153846153845, + "eval_loss": 1.1254605054855347, + "eval_runtime": 11.2226, + "eval_samples_per_second": 11.673, + "eval_steps_per_second": 1.515, + "step": 20700 + }, + { + "epoch": 320.0, + "grad_norm": 31.141693115234375, + "learning_rate": 7.9216e-06, + "loss": 0.8704, + "step": 20800 + }, + { + "epoch": 320.0, + "eval_loss": 1.1148459911346436, + "eval_runtime": 11.3548, + "eval_samples_per_second": 11.537, + "eval_steps_per_second": 1.497, + "step": 20800 + }, + { + "epoch": 321.53846153846155, + "grad_norm": 53.709205627441406, + "learning_rate": 7.9116e-06, + "loss": 0.8571, + "step": 20900 + }, + { + "epoch": 321.53846153846155, + "eval_loss": 1.1235041618347168, + "eval_runtime": 11.3363, + "eval_samples_per_second": 11.556, + "eval_steps_per_second": 1.5, + "step": 20900 + }, + { + "epoch": 323.0769230769231, + "grad_norm": 36.34547805786133, + "learning_rate": 7.9016e-06, + "loss": 0.8749, + "step": 21000 + }, + { + "epoch": 323.0769230769231, + "eval_loss": 1.1102306842803955, + "eval_runtime": 11.2154, + "eval_samples_per_second": 11.68, + "eval_steps_per_second": 1.516, + "step": 21000 + }, + { + "epoch": 324.61538461538464, + "grad_norm": 19.974227905273438, + "learning_rate": 7.8916e-06, + "loss": 0.8572, + "step": 21100 + }, + { + "epoch": 324.61538461538464, + "eval_loss": 1.1081122159957886, + "eval_runtime": 11.2322, + "eval_samples_per_second": 11.663, + "eval_steps_per_second": 1.513, + "step": 21100 + }, + { + "epoch": 326.15384615384613, + "grad_norm": 31.686569213867188, + "learning_rate": 7.881600000000001e-06, + "loss": 0.8689, + "step": 21200 + }, + { + "epoch": 326.15384615384613, + "eval_loss": 1.1191128492355347, + "eval_runtime": 11.4469, + "eval_samples_per_second": 11.444, + "eval_steps_per_second": 1.485, + "step": 21200 + }, + { + "epoch": 327.6923076923077, + "grad_norm": 35.97456359863281, + "learning_rate": 7.871600000000001e-06, + "loss": 0.8421, + "step": 21300 + }, + { + "epoch": 327.6923076923077, + "eval_loss": 1.1023869514465332, + "eval_runtime": 11.2157, + "eval_samples_per_second": 11.68, + "eval_steps_per_second": 1.516, + "step": 21300 + }, + { + "epoch": 329.2307692307692, + "grad_norm": 34.61111068725586, + "learning_rate": 7.861600000000001e-06, + "loss": 0.8546, + "step": 21400 + }, + { + "epoch": 329.2307692307692, + "eval_loss": 1.1294784545898438, + "eval_runtime": 11.1536, + "eval_samples_per_second": 11.745, + "eval_steps_per_second": 1.524, + "step": 21400 + }, + { + "epoch": 330.7692307692308, + "grad_norm": 21.682897567749023, + "learning_rate": 7.8516e-06, + "loss": 0.828, + "step": 21500 + }, + { + "epoch": 330.7692307692308, + "eval_loss": 1.101345181465149, + "eval_runtime": 11.0361, + "eval_samples_per_second": 11.87, + "eval_steps_per_second": 1.54, + "step": 21500 + }, + { + "epoch": 332.3076923076923, + "grad_norm": 31.514686584472656, + "learning_rate": 7.8416e-06, + "loss": 0.8762, + "step": 21600 + }, + { + "epoch": 332.3076923076923, + "eval_loss": 1.1125372648239136, + "eval_runtime": 11.2868, + "eval_samples_per_second": 11.607, + "eval_steps_per_second": 1.506, + "step": 21600 + }, + { + "epoch": 333.84615384615387, + "grad_norm": 34.57918167114258, + "learning_rate": 7.8316e-06, + "loss": 0.8368, + "step": 21700 + }, + { + "epoch": 333.84615384615387, + "eval_loss": 1.1226669549942017, + "eval_runtime": 11.4486, + "eval_samples_per_second": 11.442, + "eval_steps_per_second": 1.485, + "step": 21700 + }, + { + "epoch": 335.38461538461536, + "grad_norm": 57.26470947265625, + "learning_rate": 7.8216e-06, + "loss": 0.8675, + "step": 21800 + }, + { + "epoch": 335.38461538461536, + "eval_loss": 1.1287323236465454, + "eval_runtime": 11.197, + "eval_samples_per_second": 11.7, + "eval_steps_per_second": 1.518, + "step": 21800 + }, + { + "epoch": 336.9230769230769, + "grad_norm": 26.583576202392578, + "learning_rate": 7.8116e-06, + "loss": 0.8555, + "step": 21900 + }, + { + "epoch": 336.9230769230769, + "eval_loss": 1.1035945415496826, + "eval_runtime": 11.2619, + "eval_samples_per_second": 11.632, + "eval_steps_per_second": 1.51, + "step": 21900 + }, + { + "epoch": 338.46153846153845, + "grad_norm": 44.196590423583984, + "learning_rate": 7.8016e-06, + "loss": 0.8601, + "step": 22000 + }, + { + "epoch": 338.46153846153845, + "eval_loss": 1.09674870967865, + "eval_runtime": 11.2173, + "eval_samples_per_second": 11.678, + "eval_steps_per_second": 1.516, + "step": 22000 + }, + { + "epoch": 340.0, + "grad_norm": 34.388145446777344, + "learning_rate": 7.7916e-06, + "loss": 0.8347, + "step": 22100 + }, + { + "epoch": 340.0, + "eval_loss": 1.100284218788147, + "eval_runtime": 11.2123, + "eval_samples_per_second": 11.684, + "eval_steps_per_second": 1.516, + "step": 22100 + }, + { + "epoch": 341.53846153846155, + "grad_norm": 33.55098342895508, + "learning_rate": 7.7816e-06, + "loss": 0.8429, + "step": 22200 + }, + { + "epoch": 341.53846153846155, + "eval_loss": 1.1070573329925537, + "eval_runtime": 11.1389, + "eval_samples_per_second": 11.761, + "eval_steps_per_second": 1.526, + "step": 22200 + }, + { + "epoch": 343.0769230769231, + "grad_norm": 21.367860794067383, + "learning_rate": 7.7716e-06, + "loss": 0.827, + "step": 22300 + }, + { + "epoch": 343.0769230769231, + "eval_loss": 1.1184412240982056, + "eval_runtime": 11.2313, + "eval_samples_per_second": 11.664, + "eval_steps_per_second": 1.514, + "step": 22300 + }, + { + "epoch": 344.61538461538464, + "grad_norm": 32.62434387207031, + "learning_rate": 7.7616e-06, + "loss": 0.8566, + "step": 22400 + }, + { + "epoch": 344.61538461538464, + "eval_loss": 1.1258198022842407, + "eval_runtime": 11.1788, + "eval_samples_per_second": 11.719, + "eval_steps_per_second": 1.521, + "step": 22400 + }, + { + "epoch": 346.15384615384613, + "grad_norm": 44.411659240722656, + "learning_rate": 7.7516e-06, + "loss": 0.8666, + "step": 22500 + }, + { + "epoch": 346.15384615384613, + "eval_loss": 1.121845006942749, + "eval_runtime": 11.1251, + "eval_samples_per_second": 11.775, + "eval_steps_per_second": 1.528, + "step": 22500 + }, + { + "epoch": 347.6923076923077, + "grad_norm": 41.10401153564453, + "learning_rate": 7.7416e-06, + "loss": 0.8354, + "step": 22600 + }, + { + "epoch": 347.6923076923077, + "eval_loss": 1.1154077053070068, + "eval_runtime": 11.161, + "eval_samples_per_second": 11.737, + "eval_steps_per_second": 1.523, + "step": 22600 + }, + { + "epoch": 349.2307692307692, + "grad_norm": 31.414745330810547, + "learning_rate": 7.7316e-06, + "loss": 0.8321, + "step": 22700 + }, + { + "epoch": 349.2307692307692, + "eval_loss": 1.1233044862747192, + "eval_runtime": 11.0203, + "eval_samples_per_second": 11.887, + "eval_steps_per_second": 1.543, + "step": 22700 + }, + { + "epoch": 350.7692307692308, + "grad_norm": 37.39424133300781, + "learning_rate": 7.7216e-06, + "loss": 0.8384, + "step": 22800 + }, + { + "epoch": 350.7692307692308, + "eval_loss": 1.1059685945510864, + "eval_runtime": 10.9825, + "eval_samples_per_second": 11.928, + "eval_steps_per_second": 1.548, + "step": 22800 + }, + { + "epoch": 352.3076923076923, + "grad_norm": 24.75067901611328, + "learning_rate": 7.7116e-06, + "loss": 0.8216, + "step": 22900 + }, + { + "epoch": 352.3076923076923, + "eval_loss": 1.1147040128707886, + "eval_runtime": 11.1066, + "eval_samples_per_second": 11.795, + "eval_steps_per_second": 1.531, + "step": 22900 + }, + { + "epoch": 353.84615384615387, + "grad_norm": 21.775754928588867, + "learning_rate": 7.701600000000001e-06, + "loss": 0.8562, + "step": 23000 + }, + { + "epoch": 353.84615384615387, + "eval_loss": 1.1105334758758545, + "eval_runtime": 11.0277, + "eval_samples_per_second": 11.879, + "eval_steps_per_second": 1.542, + "step": 23000 + }, + { + "epoch": 355.38461538461536, + "grad_norm": 33.22850799560547, + "learning_rate": 7.6916e-06, + "loss": 0.813, + "step": 23100 + }, + { + "epoch": 355.38461538461536, + "eval_loss": 1.113241195678711, + "eval_runtime": 10.9954, + "eval_samples_per_second": 11.914, + "eval_steps_per_second": 1.546, + "step": 23100 + }, + { + "epoch": 356.9230769230769, + "grad_norm": 31.402652740478516, + "learning_rate": 7.6816e-06, + "loss": 0.8527, + "step": 23200 + }, + { + "epoch": 356.9230769230769, + "eval_loss": 1.1040674448013306, + "eval_runtime": 11.0415, + "eval_samples_per_second": 11.864, + "eval_steps_per_second": 1.54, + "step": 23200 + }, + { + "epoch": 358.46153846153845, + "grad_norm": 106.14508819580078, + "learning_rate": 7.6716e-06, + "loss": 0.8223, + "step": 23300 + }, + { + "epoch": 358.46153846153845, + "eval_loss": 1.1121721267700195, + "eval_runtime": 11.1579, + "eval_samples_per_second": 11.741, + "eval_steps_per_second": 1.524, + "step": 23300 + }, + { + "epoch": 360.0, + "grad_norm": 32.76395797729492, + "learning_rate": 7.6616e-06, + "loss": 0.8199, + "step": 23400 + }, + { + "epoch": 360.0, + "eval_loss": 1.109994649887085, + "eval_runtime": 11.0068, + "eval_samples_per_second": 11.902, + "eval_steps_per_second": 1.545, + "step": 23400 + }, + { + "epoch": 361.53846153846155, + "grad_norm": 44.64813232421875, + "learning_rate": 7.6516e-06, + "loss": 0.8566, + "step": 23500 + }, + { + "epoch": 361.53846153846155, + "eval_loss": 1.1383651494979858, + "eval_runtime": 11.1866, + "eval_samples_per_second": 11.71, + "eval_steps_per_second": 1.52, + "step": 23500 + }, + { + "epoch": 363.0769230769231, + "grad_norm": 20.096393585205078, + "learning_rate": 7.6416e-06, + "loss": 0.795, + "step": 23600 + }, + { + "epoch": 363.0769230769231, + "eval_loss": 1.1200674772262573, + "eval_runtime": 11.289, + "eval_samples_per_second": 11.604, + "eval_steps_per_second": 1.506, + "step": 23600 + }, + { + "epoch": 364.61538461538464, + "grad_norm": 30.219837188720703, + "learning_rate": 7.6316e-06, + "loss": 0.8413, + "step": 23700 + }, + { + "epoch": 364.61538461538464, + "eval_loss": 1.111639380455017, + "eval_runtime": 11.2547, + "eval_samples_per_second": 11.64, + "eval_steps_per_second": 1.51, + "step": 23700 + }, + { + "epoch": 366.15384615384613, + "grad_norm": 54.91815185546875, + "learning_rate": 7.6216e-06, + "loss": 0.8284, + "step": 23800 + }, + { + "epoch": 366.15384615384613, + "eval_loss": 1.1330946683883667, + "eval_runtime": 11.3054, + "eval_samples_per_second": 11.587, + "eval_steps_per_second": 1.504, + "step": 23800 + }, + { + "epoch": 367.6923076923077, + "grad_norm": 40.97207260131836, + "learning_rate": 7.6116e-06, + "loss": 0.8387, + "step": 23900 + }, + { + "epoch": 367.6923076923077, + "eval_loss": 1.1550222635269165, + "eval_runtime": 11.2372, + "eval_samples_per_second": 11.658, + "eval_steps_per_second": 1.513, + "step": 23900 + }, + { + "epoch": 369.2307692307692, + "grad_norm": 36.580745697021484, + "learning_rate": 7.6017e-06, + "loss": 0.829, + "step": 24000 + }, + { + "epoch": 369.2307692307692, + "eval_loss": 1.1261051893234253, + "eval_runtime": 11.1834, + "eval_samples_per_second": 11.714, + "eval_steps_per_second": 1.52, + "step": 24000 + }, + { + "epoch": 370.7692307692308, + "grad_norm": 46.304649353027344, + "learning_rate": 7.5917000000000005e-06, + "loss": 0.8316, + "step": 24100 + }, + { + "epoch": 370.7692307692308, + "eval_loss": 1.1410084962844849, + "eval_runtime": 11.3511, + "eval_samples_per_second": 11.541, + "eval_steps_per_second": 1.498, + "step": 24100 + }, + { + "epoch": 372.3076923076923, + "grad_norm": 32.472293853759766, + "learning_rate": 7.581700000000001e-06, + "loss": 0.8309, + "step": 24200 + }, + { + "epoch": 372.3076923076923, + "eval_loss": 1.1283127069473267, + "eval_runtime": 11.1949, + "eval_samples_per_second": 11.702, + "eval_steps_per_second": 1.519, + "step": 24200 + }, + { + "epoch": 373.84615384615387, + "grad_norm": 24.16025161743164, + "learning_rate": 7.571700000000001e-06, + "loss": 0.8154, + "step": 24300 + }, + { + "epoch": 373.84615384615387, + "eval_loss": 1.1386806964874268, + "eval_runtime": 11.1089, + "eval_samples_per_second": 11.792, + "eval_steps_per_second": 1.53, + "step": 24300 + }, + { + "epoch": 375.38461538461536, + "grad_norm": 16.86387825012207, + "learning_rate": 7.561700000000001e-06, + "loss": 0.8221, + "step": 24400 + }, + { + "epoch": 375.38461538461536, + "eval_loss": 1.09730863571167, + "eval_runtime": 11.1299, + "eval_samples_per_second": 11.77, + "eval_steps_per_second": 1.527, + "step": 24400 + }, + { + "epoch": 376.9230769230769, + "grad_norm": 34.62860107421875, + "learning_rate": 7.551700000000001e-06, + "loss": 0.8106, + "step": 24500 + }, + { + "epoch": 376.9230769230769, + "eval_loss": 1.1233881711959839, + "eval_runtime": 11.0352, + "eval_samples_per_second": 11.871, + "eval_steps_per_second": 1.541, + "step": 24500 + }, + { + "epoch": 378.46153846153845, + "grad_norm": 16.957393646240234, + "learning_rate": 7.541700000000001e-06, + "loss": 0.8362, + "step": 24600 + }, + { + "epoch": 378.46153846153845, + "eval_loss": 1.1209690570831299, + "eval_runtime": 11.0555, + "eval_samples_per_second": 11.849, + "eval_steps_per_second": 1.538, + "step": 24600 + }, + { + "epoch": 380.0, + "grad_norm": 21.519577026367188, + "learning_rate": 7.531700000000001e-06, + "loss": 0.8139, + "step": 24700 + }, + { + "epoch": 380.0, + "eval_loss": 1.1106966733932495, + "eval_runtime": 11.0492, + "eval_samples_per_second": 11.856, + "eval_steps_per_second": 1.539, + "step": 24700 + }, + { + "epoch": 381.53846153846155, + "grad_norm": 26.86697769165039, + "learning_rate": 7.5217e-06, + "loss": 0.8036, + "step": 24800 + }, + { + "epoch": 381.53846153846155, + "eval_loss": 1.1153205633163452, + "eval_runtime": 11.1284, + "eval_samples_per_second": 11.772, + "eval_steps_per_second": 1.528, + "step": 24800 + }, + { + "epoch": 383.0769230769231, + "grad_norm": 21.1047420501709, + "learning_rate": 7.5117000000000004e-06, + "loss": 0.8115, + "step": 24900 + }, + { + "epoch": 383.0769230769231, + "eval_loss": 1.116546630859375, + "eval_runtime": 11.4261, + "eval_samples_per_second": 11.465, + "eval_steps_per_second": 1.488, + "step": 24900 + }, + { + "epoch": 384.61538461538464, + "grad_norm": 40.87736892700195, + "learning_rate": 7.5017000000000005e-06, + "loss": 0.8146, + "step": 25000 + }, + { + "epoch": 384.61538461538464, + "eval_loss": 1.1162028312683105, + "eval_runtime": 11.3423, + "eval_samples_per_second": 11.55, + "eval_steps_per_second": 1.499, + "step": 25000 + }, + { + "epoch": 386.15384615384613, + "grad_norm": 19.28094482421875, + "learning_rate": 7.491700000000001e-06, + "loss": 0.8114, + "step": 25100 + }, + { + "epoch": 386.15384615384613, + "eval_loss": 1.106558918952942, + "eval_runtime": 11.0618, + "eval_samples_per_second": 11.843, + "eval_steps_per_second": 1.537, + "step": 25100 + }, + { + "epoch": 387.6923076923077, + "grad_norm": 19.689420700073242, + "learning_rate": 7.481700000000001e-06, + "loss": 0.8076, + "step": 25200 + }, + { + "epoch": 387.6923076923077, + "eval_loss": 1.1141376495361328, + "eval_runtime": 11.0491, + "eval_samples_per_second": 11.856, + "eval_steps_per_second": 1.539, + "step": 25200 + }, + { + "epoch": 389.2307692307692, + "grad_norm": 22.76107406616211, + "learning_rate": 7.471700000000001e-06, + "loss": 0.8077, + "step": 25300 + }, + { + "epoch": 389.2307692307692, + "eval_loss": 1.1186425685882568, + "eval_runtime": 11.1798, + "eval_samples_per_second": 11.718, + "eval_steps_per_second": 1.521, + "step": 25300 + }, + { + "epoch": 390.7692307692308, + "grad_norm": 40.23360061645508, + "learning_rate": 7.461700000000001e-06, + "loss": 0.7924, + "step": 25400 + }, + { + "epoch": 390.7692307692308, + "eval_loss": 1.1189498901367188, + "eval_runtime": 11.1824, + "eval_samples_per_second": 11.715, + "eval_steps_per_second": 1.52, + "step": 25400 + }, + { + "epoch": 392.3076923076923, + "grad_norm": 32.873207092285156, + "learning_rate": 7.451700000000001e-06, + "loss": 0.8335, + "step": 25500 + }, + { + "epoch": 392.3076923076923, + "eval_loss": 1.1451479196548462, + "eval_runtime": 11.1543, + "eval_samples_per_second": 11.744, + "eval_steps_per_second": 1.524, + "step": 25500 + }, + { + "epoch": 393.84615384615387, + "grad_norm": 27.307552337646484, + "learning_rate": 7.4417e-06, + "loss": 0.7926, + "step": 25600 + }, + { + "epoch": 393.84615384615387, + "eval_loss": 1.1189228296279907, + "eval_runtime": 11.0875, + "eval_samples_per_second": 11.815, + "eval_steps_per_second": 1.533, + "step": 25600 + }, + { + "epoch": 395.38461538461536, + "grad_norm": 22.984905242919922, + "learning_rate": 7.4317e-06, + "loss": 0.8039, + "step": 25700 + }, + { + "epoch": 395.38461538461536, + "eval_loss": 1.1381311416625977, + "eval_runtime": 11.1226, + "eval_samples_per_second": 11.778, + "eval_steps_per_second": 1.528, + "step": 25700 + }, + { + "epoch": 396.9230769230769, + "grad_norm": 43.75572967529297, + "learning_rate": 7.4217000000000004e-06, + "loss": 0.8426, + "step": 25800 + }, + { + "epoch": 396.9230769230769, + "eval_loss": 1.1005278825759888, + "eval_runtime": 10.9018, + "eval_samples_per_second": 12.016, + "eval_steps_per_second": 1.559, + "step": 25800 + }, + { + "epoch": 398.46153846153845, + "grad_norm": 45.551212310791016, + "learning_rate": 7.4117000000000005e-06, + "loss": 0.7918, + "step": 25900 + }, + { + "epoch": 398.46153846153845, + "eval_loss": 1.1017777919769287, + "eval_runtime": 10.8997, + "eval_samples_per_second": 12.019, + "eval_steps_per_second": 1.56, + "step": 25900 + }, + { + "epoch": 400.0, + "grad_norm": 27.237998962402344, + "learning_rate": 7.401700000000001e-06, + "loss": 0.825, + "step": 26000 + }, + { + "epoch": 400.0, + "eval_loss": 1.1006879806518555, + "eval_runtime": 10.8272, + "eval_samples_per_second": 12.099, + "eval_steps_per_second": 1.57, + "step": 26000 + }, + { + "epoch": 401.53846153846155, + "grad_norm": 20.51366424560547, + "learning_rate": 7.391700000000001e-06, + "loss": 0.8069, + "step": 26100 + }, + { + "epoch": 401.53846153846155, + "eval_loss": 1.121657371520996, + "eval_runtime": 10.956, + "eval_samples_per_second": 11.957, + "eval_steps_per_second": 1.552, + "step": 26100 + }, + { + "epoch": 403.0769230769231, + "grad_norm": 38.978363037109375, + "learning_rate": 7.381700000000001e-06, + "loss": 0.8105, + "step": 26200 + }, + { + "epoch": 403.0769230769231, + "eval_loss": 1.1141672134399414, + "eval_runtime": 10.8817, + "eval_samples_per_second": 12.039, + "eval_steps_per_second": 1.562, + "step": 26200 + }, + { + "epoch": 404.61538461538464, + "grad_norm": 40.20176315307617, + "learning_rate": 7.371700000000001e-06, + "loss": 0.7953, + "step": 26300 + }, + { + "epoch": 404.61538461538464, + "eval_loss": 1.1042087078094482, + "eval_runtime": 10.8826, + "eval_samples_per_second": 12.038, + "eval_steps_per_second": 1.562, + "step": 26300 + }, + { + "epoch": 406.15384615384613, + "grad_norm": 27.35833168029785, + "learning_rate": 7.3617e-06, + "loss": 0.8075, + "step": 26400 + }, + { + "epoch": 406.15384615384613, + "eval_loss": 1.1115680932998657, + "eval_runtime": 10.9465, + "eval_samples_per_second": 11.967, + "eval_steps_per_second": 1.553, + "step": 26400 + }, + { + "epoch": 407.6923076923077, + "grad_norm": 32.054351806640625, + "learning_rate": 7.3517e-06, + "loss": 0.7958, + "step": 26500 + }, + { + "epoch": 407.6923076923077, + "eval_loss": 1.1209728717803955, + "eval_runtime": 10.8226, + "eval_samples_per_second": 12.104, + "eval_steps_per_second": 1.571, + "step": 26500 + }, + { + "epoch": 409.2307692307692, + "grad_norm": 37.53184509277344, + "learning_rate": 7.3417e-06, + "loss": 0.7859, + "step": 26600 + }, + { + "epoch": 409.2307692307692, + "eval_loss": 1.0920478105545044, + "eval_runtime": 10.8488, + "eval_samples_per_second": 12.075, + "eval_steps_per_second": 1.567, + "step": 26600 + }, + { + "epoch": 410.7692307692308, + "grad_norm": 39.949039459228516, + "learning_rate": 7.3317000000000005e-06, + "loss": 0.8, + "step": 26700 + }, + { + "epoch": 410.7692307692308, + "eval_loss": 1.0909216403961182, + "eval_runtime": 10.9975, + "eval_samples_per_second": 11.912, + "eval_steps_per_second": 1.546, + "step": 26700 + }, + { + "epoch": 412.3076923076923, + "grad_norm": 28.89598846435547, + "learning_rate": 7.3217000000000006e-06, + "loss": 0.8168, + "step": 26800 + }, + { + "epoch": 412.3076923076923, + "eval_loss": 1.108217477798462, + "eval_runtime": 10.9623, + "eval_samples_per_second": 11.95, + "eval_steps_per_second": 1.551, + "step": 26800 + }, + { + "epoch": 413.84615384615387, + "grad_norm": 33.56279754638672, + "learning_rate": 7.311700000000001e-06, + "loss": 0.7854, + "step": 26900 + }, + { + "epoch": 413.84615384615387, + "eval_loss": 1.1230065822601318, + "eval_runtime": 10.8811, + "eval_samples_per_second": 12.039, + "eval_steps_per_second": 1.562, + "step": 26900 + }, + { + "epoch": 415.38461538461536, + "grad_norm": 26.578338623046875, + "learning_rate": 7.301700000000001e-06, + "loss": 0.7954, + "step": 27000 + }, + { + "epoch": 415.38461538461536, + "eval_loss": 1.1114639043807983, + "eval_runtime": 10.9172, + "eval_samples_per_second": 11.999, + "eval_steps_per_second": 1.557, + "step": 27000 + }, + { + "epoch": 416.9230769230769, + "grad_norm": 31.6333065032959, + "learning_rate": 7.291700000000001e-06, + "loss": 0.8057, + "step": 27100 + }, + { + "epoch": 416.9230769230769, + "eval_loss": 1.1136469841003418, + "eval_runtime": 10.835, + "eval_samples_per_second": 12.09, + "eval_steps_per_second": 1.569, + "step": 27100 + }, + { + "epoch": 418.46153846153845, + "grad_norm": 17.80211067199707, + "learning_rate": 7.2817e-06, + "loss": 0.8074, + "step": 27200 + }, + { + "epoch": 418.46153846153845, + "eval_loss": 1.0823259353637695, + "eval_runtime": 11.1165, + "eval_samples_per_second": 11.784, + "eval_steps_per_second": 1.529, + "step": 27200 + }, + { + "epoch": 420.0, + "grad_norm": 50.79414367675781, + "learning_rate": 7.2717e-06, + "loss": 0.778, + "step": 27300 + }, + { + "epoch": 420.0, + "eval_loss": 1.0840650796890259, + "eval_runtime": 10.9207, + "eval_samples_per_second": 11.996, + "eval_steps_per_second": 1.557, + "step": 27300 + }, + { + "epoch": 421.53846153846155, + "grad_norm": 27.64950942993164, + "learning_rate": 7.2617e-06, + "loss": 0.79, + "step": 27400 + }, + { + "epoch": 421.53846153846155, + "eval_loss": 1.1120820045471191, + "eval_runtime": 10.8896, + "eval_samples_per_second": 12.03, + "eval_steps_per_second": 1.561, + "step": 27400 + }, + { + "epoch": 423.0769230769231, + "grad_norm": 28.115442276000977, + "learning_rate": 7.2517e-06, + "loss": 0.7927, + "step": 27500 + }, + { + "epoch": 423.0769230769231, + "eval_loss": 1.0890986919403076, + "eval_runtime": 10.8766, + "eval_samples_per_second": 12.044, + "eval_steps_per_second": 1.563, + "step": 27500 + }, + { + "epoch": 424.61538461538464, + "grad_norm": 56.913578033447266, + "learning_rate": 7.2417000000000005e-06, + "loss": 0.7843, + "step": 27600 + }, + { + "epoch": 424.61538461538464, + "eval_loss": 1.0650087594985962, + "eval_runtime": 10.8169, + "eval_samples_per_second": 12.111, + "eval_steps_per_second": 1.572, + "step": 27600 + }, + { + "epoch": 426.15384615384613, + "grad_norm": 28.091888427734375, + "learning_rate": 7.231800000000001e-06, + "loss": 0.8113, + "step": 27700 + }, + { + "epoch": 426.15384615384613, + "eval_loss": 1.103638768196106, + "eval_runtime": 10.8897, + "eval_samples_per_second": 12.03, + "eval_steps_per_second": 1.561, + "step": 27700 + }, + { + "epoch": 427.6923076923077, + "grad_norm": 21.665067672729492, + "learning_rate": 7.221800000000001e-06, + "loss": 0.7803, + "step": 27800 + }, + { + "epoch": 427.6923076923077, + "eval_loss": 1.1119697093963623, + "eval_runtime": 10.8141, + "eval_samples_per_second": 12.114, + "eval_steps_per_second": 1.572, + "step": 27800 + }, + { + "epoch": 429.2307692307692, + "grad_norm": 65.23661804199219, + "learning_rate": 7.211800000000001e-06, + "loss": 0.7963, + "step": 27900 + }, + { + "epoch": 429.2307692307692, + "eval_loss": 1.1217514276504517, + "eval_runtime": 10.8793, + "eval_samples_per_second": 12.041, + "eval_steps_per_second": 1.563, + "step": 27900 + }, + { + "epoch": 430.7692307692308, + "grad_norm": 43.18936538696289, + "learning_rate": 7.201800000000001e-06, + "loss": 0.7976, + "step": 28000 + }, + { + "epoch": 430.7692307692308, + "eval_loss": 1.1384552717208862, + "eval_runtime": 10.8325, + "eval_samples_per_second": 12.093, + "eval_steps_per_second": 1.569, + "step": 28000 + }, + { + "epoch": 432.3076923076923, + "grad_norm": 33.29544448852539, + "learning_rate": 7.191800000000001e-06, + "loss": 0.803, + "step": 28100 + }, + { + "epoch": 432.3076923076923, + "eval_loss": 1.1162676811218262, + "eval_runtime": 10.8215, + "eval_samples_per_second": 12.106, + "eval_steps_per_second": 1.571, + "step": 28100 + }, + { + "epoch": 433.84615384615387, + "grad_norm": 22.86684799194336, + "learning_rate": 7.181800000000001e-06, + "loss": 0.8037, + "step": 28200 + }, + { + "epoch": 433.84615384615387, + "eval_loss": 1.1036708354949951, + "eval_runtime": 10.8309, + "eval_samples_per_second": 12.095, + "eval_steps_per_second": 1.57, + "step": 28200 + }, + { + "epoch": 435.38461538461536, + "grad_norm": 21.229990005493164, + "learning_rate": 7.171800000000001e-06, + "loss": 0.7713, + "step": 28300 + }, + { + "epoch": 435.38461538461536, + "eval_loss": 1.0910826921463013, + "eval_runtime": 10.8187, + "eval_samples_per_second": 12.109, + "eval_steps_per_second": 1.571, + "step": 28300 + }, + { + "epoch": 436.9230769230769, + "grad_norm": 33.30841064453125, + "learning_rate": 7.161800000000001e-06, + "loss": 0.7956, + "step": 28400 + }, + { + "epoch": 436.9230769230769, + "eval_loss": 1.0862958431243896, + "eval_runtime": 10.8992, + "eval_samples_per_second": 12.019, + "eval_steps_per_second": 1.56, + "step": 28400 + }, + { + "epoch": 438.46153846153845, + "grad_norm": 24.01555633544922, + "learning_rate": 7.151800000000001e-06, + "loss": 0.7883, + "step": 28500 + }, + { + "epoch": 438.46153846153845, + "eval_loss": 1.1144325733184814, + "eval_runtime": 11.0935, + "eval_samples_per_second": 11.809, + "eval_steps_per_second": 1.532, + "step": 28500 + }, + { + "epoch": 440.0, + "grad_norm": 43.260921478271484, + "learning_rate": 7.141800000000001e-06, + "loss": 0.7885, + "step": 28600 + }, + { + "epoch": 440.0, + "eval_loss": 1.0987013578414917, + "eval_runtime": 10.803, + "eval_samples_per_second": 12.126, + "eval_steps_per_second": 1.574, + "step": 28600 + }, + { + "epoch": 441.53846153846155, + "grad_norm": 24.575450897216797, + "learning_rate": 7.131800000000001e-06, + "loss": 0.8052, + "step": 28700 + }, + { + "epoch": 441.53846153846155, + "eval_loss": 1.1236560344696045, + "eval_runtime": 10.7671, + "eval_samples_per_second": 12.167, + "eval_steps_per_second": 1.579, + "step": 28700 + }, + { + "epoch": 443.0769230769231, + "grad_norm": 58.26254653930664, + "learning_rate": 7.121800000000001e-06, + "loss": 0.7856, + "step": 28800 + }, + { + "epoch": 443.0769230769231, + "eval_loss": 1.1264104843139648, + "eval_runtime": 11.0135, + "eval_samples_per_second": 11.894, + "eval_steps_per_second": 1.544, + "step": 28800 + }, + { + "epoch": 444.61538461538464, + "grad_norm": 41.40846633911133, + "learning_rate": 7.111800000000001e-06, + "loss": 0.7923, + "step": 28900 + }, + { + "epoch": 444.61538461538464, + "eval_loss": 1.1042475700378418, + "eval_runtime": 10.8929, + "eval_samples_per_second": 12.026, + "eval_steps_per_second": 1.561, + "step": 28900 + }, + { + "epoch": 446.15384615384613, + "grad_norm": 42.24570846557617, + "learning_rate": 7.101800000000001e-06, + "loss": 0.8239, + "step": 29000 + }, + { + "epoch": 446.15384615384613, + "eval_loss": 1.0964312553405762, + "eval_runtime": 11.2262, + "eval_samples_per_second": 11.669, + "eval_steps_per_second": 1.514, + "step": 29000 + }, + { + "epoch": 447.6923076923077, + "grad_norm": 29.23143768310547, + "learning_rate": 7.091800000000001e-06, + "loss": 0.7698, + "step": 29100 + }, + { + "epoch": 447.6923076923077, + "eval_loss": 1.0975351333618164, + "eval_runtime": 10.963, + "eval_samples_per_second": 11.949, + "eval_steps_per_second": 1.551, + "step": 29100 + }, + { + "epoch": 449.2307692307692, + "grad_norm": 50.939388275146484, + "learning_rate": 7.0818000000000005e-06, + "loss": 0.8042, + "step": 29200 + }, + { + "epoch": 449.2307692307692, + "eval_loss": 1.095157504081726, + "eval_runtime": 10.842, + "eval_samples_per_second": 12.083, + "eval_steps_per_second": 1.568, + "step": 29200 + }, + { + "epoch": 450.7692307692308, + "grad_norm": 14.69772720336914, + "learning_rate": 7.071800000000001e-06, + "loss": 0.7854, + "step": 29300 + }, + { + "epoch": 450.7692307692308, + "eval_loss": 1.0933393239974976, + "eval_runtime": 10.8659, + "eval_samples_per_second": 12.056, + "eval_steps_per_second": 1.565, + "step": 29300 + }, + { + "epoch": 452.3076923076923, + "grad_norm": 25.141313552856445, + "learning_rate": 7.061800000000001e-06, + "loss": 0.7794, + "step": 29400 + }, + { + "epoch": 452.3076923076923, + "eval_loss": 1.1168276071548462, + "eval_runtime": 10.8764, + "eval_samples_per_second": 12.044, + "eval_steps_per_second": 1.563, + "step": 29400 + }, + { + "epoch": 453.84615384615387, + "grad_norm": 19.738088607788086, + "learning_rate": 7.051800000000001e-06, + "loss": 0.7703, + "step": 29500 + }, + { + "epoch": 453.84615384615387, + "eval_loss": 1.1066123247146606, + "eval_runtime": 10.8416, + "eval_samples_per_second": 12.083, + "eval_steps_per_second": 1.568, + "step": 29500 + }, + { + "epoch": 455.38461538461536, + "grad_norm": 37.319374084472656, + "learning_rate": 7.041800000000001e-06, + "loss": 0.7886, + "step": 29600 + }, + { + "epoch": 455.38461538461536, + "eval_loss": 1.0956982374191284, + "eval_runtime": 10.9677, + "eval_samples_per_second": 11.944, + "eval_steps_per_second": 1.55, + "step": 29600 + }, + { + "epoch": 456.9230769230769, + "grad_norm": 38.13822937011719, + "learning_rate": 7.031800000000001e-06, + "loss": 0.7931, + "step": 29700 + }, + { + "epoch": 456.9230769230769, + "eval_loss": 1.1083027124404907, + "eval_runtime": 10.9352, + "eval_samples_per_second": 11.98, + "eval_steps_per_second": 1.555, + "step": 29700 + }, + { + "epoch": 458.46153846153845, + "grad_norm": 35.276790618896484, + "learning_rate": 7.021800000000001e-06, + "loss": 0.7759, + "step": 29800 + }, + { + "epoch": 458.46153846153845, + "eval_loss": 1.0975127220153809, + "eval_runtime": 10.9213, + "eval_samples_per_second": 11.995, + "eval_steps_per_second": 1.557, + "step": 29800 + }, + { + "epoch": 460.0, + "grad_norm": 35.04694366455078, + "learning_rate": 7.011800000000001e-06, + "loss": 0.7901, + "step": 29900 + }, + { + "epoch": 460.0, + "eval_loss": 1.0966426134109497, + "eval_runtime": 10.8724, + "eval_samples_per_second": 12.049, + "eval_steps_per_second": 1.564, + "step": 29900 + }, + { + "epoch": 461.53846153846155, + "grad_norm": 18.101720809936523, + "learning_rate": 7.0018000000000005e-06, + "loss": 0.7949, + "step": 30000 + }, + { + "epoch": 461.53846153846155, + "eval_loss": 1.1292482614517212, + "eval_runtime": 10.811, + "eval_samples_per_second": 12.117, + "eval_steps_per_second": 1.572, + "step": 30000 + }, + { + "epoch": 463.0769230769231, + "grad_norm": 38.57355880737305, + "learning_rate": 6.9918000000000005e-06, + "loss": 0.7582, + "step": 30100 + }, + { + "epoch": 463.0769230769231, + "eval_loss": 1.0853633880615234, + "eval_runtime": 10.9157, + "eval_samples_per_second": 12.001, + "eval_steps_per_second": 1.557, + "step": 30100 + }, + { + "epoch": 464.61538461538464, + "grad_norm": 21.699960708618164, + "learning_rate": 6.981800000000001e-06, + "loss": 0.7696, + "step": 30200 + }, + { + "epoch": 464.61538461538464, + "eval_loss": 1.1221872568130493, + "eval_runtime": 10.964, + "eval_samples_per_second": 11.948, + "eval_steps_per_second": 1.551, + "step": 30200 + }, + { + "epoch": 466.15384615384613, + "grad_norm": 20.038455963134766, + "learning_rate": 6.971800000000001e-06, + "loss": 0.7684, + "step": 30300 + }, + { + "epoch": 466.15384615384613, + "eval_loss": 1.1108542680740356, + "eval_runtime": 10.8469, + "eval_samples_per_second": 12.077, + "eval_steps_per_second": 1.567, + "step": 30300 + }, + { + "epoch": 467.6923076923077, + "grad_norm": 24.24856185913086, + "learning_rate": 6.961800000000001e-06, + "loss": 0.781, + "step": 30400 + }, + { + "epoch": 467.6923076923077, + "eval_loss": 1.1001288890838623, + "eval_runtime": 11.2825, + "eval_samples_per_second": 11.611, + "eval_steps_per_second": 1.507, + "step": 30400 + }, + { + "epoch": 469.2307692307692, + "grad_norm": 116.92959594726562, + "learning_rate": 6.951800000000001e-06, + "loss": 0.7442, + "step": 30500 + }, + { + "epoch": 469.2307692307692, + "eval_loss": 1.1101197004318237, + "eval_runtime": 10.8905, + "eval_samples_per_second": 12.029, + "eval_steps_per_second": 1.561, + "step": 30500 + }, + { + "epoch": 470.7692307692308, + "grad_norm": 23.174938201904297, + "learning_rate": 6.941800000000001e-06, + "loss": 0.7763, + "step": 30600 + }, + { + "epoch": 470.7692307692308, + "eval_loss": 1.1175529956817627, + "eval_runtime": 10.8639, + "eval_samples_per_second": 12.058, + "eval_steps_per_second": 1.565, + "step": 30600 + }, + { + "epoch": 472.3076923076923, + "grad_norm": 35.02244186401367, + "learning_rate": 6.931800000000001e-06, + "loss": 0.7763, + "step": 30700 + }, + { + "epoch": 472.3076923076923, + "eval_loss": 1.1092499494552612, + "eval_runtime": 10.8762, + "eval_samples_per_second": 12.045, + "eval_steps_per_second": 1.563, + "step": 30700 + }, + { + "epoch": 473.84615384615387, + "grad_norm": 45.695640563964844, + "learning_rate": 6.9218e-06, + "loss": 0.7931, + "step": 30800 + }, + { + "epoch": 473.84615384615387, + "eval_loss": 1.1132335662841797, + "eval_runtime": 10.94, + "eval_samples_per_second": 11.974, + "eval_steps_per_second": 1.554, + "step": 30800 + }, + { + "epoch": 475.38461538461536, + "grad_norm": 14.497979164123535, + "learning_rate": 6.9118000000000005e-06, + "loss": 0.7646, + "step": 30900 + }, + { + "epoch": 475.38461538461536, + "eval_loss": 1.1042397022247314, + "eval_runtime": 10.9629, + "eval_samples_per_second": 11.949, + "eval_steps_per_second": 1.551, + "step": 30900 + }, + { + "epoch": 476.9230769230769, + "grad_norm": 23.597251892089844, + "learning_rate": 6.9018000000000006e-06, + "loss": 0.7423, + "step": 31000 + }, + { + "epoch": 476.9230769230769, + "eval_loss": 1.122165322303772, + "eval_runtime": 10.8816, + "eval_samples_per_second": 12.039, + "eval_steps_per_second": 1.562, + "step": 31000 + }, + { + "epoch": 478.46153846153845, + "grad_norm": 30.462820053100586, + "learning_rate": 6.891800000000001e-06, + "loss": 0.7731, + "step": 31100 + }, + { + "epoch": 478.46153846153845, + "eval_loss": 1.0937728881835938, + "eval_runtime": 10.8637, + "eval_samples_per_second": 12.059, + "eval_steps_per_second": 1.565, + "step": 31100 + }, + { + "epoch": 480.0, + "grad_norm": 34.22341537475586, + "learning_rate": 6.881800000000001e-06, + "loss": 0.7693, + "step": 31200 + }, + { + "epoch": 480.0, + "eval_loss": 1.1110594272613525, + "eval_runtime": 10.9356, + "eval_samples_per_second": 11.979, + "eval_steps_per_second": 1.555, + "step": 31200 + }, + { + "epoch": 481.53846153846155, + "grad_norm": 22.95448875427246, + "learning_rate": 6.871800000000001e-06, + "loss": 0.7596, + "step": 31300 + }, + { + "epoch": 481.53846153846155, + "eval_loss": 1.1153086423873901, + "eval_runtime": 10.8089, + "eval_samples_per_second": 12.12, + "eval_steps_per_second": 1.573, + "step": 31300 + }, + { + "epoch": 483.0769230769231, + "grad_norm": 41.664546966552734, + "learning_rate": 6.861800000000001e-06, + "loss": 0.7662, + "step": 31400 + }, + { + "epoch": 483.0769230769231, + "eval_loss": 1.094671368598938, + "eval_runtime": 10.8539, + "eval_samples_per_second": 12.069, + "eval_steps_per_second": 1.566, + "step": 31400 + }, + { + "epoch": 484.61538461538464, + "grad_norm": 41.87430953979492, + "learning_rate": 6.851800000000001e-06, + "loss": 0.7487, + "step": 31500 + }, + { + "epoch": 484.61538461538464, + "eval_loss": 1.0832558870315552, + "eval_runtime": 10.8158, + "eval_samples_per_second": 12.112, + "eval_steps_per_second": 1.572, + "step": 31500 + }, + { + "epoch": 486.15384615384613, + "grad_norm": 19.956377029418945, + "learning_rate": 6.8418e-06, + "loss": 0.7853, + "step": 31600 + }, + { + "epoch": 486.15384615384613, + "eval_loss": 1.0961304903030396, + "eval_runtime": 10.8783, + "eval_samples_per_second": 12.042, + "eval_steps_per_second": 1.563, + "step": 31600 + }, + { + "epoch": 487.6923076923077, + "grad_norm": 22.02857780456543, + "learning_rate": 6.8318e-06, + "loss": 0.779, + "step": 31700 + }, + { + "epoch": 487.6923076923077, + "eval_loss": 1.100192904472351, + "eval_runtime": 11.1077, + "eval_samples_per_second": 11.794, + "eval_steps_per_second": 1.53, + "step": 31700 + }, + { + "epoch": 489.2307692307692, + "grad_norm": 51.10612106323242, + "learning_rate": 6.8218000000000005e-06, + "loss": 0.7496, + "step": 31800 + }, + { + "epoch": 489.2307692307692, + "eval_loss": 1.0941290855407715, + "eval_runtime": 10.9049, + "eval_samples_per_second": 12.013, + "eval_steps_per_second": 1.559, + "step": 31800 + }, + { + "epoch": 490.7692307692308, + "grad_norm": 31.252660751342773, + "learning_rate": 6.811800000000001e-06, + "loss": 0.7701, + "step": 31900 + }, + { + "epoch": 490.7692307692308, + "eval_loss": 1.1330056190490723, + "eval_runtime": 11.0327, + "eval_samples_per_second": 11.874, + "eval_steps_per_second": 1.541, + "step": 31900 + }, + { + "epoch": 492.3076923076923, + "grad_norm": 19.217748641967773, + "learning_rate": 6.801800000000001e-06, + "loss": 0.7608, + "step": 32000 + }, + { + "epoch": 492.3076923076923, + "eval_loss": 1.1036320924758911, + "eval_runtime": 10.8717, + "eval_samples_per_second": 12.05, + "eval_steps_per_second": 1.564, + "step": 32000 + }, + { + "epoch": 493.84615384615387, + "grad_norm": 31.334781646728516, + "learning_rate": 6.791800000000001e-06, + "loss": 0.7612, + "step": 32100 + }, + { + "epoch": 493.84615384615387, + "eval_loss": 1.1022484302520752, + "eval_runtime": 10.9308, + "eval_samples_per_second": 11.984, + "eval_steps_per_second": 1.555, + "step": 32100 + }, + { + "epoch": 495.38461538461536, + "grad_norm": 10.683860778808594, + "learning_rate": 6.781800000000001e-06, + "loss": 0.7762, + "step": 32200 + }, + { + "epoch": 495.38461538461536, + "eval_loss": 1.090829610824585, + "eval_runtime": 10.9568, + "eval_samples_per_second": 11.956, + "eval_steps_per_second": 1.552, + "step": 32200 + }, + { + "epoch": 496.9230769230769, + "grad_norm": 37.3969612121582, + "learning_rate": 6.771800000000001e-06, + "loss": 0.758, + "step": 32300 + }, + { + "epoch": 496.9230769230769, + "eval_loss": 1.110468864440918, + "eval_runtime": 10.9251, + "eval_samples_per_second": 11.991, + "eval_steps_per_second": 1.556, + "step": 32300 + }, + { + "epoch": 498.46153846153845, + "grad_norm": 28.375669479370117, + "learning_rate": 6.7618e-06, + "loss": 0.7532, + "step": 32400 + }, + { + "epoch": 498.46153846153845, + "eval_loss": 1.0944682359695435, + "eval_runtime": 10.859, + "eval_samples_per_second": 12.064, + "eval_steps_per_second": 1.566, + "step": 32400 + }, + { + "epoch": 500.0, + "grad_norm": 38.84261703491211, + "learning_rate": 6.7518e-06, + "loss": 0.7491, + "step": 32500 + }, + { + "epoch": 500.0, + "eval_loss": 1.0800715684890747, + "eval_runtime": 11.5919, + "eval_samples_per_second": 11.301, + "eval_steps_per_second": 1.467, + "step": 32500 + }, + { + "epoch": 501.53846153846155, + "grad_norm": 23.437068939208984, + "learning_rate": 6.7418e-06, + "loss": 0.7669, + "step": 32600 + }, + { + "epoch": 501.53846153846155, + "eval_loss": 1.0843063592910767, + "eval_runtime": 11.406, + "eval_samples_per_second": 11.485, + "eval_steps_per_second": 1.49, + "step": 32600 + }, + { + "epoch": 503.0769230769231, + "grad_norm": 31.442537307739258, + "learning_rate": 6.7318000000000005e-06, + "loss": 0.7488, + "step": 32700 + }, + { + "epoch": 503.0769230769231, + "eval_loss": 1.0940508842468262, + "eval_runtime": 11.3219, + "eval_samples_per_second": 11.571, + "eval_steps_per_second": 1.502, + "step": 32700 + }, + { + "epoch": 504.61538461538464, + "grad_norm": 59.37199020385742, + "learning_rate": 6.721800000000001e-06, + "loss": 0.7446, + "step": 32800 + }, + { + "epoch": 504.61538461538464, + "eval_loss": 1.0775959491729736, + "eval_runtime": 11.5469, + "eval_samples_per_second": 11.345, + "eval_steps_per_second": 1.472, + "step": 32800 + }, + { + "epoch": 506.15384615384613, + "grad_norm": 35.232391357421875, + "learning_rate": 6.711800000000001e-06, + "loss": 0.7404, + "step": 32900 + }, + { + "epoch": 506.15384615384613, + "eval_loss": 1.0897659063339233, + "eval_runtime": 11.5782, + "eval_samples_per_second": 11.314, + "eval_steps_per_second": 1.468, + "step": 32900 + }, + { + "epoch": 507.6923076923077, + "grad_norm": 47.93821334838867, + "learning_rate": 6.701800000000001e-06, + "loss": 0.742, + "step": 33000 + }, + { + "epoch": 507.6923076923077, + "eval_loss": 1.11281156539917, + "eval_runtime": 11.3512, + "eval_samples_per_second": 11.541, + "eval_steps_per_second": 1.498, + "step": 33000 + }, + { + "epoch": 509.2307692307692, + "grad_norm": 37.49169158935547, + "learning_rate": 6.691800000000001e-06, + "loss": 0.7716, + "step": 33100 + }, + { + "epoch": 509.2307692307692, + "eval_loss": 1.0931943655014038, + "eval_runtime": 11.4512, + "eval_samples_per_second": 11.44, + "eval_steps_per_second": 1.485, + "step": 33100 + }, + { + "epoch": 510.7692307692308, + "grad_norm": 38.801361083984375, + "learning_rate": 6.6818e-06, + "loss": 0.7231, + "step": 33200 + }, + { + "epoch": 510.7692307692308, + "eval_loss": 1.106292486190796, + "eval_runtime": 11.5429, + "eval_samples_per_second": 11.349, + "eval_steps_per_second": 1.473, + "step": 33200 + }, + { + "epoch": 512.3076923076923, + "grad_norm": 31.739389419555664, + "learning_rate": 6.6718e-06, + "loss": 0.7325, + "step": 33300 + }, + { + "epoch": 512.3076923076923, + "eval_loss": 1.1008031368255615, + "eval_runtime": 11.6986, + "eval_samples_per_second": 11.198, + "eval_steps_per_second": 1.453, + "step": 33300 + }, + { + "epoch": 513.8461538461538, + "grad_norm": 41.67818069458008, + "learning_rate": 6.6618e-06, + "loss": 0.7325, + "step": 33400 + }, + { + "epoch": 513.8461538461538, + "eval_loss": 1.0946918725967407, + "eval_runtime": 11.3198, + "eval_samples_per_second": 11.573, + "eval_steps_per_second": 1.502, + "step": 33400 + }, + { + "epoch": 515.3846153846154, + "grad_norm": 28.887916564941406, + "learning_rate": 6.6518e-06, + "loss": 0.78, + "step": 33500 + }, + { + "epoch": 515.3846153846154, + "eval_loss": 1.1194220781326294, + "eval_runtime": 11.4274, + "eval_samples_per_second": 11.464, + "eval_steps_per_second": 1.488, + "step": 33500 + }, + { + "epoch": 516.9230769230769, + "grad_norm": 35.69796371459961, + "learning_rate": 6.6418000000000005e-06, + "loss": 0.737, + "step": 33600 + }, + { + "epoch": 516.9230769230769, + "eval_loss": 1.1269547939300537, + "eval_runtime": 11.5682, + "eval_samples_per_second": 11.324, + "eval_steps_per_second": 1.47, + "step": 33600 + }, + { + "epoch": 518.4615384615385, + "grad_norm": 61.11826705932617, + "learning_rate": 6.6319e-06, + "loss": 0.7306, + "step": 33700 + }, + { + "epoch": 518.4615384615385, + "eval_loss": 1.1097276210784912, + "eval_runtime": 11.4915, + "eval_samples_per_second": 11.4, + "eval_steps_per_second": 1.479, + "step": 33700 + }, + { + "epoch": 520.0, + "grad_norm": 31.179792404174805, + "learning_rate": 6.6219e-06, + "loss": 0.7648, + "step": 33800 + }, + { + "epoch": 520.0, + "eval_loss": 1.1040135622024536, + "eval_runtime": 11.443, + "eval_samples_per_second": 11.448, + "eval_steps_per_second": 1.486, + "step": 33800 + }, + { + "epoch": 521.5384615384615, + "grad_norm": 18.834291458129883, + "learning_rate": 6.611900000000001e-06, + "loss": 0.7672, + "step": 33900 + }, + { + "epoch": 521.5384615384615, + "eval_loss": 1.1044901609420776, + "eval_runtime": 11.7949, + "eval_samples_per_second": 11.106, + "eval_steps_per_second": 1.441, + "step": 33900 + }, + { + "epoch": 523.0769230769231, + "grad_norm": 20.697114944458008, + "learning_rate": 6.601900000000001e-06, + "loss": 0.7249, + "step": 34000 + }, + { + "epoch": 523.0769230769231, + "eval_loss": 1.0985796451568604, + "eval_runtime": 11.6235, + "eval_samples_per_second": 11.27, + "eval_steps_per_second": 1.463, + "step": 34000 + }, + { + "epoch": 524.6153846153846, + "grad_norm": 34.14662170410156, + "learning_rate": 6.591900000000001e-06, + "loss": 0.7502, + "step": 34100 + }, + { + "epoch": 524.6153846153846, + "eval_loss": 1.0988330841064453, + "eval_runtime": 11.5435, + "eval_samples_per_second": 11.348, + "eval_steps_per_second": 1.473, + "step": 34100 + }, + { + "epoch": 526.1538461538462, + "grad_norm": 44.42218780517578, + "learning_rate": 6.581900000000001e-06, + "loss": 0.7317, + "step": 34200 + }, + { + "epoch": 526.1538461538462, + "eval_loss": 1.1278640031814575, + "eval_runtime": 11.2566, + "eval_samples_per_second": 11.638, + "eval_steps_per_second": 1.51, + "step": 34200 + }, + { + "epoch": 527.6923076923077, + "grad_norm": 29.917627334594727, + "learning_rate": 6.571900000000001e-06, + "loss": 0.7362, + "step": 34300 + }, + { + "epoch": 527.6923076923077, + "eval_loss": 1.099636197090149, + "eval_runtime": 11.4869, + "eval_samples_per_second": 11.404, + "eval_steps_per_second": 1.48, + "step": 34300 + }, + { + "epoch": 529.2307692307693, + "grad_norm": 25.54258918762207, + "learning_rate": 6.561900000000001e-06, + "loss": 0.7549, + "step": 34400 + }, + { + "epoch": 529.2307692307693, + "eval_loss": 1.1085729598999023, + "eval_runtime": 11.5126, + "eval_samples_per_second": 11.379, + "eval_steps_per_second": 1.477, + "step": 34400 + }, + { + "epoch": 530.7692307692307, + "grad_norm": 18.7155704498291, + "learning_rate": 6.551900000000001e-06, + "loss": 0.7441, + "step": 34500 + }, + { + "epoch": 530.7692307692307, + "eval_loss": 1.1084907054901123, + "eval_runtime": 10.8375, + "eval_samples_per_second": 12.088, + "eval_steps_per_second": 1.569, + "step": 34500 + }, + { + "epoch": 532.3076923076923, + "grad_norm": 26.670413970947266, + "learning_rate": 6.541900000000001e-06, + "loss": 0.7309, + "step": 34600 + }, + { + "epoch": 532.3076923076923, + "eval_loss": 1.1282768249511719, + "eval_runtime": 10.8726, + "eval_samples_per_second": 12.049, + "eval_steps_per_second": 1.564, + "step": 34600 + }, + { + "epoch": 533.8461538461538, + "grad_norm": 18.00678825378418, + "learning_rate": 6.531900000000001e-06, + "loss": 0.7549, + "step": 34700 + }, + { + "epoch": 533.8461538461538, + "eval_loss": 1.1249347925186157, + "eval_runtime": 10.8691, + "eval_samples_per_second": 12.052, + "eval_steps_per_second": 1.564, + "step": 34700 + }, + { + "epoch": 535.3846153846154, + "grad_norm": 12.743446350097656, + "learning_rate": 6.521900000000001e-06, + "loss": 0.7427, + "step": 34800 + }, + { + "epoch": 535.3846153846154, + "eval_loss": 1.12040114402771, + "eval_runtime": 10.9245, + "eval_samples_per_second": 11.991, + "eval_steps_per_second": 1.556, + "step": 34800 + }, + { + "epoch": 536.9230769230769, + "grad_norm": 43.14711380004883, + "learning_rate": 6.511900000000001e-06, + "loss": 0.7323, + "step": 34900 + }, + { + "epoch": 536.9230769230769, + "eval_loss": 1.10872483253479, + "eval_runtime": 10.8852, + "eval_samples_per_second": 12.035, + "eval_steps_per_second": 1.562, + "step": 34900 + }, + { + "epoch": 538.4615384615385, + "grad_norm": 20.799585342407227, + "learning_rate": 6.501900000000001e-06, + "loss": 0.7484, + "step": 35000 + }, + { + "epoch": 538.4615384615385, + "eval_loss": 1.1270841360092163, + "eval_runtime": 10.9929, + "eval_samples_per_second": 11.917, + "eval_steps_per_second": 1.546, + "step": 35000 + }, + { + "epoch": 540.0, + "grad_norm": 26.492895126342773, + "learning_rate": 6.491900000000001e-06, + "loss": 0.7046, + "step": 35100 + }, + { + "epoch": 540.0, + "eval_loss": 1.1277624368667603, + "eval_runtime": 10.8994, + "eval_samples_per_second": 12.019, + "eval_steps_per_second": 1.56, + "step": 35100 + }, + { + "epoch": 541.5384615384615, + "grad_norm": 14.708949089050293, + "learning_rate": 6.4819000000000006e-06, + "loss": 0.7292, + "step": 35200 + }, + { + "epoch": 541.5384615384615, + "eval_loss": 1.12324059009552, + "eval_runtime": 11.5243, + "eval_samples_per_second": 11.367, + "eval_steps_per_second": 1.475, + "step": 35200 + }, + { + "epoch": 543.0769230769231, + "grad_norm": 28.36172103881836, + "learning_rate": 6.472000000000001e-06, + "loss": 0.743, + "step": 35300 + }, + { + "epoch": 543.0769230769231, + "eval_loss": 1.1157997846603394, + "eval_runtime": 11.4175, + "eval_samples_per_second": 11.474, + "eval_steps_per_second": 1.489, + "step": 35300 + }, + { + "epoch": 544.6153846153846, + "grad_norm": 23.058122634887695, + "learning_rate": 6.462e-06, + "loss": 0.7372, + "step": 35400 + }, + { + "epoch": 544.6153846153846, + "eval_loss": 1.1235535144805908, + "eval_runtime": 11.4868, + "eval_samples_per_second": 11.404, + "eval_steps_per_second": 1.48, + "step": 35400 + }, + { + "epoch": 546.1538461538462, + "grad_norm": 36.28565979003906, + "learning_rate": 6.452e-06, + "loss": 0.7378, + "step": 35500 + }, + { + "epoch": 546.1538461538462, + "eval_loss": 1.1193617582321167, + "eval_runtime": 11.6387, + "eval_samples_per_second": 11.256, + "eval_steps_per_second": 1.461, + "step": 35500 + }, + { + "epoch": 547.6923076923077, + "grad_norm": 22.836715698242188, + "learning_rate": 6.442e-06, + "loss": 0.7345, + "step": 35600 + }, + { + "epoch": 547.6923076923077, + "eval_loss": 1.1083735227584839, + "eval_runtime": 11.4393, + "eval_samples_per_second": 11.452, + "eval_steps_per_second": 1.486, + "step": 35600 + }, + { + "epoch": 549.2307692307693, + "grad_norm": 33.76357650756836, + "learning_rate": 6.432e-06, + "loss": 0.7327, + "step": 35700 + }, + { + "epoch": 549.2307692307693, + "eval_loss": 1.1143224239349365, + "eval_runtime": 11.3719, + "eval_samples_per_second": 11.52, + "eval_steps_per_second": 1.495, + "step": 35700 + }, + { + "epoch": 550.7692307692307, + "grad_norm": 25.397972106933594, + "learning_rate": 6.4220000000000005e-06, + "loss": 0.7329, + "step": 35800 + }, + { + "epoch": 550.7692307692307, + "eval_loss": 1.1176689863204956, + "eval_runtime": 11.6532, + "eval_samples_per_second": 11.242, + "eval_steps_per_second": 1.459, + "step": 35800 + }, + { + "epoch": 552.3076923076923, + "grad_norm": 31.618921279907227, + "learning_rate": 6.412000000000001e-06, + "loss": 0.731, + "step": 35900 + }, + { + "epoch": 552.3076923076923, + "eval_loss": 1.1206835508346558, + "eval_runtime": 11.5798, + "eval_samples_per_second": 11.313, + "eval_steps_per_second": 1.468, + "step": 35900 + }, + { + "epoch": 553.8461538461538, + "grad_norm": 14.43566608428955, + "learning_rate": 6.402000000000001e-06, + "loss": 0.7322, + "step": 36000 + }, + { + "epoch": 553.8461538461538, + "eval_loss": 1.126084566116333, + "eval_runtime": 11.4063, + "eval_samples_per_second": 11.485, + "eval_steps_per_second": 1.49, + "step": 36000 + }, + { + "epoch": 555.3846153846154, + "grad_norm": 31.066484451293945, + "learning_rate": 6.392000000000001e-06, + "loss": 0.7078, + "step": 36100 + }, + { + "epoch": 555.3846153846154, + "eval_loss": 1.122070074081421, + "eval_runtime": 11.5483, + "eval_samples_per_second": 11.344, + "eval_steps_per_second": 1.472, + "step": 36100 + }, + { + "epoch": 556.9230769230769, + "grad_norm": 31.682966232299805, + "learning_rate": 6.382e-06, + "loss": 0.7287, + "step": 36200 + }, + { + "epoch": 556.9230769230769, + "eval_loss": 1.1081511974334717, + "eval_runtime": 11.6203, + "eval_samples_per_second": 11.273, + "eval_steps_per_second": 1.463, + "step": 36200 + }, + { + "epoch": 558.4615384615385, + "grad_norm": 29.7509822845459, + "learning_rate": 6.372e-06, + "loss": 0.7509, + "step": 36300 + }, + { + "epoch": 558.4615384615385, + "eval_loss": 1.109014868736267, + "eval_runtime": 11.4632, + "eval_samples_per_second": 11.428, + "eval_steps_per_second": 1.483, + "step": 36300 + }, + { + "epoch": 560.0, + "grad_norm": 18.40947723388672, + "learning_rate": 6.362e-06, + "loss": 0.7191, + "step": 36400 + }, + { + "epoch": 560.0, + "eval_loss": 1.0958855152130127, + "eval_runtime": 11.4042, + "eval_samples_per_second": 11.487, + "eval_steps_per_second": 1.491, + "step": 36400 + }, + { + "epoch": 561.5384615384615, + "grad_norm": 34.25148391723633, + "learning_rate": 6.352e-06, + "loss": 0.7413, + "step": 36500 + }, + { + "epoch": 561.5384615384615, + "eval_loss": 1.1016945838928223, + "eval_runtime": 11.9112, + "eval_samples_per_second": 10.998, + "eval_steps_per_second": 1.427, + "step": 36500 + }, + { + "epoch": 563.0769230769231, + "grad_norm": 28.210641860961914, + "learning_rate": 6.3420000000000004e-06, + "loss": 0.7105, + "step": 36600 + }, + { + "epoch": 563.0769230769231, + "eval_loss": 1.0802854299545288, + "eval_runtime": 11.5412, + "eval_samples_per_second": 11.351, + "eval_steps_per_second": 1.473, + "step": 36600 + }, + { + "epoch": 564.6153846153846, + "grad_norm": 27.83544921875, + "learning_rate": 6.3320000000000005e-06, + "loss": 0.7563, + "step": 36700 + }, + { + "epoch": 564.6153846153846, + "eval_loss": 1.1238279342651367, + "eval_runtime": 11.5371, + "eval_samples_per_second": 11.355, + "eval_steps_per_second": 1.474, + "step": 36700 + }, + { + "epoch": 566.1538461538462, + "grad_norm": 34.13388442993164, + "learning_rate": 6.322000000000001e-06, + "loss": 0.7063, + "step": 36800 + }, + { + "epoch": 566.1538461538462, + "eval_loss": 1.1076627969741821, + "eval_runtime": 11.0013, + "eval_samples_per_second": 11.908, + "eval_steps_per_second": 1.545, + "step": 36800 + }, + { + "epoch": 567.6923076923077, + "grad_norm": 30.208393096923828, + "learning_rate": 6.312000000000001e-06, + "loss": 0.7383, + "step": 36900 + }, + { + "epoch": 567.6923076923077, + "eval_loss": 1.1094681024551392, + "eval_runtime": 10.8669, + "eval_samples_per_second": 12.055, + "eval_steps_per_second": 1.564, + "step": 36900 + }, + { + "epoch": 569.2307692307693, + "grad_norm": 27.49210548400879, + "learning_rate": 6.302e-06, + "loss": 0.7139, + "step": 37000 + }, + { + "epoch": 569.2307692307693, + "eval_loss": 1.1218080520629883, + "eval_runtime": 11.5696, + "eval_samples_per_second": 11.323, + "eval_steps_per_second": 1.469, + "step": 37000 + }, + { + "epoch": 570.7692307692307, + "grad_norm": 47.69194793701172, + "learning_rate": 6.292e-06, + "loss": 0.7198, + "step": 37100 + }, + { + "epoch": 570.7692307692307, + "eval_loss": 1.1173454523086548, + "eval_runtime": 11.4969, + "eval_samples_per_second": 11.394, + "eval_steps_per_second": 1.479, + "step": 37100 + }, + { + "epoch": 572.3076923076923, + "grad_norm": 23.722719192504883, + "learning_rate": 6.282e-06, + "loss": 0.7182, + "step": 37200 + }, + { + "epoch": 572.3076923076923, + "eval_loss": 1.115249752998352, + "eval_runtime": 11.3349, + "eval_samples_per_second": 11.557, + "eval_steps_per_second": 1.5, + "step": 37200 + }, + { + "epoch": 573.8461538461538, + "grad_norm": 28.419593811035156, + "learning_rate": 6.272e-06, + "loss": 0.7364, + "step": 37300 + }, + { + "epoch": 573.8461538461538, + "eval_loss": 1.1297661066055298, + "eval_runtime": 11.6188, + "eval_samples_per_second": 11.275, + "eval_steps_per_second": 1.463, + "step": 37300 + }, + { + "epoch": 575.3846153846154, + "grad_norm": 22.39786148071289, + "learning_rate": 6.262e-06, + "loss": 0.7332, + "step": 37400 + }, + { + "epoch": 575.3846153846154, + "eval_loss": 1.1166205406188965, + "eval_runtime": 11.4913, + "eval_samples_per_second": 11.4, + "eval_steps_per_second": 1.479, + "step": 37400 + }, + { + "epoch": 576.9230769230769, + "grad_norm": 16.601444244384766, + "learning_rate": 6.2520000000000004e-06, + "loss": 0.6947, + "step": 37500 + }, + { + "epoch": 576.9230769230769, + "eval_loss": 1.1165810823440552, + "eval_runtime": 11.5215, + "eval_samples_per_second": 11.37, + "eval_steps_per_second": 1.476, + "step": 37500 + }, + { + "epoch": 578.4615384615385, + "grad_norm": 34.82759094238281, + "learning_rate": 6.2420000000000005e-06, + "loss": 0.7208, + "step": 37600 + }, + { + "epoch": 578.4615384615385, + "eval_loss": 1.1390671730041504, + "eval_runtime": 11.5816, + "eval_samples_per_second": 11.311, + "eval_steps_per_second": 1.468, + "step": 37600 + }, + { + "epoch": 580.0, + "grad_norm": 58.55775451660156, + "learning_rate": 6.232000000000001e-06, + "loss": 0.72, + "step": 37700 + }, + { + "epoch": 580.0, + "eval_loss": 1.135158896446228, + "eval_runtime": 11.5241, + "eval_samples_per_second": 11.367, + "eval_steps_per_second": 1.475, + "step": 37700 + }, + { + "epoch": 581.5384615384615, + "grad_norm": 28.227439880371094, + "learning_rate": 6.222e-06, + "loss": 0.72, + "step": 37800 + }, + { + "epoch": 581.5384615384615, + "eval_loss": 1.122320532798767, + "eval_runtime": 11.3973, + "eval_samples_per_second": 11.494, + "eval_steps_per_second": 1.492, + "step": 37800 + }, + { + "epoch": 583.0769230769231, + "grad_norm": 15.580057144165039, + "learning_rate": 6.212e-06, + "loss": 0.6972, + "step": 37900 + }, + { + "epoch": 583.0769230769231, + "eval_loss": 1.128136157989502, + "eval_runtime": 11.3017, + "eval_samples_per_second": 11.591, + "eval_steps_per_second": 1.504, + "step": 37900 + }, + { + "epoch": 584.6153846153846, + "grad_norm": 16.714353561401367, + "learning_rate": 6.202e-06, + "loss": 0.7108, + "step": 38000 + }, + { + "epoch": 584.6153846153846, + "eval_loss": 1.1299370527267456, + "eval_runtime": 11.574, + "eval_samples_per_second": 11.318, + "eval_steps_per_second": 1.469, + "step": 38000 + }, + { + "epoch": 586.1538461538462, + "grad_norm": 33.85563659667969, + "learning_rate": 6.192100000000001e-06, + "loss": 0.7222, + "step": 38100 + }, + { + "epoch": 586.1538461538462, + "eval_loss": 1.1264841556549072, + "eval_runtime": 11.4782, + "eval_samples_per_second": 11.413, + "eval_steps_per_second": 1.481, + "step": 38100 + }, + { + "epoch": 587.6923076923077, + "grad_norm": 52.50895690917969, + "learning_rate": 6.1821000000000005e-06, + "loss": 0.7032, + "step": 38200 + }, + { + "epoch": 587.6923076923077, + "eval_loss": 1.1254533529281616, + "eval_runtime": 11.5258, + "eval_samples_per_second": 11.366, + "eval_steps_per_second": 1.475, + "step": 38200 + }, + { + "epoch": 589.2307692307693, + "grad_norm": 34.21299362182617, + "learning_rate": 6.172100000000001e-06, + "loss": 0.7135, + "step": 38300 + }, + { + "epoch": 589.2307692307693, + "eval_loss": 1.1204712390899658, + "eval_runtime": 11.4245, + "eval_samples_per_second": 11.467, + "eval_steps_per_second": 1.488, + "step": 38300 + }, + { + "epoch": 590.7692307692307, + "grad_norm": 32.890167236328125, + "learning_rate": 6.162100000000001e-06, + "loss": 0.7411, + "step": 38400 + }, + { + "epoch": 590.7692307692307, + "eval_loss": 1.137237310409546, + "eval_runtime": 11.5125, + "eval_samples_per_second": 11.379, + "eval_steps_per_second": 1.477, + "step": 38400 + }, + { + "epoch": 592.3076923076923, + "grad_norm": 24.825029373168945, + "learning_rate": 6.152100000000001e-06, + "loss": 0.706, + "step": 38500 + }, + { + "epoch": 592.3076923076923, + "eval_loss": 1.129481315612793, + "eval_runtime": 11.5369, + "eval_samples_per_second": 11.355, + "eval_steps_per_second": 1.474, + "step": 38500 + }, + { + "epoch": 593.8461538461538, + "grad_norm": 14.461865425109863, + "learning_rate": 6.142100000000001e-06, + "loss": 0.709, + "step": 38600 + }, + { + "epoch": 593.8461538461538, + "eval_loss": 1.1302515268325806, + "eval_runtime": 11.4045, + "eval_samples_per_second": 11.487, + "eval_steps_per_second": 1.491, + "step": 38600 + }, + { + "epoch": 595.3846153846154, + "grad_norm": 21.824010848999023, + "learning_rate": 6.132100000000001e-06, + "loss": 0.7236, + "step": 38700 + }, + { + "epoch": 595.3846153846154, + "eval_loss": 1.142196536064148, + "eval_runtime": 10.8855, + "eval_samples_per_second": 12.034, + "eval_steps_per_second": 1.562, + "step": 38700 + }, + { + "epoch": 596.9230769230769, + "grad_norm": 33.73760223388672, + "learning_rate": 6.122100000000001e-06, + "loss": 0.752, + "step": 38800 + }, + { + "epoch": 596.9230769230769, + "eval_loss": 1.124137282371521, + "eval_runtime": 10.8972, + "eval_samples_per_second": 12.021, + "eval_steps_per_second": 1.56, + "step": 38800 + }, + { + "epoch": 598.4615384615385, + "grad_norm": 31.676185607910156, + "learning_rate": 6.112100000000001e-06, + "loss": 0.6774, + "step": 38900 + }, + { + "epoch": 598.4615384615385, + "eval_loss": 1.1296617984771729, + "eval_runtime": 11.2135, + "eval_samples_per_second": 11.682, + "eval_steps_per_second": 1.516, + "step": 38900 + }, + { + "epoch": 600.0, + "grad_norm": 21.168643951416016, + "learning_rate": 6.1021e-06, + "loss": 0.7269, + "step": 39000 + }, + { + "epoch": 600.0, + "eval_loss": 1.1429485082626343, + "eval_runtime": 11.5449, + "eval_samples_per_second": 11.347, + "eval_steps_per_second": 1.473, + "step": 39000 + }, + { + "epoch": 601.5384615384615, + "grad_norm": 21.421810150146484, + "learning_rate": 6.0921000000000005e-06, + "loss": 0.7144, + "step": 39100 + }, + { + "epoch": 601.5384615384615, + "eval_loss": 1.1250011920928955, + "eval_runtime": 11.9713, + "eval_samples_per_second": 10.943, + "eval_steps_per_second": 1.42, + "step": 39100 + }, + { + "epoch": 603.0769230769231, + "grad_norm": 32.57196807861328, + "learning_rate": 6.082100000000001e-06, + "loss": 0.6937, + "step": 39200 + }, + { + "epoch": 603.0769230769231, + "eval_loss": 1.1387052536010742, + "eval_runtime": 11.5513, + "eval_samples_per_second": 11.341, + "eval_steps_per_second": 1.472, + "step": 39200 + }, + { + "epoch": 604.6153846153846, + "grad_norm": 59.047874450683594, + "learning_rate": 6.072100000000001e-06, + "loss": 0.7223, + "step": 39300 + }, + { + "epoch": 604.6153846153846, + "eval_loss": 1.1233646869659424, + "eval_runtime": 11.2014, + "eval_samples_per_second": 11.695, + "eval_steps_per_second": 1.518, + "step": 39300 + }, + { + "epoch": 606.1538461538462, + "grad_norm": 18.40199089050293, + "learning_rate": 6.062100000000001e-06, + "loss": 0.7123, + "step": 39400 + }, + { + "epoch": 606.1538461538462, + "eval_loss": 1.132296085357666, + "eval_runtime": 11.4455, + "eval_samples_per_second": 11.446, + "eval_steps_per_second": 1.485, + "step": 39400 + }, + { + "epoch": 607.6923076923077, + "grad_norm": 28.574697494506836, + "learning_rate": 6.052100000000001e-06, + "loss": 0.7026, + "step": 39500 + }, + { + "epoch": 607.6923076923077, + "eval_loss": 1.1176589727401733, + "eval_runtime": 11.6095, + "eval_samples_per_second": 11.284, + "eval_steps_per_second": 1.464, + "step": 39500 + }, + { + "epoch": 609.2307692307693, + "grad_norm": 59.04723358154297, + "learning_rate": 6.042100000000001e-06, + "loss": 0.7162, + "step": 39600 + }, + { + "epoch": 609.2307692307693, + "eval_loss": 1.135021686553955, + "eval_runtime": 11.5211, + "eval_samples_per_second": 11.37, + "eval_steps_per_second": 1.476, + "step": 39600 + }, + { + "epoch": 610.7692307692307, + "grad_norm": 18.17350959777832, + "learning_rate": 6.032100000000001e-06, + "loss": 0.6926, + "step": 39700 + }, + { + "epoch": 610.7692307692307, + "eval_loss": 1.112329363822937, + "eval_runtime": 11.5498, + "eval_samples_per_second": 11.342, + "eval_steps_per_second": 1.472, + "step": 39700 + }, + { + "epoch": 612.3076923076923, + "grad_norm": 22.2219181060791, + "learning_rate": 6.0221e-06, + "loss": 0.7189, + "step": 39800 + }, + { + "epoch": 612.3076923076923, + "eval_loss": 1.126987099647522, + "eval_runtime": 11.3368, + "eval_samples_per_second": 11.555, + "eval_steps_per_second": 1.5, + "step": 39800 + }, + { + "epoch": 613.8461538461538, + "grad_norm": 49.054046630859375, + "learning_rate": 6.0121000000000004e-06, + "loss": 0.7011, + "step": 39900 + }, + { + "epoch": 613.8461538461538, + "eval_loss": 1.1256166696548462, + "eval_runtime": 11.4961, + "eval_samples_per_second": 11.395, + "eval_steps_per_second": 1.479, + "step": 39900 + }, + { + "epoch": 615.3846153846154, + "grad_norm": 34.73727035522461, + "learning_rate": 6.0021000000000005e-06, + "loss": 0.6827, + "step": 40000 + }, + { + "epoch": 615.3846153846154, + "eval_loss": 1.1318682432174683, + "eval_runtime": 11.645, + "eval_samples_per_second": 11.249, + "eval_steps_per_second": 1.46, + "step": 40000 + }, + { + "epoch": 616.9230769230769, + "grad_norm": 25.531124114990234, + "learning_rate": 5.992100000000001e-06, + "loss": 0.7231, + "step": 40100 + }, + { + "epoch": 616.9230769230769, + "eval_loss": 1.1109651327133179, + "eval_runtime": 11.5709, + "eval_samples_per_second": 11.321, + "eval_steps_per_second": 1.469, + "step": 40100 + }, + { + "epoch": 618.4615384615385, + "grad_norm": 20.263084411621094, + "learning_rate": 5.982100000000001e-06, + "loss": 0.7102, + "step": 40200 + }, + { + "epoch": 618.4615384615385, + "eval_loss": 1.1411464214324951, + "eval_runtime": 11.2059, + "eval_samples_per_second": 11.69, + "eval_steps_per_second": 1.517, + "step": 40200 + }, + { + "epoch": 620.0, + "grad_norm": 18.807043075561523, + "learning_rate": 5.972100000000001e-06, + "loss": 0.7086, + "step": 40300 + }, + { + "epoch": 620.0, + "eval_loss": 1.1142805814743042, + "eval_runtime": 11.4638, + "eval_samples_per_second": 11.427, + "eval_steps_per_second": 1.483, + "step": 40300 + }, + { + "epoch": 621.5384615384615, + "grad_norm": 20.680356979370117, + "learning_rate": 5.962100000000001e-06, + "loss": 0.7092, + "step": 40400 + }, + { + "epoch": 621.5384615384615, + "eval_loss": 1.120303988456726, + "eval_runtime": 10.8454, + "eval_samples_per_second": 12.079, + "eval_steps_per_second": 1.567, + "step": 40400 + }, + { + "epoch": 623.0769230769231, + "grad_norm": 31.14982032775879, + "learning_rate": 5.952100000000001e-06, + "loss": 0.6827, + "step": 40500 + }, + { + "epoch": 623.0769230769231, + "eval_loss": 1.1231149435043335, + "eval_runtime": 10.9253, + "eval_samples_per_second": 11.991, + "eval_steps_per_second": 1.556, + "step": 40500 + }, + { + "epoch": 624.6153846153846, + "grad_norm": 27.589797973632812, + "learning_rate": 5.9421e-06, + "loss": 0.7191, + "step": 40600 + }, + { + "epoch": 624.6153846153846, + "eval_loss": 1.1205034255981445, + "eval_runtime": 10.9176, + "eval_samples_per_second": 11.999, + "eval_steps_per_second": 1.557, + "step": 40600 + }, + { + "epoch": 626.1538461538462, + "grad_norm": 45.19071578979492, + "learning_rate": 5.9321e-06, + "loss": 0.6888, + "step": 40700 + }, + { + "epoch": 626.1538461538462, + "eval_loss": 1.1391079425811768, + "eval_runtime": 11.5943, + "eval_samples_per_second": 11.299, + "eval_steps_per_second": 1.466, + "step": 40700 + }, + { + "epoch": 627.6923076923077, + "grad_norm": 59.510799407958984, + "learning_rate": 5.9221000000000004e-06, + "loss": 0.7031, + "step": 40800 + }, + { + "epoch": 627.6923076923077, + "eval_loss": 1.1189318895339966, + "eval_runtime": 11.5906, + "eval_samples_per_second": 11.302, + "eval_steps_per_second": 1.467, + "step": 40800 + }, + { + "epoch": 629.2307692307693, + "grad_norm": 52.27006530761719, + "learning_rate": 5.9121000000000005e-06, + "loss": 0.6999, + "step": 40900 + }, + { + "epoch": 629.2307692307693, + "eval_loss": 1.1334617137908936, + "eval_runtime": 11.4792, + "eval_samples_per_second": 11.412, + "eval_steps_per_second": 1.481, + "step": 40900 + }, + { + "epoch": 630.7692307692307, + "grad_norm": 44.54253387451172, + "learning_rate": 5.902100000000001e-06, + "loss": 0.6967, + "step": 41000 + }, + { + "epoch": 630.7692307692307, + "eval_loss": 1.1367884874343872, + "eval_runtime": 11.6102, + "eval_samples_per_second": 11.283, + "eval_steps_per_second": 1.464, + "step": 41000 + }, + { + "epoch": 632.3076923076923, + "grad_norm": 24.941390991210938, + "learning_rate": 5.892100000000001e-06, + "loss": 0.6991, + "step": 41100 + }, + { + "epoch": 632.3076923076923, + "eval_loss": 1.1169872283935547, + "eval_runtime": 11.19, + "eval_samples_per_second": 11.707, + "eval_steps_per_second": 1.519, + "step": 41100 + }, + { + "epoch": 633.8461538461538, + "grad_norm": 19.966068267822266, + "learning_rate": 5.882100000000001e-06, + "loss": 0.7061, + "step": 41200 + }, + { + "epoch": 633.8461538461538, + "eval_loss": 1.1281603574752808, + "eval_runtime": 11.5586, + "eval_samples_per_second": 11.334, + "eval_steps_per_second": 1.471, + "step": 41200 + }, + { + "epoch": 635.3846153846154, + "grad_norm": 40.40338134765625, + "learning_rate": 5.872100000000001e-06, + "loss": 0.6997, + "step": 41300 + }, + { + "epoch": 635.3846153846154, + "eval_loss": 1.1226210594177246, + "eval_runtime": 11.5826, + "eval_samples_per_second": 11.31, + "eval_steps_per_second": 1.468, + "step": 41300 + }, + { + "epoch": 636.9230769230769, + "grad_norm": 24.697601318359375, + "learning_rate": 5.8622e-06, + "loss": 0.7009, + "step": 41400 + }, + { + "epoch": 636.9230769230769, + "eval_loss": 1.1038686037063599, + "eval_runtime": 11.5099, + "eval_samples_per_second": 11.382, + "eval_steps_per_second": 1.477, + "step": 41400 + }, + { + "epoch": 638.4615384615385, + "grad_norm": 17.308862686157227, + "learning_rate": 5.8522000000000005e-06, + "loss": 0.6924, + "step": 41500 + }, + { + "epoch": 638.4615384615385, + "eval_loss": 1.1147842407226562, + "eval_runtime": 11.2674, + "eval_samples_per_second": 11.627, + "eval_steps_per_second": 1.509, + "step": 41500 + }, + { + "epoch": 640.0, + "grad_norm": 38.33342742919922, + "learning_rate": 5.8422e-06, + "loss": 0.7187, + "step": 41600 + }, + { + "epoch": 640.0, + "eval_loss": 1.1148661375045776, + "eval_runtime": 11.5429, + "eval_samples_per_second": 11.349, + "eval_steps_per_second": 1.473, + "step": 41600 + }, + { + "epoch": 641.5384615384615, + "grad_norm": 52.12275314331055, + "learning_rate": 5.8322e-06, + "loss": 0.6747, + "step": 41700 + }, + { + "epoch": 641.5384615384615, + "eval_loss": 1.117422342300415, + "eval_runtime": 11.5416, + "eval_samples_per_second": 11.35, + "eval_steps_per_second": 1.473, + "step": 41700 + }, + { + "epoch": 643.0769230769231, + "grad_norm": 44.06056594848633, + "learning_rate": 5.8222e-06, + "loss": 0.7143, + "step": 41800 + }, + { + "epoch": 643.0769230769231, + "eval_loss": 1.1139775514602661, + "eval_runtime": 11.4951, + "eval_samples_per_second": 11.396, + "eval_steps_per_second": 1.479, + "step": 41800 + }, + { + "epoch": 644.6153846153846, + "grad_norm": 31.470233917236328, + "learning_rate": 5.8122e-06, + "loss": 0.6854, + "step": 41900 + }, + { + "epoch": 644.6153846153846, + "eval_loss": 1.1292999982833862, + "eval_runtime": 11.5495, + "eval_samples_per_second": 11.342, + "eval_steps_per_second": 1.472, + "step": 41900 + }, + { + "epoch": 646.1538461538462, + "grad_norm": 28.07218360900879, + "learning_rate": 5.8022e-06, + "loss": 0.6973, + "step": 42000 + }, + { + "epoch": 646.1538461538462, + "eval_loss": 1.1061806678771973, + "eval_runtime": 11.342, + "eval_samples_per_second": 11.55, + "eval_steps_per_second": 1.499, + "step": 42000 + }, + { + "epoch": 647.6923076923077, + "grad_norm": 25.022056579589844, + "learning_rate": 5.7922e-06, + "loss": 0.6769, + "step": 42100 + }, + { + "epoch": 647.6923076923077, + "eval_loss": 1.1117756366729736, + "eval_runtime": 11.6908, + "eval_samples_per_second": 11.205, + "eval_steps_per_second": 1.454, + "step": 42100 + }, + { + "epoch": 649.2307692307693, + "grad_norm": 25.953819274902344, + "learning_rate": 5.7822e-06, + "loss": 0.6966, + "step": 42200 + }, + { + "epoch": 649.2307692307693, + "eval_loss": 1.1325063705444336, + "eval_runtime": 11.589, + "eval_samples_per_second": 11.304, + "eval_steps_per_second": 1.467, + "step": 42200 + }, + { + "epoch": 650.7692307692307, + "grad_norm": 73.43199157714844, + "learning_rate": 5.7722e-06, + "loss": 0.7022, + "step": 42300 + }, + { + "epoch": 650.7692307692307, + "eval_loss": 1.1291977167129517, + "eval_runtime": 11.4842, + "eval_samples_per_second": 11.407, + "eval_steps_per_second": 1.48, + "step": 42300 + }, + { + "epoch": 652.3076923076923, + "grad_norm": 57.62006759643555, + "learning_rate": 5.7622e-06, + "loss": 0.7047, + "step": 42400 + }, + { + "epoch": 652.3076923076923, + "eval_loss": 1.13730788230896, + "eval_runtime": 11.479, + "eval_samples_per_second": 11.412, + "eval_steps_per_second": 1.481, + "step": 42400 + }, + { + "epoch": 653.8461538461538, + "grad_norm": 27.652311325073242, + "learning_rate": 5.7522e-06, + "loss": 0.6791, + "step": 42500 + }, + { + "epoch": 653.8461538461538, + "eval_loss": 1.1305973529815674, + "eval_runtime": 11.1409, + "eval_samples_per_second": 11.759, + "eval_steps_per_second": 1.526, + "step": 42500 + }, + { + "epoch": 655.3846153846154, + "grad_norm": 19.134410858154297, + "learning_rate": 5.7422e-06, + "loss": 0.6775, + "step": 42600 + }, + { + "epoch": 655.3846153846154, + "eval_loss": 1.1186637878417969, + "eval_runtime": 11.5053, + "eval_samples_per_second": 11.386, + "eval_steps_per_second": 1.478, + "step": 42600 + }, + { + "epoch": 656.9230769230769, + "grad_norm": 22.742666244506836, + "learning_rate": 5.7322e-06, + "loss": 0.6879, + "step": 42700 + }, + { + "epoch": 656.9230769230769, + "eval_loss": 1.1315637826919556, + "eval_runtime": 11.5431, + "eval_samples_per_second": 11.349, + "eval_steps_per_second": 1.473, + "step": 42700 + }, + { + "epoch": 658.4615384615385, + "grad_norm": 66.6679916381836, + "learning_rate": 5.7222e-06, + "loss": 0.6815, + "step": 42800 + }, + { + "epoch": 658.4615384615385, + "eval_loss": 1.1492441892623901, + "eval_runtime": 11.3975, + "eval_samples_per_second": 11.494, + "eval_steps_per_second": 1.492, + "step": 42800 + }, + { + "epoch": 660.0, + "grad_norm": 23.880626678466797, + "learning_rate": 5.7122e-06, + "loss": 0.6819, + "step": 42900 + }, + { + "epoch": 660.0, + "eval_loss": 1.1444504261016846, + "eval_runtime": 11.1352, + "eval_samples_per_second": 11.764, + "eval_steps_per_second": 1.527, + "step": 42900 + }, + { + "epoch": 661.5384615384615, + "grad_norm": 23.52161407470703, + "learning_rate": 5.7022e-06, + "loss": 0.6937, + "step": 43000 + }, + { + "epoch": 661.5384615384615, + "eval_loss": 1.1386537551879883, + "eval_runtime": 11.4648, + "eval_samples_per_second": 11.426, + "eval_steps_per_second": 1.483, + "step": 43000 + }, + { + "epoch": 663.0769230769231, + "grad_norm": 20.599346160888672, + "learning_rate": 5.6922e-06, + "loss": 0.6806, + "step": 43100 + }, + { + "epoch": 663.0769230769231, + "eval_loss": 1.174161672592163, + "eval_runtime": 11.4594, + "eval_samples_per_second": 11.432, + "eval_steps_per_second": 1.483, + "step": 43100 + }, + { + "epoch": 664.6153846153846, + "grad_norm": 51.909767150878906, + "learning_rate": 5.6821999999999996e-06, + "loss": 0.6741, + "step": 43200 + }, + { + "epoch": 664.6153846153846, + "eval_loss": 1.1519333124160767, + "eval_runtime": 11.3771, + "eval_samples_per_second": 11.514, + "eval_steps_per_second": 1.494, + "step": 43200 + }, + { + "epoch": 666.1538461538462, + "grad_norm": 25.982128143310547, + "learning_rate": 5.6722e-06, + "loss": 0.7022, + "step": 43300 + }, + { + "epoch": 666.1538461538462, + "eval_loss": 1.1483289003372192, + "eval_runtime": 11.4064, + "eval_samples_per_second": 11.485, + "eval_steps_per_second": 1.49, + "step": 43300 + }, + { + "epoch": 667.6923076923077, + "grad_norm": 43.9743537902832, + "learning_rate": 5.6622e-06, + "loss": 0.6835, + "step": 43400 + }, + { + "epoch": 667.6923076923077, + "eval_loss": 1.143465280532837, + "eval_runtime": 11.3581, + "eval_samples_per_second": 11.534, + "eval_steps_per_second": 1.497, + "step": 43400 + }, + { + "epoch": 669.2307692307693, + "grad_norm": 37.34025573730469, + "learning_rate": 5.6522e-06, + "loss": 0.6687, + "step": 43500 + }, + { + "epoch": 669.2307692307693, + "eval_loss": 1.154449224472046, + "eval_runtime": 11.5977, + "eval_samples_per_second": 11.295, + "eval_steps_per_second": 1.466, + "step": 43500 + }, + { + "epoch": 670.7692307692307, + "grad_norm": 34.556060791015625, + "learning_rate": 5.642200000000001e-06, + "loss": 0.6967, + "step": 43600 + }, + { + "epoch": 670.7692307692307, + "eval_loss": 1.149709701538086, + "eval_runtime": 11.5679, + "eval_samples_per_second": 11.324, + "eval_steps_per_second": 1.47, + "step": 43600 + }, + { + "epoch": 672.3076923076923, + "grad_norm": 38.00693130493164, + "learning_rate": 5.632200000000001e-06, + "loss": 0.6883, + "step": 43700 + }, + { + "epoch": 672.3076923076923, + "eval_loss": 1.1413865089416504, + "eval_runtime": 11.521, + "eval_samples_per_second": 11.371, + "eval_steps_per_second": 1.476, + "step": 43700 + }, + { + "epoch": 673.8461538461538, + "grad_norm": 43.81779098510742, + "learning_rate": 5.622200000000001e-06, + "loss": 0.6775, + "step": 43800 + }, + { + "epoch": 673.8461538461538, + "eval_loss": 1.1503146886825562, + "eval_runtime": 11.3151, + "eval_samples_per_second": 11.577, + "eval_steps_per_second": 1.502, + "step": 43800 + }, + { + "epoch": 675.3846153846154, + "grad_norm": 35.310508728027344, + "learning_rate": 5.6123000000000005e-06, + "loss": 0.6847, + "step": 43900 + }, + { + "epoch": 675.3846153846154, + "eval_loss": 1.1309814453125, + "eval_runtime": 11.4477, + "eval_samples_per_second": 11.443, + "eval_steps_per_second": 1.485, + "step": 43900 + }, + { + "epoch": 676.9230769230769, + "grad_norm": 24.62909507751465, + "learning_rate": 5.6023000000000006e-06, + "loss": 0.6682, + "step": 44000 + }, + { + "epoch": 676.9230769230769, + "eval_loss": 1.1286953687667847, + "eval_runtime": 11.022, + "eval_samples_per_second": 11.885, + "eval_steps_per_second": 1.542, + "step": 44000 + }, + { + "epoch": 678.4615384615385, + "grad_norm": 22.765432357788086, + "learning_rate": 5.592300000000001e-06, + "loss": 0.6584, + "step": 44100 + }, + { + "epoch": 678.4615384615385, + "eval_loss": 1.1532663106918335, + "eval_runtime": 10.8616, + "eval_samples_per_second": 12.061, + "eval_steps_per_second": 1.565, + "step": 44100 + }, + { + "epoch": 680.0, + "grad_norm": 52.021812438964844, + "learning_rate": 5.582300000000001e-06, + "loss": 0.6913, + "step": 44200 + }, + { + "epoch": 680.0, + "eval_loss": 1.1283496618270874, + "eval_runtime": 10.9199, + "eval_samples_per_second": 11.996, + "eval_steps_per_second": 1.557, + "step": 44200 + }, + { + "epoch": 681.5384615384615, + "grad_norm": 32.32806396484375, + "learning_rate": 5.572300000000001e-06, + "loss": 0.6676, + "step": 44300 + }, + { + "epoch": 681.5384615384615, + "eval_loss": 1.1262216567993164, + "eval_runtime": 10.9862, + "eval_samples_per_second": 11.924, + "eval_steps_per_second": 1.547, + "step": 44300 + }, + { + "epoch": 683.0769230769231, + "grad_norm": 22.324115753173828, + "learning_rate": 5.5623e-06, + "loss": 0.694, + "step": 44400 + }, + { + "epoch": 683.0769230769231, + "eval_loss": 1.137344241142273, + "eval_runtime": 11.2107, + "eval_samples_per_second": 11.685, + "eval_steps_per_second": 1.516, + "step": 44400 + }, + { + "epoch": 684.6153846153846, + "grad_norm": 38.57556915283203, + "learning_rate": 5.5523e-06, + "loss": 0.6734, + "step": 44500 + }, + { + "epoch": 684.6153846153846, + "eval_loss": 1.1320441961288452, + "eval_runtime": 11.6749, + "eval_samples_per_second": 11.221, + "eval_steps_per_second": 1.456, + "step": 44500 + }, + { + "epoch": 686.1538461538462, + "grad_norm": 15.991618156433105, + "learning_rate": 5.5423e-06, + "loss": 0.6962, + "step": 44600 + }, + { + "epoch": 686.1538461538462, + "eval_loss": 1.1225178241729736, + "eval_runtime": 11.4874, + "eval_samples_per_second": 11.404, + "eval_steps_per_second": 1.48, + "step": 44600 + }, + { + "epoch": 687.6923076923077, + "grad_norm": 19.2874755859375, + "learning_rate": 5.5323e-06, + "loss": 0.6702, + "step": 44700 + }, + { + "epoch": 687.6923076923077, + "eval_loss": 1.147964596748352, + "eval_runtime": 11.5273, + "eval_samples_per_second": 11.364, + "eval_steps_per_second": 1.475, + "step": 44700 + }, + { + "epoch": 689.2307692307693, + "grad_norm": 46.83424377441406, + "learning_rate": 5.5223000000000005e-06, + "loss": 0.6975, + "step": 44800 + }, + { + "epoch": 689.2307692307693, + "eval_loss": 1.1342499256134033, + "eval_runtime": 11.614, + "eval_samples_per_second": 11.279, + "eval_steps_per_second": 1.464, + "step": 44800 + }, + { + "epoch": 690.7692307692307, + "grad_norm": 64.3833999633789, + "learning_rate": 5.5123000000000006e-06, + "loss": 0.6782, + "step": 44900 + }, + { + "epoch": 690.7692307692307, + "eval_loss": 1.1242735385894775, + "eval_runtime": 11.6467, + "eval_samples_per_second": 11.248, + "eval_steps_per_second": 1.46, + "step": 44900 + }, + { + "epoch": 692.3076923076923, + "grad_norm": 19.196340560913086, + "learning_rate": 5.502300000000001e-06, + "loss": 0.6636, + "step": 45000 + }, + { + "epoch": 692.3076923076923, + "eval_loss": 1.1473920345306396, + "eval_runtime": 11.4762, + "eval_samples_per_second": 11.415, + "eval_steps_per_second": 1.481, + "step": 45000 + }, + { + "epoch": 693.8461538461538, + "grad_norm": 20.438758850097656, + "learning_rate": 5.492300000000001e-06, + "loss": 0.696, + "step": 45100 + }, + { + "epoch": 693.8461538461538, + "eval_loss": 1.1358587741851807, + "eval_runtime": 11.5586, + "eval_samples_per_second": 11.334, + "eval_steps_per_second": 1.471, + "step": 45100 + }, + { + "epoch": 695.3846153846154, + "grad_norm": 33.10650634765625, + "learning_rate": 5.4823e-06, + "loss": 0.6684, + "step": 45200 + }, + { + "epoch": 695.3846153846154, + "eval_loss": 1.1217955350875854, + "eval_runtime": 11.7819, + "eval_samples_per_second": 11.119, + "eval_steps_per_second": 1.443, + "step": 45200 + }, + { + "epoch": 696.9230769230769, + "grad_norm": 33.91626739501953, + "learning_rate": 5.4723e-06, + "loss": 0.6735, + "step": 45300 + }, + { + "epoch": 696.9230769230769, + "eval_loss": 1.1355735063552856, + "eval_runtime": 11.1909, + "eval_samples_per_second": 11.706, + "eval_steps_per_second": 1.519, + "step": 45300 + }, + { + "epoch": 698.4615384615385, + "grad_norm": 28.755367279052734, + "learning_rate": 5.4623e-06, + "loss": 0.6963, + "step": 45400 + }, + { + "epoch": 698.4615384615385, + "eval_loss": 1.149074673652649, + "eval_runtime": 11.5587, + "eval_samples_per_second": 11.333, + "eval_steps_per_second": 1.471, + "step": 45400 + }, + { + "epoch": 700.0, + "grad_norm": 19.224637985229492, + "learning_rate": 5.4523e-06, + "loss": 0.6585, + "step": 45500 + }, + { + "epoch": 700.0, + "eval_loss": 1.1453114748001099, + "eval_runtime": 11.4484, + "eval_samples_per_second": 11.443, + "eval_steps_per_second": 1.485, + "step": 45500 + }, + { + "epoch": 701.5384615384615, + "grad_norm": 51.07728576660156, + "learning_rate": 5.4423e-06, + "loss": 0.6804, + "step": 45600 + }, + { + "epoch": 701.5384615384615, + "eval_loss": 1.1507929563522339, + "eval_runtime": 11.5868, + "eval_samples_per_second": 11.306, + "eval_steps_per_second": 1.467, + "step": 45600 + }, + { + "epoch": 703.0769230769231, + "grad_norm": 41.6357421875, + "learning_rate": 5.4323000000000005e-06, + "loss": 0.6532, + "step": 45700 + }, + { + "epoch": 703.0769230769231, + "eval_loss": 1.1559633016586304, + "eval_runtime": 11.6179, + "eval_samples_per_second": 11.276, + "eval_steps_per_second": 1.463, + "step": 45700 + }, + { + "epoch": 704.6153846153846, + "grad_norm": 35.78409957885742, + "learning_rate": 5.422300000000001e-06, + "loss": 0.6813, + "step": 45800 + }, + { + "epoch": 704.6153846153846, + "eval_loss": 1.1355921030044556, + "eval_runtime": 11.6385, + "eval_samples_per_second": 11.256, + "eval_steps_per_second": 1.461, + "step": 45800 + }, + { + "epoch": 706.1538461538462, + "grad_norm": 27.35905647277832, + "learning_rate": 5.412400000000001e-06, + "loss": 0.6771, + "step": 45900 + }, + { + "epoch": 706.1538461538462, + "eval_loss": 1.131105661392212, + "eval_runtime": 11.5313, + "eval_samples_per_second": 11.36, + "eval_steps_per_second": 1.474, + "step": 45900 + }, + { + "epoch": 707.6923076923077, + "grad_norm": 48.791236877441406, + "learning_rate": 5.402400000000001e-06, + "loss": 0.6712, + "step": 46000 + }, + { + "epoch": 707.6923076923077, + "eval_loss": 1.1603729724884033, + "eval_runtime": 11.4998, + "eval_samples_per_second": 11.392, + "eval_steps_per_second": 1.478, + "step": 46000 + }, + { + "epoch": 709.2307692307693, + "grad_norm": 31.62120819091797, + "learning_rate": 5.392400000000001e-06, + "loss": 0.6679, + "step": 46100 + }, + { + "epoch": 709.2307692307693, + "eval_loss": 1.1280769109725952, + "eval_runtime": 11.3233, + "eval_samples_per_second": 11.569, + "eval_steps_per_second": 1.501, + "step": 46100 + }, + { + "epoch": 710.7692307692307, + "grad_norm": 31.49280548095703, + "learning_rate": 5.382400000000001e-06, + "loss": 0.6842, + "step": 46200 + }, + { + "epoch": 710.7692307692307, + "eval_loss": 1.1522812843322754, + "eval_runtime": 11.14, + "eval_samples_per_second": 11.759, + "eval_steps_per_second": 1.526, + "step": 46200 + }, + { + "epoch": 712.3076923076923, + "grad_norm": 50.18932342529297, + "learning_rate": 5.372400000000001e-06, + "loss": 0.6544, + "step": 46300 + }, + { + "epoch": 712.3076923076923, + "eval_loss": 1.151581048965454, + "eval_runtime": 11.3329, + "eval_samples_per_second": 11.559, + "eval_steps_per_second": 1.5, + "step": 46300 + }, + { + "epoch": 713.8461538461538, + "grad_norm": 59.33495330810547, + "learning_rate": 5.3624000000000005e-06, + "loss": 0.6509, + "step": 46400 + }, + { + "epoch": 713.8461538461538, + "eval_loss": 1.153143286705017, + "eval_runtime": 11.5264, + "eval_samples_per_second": 11.365, + "eval_steps_per_second": 1.475, + "step": 46400 + }, + { + "epoch": 715.3846153846154, + "grad_norm": 41.886268615722656, + "learning_rate": 5.352400000000001e-06, + "loss": 0.6631, + "step": 46500 + }, + { + "epoch": 715.3846153846154, + "eval_loss": 1.1551434993743896, + "eval_runtime": 11.5123, + "eval_samples_per_second": 11.379, + "eval_steps_per_second": 1.477, + "step": 46500 + }, + { + "epoch": 716.9230769230769, + "grad_norm": 31.274044036865234, + "learning_rate": 5.342400000000001e-06, + "loss": 0.6717, + "step": 46600 + }, + { + "epoch": 716.9230769230769, + "eval_loss": 1.1296733617782593, + "eval_runtime": 11.5569, + "eval_samples_per_second": 11.335, + "eval_steps_per_second": 1.471, + "step": 46600 + }, + { + "epoch": 718.4615384615385, + "grad_norm": 58.7375602722168, + "learning_rate": 5.332400000000001e-06, + "loss": 0.6596, + "step": 46700 + }, + { + "epoch": 718.4615384615385, + "eval_loss": 1.1503448486328125, + "eval_runtime": 11.6542, + "eval_samples_per_second": 11.241, + "eval_steps_per_second": 1.459, + "step": 46700 + }, + { + "epoch": 720.0, + "grad_norm": 22.866985321044922, + "learning_rate": 5.322400000000001e-06, + "loss": 0.6797, + "step": 46800 + }, + { + "epoch": 720.0, + "eval_loss": 1.178478717803955, + "eval_runtime": 11.4836, + "eval_samples_per_second": 11.408, + "eval_steps_per_second": 1.48, + "step": 46800 + }, + { + "epoch": 721.5384615384615, + "grad_norm": 62.562984466552734, + "learning_rate": 5.312400000000001e-06, + "loss": 0.6593, + "step": 46900 + }, + { + "epoch": 721.5384615384615, + "eval_loss": 1.1387419700622559, + "eval_runtime": 11.5067, + "eval_samples_per_second": 11.385, + "eval_steps_per_second": 1.477, + "step": 46900 + }, + { + "epoch": 723.0769230769231, + "grad_norm": 27.74352264404297, + "learning_rate": 5.302400000000001e-06, + "loss": 0.6789, + "step": 47000 + }, + { + "epoch": 723.0769230769231, + "eval_loss": 1.1372648477554321, + "eval_runtime": 11.4814, + "eval_samples_per_second": 11.41, + "eval_steps_per_second": 1.481, + "step": 47000 + }, + { + "epoch": 724.6153846153846, + "grad_norm": 29.324443817138672, + "learning_rate": 5.292400000000001e-06, + "loss": 0.662, + "step": 47100 + }, + { + "epoch": 724.6153846153846, + "eval_loss": 1.140125036239624, + "eval_runtime": 11.2574, + "eval_samples_per_second": 11.637, + "eval_steps_per_second": 1.51, + "step": 47100 + }, + { + "epoch": 726.1538461538462, + "grad_norm": 30.845813751220703, + "learning_rate": 5.2824000000000004e-06, + "loss": 0.6745, + "step": 47200 + }, + { + "epoch": 726.1538461538462, + "eval_loss": 1.1335151195526123, + "eval_runtime": 11.3546, + "eval_samples_per_second": 11.537, + "eval_steps_per_second": 1.497, + "step": 47200 + }, + { + "epoch": 727.6923076923077, + "grad_norm": 37.0521240234375, + "learning_rate": 5.2724000000000005e-06, + "loss": 0.6612, + "step": 47300 + }, + { + "epoch": 727.6923076923077, + "eval_loss": 1.1420923471450806, + "eval_runtime": 11.619, + "eval_samples_per_second": 11.275, + "eval_steps_per_second": 1.463, + "step": 47300 + }, + { + "epoch": 729.2307692307693, + "grad_norm": 32.282073974609375, + "learning_rate": 5.262400000000001e-06, + "loss": 0.6324, + "step": 47400 + }, + { + "epoch": 729.2307692307693, + "eval_loss": 1.147072434425354, + "eval_runtime": 11.5244, + "eval_samples_per_second": 11.367, + "eval_steps_per_second": 1.475, + "step": 47400 + }, + { + "epoch": 730.7692307692307, + "grad_norm": 34.776607513427734, + "learning_rate": 5.252400000000001e-06, + "loss": 0.6573, + "step": 47500 + }, + { + "epoch": 730.7692307692307, + "eval_loss": 1.1505202054977417, + "eval_runtime": 11.6157, + "eval_samples_per_second": 11.278, + "eval_steps_per_second": 1.464, + "step": 47500 + }, + { + "epoch": 732.3076923076923, + "grad_norm": 18.59247398376465, + "learning_rate": 5.242400000000001e-06, + "loss": 0.6663, + "step": 47600 + }, + { + "epoch": 732.3076923076923, + "eval_loss": 1.16086745262146, + "eval_runtime": 11.612, + "eval_samples_per_second": 11.281, + "eval_steps_per_second": 1.464, + "step": 47600 + }, + { + "epoch": 733.8461538461538, + "grad_norm": 47.67042922973633, + "learning_rate": 5.232400000000001e-06, + "loss": 0.6562, + "step": 47700 + }, + { + "epoch": 733.8461538461538, + "eval_loss": 1.1352193355560303, + "eval_runtime": 11.4985, + "eval_samples_per_second": 11.393, + "eval_steps_per_second": 1.478, + "step": 47700 + }, + { + "epoch": 735.3846153846154, + "grad_norm": 28.442916870117188, + "learning_rate": 5.222400000000001e-06, + "loss": 0.6592, + "step": 47800 + }, + { + "epoch": 735.3846153846154, + "eval_loss": 1.1500188112258911, + "eval_runtime": 11.4697, + "eval_samples_per_second": 11.421, + "eval_steps_per_second": 1.482, + "step": 47800 + }, + { + "epoch": 736.9230769230769, + "grad_norm": 27.579147338867188, + "learning_rate": 5.212400000000001e-06, + "loss": 0.6789, + "step": 47900 + }, + { + "epoch": 736.9230769230769, + "eval_loss": 1.1632080078125, + "eval_runtime": 11.5537, + "eval_samples_per_second": 11.338, + "eval_steps_per_second": 1.471, + "step": 47900 + }, + { + "epoch": 738.4615384615385, + "grad_norm": 28.332275390625, + "learning_rate": 5.2024e-06, + "loss": 0.6687, + "step": 48000 + }, + { + "epoch": 738.4615384615385, + "eval_loss": 1.1715619564056396, + "eval_runtime": 11.3148, + "eval_samples_per_second": 11.578, + "eval_steps_per_second": 1.502, + "step": 48000 + }, + { + "epoch": 740.0, + "grad_norm": 26.800214767456055, + "learning_rate": 5.1924000000000005e-06, + "loss": 0.6697, + "step": 48100 + }, + { + "epoch": 740.0, + "eval_loss": 1.130355954170227, + "eval_runtime": 11.3571, + "eval_samples_per_second": 11.535, + "eval_steps_per_second": 1.497, + "step": 48100 + }, + { + "epoch": 741.5384615384615, + "grad_norm": 44.184932708740234, + "learning_rate": 5.1824000000000006e-06, + "loss": 0.6692, + "step": 48200 + }, + { + "epoch": 741.5384615384615, + "eval_loss": 1.160269021987915, + "eval_runtime": 10.9507, + "eval_samples_per_second": 11.963, + "eval_steps_per_second": 1.552, + "step": 48200 + }, + { + "epoch": 743.0769230769231, + "grad_norm": 23.32909393310547, + "learning_rate": 5.172400000000001e-06, + "loss": 0.6687, + "step": 48300 + }, + { + "epoch": 743.0769230769231, + "eval_loss": 1.1364867687225342, + "eval_runtime": 10.9178, + "eval_samples_per_second": 11.999, + "eval_steps_per_second": 1.557, + "step": 48300 + }, + { + "epoch": 744.6153846153846, + "grad_norm": 30.51333236694336, + "learning_rate": 5.162400000000001e-06, + "loss": 0.6572, + "step": 48400 + }, + { + "epoch": 744.6153846153846, + "eval_loss": 1.165590763092041, + "eval_runtime": 10.9057, + "eval_samples_per_second": 12.012, + "eval_steps_per_second": 1.559, + "step": 48400 + }, + { + "epoch": 746.1538461538462, + "grad_norm": 56.37994384765625, + "learning_rate": 5.152400000000001e-06, + "loss": 0.6391, + "step": 48500 + }, + { + "epoch": 746.1538461538462, + "eval_loss": 1.1541125774383545, + "eval_runtime": 10.8659, + "eval_samples_per_second": 12.056, + "eval_steps_per_second": 1.565, + "step": 48500 + }, + { + "epoch": 747.6923076923077, + "grad_norm": 41.9628791809082, + "learning_rate": 5.142400000000001e-06, + "loss": 0.6553, + "step": 48600 + }, + { + "epoch": 747.6923076923077, + "eval_loss": 1.1572431325912476, + "eval_runtime": 10.9145, + "eval_samples_per_second": 12.002, + "eval_steps_per_second": 1.558, + "step": 48600 + }, + { + "epoch": 749.2307692307693, + "grad_norm": 16.727279663085938, + "learning_rate": 5.132400000000001e-06, + "loss": 0.6595, + "step": 48700 + }, + { + "epoch": 749.2307692307693, + "eval_loss": 1.144070029258728, + "eval_runtime": 10.9278, + "eval_samples_per_second": 11.988, + "eval_steps_per_second": 1.556, + "step": 48700 + }, + { + "epoch": 750.7692307692307, + "grad_norm": 24.255083084106445, + "learning_rate": 5.1224e-06, + "loss": 0.6481, + "step": 48800 + }, + { + "epoch": 750.7692307692307, + "eval_loss": 1.1574848890304565, + "eval_runtime": 10.8752, + "eval_samples_per_second": 12.046, + "eval_steps_per_second": 1.563, + "step": 48800 + }, + { + "epoch": 752.3076923076923, + "grad_norm": 32.8690071105957, + "learning_rate": 5.1124e-06, + "loss": 0.6521, + "step": 48900 + }, + { + "epoch": 752.3076923076923, + "eval_loss": 1.1481850147247314, + "eval_runtime": 10.8774, + "eval_samples_per_second": 12.043, + "eval_steps_per_second": 1.563, + "step": 48900 + }, + { + "epoch": 753.8461538461538, + "grad_norm": 18.540687561035156, + "learning_rate": 5.1024000000000005e-06, + "loss": 0.6398, + "step": 49000 + }, + { + "epoch": 753.8461538461538, + "eval_loss": 1.14930260181427, + "eval_runtime": 10.9492, + "eval_samples_per_second": 11.964, + "eval_steps_per_second": 1.553, + "step": 49000 + }, + { + "epoch": 755.3846153846154, + "grad_norm": 54.92947769165039, + "learning_rate": 5.0924000000000006e-06, + "loss": 0.6682, + "step": 49100 + }, + { + "epoch": 755.3846153846154, + "eval_loss": 1.1369296312332153, + "eval_runtime": 10.9454, + "eval_samples_per_second": 11.968, + "eval_steps_per_second": 1.553, + "step": 49100 + }, + { + "epoch": 756.9230769230769, + "grad_norm": 53.52052307128906, + "learning_rate": 5.082400000000001e-06, + "loss": 0.6585, + "step": 49200 + }, + { + "epoch": 756.9230769230769, + "eval_loss": 1.1473652124404907, + "eval_runtime": 10.8826, + "eval_samples_per_second": 12.038, + "eval_steps_per_second": 1.562, + "step": 49200 + }, + { + "epoch": 758.4615384615385, + "grad_norm": 16.635896682739258, + "learning_rate": 5.072400000000001e-06, + "loss": 0.6548, + "step": 49300 + }, + { + "epoch": 758.4615384615385, + "eval_loss": 1.1442939043045044, + "eval_runtime": 10.8624, + "eval_samples_per_second": 12.06, + "eval_steps_per_second": 1.565, + "step": 49300 + }, + { + "epoch": 760.0, + "grad_norm": 31.926729202270508, + "learning_rate": 5.062400000000001e-06, + "loss": 0.6506, + "step": 49400 + }, + { + "epoch": 760.0, + "eval_loss": 1.1774253845214844, + "eval_runtime": 10.8717, + "eval_samples_per_second": 12.05, + "eval_steps_per_second": 1.564, + "step": 49400 + }, + { + "epoch": 761.5384615384615, + "grad_norm": 27.157562255859375, + "learning_rate": 5.052400000000001e-06, + "loss": 0.6546, + "step": 49500 + }, + { + "epoch": 761.5384615384615, + "eval_loss": 1.1536325216293335, + "eval_runtime": 10.8163, + "eval_samples_per_second": 12.111, + "eval_steps_per_second": 1.572, + "step": 49500 + }, + { + "epoch": 763.0769230769231, + "grad_norm": 45.562442779541016, + "learning_rate": 5.0424e-06, + "loss": 0.6225, + "step": 49600 + }, + { + "epoch": 763.0769230769231, + "eval_loss": 1.1408708095550537, + "eval_runtime": 10.9997, + "eval_samples_per_second": 11.909, + "eval_steps_per_second": 1.545, + "step": 49600 + }, + { + "epoch": 764.6153846153846, + "grad_norm": 36.91178512573242, + "learning_rate": 5.0325000000000005e-06, + "loss": 0.6609, + "step": 49700 + }, + { + "epoch": 764.6153846153846, + "eval_loss": 1.1544111967086792, + "eval_runtime": 10.9638, + "eval_samples_per_second": 11.948, + "eval_steps_per_second": 1.551, + "step": 49700 + }, + { + "epoch": 766.1538461538462, + "grad_norm": 16.614078521728516, + "learning_rate": 5.0225e-06, + "loss": 0.6465, + "step": 49800 + }, + { + "epoch": 766.1538461538462, + "eval_loss": 1.1474765539169312, + "eval_runtime": 10.9423, + "eval_samples_per_second": 11.972, + "eval_steps_per_second": 1.554, + "step": 49800 + }, + { + "epoch": 767.6923076923077, + "grad_norm": 22.53380584716797, + "learning_rate": 5.0125e-06, + "loss": 0.648, + "step": 49900 + }, + { + "epoch": 767.6923076923077, + "eval_loss": 1.1623444557189941, + "eval_runtime": 11.0246, + "eval_samples_per_second": 11.883, + "eval_steps_per_second": 1.542, + "step": 49900 + }, + { + "epoch": 769.2307692307693, + "grad_norm": 61.682342529296875, + "learning_rate": 5.0025e-06, + "loss": 0.6484, + "step": 50000 + }, + { + "epoch": 769.2307692307693, + "eval_loss": 1.155450463294983, + "eval_runtime": 11.0102, + "eval_samples_per_second": 11.898, + "eval_steps_per_second": 1.544, + "step": 50000 + }, + { + "epoch": 770.7692307692307, + "grad_norm": 31.147279739379883, + "learning_rate": 4.992500000000001e-06, + "loss": 0.6304, + "step": 50100 + }, + { + "epoch": 770.7692307692307, + "eval_loss": 1.1733567714691162, + "eval_runtime": 11.0184, + "eval_samples_per_second": 11.889, + "eval_steps_per_second": 1.543, + "step": 50100 + }, + { + "epoch": 772.3076923076923, + "grad_norm": 30.67839241027832, + "learning_rate": 4.982500000000001e-06, + "loss": 0.664, + "step": 50200 + }, + { + "epoch": 772.3076923076923, + "eval_loss": 1.1758753061294556, + "eval_runtime": 10.9018, + "eval_samples_per_second": 12.016, + "eval_steps_per_second": 1.559, + "step": 50200 + }, + { + "epoch": 773.8461538461538, + "grad_norm": 15.452601432800293, + "learning_rate": 4.9725e-06, + "loss": 0.6421, + "step": 50300 + }, + { + "epoch": 773.8461538461538, + "eval_loss": 1.189855933189392, + "eval_runtime": 10.9505, + "eval_samples_per_second": 11.963, + "eval_steps_per_second": 1.552, + "step": 50300 + }, + { + "epoch": 775.3846153846154, + "grad_norm": 35.192935943603516, + "learning_rate": 4.9625e-06, + "loss": 0.6443, + "step": 50400 + }, + { + "epoch": 775.3846153846154, + "eval_loss": 1.1846635341644287, + "eval_runtime": 10.9647, + "eval_samples_per_second": 11.947, + "eval_steps_per_second": 1.55, + "step": 50400 + }, + { + "epoch": 776.9230769230769, + "grad_norm": 28.71339225769043, + "learning_rate": 4.9525000000000004e-06, + "loss": 0.6578, + "step": 50500 + }, + { + "epoch": 776.9230769230769, + "eval_loss": 1.1727924346923828, + "eval_runtime": 10.9587, + "eval_samples_per_second": 11.954, + "eval_steps_per_second": 1.551, + "step": 50500 + }, + { + "epoch": 778.4615384615385, + "grad_norm": 27.644819259643555, + "learning_rate": 4.9425000000000005e-06, + "loss": 0.6664, + "step": 50600 + }, + { + "epoch": 778.4615384615385, + "eval_loss": 1.178717017173767, + "eval_runtime": 10.9922, + "eval_samples_per_second": 11.918, + "eval_steps_per_second": 1.547, + "step": 50600 + }, + { + "epoch": 780.0, + "grad_norm": 35.76622772216797, + "learning_rate": 4.932500000000001e-06, + "loss": 0.6414, + "step": 50700 + }, + { + "epoch": 780.0, + "eval_loss": 1.166308045387268, + "eval_runtime": 10.8857, + "eval_samples_per_second": 12.034, + "eval_steps_per_second": 1.562, + "step": 50700 + }, + { + "epoch": 781.5384615384615, + "grad_norm": 50.48115539550781, + "learning_rate": 4.922500000000001e-06, + "loss": 0.6476, + "step": 50800 + }, + { + "epoch": 781.5384615384615, + "eval_loss": 1.1655805110931396, + "eval_runtime": 10.9241, + "eval_samples_per_second": 11.992, + "eval_steps_per_second": 1.556, + "step": 50800 + }, + { + "epoch": 783.0769230769231, + "grad_norm": 35.98445129394531, + "learning_rate": 4.912500000000001e-06, + "loss": 0.6413, + "step": 50900 + }, + { + "epoch": 783.0769230769231, + "eval_loss": 1.1722829341888428, + "eval_runtime": 10.8395, + "eval_samples_per_second": 12.085, + "eval_steps_per_second": 1.568, + "step": 50900 + }, + { + "epoch": 784.6153846153846, + "grad_norm": 20.791536331176758, + "learning_rate": 4.902500000000001e-06, + "loss": 0.6339, + "step": 51000 + }, + { + "epoch": 784.6153846153846, + "eval_loss": 1.158523678779602, + "eval_runtime": 10.9902, + "eval_samples_per_second": 11.92, + "eval_steps_per_second": 1.547, + "step": 51000 + }, + { + "epoch": 786.1538461538462, + "grad_norm": 26.89096450805664, + "learning_rate": 4.8925e-06, + "loss": 0.6727, + "step": 51100 + }, + { + "epoch": 786.1538461538462, + "eval_loss": 1.1766748428344727, + "eval_runtime": 11.0106, + "eval_samples_per_second": 11.898, + "eval_steps_per_second": 1.544, + "step": 51100 + }, + { + "epoch": 787.6923076923077, + "grad_norm": 62.940006256103516, + "learning_rate": 4.8825e-06, + "loss": 0.6555, + "step": 51200 + }, + { + "epoch": 787.6923076923077, + "eval_loss": 1.168807029724121, + "eval_runtime": 10.9776, + "eval_samples_per_second": 11.933, + "eval_steps_per_second": 1.549, + "step": 51200 + }, + { + "epoch": 789.2307692307693, + "grad_norm": 40.993186950683594, + "learning_rate": 4.872600000000001e-06, + "loss": 0.6299, + "step": 51300 + }, + { + "epoch": 789.2307692307693, + "eval_loss": 1.1668223142623901, + "eval_runtime": 10.9625, + "eval_samples_per_second": 11.95, + "eval_steps_per_second": 1.551, + "step": 51300 + }, + { + "epoch": 790.7692307692307, + "grad_norm": 28.166980743408203, + "learning_rate": 4.862600000000001e-06, + "loss": 0.6352, + "step": 51400 + }, + { + "epoch": 790.7692307692307, + "eval_loss": 1.1532963514328003, + "eval_runtime": 10.9508, + "eval_samples_per_second": 11.963, + "eval_steps_per_second": 1.552, + "step": 51400 + }, + { + "epoch": 792.3076923076923, + "grad_norm": 22.570247650146484, + "learning_rate": 4.852600000000001e-06, + "loss": 0.656, + "step": 51500 + }, + { + "epoch": 792.3076923076923, + "eval_loss": 1.1761770248413086, + "eval_runtime": 10.9696, + "eval_samples_per_second": 11.942, + "eval_steps_per_second": 1.55, + "step": 51500 + }, + { + "epoch": 793.8461538461538, + "grad_norm": 29.27780532836914, + "learning_rate": 4.842600000000001e-06, + "loss": 0.6509, + "step": 51600 + }, + { + "epoch": 793.8461538461538, + "eval_loss": 1.179997444152832, + "eval_runtime": 10.8921, + "eval_samples_per_second": 12.027, + "eval_steps_per_second": 1.561, + "step": 51600 + }, + { + "epoch": 795.3846153846154, + "grad_norm": 37.451087951660156, + "learning_rate": 4.832600000000001e-06, + "loss": 0.6574, + "step": 51700 + }, + { + "epoch": 795.3846153846154, + "eval_loss": 1.1756476163864136, + "eval_runtime": 10.8957, + "eval_samples_per_second": 12.023, + "eval_steps_per_second": 1.56, + "step": 51700 + }, + { + "epoch": 796.9230769230769, + "grad_norm": 53.822975158691406, + "learning_rate": 4.8226e-06, + "loss": 0.6408, + "step": 51800 + }, + { + "epoch": 796.9230769230769, + "eval_loss": 1.1540732383728027, + "eval_runtime": 10.9262, + "eval_samples_per_second": 11.99, + "eval_steps_per_second": 1.556, + "step": 51800 + }, + { + "epoch": 798.4615384615385, + "grad_norm": 37.03651428222656, + "learning_rate": 4.8126e-06, + "loss": 0.6705, + "step": 51900 + }, + { + "epoch": 798.4615384615385, + "eval_loss": 1.177356243133545, + "eval_runtime": 10.9577, + "eval_samples_per_second": 11.955, + "eval_steps_per_second": 1.551, + "step": 51900 + }, + { + "epoch": 800.0, + "grad_norm": 20.716794967651367, + "learning_rate": 4.8026e-06, + "loss": 0.6266, + "step": 52000 + }, + { + "epoch": 800.0, + "eval_loss": 1.1817307472229004, + "eval_runtime": 10.9377, + "eval_samples_per_second": 11.977, + "eval_steps_per_second": 1.554, + "step": 52000 + }, + { + "epoch": 801.5384615384615, + "grad_norm": 28.821758270263672, + "learning_rate": 4.7926000000000005e-06, + "loss": 0.6267, + "step": 52100 + }, + { + "epoch": 801.5384615384615, + "eval_loss": 1.1904364824295044, + "eval_runtime": 10.9153, + "eval_samples_per_second": 12.002, + "eval_steps_per_second": 1.557, + "step": 52100 + }, + { + "epoch": 803.0769230769231, + "grad_norm": 34.647403717041016, + "learning_rate": 4.782600000000001e-06, + "loss": 0.6522, + "step": 52200 + }, + { + "epoch": 803.0769230769231, + "eval_loss": 1.1568284034729004, + "eval_runtime": 10.9373, + "eval_samples_per_second": 11.977, + "eval_steps_per_second": 1.554, + "step": 52200 + }, + { + "epoch": 804.6153846153846, + "grad_norm": 31.859989166259766, + "learning_rate": 4.772600000000001e-06, + "loss": 0.6593, + "step": 52300 + }, + { + "epoch": 804.6153846153846, + "eval_loss": 1.160211205482483, + "eval_runtime": 10.9485, + "eval_samples_per_second": 11.965, + "eval_steps_per_second": 1.553, + "step": 52300 + }, + { + "epoch": 806.1538461538462, + "grad_norm": 51.3975830078125, + "learning_rate": 4.762600000000001e-06, + "loss": 0.6377, + "step": 52400 + }, + { + "epoch": 806.1538461538462, + "eval_loss": 1.1537035703659058, + "eval_runtime": 10.9209, + "eval_samples_per_second": 11.995, + "eval_steps_per_second": 1.557, + "step": 52400 + }, + { + "epoch": 807.6923076923077, + "grad_norm": 35.31791305541992, + "learning_rate": 4.752600000000001e-06, + "loss": 0.6335, + "step": 52500 + }, + { + "epoch": 807.6923076923077, + "eval_loss": 1.1430357694625854, + "eval_runtime": 11.032, + "eval_samples_per_second": 11.875, + "eval_steps_per_second": 1.541, + "step": 52500 + }, + { + "epoch": 809.2307692307693, + "grad_norm": 36.545860290527344, + "learning_rate": 4.7426e-06, + "loss": 0.6486, + "step": 52600 + }, + { + "epoch": 809.2307692307693, + "eval_loss": 1.1675678491592407, + "eval_runtime": 10.9679, + "eval_samples_per_second": 11.944, + "eval_steps_per_second": 1.55, + "step": 52600 + }, + { + "epoch": 810.7692307692307, + "grad_norm": 11.130383491516113, + "learning_rate": 4.7326e-06, + "loss": 0.6547, + "step": 52700 + }, + { + "epoch": 810.7692307692307, + "eval_loss": 1.1528987884521484, + "eval_runtime": 10.9553, + "eval_samples_per_second": 11.958, + "eval_steps_per_second": 1.552, + "step": 52700 + }, + { + "epoch": 812.3076923076923, + "grad_norm": 30.692113876342773, + "learning_rate": 4.7226e-06, + "loss": 0.6371, + "step": 52800 + }, + { + "epoch": 812.3076923076923, + "eval_loss": 1.144073247909546, + "eval_runtime": 10.9653, + "eval_samples_per_second": 11.947, + "eval_steps_per_second": 1.55, + "step": 52800 + }, + { + "epoch": 813.8461538461538, + "grad_norm": 20.095840454101562, + "learning_rate": 4.7126e-06, + "loss": 0.6524, + "step": 52900 + }, + { + "epoch": 813.8461538461538, + "eval_loss": 1.1581467390060425, + "eval_runtime": 10.9858, + "eval_samples_per_second": 11.924, + "eval_steps_per_second": 1.547, + "step": 52900 + }, + { + "epoch": 815.3846153846154, + "grad_norm": 58.920494079589844, + "learning_rate": 4.7026000000000005e-06, + "loss": 0.6464, + "step": 53000 + }, + { + "epoch": 815.3846153846154, + "eval_loss": 1.1517822742462158, + "eval_runtime": 10.9972, + "eval_samples_per_second": 11.912, + "eval_steps_per_second": 1.546, + "step": 53000 + } + ], + "logging_steps": 100, + "max_steps": 100000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1539, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.0276470994669568e+20, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}