diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,6307 +1,4216 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 15.0, + "epoch": 10.0, "eval_steps": 500, - "global_step": 4395, + "global_step": 2930, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0034129692832764505, - "grad_norm": 1.8359375, - "learning_rate": 4.545454545454545e-07, + "grad_norm": 3.671875, + "learning_rate": 6.825938566552902e-07, "loss": 3.0499, "step": 1 }, { "epoch": 0.017064846416382253, - "grad_norm": 2.234375, - "learning_rate": 2.2727272727272728e-06, - "loss": 3.0434, + "grad_norm": 4.96875, + "learning_rate": 3.4129692832764506e-06, + "loss": 3.0421, "step": 5 }, { "epoch": 0.034129692832764506, - "grad_norm": 2.078125, - "learning_rate": 4.5454545454545455e-06, - "loss": 3.0699, + "grad_norm": 4.34375, + "learning_rate": 6.825938566552901e-06, + "loss": 3.0559, "step": 10 }, { "epoch": 0.051194539249146756, - "grad_norm": 1.8515625, - "learning_rate": 6.818181818181818e-06, - "loss": 3.0656, + "grad_norm": 3.09375, + "learning_rate": 1.0238907849829352e-05, + "loss": 2.9957, "step": 15 }, { "epoch": 0.06825938566552901, - "grad_norm": 3.0, - "learning_rate": 9.090909090909091e-06, - "loss": 3.0526, + "grad_norm": 2.71875, + "learning_rate": 1.3651877133105803e-05, + "loss": 2.8653, "step": 20 }, { "epoch": 0.08532423208191127, - "grad_norm": 2.015625, - "learning_rate": 1.1363636363636365e-05, - "loss": 3.0382, + "grad_norm": 3.3125, + "learning_rate": 1.7064846416382256e-05, + "loss": 2.7049, "step": 25 }, { "epoch": 0.10238907849829351, - "grad_norm": 17.125, - "learning_rate": 1.3636363636363637e-05, - "loss": 2.982, + "grad_norm": 18.5, + "learning_rate": 2.0477815699658705e-05, + "loss": 2.5238, "step": 30 }, { "epoch": 0.11945392491467577, - "grad_norm": 2.03125, - "learning_rate": 1.590909090909091e-05, - "loss": 2.9332, + "grad_norm": 1.8828125, + "learning_rate": 2.3890784982935157e-05, + "loss": 2.3984, "step": 35 }, { "epoch": 0.13651877133105803, - "grad_norm": 3.140625, - "learning_rate": 1.8181818181818182e-05, - "loss": 2.8934, + "grad_norm": 1.3671875, + "learning_rate": 2.7303754266211605e-05, + "loss": 2.3001, "step": 40 }, { "epoch": 0.15358361774744028, - "grad_norm": 3.25, - "learning_rate": 2.0454545454545457e-05, - "loss": 2.7804, + "grad_norm": 2.0625, + "learning_rate": 3.071672354948806e-05, + "loss": 2.1645, "step": 45 }, { "epoch": 0.17064846416382254, - "grad_norm": 1.3984375, - "learning_rate": 2.272727272727273e-05, - "loss": 2.7194, + "grad_norm": 1.25, + "learning_rate": 3.412969283276451e-05, + "loss": 2.0453, "step": 50 }, { "epoch": 0.18771331058020477, - "grad_norm": 1.296875, - "learning_rate": 2.5e-05, - "loss": 2.5961, + "grad_norm": 0.8046875, + "learning_rate": 3.754266211604096e-05, + "loss": 1.8952, "step": 55 }, { "epoch": 0.20477815699658702, - "grad_norm": 1.6640625, - "learning_rate": 2.7272727272727273e-05, - "loss": 2.5046, + "grad_norm": 1.609375, + "learning_rate": 4.095563139931741e-05, + "loss": 1.7862, "step": 60 }, { "epoch": 0.22184300341296928, - "grad_norm": 1.0234375, - "learning_rate": 2.954545454545455e-05, - "loss": 2.3975, + "grad_norm": 0.63671875, + "learning_rate": 4.436860068259386e-05, + "loss": 1.6922, "step": 65 }, { "epoch": 0.23890784982935154, - "grad_norm": 1.390625, - "learning_rate": 3.181818181818182e-05, - "loss": 2.3091, + "grad_norm": 0.400390625, + "learning_rate": 4.778156996587031e-05, + "loss": 1.6006, "step": 70 }, { "epoch": 0.25597269624573377, - "grad_norm": 1.1171875, - "learning_rate": 3.409090909090909e-05, - "loss": 2.2036, + "grad_norm": 0.375, + "learning_rate": 5.119453924914676e-05, + "loss": 1.5335, "step": 75 }, { "epoch": 0.27303754266211605, - "grad_norm": 0.84375, - "learning_rate": 3.6363636363636364e-05, - "loss": 2.113, + "grad_norm": 0.43359375, + "learning_rate": 5.460750853242321e-05, + "loss": 1.4832, "step": 80 }, { "epoch": 0.2901023890784983, - "grad_norm": 1.7421875, - "learning_rate": 3.8636363636363636e-05, - "loss": 2.043, + "grad_norm": 0.67578125, + "learning_rate": 5.802047781569966e-05, + "loss": 1.4393, "step": 85 }, { "epoch": 0.30716723549488056, - "grad_norm": 6.375, - "learning_rate": 4.0909090909090915e-05, - "loss": 1.9568, + "grad_norm": 0.337890625, + "learning_rate": 6.143344709897612e-05, + "loss": 1.3951, "step": 90 }, { "epoch": 0.3242320819112628, - "grad_norm": 0.71484375, - "learning_rate": 4.318181818181819e-05, - "loss": 1.8927, + "grad_norm": 0.275390625, + "learning_rate": 6.484641638225257e-05, + "loss": 1.3594, "step": 95 }, { "epoch": 0.3412969283276451, - "grad_norm": 1.5078125, - "learning_rate": 4.545454545454546e-05, - "loss": 1.8394, + "grad_norm": 0.267578125, + "learning_rate": 6.825938566552902e-05, + "loss": 1.3456, "step": 100 }, { "epoch": 0.3583617747440273, - "grad_norm": 25.625, - "learning_rate": 4.772727272727273e-05, - "loss": 1.7808, + "grad_norm": 0.3671875, + "learning_rate": 7.167235494880547e-05, + "loss": 1.3174, "step": 105 }, { "epoch": 0.37542662116040953, - "grad_norm": 0.54296875, - "learning_rate": 5e-05, - "loss": 1.7467, + "grad_norm": 0.328125, + "learning_rate": 7.508532423208191e-05, + "loss": 1.3087, "step": 110 }, { "epoch": 0.3924914675767918, - "grad_norm": 0.8671875, - "learning_rate": 5.2272727272727274e-05, - "loss": 1.6988, + "grad_norm": 0.5703125, + "learning_rate": 7.849829351535837e-05, + "loss": 1.3001, "step": 115 }, { "epoch": 0.40955631399317405, - "grad_norm": 0.609375, - "learning_rate": 5.4545454545454546e-05, - "loss": 1.6442, + "grad_norm": 0.30078125, + "learning_rate": 8.191126279863482e-05, + "loss": 1.2871, "step": 120 }, { "epoch": 0.42662116040955633, - "grad_norm": 0.5078125, - "learning_rate": 5.6818181818181825e-05, - "loss": 1.5875, + "grad_norm": 0.65625, + "learning_rate": 8.532423208191128e-05, + "loss": 1.2567, "step": 125 }, { "epoch": 0.44368600682593856, - "grad_norm": 0.890625, - "learning_rate": 5.90909090909091e-05, - "loss": 1.5646, + "grad_norm": 0.458984375, + "learning_rate": 8.873720136518772e-05, + "loss": 1.2582, "step": 130 }, { "epoch": 0.46075085324232085, - "grad_norm": 0.53125, - "learning_rate": 6.136363636363636e-05, - "loss": 1.5244, + "grad_norm": 0.380859375, + "learning_rate": 9.215017064846417e-05, + "loss": 1.2471, "step": 135 }, { "epoch": 0.4778156996587031, - "grad_norm": 0.39453125, - "learning_rate": 6.363636363636364e-05, - "loss": 1.4945, + "grad_norm": 0.298828125, + "learning_rate": 9.556313993174063e-05, + "loss": 1.2357, "step": 140 }, { "epoch": 0.4948805460750853, - "grad_norm": 0.447265625, - "learning_rate": 6.59090909090909e-05, - "loss": 1.469, + "grad_norm": 0.65625, + "learning_rate": 9.897610921501707e-05, + "loss": 1.2303, "step": 145 }, { "epoch": 0.5119453924914675, - "grad_norm": 0.75390625, - "learning_rate": 6.818181818181818e-05, - "loss": 1.4478, + "grad_norm": 0.423828125, + "learning_rate": 0.00010238907849829352, + "loss": 1.226, "step": 150 }, { "epoch": 0.5290102389078498, - "grad_norm": 0.392578125, - "learning_rate": 7.045454545454546e-05, - "loss": 1.4291, + "grad_norm": 0.5390625, + "learning_rate": 0.00010580204778156998, + "loss": 1.2251, "step": 155 }, { "epoch": 0.5460750853242321, - "grad_norm": 0.314453125, - "learning_rate": 7.272727272727273e-05, - "loss": 1.4056, + "grad_norm": 0.416015625, + "learning_rate": 0.00010921501706484642, + "loss": 1.2135, "step": 160 }, { "epoch": 0.5631399317406144, - "grad_norm": 0.326171875, - "learning_rate": 7.500000000000001e-05, - "loss": 1.3839, + "grad_norm": 0.3828125, + "learning_rate": 0.00011262798634812288, + "loss": 1.2069, "step": 165 }, { "epoch": 0.5802047781569966, - "grad_norm": 0.283203125, - "learning_rate": 7.727272727272727e-05, - "loss": 1.3664, + "grad_norm": 0.6796875, + "learning_rate": 0.00011604095563139932, + "loss": 1.2005, "step": 170 }, { "epoch": 0.5972696245733788, - "grad_norm": 0.39453125, - "learning_rate": 7.954545454545455e-05, - "loss": 1.3557, + "grad_norm": 0.41015625, + "learning_rate": 0.00011945392491467577, + "loss": 1.1944, "step": 175 }, { "epoch": 0.6143344709897611, - "grad_norm": 0.33984375, - "learning_rate": 8.181818181818183e-05, - "loss": 1.3317, + "grad_norm": 0.50390625, + "learning_rate": 0.00012286689419795224, + "loss": 1.1775, "step": 180 }, { "epoch": 0.6313993174061433, - "grad_norm": 0.3125, - "learning_rate": 8.40909090909091e-05, - "loss": 1.3323, + "grad_norm": 0.38671875, + "learning_rate": 0.00012627986348122866, + "loss": 1.1844, "step": 185 }, { "epoch": 0.6484641638225256, - "grad_norm": 0.384765625, - "learning_rate": 8.636363636363637e-05, - "loss": 1.3129, + "grad_norm": 0.46484375, + "learning_rate": 0.00012969283276450513, + "loss": 1.1711, "step": 190 }, { "epoch": 0.6655290102389079, - "grad_norm": 0.435546875, - "learning_rate": 8.863636363636364e-05, - "loss": 1.3194, + "grad_norm": 0.72265625, + "learning_rate": 0.00013310580204778158, + "loss": 1.1824, "step": 195 }, { "epoch": 0.6825938566552902, - "grad_norm": 0.4140625, - "learning_rate": 9.090909090909092e-05, - "loss": 1.2992, + "grad_norm": 1.0859375, + "learning_rate": 0.00013651877133105805, + "loss": 1.169, "step": 200 }, { "epoch": 0.6996587030716723, - "grad_norm": 0.296875, - "learning_rate": 9.318181818181818e-05, - "loss": 1.2934, + "grad_norm": 0.58203125, + "learning_rate": 0.00013993174061433447, + "loss": 1.1691, "step": 205 }, { "epoch": 0.7167235494880546, - "grad_norm": 0.515625, - "learning_rate": 9.545454545454546e-05, - "loss": 1.2759, + "grad_norm": 0.42578125, + "learning_rate": 0.00014334470989761094, + "loss": 1.1573, "step": 210 }, { "epoch": 0.7337883959044369, - "grad_norm": 0.52734375, - "learning_rate": 9.772727272727274e-05, - "loss": 1.2775, + "grad_norm": 0.6328125, + "learning_rate": 0.00014675767918088738, + "loss": 1.1637, "step": 215 }, { "epoch": 0.7508532423208191, - "grad_norm": 0.357421875, - "learning_rate": 0.0001, - "loss": 1.2696, + "grad_norm": 0.68359375, + "learning_rate": 0.00015017064846416383, + "loss": 1.1605, "step": 220 }, { "epoch": 0.7679180887372014, - "grad_norm": 0.29296875, - "learning_rate": 0.00010227272727272727, - "loss": 1.2621, + "grad_norm": 0.4140625, + "learning_rate": 0.00015358361774744027, + "loss": 1.1539, "step": 225 }, { "epoch": 0.7849829351535836, - "grad_norm": 0.3359375, - "learning_rate": 0.00010454545454545455, - "loss": 1.251, + "grad_norm": 0.37109375, + "learning_rate": 0.00015699658703071675, + "loss": 1.1458, "step": 230 }, { "epoch": 0.8020477815699659, - "grad_norm": 0.419921875, - "learning_rate": 0.00010681818181818181, - "loss": 1.2544, + "grad_norm": 0.4140625, + "learning_rate": 0.0001604095563139932, + "loss": 1.1524, "step": 235 }, { "epoch": 0.8191126279863481, - "grad_norm": 0.48046875, - "learning_rate": 0.00010909090909090909, - "loss": 1.2528, + "grad_norm": 0.51171875, + "learning_rate": 0.00016382252559726964, + "loss": 1.1505, "step": 240 }, { "epoch": 0.8361774744027304, - "grad_norm": 0.5234375, - "learning_rate": 0.00011136363636363636, - "loss": 1.2459, + "grad_norm": 0.81640625, + "learning_rate": 0.00016723549488054608, + "loss": 1.1493, "step": 245 }, { "epoch": 0.8532423208191127, - "grad_norm": 0.455078125, - "learning_rate": 0.00011363636363636365, - "loss": 1.2322, + "grad_norm": 0.59765625, + "learning_rate": 0.00017064846416382255, + "loss": 1.1391, "step": 250 }, { "epoch": 0.8703071672354948, - "grad_norm": 0.451171875, - "learning_rate": 0.00011590909090909093, - "loss": 1.2154, + "grad_norm": 0.404296875, + "learning_rate": 0.00017406143344709897, + "loss": 1.1213, "step": 255 }, { "epoch": 0.8873720136518771, - "grad_norm": 0.44140625, - "learning_rate": 0.0001181818181818182, - "loss": 1.2258, + "grad_norm": 0.443359375, + "learning_rate": 0.00017747440273037544, + "loss": 1.1311, "step": 260 }, { "epoch": 0.9044368600682594, - "grad_norm": 0.56640625, - "learning_rate": 0.00012045454545454546, - "loss": 1.213, + "grad_norm": 0.8046875, + "learning_rate": 0.0001808873720136519, + "loss": 1.1224, "step": 265 }, { "epoch": 0.9215017064846417, - "grad_norm": 0.46875, - "learning_rate": 0.00012272727272727272, - "loss": 1.224, + "grad_norm": 0.36328125, + "learning_rate": 0.00018430034129692833, + "loss": 1.1369, "step": 270 }, { "epoch": 0.9385665529010239, - "grad_norm": 0.51171875, - "learning_rate": 0.000125, - "loss": 1.2093, + "grad_norm": 0.52734375, + "learning_rate": 0.00018771331058020478, + "loss": 1.1203, "step": 275 }, { "epoch": 0.9556313993174061, - "grad_norm": 0.90234375, - "learning_rate": 0.00012727272727272728, - "loss": 1.2132, + "grad_norm": 1.171875, + "learning_rate": 0.00019112627986348125, + "loss": 1.1281, "step": 280 }, { "epoch": 0.9726962457337884, - "grad_norm": 0.63671875, - "learning_rate": 0.00012954545454545456, - "loss": 1.2083, + "grad_norm": 0.8671875, + "learning_rate": 0.0001945392491467577, + "loss": 1.1231, "step": 285 }, { "epoch": 0.9897610921501706, - "grad_norm": 0.671875, - "learning_rate": 0.0001318181818181818, - "loss": 1.2085, + "grad_norm": 0.466796875, + "learning_rate": 0.00019795221843003414, + "loss": 1.1249, "step": 290 }, { "epoch": 1.0, - "eval_loss": 2.486323833465576, - "eval_runtime": 0.5451, - "eval_samples_per_second": 18.345, - "eval_steps_per_second": 1.834, + "eval_loss": 2.4640614986419678, + "eval_runtime": 0.5515, + "eval_samples_per_second": 18.133, + "eval_steps_per_second": 1.813, "step": 293 }, { "epoch": 1.006825938566553, - "grad_norm": 0.52734375, - "learning_rate": 0.0001340909090909091, - "loss": 1.1892, + "grad_norm": 0.65625, + "learning_rate": 0.00019999971613668125, + "loss": 1.1028, "step": 295 }, { "epoch": 1.023890784982935, - "grad_norm": 0.4296875, - "learning_rate": 0.00013636363636363637, - "loss": 1.191, + "grad_norm": 0.87890625, + "learning_rate": 0.00019999652269285281, + "loss": 1.0985, "step": 300 }, { "epoch": 1.0409556313993173, - "grad_norm": 0.5390625, - "learning_rate": 0.00013863636363636365, - "loss": 1.18, + "grad_norm": 0.333984375, + "learning_rate": 0.00019998978108973762, + "loss": 1.0885, "step": 305 }, { "epoch": 1.0580204778156996, - "grad_norm": 0.703125, - "learning_rate": 0.00014090909090909093, - "loss": 1.1964, + "grad_norm": 0.34375, + "learning_rate": 0.00019997949156654686, + "loss": 1.1064, "step": 310 }, { "epoch": 1.075085324232082, - "grad_norm": 0.435546875, - "learning_rate": 0.0001431818181818182, - "loss": 1.1877, + "grad_norm": 0.3828125, + "learning_rate": 0.00019996565448838176, + "loss": 1.0991, "step": 315 }, { "epoch": 1.0921501706484642, - "grad_norm": 0.59375, - "learning_rate": 0.00014545454545454546, - "loss": 1.1846, + "grad_norm": 1.03125, + "learning_rate": 0.0001999482703462211, + "loss": 1.0947, "step": 320 }, { "epoch": 1.1092150170648465, - "grad_norm": 0.8125, - "learning_rate": 0.00014772727272727274, - "loss": 1.1833, + "grad_norm": 0.478515625, + "learning_rate": 0.00019992733975690333, + "loss": 1.097, "step": 325 }, { "epoch": 1.1262798634812285, - "grad_norm": 0.9296875, - "learning_rate": 0.00015000000000000001, - "loss": 1.1704, + "grad_norm": 0.451171875, + "learning_rate": 0.00019990286346310493, + "loss": 1.0835, "step": 330 }, { "epoch": 1.1433447098976108, - "grad_norm": 1.5703125, - "learning_rate": 0.00015227272727272727, - "loss": 1.1886, + "grad_norm": 1.40625, + "learning_rate": 0.00019987484233331394, + "loss": 1.1033, "step": 335 }, { "epoch": 1.1604095563139931, - "grad_norm": 0.4609375, - "learning_rate": 0.00015454545454545454, - "loss": 1.1759, + "grad_norm": 1.8515625, + "learning_rate": 0.00019984327736179936, + "loss": 1.1011, "step": 340 }, { "epoch": 1.1774744027303754, - "grad_norm": 0.337890625, - "learning_rate": 0.00015681818181818182, - "loss": 1.1712, + "grad_norm": 1.6328125, + "learning_rate": 0.0001998081696685755, + "loss": 1.0986, "step": 345 }, { "epoch": 1.1945392491467577, - "grad_norm": 0.48046875, - "learning_rate": 0.0001590909090909091, - "loss": 1.1637, + "grad_norm": 0.72265625, + "learning_rate": 0.0001997695204993626, + "loss": 1.0859, "step": 350 }, { "epoch": 1.21160409556314, - "grad_norm": 0.58203125, - "learning_rate": 0.00016136363636363635, - "loss": 1.1657, + "grad_norm": 0.6875, + "learning_rate": 0.00019972733122554246, + "loss": 1.0867, "step": 355 }, { "epoch": 1.2286689419795223, - "grad_norm": 0.54296875, - "learning_rate": 0.00016363636363636366, - "loss": 1.1745, + "grad_norm": 0.62109375, + "learning_rate": 0.00019968160334410975, + "loss": 1.0949, "step": 360 }, { "epoch": 1.2457337883959045, - "grad_norm": 0.421875, - "learning_rate": 0.00016590909090909094, - "loss": 1.1496, + "grad_norm": 0.376953125, + "learning_rate": 0.00019963233847761894, + "loss": 1.0683, "step": 365 }, { "epoch": 1.2627986348122868, - "grad_norm": 0.546875, - "learning_rate": 0.0001681818181818182, - "loss": 1.1653, + "grad_norm": 0.5, + "learning_rate": 0.00019957953837412677, + "loss": 1.0829, "step": 370 }, { "epoch": 1.2798634812286689, - "grad_norm": 0.5078125, - "learning_rate": 0.00017045454545454547, - "loss": 1.1702, + "grad_norm": 0.306640625, + "learning_rate": 0.0001995232049071302, + "loss": 1.0878, "step": 375 }, { "epoch": 1.2969283276450512, - "grad_norm": 0.53125, - "learning_rate": 0.00017272727272727275, - "loss": 1.15, + "grad_norm": 0.458984375, + "learning_rate": 0.00019946334007549978, + "loss": 1.0697, "step": 380 }, { "epoch": 1.3139931740614335, - "grad_norm": 1.2421875, - "learning_rate": 0.000175, - "loss": 1.1615, + "grad_norm": 0.54296875, + "learning_rate": 0.00019939994600340905, + "loss": 1.0765, "step": 385 }, { "epoch": 1.3310580204778157, - "grad_norm": 1.625, - "learning_rate": 0.00017727272727272728, - "loss": 1.1662, + "grad_norm": 0.310546875, + "learning_rate": 0.00019933302494025884, + "loss": 1.0772, "step": 390 }, { "epoch": 1.348122866894198, - "grad_norm": 0.5, - "learning_rate": 0.00017954545454545456, - "loss": 1.1579, + "grad_norm": 0.3515625, + "learning_rate": 0.00019926257926059768, + "loss": 1.0739, "step": 395 }, { "epoch": 1.36518771331058, - "grad_norm": 0.90234375, - "learning_rate": 0.00018181818181818183, - "loss": 1.1628, + "grad_norm": 0.271484375, + "learning_rate": 0.00019918861146403733, + "loss": 1.0816, "step": 400 }, { "epoch": 1.3822525597269624, - "grad_norm": 0.3515625, - "learning_rate": 0.00018409090909090909, - "loss": 1.1521, + "grad_norm": 0.30859375, + "learning_rate": 0.0001991111241751644, + "loss": 1.0711, "step": 405 }, { "epoch": 1.3993174061433447, - "grad_norm": 0.455078125, - "learning_rate": 0.00018636363636363636, - "loss": 1.1422, + "grad_norm": 0.27734375, + "learning_rate": 0.00019903012014344686, + "loss": 1.0616, "step": 410 }, { "epoch": 1.416382252559727, - "grad_norm": 0.52734375, - "learning_rate": 0.00018863636363636364, - "loss": 1.1408, + "grad_norm": 0.30859375, + "learning_rate": 0.00019894560224313678, + "loss": 1.0624, "step": 415 }, { "epoch": 1.4334470989761092, - "grad_norm": 0.53515625, - "learning_rate": 0.00019090909090909092, - "loss": 1.1356, + "grad_norm": 0.341796875, + "learning_rate": 0.00019885757347316813, + "loss": 1.0572, "step": 420 }, { "epoch": 1.4505119453924915, - "grad_norm": 0.46875, - "learning_rate": 0.0001931818181818182, - "loss": 1.1497, + "grad_norm": 0.400390625, + "learning_rate": 0.0001987660369570505, + "loss": 1.0701, "step": 425 }, { "epoch": 1.4675767918088738, - "grad_norm": 0.47265625, - "learning_rate": 0.00019545454545454548, - "loss": 1.1437, + "grad_norm": 0.578125, + "learning_rate": 0.00019867099594275827, + "loss": 1.0669, "step": 430 }, { "epoch": 1.484641638225256, - "grad_norm": 0.42578125, - "learning_rate": 0.00019772727272727273, - "loss": 1.1518, + "grad_norm": 0.412109375, + "learning_rate": 0.00019857245380261525, + "loss": 1.0724, "step": 435 }, { "epoch": 1.5017064846416384, - "grad_norm": 0.38671875, - "learning_rate": 0.0002, - "loss": 1.1518, + "grad_norm": 0.365234375, + "learning_rate": 0.0001984704140331751, + "loss": 1.0728, "step": 440 }, { "epoch": 1.5187713310580204, - "grad_norm": 0.3984375, - "learning_rate": 0.000199999211292062, - "loss": 1.1498, + "grad_norm": 0.296875, + "learning_rate": 0.00019836488025509736, + "loss": 1.0712, "step": 445 }, { "epoch": 1.5358361774744027, - "grad_norm": 0.388671875, - "learning_rate": 0.00019999684518068916, - "loss": 1.1378, + "grad_norm": 0.3125, + "learning_rate": 0.00019825585621301872, + "loss": 1.0569, "step": 450 }, { "epoch": 1.552901023890785, - "grad_norm": 0.87109375, - "learning_rate": 0.00019999290170320485, - "loss": 1.1434, + "grad_norm": 0.486328125, + "learning_rate": 0.00019814334577542038, + "loss": 1.0638, "step": 455 }, { "epoch": 1.5699658703071673, - "grad_norm": 0.6953125, - "learning_rate": 0.00019998738092181421, - "loss": 1.1417, + "grad_norm": 0.30078125, + "learning_rate": 0.0001980273529344907, + "loss": 1.0638, "step": 460 }, { "epoch": 1.5870307167235493, - "grad_norm": 0.79296875, - "learning_rate": 0.00019998028292360286, - "loss": 1.1329, + "grad_norm": 0.310546875, + "learning_rate": 0.00019790788180598358, + "loss": 1.0556, "step": 465 }, { "epoch": 1.6040955631399316, - "grad_norm": 0.5625, - "learning_rate": 0.00019997160782053578, - "loss": 1.1339, + "grad_norm": 0.265625, + "learning_rate": 0.00019778493662907237, + "loss": 1.056, "step": 470 }, { "epoch": 1.621160409556314, - "grad_norm": 0.384765625, - "learning_rate": 0.00019996135574945544, - "loss": 1.1273, + "grad_norm": 0.5390625, + "learning_rate": 0.00019765852176619944, + "loss": 1.0512, "step": 475 }, { "epoch": 1.6382252559726962, - "grad_norm": 0.455078125, - "learning_rate": 0.00019994952687207954, - "loss": 1.1343, + "grad_norm": 0.404296875, + "learning_rate": 0.00019752864170292152, + "loss": 1.0585, "step": 480 }, { "epoch": 1.6552901023890785, - "grad_norm": 0.69921875, - "learning_rate": 0.00019993612137499876, - "loss": 1.1374, + "grad_norm": 0.62890625, + "learning_rate": 0.00019739530104775032, + "loss": 1.0628, "step": 485 }, { "epoch": 1.6723549488054608, - "grad_norm": 0.88671875, - "learning_rate": 0.00019992113946967353, - "loss": 1.1368, + "grad_norm": 0.3125, + "learning_rate": 0.00019725850453198925, + "loss": 1.0612, "step": 490 }, { "epoch": 1.689419795221843, - "grad_norm": 0.490234375, - "learning_rate": 0.00019990458139243077, - "loss": 1.1289, + "grad_norm": 0.5234375, + "learning_rate": 0.00019711825700956536, + "loss": 1.0549, "step": 495 }, { "epoch": 1.7064846416382253, - "grad_norm": 0.53515625, - "learning_rate": 0.00019988644740446022, - "loss": 1.1255, + "grad_norm": 0.42578125, + "learning_rate": 0.0001969745634568572, + "loss": 1.0506, "step": 500 }, { "epoch": 1.7235494880546076, - "grad_norm": 0.5234375, - "learning_rate": 0.00019986673779181033, - "loss": 1.1149, + "grad_norm": 0.404296875, + "learning_rate": 0.00019682742897251818, + "loss": 1.0418, "step": 505 }, { "epoch": 1.74061433447099, - "grad_norm": 0.41015625, - "learning_rate": 0.0001998454528653836, - "loss": 1.1241, + "grad_norm": 0.400390625, + "learning_rate": 0.0001966768587772957, + "loss": 1.0508, "step": 510 }, { "epoch": 1.757679180887372, - "grad_norm": 0.41796875, - "learning_rate": 0.0001998225929609319, - "loss": 1.1252, + "grad_norm": 0.400390625, + "learning_rate": 0.00019652285821384596, + "loss": 1.0519, "step": 515 }, { "epoch": 1.7747440273037542, - "grad_norm": 0.458984375, - "learning_rate": 0.00019979815843905097, - "loss": 1.1292, + "grad_norm": 0.404296875, + "learning_rate": 0.0001963654327465442, + "loss": 1.0554, "step": 520 }, { "epoch": 1.7918088737201365, - "grad_norm": 0.400390625, - "learning_rate": 0.0001997721496851748, - "loss": 1.1147, + "grad_norm": 0.404296875, + "learning_rate": 0.00019620458796129104, + "loss": 1.0421, "step": 525 }, { "epoch": 1.8088737201365188, - "grad_norm": 0.53125, - "learning_rate": 0.00019974456710956964, - "loss": 1.1155, + "grad_norm": 0.375, + "learning_rate": 0.0001960403295653141, + "loss": 1.0421, "step": 530 }, { "epoch": 1.8259385665529009, - "grad_norm": 0.546875, - "learning_rate": 0.00019971541114732741, - "loss": 1.1213, + "grad_norm": 0.322265625, + "learning_rate": 0.00019587266338696565, + "loss": 1.046, "step": 535 }, { "epoch": 1.8430034129692832, - "grad_norm": 0.40234375, - "learning_rate": 0.0001996846822583589, - "loss": 1.1257, + "grad_norm": 0.279296875, + "learning_rate": 0.00019570159537551552, + "loss": 1.0528, "step": 540 }, { "epoch": 1.8600682593856654, - "grad_norm": 0.38671875, - "learning_rate": 0.00019965238092738643, - "loss": 1.1217, + "grad_norm": 0.31640625, + "learning_rate": 0.00019552713160094038, + "loss": 1.0481, "step": 545 }, { "epoch": 1.8771331058020477, - "grad_norm": 0.5390625, - "learning_rate": 0.0001996185076639364, - "loss": 1.122, + "grad_norm": 0.314453125, + "learning_rate": 0.00019534927825370815, + "loss": 1.0477, "step": 550 }, { "epoch": 1.89419795221843, - "grad_norm": 0.390625, - "learning_rate": 0.00019958306300233098, - "loss": 1.1236, + "grad_norm": 0.30078125, + "learning_rate": 0.00019516804164455826, + "loss": 1.0513, "step": 555 }, { "epoch": 1.9112627986348123, - "grad_norm": 0.5390625, - "learning_rate": 0.00019954604750167993, - "loss": 1.122, + "grad_norm": 0.455078125, + "learning_rate": 0.00019498342820427794, + "loss": 1.0505, "step": 560 }, { "epoch": 1.9283276450511946, - "grad_norm": 0.66796875, - "learning_rate": 0.00019950746174587163, - "loss": 1.1271, + "grad_norm": 0.36328125, + "learning_rate": 0.00019479544448347392, + "loss": 1.0538, "step": 565 }, { "epoch": 1.9453924914675769, - "grad_norm": 0.47265625, - "learning_rate": 0.0001994673063435639, - "loss": 1.1064, + "grad_norm": 0.283203125, + "learning_rate": 0.00019460409715233996, + "loss": 1.0332, "step": 570 }, { "epoch": 1.9624573378839592, - "grad_norm": 0.3359375, - "learning_rate": 0.0001994255819281744, - "loss": 1.1186, + "grad_norm": 0.41796875, + "learning_rate": 0.00019440939300042028, + "loss": 1.047, "step": 575 }, { "epoch": 1.9795221843003414, - "grad_norm": 0.63671875, - "learning_rate": 0.0001993822891578708, - "loss": 1.1054, + "grad_norm": 0.365234375, + "learning_rate": 0.00019421133893636854, + "loss": 1.0321, "step": 580 }, { "epoch": 1.9965870307167235, - "grad_norm": 0.68359375, - "learning_rate": 0.00019933742871556, - "loss": 1.1135, + "grad_norm": 0.298828125, + "learning_rate": 0.00019400994198770274, + "loss": 1.0415, "step": 585 }, { "epoch": 2.0, - "eval_loss": 2.4516425132751465, - "eval_runtime": 0.5387, - "eval_samples_per_second": 18.563, - "eval_steps_per_second": 1.856, + "eval_loss": 2.451392650604248, + "eval_runtime": 0.5484, + "eval_samples_per_second": 18.236, + "eval_steps_per_second": 1.824, "step": 586 }, { "epoch": 2.013651877133106, - "grad_norm": 0.66015625, - "learning_rate": 0.00019929100130887782, - "loss": 1.1079, + "grad_norm": 0.287109375, + "learning_rate": 0.00019380520930055602, + "loss": 1.0194, "step": 590 }, { "epoch": 2.030716723549488, - "grad_norm": 0.94140625, - "learning_rate": 0.0001992430076701775, - "loss": 1.088, + "grad_norm": 0.412109375, + "learning_rate": 0.0001935971481394227, + "loss": 0.9985, "step": 595 }, { "epoch": 2.04778156996587, - "grad_norm": 0.400390625, - "learning_rate": 0.00019919344855651833, - "loss": 1.0921, + "grad_norm": 0.28515625, + "learning_rate": 0.00019338576588690104, + "loss": 1.0026, "step": 600 }, { "epoch": 2.0648464163822524, - "grad_norm": 0.59375, - "learning_rate": 0.00019914232474965365, - "loss": 1.0909, + "grad_norm": 0.34765625, + "learning_rate": 0.00019317107004343078, + "loss": 1.0018, "step": 605 }, { "epoch": 2.0819112627986347, - "grad_norm": 0.42578125, - "learning_rate": 0.00019908963705601846, - "loss": 1.0986, + "grad_norm": 0.310546875, + "learning_rate": 0.0001929530682270274, + "loss": 1.0096, "step": 610 }, { "epoch": 2.098976109215017, - "grad_norm": 0.435546875, - "learning_rate": 0.0001990353863067169, - "loss": 1.0925, + "grad_norm": 0.326171875, + "learning_rate": 0.0001927317681730115, + "loss": 1.0047, "step": 615 }, { "epoch": 2.1160409556313993, - "grad_norm": 0.640625, - "learning_rate": 0.00019897957335750878, - "loss": 1.0887, + "grad_norm": 0.4609375, + "learning_rate": 0.00019250717773373462, + "loss": 0.9998, "step": 620 }, { "epoch": 2.1331058020477816, - "grad_norm": 0.5078125, - "learning_rate": 0.00019892219908879653, - "loss": 1.0991, + "grad_norm": 0.333984375, + "learning_rate": 0.00019227930487830035, + "loss": 1.0121, "step": 625 }, { "epoch": 2.150170648464164, - "grad_norm": 0.416015625, - "learning_rate": 0.00019886326440561093, - "loss": 1.0949, + "grad_norm": 0.326171875, + "learning_rate": 0.00019204815769228176, + "loss": 1.0064, "step": 630 }, { "epoch": 2.167235494880546, - "grad_norm": 0.373046875, - "learning_rate": 0.00019880277023759702, - "loss": 1.0841, + "grad_norm": 0.294921875, + "learning_rate": 0.00019181374437743438, + "loss": 0.9968, "step": 635 }, { "epoch": 2.1843003412969284, - "grad_norm": 0.78515625, - "learning_rate": 0.0001987407175389994, - "loss": 1.0947, + "grad_norm": 0.2890625, + "learning_rate": 0.00019157607325140524, + "loss": 1.0046, "step": 640 }, { "epoch": 2.2013651877133107, - "grad_norm": 0.42578125, - "learning_rate": 0.0001986771072886472, - "loss": 1.1026, + "grad_norm": 0.43359375, + "learning_rate": 0.00019133515274743771, + "loss": 1.0161, "step": 645 }, { "epoch": 2.218430034129693, - "grad_norm": 0.392578125, - "learning_rate": 0.00019861194048993863, - "loss": 1.0918, + "grad_norm": 0.3203125, + "learning_rate": 0.00019109099141407233, + "loss": 1.004, "step": 650 }, { "epoch": 2.2354948805460753, "grad_norm": 0.41015625, - "learning_rate": 0.0001985452181708251, - "loss": 1.0903, + "learning_rate": 0.0001908435979148434, + "loss": 1.0071, "step": 655 }, { "epoch": 2.252559726962457, - "grad_norm": 0.7109375, - "learning_rate": 0.00019847694138379506, - "loss": 1.0978, + "grad_norm": 0.318359375, + "learning_rate": 0.00019059298102797146, + "loss": 1.0117, "step": 660 }, { "epoch": 2.26962457337884, - "grad_norm": 0.4609375, - "learning_rate": 0.0001984071112058574, - "loss": 1.0864, + "grad_norm": 0.3671875, + "learning_rate": 0.0001903391496460522, + "loss": 0.9996, "step": 665 }, { "epoch": 2.2866894197952217, - "grad_norm": 0.341796875, - "learning_rate": 0.00019833572873852444, - "loss": 1.0896, + "grad_norm": 0.31640625, + "learning_rate": 0.0001900821127757405, + "loss": 1.0038, "step": 670 }, { "epoch": 2.303754266211604, - "grad_norm": 0.53125, - "learning_rate": 0.00019826279510779454, - "loss": 1.0962, + "grad_norm": 0.5078125, + "learning_rate": 0.0001898218795374311, + "loss": 1.0105, "step": 675 }, { "epoch": 2.3208191126279862, - "grad_norm": 0.54296875, - "learning_rate": 0.00019818831146413434, - "loss": 1.0766, + "grad_norm": 0.451171875, + "learning_rate": 0.0001895584591649349, + "loss": 0.9929, "step": 680 }, { "epoch": 2.3378839590443685, - "grad_norm": 0.337890625, - "learning_rate": 0.0001981122789824607, - "loss": 1.0853, + "grad_norm": 0.333984375, + "learning_rate": 0.00018929186100515136, + "loss": 1.0018, "step": 685 }, { "epoch": 2.354948805460751, - "grad_norm": 0.69140625, - "learning_rate": 0.0001980346988621221, - "loss": 1.0788, + "grad_norm": 0.498046875, + "learning_rate": 0.00018902209451773674, + "loss": 0.9955, "step": 690 }, { "epoch": 2.372013651877133, - "grad_norm": 0.96875, - "learning_rate": 0.00019795557232687956, - "loss": 1.0804, + "grad_norm": 0.4375, + "learning_rate": 0.0001887491692747686, + "loss": 0.9953, "step": 695 }, { "epoch": 2.3890784982935154, - "grad_norm": 0.470703125, - "learning_rate": 0.0001978749006248877, - "loss": 1.0674, + "grad_norm": 0.37890625, + "learning_rate": 0.000188473094960406, + "loss": 0.9833, "step": 700 }, { "epoch": 2.4061433447098977, - "grad_norm": 0.326171875, - "learning_rate": 0.00019779268502867473, - "loss": 1.0931, + "grad_norm": 0.291015625, + "learning_rate": 0.00018819388137054604, + "loss": 1.0089, "step": 705 }, { "epoch": 2.42320819112628, - "grad_norm": 0.458984375, - "learning_rate": 0.0001977089268351225, - "loss": 1.0854, + "grad_norm": 0.287109375, + "learning_rate": 0.00018791153841247614, + "loss": 1.0031, "step": 710 }, { "epoch": 2.4402730375426622, - "grad_norm": 0.43359375, - "learning_rate": 0.00019762362736544607, - "loss": 1.0858, + "grad_norm": 0.279296875, + "learning_rate": 0.00018762607610452254, + "loss": 1.002, "step": 715 }, { "epoch": 2.4573378839590445, - "grad_norm": 0.396484375, - "learning_rate": 0.00019753678796517282, - "loss": 1.0835, + "grad_norm": 0.333984375, + "learning_rate": 0.00018733750457569485, + "loss": 1.0003, "step": 720 }, { "epoch": 2.474402730375427, - "grad_norm": 0.59375, - "learning_rate": 0.00019744841000412123, - "loss": 1.0881, + "grad_norm": 0.361328125, + "learning_rate": 0.00018704583406532662, + "loss": 1.004, "step": 725 }, { "epoch": 2.491467576791809, - "grad_norm": 0.6171875, - "learning_rate": 0.00019735849487637929, - "loss": 1.091, + "grad_norm": 0.31640625, + "learning_rate": 0.00018675107492271208, + "loss": 1.0075, "step": 730 }, { "epoch": 2.508532423208191, - "grad_norm": 0.5625, - "learning_rate": 0.0001972670440002825, - "loss": 1.0877, + "grad_norm": 0.2890625, + "learning_rate": 0.0001864532376067387, + "loss": 1.0035, "step": 735 }, { "epoch": 2.5255972696245736, - "grad_norm": 0.419921875, - "learning_rate": 0.00019717405881839145, - "loss": 1.0777, + "grad_norm": 0.33203125, + "learning_rate": 0.00018615233268551643, + "loss": 0.9968, "step": 740 }, { "epoch": 2.5426621160409555, - "grad_norm": 0.380859375, - "learning_rate": 0.00019707954079746927, - "loss": 1.0934, + "grad_norm": 0.294921875, + "learning_rate": 0.00018584837083600244, + "loss": 1.0124, "step": 745 }, { "epoch": 2.5597269624573378, - "grad_norm": 0.439453125, - "learning_rate": 0.00019698349142845814, - "loss": 1.085, + "grad_norm": 0.302734375, + "learning_rate": 0.00018554136284362237, + "loss": 1.0012, "step": 750 }, { "epoch": 2.57679180887372, - "grad_norm": 0.38671875, - "learning_rate": 0.00019688591222645607, - "loss": 1.0744, + "grad_norm": 0.291015625, + "learning_rate": 0.00018523131960188755, + "loss": 0.9915, "step": 755 }, { "epoch": 2.5938566552901023, - "grad_norm": 0.4375, - "learning_rate": 0.00019678680473069293, - "loss": 1.0818, + "grad_norm": 0.365234375, + "learning_rate": 0.0001849182521120087, + "loss": 0.9996, "step": 760 }, { "epoch": 2.6109215017064846, - "grad_norm": 0.3984375, - "learning_rate": 0.00019668617050450603, - "loss": 1.0824, + "grad_norm": 0.2890625, + "learning_rate": 0.00018460217148250524, + "loss": 0.9975, "step": 765 }, { "epoch": 2.627986348122867, - "grad_norm": 0.4921875, - "learning_rate": 0.00019658401113531565, - "loss": 1.0828, + "grad_norm": 0.33203125, + "learning_rate": 0.0001842830889288114, + "loss": 1.0008, "step": 770 }, { "epoch": 2.645051194539249, - "grad_norm": 1.09375, - "learning_rate": 0.00019648032823459994, - "loss": 1.0884, + "grad_norm": 0.341796875, + "learning_rate": 0.00018396101577287813, + "loss": 1.0041, "step": 775 }, { "epoch": 2.6621160409556315, - "grad_norm": 0.55859375, - "learning_rate": 0.00019637512343786937, - "loss": 1.0835, + "grad_norm": 0.33984375, + "learning_rate": 0.00018363596344277144, + "loss": 0.9995, "step": 780 }, { "epoch": 2.6791808873720138, - "grad_norm": 0.484375, - "learning_rate": 0.00019626839840464119, - "loss": 1.0828, + "grad_norm": 0.30078125, + "learning_rate": 0.0001833079434722668, + "loss": 1.002, "step": 785 }, { "epoch": 2.696245733788396, - "grad_norm": 0.376953125, - "learning_rate": 0.0001961601548184129, - "loss": 1.0881, + "grad_norm": 0.36328125, + "learning_rate": 0.00018297696750044, + "loss": 1.0057, "step": 790 }, { "epoch": 2.7133105802047783, - "grad_norm": 0.35546875, - "learning_rate": 0.00019605039438663614, - "loss": 1.0772, + "grad_norm": 0.30859375, + "learning_rate": 0.00018264304727125407, + "loss": 0.9966, "step": 795 }, { "epoch": 2.73037542662116, - "grad_norm": 0.349609375, - "learning_rate": 0.0001959391188406893, - "loss": 1.0677, + "grad_norm": 0.384765625, + "learning_rate": 0.00018230619463314266, + "loss": 0.9887, "step": 800 }, { "epoch": 2.747440273037543, - "grad_norm": 0.486328125, - "learning_rate": 0.00019582632993585052, - "loss": 1.0815, + "grad_norm": 0.373046875, + "learning_rate": 0.00018196642153858958, + "loss": 0.9993, "step": 805 }, { "epoch": 2.7645051194539247, - "grad_norm": 0.470703125, - "learning_rate": 0.00019571202945126994, - "loss": 1.0763, + "grad_norm": 0.408203125, + "learning_rate": 0.00018162374004370463, + "loss": 0.9953, "step": 810 }, { "epoch": 2.781569965870307, - "grad_norm": 0.396484375, - "learning_rate": 0.0001955962191899415, - "loss": 1.0684, + "grad_norm": 0.283203125, + "learning_rate": 0.0001812781623077959, + "loss": 0.9856, "step": 815 }, { "epoch": 2.7986348122866893, - "grad_norm": 0.373046875, - "learning_rate": 0.00019547890097867468, - "loss": 1.0847, + "grad_norm": 0.294921875, + "learning_rate": 0.00018092970059293835, + "loss": 1.0029, "step": 820 }, { "epoch": 2.8156996587030716, - "grad_norm": 0.474609375, - "learning_rate": 0.00019536007666806556, - "loss": 1.071, + "grad_norm": 0.37109375, + "learning_rate": 0.0001805783672635386, + "loss": 0.991, "step": 825 }, { "epoch": 2.832764505119454, - "grad_norm": 0.380859375, - "learning_rate": 0.00019523974813246767, - "loss": 1.0873, + "grad_norm": 0.298828125, + "learning_rate": 0.00018022417478589627, + "loss": 1.0053, "step": 830 }, { "epoch": 2.849829351535836, - "grad_norm": 0.40234375, - "learning_rate": 0.00019511791726996243, - "loss": 1.0676, + "grad_norm": 0.3359375, + "learning_rate": 0.00017986713572776174, + "loss": 0.9865, "step": 835 }, { "epoch": 2.8668941979522184, - "grad_norm": 0.51953125, - "learning_rate": 0.0001949945860023292, - "loss": 1.0748, + "grad_norm": 0.271484375, + "learning_rate": 0.00017950726275789, + "loss": 0.9948, "step": 840 }, { "epoch": 2.8839590443686007, - "grad_norm": 0.384765625, - "learning_rate": 0.00019486975627501502, - "loss": 1.0716, + "grad_norm": 0.38671875, + "learning_rate": 0.00017914456864559126, + "loss": 0.9916, "step": 845 }, { "epoch": 2.901023890784983, - "grad_norm": 0.38671875, - "learning_rate": 0.0001947434300571038, - "loss": 1.0777, + "grad_norm": 0.345703125, + "learning_rate": 0.0001787790662602779, + "loss": 0.9985, "step": 850 }, { "epoch": 2.9180887372013653, - "grad_norm": 0.365234375, - "learning_rate": 0.00019461560934128533, - "loss": 1.0733, + "grad_norm": 0.34375, + "learning_rate": 0.00017841076857100767, + "loss": 0.994, "step": 855 }, { "epoch": 2.9351535836177476, - "grad_norm": 0.42578125, - "learning_rate": 0.0001944862961438239, - "loss": 1.0582, + "grad_norm": 0.265625, + "learning_rate": 0.0001780396886460237, + "loss": 0.9811, "step": 860 }, { "epoch": 2.9522184300341294, - "grad_norm": 0.462890625, - "learning_rate": 0.00019435549250452645, - "loss": 1.0657, + "grad_norm": 0.458984375, + "learning_rate": 0.00017766583965229065, + "loss": 0.9872, "step": 865 }, { "epoch": 2.969283276450512, - "grad_norm": 1.1171875, - "learning_rate": 0.0001942232004867103, - "loss": 1.0746, + "grad_norm": 0.37109375, + "learning_rate": 0.00017728923485502759, + "loss": 0.9951, "step": 870 }, { "epoch": 2.986348122866894, - "grad_norm": 0.466796875, - "learning_rate": 0.0001940894221771708, - "loss": 1.0715, + "grad_norm": 0.365234375, + "learning_rate": 0.00017690988761723725, + "loss": 0.9915, "step": 875 }, { "epoch": 3.0, - "eval_loss": 2.447284698486328, - "eval_runtime": 0.553, - "eval_samples_per_second": 18.083, - "eval_steps_per_second": 1.808, + "eval_loss": 2.4749691486358643, + "eval_runtime": 0.5425, + "eval_samples_per_second": 18.434, + "eval_steps_per_second": 1.843, "step": 879 }, { "epoch": 3.0034129692832763, - "grad_norm": 0.80859375, - "learning_rate": 0.00019395415968614813, - "loss": 1.0736, + "grad_norm": 0.39453125, + "learning_rate": 0.00017652781139923196, + "loss": 0.9883, "step": 880 }, { "epoch": 3.0204778156996586, - "grad_norm": 0.47265625, - "learning_rate": 0.00019381741514729443, - "loss": 1.0618, + "grad_norm": 0.5078125, + "learning_rate": 0.000176143019758156, + "loss": 0.9611, "step": 885 }, { "epoch": 3.037542662116041, - "grad_norm": 0.390625, - "learning_rate": 0.0001936791907176397, - "loss": 1.0571, + "grad_norm": 0.359375, + "learning_rate": 0.0001757555263475044, + "loss": 0.9542, "step": 890 }, { "epoch": 3.054607508532423, - "grad_norm": 0.46484375, - "learning_rate": 0.00019353948857755803, - "loss": 1.0626, + "grad_norm": 0.326171875, + "learning_rate": 0.00017536534491663873, + "loss": 0.9614, "step": 895 }, { "epoch": 3.0716723549488054, - "grad_norm": 0.357421875, - "learning_rate": 0.00019339831093073318, - "loss": 1.053, + "grad_norm": 0.349609375, + "learning_rate": 0.00017497248931029914, + "loss": 0.9538, "step": 900 }, { "epoch": 3.0887372013651877, - "grad_norm": 0.380859375, - "learning_rate": 0.00019325566000412376, - "loss": 1.06, + "grad_norm": 0.30859375, + "learning_rate": 0.000174576973468113, + "loss": 0.9581, "step": 905 }, { "epoch": 3.10580204778157, - "grad_norm": 0.38671875, - "learning_rate": 0.0001931115380479281, - "loss": 1.0452, + "grad_norm": 0.31640625, + "learning_rate": 0.00017417881142410037, + "loss": 0.9466, "step": 910 }, { "epoch": 3.1228668941979523, - "grad_norm": 0.515625, - "learning_rate": 0.00019296594733554892, - "loss": 1.0642, + "grad_norm": 0.298828125, + "learning_rate": 0.00017377801730617613, + "loss": 0.9632, "step": 915 }, { "epoch": 3.1399317406143346, - "grad_norm": 0.5, - "learning_rate": 0.0001928188901635571, - "loss": 1.0474, + "grad_norm": 0.298828125, + "learning_rate": 0.00017337460533564845, + "loss": 0.948, "step": 920 }, { "epoch": 3.156996587030717, - "grad_norm": 0.380859375, - "learning_rate": 0.00019267036885165588, - "loss": 1.0526, + "grad_norm": 0.310546875, + "learning_rate": 0.00017296858982671442, + "loss": 0.9515, "step": 925 }, { "epoch": 3.174061433447099, - "grad_norm": 0.4296875, - "learning_rate": 0.00019252038574264405, - "loss": 1.061, + "grad_norm": 0.29296875, + "learning_rate": 0.00017255998518595194, + "loss": 0.9625, "step": 930 }, { "epoch": 3.1911262798634814, - "grad_norm": 0.443359375, - "learning_rate": 0.00019236894320237894, - "loss": 1.0519, + "grad_norm": 0.28125, + "learning_rate": 0.00017214880591180873, + "loss": 0.9532, "step": 935 }, { "epoch": 3.2081911262798632, - "grad_norm": 0.458984375, - "learning_rate": 0.00019221604361973919, - "loss": 1.0479, - "step": 940 + "grad_norm": 0.326171875, + "learning_rate": 0.0001717350665940877, + "loss": 0.9499, + "step": 940 }, { "epoch": 3.2252559726962455, - "grad_norm": 0.50390625, - "learning_rate": 0.00019206168940658712, - "loss": 1.049, + "grad_norm": 0.333984375, + "learning_rate": 0.00017131878191342932, + "loss": 0.9505, "step": 945 }, { "epoch": 3.242320819112628, - "grad_norm": 0.462890625, - "learning_rate": 0.00019190588299773062, - "loss": 1.0474, + "grad_norm": 0.376953125, + "learning_rate": 0.00017089996664079084, + "loss": 0.9489, "step": 950 }, { "epoch": 3.25938566552901, - "grad_norm": 0.462890625, - "learning_rate": 0.00019174862685088472, - "loss": 1.06, + "grad_norm": 0.310546875, + "learning_rate": 0.00017047863563692198, + "loss": 0.9623, "step": 955 }, { "epoch": 3.2764505119453924, - "grad_norm": 0.373046875, - "learning_rate": 0.0001915899234466328, - "loss": 1.0464, + "grad_norm": 0.267578125, + "learning_rate": 0.00017005480385183774, + "loss": 0.9474, "step": 960 }, { "epoch": 3.2935153583617747, - "grad_norm": 0.48046875, - "learning_rate": 0.00019142977528838762, - "loss": 1.0531, + "grad_norm": 0.333984375, + "learning_rate": 0.00016962848632428795, + "loss": 0.9558, "step": 965 }, { "epoch": 3.310580204778157, - "grad_norm": 0.380859375, - "learning_rate": 0.0001912681849023516, - "loss": 1.0518, + "grad_norm": 0.34765625, + "learning_rate": 0.00016919969818122345, + "loss": 0.9538, "step": 970 }, { "epoch": 3.3276450511945392, - "grad_norm": 0.447265625, - "learning_rate": 0.00019110515483747716, - "loss": 1.0535, + "grad_norm": 0.5, + "learning_rate": 0.00016876845463725975, + "loss": 0.955, "step": 975 }, { "epoch": 3.3447098976109215, - "grad_norm": 0.625, - "learning_rate": 0.0001909406876654264, - "loss": 1.0559, + "grad_norm": 1.8203125, + "learning_rate": 0.0001683347709941367, + "loss": 0.9615, "step": 980 }, { "epoch": 3.361774744027304, - "grad_norm": 0.51953125, - "learning_rate": 0.00019077478598053063, - "loss": 1.0528, + "grad_norm": 0.44921875, + "learning_rate": 0.0001678986626401759, + "loss": 0.9591, "step": 985 }, { "epoch": 3.378839590443686, - "grad_norm": 0.46875, - "learning_rate": 0.00019060745239974936, - "loss": 1.0431, + "grad_norm": 0.3671875, + "learning_rate": 0.00016746014504973448, + "loss": 0.9479, "step": 990 }, { "epoch": 3.3959044368600684, - "grad_norm": 0.63671875, - "learning_rate": 0.0001904386895626291, - "loss": 1.0456, + "grad_norm": 0.419921875, + "learning_rate": 0.00016701923378265615, + "loss": 0.9511, "step": 995 }, { "epoch": 3.4129692832764507, - "grad_norm": 0.48828125, - "learning_rate": 0.00019026850013126157, - "loss": 1.0579, + "grad_norm": 0.3203125, + "learning_rate": 0.00016657594448371896, + "loss": 0.962, "step": 1000 }, { "epoch": 3.430034129692833, - "grad_norm": 0.625, - "learning_rate": 0.0001900968867902419, - "loss": 1.0592, + "grad_norm": 0.4296875, + "learning_rate": 0.0001661302928820803, + "loss": 0.9612, "step": 1005 }, { "epoch": 3.4470989761092152, - "grad_norm": 0.51171875, - "learning_rate": 0.00018992385224662623, - "loss": 1.0476, + "grad_norm": 0.455078125, + "learning_rate": 0.00016568229479071872, + "loss": 0.9524, "step": 1010 }, { "epoch": 3.464163822525597, - "grad_norm": 0.470703125, - "learning_rate": 0.00018974939922988883, - "loss": 1.0517, + "grad_norm": 0.306640625, + "learning_rate": 0.0001652319661058729, + "loss": 0.9557, "step": 1015 }, { "epoch": 3.4812286689419794, - "grad_norm": 0.423828125, - "learning_rate": 0.00018957353049187936, - "loss": 1.0607, + "grad_norm": 0.26953125, + "learning_rate": 0.00016477932280647747, + "loss": 0.9635, "step": 1020 }, { "epoch": 3.4982935153583616, - "grad_norm": 0.4765625, - "learning_rate": 0.00018939624880677918, - "loss": 1.0502, + "grad_norm": 0.28515625, + "learning_rate": 0.00016432438095359623, + "loss": 0.9549, "step": 1025 }, { "epoch": 3.515358361774744, - "grad_norm": 0.3671875, - "learning_rate": 0.0001892175569710577, - "loss": 1.041, + "grad_norm": 0.28515625, + "learning_rate": 0.00016386715668985211, + "loss": 0.9456, "step": 1030 }, { "epoch": 3.532423208191126, - "grad_norm": 0.52734375, - "learning_rate": 0.00018903745780342839, - "loss": 1.0382, + "grad_norm": 0.423828125, + "learning_rate": 0.00016340766623885438, + "loss": 0.945, "step": 1035 }, { "epoch": 3.5494880546075085, - "grad_norm": 0.3984375, - "learning_rate": 0.00018885595414480405, - "loss": 1.0426, + "grad_norm": 0.330078125, + "learning_rate": 0.00016294592590462316, + "loss": 0.95, "step": 1040 }, { "epoch": 3.5665529010238908, - "grad_norm": 0.400390625, - "learning_rate": 0.0001886730488582522, - "loss": 1.0524, + "grad_norm": 0.3359375, + "learning_rate": 0.0001624819520710107, + "loss": 0.9583, "step": 1045 }, { "epoch": 3.583617747440273, - "grad_norm": 0.58203125, - "learning_rate": 0.00018848874482894993, - "loss": 1.0371, + "grad_norm": 0.36328125, + "learning_rate": 0.00016201576120112007, + "loss": 0.9443, "step": 1050 }, { "epoch": 3.6006825938566553, - "grad_norm": 0.412109375, - "learning_rate": 0.00018830304496413822, - "loss": 1.0571, + "grad_norm": 0.345703125, + "learning_rate": 0.0001615473698367212, + "loss": 0.9635, "step": 1055 }, { "epoch": 3.6177474402730376, - "grad_norm": 0.33984375, - "learning_rate": 0.00018811595219307622, - "loss": 1.0458, + "grad_norm": 0.51953125, + "learning_rate": 0.00016107679459766367, + "loss": 0.9524, "step": 1060 }, { "epoch": 3.63481228668942, - "grad_norm": 0.455078125, - "learning_rate": 0.000187927469466995, - "loss": 1.0474, + "grad_norm": 0.279296875, + "learning_rate": 0.0001606040521812872, + "loss": 0.9552, "step": 1065 }, { "epoch": 3.651877133105802, - "grad_norm": 0.37109375, - "learning_rate": 0.00018773759975905098, - "loss": 1.0438, + "grad_norm": 0.3125, + "learning_rate": 0.00016012915936182892, + "loss": 0.9502, "step": 1070 }, { "epoch": 3.6689419795221845, - "grad_norm": 0.384765625, - "learning_rate": 0.00018754634606427914, - "loss": 1.0577, + "grad_norm": 0.3125, + "learning_rate": 0.00015965213298982855, + "loss": 0.9629, "step": 1075 }, { "epoch": 3.6860068259385663, - "grad_norm": 0.435546875, - "learning_rate": 0.00018735371139954558, - "loss": 1.0522, + "grad_norm": 0.361328125, + "learning_rate": 0.00015917298999153015, + "loss": 0.9591, "step": 1080 }, { "epoch": 3.703071672354949, - "grad_norm": 0.55859375, - "learning_rate": 0.0001871596988035001, - "loss": 1.0622, + "grad_norm": 0.296875, + "learning_rate": 0.00015869174736828168, + "loss": 0.9699, "step": 1085 }, { "epoch": 3.720136518771331, - "grad_norm": 0.53125, - "learning_rate": 0.00018696431133652817, - "loss": 1.0404, + "grad_norm": 0.30078125, + "learning_rate": 0.00015820842219593182, + "loss": 0.9478, "step": 1090 }, { "epoch": 3.737201365187713, - "grad_norm": 0.41796875, - "learning_rate": 0.00018676755208070275, - "loss": 1.0576, + "grad_norm": 0.33984375, + "learning_rate": 0.00015772303162422385, + "loss": 0.9646, "step": 1095 }, { "epoch": 3.7542662116040955, - "grad_norm": 0.396484375, - "learning_rate": 0.00018656942413973555, - "loss": 1.0525, + "grad_norm": 0.404296875, + "learning_rate": 0.00015723559287618728, + "loss": 0.9601, "step": 1100 }, { "epoch": 3.7713310580204777, - "grad_norm": 0.392578125, - "learning_rate": 0.0001863699306389282, - "loss": 1.047, + "grad_norm": 0.421875, + "learning_rate": 0.00015674612324752683, + "loss": 0.9548, "step": 1105 }, { "epoch": 3.78839590443686, - "grad_norm": 0.54296875, - "learning_rate": 0.0001861690747251228, - "loss": 1.0547, + "grad_norm": 0.32421875, + "learning_rate": 0.00015625464010600844, + "loss": 0.9625, "step": 1110 }, { "epoch": 3.8054607508532423, - "grad_norm": 0.455078125, - "learning_rate": 0.00018596685956665245, - "loss": 1.0366, + "grad_norm": 0.287109375, + "learning_rate": 0.00015576116089084327, + "loss": 0.9448, "step": 1115 }, { "epoch": 3.8225255972696246, - "grad_norm": 0.373046875, - "learning_rate": 0.00018576328835329117, - "loss": 1.0444, + "grad_norm": 0.28125, + "learning_rate": 0.00015526570311206884, + "loss": 0.9547, "step": 1120 }, { "epoch": 3.839590443686007, - "grad_norm": 0.498046875, - "learning_rate": 0.00018555836429620358, - "loss": 1.0428, + "grad_norm": 0.275390625, + "learning_rate": 0.00015476828434992762, + "loss": 0.9527, "step": 1125 }, { "epoch": 3.856655290102389, - "grad_norm": 0.4453125, - "learning_rate": 0.00018535209062789433, - "loss": 1.0425, + "grad_norm": 0.326171875, + "learning_rate": 0.00015426892225424337, + "loss": 0.9499, "step": 1130 }, { "epoch": 3.8737201365187715, - "grad_norm": 0.392578125, - "learning_rate": 0.00018514447060215698, - "loss": 1.0503, + "grad_norm": 0.271484375, + "learning_rate": 0.00015376763454379478, + "loss": 0.9593, "step": 1135 }, { "epoch": 3.8907849829351537, - "grad_norm": 0.384765625, - "learning_rate": 0.00018493550749402278, - "loss": 1.0376, + "grad_norm": 0.314453125, + "learning_rate": 0.0001532644390056868, + "loss": 0.9457, "step": 1140 }, { "epoch": 3.9078498293515356, - "grad_norm": 0.3984375, - "learning_rate": 0.00018472520459970898, - "loss": 1.054, + "grad_norm": 0.49609375, + "learning_rate": 0.00015275935349471959, + "loss": 0.9622, "step": 1145 }, { "epoch": 3.9249146757679183, - "grad_norm": 0.44921875, - "learning_rate": 0.0001845135652365668, - "loss": 1.0491, + "grad_norm": 0.3125, + "learning_rate": 0.00015225239593275473, + "loss": 0.9584, "step": 1150 }, { "epoch": 3.9419795221843, - "grad_norm": 0.37890625, - "learning_rate": 0.00018430059274302917, - "loss": 1.0454, + "grad_norm": 0.29296875, + "learning_rate": 0.00015174358430807957, + "loss": 0.9547, "step": 1155 }, { "epoch": 3.9590443686006824, - "grad_norm": 0.365234375, - "learning_rate": 0.00018408629047855804, - "loss": 1.0466, + "grad_norm": 0.28515625, + "learning_rate": 0.00015123293667476887, + "loss": 0.9546, "step": 1160 }, { "epoch": 3.9761092150170647, - "grad_norm": 0.34765625, - "learning_rate": 0.00018387066182359133, - "loss": 1.0356, + "grad_norm": 0.345703125, + "learning_rate": 0.00015072047115204397, + "loss": 0.945, "step": 1165 }, { "epoch": 3.993174061433447, - "grad_norm": 0.357421875, - "learning_rate": 0.00018365371017948964, - "loss": 1.0471, + "grad_norm": 0.3203125, + "learning_rate": 0.00015020620592363034, + "loss": 0.9551, "step": 1170 }, { "epoch": 4.0, - "eval_loss": 2.452413558959961, - "eval_runtime": 0.5427, - "eval_samples_per_second": 18.427, - "eval_steps_per_second": 1.843, + "eval_loss": 2.529212474822998, + "eval_runtime": 0.5437, + "eval_samples_per_second": 18.394, + "eval_steps_per_second": 1.839, "step": 1172 }, { "epoch": 4.010238907849829, - "grad_norm": 0.47265625, - "learning_rate": 0.00018343543896848273, - "loss": 1.0282, + "grad_norm": 0.31640625, + "learning_rate": 0.00014969015923711195, + "loss": 0.925, "step": 1175 }, { "epoch": 4.027303754266212, - "grad_norm": 0.41796875, - "learning_rate": 0.00018321585163361527, - "loss": 1.0262, + "grad_norm": 0.306640625, + "learning_rate": 0.00014917234940328396, + "loss": 0.9111, "step": 1180 }, { "epoch": 4.044368600682594, - "grad_norm": 0.365234375, - "learning_rate": 0.00018299495163869275, - "loss": 1.0263, + "grad_norm": 0.3203125, + "learning_rate": 0.00014865279479550292, + "loss": 0.9124, "step": 1185 }, { "epoch": 4.061433447098976, - "grad_norm": 0.359375, - "learning_rate": 0.0001827727424682268, - "loss": 1.0265, + "grad_norm": 0.3125, + "learning_rate": 0.00014813151384903493, + "loss": 0.912, "step": 1190 }, { "epoch": 4.078498293515358, - "grad_norm": 0.375, - "learning_rate": 0.00018254922762738008, - "loss": 1.0266, + "grad_norm": 0.3125, + "learning_rate": 0.00014760852506040162, + "loss": 0.9113, "step": 1195 }, { "epoch": 4.09556313993174, - "grad_norm": 0.3828125, - "learning_rate": 0.00018232441064191125, - "loss": 1.0326, + "grad_norm": 0.298828125, + "learning_rate": 0.0001470838469867234, + "loss": 0.9168, "step": 1200 }, { "epoch": 4.112627986348123, - "grad_norm": 0.3828125, - "learning_rate": 0.0001820982950581191, - "loss": 1.0278, + "grad_norm": 0.310546875, + "learning_rate": 0.00014655749824506151, + "loss": 0.9152, "step": 1205 }, { "epoch": 4.129692832764505, - "grad_norm": 0.46484375, - "learning_rate": 0.00018187088444278674, - "loss": 1.0206, + "grad_norm": 0.3046875, + "learning_rate": 0.00014602949751175713, + "loss": 0.9098, "step": 1210 }, { "epoch": 4.146757679180888, - "grad_norm": 0.4140625, - "learning_rate": 0.00018164218238312535, - "loss": 1.037, + "grad_norm": 0.333984375, + "learning_rate": 0.00014549986352176882, + "loss": 0.9213, "step": 1215 }, { "epoch": 4.163822525597269, - "grad_norm": 0.3671875, - "learning_rate": 0.00018141219248671745, - "loss": 1.0229, + "grad_norm": 0.341796875, + "learning_rate": 0.00014496861506800758, + "loss": 0.9128, "step": 1220 }, { "epoch": 4.180887372013652, - "grad_norm": 0.376953125, - "learning_rate": 0.00018118091838146029, - "loss": 1.0223, + "grad_norm": 0.29296875, + "learning_rate": 0.0001444357710006703, + "loss": 0.9102, "step": 1225 }, { "epoch": 4.197952218430034, - "grad_norm": 0.373046875, - "learning_rate": 0.00018094836371550824, - "loss": 1.0175, + "grad_norm": 0.330078125, + "learning_rate": 0.0001439013502265707, + "loss": 0.9058, "step": 1230 }, { "epoch": 4.215017064846417, "grad_norm": 0.380859375, - "learning_rate": 0.00018071453215721554, - "loss": 1.0369, + "learning_rate": 0.00014336537170846848, + "loss": 0.9233, "step": 1235 }, { "epoch": 4.2320819112627985, - "grad_norm": 0.41015625, - "learning_rate": 0.00018047942739507836, - "loss": 1.0182, + "grad_norm": 0.404296875, + "learning_rate": 0.00014282785446439653, + "loss": 0.9092, "step": 1240 }, { "epoch": 4.249146757679181, - "grad_norm": 0.421875, - "learning_rate": 0.00018024305313767646, - "loss": 1.0192, + "grad_norm": 0.40234375, + "learning_rate": 0.00014228881756698603, + "loss": 0.9093, "step": 1245 }, { "epoch": 4.266211604095563, - "grad_norm": 0.40625, - "learning_rate": 0.000180005413113615, - "loss": 1.0427, + "grad_norm": 0.306640625, + "learning_rate": 0.00014174828014278985, + "loss": 0.9271, "step": 1250 }, { "epoch": 4.283276450511945, - "grad_norm": 0.42578125, - "learning_rate": 0.00017976651107146533, - "loss": 1.0313, + "grad_norm": 0.3203125, + "learning_rate": 0.00014120626137160375, + "loss": 0.9189, "step": 1255 }, { "epoch": 4.300341296928328, - "grad_norm": 0.359375, - "learning_rate": 0.0001795263507797063, - "loss": 1.0195, + "grad_norm": 0.28125, + "learning_rate": 0.00014066278048578584, + "loss": 0.9078, "step": 1260 }, { "epoch": 4.3174061433447095, - "grad_norm": 0.453125, - "learning_rate": 0.00017928493602666445, - "loss": 1.0222, + "grad_norm": 0.298828125, + "learning_rate": 0.00014011785676957422, + "loss": 0.9115, "step": 1265 }, { "epoch": 4.334470989761092, - "grad_norm": 0.5546875, - "learning_rate": 0.00017904227062045437, - "loss": 1.0183, + "grad_norm": 0.33203125, + "learning_rate": 0.00013957150955840267, + "loss": 0.9099, "step": 1270 }, { "epoch": 4.351535836177474, - "grad_norm": 0.6328125, - "learning_rate": 0.00017879835838891875, - "loss": 1.0321, + "grad_norm": 0.28125, + "learning_rate": 0.0001390237582382147, + "loss": 0.9208, "step": 1275 }, { "epoch": 4.368600682593857, - "grad_norm": 0.7265625, - "learning_rate": 0.00017855320317956784, - "loss": 1.0241, + "grad_norm": 0.3125, + "learning_rate": 0.00013847462224477538, + "loss": 0.9133, "step": 1280 }, { "epoch": 4.385665529010239, - "grad_norm": 0.380859375, - "learning_rate": 0.00017830680885951887, - "loss": 1.019, + "grad_norm": 0.328125, + "learning_rate": 0.00013792412106298198, + "loss": 0.9088, "step": 1285 }, { "epoch": 4.402730375426621, - "grad_norm": 0.7265625, - "learning_rate": 0.00017805917931543492, - "loss": 1.0291, + "grad_norm": 0.328125, + "learning_rate": 0.00013737227422617267, + "loss": 0.9176, "step": 1290 }, { "epoch": 4.419795221843003, - "grad_norm": 0.8671875, - "learning_rate": 0.00017781031845346375, - "loss": 1.0254, + "grad_norm": 0.30078125, + "learning_rate": 0.00013681910131543309, + "loss": 0.9143, "step": 1295 }, { "epoch": 4.436860068259386, - "grad_norm": 0.38671875, - "learning_rate": 0.00017756023019917607, - "loss": 1.0232, + "grad_norm": 0.328125, + "learning_rate": 0.00013626462195890168, + "loss": 0.9148, "step": 1300 }, { "epoch": 4.453924914675768, - "grad_norm": 0.384765625, - "learning_rate": 0.00017730891849750377, - "loss": 1.0267, + "grad_norm": 0.29296875, + "learning_rate": 0.00013570885583107347, + "loss": 0.9165, "step": 1305 }, { "epoch": 4.4709897610921505, - "grad_norm": 0.38671875, - "learning_rate": 0.0001770563873126775, - "loss": 1.0282, + "grad_norm": 0.3203125, + "learning_rate": 0.00013515182265210165, + "loss": 0.9198, "step": 1310 }, { "epoch": 4.488054607508532, - "grad_norm": 0.357421875, - "learning_rate": 0.0001768026406281642, - "loss": 1.0384, + "grad_norm": 0.3046875, + "learning_rate": 0.00013459354218709794, + "loss": 0.9294, "step": 1315 }, { "epoch": 4.505119453924914, - "grad_norm": 0.37109375, - "learning_rate": 0.00017654768244660448, - "loss": 1.0197, + "grad_norm": 0.287109375, + "learning_rate": 0.00013403403424543139, + "loss": 0.9137, "step": 1320 }, { "epoch": 4.522184300341297, - "grad_norm": 0.458984375, - "learning_rate": 0.00017629151678974907, - "loss": 1.023, + "grad_norm": 0.341796875, + "learning_rate": 0.00013347331868002527, + "loss": 0.9172, "step": 1325 }, { "epoch": 4.53924914675768, - "grad_norm": 0.359375, - "learning_rate": 0.00017603414769839577, - "loss": 1.0289, + "grad_norm": 0.294921875, + "learning_rate": 0.0001329114153866529, + "loss": 0.9237, "step": 1330 }, { "epoch": 4.5563139931740615, - "grad_norm": 0.72265625, - "learning_rate": 0.00017577557923232546, - "loss": 1.0222, + "grad_norm": 0.32421875, + "learning_rate": 0.00013234834430323145, + "loss": 0.9144, "step": 1335 }, { "epoch": 4.573378839590443, - "grad_norm": 0.5, - "learning_rate": 0.00017551581547023819, - "loss": 1.0285, + "grad_norm": 0.326171875, + "learning_rate": 0.00013178412540911457, + "loss": 0.9193, "step": 1340 }, { "epoch": 4.590443686006826, - "grad_norm": 0.392578125, - "learning_rate": 0.00017525486050968875, - "loss": 1.0288, + "grad_norm": 0.322265625, + "learning_rate": 0.00013121877872438354, + "loss": 0.9217, "step": 1345 }, { "epoch": 4.607508532423208, - "grad_norm": 0.37890625, - "learning_rate": 0.00017499271846702213, - "loss": 1.0302, + "grad_norm": 0.3359375, + "learning_rate": 0.00013065232430913676, + "loss": 0.9252, "step": 1350 }, { "epoch": 4.624573378839591, - "grad_norm": 0.419921875, - "learning_rate": 0.00017472939347730856, - "loss": 1.0358, + "grad_norm": 0.294921875, + "learning_rate": 0.00013008478226277816, + "loss": 0.9265, "step": 1355 }, { "epoch": 4.6416382252559725, - "grad_norm": 0.451171875, - "learning_rate": 0.0001744648896942782, - "loss": 1.0278, + "grad_norm": 0.298828125, + "learning_rate": 0.00012951617272330377, + "loss": 0.9221, "step": 1360 }, { "epoch": 4.658703071672355, - "grad_norm": 0.38671875, - "learning_rate": 0.00017419921129025576, - "loss": 1.0171, + "grad_norm": 0.345703125, + "learning_rate": 0.00012894651586658736, + "loss": 0.9131, "step": 1365 }, { "epoch": 4.675767918088737, - "grad_norm": 0.376953125, - "learning_rate": 0.0001739323624560945, - "loss": 1.0152, + "grad_norm": 0.337890625, + "learning_rate": 0.00012837583190566446, + "loss": 0.9109, "step": 1370 }, { "epoch": 4.69283276450512, - "grad_norm": 0.384765625, - "learning_rate": 0.00017366434740111037, - "loss": 1.0247, + "grad_norm": 0.388671875, + "learning_rate": 0.00012780414109001518, + "loss": 0.9204, "step": 1375 }, { "epoch": 4.709897610921502, - "grad_norm": 0.431640625, - "learning_rate": 0.00017339517035301532, - "loss": 1.0212, + "grad_norm": 0.306640625, + "learning_rate": 0.00012723146370484568, + "loss": 0.9154, "step": 1380 }, { "epoch": 4.726962457337884, - "grad_norm": 0.3828125, - "learning_rate": 0.00017312483555785086, - "loss": 1.0309, + "grad_norm": 0.3515625, + "learning_rate": 0.00012665782007036835, + "loss": 0.9251, "step": 1385 }, { "epoch": 4.744027303754266, - "grad_norm": 0.353515625, - "learning_rate": 0.000172853347279921, - "loss": 1.0298, + "grad_norm": 0.3984375, + "learning_rate": 0.0001260832305410809, + "loss": 0.926, "step": 1390 }, { "epoch": 4.761092150170649, - "grad_norm": 0.373046875, - "learning_rate": 0.00017258070980172494, - "loss": 1.0215, + "grad_norm": 0.369140625, + "learning_rate": 0.00012550771550504396, + "loss": 0.9137, "step": 1395 }, { "epoch": 4.778156996587031, - "grad_norm": 0.4453125, - "learning_rate": 0.0001723069274238895, - "loss": 1.0249, + "grad_norm": 0.34375, + "learning_rate": 0.00012493129538315788, + "loss": 0.9181, "step": 1400 }, { "epoch": 4.795221843003413, - "grad_norm": 0.4921875, - "learning_rate": 0.0001720320044651014, - "loss": 1.0259, + "grad_norm": 0.349609375, + "learning_rate": 0.00012435399062843796, + "loss": 0.9207, "step": 1405 }, { "epoch": 4.812286689419795, - "grad_norm": 0.380859375, - "learning_rate": 0.00017175594526203905, - "loss": 1.0215, + "grad_norm": 0.287109375, + "learning_rate": 0.00012377582172528877, + "loss": 0.9156, "step": 1410 }, { "epoch": 4.829351535836177, - "grad_norm": 0.42578125, - "learning_rate": 0.00017147875416930416, - "loss": 1.0272, + "grad_norm": 0.31640625, + "learning_rate": 0.00012319680918877732, + "loss": 0.9222, "step": 1415 }, { "epoch": 4.84641638225256, - "grad_norm": 0.34765625, - "learning_rate": 0.00017120043555935298, - "loss": 1.0365, + "grad_norm": 0.29296875, + "learning_rate": 0.00012261697356390506, + "loss": 0.9297, "step": 1420 }, { "epoch": 4.863481228668942, - "grad_norm": 0.36328125, - "learning_rate": 0.00017092099382242748, - "loss": 1.02, + "grad_norm": 0.37109375, + "learning_rate": 0.00012203633542487907, + "loss": 0.9146, "step": 1425 }, { "epoch": 4.8805460750853245, - "grad_norm": 0.455078125, - "learning_rate": 0.00017064043336648599, - "loss": 1.021, + "grad_norm": 0.453125, + "learning_rate": 0.00012145491537438174, + "loss": 0.917, "step": 1430 }, { "epoch": 4.897610921501706, - "grad_norm": 0.400390625, - "learning_rate": 0.0001703587586171337, - "loss": 1.0156, + "grad_norm": 0.33203125, + "learning_rate": 0.00012087273404284002, + "loss": 0.912, "step": 1435 }, { "epoch": 4.914675767918089, - "grad_norm": 0.375, - "learning_rate": 0.00017007597401755276, - "loss": 1.0283, + "grad_norm": 0.3359375, + "learning_rate": 0.0001202898120876932, + "loss": 0.9224, "step": 1440 }, { "epoch": 4.931740614334471, - "grad_norm": 0.443359375, - "learning_rate": 0.00016979208402843237, - "loss": 1.0194, + "grad_norm": 0.28515625, + "learning_rate": 0.00011970617019266, + "loss": 0.9167, "step": 1445 }, { "epoch": 4.948805460750854, - "grad_norm": 0.57421875, - "learning_rate": 0.00016950709312789833, - "loss": 1.0198, + "grad_norm": 0.33203125, + "learning_rate": 0.00011912182906700466, + "loss": 0.9166, "step": 1450 }, { "epoch": 4.965870307167235, - "grad_norm": 0.37890625, - "learning_rate": 0.00016922100581144228, - "loss": 1.028, + "grad_norm": 0.294921875, + "learning_rate": 0.00011853680944480206, + "loss": 0.9243, "step": 1455 }, { "epoch": 4.982935153583618, - "grad_norm": 0.4765625, - "learning_rate": 0.00016893382659185105, - "loss": 1.0157, + "grad_norm": 0.310546875, + "learning_rate": 0.00011795113208420208, + "loss": 0.9128, "step": 1460 }, { "epoch": 5.0, - "grad_norm": 0.416015625, - "learning_rate": 0.00016864555999913518, - "loss": 1.0357, + "grad_norm": 0.373046875, + "learning_rate": 0.00011736481776669306, + "loss": 0.9287, "step": 1465 }, { "epoch": 5.0, - "eval_loss": 2.468480110168457, - "eval_runtime": 0.549, - "eval_samples_per_second": 18.214, - "eval_steps_per_second": 1.821, + "eval_loss": 2.5924570560455322, + "eval_runtime": 0.5421, + "eval_samples_per_second": 18.446, + "eval_steps_per_second": 1.845, "step": 1465 }, { "epoch": 5.017064846416382, - "grad_norm": 0.380859375, - "learning_rate": 0.0001683562105804577, - "loss": 1.0001, + "grad_norm": 0.3203125, + "learning_rate": 0.00011677788729636427, + "loss": 0.8743, "step": 1470 }, { "epoch": 5.034129692832765, - "grad_norm": 0.5078125, - "learning_rate": 0.00016806578290006225, - "loss": 0.9998, + "grad_norm": 0.345703125, + "learning_rate": 0.0001161903614991679, + "loss": 0.8731, "step": 1475 }, { "epoch": 5.051194539249146, - "grad_norm": 0.400390625, - "learning_rate": 0.0001677742815392012, - "loss": 0.9999, + "grad_norm": 0.341796875, + "learning_rate": 0.00011560226122218, + "loss": 0.8735, "step": 1480 }, { "epoch": 5.068259385665529, - "grad_norm": 0.43359375, - "learning_rate": 0.00016748171109606328, - "loss": 1.0085, + "grad_norm": 0.3359375, + "learning_rate": 0.00011501360733286085, + "loss": 0.8808, "step": 1485 }, { "epoch": 5.085324232081911, - "grad_norm": 0.416015625, - "learning_rate": 0.00016718807618570106, - "loss": 1.0018, + "grad_norm": 0.314453125, + "learning_rate": 0.00011442442071831434, + "loss": 0.8776, "step": 1490 }, { "epoch": 5.102389078498294, - "grad_norm": 0.453125, - "learning_rate": 0.00016689338143995833, - "loss": 0.9997, + "grad_norm": 0.333984375, + "learning_rate": 0.00011383472228454699, + "loss": 0.872, "step": 1495 }, { "epoch": 5.1194539249146755, - "grad_norm": 0.4453125, - "learning_rate": 0.00016659763150739677, - "loss": 1.009, + "grad_norm": 0.314453125, + "learning_rate": 0.00011324453295572618, + "loss": 0.8801, "step": 1500 }, { "epoch": 5.136518771331058, - "grad_norm": 0.357421875, - "learning_rate": 0.00016630083105322266, - "loss": 1.0047, + "grad_norm": 0.310546875, + "learning_rate": 0.00011265387367343763, + "loss": 0.8767, "step": 1505 }, { "epoch": 5.15358361774744, - "grad_norm": 0.33984375, - "learning_rate": 0.00016600298475921365, - "loss": 1.004, + "grad_norm": 0.337890625, + "learning_rate": 0.00011206276539594221, + "loss": 0.8764, "step": 1510 }, { "epoch": 5.170648464163823, - "grad_norm": 0.400390625, - "learning_rate": 0.00016570409732364437, - "loss": 1.0022, + "grad_norm": 0.333984375, + "learning_rate": 0.00011147122909743257, + "loss": 0.8768, "step": 1515 }, { "epoch": 5.187713310580205, - "grad_norm": 0.427734375, - "learning_rate": 0.0001654041734612127, - "loss": 1.0113, + "grad_norm": 0.32421875, + "learning_rate": 0.00011087928576728865, + "loss": 0.8848, "step": 1520 }, { "epoch": 5.204778156996587, - "grad_norm": 0.3828125, - "learning_rate": 0.00016510321790296525, - "loss": 1.0171, + "grad_norm": 0.337890625, + "learning_rate": 0.00011028695640933309, + "loss": 0.8905, "step": 1525 }, { "epoch": 5.221843003412969, - "grad_norm": 0.462890625, - "learning_rate": 0.00016480123539622281, - "loss": 1.0146, + "grad_norm": 0.318359375, + "learning_rate": 0.00010969426204108583, + "loss": 0.8872, "step": 1530 }, { "epoch": 5.238907849829351, - "grad_norm": 0.38671875, - "learning_rate": 0.00016449823070450531, - "loss": 1.0005, + "grad_norm": 0.310546875, + "learning_rate": 0.00010910122369301842, + "loss": 0.8749, "step": 1535 }, { "epoch": 5.255972696245734, - "grad_norm": 0.3515625, - "learning_rate": 0.00016419420860745699, - "loss": 1.0093, + "grad_norm": 0.34765625, + "learning_rate": 0.00010850786240780786, + "loss": 0.884, "step": 1540 }, { "epoch": 5.273037542662116, - "grad_norm": 0.439453125, - "learning_rate": 0.00016388917390077054, - "loss": 0.9965, + "grad_norm": 0.41015625, + "learning_rate": 0.00010791419923958976, + "loss": 0.8739, "step": 1545 }, { "epoch": 5.290102389078498, - "grad_norm": 0.466796875, - "learning_rate": 0.00016358313139611195, - "loss": 1.0153, + "grad_norm": 0.330078125, + "learning_rate": 0.00010732025525321145, + "loss": 0.8902, "step": 1550 }, { "epoch": 5.30716723549488, - "grad_norm": 0.376953125, - "learning_rate": 0.0001632760859210442, - "loss": 1.0094, + "grad_norm": 0.322265625, + "learning_rate": 0.00010672605152348449, + "loss": 0.8863, "step": 1555 }, { "epoch": 5.324232081911263, - "grad_norm": 0.5234375, - "learning_rate": 0.00016296804231895142, - "loss": 0.9984, + "grad_norm": 0.341796875, + "learning_rate": 0.00010613160913443682, + "loss": 0.8752, "step": 1560 }, { "epoch": 5.341296928327645, - "grad_norm": 0.37109375, - "learning_rate": 0.00016265900544896225, - "loss": 1.0066, + "grad_norm": 0.337890625, + "learning_rate": 0.00010553694917856478, + "loss": 0.8782, "step": 1565 }, { "epoch": 5.3583617747440275, - "grad_norm": 0.470703125, - "learning_rate": 0.00016234898018587337, - "loss": 1.0027, + "grad_norm": 0.326171875, + "learning_rate": 0.00010494209275608455, + "loss": 0.8804, "step": 1570 }, { "epoch": 5.375426621160409, - "grad_norm": 0.470703125, - "learning_rate": 0.0001620379714200725, - "loss": 1.014, + "grad_norm": 0.32421875, + "learning_rate": 0.00010434706097418338, + "loss": 0.8889, "step": 1575 }, { "epoch": 5.392491467576792, - "grad_norm": 0.39453125, - "learning_rate": 0.00016172598405746124, - "loss": 1.0085, + "grad_norm": 0.373046875, + "learning_rate": 0.00010375187494627098, + "loss": 0.8861, "step": 1580 }, { "epoch": 5.409556313993174, - "grad_norm": 0.51171875, - "learning_rate": 0.00016141302301937786, - "loss": 0.9999, + "grad_norm": 0.3359375, + "learning_rate": 0.00010315655579123, + "loss": 0.878, "step": 1585 }, { "epoch": 5.426621160409557, - "grad_norm": 0.54296875, - "learning_rate": 0.0001610990932425194, - "loss": 1.0199, + "grad_norm": 0.388671875, + "learning_rate": 0.00010256112463266687, + "loss": 0.893, "step": 1590 }, { "epoch": 5.4436860068259385, - "grad_norm": 0.447265625, - "learning_rate": 0.00016078419967886402, - "loss": 1.0137, + "grad_norm": 0.3515625, + "learning_rate": 0.00010196560259816221, + "loss": 0.8913, "step": 1595 }, { "epoch": 5.460750853242321, - "grad_norm": 0.408203125, - "learning_rate": 0.0001604683472955928, - "loss": 1.0057, + "grad_norm": 0.345703125, + "learning_rate": 0.00010137001081852113, + "loss": 0.8848, "step": 1600 }, { "epoch": 5.477815699658703, - "grad_norm": 0.419921875, - "learning_rate": 0.00016015154107501133, - "loss": 1.0099, + "grad_norm": 0.353515625, + "learning_rate": 0.00010077437042702362, + "loss": 0.8867, "step": 1605 }, { "epoch": 5.494880546075085, - "grad_norm": 0.455078125, - "learning_rate": 0.00015983378601447127, - "loss": 1.0066, + "grad_norm": 0.328125, + "learning_rate": 0.00010017870255867445, + "loss": 0.8843, "step": 1610 }, { "epoch": 5.511945392491468, - "grad_norm": 0.412109375, - "learning_rate": 0.0001595150871262914, - "loss": 1.0134, + "grad_norm": 0.34375, + "learning_rate": 9.958302834945332e-05, + "loss": 0.8905, "step": 1615 }, { "epoch": 5.5290102389078495, - "grad_norm": 0.37890625, - "learning_rate": 0.00015919544943767856, - "loss": 1.0108, + "grad_norm": 0.38671875, + "learning_rate": 9.898736893556502e-05, + "loss": 0.8903, "step": 1620 }, { "epoch": 5.546075085324232, - "grad_norm": 0.40625, - "learning_rate": 0.00015887487799064838, - "loss": 1.0229, + "grad_norm": 0.337890625, + "learning_rate": 9.839174545268931e-05, + "loss": 0.897, "step": 1625 }, { "epoch": 5.563139931740614, - "grad_norm": 0.56640625, - "learning_rate": 0.00015855337784194577, - "loss": 1.0126, + "grad_norm": 0.326171875, + "learning_rate": 9.7796179035231e-05, + "loss": 0.8925, "step": 1630 }, { "epoch": 5.580204778156997, - "grad_norm": 0.37890625, - "learning_rate": 0.00015823095406296514, - "loss": 0.9947, + "grad_norm": 0.310546875, + "learning_rate": 9.720069081557009e-05, + "loss": 0.8748, "step": 1635 }, { "epoch": 5.597269624573379, - "grad_norm": 0.373046875, - "learning_rate": 0.00015790761173967036, - "loss": 1.0063, + "grad_norm": 0.31640625, + "learning_rate": 9.660530192331191e-05, + "loss": 0.8829, "step": 1640 }, { "epoch": 5.614334470989761, - "grad_norm": 0.416015625, - "learning_rate": 0.00015758335597251458, - "loss": 1.0132, + "grad_norm": 0.30859375, + "learning_rate": 9.601003348453734e-05, + "loss": 0.8922, "step": 1645 }, { "epoch": 5.631399317406143, - "grad_norm": 0.4375, - "learning_rate": 0.00015725819187635968, - "loss": 1.0173, + "grad_norm": 0.3203125, + "learning_rate": 9.541490662105326e-05, + "loss": 0.8936, "step": 1650 }, { "epoch": 5.648464163822526, - "grad_norm": 0.388671875, - "learning_rate": 0.00015693212458039584, - "loss": 1.0115, + "grad_norm": 0.337890625, + "learning_rate": 9.481994244964297e-05, + "loss": 0.8897, "step": 1655 }, { "epoch": 5.665529010238908, - "grad_norm": 0.42578125, - "learning_rate": 0.00015660515922806027, - "loss": 0.9966, + "grad_norm": 0.30078125, + "learning_rate": 9.422516208131709e-05, + "loss": 0.8762, "step": 1660 }, { "epoch": 5.6825938566552905, - "grad_norm": 0.349609375, - "learning_rate": 0.00015627730097695638, - "loss": 1.0058, + "grad_norm": 0.3046875, + "learning_rate": 9.363058662056443e-05, + "loss": 0.8842, "step": 1665 }, { "epoch": 5.699658703071672, - "grad_norm": 0.427734375, - "learning_rate": 0.0001559485549987723, - "loss": 1.0143, + "grad_norm": 0.341796875, + "learning_rate": 9.303623716460297e-05, + "loss": 0.8906, "step": 1670 }, { "epoch": 5.716723549488055, - "grad_norm": 0.384765625, - "learning_rate": 0.0001556189264791992, - "loss": 1.0124, + "grad_norm": 0.328125, + "learning_rate": 9.244213480263148e-05, + "loss": 0.8911, "step": 1675 }, { "epoch": 5.733788395904437, - "grad_norm": 0.40625, - "learning_rate": 0.0001552884206178498, - "loss": 1.0119, + "grad_norm": 0.333984375, + "learning_rate": 9.184830061508113e-05, + "loss": 0.8893, "step": 1680 }, { "epoch": 5.750853242320819, - "grad_norm": 0.412109375, - "learning_rate": 0.00015495704262817597, - "loss": 1.0061, + "grad_norm": 0.3359375, + "learning_rate": 9.125475567286744e-05, + "loss": 0.8826, "step": 1685 }, { "epoch": 5.7679180887372015, - "grad_norm": 0.3828125, - "learning_rate": 0.0001546247977373867, - "loss": 1.0054, + "grad_norm": 0.341796875, + "learning_rate": 9.066152103664283e-05, + "loss": 0.8845, "step": 1690 }, { "epoch": 5.784982935153583, - "grad_norm": 0.37109375, - "learning_rate": 0.00015429169118636566, - "loss": 1.0021, + "grad_norm": 0.345703125, + "learning_rate": 9.006861775604904e-05, + "loss": 0.8808, "step": 1695 }, { "epoch": 5.802047781569966, - "grad_norm": 0.392578125, - "learning_rate": 0.00015395772822958845, - "loss": 1.0037, + "grad_norm": 0.328125, + "learning_rate": 8.947606686897045e-05, + "loss": 0.8829, "step": 1700 }, { "epoch": 5.819112627986348, - "grad_norm": 0.408203125, - "learning_rate": 0.00015362291413503984, - "loss": 1.0054, + "grad_norm": 0.31640625, + "learning_rate": 8.88838894007875e-05, + "loss": 0.8835, "step": 1705 }, { "epoch": 5.836177474402731, - "grad_norm": 0.345703125, - "learning_rate": 0.00015328725418413045, - "loss": 1.0132, + "grad_norm": 0.326171875, + "learning_rate": 8.829210636363067e-05, + "loss": 0.8894, "step": 1710 }, { "epoch": 5.853242320819112, - "grad_norm": 0.341796875, - "learning_rate": 0.00015295075367161367, - "loss": 1.0041, + "grad_norm": 0.318359375, + "learning_rate": 8.770073875563493e-05, + "loss": 0.8822, "step": 1715 }, { "epoch": 5.870307167235495, - "grad_norm": 0.34375, - "learning_rate": 0.00015261341790550196, - "loss": 1.001, + "grad_norm": 0.3203125, + "learning_rate": 8.710980756019467e-05, + "loss": 0.8811, "step": 1720 }, { "epoch": 5.887372013651877, - "grad_norm": 0.373046875, - "learning_rate": 0.0001522752522069833, - "loss": 1.0102, + "grad_norm": 0.32421875, + "learning_rate": 8.651933374521907e-05, + "loss": 0.8906, "step": 1725 }, { "epoch": 5.90443686006826, - "grad_norm": 0.38671875, - "learning_rate": 0.00015193626191033712, - "loss": 0.996, + "grad_norm": 0.3125, + "learning_rate": 8.592933826238818e-05, + "loss": 0.8773, "step": 1730 }, { "epoch": 5.921501706484642, - "grad_norm": 0.37109375, - "learning_rate": 0.0001515964523628501, - "loss": 1.0052, + "grad_norm": 0.390625, + "learning_rate": 8.533984204640941e-05, + "loss": 0.8843, "step": 1735 }, { "epoch": 5.938566552901024, - "grad_norm": 0.3984375, - "learning_rate": 0.00015125582892473204, - "loss": 1.0118, + "grad_norm": 0.314453125, + "learning_rate": 8.4750866014275e-05, + "loss": 0.8907, "step": 1740 }, { "epoch": 5.955631399317406, - "grad_norm": 0.392578125, - "learning_rate": 0.00015091439696903115, - "loss": 0.998, + "grad_norm": 0.353515625, + "learning_rate": 8.416243106451934e-05, + "loss": 0.8795, "step": 1745 }, { "epoch": 5.972696245733788, - "grad_norm": 0.388671875, - "learning_rate": 0.00015057216188154928, - "loss": 0.9925, + "grad_norm": 0.30859375, + "learning_rate": 8.357455807647778e-05, + "loss": 0.8767, "step": 1750 }, { "epoch": 5.989761092150171, - "grad_norm": 0.5, - "learning_rate": 0.00015022912906075702, - "loss": 0.993, + "grad_norm": 0.34375, + "learning_rate": 8.29872679095457e-05, + "loss": 0.8733, "step": 1755 }, { "epoch": 6.0, - "eval_loss": 2.4702811241149902, - "eval_runtime": 0.5473, - "eval_samples_per_second": 18.272, - "eval_steps_per_second": 1.827, + "eval_loss": 2.6554951667785645, + "eval_runtime": 0.5458, + "eval_samples_per_second": 18.321, + "eval_steps_per_second": 1.832, "step": 1758 }, { "epoch": 6.006825938566553, - "grad_norm": 0.48046875, - "learning_rate": 0.00014988530391770856, - "loss": 0.9939, + "grad_norm": 0.3203125, + "learning_rate": 8.240058140243834e-05, + "loss": 0.8646, "step": 1760 }, { "epoch": 6.023890784982935, - "grad_norm": 0.396484375, - "learning_rate": 0.00014954069187595633, - "loss": 0.9904, + "grad_norm": 0.34765625, + "learning_rate": 8.181451937245131e-05, + "loss": 0.8498, "step": 1765 }, { "epoch": 6.040955631399317, - "grad_norm": 0.396484375, - "learning_rate": 0.00014919529837146528, - "loss": 0.982, + "grad_norm": 0.349609375, + "learning_rate": 8.122910261472214e-05, + "loss": 0.8455, "step": 1770 }, { "epoch": 6.0580204778157, - "grad_norm": 0.421875, - "learning_rate": 0.0001488491288525275, - "loss": 0.9741, + "grad_norm": 0.365234375, + "learning_rate": 8.064435190149218e-05, + "loss": 0.8363, "step": 1775 }, { "epoch": 6.075085324232082, - "grad_norm": 0.439453125, - "learning_rate": 0.0001485021887796759, - "loss": 0.995, + "grad_norm": 0.361328125, + "learning_rate": 8.006028798136962e-05, + "loss": 0.855, "step": 1780 }, { "epoch": 6.092150170648464, - "grad_norm": 0.39453125, - "learning_rate": 0.00014815448362559826, - "loss": 0.9931, + "grad_norm": 0.322265625, + "learning_rate": 7.947693157859337e-05, + "loss": 0.8556, "step": 1785 }, { "epoch": 6.109215017064846, - "grad_norm": 0.373046875, - "learning_rate": 0.00014780601887505088, - "loss": 1.0001, + "grad_norm": 0.3515625, + "learning_rate": 7.889430339229754e-05, + "loss": 0.8606, "step": 1790 }, { "epoch": 6.126279863481229, - "grad_norm": 0.384765625, - "learning_rate": 0.00014745680002477203, - "loss": 0.9913, + "grad_norm": 0.33203125, + "learning_rate": 7.831242409577716e-05, + "loss": 0.8535, "step": 1795 }, { "epoch": 6.143344709897611, - "grad_norm": 0.36328125, - "learning_rate": 0.00014710683258339536, - "loss": 0.9883, + "grad_norm": 0.34765625, + "learning_rate": 7.773131433575444e-05, + "loss": 0.851, "step": 1800 }, { "epoch": 6.160409556313994, - "grad_norm": 0.427734375, - "learning_rate": 0.0001467561220713628, - "loss": 0.9835, + "grad_norm": 0.369140625, + "learning_rate": 7.715099473164632e-05, + "loss": 0.8468, "step": 1805 }, { "epoch": 6.177474402730375, - "grad_norm": 0.421875, - "learning_rate": 0.0001464046740208377, - "loss": 0.9894, + "grad_norm": 0.361328125, + "learning_rate": 7.657148587483271e-05, + "loss": 0.8518, "step": 1810 }, { "epoch": 6.194539249146757, - "grad_norm": 0.357421875, - "learning_rate": 0.00014605249397561736, - "loss": 0.9833, + "grad_norm": 0.330078125, + "learning_rate": 7.599280832792596e-05, + "loss": 0.8467, "step": 1815 }, { "epoch": 6.21160409556314, - "grad_norm": 0.4140625, - "learning_rate": 0.00014569958749104575, - "loss": 0.9942, + "grad_norm": 0.322265625, + "learning_rate": 7.541498262404125e-05, + "loss": 0.8549, "step": 1820 }, { "epoch": 6.228668941979522, - "grad_norm": 0.58203125, - "learning_rate": 0.00014534596013392575, - "loss": 0.9937, + "grad_norm": 0.32421875, + "learning_rate": 7.483802926606787e-05, + "loss": 0.8534, "step": 1825 }, { "epoch": 6.2457337883959045, - "grad_norm": 0.6328125, - "learning_rate": 0.00014499161748243147, - "loss": 0.9852, + "grad_norm": 0.326171875, + "learning_rate": 7.426196872594182e-05, + "loss": 0.8491, "step": 1830 }, { "epoch": 6.262798634812286, - "grad_norm": 0.6640625, - "learning_rate": 0.0001446365651260201, - "loss": 0.9886, + "grad_norm": 0.330078125, + "learning_rate": 7.368682144391944e-05, + "loss": 0.8503, "step": 1835 }, { "epoch": 6.279863481228669, - "grad_norm": 0.4375, - "learning_rate": 0.00014428080866534396, - "loss": 0.9893, + "grad_norm": 0.361328125, + "learning_rate": 7.311260782785207e-05, + "loss": 0.8528, "step": 1840 }, { "epoch": 6.296928327645051, - "grad_norm": 0.376953125, - "learning_rate": 0.00014392435371216185, - "loss": 0.9951, + "grad_norm": 0.328125, + "learning_rate": 7.253934825246193e-05, + "loss": 0.8592, "step": 1845 }, { "epoch": 6.313993174061434, - "grad_norm": 0.34765625, - "learning_rate": 0.0001435672058892509, - "loss": 0.9877, + "grad_norm": 0.39453125, + "learning_rate": 7.196706305861925e-05, + "loss": 0.8528, "step": 1850 }, { "epoch": 6.3310580204778155, - "grad_norm": 0.390625, - "learning_rate": 0.00014320937083031748, - "loss": 0.9922, + "grad_norm": 0.328125, + "learning_rate": 7.139577255262034e-05, + "loss": 0.8528, "step": 1855 }, { "epoch": 6.348122866894198, - "grad_norm": 0.38671875, - "learning_rate": 0.0001428508541799086, - "loss": 0.9939, + "grad_norm": 0.34375, + "learning_rate": 7.082549700546726e-05, + "loss": 0.8561, "step": 1860 }, { "epoch": 6.36518771331058, - "grad_norm": 0.3828125, - "learning_rate": 0.0001424916615933229, - "loss": 0.994, + "grad_norm": 0.330078125, + "learning_rate": 7.025625665214844e-05, + "loss": 0.8562, "step": 1865 }, { "epoch": 6.382252559726963, - "grad_norm": 0.404296875, - "learning_rate": 0.00014213179873652127, - "loss": 0.993, + "grad_norm": 0.3203125, + "learning_rate": 6.968807169092059e-05, + "loss": 0.8561, "step": 1870 }, { "epoch": 6.399317406143345, - "grad_norm": 0.408203125, - "learning_rate": 0.00014177127128603745, - "loss": 0.9982, + "grad_norm": 0.33203125, + "learning_rate": 6.912096228259236e-05, + "loss": 0.8598, "step": 1875 }, { "epoch": 6.4163822525597265, - "grad_norm": 0.40625, - "learning_rate": 0.0001414100849288888, - "loss": 0.9926, + "grad_norm": 0.337890625, + "learning_rate": 6.855494854980857e-05, + "loss": 0.8573, "step": 1880 }, { "epoch": 6.433447098976109, - "grad_norm": 0.416015625, - "learning_rate": 0.00014104824536248614, - "loss": 0.995, + "grad_norm": 0.359375, + "learning_rate": 6.799005057633644e-05, + "loss": 0.8576, "step": 1885 }, { "epoch": 6.450511945392491, - "grad_norm": 0.40625, - "learning_rate": 0.00014068575829454436, - "loss": 0.9894, + "grad_norm": 0.32421875, + "learning_rate": 6.742628840635284e-05, + "loss": 0.855, "step": 1890 }, { "epoch": 6.467576791808874, - "grad_norm": 0.359375, - "learning_rate": 0.00014032262944299194, - "loss": 0.997, + "grad_norm": 0.33203125, + "learning_rate": 6.68636820437331e-05, + "loss": 0.8628, "step": 1895 }, { "epoch": 6.484641638225256, - "grad_norm": 0.392578125, - "learning_rate": 0.00013995886453588104, - "loss": 0.9861, + "grad_norm": 0.39453125, + "learning_rate": 6.630225145134144e-05, + "loss": 0.8489, "step": 1900 }, { "epoch": 6.501706484641638, "grad_norm": 0.34765625, - "learning_rate": 0.00013959446931129704, - "loss": 0.9896, + "learning_rate": 6.574201655032216e-05, + "loss": 0.8534, "step": 1905 }, { "epoch": 6.51877133105802, - "grad_norm": 0.380859375, - "learning_rate": 0.0001392294495172681, - "loss": 0.9969, + "grad_norm": 0.361328125, + "learning_rate": 6.518299721939323e-05, + "loss": 0.8582, "step": 1910 }, { "epoch": 6.535836177474403, - "grad_norm": 0.392578125, - "learning_rate": 0.0001388638109116744, - "loss": 0.9902, + "grad_norm": 0.34375, + "learning_rate": 6.462521329414066e-05, + "loss": 0.8561, "step": 1915 }, { "epoch": 6.552901023890785, - "grad_norm": 0.375, - "learning_rate": 0.00013849755926215735, - "loss": 0.9995, + "grad_norm": 0.326171875, + "learning_rate": 6.406868456631483e-05, + "loss": 0.8618, "step": 1920 }, { "epoch": 6.5699658703071675, - "grad_norm": 0.384765625, - "learning_rate": 0.00013813070034602863, - "loss": 0.9935, + "grad_norm": 0.36328125, + "learning_rate": 6.351343078312819e-05, + "loss": 0.8575, "step": 1925 }, { "epoch": 6.587030716723549, - "grad_norm": 0.466796875, - "learning_rate": 0.00013776323995017898, - "loss": 0.9799, + "grad_norm": 0.376953125, + "learning_rate": 6.295947164655447e-05, + "loss": 0.8504, "step": 1930 }, { "epoch": 6.604095563139932, - "grad_norm": 0.345703125, - "learning_rate": 0.00013739518387098705, - "loss": 0.9959, + "grad_norm": 0.3359375, + "learning_rate": 6.240682681262971e-05, + "loss": 0.8619, "step": 1935 }, { "epoch": 6.621160409556314, - "grad_norm": 0.388671875, - "learning_rate": 0.0001370265379142279, - "loss": 0.9897, + "grad_norm": 0.322265625, + "learning_rate": 6.185551589075482e-05, + "loss": 0.8536, "step": 1940 }, { "epoch": 6.638225255972696, - "grad_norm": 0.443359375, - "learning_rate": 0.0001366573078949813, - "loss": 0.9829, + "grad_norm": 0.33984375, + "learning_rate": 6.130555844299973e-05, + "loss": 0.8511, "step": 1945 }, { "epoch": 6.6552901023890785, - "grad_norm": 0.44921875, - "learning_rate": 0.00013628749963754026, - "loss": 0.9963, + "grad_norm": 0.322265625, + "learning_rate": 6.075697398340913e-05, + "loss": 0.859, "step": 1950 }, { "epoch": 6.672354948805461, - "grad_norm": 0.52734375, - "learning_rate": 0.0001359171189753189, - "loss": 0.999, + "grad_norm": 0.330078125, + "learning_rate": 6.0209781977310486e-05, + "loss": 0.8617, "step": 1955 }, { "epoch": 6.689419795221843, - "grad_norm": 0.6484375, - "learning_rate": 0.00013554617175076062, - "loss": 0.9806, + "grad_norm": 0.314453125, + "learning_rate": 5.9664001840622886e-05, + "loss": 0.8478, "step": 1960 }, { "epoch": 6.706484641638225, - "grad_norm": 0.388671875, - "learning_rate": 0.0001351746638152458, - "loss": 0.9903, + "grad_norm": 0.314453125, + "learning_rate": 5.91196529391683e-05, + "loss": 0.8548, "step": 1965 }, { "epoch": 6.723549488054608, - "grad_norm": 0.4765625, - "learning_rate": 0.00013480260102899966, - "loss": 1.0009, + "grad_norm": 0.33984375, + "learning_rate": 5.857675458798453e-05, + "loss": 0.8623, "step": 1970 }, { "epoch": 6.7406143344709895, - "grad_norm": 0.44140625, - "learning_rate": 0.0001344299892609996, - "loss": 0.9879, + "grad_norm": 0.333984375, + "learning_rate": 5.8035326050639615e-05, + "loss": 0.853, "step": 1975 }, { "epoch": 6.757679180887372, - "grad_norm": 0.392578125, - "learning_rate": 0.00013405683438888282, - "loss": 0.9966, + "grad_norm": 0.3515625, + "learning_rate": 5.749538653854861e-05, + "loss": 0.8594, "step": 1980 }, { "epoch": 6.774744027303754, - "grad_norm": 0.4140625, - "learning_rate": 0.00013368314229885347, - "loss": 0.988, + "grad_norm": 0.32421875, + "learning_rate": 5.695695521029163e-05, + "loss": 0.8528, "step": 1985 }, { "epoch": 6.791808873720137, - "grad_norm": 0.36328125, - "learning_rate": 0.00013330891888559002, - "loss": 0.9835, + "grad_norm": 0.328125, + "learning_rate": 5.642005117093419e-05, + "loss": 0.8485, "step": 1990 }, { "epoch": 6.808873720136519, - "grad_norm": 0.421875, - "learning_rate": 0.00013293417005215188, - "loss": 0.9922, + "grad_norm": 0.333984375, + "learning_rate": 5.5884693471349256e-05, + "loss": 0.8578, "step": 1995 }, { "epoch": 6.825938566552901, - "grad_norm": 0.40234375, - "learning_rate": 0.0001325589017098867, - "loss": 0.9893, + "grad_norm": 0.330078125, + "learning_rate": 5.535090110754131e-05, + "loss": 0.8549, "step": 2000 }, { "epoch": 6.843003412969283, - "grad_norm": 0.3828125, - "learning_rate": 0.00013218311977833687, - "loss": 0.9965, + "grad_norm": 0.34375, + "learning_rate": 5.481869301997236e-05, + "loss": 0.8625, "step": 2005 }, { "epoch": 6.860068259385666, - "grad_norm": 0.365234375, - "learning_rate": 0.0001318068301851463, - "loss": 0.9843, + "grad_norm": 0.3125, + "learning_rate": 5.428808809288975e-05, + "loss": 0.8529, "step": 2010 }, { "epoch": 6.877133105802048, - "grad_norm": 0.390625, - "learning_rate": 0.00013143003886596669, - "loss": 0.9845, + "grad_norm": 0.3359375, + "learning_rate": 5.37591051536561e-05, + "loss": 0.8505, "step": 2015 }, { "epoch": 6.8941979522184305, - "grad_norm": 0.3515625, - "learning_rate": 0.0001310527517643642, - "loss": 0.9909, + "grad_norm": 0.345703125, + "learning_rate": 5.32317629720814e-05, + "loss": 0.8585, "step": 2020 }, { "epoch": 6.911262798634812, - "grad_norm": 0.359375, - "learning_rate": 0.00013067497483172538, - "loss": 0.9885, + "grad_norm": 0.341796875, + "learning_rate": 5.270608025975686e-05, + "loss": 0.8563, "step": 2025 }, { "epoch": 6.928327645051194, - "grad_norm": 0.375, - "learning_rate": 0.00013029671402716366, - "loss": 0.9879, + "grad_norm": 0.326171875, + "learning_rate": 5.218207566939116e-05, + "loss": 0.8534, "step": 2030 }, { "epoch": 6.945392491467577, - "grad_norm": 0.380859375, - "learning_rate": 0.00012991797531742492, - "loss": 0.9891, + "grad_norm": 0.330078125, + "learning_rate": 5.1659767794148316e-05, + "loss": 0.853, "step": 2035 }, { "epoch": 6.962457337883959, - "grad_norm": 0.34375, - "learning_rate": 0.00012953876467679373, - "loss": 0.9972, + "grad_norm": 0.33984375, + "learning_rate": 5.1139175166988187e-05, + "loss": 0.8622, "step": 2040 }, { "epoch": 6.979522184300341, - "grad_norm": 0.369140625, - "learning_rate": 0.00012915908808699893, - "loss": 0.9962, + "grad_norm": 0.333984375, + "learning_rate": 5.062031626000873e-05, + "loss": 0.8602, "step": 2045 }, { "epoch": 6.996587030716723, - "grad_norm": 0.44921875, - "learning_rate": 0.00012877895153711935, - "loss": 0.9941, + "grad_norm": 0.33984375, + "learning_rate": 5.0103209483790636e-05, + "loss": 0.8577, "step": 2050 }, { "epoch": 7.0, - "eval_loss": 2.49063777923584, - "eval_runtime": 0.554, - "eval_samples_per_second": 18.051, - "eval_steps_per_second": 1.805, + "eval_loss": 2.731566905975342, + "eval_runtime": 0.5528, + "eval_samples_per_second": 18.088, + "eval_steps_per_second": 1.809, "step": 2051 }, { "epoch": 7.013651877133106, - "grad_norm": 0.52734375, - "learning_rate": 0.00012839836102348926, - "loss": 0.9759, + "grad_norm": 0.322265625, + "learning_rate": 4.9587873186744025e-05, + "loss": 0.8366, "step": 2055 }, { "epoch": 7.030716723549488, - "grad_norm": 0.365234375, - "learning_rate": 0.00012801732254960388, - "loss": 0.9703, + "grad_norm": 0.326171875, + "learning_rate": 4.9074325654457446e-05, + "loss": 0.8237, "step": 2060 }, { "epoch": 7.047781569965871, - "grad_norm": 0.375, - "learning_rate": 0.00012763584212602453, - "loss": 0.9643, + "grad_norm": 0.333984375, + "learning_rate": 4.856258510904899e-05, + "loss": 0.8231, "step": 2065 }, { "epoch": 7.064846416382252, - "grad_norm": 0.41796875, - "learning_rate": 0.00012725392577028402, - "loss": 0.9646, + "grad_norm": 0.3359375, + "learning_rate": 4.805266970851975e-05, + "loss": 0.8253, "step": 2070 }, { "epoch": 7.081911262798635, - "grad_norm": 0.400390625, - "learning_rate": 0.0001268715795067916, - "loss": 0.9732, + "grad_norm": 0.333984375, + "learning_rate": 4.7544597546109514e-05, + "loss": 0.8313, "step": 2075 }, { "epoch": 7.098976109215017, - "grad_norm": 0.380859375, - "learning_rate": 0.00012648880936673787, - "loss": 0.9786, + "grad_norm": 0.337890625, + "learning_rate": 4.7038386649654764e-05, + "loss": 0.8322, "step": 2080 }, { "epoch": 7.1160409556314, - "grad_norm": 0.423828125, - "learning_rate": 0.00012610562138799978, - "loss": 0.9733, + "grad_norm": 0.33984375, + "learning_rate": 4.6534054980949113e-05, + "loss": 0.8317, "step": 2085 }, { "epoch": 7.1331058020477816, - "grad_norm": 0.357421875, - "learning_rate": 0.00012572202161504543, - "loss": 0.9808, + "grad_norm": 0.328125, + "learning_rate": 4.603162043510566e-05, + "loss": 0.8356, "step": 2090 }, { "epoch": 7.150170648464163, - "grad_norm": 0.4609375, - "learning_rate": 0.00012533801609883842, - "loss": 0.9762, + "grad_norm": 0.33984375, + "learning_rate": 4.553110083992237e-05, + "loss": 0.8289, "step": 2095 }, { "epoch": 7.167235494880546, - "grad_norm": 0.38671875, - "learning_rate": 0.00012495361089674285, - "loss": 0.9809, + "grad_norm": 0.359375, + "learning_rate": 4.50325139552493e-05, + "loss": 0.8382, "step": 2100 }, { "epoch": 7.184300341296928, - "grad_norm": 0.3984375, - "learning_rate": 0.00012456881207242732, - "loss": 0.9821, + "grad_norm": 0.35546875, + "learning_rate": 4.4535877472358466e-05, + "loss": 0.8363, "step": 2105 }, { "epoch": 7.201365187713311, - "grad_norm": 0.400390625, - "learning_rate": 0.00012418362569576965, - "loss": 0.9873, + "grad_norm": 0.32421875, + "learning_rate": 4.404120901331618e-05, + "loss": 0.8388, "step": 2110 }, { "epoch": 7.2184300341296925, - "grad_norm": 0.55078125, - "learning_rate": 0.00012379805784276082, - "loss": 0.9727, + "grad_norm": 0.34765625, + "learning_rate": 4.354852613035763e-05, + "loss": 0.8291, "step": 2115 }, { "epoch": 7.235494880546075, - "grad_norm": 0.515625, - "learning_rate": 0.0001234121145954094, - "loss": 0.9827, + "grad_norm": 0.328125, + "learning_rate": 4.305784630526416e-05, + "loss": 0.8361, "step": 2120 }, { "epoch": 7.252559726962457, - "grad_norm": 0.3828125, - "learning_rate": 0.00012302580204164541, - "loss": 0.9846, + "grad_norm": 0.3359375, + "learning_rate": 4.2569186948743e-05, + "loss": 0.8416, "step": 2125 }, { "epoch": 7.26962457337884, - "grad_norm": 0.42578125, - "learning_rate": 0.0001226391262752245, - "loss": 0.9736, + "grad_norm": 0.345703125, + "learning_rate": 4.2082565399809404e-05, + "loss": 0.8281, "step": 2130 }, { "epoch": 7.286689419795222, - "grad_norm": 0.5078125, - "learning_rate": 0.00012225209339563145, - "loss": 0.9743, + "grad_norm": 0.326171875, + "learning_rate": 4.159799892517148e-05, + "loss": 0.8281, "step": 2135 }, { "epoch": 7.303754266211604, - "grad_norm": 0.419921875, - "learning_rate": 0.00012186470950798445, - "loss": 0.9787, + "grad_norm": 0.349609375, + "learning_rate": 4.111550471861747e-05, + "loss": 0.8352, "step": 2140 }, { "epoch": 7.320819112627986, - "grad_norm": 0.490234375, - "learning_rate": 0.00012147698072293842, - "loss": 0.9788, + "grad_norm": 0.359375, + "learning_rate": 4.06350999004057e-05, + "loss": 0.833, "step": 2145 }, { "epoch": 7.337883959044369, - "grad_norm": 0.380859375, - "learning_rate": 0.00012108891315658879, - "loss": 0.967, + "grad_norm": 0.353515625, + "learning_rate": 4.0156801516657095e-05, + "loss": 0.825, "step": 2150 }, { "epoch": 7.354948805460751, - "grad_norm": 0.396484375, - "learning_rate": 0.00012070051293037492, - "loss": 0.9792, + "grad_norm": 0.3359375, + "learning_rate": 3.968062653875031e-05, + "loss": 0.8386, "step": 2155 }, { "epoch": 7.372013651877133, - "grad_norm": 0.43359375, - "learning_rate": 0.00012031178617098371, - "loss": 0.9905, + "grad_norm": 0.3359375, + "learning_rate": 3.920659186271953e-05, + "loss": 0.8454, "step": 2160 }, { "epoch": 7.389078498293515, - "grad_norm": 0.400390625, - "learning_rate": 0.00011992273901025269, - "loss": 0.9873, + "grad_norm": 0.349609375, + "learning_rate": 3.873471430865515e-05, + "loss": 0.8431, "step": 2165 }, { "epoch": 7.406143344709897, - "grad_norm": 0.4453125, - "learning_rate": 0.0001195333775850736, - "loss": 0.9872, + "grad_norm": 0.345703125, + "learning_rate": 3.8265010620106533e-05, + "loss": 0.8392, "step": 2170 }, { "epoch": 7.42320819112628, - "grad_norm": 0.44140625, - "learning_rate": 0.00011914370803729533, - "loss": 0.98, + "grad_norm": 0.3359375, + "learning_rate": 3.779749746348831e-05, + "loss": 0.8362, "step": 2175 }, { "epoch": 7.440273037542662, - "grad_norm": 0.361328125, - "learning_rate": 0.00011875373651362727, - "loss": 0.9827, + "grad_norm": 0.35546875, + "learning_rate": 3.7332191427488784e-05, + "loss": 0.8348, "step": 2180 }, { "epoch": 7.4573378839590445, - "grad_norm": 0.474609375, - "learning_rate": 0.00011836346916554205, - "loss": 0.9738, + "grad_norm": 0.35546875, + "learning_rate": 3.6869109022481386e-05, + "loss": 0.831, "step": 2185 }, { "epoch": 7.474402730375426, - "grad_norm": 0.421875, - "learning_rate": 0.00011797291214917881, - "loss": 0.9762, + "grad_norm": 0.357421875, + "learning_rate": 3.640826667993891e-05, + "loss": 0.8314, "step": 2190 }, { "epoch": 7.491467576791809, - "grad_norm": 0.41796875, - "learning_rate": 0.00011758207162524598, - "loss": 0.9675, + "grad_norm": 0.33203125, + "learning_rate": 3.59496807518503e-05, + "loss": 0.8258, "step": 2195 }, { "epoch": 7.508532423208191, - "grad_norm": 0.384765625, - "learning_rate": 0.00011719095375892396, - "loss": 0.9923, + "grad_norm": 0.3359375, + "learning_rate": 3.549336751014057e-05, + "loss": 0.8482, "step": 2200 }, { "epoch": 7.525597269624574, - "grad_norm": 0.482421875, - "learning_rate": 0.00011679956471976814, - "loss": 0.9818, + "grad_norm": 0.32421875, + "learning_rate": 3.503934314609343e-05, + "loss": 0.8387, "step": 2205 }, { "epoch": 7.5426621160409555, - "grad_norm": 0.3671875, - "learning_rate": 0.0001164079106816113, - "loss": 0.9783, + "grad_norm": 0.3515625, + "learning_rate": 3.458762376977669e-05, + "loss": 0.8344, "step": 2210 }, { "epoch": 7.559726962457338, - "grad_norm": 0.376953125, - "learning_rate": 0.00011601599782246646, - "loss": 0.9735, + "grad_norm": 0.353515625, + "learning_rate": 3.41382254094707e-05, + "loss": 0.8315, "step": 2215 }, { "epoch": 7.57679180887372, - "grad_norm": 0.443359375, - "learning_rate": 0.00011562383232442926, - "loss": 0.9751, + "grad_norm": 0.345703125, + "learning_rate": 3.369116401109963e-05, + "loss": 0.8331, "step": 2220 }, { "epoch": 7.593856655290102, - "grad_norm": 0.3671875, - "learning_rate": 0.0001152314203735805, - "loss": 0.9734, + "grad_norm": 0.34375, + "learning_rate": 3.3246455437665594e-05, + "loss": 0.8322, "step": 2225 }, { "epoch": 7.610921501706485, - "grad_norm": 0.439453125, - "learning_rate": 0.00011483876815988867, - "loss": 0.9706, + "grad_norm": 0.326171875, + "learning_rate": 3.280411546868583e-05, + "loss": 0.8281, "step": 2230 }, { "epoch": 7.627986348122867, - "grad_norm": 0.44140625, - "learning_rate": 0.00011444588187711205, - "loss": 0.9727, + "grad_norm": 0.34375, + "learning_rate": 3.2364159799632786e-05, + "loss": 0.8281, "step": 2235 }, { "epoch": 7.645051194539249, - "grad_norm": 0.41796875, - "learning_rate": 0.00011405276772270126, - "loss": 0.9774, + "grad_norm": 0.333984375, + "learning_rate": 3.192660404137729e-05, + "loss": 0.832, "step": 2240 }, { "epoch": 7.662116040955631, - "grad_norm": 0.353515625, - "learning_rate": 0.0001136594318977014, - "loss": 0.9815, + "grad_norm": 0.337890625, + "learning_rate": 3.14914637196345e-05, + "loss": 0.8361, "step": 2245 }, { "epoch": 7.679180887372014, - "grad_norm": 0.412109375, - "learning_rate": 0.0001132658806066542, - "loss": 0.9835, + "grad_norm": 0.328125, + "learning_rate": 3.105875427441297e-05, + "loss": 0.837, "step": 2250 }, { "epoch": 7.696245733788396, - "grad_norm": 0.384765625, - "learning_rate": 0.00011287212005750024, - "loss": 0.9773, + "grad_norm": 0.33984375, + "learning_rate": 3.0628491059467014e-05, + "loss": 0.8351, "step": 2255 }, { "epoch": 7.713310580204778, - "grad_norm": 0.42578125, - "learning_rate": 0.00011247815646148087, - "loss": 0.9835, + "grad_norm": 0.328125, + "learning_rate": 3.020068934175171e-05, + "loss": 0.838, "step": 2260 }, { "epoch": 7.73037542662116, - "grad_norm": 0.56640625, - "learning_rate": 0.00011208399603304047, - "loss": 0.9832, + "grad_norm": 0.333984375, + "learning_rate": 2.977536430088125e-05, + "loss": 0.8355, "step": 2265 }, { "epoch": 7.747440273037543, - "grad_norm": 0.38671875, - "learning_rate": 0.00011168964498972818, - "loss": 0.9701, + "grad_norm": 0.326171875, + "learning_rate": 2.9352531028590424e-05, + "loss": 0.8261, "step": 2270 }, { "epoch": 7.764505119453925, - "grad_norm": 0.3671875, - "learning_rate": 0.00011129510955209996, - "loss": 0.9832, + "grad_norm": 0.3359375, + "learning_rate": 2.8932204528198926e-05, + "loss": 0.8367, "step": 2275 }, { "epoch": 7.7815699658703075, - "grad_norm": 0.546875, - "learning_rate": 0.00011090039594362045, - "loss": 0.9861, + "grad_norm": 0.333984375, + "learning_rate": 2.8514399714079132e-05, + "loss": 0.8405, "step": 2280 }, { "epoch": 7.798634812286689, - "grad_norm": 0.5078125, - "learning_rate": 0.00011050551039056479, - "loss": 0.9881, + "grad_norm": 0.328125, + "learning_rate": 2.8099131411126867e-05, + "loss": 0.8408, "step": 2285 }, { "epoch": 7.815699658703072, - "grad_norm": 0.375, - "learning_rate": 0.00011011045912192035, - "loss": 0.9872, + "grad_norm": 0.326171875, + "learning_rate": 2.7686414354235356e-05, + "loss": 0.8397, "step": 2290 }, { "epoch": 7.832764505119454, - "grad_norm": 0.373046875, - "learning_rate": 0.0001097152483692886, - "loss": 0.9819, + "grad_norm": 0.34375, + "learning_rate": 2.7276263187772423e-05, + "loss": 0.8385, "step": 2295 }, { "epoch": 7.849829351535837, - "grad_norm": 0.375, - "learning_rate": 0.00010931988436678666, - "loss": 0.9756, + "grad_norm": 0.333984375, + "learning_rate": 2.6868692465060828e-05, + "loss": 0.8309, "step": 2300 }, { "epoch": 7.8668941979522184, - "grad_norm": 0.40234375, - "learning_rate": 0.00010892437335094912, - "loss": 0.9662, + "grad_norm": 0.341796875, + "learning_rate": 2.6463716647861904e-05, + "loss": 0.8229, "step": 2305 }, { "epoch": 7.8839590443686, - "grad_norm": 0.427734375, - "learning_rate": 0.00010852872156062946, - "loss": 0.9669, + "grad_norm": 0.34765625, + "learning_rate": 2.6061350105862382e-05, + "loss": 0.8226, "step": 2310 }, { "epoch": 7.901023890784983, - "grad_norm": 0.388671875, - "learning_rate": 0.00010813293523690191, - "loss": 0.9755, + "grad_norm": 0.33203125, + "learning_rate": 2.5661607116164532e-05, + "loss": 0.8334, "step": 2315 }, { "epoch": 7.918088737201365, - "grad_norm": 0.423828125, - "learning_rate": 0.00010773702062296273, - "loss": 0.9916, + "grad_norm": 0.34765625, + "learning_rate": 2.5264501862779667e-05, + "loss": 0.8444, "step": 2320 }, { "epoch": 7.935153583617748, - "grad_norm": 0.396484375, - "learning_rate": 0.00010734098396403192, - "loss": 0.9869, + "grad_norm": 0.328125, + "learning_rate": 2.4870048436124595e-05, + "loss": 0.8403, "step": 2325 }, { "epoch": 7.952218430034129, - "grad_norm": 0.447265625, - "learning_rate": 0.00010694483150725458, - "loss": 0.978, + "grad_norm": 0.32421875, + "learning_rate": 2.4478260832521938e-05, + "loss": 0.8302, "step": 2330 }, { "epoch": 7.969283276450512, - "grad_norm": 0.451171875, - "learning_rate": 0.00010654856950160253, - "loss": 0.9711, + "grad_norm": 0.33203125, + "learning_rate": 2.4089152953703332e-05, + "loss": 0.8265, "step": 2335 }, { "epoch": 7.986348122866894, - "grad_norm": 0.392578125, - "learning_rate": 0.00010615220419777548, - "loss": 0.9844, + "grad_norm": 0.3359375, + "learning_rate": 2.37027386063162e-05, + "loss": 0.8364, "step": 2340 }, { "epoch": 8.0, - "eval_loss": 2.489572525024414, - "eval_runtime": 0.5472, - "eval_samples_per_second": 18.276, - "eval_steps_per_second": 1.828, + "eval_loss": 2.7742018699645996, + "eval_runtime": 0.5517, + "eval_samples_per_second": 18.125, + "eval_steps_per_second": 1.813, "step": 2344 }, { "epoch": 8.003412969283277, - "grad_norm": 0.44140625, - "learning_rate": 0.00010575574184810269, - "loss": 0.9713, + "grad_norm": 0.33203125, + "learning_rate": 2.331903150143391e-05, + "loss": 0.83, "step": 2345 }, { "epoch": 8.020477815699659, - "grad_norm": 0.3984375, - "learning_rate": 0.0001053591887064442, - "loss": 0.9647, + "grad_norm": 0.330078125, + "learning_rate": 2.293804525406915e-05, + "loss": 0.8208, "step": 2350 }, { "epoch": 8.03754266211604, - "grad_norm": 0.453125, - "learning_rate": 0.00010496255102809223, - "loss": 0.9709, + "grad_norm": 0.345703125, + "learning_rate": 2.255979338269093e-05, + "loss": 0.8288, "step": 2355 }, { "epoch": 8.054607508532424, - "grad_norm": 0.431640625, - "learning_rate": 0.00010456583506967248, - "loss": 0.9701, + "grad_norm": 0.3515625, + "learning_rate": 2.2184289308744844e-05, + "loss": 0.8251, "step": 2360 }, { "epoch": 8.071672354948806, - "grad_norm": 0.44921875, - "learning_rate": 0.00010416904708904548, - "loss": 0.9662, + "grad_norm": 0.33984375, + "learning_rate": 2.1811546356176872e-05, + "loss": 0.8202, "step": 2365 }, { "epoch": 8.088737201365188, - "grad_norm": 0.46875, - "learning_rate": 0.00010377219334520783, - "loss": 0.9616, + "grad_norm": 0.341796875, + "learning_rate": 2.144157775096063e-05, + "loss": 0.8191, "step": 2370 }, { "epoch": 8.10580204778157, - "grad_norm": 0.4140625, - "learning_rate": 0.00010337528009819344, - "loss": 0.9609, + "grad_norm": 0.33984375, + "learning_rate": 2.1074396620628e-05, + "loss": 0.8161, "step": 2375 }, { "epoch": 8.122866894197951, - "grad_norm": 0.42578125, - "learning_rate": 0.00010297831360897492, - "loss": 0.9714, + "grad_norm": 0.330078125, + "learning_rate": 2.0710015993803422e-05, + "loss": 0.8259, "step": 2380 }, { "epoch": 8.139931740614335, - "grad_norm": 0.40234375, - "learning_rate": 0.00010258130013936474, - "loss": 0.9718, + "grad_norm": 0.34375, + "learning_rate": 2.0348448799741537e-05, + "loss": 0.8271, "step": 2385 }, { "epoch": 8.156996587030717, - "grad_norm": 0.4296875, - "learning_rate": 0.00010218424595191631, - "loss": 0.963, + "grad_norm": 0.33984375, + "learning_rate": 1.9989707867868425e-05, + "loss": 0.8222, "step": 2390 }, { "epoch": 8.174061433447099, - "grad_norm": 0.361328125, - "learning_rate": 0.00010178715730982549, - "loss": 0.9612, + "grad_norm": 0.33984375, + "learning_rate": 1.9633805927326387e-05, + "loss": 0.8176, "step": 2395 }, { "epoch": 8.19112627986348, - "grad_norm": 0.451171875, - "learning_rate": 0.00010139004047683151, - "loss": 0.9757, + "grad_norm": 0.341796875, + "learning_rate": 1.9280755606522384e-05, + "loss": 0.8303, "step": 2400 }, { "epoch": 8.208191126279864, - "grad_norm": 0.62890625, - "learning_rate": 0.00010099290171711841, - "loss": 0.961, + "grad_norm": 0.33984375, + "learning_rate": 1.893056943267969e-05, + "loss": 0.8179, "step": 2405 }, { "epoch": 8.225255972696246, - "grad_norm": 0.419921875, - "learning_rate": 0.00010059574729521595, - "loss": 0.962, + "grad_norm": 0.341796875, + "learning_rate": 1.8583259831393663e-05, + "loss": 0.8219, "step": 2410 }, { "epoch": 8.242320819112628, - "grad_norm": 0.51171875, - "learning_rate": 0.0001001985834759011, - "loss": 0.9761, + "grad_norm": 0.337890625, + "learning_rate": 1.8238839126190686e-05, + "loss": 0.829, "step": 2415 }, { "epoch": 8.25938566552901, - "grad_norm": 0.390625, - "learning_rate": 9.980141652409895e-05, - "loss": 0.9718, + "grad_norm": 0.34765625, + "learning_rate": 1.7897319538090962e-05, + "loss": 0.8233, "step": 2420 }, { "epoch": 8.276450511945393, - "grad_norm": 0.41796875, - "learning_rate": 9.940425270478407e-05, - "loss": 0.9672, + "grad_norm": 0.33203125, + "learning_rate": 1.755871318517488e-05, + "loss": 0.8224, "step": 2425 }, { "epoch": 8.293515358361775, - "grad_norm": 0.431640625, - "learning_rate": 9.900709828288164e-05, - "loss": 0.9658, + "grad_norm": 0.328125, + "learning_rate": 1.722303208215297e-05, + "loss": 0.8239, "step": 2430 }, { "epoch": 8.310580204778157, - "grad_norm": 0.4140625, - "learning_rate": 9.860995952316851e-05, - "loss": 0.9776, + "grad_norm": 0.33203125, + "learning_rate": 1.6890288139939625e-05, + "loss": 0.8324, "step": 2435 }, { "epoch": 8.327645051194539, - "grad_norm": 0.37890625, - "learning_rate": 9.821284269017455e-05, - "loss": 0.9664, + "grad_norm": 0.33984375, + "learning_rate": 1.6560493165230516e-05, + "loss": 0.8216, "step": 2440 }, { "epoch": 8.344709897610922, - "grad_norm": 0.380859375, - "learning_rate": 9.781575404808371e-05, - "loss": 0.9672, + "grad_norm": 0.337890625, + "learning_rate": 1.623365886008357e-05, + "loss": 0.8249, "step": 2445 }, { "epoch": 8.361774744027304, - "grad_norm": 0.3828125, - "learning_rate": 9.741869986063526e-05, - "loss": 0.9778, + "grad_norm": 0.3359375, + "learning_rate": 1.5909796821503785e-05, + "loss": 0.8327, "step": 2450 }, { "epoch": 8.378839590443686, - "grad_norm": 0.361328125, - "learning_rate": 9.702168639102509e-05, - "loss": 0.9659, + "grad_norm": 0.3359375, + "learning_rate": 1.5588918541031783e-05, + "loss": 0.8202, "step": 2455 }, { "epoch": 8.395904436860068, - "grad_norm": 0.392578125, - "learning_rate": 9.662471990180657e-05, - "loss": 0.9623, + "grad_norm": 0.337890625, + "learning_rate": 1.5271035404335954e-05, + "loss": 0.8213, "step": 2460 }, { "epoch": 8.41296928327645, - "grad_norm": 0.365234375, - "learning_rate": 9.622780665479222e-05, - "loss": 0.9657, + "grad_norm": 0.33203125, + "learning_rate": 1.4956158690808585e-05, + "loss": 0.8217, "step": 2465 }, { "epoch": 8.430034129692833, - "grad_norm": 0.40234375, - "learning_rate": 9.583095291095453e-05, - "loss": 0.9679, + "grad_norm": 0.359375, + "learning_rate": 1.464429957316552e-05, + "loss": 0.8235, "step": 2470 }, { "epoch": 8.447098976109215, - "grad_norm": 0.443359375, - "learning_rate": 9.543416493032757e-05, - "loss": 0.9686, + "grad_norm": 0.337890625, + "learning_rate": 1.433546911704977e-05, + "loss": 0.8257, "step": 2475 }, { "epoch": 8.464163822525597, - "grad_norm": 0.404296875, - "learning_rate": 9.503744897190778e-05, - "loss": 0.9679, + "grad_norm": 0.3359375, + "learning_rate": 1.402967828063897e-05, + "loss": 0.8228, "step": 2480 }, { "epoch": 8.481228668941979, - "grad_norm": 0.396484375, - "learning_rate": 9.464081129355586e-05, - "loss": 0.9588, + "grad_norm": 0.33203125, + "learning_rate": 1.37269379142563e-05, + "loss": 0.8155, "step": 2485 }, { "epoch": 8.498293515358363, - "grad_norm": 0.431640625, - "learning_rate": 9.424425815189733e-05, - "loss": 0.9775, + "grad_norm": 0.337890625, + "learning_rate": 1.3427258759985739e-05, + "loss": 0.8329, "step": 2490 }, { "epoch": 8.515358361774744, - "grad_norm": 0.384765625, - "learning_rate": 9.384779580222453e-05, - "loss": 0.9668, + "grad_norm": 0.337890625, + "learning_rate": 1.3130651451290798e-05, + "loss": 0.8224, "step": 2495 }, { "epoch": 8.532423208191126, - "grad_norm": 0.447265625, - "learning_rate": 9.345143049839749e-05, - "loss": 0.9677, + "grad_norm": 0.353515625, + "learning_rate": 1.2837126512637198e-05, + "loss": 0.8219, "step": 2500 }, { "epoch": 8.549488054607508, - "grad_norm": 0.48828125, - "learning_rate": 9.305516849274541e-05, - "loss": 0.9603, + "grad_norm": 0.330078125, + "learning_rate": 1.2546694359119493e-05, + "loss": 0.8151, "step": 2505 }, { "epoch": 8.56655290102389, - "grad_norm": 0.427734375, - "learning_rate": 9.265901603596811e-05, - "loss": 0.9688, + "grad_norm": 0.33984375, + "learning_rate": 1.2259365296091464e-05, + "loss": 0.8237, "step": 2510 }, { "epoch": 8.583617747440274, - "grad_norm": 0.498046875, - "learning_rate": 9.226297937703728e-05, - "loss": 0.9645, + "grad_norm": 0.34765625, + "learning_rate": 1.1975149518800454e-05, + "loss": 0.8207, "step": 2515 }, { "epoch": 8.600682593856655, - "grad_norm": 0.431640625, - "learning_rate": 9.186706476309812e-05, - "loss": 0.967, + "grad_norm": 0.341796875, + "learning_rate": 1.1694057112025636e-05, + "loss": 0.8221, "step": 2520 }, { "epoch": 8.617747440273037, - "grad_norm": 0.423828125, - "learning_rate": 9.147127843937055e-05, - "loss": 0.9711, + "grad_norm": 0.33203125, + "learning_rate": 1.141609804972017e-05, + "loss": 0.828, "step": 2525 }, { "epoch": 8.634812286689419, - "grad_norm": 0.455078125, - "learning_rate": 9.107562664905093e-05, - "loss": 0.971, + "grad_norm": 0.345703125, + "learning_rate": 1.1141282194657287e-05, + "loss": 0.8232, "step": 2530 }, { "epoch": 8.651877133105803, - "grad_norm": 0.484375, - "learning_rate": 9.068011563321336e-05, - "loss": 0.9722, + "grad_norm": 0.353515625, + "learning_rate": 1.086961929808038e-05, + "loss": 0.8281, "step": 2535 }, { "epoch": 8.668941979522184, - "grad_norm": 0.435546875, - "learning_rate": 9.028475163071141e-05, - "loss": 0.9747, + "grad_norm": 0.34375, + "learning_rate": 1.0601118999356907e-05, + "loss": 0.8252, "step": 2540 }, { "epoch": 8.686006825938566, - "grad_norm": 0.4140625, - "learning_rate": 8.988954087807968e-05, - "loss": 0.9638, + "grad_norm": 0.333984375, + "learning_rate": 1.0335790825636449e-05, + "loss": 0.8225, "step": 2545 }, { "epoch": 8.703071672354948, - "grad_norm": 0.400390625, - "learning_rate": 8.949448960943524e-05, - "loss": 0.9625, + "grad_norm": 0.341796875, + "learning_rate": 1.00736441915126e-05, + "loss": 0.8199, "step": 2550 }, { "epoch": 8.720136518771332, - "grad_norm": 0.49609375, - "learning_rate": 8.909960405637958e-05, - "loss": 0.9568, + "grad_norm": 0.345703125, + "learning_rate": 9.814688398688998e-06, + "loss": 0.8146, "step": 2555 }, { "epoch": 8.737201365187714, - "grad_norm": 0.435546875, - "learning_rate": 8.870489044790006e-05, - "loss": 0.9766, + "grad_norm": 0.34375, + "learning_rate": 9.558932635649131e-06, + "loss": 0.8303, "step": 2560 }, { "epoch": 8.754266211604095, - "grad_norm": 0.41015625, - "learning_rate": 8.831035501027186e-05, - "loss": 0.967, + "grad_norm": 0.328125, + "learning_rate": 9.306385977330411e-06, + "loss": 0.8224, "step": 2565 }, { "epoch": 8.771331058020477, - "grad_norm": 0.376953125, - "learning_rate": 8.791600396695954e-05, - "loss": 0.9686, + "grad_norm": 0.333984375, + "learning_rate": 9.057057384802181e-06, + "loss": 0.8228, "step": 2570 }, { "epoch": 8.788395904436861, - "grad_norm": 0.373046875, - "learning_rate": 8.752184353851916e-05, - "loss": 0.9684, + "grad_norm": 0.3359375, + "learning_rate": 8.810955704947666e-06, + "loss": 0.8231, "step": 2575 }, { "epoch": 8.805460750853243, - "grad_norm": 0.435546875, - "learning_rate": 8.712787994249979e-05, - "loss": 0.977, + "grad_norm": 0.330078125, + "learning_rate": 8.568089670150115e-06, + "loss": 0.8278, "step": 2580 }, { "epoch": 8.822525597269625, - "grad_norm": 0.419921875, - "learning_rate": 8.673411939334581e-05, - "loss": 0.9712, + "grad_norm": 0.341796875, + "learning_rate": 8.328467897982995e-06, + "loss": 0.8248, "step": 2585 }, { "epoch": 8.839590443686006, - "grad_norm": 0.478515625, - "learning_rate": 8.634056810229862e-05, - "loss": 0.9692, + "grad_norm": 0.333984375, + "learning_rate": 8.092098890904098e-06, + "loss": 0.8195, "step": 2590 }, { "epoch": 8.856655290102388, - "grad_norm": 0.404296875, - "learning_rate": 8.594723227729875e-05, - "loss": 0.9639, + "grad_norm": 0.333984375, + "learning_rate": 7.858991035953944e-06, + "loss": 0.8203, "step": 2595 }, { "epoch": 8.873720136518772, - "grad_norm": 0.447265625, - "learning_rate": 8.555411812288798e-05, - "loss": 0.974, + "grad_norm": 0.328125, + "learning_rate": 7.629152604458156e-06, + "loss": 0.8257, "step": 2600 }, { "epoch": 8.890784982935154, - "grad_norm": 0.392578125, - "learning_rate": 8.516123184011135e-05, - "loss": 0.9589, + "grad_norm": 0.34375, + "learning_rate": 7.402591751733989e-06, + "loss": 0.8128, "step": 2605 }, { "epoch": 8.907849829351536, - "grad_norm": 0.43359375, - "learning_rate": 8.47685796264195e-05, - "loss": 0.968, + "grad_norm": 0.3359375, + "learning_rate": 7.179316516800894e-06, + "loss": 0.8251, "step": 2610 }, { "epoch": 8.924914675767917, - "grad_norm": 0.396484375, - "learning_rate": 8.437616767557077e-05, - "loss": 0.9693, + "grad_norm": 0.341796875, + "learning_rate": 6.959334822095354e-06, + "loss": 0.824, "step": 2615 }, { "epoch": 8.941979522184301, - "grad_norm": 0.5390625, - "learning_rate": 8.398400217753357e-05, - "loss": 0.9727, + "grad_norm": 0.3515625, + "learning_rate": 6.7426544731897245e-06, + "loss": 0.8287, "step": 2620 }, { "epoch": 8.959044368600683, - "grad_norm": 0.419921875, - "learning_rate": 8.359208931838871e-05, - "loss": 0.9708, + "grad_norm": 0.3359375, + "learning_rate": 6.529283158515276e-06, + "loss": 0.8264, "step": 2625 }, { "epoch": 8.976109215017065, - "grad_norm": 0.427734375, - "learning_rate": 8.320043528023188e-05, - "loss": 0.9607, + "grad_norm": 0.337890625, + "learning_rate": 6.319228449089376e-06, + "loss": 0.8179, "step": 2630 }, { "epoch": 8.993174061433447, - "grad_norm": 0.455078125, - "learning_rate": 8.280904624107606e-05, - "loss": 0.9779, + "grad_norm": 0.34375, + "learning_rate": 6.11249779824693e-06, + "loss": 0.8311, "step": 2635 }, { "epoch": 9.0, - "eval_loss": 2.502519130706787, - "eval_runtime": 0.5483, - "eval_samples_per_second": 18.238, - "eval_steps_per_second": 1.824, + "eval_loss": 2.7970776557922363, + "eval_runtime": 0.547, + "eval_samples_per_second": 18.282, + "eval_steps_per_second": 1.828, "step": 2637 }, { "epoch": 9.01023890784983, - "grad_norm": 0.3828125, - "learning_rate": 8.241792837475405e-05, - "loss": 0.9673, + "grad_norm": 0.33984375, + "learning_rate": 5.909098541375746e-06, + "loss": 0.827, "step": 2640 }, { "epoch": 9.027303754266212, - "grad_norm": 0.42578125, - "learning_rate": 8.202708785082121e-05, - "loss": 0.9481, + "grad_norm": 0.333984375, + "learning_rate": 5.7090378956564216e-06, + "loss": 0.8173, "step": 2645 }, { "epoch": 9.044368600682594, - "grad_norm": 0.39453125, - "learning_rate": 8.163653083445799e-05, - "loss": 0.9694, + "grad_norm": 0.341796875, + "learning_rate": 5.512322959806193e-06, + "loss": 0.8315, "step": 2650 }, { "epoch": 9.061433447098976, - "grad_norm": 0.392578125, - "learning_rate": 8.124626348637279e-05, - "loss": 0.9651, + "grad_norm": 0.345703125, + "learning_rate": 5.3189607138270255e-06, + "loss": 0.8278, "step": 2655 }, { "epoch": 9.078498293515358, - "grad_norm": 0.376953125, - "learning_rate": 8.085629196270469e-05, - "loss": 0.9561, + "grad_norm": 0.328125, + "learning_rate": 5.128958018758012e-06, + "loss": 0.821, "step": 2660 }, { "epoch": 9.095563139931741, - "grad_norm": 0.408203125, - "learning_rate": 8.046662241492645e-05, - "loss": 0.9617, + "grad_norm": 0.337890625, + "learning_rate": 4.942321616431833e-06, + "loss": 0.8261, "step": 2665 }, { "epoch": 9.112627986348123, - "grad_norm": 0.408203125, - "learning_rate": 8.007726098974734e-05, - "loss": 0.9636, + "grad_norm": 0.341796875, + "learning_rate": 4.7590581292356276e-06, + "loss": 0.8267, "step": 2670 }, { "epoch": 9.129692832764505, - "grad_norm": 0.390625, - "learning_rate": 7.96882138290163e-05, - "loss": 0.9661, + "grad_norm": 0.3359375, + "learning_rate": 4.579174059875946e-06, + "loss": 0.8265, "step": 2675 }, { "epoch": 9.146757679180887, - "grad_norm": 0.396484375, - "learning_rate": 7.929948706962508e-05, - "loss": 0.9577, + "grad_norm": 0.33203125, + "learning_rate": 4.402675791148059e-06, + "loss": 0.8217, "step": 2680 }, { "epoch": 9.16382252559727, - "grad_norm": 0.41796875, - "learning_rate": 7.891108684341121e-05, - "loss": 0.961, + "grad_norm": 0.330078125, + "learning_rate": 4.229569585709425e-06, + "loss": 0.8245, "step": 2685 }, { "epoch": 9.180887372013652, - "grad_norm": 0.37109375, - "learning_rate": 7.852301927706159e-05, - "loss": 0.9602, + "grad_norm": 0.3359375, + "learning_rate": 4.0598615858575605e-06, + "loss": 0.8211, "step": 2690 }, { "epoch": 9.197952218430034, - "grad_norm": 0.396484375, - "learning_rate": 7.813529049201556e-05, - "loss": 0.9544, + "grad_norm": 0.330078125, + "learning_rate": 3.89355781331201e-06, + "loss": 0.8162, "step": 2695 }, { "epoch": 9.215017064846416, - "grad_norm": 0.470703125, - "learning_rate": 7.774790660436858e-05, - "loss": 0.9569, + "grad_norm": 0.33203125, + "learning_rate": 3.730664169000708e-06, + "loss": 0.8154, "step": 2700 }, { "epoch": 9.2320819112628, - "grad_norm": 0.375, - "learning_rate": 7.736087372477554e-05, - "loss": 0.9636, + "grad_norm": 0.330078125, + "learning_rate": 3.571186432850626e-06, + "loss": 0.8245, "step": 2705 }, { "epoch": 9.249146757679181, - "grad_norm": 0.37109375, - "learning_rate": 7.69741979583546e-05, - "loss": 0.9574, + "grad_norm": 0.333984375, + "learning_rate": 3.415130263582611e-06, + "loss": 0.8198, "step": 2710 }, { "epoch": 9.266211604095563, - "grad_norm": 0.390625, - "learning_rate": 7.658788540459062e-05, - "loss": 0.9536, + "grad_norm": 0.330078125, + "learning_rate": 3.2625011985107257e-06, + "loss": 0.8178, "step": 2715 }, { "epoch": 9.283276450511945, - "grad_norm": 0.388671875, - "learning_rate": 7.620194215723919e-05, - "loss": 0.9598, + "grad_norm": 0.337890625, + "learning_rate": 3.1133046533455947e-06, + "loss": 0.825, "step": 2720 }, { "epoch": 9.300341296928327, - "grad_norm": 0.3828125, - "learning_rate": 7.581637430423037e-05, - "loss": 0.9657, + "grad_norm": 0.3359375, + "learning_rate": 2.967545922002379e-06, + "loss": 0.8249, "step": 2725 }, { "epoch": 9.31740614334471, - "grad_norm": 0.435546875, - "learning_rate": 7.543118792757266e-05, - "loss": 0.9639, + "grad_norm": 0.337890625, + "learning_rate": 2.8252301764128962e-06, + "loss": 0.8228, "step": 2730 }, { "epoch": 9.334470989761092, - "grad_norm": 0.408203125, - "learning_rate": 7.504638910325717e-05, - "loss": 0.9625, + "grad_norm": 0.3359375, + "learning_rate": 2.686362466342085e-06, + "loss": 0.822, "step": 2735 }, { "epoch": 9.351535836177474, - "grad_norm": 0.37109375, - "learning_rate": 7.466198390116158e-05, - "loss": 0.9585, + "grad_norm": 0.3359375, + "learning_rate": 2.550947719208829e-06, + "loss": 0.8224, "step": 2740 }, { "epoch": 9.368600682593856, - "grad_norm": 0.447265625, - "learning_rate": 7.427797838495463e-05, - "loss": 0.9634, + "grad_norm": 0.34375, + "learning_rate": 2.4189907399111534e-06, + "loss": 0.8224, "step": 2745 }, { "epoch": 9.38566552901024, - "grad_norm": 0.41796875, - "learning_rate": 7.389437861200024e-05, - "loss": 0.9624, + "grad_norm": 0.33984375, + "learning_rate": 2.2904962106556793e-06, + "loss": 0.82, "step": 2750 }, { "epoch": 9.402730375426621, - "grad_norm": 0.408203125, - "learning_rate": 7.35111906332622e-05, - "loss": 0.9555, + "grad_norm": 0.337890625, + "learning_rate": 2.1654686907915167e-06, + "loss": 0.8183, "step": 2755 }, { "epoch": 9.419795221843003, - "grad_norm": 0.435546875, - "learning_rate": 7.312842049320844e-05, - "loss": 0.9575, + "grad_norm": 0.33984375, + "learning_rate": 2.0439126166485025e-06, + "loss": 0.8189, "step": 2760 }, { "epoch": 9.436860068259385, - "grad_norm": 0.42578125, - "learning_rate": 7.2746074229716e-05, - "loss": 0.9598, + "grad_norm": 0.333984375, + "learning_rate": 1.925832301379726e-06, + "loss": 0.8215, "step": 2765 }, { "epoch": 9.453924914675769, - "grad_norm": 0.423828125, - "learning_rate": 7.236415787397548e-05, - "loss": 0.9594, + "grad_norm": 0.341796875, + "learning_rate": 1.8112319348085771e-06, + "loss": 0.8235, "step": 2770 }, { "epoch": 9.47098976109215, - "grad_norm": 0.408203125, - "learning_rate": 7.198267745039612e-05, - "loss": 0.9571, + "grad_norm": 0.341796875, + "learning_rate": 1.700115583279993e-06, + "loss": 0.8157, "step": 2775 }, { "epoch": 9.488054607508532, - "grad_norm": 0.41015625, - "learning_rate": 7.160163897651075e-05, - "loss": 0.9582, + "grad_norm": 0.337890625, + "learning_rate": 1.592487189516212e-06, + "loss": 0.8192, "step": 2780 }, { "epoch": 9.505119453924914, - "grad_norm": 0.453125, - "learning_rate": 7.122104846288064e-05, - "loss": 0.9583, + "grad_norm": 0.3359375, + "learning_rate": 1.4883505724768932e-06, + "loss": 0.8168, "step": 2785 }, { "epoch": 9.522184300341298, - "grad_norm": 0.474609375, - "learning_rate": 7.08409119130011e-05, - "loss": 0.9713, + "grad_norm": 0.341796875, + "learning_rate": 1.3877094272235712e-06, + "loss": 0.8296, "step": 2790 }, { "epoch": 9.53924914675768, - "grad_norm": 0.388671875, - "learning_rate": 7.04612353232063e-05, - "loss": 0.9538, + "grad_norm": 0.3359375, + "learning_rate": 1.2905673247885718e-06, + "loss": 0.8166, "step": 2795 }, { "epoch": 9.556313993174061, - "grad_norm": 0.41796875, - "learning_rate": 7.008202468257514e-05, - "loss": 0.9572, + "grad_norm": 0.3359375, + "learning_rate": 1.196927712048257e-06, + "loss": 0.817, "step": 2800 }, { "epoch": 9.573378839590443, - "grad_norm": 0.41015625, - "learning_rate": 6.970328597283637e-05, - "loss": 0.9483, + "grad_norm": 0.33984375, + "learning_rate": 1.1067939116008009e-06, + "loss": 0.813, "step": 2805 }, { "epoch": 9.590443686006825, - "grad_norm": 0.40234375, - "learning_rate": 6.932502516827461e-05, - "loss": 0.9521, + "grad_norm": 0.33203125, + "learning_rate": 1.020169121648218e-06, + "loss": 0.8114, "step": 2810 }, { "epoch": 9.607508532423209, - "grad_norm": 0.38671875, - "learning_rate": 6.894724823563583e-05, - "loss": 0.9534, + "grad_norm": 0.32421875, + "learning_rate": 9.370564158829087e-07, + "loss": 0.8146, "step": 2815 }, { "epoch": 9.62457337883959, - "grad_norm": 0.41015625, - "learning_rate": 6.85699611340333e-05, - "loss": 0.9611, + "grad_norm": 0.333984375, + "learning_rate": 8.574587433786363e-07, + "loss": 0.8216, "step": 2820 }, { "epoch": 9.641638225255972, - "grad_norm": 0.369140625, - "learning_rate": 6.819316981485372e-05, - "loss": 0.9499, + "grad_norm": 0.34375, + "learning_rate": 7.813789284857986e-07, + "loss": 0.8157, "step": 2825 }, { "epoch": 9.658703071672354, - "grad_norm": 0.361328125, - "learning_rate": 6.781688022166311e-05, - "loss": 0.9689, + "grad_norm": 0.333984375, + "learning_rate": 7.088196707312977e-07, + "loss": 0.8283, "step": 2830 }, { "epoch": 9.675767918088738, - "grad_norm": 0.40234375, - "learning_rate": 6.744109829011332e-05, - "loss": 0.9492, + "grad_norm": 0.330078125, + "learning_rate": 6.39783544722694e-07, + "loss": 0.8092, "step": 2835 }, { "epoch": 9.69283276450512, - "grad_norm": 0.384765625, - "learning_rate": 6.706582994784814e-05, - "loss": 0.9626, + "grad_norm": 0.330078125, + "learning_rate": 5.742730000568908e-07, + "loss": 0.8242, "step": 2840 }, { "epoch": 9.709897610921502, - "grad_norm": 0.408203125, - "learning_rate": 6.669108111441003e-05, - "loss": 0.9641, + "grad_norm": 0.341796875, + "learning_rate": 5.12290361233192e-07, + "loss": 0.8239, "step": 2845 }, { "epoch": 9.726962457337883, - "grad_norm": 0.40234375, - "learning_rate": 6.631685770114654e-05, - "loss": 0.9578, + "grad_norm": 0.33984375, + "learning_rate": 4.538378275708133e-07, + "loss": 0.8145, "step": 2850 }, { "epoch": 9.744027303754265, - "grad_norm": 0.37890625, - "learning_rate": 6.594316561111724e-05, - "loss": 0.9648, + "grad_norm": 0.3359375, + "learning_rate": 3.989174731308998e-07, + "loss": 0.8249, "step": 2855 }, { "epoch": 9.761092150170649, - "grad_norm": 0.390625, - "learning_rate": 6.557001073900044e-05, - "loss": 0.957, + "grad_norm": 0.3359375, + "learning_rate": 3.4753124664286265e-07, + "loss": 0.817, "step": 2860 }, { "epoch": 9.77815699658703, - "grad_norm": 0.375, - "learning_rate": 6.519739897100034e-05, - "loss": 0.9513, + "grad_norm": 0.33203125, + "learning_rate": 2.9968097143526775e-07, + "loss": 0.8115, "step": 2865 }, { "epoch": 9.795221843003413, - "grad_norm": 0.453125, - "learning_rate": 6.482533618475422e-05, - "loss": 0.9591, + "grad_norm": 0.35546875, + "learning_rate": 2.5536834537114307e-07, + "loss": 0.8192, "step": 2870 }, { "epoch": 9.812286689419794, - "grad_norm": 0.369140625, - "learning_rate": 6.445382824923938e-05, - "loss": 0.9625, + "grad_norm": 0.328125, + "learning_rate": 2.145949407877157e-07, + "loss": 0.8181, "step": 2875 }, { "epoch": 9.829351535836178, - "grad_norm": 0.37109375, - "learning_rate": 6.408288102468113e-05, - "loss": 0.9606, + "grad_norm": 0.3359375, + "learning_rate": 1.7736220444064533e-07, + "loss": 0.8203, "step": 2880 }, { "epoch": 9.84641638225256, - "grad_norm": 0.37890625, - "learning_rate": 6.371250036245976e-05, - "loss": 0.9662, + "grad_norm": 0.333984375, + "learning_rate": 1.436714574526543e-07, + "loss": 0.826, "step": 2885 }, { "epoch": 9.863481228668942, - "grad_norm": 0.373046875, - "learning_rate": 6.334269210501875e-05, - "loss": 0.9635, + "grad_norm": 0.337890625, + "learning_rate": 1.1352389526668727e-07, + "loss": 0.8241, "step": 2890 }, { "epoch": 9.880546075085324, - "grad_norm": 0.365234375, - "learning_rate": 6.297346208577213e-05, - "loss": 0.9649, + "grad_norm": 0.3359375, + "learning_rate": 8.692058760345622e-08, + "loss": 0.8268, "step": 2895 }, { "epoch": 9.897610921501707, - "grad_norm": 0.390625, - "learning_rate": 6.260481612901299e-05, - "loss": 0.9516, + "grad_norm": 0.330078125, + "learning_rate": 6.386247842353754e-08, + "loss": 0.8106, "step": 2900 }, { "epoch": 9.914675767918089, - "grad_norm": 0.3828125, - "learning_rate": 6.223676004982105e-05, - "loss": 0.9601, + "grad_norm": 0.330078125, + "learning_rate": 4.435038589380991e-08, + "loss": 0.8232, "step": 2905 }, { "epoch": 9.93174061433447, - "grad_norm": 0.5625, - "learning_rate": 6.18692996539714e-05, - "loss": 0.9611, + "grad_norm": 0.3359375, + "learning_rate": 2.8385002358466418e-08, + "loss": 0.8187, "step": 2910 }, { "epoch": 9.948805460750853, - "grad_norm": 0.39453125, - "learning_rate": 6.150244073784266e-05, - "loss": 0.9742, + "grad_norm": 0.337890625, + "learning_rate": 1.5966894314456415e-08, + "loss": 0.8284, "step": 2915 }, { "epoch": 9.965870307167236, - "grad_norm": 0.4296875, - "learning_rate": 6.113618908832561e-05, - "loss": 0.9666, + "grad_norm": 0.333984375, + "learning_rate": 7.096502391346071e-09, + "loss": 0.8275, "step": 2920 }, { "epoch": 9.982935153583618, - "grad_norm": 0.447265625, - "learning_rate": 6.0770550482731924e-05, - "loss": 0.9684, + "grad_norm": 0.33984375, + "learning_rate": 1.7741413357197368e-09, + "loss": 0.8271, "step": 2925 }, { "epoch": 10.0, - "grad_norm": 0.41015625, - "learning_rate": 6.0405530688702986e-05, - "loss": 0.9639, + "grad_norm": 0.333984375, + "learning_rate": 0.0, + "loss": 0.8243, "step": 2930 }, { "epoch": 10.0, - "eval_loss": 2.512617588043213, - "eval_runtime": 0.5446, - "eval_samples_per_second": 18.362, - "eval_steps_per_second": 1.836, + "eval_loss": 2.7977683544158936, + "eval_runtime": 0.5422, + "eval_samples_per_second": 18.444, + "eval_steps_per_second": 1.844, "step": 2930 }, { - "epoch": 10.017064846416382, - "grad_norm": 0.427734375, - "learning_rate": 6.0041135464119024e-05, - "loss": 0.9618, - "step": 2935 - }, - { - "epoch": 10.034129692832764, - "grad_norm": 0.384765625, - "learning_rate": 5.9677370557008104e-05, - "loss": 0.9433, - "step": 2940 - }, - { - "epoch": 10.051194539249147, - "grad_norm": 0.478515625, - "learning_rate": 5.9314241705455674e-05, - "loss": 0.9543, - "step": 2945 - }, - { - "epoch": 10.06825938566553, - "grad_norm": 0.408203125, - "learning_rate": 5.895175463751385e-05, - "loss": 0.9579, - "step": 2950 - }, - { - "epoch": 10.085324232081911, - "grad_norm": 0.380859375, - "learning_rate": 5.858991507111122e-05, - "loss": 0.9506, - "step": 2955 - }, - { - "epoch": 10.102389078498293, - "grad_norm": 0.3828125, - "learning_rate": 5.8228728713962543e-05, - "loss": 0.9582, - "step": 2960 - }, - { - "epoch": 10.119453924914676, - "grad_norm": 0.38671875, - "learning_rate": 5.786820126347876e-05, - "loss": 0.9576, - "step": 2965 - }, - { - "epoch": 10.136518771331058, - "grad_norm": 0.4140625, - "learning_rate": 5.750833840667711e-05, - "loss": 0.9506, - "step": 2970 - }, - { - "epoch": 10.15358361774744, - "grad_norm": 0.390625, - "learning_rate": 5.7149145820091385e-05, - "loss": 0.952, - "step": 2975 - }, - { - "epoch": 10.170648464163822, - "grad_norm": 0.38671875, - "learning_rate": 5.6790629169682564e-05, - "loss": 0.9532, - "step": 2980 - }, - { - "epoch": 10.187713310580206, - "grad_norm": 0.396484375, - "learning_rate": 5.6432794110749134e-05, - "loss": 0.9459, - "step": 2985 - }, - { - "epoch": 10.204778156996587, - "grad_norm": 0.490234375, - "learning_rate": 5.607564628783817e-05, - "loss": 0.9513, - "step": 2990 - }, - { - "epoch": 10.22184300341297, - "grad_norm": 0.41796875, - "learning_rate": 5.571919133465605e-05, - "loss": 0.9499, - "step": 2995 - }, - { - "epoch": 10.238907849829351, - "grad_norm": 0.392578125, - "learning_rate": 5.5363434873979903e-05, - "loss": 0.9481, - "step": 3000 - }, - { - "epoch": 10.255972696245733, - "grad_norm": 0.380859375, - "learning_rate": 5.500838251756857e-05, - "loss": 0.9501, - "step": 3005 - }, - { - "epoch": 10.273037542662117, - "grad_norm": 0.3671875, - "learning_rate": 5.465403986607426e-05, - "loss": 0.9498, - "step": 3010 - }, - { - "epoch": 10.290102389078498, - "grad_norm": 0.396484375, - "learning_rate": 5.430041250895428e-05, - "loss": 0.947, - "step": 3015 - }, - { - "epoch": 10.30716723549488, - "grad_norm": 0.42578125, - "learning_rate": 5.3947506024382665e-05, - "loss": 0.9581, - "step": 3020 - }, - { - "epoch": 10.324232081911262, - "grad_norm": 0.408203125, - "learning_rate": 5.359532597916233e-05, - "loss": 0.9549, - "step": 3025 - }, - { - "epoch": 10.341296928327646, - "grad_norm": 0.40625, - "learning_rate": 5.324387792863719e-05, - "loss": 0.968, - "step": 3030 - }, - { - "epoch": 10.358361774744028, - "grad_norm": 0.404296875, - "learning_rate": 5.289316741660466e-05, - "loss": 0.9499, - "step": 3035 - }, - { - "epoch": 10.37542662116041, - "grad_norm": 0.3828125, - "learning_rate": 5.254319997522796e-05, - "loss": 0.9639, - "step": 3040 - }, - { - "epoch": 10.392491467576791, - "grad_norm": 0.404296875, - "learning_rate": 5.21939811249492e-05, - "loss": 0.9555, - "step": 3045 - }, - { - "epoch": 10.409556313993175, - "grad_norm": 0.38671875, - "learning_rate": 5.1845516374401784e-05, - "loss": 0.9533, - "step": 3050 - }, - { - "epoch": 10.426621160409557, - "grad_norm": 0.421875, - "learning_rate": 5.14978112203241e-05, - "loss": 0.9632, - "step": 3055 - }, - { - "epoch": 10.443686006825939, - "grad_norm": 0.380859375, - "learning_rate": 5.11508711474725e-05, - "loss": 0.9596, - "step": 3060 - }, - { - "epoch": 10.46075085324232, - "grad_norm": 0.4140625, - "learning_rate": 5.080470162853472e-05, - "loss": 0.963, - "step": 3065 - }, - { - "epoch": 10.477815699658702, - "grad_norm": 0.412109375, - "learning_rate": 5.0459308124043715e-05, - "loss": 0.9602, - "step": 3070 - }, - { - "epoch": 10.494880546075086, - "grad_norm": 0.4375, - "learning_rate": 5.0114696082291425e-05, - "loss": 0.9429, - "step": 3075 - }, - { - "epoch": 10.511945392491468, - "grad_norm": 0.3828125, - "learning_rate": 4.9770870939242986e-05, - "loss": 0.9569, - "step": 3080 - }, - { - "epoch": 10.52901023890785, - "grad_norm": 0.396484375, - "learning_rate": 4.942783811845074e-05, - "loss": 0.945, - "step": 3085 - }, - { - "epoch": 10.546075085324231, - "grad_norm": 0.38671875, - "learning_rate": 4.908560303096887e-05, - "loss": 0.955, - "step": 3090 - }, - { - "epoch": 10.563139931740615, - "grad_norm": 0.404296875, - "learning_rate": 4.874417107526795e-05, - "loss": 0.9583, - "step": 3095 - }, - { - "epoch": 10.580204778156997, - "grad_norm": 0.38671875, - "learning_rate": 4.840354763714991e-05, - "loss": 0.9499, - "step": 3100 - }, - { - "epoch": 10.597269624573379, - "grad_norm": 0.41015625, - "learning_rate": 4.8063738089662926e-05, - "loss": 0.9528, - "step": 3105 - }, - { - "epoch": 10.61433447098976, - "grad_norm": 0.373046875, - "learning_rate": 4.772474779301669e-05, - "loss": 0.9581, - "step": 3110 - }, - { - "epoch": 10.631399317406144, - "grad_norm": 0.4140625, - "learning_rate": 4.738658209449805e-05, - "loss": 0.9456, - "step": 3115 - }, - { - "epoch": 10.648464163822526, - "grad_norm": 0.384765625, - "learning_rate": 4.704924632838636e-05, - "loss": 0.9507, - "step": 3120 - }, - { - "epoch": 10.665529010238908, - "grad_norm": 0.384765625, - "learning_rate": 4.671274581586958e-05, - "loss": 0.9586, - "step": 3125 - }, - { - "epoch": 10.68259385665529, - "grad_norm": 0.375, - "learning_rate": 4.637708586496018e-05, - "loss": 0.9487, - "step": 3130 - }, - { - "epoch": 10.699658703071673, - "grad_norm": 0.38671875, - "learning_rate": 4.604227177041156e-05, - "loss": 0.9511, - "step": 3135 - }, - { - "epoch": 10.716723549488055, - "grad_norm": 0.404296875, - "learning_rate": 4.570830881363439e-05, - "loss": 0.9529, - "step": 3140 - }, - { - "epoch": 10.733788395904437, - "grad_norm": 0.5078125, - "learning_rate": 4.537520226261333e-05, - "loss": 0.962, - "step": 3145 - }, - { - "epoch": 10.750853242320819, - "grad_norm": 0.396484375, - "learning_rate": 4.5042957371824057e-05, - "loss": 0.9551, - "step": 3150 - }, - { - "epoch": 10.7679180887372, - "grad_norm": 0.42578125, - "learning_rate": 4.471157938215017e-05, - "loss": 0.9537, - "step": 3155 - }, - { - "epoch": 10.784982935153584, - "grad_norm": 0.3984375, - "learning_rate": 4.438107352080076e-05, - "loss": 0.9573, - "step": 3160 - }, - { - "epoch": 10.802047781569966, - "grad_norm": 0.384765625, - "learning_rate": 4.405144500122772e-05, - "loss": 0.9615, - "step": 3165 - }, - { - "epoch": 10.819112627986348, - "grad_norm": 0.365234375, - "learning_rate": 4.372269902304363e-05, - "loss": 0.9592, - "step": 3170 - }, - { - "epoch": 10.83617747440273, - "grad_norm": 0.38671875, - "learning_rate": 4.339484077193974e-05, - "loss": 0.9518, - "step": 3175 - }, - { - "epoch": 10.853242320819113, - "grad_norm": 0.423828125, - "learning_rate": 4.3067875419604184e-05, - "loss": 0.953, - "step": 3180 - }, - { - "epoch": 10.870307167235495, - "grad_norm": 0.376953125, - "learning_rate": 4.2741808123640335e-05, - "loss": 0.9578, - "step": 3185 - }, - { - "epoch": 10.887372013651877, - "grad_norm": 0.36328125, - "learning_rate": 4.241664402748544e-05, - "loss": 0.9548, - "step": 3190 - }, - { - "epoch": 10.904436860068259, - "grad_norm": 0.361328125, - "learning_rate": 4.209238826032965e-05, - "loss": 0.955, - "step": 3195 - }, - { - "epoch": 10.921501706484642, - "grad_norm": 0.380859375, - "learning_rate": 4.1769045937034876e-05, - "loss": 0.9591, - "step": 3200 - }, - { - "epoch": 10.938566552901024, - "grad_norm": 0.43359375, - "learning_rate": 4.144662215805426e-05, - "loss": 0.9544, - "step": 3205 - }, - { - "epoch": 10.955631399317406, - "grad_norm": 0.58984375, - "learning_rate": 4.1125122009351634e-05, - "loss": 0.9539, - "step": 3210 - }, - { - "epoch": 10.972696245733788, - "grad_norm": 0.416015625, - "learning_rate": 4.080455056232147e-05, - "loss": 0.9497, - "step": 3215 - }, - { - "epoch": 10.98976109215017, - "grad_norm": 0.421875, - "learning_rate": 4.048491287370863e-05, - "loss": 0.952, - "step": 3220 - }, - { - "epoch": 11.0, - "eval_loss": 2.519228935241699, - "eval_runtime": 0.5351, - "eval_samples_per_second": 18.688, - "eval_steps_per_second": 1.869, - "step": 3223 - }, - { - "epoch": 11.006825938566553, - "grad_norm": 0.404296875, - "learning_rate": 4.016621398552877e-05, - "loss": 0.954, - "step": 3225 - }, - { - "epoch": 11.023890784982935, - "grad_norm": 0.390625, - "learning_rate": 3.9848458924988684e-05, - "loss": 0.9494, - "step": 3230 - }, - { - "epoch": 11.040955631399317, - "grad_norm": 0.404296875, - "learning_rate": 3.953165270440721e-05, - "loss": 0.9434, - "step": 3235 - }, - { - "epoch": 11.058020477815699, - "grad_norm": 0.38671875, - "learning_rate": 3.921580032113602e-05, - "loss": 0.9542, - "step": 3240 - }, - { - "epoch": 11.075085324232083, - "grad_norm": 0.388671875, - "learning_rate": 3.8900906757480614e-05, - "loss": 0.9519, - "step": 3245 - }, - { - "epoch": 11.092150170648464, - "grad_norm": 0.388671875, - "learning_rate": 3.858697698062217e-05, - "loss": 0.9597, - "step": 3250 - }, - { - "epoch": 11.109215017064846, - "grad_norm": 0.373046875, - "learning_rate": 3.8274015942538745e-05, - "loss": 0.9437, - "step": 3255 - }, - { - "epoch": 11.126279863481228, - "grad_norm": 0.37890625, - "learning_rate": 3.7962028579927555e-05, - "loss": 0.9545, - "step": 3260 - }, - { - "epoch": 11.143344709897612, - "grad_norm": 0.392578125, - "learning_rate": 3.7651019814126654e-05, - "loss": 0.9524, - "step": 3265 - }, - { - "epoch": 11.160409556313994, - "grad_norm": 0.37890625, - "learning_rate": 3.734099455103779e-05, - "loss": 0.9591, - "step": 3270 - }, - { - "epoch": 11.177474402730375, - "grad_norm": 0.38671875, - "learning_rate": 3.7031957681048604e-05, - "loss": 0.9503, - "step": 3275 - }, - { - "epoch": 11.194539249146757, - "grad_norm": 0.384765625, - "learning_rate": 3.6723914078955825e-05, - "loss": 0.9456, - "step": 3280 - }, - { - "epoch": 11.211604095563139, - "grad_norm": 0.380859375, - "learning_rate": 3.64168686038881e-05, - "loss": 0.9426, - "step": 3285 - }, - { - "epoch": 11.228668941979523, - "grad_norm": 0.390625, - "learning_rate": 3.6110826099229453e-05, - "loss": 0.9496, - "step": 3290 - }, - { - "epoch": 11.245733788395905, - "grad_norm": 0.37109375, - "learning_rate": 3.580579139254303e-05, - "loss": 0.9515, - "step": 3295 - }, - { - "epoch": 11.262798634812286, - "grad_norm": 0.3828125, - "learning_rate": 3.550176929549468e-05, - "loss": 0.9535, - "step": 3300 - }, - { - "epoch": 11.279863481228668, - "grad_norm": 0.3671875, - "learning_rate": 3.5198764603777235e-05, - "loss": 0.9575, - "step": 3305 - }, - { - "epoch": 11.296928327645052, - "grad_norm": 0.376953125, - "learning_rate": 3.489678209703475e-05, - "loss": 0.9468, - "step": 3310 - }, - { - "epoch": 11.313993174061434, - "grad_norm": 0.396484375, - "learning_rate": 3.459582653878731e-05, - "loss": 0.9536, - "step": 3315 - }, - { - "epoch": 11.331058020477816, - "grad_norm": 0.39453125, - "learning_rate": 3.429590267635565e-05, - "loss": 0.9575, - "step": 3320 - }, - { - "epoch": 11.348122866894197, - "grad_norm": 0.38671875, - "learning_rate": 3.399701524078635e-05, - "loss": 0.9533, - "step": 3325 - }, - { - "epoch": 11.365187713310581, - "grad_norm": 0.380859375, - "learning_rate": 3.369916894677733e-05, - "loss": 0.9414, - "step": 3330 - }, - { - "epoch": 11.382252559726963, - "grad_norm": 0.421875, - "learning_rate": 3.340236849260324e-05, - "loss": 0.9494, - "step": 3335 - }, - { - "epoch": 11.399317406143345, - "grad_norm": 0.419921875, - "learning_rate": 3.31066185600417e-05, - "loss": 0.9457, - "step": 3340 - }, - { - "epoch": 11.416382252559726, - "grad_norm": 0.384765625, - "learning_rate": 3.281192381429894e-05, - "loss": 0.9403, - "step": 3345 - }, - { - "epoch": 11.43344709897611, - "grad_norm": 0.375, - "learning_rate": 3.251828890393677e-05, - "loss": 0.9489, - "step": 3350 - }, - { - "epoch": 11.450511945392492, - "grad_norm": 0.412109375, - "learning_rate": 3.222571846079881e-05, - "loss": 0.9525, - "step": 3355 - }, - { - "epoch": 11.467576791808874, - "grad_norm": 0.37109375, - "learning_rate": 3.193421709993779e-05, - "loss": 0.9574, - "step": 3360 - }, - { - "epoch": 11.484641638225256, - "grad_norm": 0.390625, - "learning_rate": 3.1643789419542324e-05, - "loss": 0.9453, - "step": 3365 - }, - { - "epoch": 11.501706484641637, - "grad_norm": 0.3828125, - "learning_rate": 3.135444000086485e-05, - "loss": 0.9462, - "step": 3370 - }, - { - "epoch": 11.518771331058021, - "grad_norm": 0.384765625, - "learning_rate": 3.1066173408148955e-05, - "loss": 0.9551, - "step": 3375 - }, - { - "epoch": 11.535836177474403, - "grad_norm": 0.404296875, - "learning_rate": 3.077899418855772e-05, - "loss": 0.9504, - "step": 3380 - }, - { - "epoch": 11.552901023890785, - "grad_norm": 0.400390625, - "learning_rate": 3.04929068721017e-05, - "loss": 0.9496, - "step": 3385 - }, - { - "epoch": 11.569965870307167, - "grad_norm": 0.380859375, - "learning_rate": 3.0207915971567624e-05, - "loss": 0.9426, - "step": 3390 - }, - { - "epoch": 11.58703071672355, - "grad_norm": 0.384765625, - "learning_rate": 2.992402598244727e-05, - "loss": 0.9458, - "step": 3395 - }, - { - "epoch": 11.604095563139932, - "grad_norm": 0.384765625, - "learning_rate": 2.9641241382866348e-05, - "loss": 0.9525, - "step": 3400 - }, - { - "epoch": 11.621160409556314, - "grad_norm": 0.400390625, - "learning_rate": 2.9359566633514037e-05, - "loss": 0.9449, - "step": 3405 - }, - { - "epoch": 11.638225255972696, - "grad_norm": 0.380859375, - "learning_rate": 2.907900617757252e-05, - "loss": 0.9526, - "step": 3410 - }, - { - "epoch": 11.655290102389078, - "grad_norm": 0.373046875, - "learning_rate": 2.879956444064703e-05, - "loss": 0.9598, - "step": 3415 - }, - { - "epoch": 11.672354948805461, - "grad_norm": 0.388671875, - "learning_rate": 2.8521245830695864e-05, - "loss": 0.9484, - "step": 3420 - }, - { - "epoch": 11.689419795221843, - "grad_norm": 0.3828125, - "learning_rate": 2.8244054737960935e-05, - "loss": 0.9431, - "step": 3425 - }, - { - "epoch": 11.706484641638225, - "grad_norm": 0.365234375, - "learning_rate": 2.7967995534898596e-05, - "loss": 0.9554, - "step": 3430 - }, - { - "epoch": 11.723549488054607, - "grad_norm": 0.390625, - "learning_rate": 2.7693072576110514e-05, - "loss": 0.9519, - "step": 3435 - }, - { - "epoch": 11.74061433447099, - "grad_norm": 0.365234375, - "learning_rate": 2.7419290198275095e-05, - "loss": 0.9509, - "step": 3440 - }, - { - "epoch": 11.757679180887372, - "grad_norm": 0.40234375, - "learning_rate": 2.7146652720079003e-05, - "loss": 0.9578, - "step": 3445 - }, - { - "epoch": 11.774744027303754, - "grad_norm": 0.376953125, - "learning_rate": 2.6875164442149147e-05, - "loss": 0.9449, - "step": 3450 - }, - { - "epoch": 11.791808873720136, - "grad_norm": 0.40625, - "learning_rate": 2.6604829646984686e-05, - "loss": 0.9505, - "step": 3455 - }, - { - "epoch": 11.80887372013652, - "grad_norm": 0.3984375, - "learning_rate": 2.6335652598889683e-05, - "loss": 0.9433, - "step": 3460 - }, - { - "epoch": 11.825938566552901, - "grad_norm": 0.380859375, - "learning_rate": 2.60676375439055e-05, - "loss": 0.9464, - "step": 3465 - }, - { - "epoch": 11.843003412969283, - "grad_norm": 0.384765625, - "learning_rate": 2.5800788709744227e-05, - "loss": 0.955, - "step": 3470 - }, - { - "epoch": 11.860068259385665, - "grad_norm": 0.380859375, - "learning_rate": 2.5535110305721776e-05, - "loss": 0.9458, - "step": 3475 - }, - { - "epoch": 11.877133105802049, - "grad_norm": 0.3828125, - "learning_rate": 2.5270606522691443e-05, - "loss": 0.9544, - "step": 3480 - }, - { - "epoch": 11.89419795221843, - "grad_norm": 0.408203125, - "learning_rate": 2.500728153297788e-05, - "loss": 0.9534, - "step": 3485 - }, - { - "epoch": 11.911262798634812, - "grad_norm": 0.373046875, - "learning_rate": 2.4745139490311254e-05, - "loss": 0.9521, - "step": 3490 - }, - { - "epoch": 11.928327645051194, - "grad_norm": 0.392578125, - "learning_rate": 2.4484184529761834e-05, - "loss": 0.948, - "step": 3495 - }, - { - "epoch": 11.945392491467576, - "grad_norm": 0.39453125, - "learning_rate": 2.4224420767674562e-05, - "loss": 0.9543, - "step": 3500 - }, - { - "epoch": 11.96245733788396, - "grad_norm": 0.375, - "learning_rate": 2.3965852301604254e-05, - "loss": 0.959, - "step": 3505 - }, - { - "epoch": 11.979522184300341, - "grad_norm": 0.375, - "learning_rate": 2.370848321025093e-05, - "loss": 0.9599, - "step": 3510 - }, - { - "epoch": 11.996587030716723, - "grad_norm": 0.37109375, - "learning_rate": 2.345231755339554e-05, - "loss": 0.9505, - "step": 3515 - }, - { - "epoch": 12.0, - "eval_loss": 2.520477771759033, - "eval_runtime": 0.5502, - "eval_samples_per_second": 18.175, - "eval_steps_per_second": 1.818, - "step": 3516 - }, - { - "epoch": 12.013651877133105, - "grad_norm": 0.43359375, - "learning_rate": 2.3197359371835802e-05, - "loss": 0.9615, - "step": 3520 - }, - { - "epoch": 12.030716723549489, - "grad_norm": 0.376953125, - "learning_rate": 2.2943612687322525e-05, - "loss": 0.9485, - "step": 3525 - }, - { - "epoch": 12.04778156996587, - "grad_norm": 0.384765625, - "learning_rate": 2.2691081502496246e-05, - "loss": 0.9475, - "step": 3530 - }, - { - "epoch": 12.064846416382252, - "grad_norm": 0.388671875, - "learning_rate": 2.243976980082394e-05, - "loss": 0.9393, - "step": 3535 - }, - { - "epoch": 12.081911262798634, - "grad_norm": 0.39453125, - "learning_rate": 2.218968154653629e-05, - "loss": 0.9466, - "step": 3540 - }, - { - "epoch": 12.098976109215018, - "grad_norm": 0.376953125, - "learning_rate": 2.194082068456509e-05, - "loss": 0.9537, - "step": 3545 - }, - { - "epoch": 12.1160409556314, - "grad_norm": 0.36328125, - "learning_rate": 2.169319114048114e-05, - "loss": 0.961, - "step": 3550 - }, - { - "epoch": 12.133105802047782, - "grad_norm": 0.38671875, - "learning_rate": 2.1446796820432167e-05, - "loss": 0.9493, - "step": 3555 - }, - { - "epoch": 12.150170648464163, - "grad_norm": 0.384765625, - "learning_rate": 2.1201641611081246e-05, - "loss": 0.948, - "step": 3560 - }, - { - "epoch": 12.167235494880545, - "grad_norm": 0.373046875, - "learning_rate": 2.0957729379545655e-05, - "loss": 0.9584, - "step": 3565 - }, - { - "epoch": 12.184300341296929, - "grad_norm": 0.380859375, - "learning_rate": 2.0715063973335568e-05, - "loss": 0.9503, - "step": 3570 - }, - { - "epoch": 12.20136518771331, - "grad_norm": 0.388671875, - "learning_rate": 2.04736492202937e-05, - "loss": 0.9498, - "step": 3575 - }, - { - "epoch": 12.218430034129693, - "grad_norm": 0.392578125, - "learning_rate": 2.0233488928534673e-05, - "loss": 0.9553, - "step": 3580 - }, - { - "epoch": 12.235494880546074, - "grad_norm": 0.396484375, - "learning_rate": 1.9994586886385046e-05, - "loss": 0.9438, - "step": 3585 - }, - { - "epoch": 12.252559726962458, - "grad_norm": 0.369140625, - "learning_rate": 1.9756946862323535e-05, - "loss": 0.9489, - "step": 3590 - }, - { - "epoch": 12.26962457337884, - "grad_norm": 0.369140625, - "learning_rate": 1.9520572604921672e-05, - "loss": 0.9477, - "step": 3595 - }, - { - "epoch": 12.286689419795222, - "grad_norm": 0.375, - "learning_rate": 1.9285467842784467e-05, - "loss": 0.9457, - "step": 3600 - }, - { - "epoch": 12.303754266211604, - "grad_norm": 0.380859375, - "learning_rate": 1.9051636284491757e-05, - "loss": 0.9541, - "step": 3605 - }, - { - "epoch": 12.320819112627987, - "grad_norm": 0.365234375, - "learning_rate": 1.8819081618539723e-05, - "loss": 0.9393, - "step": 3610 - }, - { - "epoch": 12.337883959044369, - "grad_norm": 0.375, - "learning_rate": 1.858780751328255e-05, - "loss": 0.949, - "step": 3615 - }, - { - "epoch": 12.35494880546075, - "grad_norm": 0.384765625, - "learning_rate": 1.8357817616874694e-05, - "loss": 0.9537, - "step": 3620 - }, - { - "epoch": 12.372013651877133, - "grad_norm": 0.3671875, - "learning_rate": 1.8129115557213262e-05, - "loss": 0.9505, - "step": 3625 - }, - { - "epoch": 12.389078498293514, - "grad_norm": 0.3671875, - "learning_rate": 1.7901704941880914e-05, - "loss": 0.9447, - "step": 3630 - }, - { - "epoch": 12.406143344709898, - "grad_norm": 0.3671875, - "learning_rate": 1.7675589358088763e-05, - "loss": 0.9526, - "step": 3635 - }, - { - "epoch": 12.42320819112628, - "grad_norm": 0.376953125, - "learning_rate": 1.745077237261994e-05, - "loss": 0.9592, - "step": 3640 - }, - { - "epoch": 12.440273037542662, - "grad_norm": 0.40234375, - "learning_rate": 1.7227257531773223e-05, - "loss": 0.9515, - "step": 3645 - }, - { - "epoch": 12.457337883959044, - "grad_norm": 0.408203125, - "learning_rate": 1.7005048361307262e-05, - "loss": 0.9504, - "step": 3650 - }, - { - "epoch": 12.474402730375427, - "grad_norm": 0.388671875, - "learning_rate": 1.6784148366384754e-05, - "loss": 0.9462, - "step": 3655 - }, - { - "epoch": 12.491467576791809, - "grad_norm": 0.384765625, - "learning_rate": 1.656456103151728e-05, - "loss": 0.9456, - "step": 3660 - }, - { - "epoch": 12.508532423208191, - "grad_norm": 0.375, - "learning_rate": 1.6346289820510363e-05, - "loss": 0.9475, - "step": 3665 - }, - { - "epoch": 12.525597269624573, - "grad_norm": 0.384765625, - "learning_rate": 1.612933817640868e-05, - "loss": 0.9478, - "step": 3670 - }, - { - "epoch": 12.542662116040956, - "grad_norm": 0.3671875, - "learning_rate": 1.5913709521441988e-05, - "loss": 0.9415, - "step": 3675 - }, - { - "epoch": 12.559726962457338, - "grad_norm": 0.375, - "learning_rate": 1.5699407256970833e-05, - "loss": 0.9452, - "step": 3680 - }, - { - "epoch": 12.57679180887372, - "grad_norm": 0.375, - "learning_rate": 1.5486434763433222e-05, - "loss": 0.9479, - "step": 3685 - }, - { - "epoch": 12.593856655290102, - "grad_norm": 0.38671875, - "learning_rate": 1.527479540029104e-05, - "loss": 0.9495, - "step": 3690 - }, - { - "epoch": 12.610921501706486, - "grad_norm": 0.3828125, - "learning_rate": 1.5064492505977234e-05, - "loss": 0.936, - "step": 3695 - }, - { - "epoch": 12.627986348122867, - "grad_norm": 0.392578125, - "learning_rate": 1.4855529397843038e-05, - "loss": 0.9476, - "step": 3700 - }, - { - "epoch": 12.64505119453925, - "grad_norm": 0.380859375, - "learning_rate": 1.4647909372105672e-05, - "loss": 0.9525, - "step": 3705 - }, - { - "epoch": 12.662116040955631, - "grad_norm": 0.41796875, - "learning_rate": 1.4441635703796408e-05, - "loss": 0.9477, - "step": 3710 - }, - { - "epoch": 12.679180887372013, - "grad_norm": 0.3984375, - "learning_rate": 1.4236711646708844e-05, - "loss": 0.9505, - "step": 3715 - }, - { - "epoch": 12.696245733788396, - "grad_norm": 0.384765625, - "learning_rate": 1.4033140433347569e-05, - "loss": 0.9464, - "step": 3720 - }, - { - "epoch": 12.713310580204778, - "grad_norm": 0.384765625, - "learning_rate": 1.3830925274877216e-05, - "loss": 0.9392, - "step": 3725 - }, - { - "epoch": 12.73037542662116, - "grad_norm": 0.37890625, - "learning_rate": 1.363006936107183e-05, - "loss": 0.9495, - "step": 3730 - }, - { - "epoch": 12.747440273037542, - "grad_norm": 0.3828125, - "learning_rate": 1.343057586026446e-05, - "loss": 0.9423, - "step": 3735 - }, - { - "epoch": 12.764505119453926, - "grad_norm": 0.416015625, - "learning_rate": 1.3232447919297274e-05, - "loss": 0.9448, - "step": 3740 - }, - { - "epoch": 12.781569965870307, - "grad_norm": 0.404296875, - "learning_rate": 1.3035688663471834e-05, - "loss": 0.9544, - "step": 3745 - }, - { - "epoch": 12.79863481228669, - "grad_norm": 0.37109375, - "learning_rate": 1.2840301196499893e-05, - "loss": 0.9548, - "step": 3750 - }, - { - "epoch": 12.815699658703071, - "grad_norm": 0.376953125, - "learning_rate": 1.2646288600454448e-05, - "loss": 0.9492, - "step": 3755 - }, - { - "epoch": 12.832764505119453, - "grad_norm": 0.373046875, - "learning_rate": 1.2453653935720867e-05, - "loss": 0.9506, - "step": 3760 - }, - { - "epoch": 12.849829351535837, - "grad_norm": 0.388671875, - "learning_rate": 1.2262400240949023e-05, - "loss": 0.9543, - "step": 3765 - }, - { - "epoch": 12.866894197952218, - "grad_norm": 0.369140625, - "learning_rate": 1.2072530533005012e-05, - "loss": 0.9418, - "step": 3770 - }, - { - "epoch": 12.8839590443686, - "grad_norm": 0.369140625, - "learning_rate": 1.1884047806923815e-05, - "loss": 0.9475, - "step": 3775 - }, - { - "epoch": 12.901023890784982, - "grad_norm": 0.39453125, - "learning_rate": 1.169695503586179e-05, - "loss": 0.9428, - "step": 3780 - }, - { - "epoch": 12.918088737201366, - "grad_norm": 0.38671875, - "learning_rate": 1.1511255171050084e-05, - "loss": 0.9529, - "step": 3785 - }, - { - "epoch": 12.935153583617748, - "grad_norm": 0.376953125, - "learning_rate": 1.1326951141747788e-05, - "loss": 0.9455, - "step": 3790 - }, - { - "epoch": 12.95221843003413, - "grad_norm": 0.376953125, - "learning_rate": 1.1144045855195973e-05, - "loss": 0.9537, - "step": 3795 - }, - { - "epoch": 12.969283276450511, - "grad_norm": 0.396484375, - "learning_rate": 1.0962542196571634e-05, - "loss": 0.9426, - "step": 3800 - }, - { - "epoch": 12.986348122866895, - "grad_norm": 0.373046875, - "learning_rate": 1.078244302894229e-05, - "loss": 0.9442, - "step": 3805 - }, - { - "epoch": 13.0, - "eval_loss": 2.522336959838867, - "eval_runtime": 0.5484, - "eval_samples_per_second": 18.236, - "eval_steps_per_second": 1.824, - "step": 3809 - }, - { - "epoch": 13.003412969283277, - "grad_norm": 0.376953125, - "learning_rate": 1.0603751193220846e-05, - "loss": 0.956, - "step": 3810 - }, - { - "epoch": 13.020477815699659, - "grad_norm": 0.392578125, - "learning_rate": 1.0426469508120662e-05, - "loss": 0.9449, - "step": 3815 - }, - { - "epoch": 13.03754266211604, - "grad_norm": 0.390625, - "learning_rate": 1.0250600770111185e-05, - "loss": 0.9479, - "step": 3820 - }, - { - "epoch": 13.054607508532424, - "grad_norm": 0.392578125, - "learning_rate": 1.0076147753373789e-05, - "loss": 0.953, - "step": 3825 - }, - { - "epoch": 13.071672354948806, - "grad_norm": 0.388671875, - "learning_rate": 9.903113209758096e-06, - "loss": 0.9436, - "step": 3830 - }, - { - "epoch": 13.088737201365188, - "grad_norm": 0.380859375, - "learning_rate": 9.731499868738447e-06, - "loss": 0.9454, - "step": 3835 - }, - { - "epoch": 13.10580204778157, - "grad_norm": 0.3828125, - "learning_rate": 9.561310437370907e-06, - "loss": 0.9556, - "step": 3840 - }, - { - "epoch": 13.122866894197951, - "grad_norm": 0.373046875, - "learning_rate": 9.392547600250634e-06, - "loss": 0.949, - "step": 3845 - }, - { - "epoch": 13.139931740614335, - "grad_norm": 0.380859375, - "learning_rate": 9.225214019469385e-06, - "loss": 0.9382, - "step": 3850 - }, - { - "epoch": 13.156996587030717, - "grad_norm": 0.40234375, - "learning_rate": 9.059312334573633e-06, - "loss": 0.943, - "step": 3855 - }, - { - "epoch": 13.174061433447099, - "grad_norm": 0.3828125, - "learning_rate": 8.89484516252287e-06, - "loss": 0.9534, - "step": 3860 - }, - { - "epoch": 13.19112627986348, - "grad_norm": 0.369140625, - "learning_rate": 8.731815097648433e-06, - "loss": 0.9526, - "step": 3865 - }, - { - "epoch": 13.208191126279864, - "grad_norm": 0.392578125, - "learning_rate": 8.570224711612385e-06, - "loss": 0.9419, - "step": 3870 - }, - { - "epoch": 13.225255972696246, - "grad_norm": 0.373046875, - "learning_rate": 8.410076553367208e-06, - "loss": 0.9511, - "step": 3875 - }, - { - "epoch": 13.242320819112628, - "grad_norm": 0.380859375, - "learning_rate": 8.251373149115293e-06, - "loss": 0.9489, - "step": 3880 - }, - { - "epoch": 13.25938566552901, - "grad_norm": 0.36328125, - "learning_rate": 8.094117002269363e-06, - "loss": 0.9428, - "step": 3885 - }, - { - "epoch": 13.276450511945393, - "grad_norm": 0.443359375, - "learning_rate": 7.938310593412879e-06, - "loss": 0.9485, - "step": 3890 - }, - { - "epoch": 13.293515358361775, - "grad_norm": 0.3671875, - "learning_rate": 7.783956380260837e-06, - "loss": 0.955, - "step": 3895 - }, - { - "epoch": 13.310580204778157, - "grad_norm": 0.384765625, - "learning_rate": 7.631056797621106e-06, - "loss": 0.9566, - "step": 3900 - }, - { - "epoch": 13.327645051194539, - "grad_norm": 0.369140625, - "learning_rate": 7.479614257355971e-06, - "loss": 0.9495, - "step": 3905 - }, - { - "epoch": 13.344709897610922, - "grad_norm": 0.376953125, - "learning_rate": 7.329631148344118e-06, - "loss": 0.9535, - "step": 3910 - }, - { - "epoch": 13.361774744027304, - "grad_norm": 0.375, - "learning_rate": 7.181109836442912e-06, - "loss": 0.9473, - "step": 3915 - }, - { - "epoch": 13.378839590443686, - "grad_norm": 0.37890625, - "learning_rate": 7.034052664451118e-06, - "loss": 0.946, - "step": 3920 - }, - { - "epoch": 13.395904436860068, - "grad_norm": 0.380859375, - "learning_rate": 6.88846195207189e-06, - "loss": 0.9526, - "step": 3925 - }, - { - "epoch": 13.41296928327645, - "grad_norm": 0.365234375, - "learning_rate": 6.7443399958762584e-06, - "loss": 0.9416, - "step": 3930 - }, - { - "epoch": 13.430034129692833, - "grad_norm": 0.365234375, - "learning_rate": 6.6016890692668364e-06, - "loss": 0.9529, - "step": 3935 - }, - { - "epoch": 13.447098976109215, - "grad_norm": 0.376953125, - "learning_rate": 6.460511422441984e-06, - "loss": 0.9427, - "step": 3940 - }, - { - "epoch": 13.464163822525597, - "grad_norm": 0.37890625, - "learning_rate": 6.320809282360319e-06, - "loss": 0.9516, - "step": 3945 - }, - { - "epoch": 13.481228668941979, - "grad_norm": 0.380859375, - "learning_rate": 6.1825848527055865e-06, - "loss": 0.9448, - "step": 3950 - }, - { - "epoch": 13.498293515358363, - "grad_norm": 0.384765625, - "learning_rate": 6.04584031385188e-06, - "loss": 0.9542, - "step": 3955 - }, - { - "epoch": 13.515358361774744, - "grad_norm": 0.376953125, - "learning_rate": 5.910577822829233e-06, - "loss": 0.9525, - "step": 3960 - }, - { - "epoch": 13.532423208191126, - "grad_norm": 0.3671875, - "learning_rate": 5.77679951328971e-06, - "loss": 0.9502, - "step": 3965 - }, - { - "epoch": 13.549488054607508, - "grad_norm": 0.373046875, - "learning_rate": 5.644507495473572e-06, - "loss": 0.9464, - "step": 3970 - }, - { - "epoch": 13.56655290102389, - "grad_norm": 0.37890625, - "learning_rate": 5.5137038561761115e-06, - "loss": 0.9531, - "step": 3975 - }, - { - "epoch": 13.583617747440274, - "grad_norm": 0.375, - "learning_rate": 5.3843906587146886e-06, - "loss": 0.9498, - "step": 3980 - }, - { - "epoch": 13.600682593856655, - "grad_norm": 0.37890625, - "learning_rate": 5.256569942896217e-06, - "loss": 0.945, - "step": 3985 - }, - { - "epoch": 13.617747440273037, - "grad_norm": 0.365234375, - "learning_rate": 5.130243724984995e-06, - "loss": 0.9468, - "step": 3990 - }, - { - "epoch": 13.634812286689419, - "grad_norm": 0.369140625, - "learning_rate": 5.005413997670816e-06, - "loss": 0.9517, - "step": 3995 - }, - { - "epoch": 13.651877133105803, - "grad_norm": 0.365234375, - "learning_rate": 4.8820827300376075e-06, - "loss": 0.9502, - "step": 4000 - }, - { - "epoch": 13.668941979522184, - "grad_norm": 0.369140625, - "learning_rate": 4.760251867532362e-06, - "loss": 0.9462, - "step": 4005 - }, - { - "epoch": 13.686006825938566, - "grad_norm": 0.384765625, - "learning_rate": 4.639923331934471e-06, - "loss": 0.9476, - "step": 4010 - }, - { - "epoch": 13.703071672354948, - "grad_norm": 0.369140625, - "learning_rate": 4.521099021325336e-06, - "loss": 0.9556, - "step": 4015 - }, - { - "epoch": 13.720136518771332, - "grad_norm": 0.390625, - "learning_rate": 4.403780810058511e-06, - "loss": 0.9438, - "step": 4020 - }, - { - "epoch": 13.737201365187714, - "grad_norm": 0.470703125, - "learning_rate": 4.287970548730069e-06, - "loss": 0.9495, - "step": 4025 - }, - { - "epoch": 13.754266211604095, - "grad_norm": 0.36328125, - "learning_rate": 4.173670064149482e-06, - "loss": 0.934, - "step": 4030 - }, - { - "epoch": 13.771331058020477, - "grad_norm": 0.384765625, - "learning_rate": 4.060881159310725e-06, - "loss": 0.9502, - "step": 4035 - }, - { - "epoch": 13.788395904436861, - "grad_norm": 0.388671875, - "learning_rate": 3.949605613363882e-06, - "loss": 0.939, - "step": 4040 - }, - { - "epoch": 13.805460750853243, - "grad_norm": 0.37890625, - "learning_rate": 3.839845181587098e-06, - "loss": 0.9559, - "step": 4045 - }, - { - "epoch": 13.822525597269625, - "grad_norm": 0.376953125, - "learning_rate": 3.7316015953588467e-06, - "loss": 0.9547, - "step": 4050 - }, - { - "epoch": 13.839590443686006, - "grad_norm": 0.384765625, - "learning_rate": 3.6248765621306414e-06, - "loss": 0.9463, - "step": 4055 - }, - { - "epoch": 13.856655290102388, - "grad_norm": 0.376953125, - "learning_rate": 3.519671765400079e-06, - "loss": 0.9454, - "step": 4060 - }, - { - "epoch": 13.873720136518772, - "grad_norm": 0.373046875, - "learning_rate": 3.4159888646843495e-06, - "loss": 0.9485, - "step": 4065 - }, - { - "epoch": 13.890784982935154, - "grad_norm": 0.375, - "learning_rate": 3.313829495493992e-06, - "loss": 0.9455, - "step": 4070 - }, - { - "epoch": 13.907849829351536, - "grad_norm": 0.37890625, - "learning_rate": 3.2131952693070898e-06, - "loss": 0.9409, - "step": 4075 - }, - { - "epoch": 13.924914675767917, - "grad_norm": 0.396484375, - "learning_rate": 3.1140877735439387e-06, - "loss": 0.9468, - "step": 4080 - }, - { - "epoch": 13.941979522184301, - "grad_norm": 0.375, - "learning_rate": 3.0165085715418763e-06, - "loss": 0.9434, - "step": 4085 - }, - { - "epoch": 13.959044368600683, - "grad_norm": 0.3671875, - "learning_rate": 2.9204592025307566e-06, - "loss": 0.9455, - "step": 4090 - }, - { - "epoch": 13.976109215017065, - "grad_norm": 0.369140625, - "learning_rate": 2.8259411816085492e-06, - "loss": 0.9437, - "step": 4095 - }, - { - "epoch": 13.993174061433447, - "grad_norm": 0.478515625, - "learning_rate": 2.732955999717546e-06, - "loss": 0.9469, - "step": 4100 - }, - { - "epoch": 14.0, - "eval_loss": 2.5227127075195312, - "eval_runtime": 0.542, - "eval_samples_per_second": 18.45, - "eval_steps_per_second": 1.845, - "step": 4102 - }, - { - "epoch": 14.01023890784983, - "grad_norm": 0.376953125, - "learning_rate": 2.6415051236207355e-06, - "loss": 0.9508, - "step": 4105 - }, - { - "epoch": 14.027303754266212, - "grad_norm": 0.375, - "learning_rate": 2.551589995878789e-06, - "loss": 0.9459, - "step": 4110 - }, - { - "epoch": 14.044368600682594, - "grad_norm": 0.380859375, - "learning_rate": 2.4632120348272003e-06, - "loss": 0.9465, - "step": 4115 - }, - { - "epoch": 14.061433447098976, - "grad_norm": 0.37890625, - "learning_rate": 2.376372634553936e-06, - "loss": 0.9475, - "step": 4120 - }, - { - "epoch": 14.078498293515358, - "grad_norm": 0.376953125, - "learning_rate": 2.291073164877511e-06, - "loss": 0.9435, - "step": 4125 - }, - { - "epoch": 14.095563139931741, - "grad_norm": 0.37890625, - "learning_rate": 2.207314971325292e-06, - "loss": 0.9546, - "step": 4130 - }, - { - "epoch": 14.112627986348123, - "grad_norm": 0.400390625, - "learning_rate": 2.125099375112316e-06, - "loss": 0.9496, - "step": 4135 - }, - { - "epoch": 14.129692832764505, - "grad_norm": 0.3671875, - "learning_rate": 2.0444276731204415e-06, - "loss": 0.9592, - "step": 4140 - }, - { - "epoch": 14.146757679180887, - "grad_norm": 0.37890625, - "learning_rate": 1.9653011378779283e-06, - "loss": 0.9446, - "step": 4145 - }, - { - "epoch": 14.16382252559727, - "grad_norm": 0.5625, - "learning_rate": 1.88772101753929e-06, - "loss": 0.9374, - "step": 4150 - }, - { - "epoch": 14.180887372013652, - "grad_norm": 0.37890625, - "learning_rate": 1.8116885358656744e-06, - "loss": 0.9543, - "step": 4155 - }, - { - "epoch": 14.197952218430034, - "grad_norm": 0.37109375, - "learning_rate": 1.7372048922054906e-06, - "loss": 0.9488, - "step": 4160 - }, - { - "epoch": 14.215017064846416, - "grad_norm": 0.373046875, - "learning_rate": 1.6642712614755695e-06, - "loss": 0.9466, - "step": 4165 - }, - { - "epoch": 14.2320819112628, - "grad_norm": 0.396484375, - "learning_rate": 1.5928887941426107e-06, - "loss": 0.9482, - "step": 4170 - }, - { - "epoch": 14.249146757679181, - "grad_norm": 0.373046875, - "learning_rate": 1.523058616204942e-06, - "loss": 0.9449, - "step": 4175 - }, - { - "epoch": 14.266211604095563, - "grad_norm": 0.3984375, - "learning_rate": 1.4547818291749115e-06, - "loss": 0.9562, - "step": 4180 - }, - { - "epoch": 14.283276450511945, - "grad_norm": 0.388671875, - "learning_rate": 1.3880595100613792e-06, - "loss": 0.9445, - "step": 4185 - }, - { - "epoch": 14.300341296928327, - "grad_norm": 0.376953125, - "learning_rate": 1.3228927113528189e-06, - "loss": 0.9457, - "step": 4190 - }, - { - "epoch": 14.31740614334471, - "grad_norm": 0.388671875, - "learning_rate": 1.2592824610006215e-06, - "loss": 0.9488, - "step": 4195 - }, - { - "epoch": 14.334470989761092, - "grad_norm": 0.38671875, - "learning_rate": 1.1972297624030072e-06, - "loss": 0.9437, - "step": 4200 - }, - { - "epoch": 14.351535836177474, - "grad_norm": 0.3671875, - "learning_rate": 1.1367355943890823e-06, - "loss": 0.9459, - "step": 4205 - }, - { - "epoch": 14.368600682593856, - "grad_norm": 0.396484375, - "learning_rate": 1.0778009112034748e-06, - "loss": 0.9477, - "step": 4210 - }, - { - "epoch": 14.38566552901024, - "grad_norm": 0.375, - "learning_rate": 1.0204266424912123e-06, - "loss": 0.95, - "step": 4215 - }, - { - "epoch": 14.402730375426621, - "grad_norm": 0.40625, - "learning_rate": 9.64613693283123e-07, - "loss": 0.9477, - "step": 4220 - }, - { - "epoch": 14.419795221843003, - "grad_norm": 0.375, - "learning_rate": 9.103629439815354e-07, - "loss": 0.9461, - "step": 4225 - }, - { - "epoch": 14.436860068259385, - "grad_norm": 0.3828125, - "learning_rate": 8.57675250346368e-07, - "loss": 0.9585, - "step": 4230 - }, - { - "epoch": 14.453924914675769, - "grad_norm": 0.443359375, - "learning_rate": 8.065514434816845e-07, - "loss": 0.9434, - "step": 4235 - }, - { - "epoch": 14.47098976109215, - "grad_norm": 0.396484375, - "learning_rate": 7.569923298225146e-07, - "loss": 0.941, - "step": 4240 - }, - { - "epoch": 14.488054607508532, - "grad_norm": 0.375, - "learning_rate": 7.08998691122198e-07, - "loss": 0.9527, - "step": 4245 - }, - { - "epoch": 14.505119453924914, - "grad_norm": 0.380859375, - "learning_rate": 6.625712844400056e-07, - "loss": 0.9484, - "step": 4250 - }, - { - "epoch": 14.522184300341298, - "grad_norm": 0.390625, - "learning_rate": 6.177108421292266e-07, - "loss": 0.9453, - "step": 4255 - }, - { - "epoch": 14.53924914675768, - "grad_norm": 0.404296875, - "learning_rate": 5.744180718255776e-07, - "loss": 0.9464, - "step": 4260 - }, - { - "epoch": 14.556313993174061, - "grad_norm": 0.375, - "learning_rate": 5.326936564361118e-07, - "loss": 0.943, - "step": 4265 - }, - { - "epoch": 14.573378839590443, - "grad_norm": 0.369140625, - "learning_rate": 4.92538254128383e-07, - "loss": 0.9422, - "step": 4270 - }, - { - "epoch": 14.590443686006825, - "grad_norm": 0.390625, - "learning_rate": 4.5395249832007604e-07, - "loss": 0.9591, - "step": 4275 - }, - { - "epoch": 14.607508532423209, - "grad_norm": 0.396484375, - "learning_rate": 4.1693699766902626e-07, - "loss": 0.9475, - "step": 4280 - }, - { - "epoch": 14.62457337883959, - "grad_norm": 0.369140625, - "learning_rate": 3.814923360636158e-07, - "loss": 0.9391, - "step": 4285 - }, - { - "epoch": 14.641638225255972, - "grad_norm": 0.36328125, - "learning_rate": 3.4761907261356976e-07, - "loss": 0.9574, - "step": 4290 - }, - { - "epoch": 14.658703071672354, - "grad_norm": 0.388671875, - "learning_rate": 3.1531774164111903e-07, - "loss": 0.9495, - "step": 4295 - }, - { - "epoch": 14.675767918088738, - "grad_norm": 0.373046875, - "learning_rate": 2.8458885267260705e-07, - "loss": 0.9537, - "step": 4300 - }, - { - "epoch": 14.69283276450512, - "grad_norm": 0.38671875, - "learning_rate": 2.554328904303738e-07, - "loss": 0.9435, - "step": 4305 - }, - { - "epoch": 14.709897610921502, - "grad_norm": 0.39453125, - "learning_rate": 2.2785031482521758e-07, - "loss": 0.9474, - "step": 4310 - }, - { - "epoch": 14.726962457337883, - "grad_norm": 0.376953125, - "learning_rate": 2.0184156094905648e-07, - "loss": 0.947, - "step": 4315 - }, - { - "epoch": 14.744027303754265, - "grad_norm": 0.3671875, - "learning_rate": 1.7740703906810042e-07, - "loss": 0.9431, - "step": 4320 - }, - { - "epoch": 14.761092150170649, - "grad_norm": 0.384765625, - "learning_rate": 1.545471346164007e-07, - "loss": 0.9431, - "step": 4325 - }, - { - "epoch": 14.77815699658703, - "grad_norm": 0.37890625, - "learning_rate": 1.3326220818968838e-07, - "loss": 0.9455, - "step": 4330 - }, - { - "epoch": 14.795221843003413, - "grad_norm": 0.375, - "learning_rate": 1.1355259553978981e-07, - "loss": 0.9512, - "step": 4335 - }, - { - "epoch": 14.812286689419794, - "grad_norm": 0.390625, - "learning_rate": 9.541860756925314e-08, - "loss": 0.9439, - "step": 4340 - }, - { - "epoch": 14.829351535836178, - "grad_norm": 0.37109375, - "learning_rate": 7.886053032649665e-08, - "loss": 0.9548, - "step": 4345 - }, - { - "epoch": 14.84641638225256, - "grad_norm": 0.4921875, - "learning_rate": 6.387862500125685e-08, - "loss": 0.9437, - "step": 4350 - }, - { - "epoch": 14.863481228668942, - "grad_norm": 0.380859375, - "learning_rate": 5.047312792046954e-08, - "loss": 0.9512, - "step": 4355 - }, - { - "epoch": 14.880546075085324, - "grad_norm": 0.39453125, - "learning_rate": 3.8644250544594975e-08, - "loss": 0.9478, - "step": 4360 - }, - { - "epoch": 14.897610921501707, - "grad_norm": 0.380859375, - "learning_rate": 2.839217946422057e-08, - "loss": 0.9362, - "step": 4365 - }, - { - "epoch": 14.914675767918089, - "grad_norm": 0.380859375, - "learning_rate": 1.971707639712994e-08, - "loss": 0.9507, - "step": 4370 - }, - { - "epoch": 14.93174061433447, - "grad_norm": 0.37109375, - "learning_rate": 1.2619078185793776e-08, - "loss": 0.948, - "step": 4375 - }, - { - "epoch": 14.948805460750853, - "grad_norm": 0.400390625, - "learning_rate": 7.098296795138293e-09, - "loss": 0.9524, - "step": 4380 - }, - { - "epoch": 14.965870307167236, - "grad_norm": 0.36328125, - "learning_rate": 3.154819310868806e-09, - "loss": 0.9497, - "step": 4385 - }, - { - "epoch": 14.982935153583618, - "grad_norm": 0.400390625, - "learning_rate": 7.887079380153317e-10, - "loss": 0.9536, - "step": 4390 - }, - { - "epoch": 15.0, - "grad_norm": 0.373046875, - "learning_rate": 0.0, - "loss": 0.9444, - "step": 4395 - }, - { - "epoch": 15.0, - "eval_loss": 2.523277521133423, - "eval_runtime": 0.5592, - "eval_samples_per_second": 17.883, - "eval_steps_per_second": 1.788, - "step": 4395 - }, - { - "epoch": 15.0, - "step": 4395, - "total_flos": 2.581505823377195e+18, - "train_loss": 1.0488379673203783, - "train_runtime": 23446.7186, - "train_samples_per_second": 8.983, - "train_steps_per_second": 0.187 + "epoch": 10.0, + "step": 2930, + "total_flos": 1.7464232891960525e+18, + "train_loss": 0.9647074054125633, + "train_runtime": 17674.2713, + "train_samples_per_second": 7.945, + "train_steps_per_second": 0.166 } ], "logging_steps": 5, - "max_steps": 4395, + "max_steps": 2930, "num_input_tokens_seen": 0, - "num_train_epochs": 15, + "num_train_epochs": 10, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { @@ -6315,7 +4224,7 @@ "attributes": {} } }, - "total_flos": 2.581505823377195e+18, + "total_flos": 1.7464232891960525e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null