{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9986146386515817, "eval_steps": 500, "global_step": 1082, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0073885938582313555, "grad_norm": 30.625, "learning_rate": 8.000000000000001e-07, "loss": 3.171, "step": 4 }, { "epoch": 0.014777187716462711, "grad_norm": 30.625, "learning_rate": 1.6000000000000001e-06, "loss": 3.1417, "step": 8 }, { "epoch": 0.022165781574694066, "grad_norm": 26.375, "learning_rate": 2.4000000000000003e-06, "loss": 2.9572, "step": 12 }, { "epoch": 0.029554375432925422, "grad_norm": 19.75, "learning_rate": 3.2000000000000003e-06, "loss": 2.4271, "step": 16 }, { "epoch": 0.03694296929115678, "grad_norm": 16.25, "learning_rate": 4.000000000000001e-06, "loss": 1.9425, "step": 20 }, { "epoch": 0.04433156314938813, "grad_norm": 10.8125, "learning_rate": 4.800000000000001e-06, "loss": 1.2272, "step": 24 }, { "epoch": 0.051720157007619484, "grad_norm": 6.21875, "learning_rate": 5.600000000000001e-06, "loss": 0.7596, "step": 28 }, { "epoch": 0.059108750865850844, "grad_norm": 4.6875, "learning_rate": 6.4000000000000006e-06, "loss": 0.5928, "step": 32 }, { "epoch": 0.0664973447240822, "grad_norm": 3.234375, "learning_rate": 7.2000000000000005e-06, "loss": 0.5097, "step": 36 }, { "epoch": 0.07388593858231356, "grad_norm": 2.765625, "learning_rate": 8.000000000000001e-06, "loss": 0.4897, "step": 40 }, { "epoch": 0.0812745324405449, "grad_norm": 2.609375, "learning_rate": 8.8e-06, "loss": 0.4433, "step": 44 }, { "epoch": 0.08866312629877626, "grad_norm": 2.515625, "learning_rate": 9.600000000000001e-06, "loss": 0.4597, "step": 48 }, { "epoch": 0.09605172015700762, "grad_norm": 2.109375, "learning_rate": 1.04e-05, "loss": 0.4418, "step": 52 }, { "epoch": 0.10344031401523897, "grad_norm": 2.109375, "learning_rate": 1.1200000000000001e-05, "loss": 0.416, "step": 56 }, { "epoch": 0.11082890787347033, "grad_norm": 2.921875, "learning_rate": 1.2e-05, "loss": 0.4305, "step": 60 }, { "epoch": 0.11821750173170169, "grad_norm": 2.21875, "learning_rate": 1.2800000000000001e-05, "loss": 0.3997, "step": 64 }, { "epoch": 0.12560609558993305, "grad_norm": 2.15625, "learning_rate": 1.3600000000000002e-05, "loss": 0.3958, "step": 68 }, { "epoch": 0.1329946894481644, "grad_norm": 2.484375, "learning_rate": 1.4400000000000001e-05, "loss": 0.4051, "step": 72 }, { "epoch": 0.14038328330639574, "grad_norm": 2.28125, "learning_rate": 1.5200000000000002e-05, "loss": 0.4058, "step": 76 }, { "epoch": 0.1477718771646271, "grad_norm": 2.046875, "learning_rate": 1.6000000000000003e-05, "loss": 0.4084, "step": 80 }, { "epoch": 0.15516047102285846, "grad_norm": 2.203125, "learning_rate": 1.6800000000000002e-05, "loss": 0.402, "step": 84 }, { "epoch": 0.1625490648810898, "grad_norm": 1.9921875, "learning_rate": 1.76e-05, "loss": 0.3955, "step": 88 }, { "epoch": 0.16993765873932118, "grad_norm": 1.96875, "learning_rate": 1.8400000000000003e-05, "loss": 0.395, "step": 92 }, { "epoch": 0.17732625259755253, "grad_norm": 2.0625, "learning_rate": 1.9200000000000003e-05, "loss": 0.4021, "step": 96 }, { "epoch": 0.18471484645578387, "grad_norm": 2.0, "learning_rate": 2e-05, "loss": 0.3918, "step": 100 }, { "epoch": 0.19210344031401524, "grad_norm": 1.75, "learning_rate": 1.9999659601637828e-05, "loss": 0.3943, "step": 104 }, { "epoch": 0.1994920341722466, "grad_norm": 1.8046875, "learning_rate": 1.9998638429725526e-05, "loss": 0.3955, "step": 108 }, { "epoch": 0.20688062803047794, "grad_norm": 1.9296875, "learning_rate": 1.9996936553784137e-05, "loss": 0.4017, "step": 112 }, { "epoch": 0.2142692218887093, "grad_norm": 2.03125, "learning_rate": 1.999455408967682e-05, "loss": 0.4023, "step": 116 }, { "epoch": 0.22165781574694066, "grad_norm": 2.015625, "learning_rate": 1.999149119960095e-05, "loss": 0.4046, "step": 120 }, { "epoch": 0.229046409605172, "grad_norm": 1.734375, "learning_rate": 1.9987748092077082e-05, "loss": 0.3783, "step": 124 }, { "epoch": 0.23643500346340338, "grad_norm": 1.671875, "learning_rate": 1.998332502193475e-05, "loss": 0.3965, "step": 128 }, { "epoch": 0.24382359732163472, "grad_norm": 2.1875, "learning_rate": 1.9978222290295116e-05, "loss": 0.4031, "step": 132 }, { "epoch": 0.2512121911798661, "grad_norm": 1.7265625, "learning_rate": 1.9972440244550485e-05, "loss": 0.3685, "step": 136 }, { "epoch": 0.2586007850380974, "grad_norm": 1.78125, "learning_rate": 1.9965979278340638e-05, "loss": 0.4004, "step": 140 }, { "epoch": 0.2659893788963288, "grad_norm": 1.8203125, "learning_rate": 1.995883983152603e-05, "loss": 0.3881, "step": 144 }, { "epoch": 0.27337797275456016, "grad_norm": 1.6171875, "learning_rate": 1.995102239015787e-05, "loss": 0.4091, "step": 148 }, { "epoch": 0.2807665666127915, "grad_norm": 1.84375, "learning_rate": 1.9942527486445e-05, "loss": 0.3785, "step": 152 }, { "epoch": 0.28815516047102285, "grad_norm": 1.875, "learning_rate": 1.9933355698717683e-05, "loss": 0.3999, "step": 156 }, { "epoch": 0.2955437543292542, "grad_norm": 1.75, "learning_rate": 1.9923507651388224e-05, "loss": 0.3682, "step": 160 }, { "epoch": 0.30293234818748555, "grad_norm": 1.7734375, "learning_rate": 1.991298401490846e-05, "loss": 0.3975, "step": 164 }, { "epoch": 0.3103209420457169, "grad_norm": 1.5625, "learning_rate": 1.9901785505724117e-05, "loss": 0.382, "step": 168 }, { "epoch": 0.3177095359039483, "grad_norm": 1.6640625, "learning_rate": 1.988991288622603e-05, "loss": 0.364, "step": 172 }, { "epoch": 0.3250981297621796, "grad_norm": 1.8515625, "learning_rate": 1.9877366964698242e-05, "loss": 0.375, "step": 176 }, { "epoch": 0.332486723620411, "grad_norm": 1.8359375, "learning_rate": 1.9864148595262988e-05, "loss": 0.3637, "step": 180 }, { "epoch": 0.33987531747864236, "grad_norm": 1.7265625, "learning_rate": 1.985025867782252e-05, "loss": 0.3655, "step": 184 }, { "epoch": 0.3472639113368737, "grad_norm": 1.484375, "learning_rate": 1.9835698157997877e-05, "loss": 0.3498, "step": 188 }, { "epoch": 0.35465250519510505, "grad_norm": 1.7109375, "learning_rate": 1.9820468027064478e-05, "loss": 0.3802, "step": 192 }, { "epoch": 0.3620410990533364, "grad_norm": 1.5390625, "learning_rate": 1.9804569321884638e-05, "loss": 0.3692, "step": 196 }, { "epoch": 0.36942969291156774, "grad_norm": 1.625, "learning_rate": 1.978800312483701e-05, "loss": 0.3667, "step": 200 }, { "epoch": 0.3768182867697991, "grad_norm": 1.7109375, "learning_rate": 1.9770770563742854e-05, "loss": 0.3677, "step": 204 }, { "epoch": 0.3842068806280305, "grad_norm": 2.0, "learning_rate": 1.975287281178929e-05, "loss": 0.3802, "step": 208 }, { "epoch": 0.3915954744862618, "grad_norm": 1.703125, "learning_rate": 1.9734311087449408e-05, "loss": 0.3543, "step": 212 }, { "epoch": 0.3989840683444932, "grad_norm": 1.6171875, "learning_rate": 1.9715086654399317e-05, "loss": 0.3738, "step": 216 }, { "epoch": 0.40637266220272455, "grad_norm": 1.4453125, "learning_rate": 1.9695200821432126e-05, "loss": 0.3568, "step": 220 }, { "epoch": 0.4137612560609559, "grad_norm": 1.6796875, "learning_rate": 1.9674654942368824e-05, "loss": 0.3534, "step": 224 }, { "epoch": 0.42114984991918725, "grad_norm": 1.890625, "learning_rate": 1.965345041596613e-05, "loss": 0.3793, "step": 228 }, { "epoch": 0.4285384437774186, "grad_norm": 2.203125, "learning_rate": 1.963158868582126e-05, "loss": 0.3739, "step": 232 }, { "epoch": 0.43592703763564994, "grad_norm": 1.625, "learning_rate": 1.9609071240273637e-05, "loss": 0.3701, "step": 236 }, { "epoch": 0.4433156314938813, "grad_norm": 1.578125, "learning_rate": 1.958589961230358e-05, "loss": 0.3648, "step": 240 }, { "epoch": 0.4507042253521127, "grad_norm": 1.59375, "learning_rate": 1.9562075379427926e-05, "loss": 0.3775, "step": 244 }, { "epoch": 0.458092819210344, "grad_norm": 1.9609375, "learning_rate": 1.953760016359265e-05, "loss": 0.3806, "step": 248 }, { "epoch": 0.4654814130685754, "grad_norm": 1.6171875, "learning_rate": 1.951247563106243e-05, "loss": 0.366, "step": 252 }, { "epoch": 0.47287000692680675, "grad_norm": 1.5703125, "learning_rate": 1.9486703492307207e-05, "loss": 0.3682, "step": 256 }, { "epoch": 0.48025860078503807, "grad_norm": 1.578125, "learning_rate": 1.9460285501885744e-05, "loss": 0.3661, "step": 260 }, { "epoch": 0.48764719464326944, "grad_norm": 1.609375, "learning_rate": 1.943322345832618e-05, "loss": 0.3713, "step": 264 }, { "epoch": 0.4950357885015008, "grad_norm": 1.5078125, "learning_rate": 1.9405519204003577e-05, "loss": 0.3459, "step": 268 }, { "epoch": 0.5024243823597322, "grad_norm": 1.5703125, "learning_rate": 1.9377174625014485e-05, "loss": 0.3871, "step": 272 }, { "epoch": 0.5098129762179635, "grad_norm": 1.5625, "learning_rate": 1.9348191651048565e-05, "loss": 0.3369, "step": 276 }, { "epoch": 0.5172015700761948, "grad_norm": 1.6015625, "learning_rate": 1.9318572255257192e-05, "loss": 0.3698, "step": 280 }, { "epoch": 0.5245901639344263, "grad_norm": 1.8125, "learning_rate": 1.9288318454119122e-05, "loss": 0.3649, "step": 284 }, { "epoch": 0.5319787577926576, "grad_norm": 1.4609375, "learning_rate": 1.9257432307303232e-05, "loss": 0.3692, "step": 288 }, { "epoch": 0.5393673516508889, "grad_norm": 1.6171875, "learning_rate": 1.9225915917528277e-05, "loss": 0.3789, "step": 292 }, { "epoch": 0.5467559455091203, "grad_norm": 1.59375, "learning_rate": 1.919377143041975e-05, "loss": 0.3755, "step": 296 }, { "epoch": 0.5541445393673516, "grad_norm": 1.46875, "learning_rate": 1.916100103436381e-05, "loss": 0.3482, "step": 300 }, { "epoch": 0.561533133225583, "grad_norm": 1.84375, "learning_rate": 1.9127606960358274e-05, "loss": 0.3644, "step": 304 }, { "epoch": 0.5689217270838144, "grad_norm": 1.5078125, "learning_rate": 1.9093591481860772e-05, "loss": 0.3708, "step": 308 }, { "epoch": 0.5763103209420457, "grad_norm": 1.46875, "learning_rate": 1.9058956914633933e-05, "loss": 0.3814, "step": 312 }, { "epoch": 0.583698914800277, "grad_norm": 1.34375, "learning_rate": 1.9023705616587747e-05, "loss": 0.343, "step": 316 }, { "epoch": 0.5910875086585085, "grad_norm": 1.515625, "learning_rate": 1.8987839987619042e-05, "loss": 0.3793, "step": 320 }, { "epoch": 0.5984761025167398, "grad_norm": 1.703125, "learning_rate": 1.895136246944809e-05, "loss": 0.3553, "step": 324 }, { "epoch": 0.6058646963749711, "grad_norm": 1.59375, "learning_rate": 1.8914275545452377e-05, "loss": 0.3714, "step": 328 }, { "epoch": 0.6132532902332025, "grad_norm": 1.5859375, "learning_rate": 1.887658174049754e-05, "loss": 0.3691, "step": 332 }, { "epoch": 0.6206418840914338, "grad_norm": 1.484375, "learning_rate": 1.8838283620765476e-05, "loss": 0.341, "step": 336 }, { "epoch": 0.6280304779496652, "grad_norm": 1.546875, "learning_rate": 1.879938379357962e-05, "loss": 0.3559, "step": 340 }, { "epoch": 0.6354190718078966, "grad_norm": 1.859375, "learning_rate": 1.8759884907227485e-05, "loss": 0.344, "step": 344 }, { "epoch": 0.6428076656661279, "grad_norm": 1.5, "learning_rate": 1.87197896507803e-05, "loss": 0.3518, "step": 348 }, { "epoch": 0.6501962595243592, "grad_norm": 1.5, "learning_rate": 1.867910075390999e-05, "loss": 0.3399, "step": 352 }, { "epoch": 0.6575848533825907, "grad_norm": 1.5859375, "learning_rate": 1.8637820986703335e-05, "loss": 0.3468, "step": 356 }, { "epoch": 0.664973447240822, "grad_norm": 1.5625, "learning_rate": 1.8595953159473357e-05, "loss": 0.3479, "step": 360 }, { "epoch": 0.6723620410990533, "grad_norm": 1.6328125, "learning_rate": 1.8553500122568023e-05, "loss": 0.3497, "step": 364 }, { "epoch": 0.6797506349572847, "grad_norm": 1.546875, "learning_rate": 1.8510464766176178e-05, "loss": 0.3544, "step": 368 }, { "epoch": 0.687139228815516, "grad_norm": 1.578125, "learning_rate": 1.8466850020130786e-05, "loss": 0.344, "step": 372 }, { "epoch": 0.6945278226737474, "grad_norm": 1.65625, "learning_rate": 1.8422658853709475e-05, "loss": 0.3508, "step": 376 }, { "epoch": 0.7019164165319788, "grad_norm": 1.5625, "learning_rate": 1.8377894275432376e-05, "loss": 0.3452, "step": 380 }, { "epoch": 0.7093050103902101, "grad_norm": 1.5390625, "learning_rate": 1.833255933285732e-05, "loss": 0.3604, "step": 384 }, { "epoch": 0.7166936042484414, "grad_norm": 1.4765625, "learning_rate": 1.8286657112372342e-05, "loss": 0.3719, "step": 388 }, { "epoch": 0.7240821981066728, "grad_norm": 1.4765625, "learning_rate": 1.8240190738985578e-05, "loss": 0.356, "step": 392 }, { "epoch": 0.7314707919649042, "grad_norm": 1.546875, "learning_rate": 1.819316337611251e-05, "loss": 0.3592, "step": 396 }, { "epoch": 0.7388593858231355, "grad_norm": 1.6328125, "learning_rate": 1.8145578225360594e-05, "loss": 0.3557, "step": 400 }, { "epoch": 0.7462479796813669, "grad_norm": 1.609375, "learning_rate": 1.809743852631131e-05, "loss": 0.3532, "step": 404 }, { "epoch": 0.7536365735395982, "grad_norm": 1.4921875, "learning_rate": 1.8048747556299595e-05, "loss": 0.3723, "step": 408 }, { "epoch": 0.7610251673978295, "grad_norm": 1.6796875, "learning_rate": 1.7999508630190742e-05, "loss": 0.3604, "step": 412 }, { "epoch": 0.768413761256061, "grad_norm": 1.515625, "learning_rate": 1.7949725100154707e-05, "loss": 0.36, "step": 416 }, { "epoch": 0.7758023551142923, "grad_norm": 1.453125, "learning_rate": 1.789940035543792e-05, "loss": 0.3497, "step": 420 }, { "epoch": 0.7831909489725236, "grad_norm": 1.6796875, "learning_rate": 1.7848537822132498e-05, "loss": 0.3487, "step": 424 }, { "epoch": 0.790579542830755, "grad_norm": 1.5390625, "learning_rate": 1.7797140962943062e-05, "loss": 0.3491, "step": 428 }, { "epoch": 0.7979681366889864, "grad_norm": 1.6015625, "learning_rate": 1.7745213276950947e-05, "loss": 0.3627, "step": 432 }, { "epoch": 0.8053567305472177, "grad_norm": 1.75, "learning_rate": 1.7692758299376002e-05, "loss": 0.3534, "step": 436 }, { "epoch": 0.8127453244054491, "grad_norm": 1.4140625, "learning_rate": 1.7639779601335916e-05, "loss": 0.3565, "step": 440 }, { "epoch": 0.8201339182636804, "grad_norm": 1.59375, "learning_rate": 1.7586280789603104e-05, "loss": 0.3627, "step": 444 }, { "epoch": 0.8275225121219117, "grad_norm": 1.53125, "learning_rate": 1.753226550635914e-05, "loss": 0.3352, "step": 448 }, { "epoch": 0.8349111059801432, "grad_norm": 1.484375, "learning_rate": 1.7477737428946814e-05, "loss": 0.3499, "step": 452 }, { "epoch": 0.8422996998383745, "grad_norm": 1.6875, "learning_rate": 1.742270026961978e-05, "loss": 0.343, "step": 456 }, { "epoch": 0.8496882936966058, "grad_norm": 1.5703125, "learning_rate": 1.736715777528981e-05, "loss": 0.345, "step": 460 }, { "epoch": 0.8570768875548372, "grad_norm": 1.421875, "learning_rate": 1.7311113727271728e-05, "loss": 0.3329, "step": 464 }, { "epoch": 0.8644654814130686, "grad_norm": 1.609375, "learning_rate": 1.725457194102596e-05, "loss": 0.3299, "step": 468 }, { "epoch": 0.8718540752712999, "grad_norm": 1.375, "learning_rate": 1.71975362658988e-05, "loss": 0.3381, "step": 472 }, { "epoch": 0.8792426691295313, "grad_norm": 1.53125, "learning_rate": 1.7140010584860322e-05, "loss": 0.3554, "step": 476 }, { "epoch": 0.8866312629877626, "grad_norm": 1.7890625, "learning_rate": 1.708199881424005e-05, "loss": 0.3442, "step": 480 }, { "epoch": 0.8940198568459939, "grad_norm": 1.4921875, "learning_rate": 1.7023504903460325e-05, "loss": 0.3501, "step": 484 }, { "epoch": 0.9014084507042254, "grad_norm": 1.5, "learning_rate": 1.696453283476743e-05, "loss": 0.3751, "step": 488 }, { "epoch": 0.9087970445624567, "grad_norm": 1.5234375, "learning_rate": 1.6905086622960488e-05, "loss": 0.3613, "step": 492 }, { "epoch": 0.916185638420688, "grad_norm": 1.453125, "learning_rate": 1.6845170315118124e-05, "loss": 0.3655, "step": 496 }, { "epoch": 0.9235742322789194, "grad_norm": 1.375, "learning_rate": 1.678478799032295e-05, "loss": 0.338, "step": 500 }, { "epoch": 0.9309628261371508, "grad_norm": 1.6015625, "learning_rate": 1.6723943759383857e-05, "loss": 0.3414, "step": 504 }, { "epoch": 0.9383514199953821, "grad_norm": 1.7109375, "learning_rate": 1.666264176455616e-05, "loss": 0.3453, "step": 508 }, { "epoch": 0.9457400138536135, "grad_norm": 1.8671875, "learning_rate": 1.660088617925959e-05, "loss": 0.3275, "step": 512 }, { "epoch": 0.9531286077118448, "grad_norm": 1.65625, "learning_rate": 1.6538681207794153e-05, "loss": 0.3584, "step": 516 }, { "epoch": 0.9605172015700761, "grad_norm": 1.3984375, "learning_rate": 1.6476031085053937e-05, "loss": 0.3594, "step": 520 }, { "epoch": 0.9679057954283076, "grad_norm": 1.609375, "learning_rate": 1.6412940076238782e-05, "loss": 0.3527, "step": 524 }, { "epoch": 0.9752943892865389, "grad_norm": 1.5703125, "learning_rate": 1.6349412476563893e-05, "loss": 0.3654, "step": 528 }, { "epoch": 0.9826829831447702, "grad_norm": 1.4765625, "learning_rate": 1.628545261096745e-05, "loss": 0.3611, "step": 532 }, { "epoch": 0.9900715770030016, "grad_norm": 1.5078125, "learning_rate": 1.622106483381615e-05, "loss": 0.332, "step": 536 }, { "epoch": 0.997460170861233, "grad_norm": 1.5234375, "learning_rate": 1.615625352860877e-05, "loss": 0.329, "step": 540 }, { "epoch": 1.0048487647194644, "grad_norm": 1.1328125, "learning_rate": 1.6091023107677747e-05, "loss": 0.2864, "step": 544 }, { "epoch": 1.0122373585776956, "grad_norm": 1.3828125, "learning_rate": 1.602537801188876e-05, "loss": 0.2337, "step": 548 }, { "epoch": 1.019625952435927, "grad_norm": 1.5078125, "learning_rate": 1.5959322710338426e-05, "loss": 0.2459, "step": 552 }, { "epoch": 1.0270145462941584, "grad_norm": 1.53125, "learning_rate": 1.589286170005005e-05, "loss": 0.2397, "step": 556 }, { "epoch": 1.0344031401523897, "grad_norm": 1.6015625, "learning_rate": 1.5825999505667426e-05, "loss": 0.255, "step": 560 }, { "epoch": 1.041791734010621, "grad_norm": 1.3125, "learning_rate": 1.5758740679146858e-05, "loss": 0.2221, "step": 564 }, { "epoch": 1.0491803278688525, "grad_norm": 1.3203125, "learning_rate": 1.569108979944722e-05, "loss": 0.218, "step": 568 }, { "epoch": 1.0565689217270837, "grad_norm": 1.5078125, "learning_rate": 1.562305147221824e-05, "loss": 0.2313, "step": 572 }, { "epoch": 1.0639575155853152, "grad_norm": 1.59375, "learning_rate": 1.5554630329486954e-05, "loss": 0.2245, "step": 576 }, { "epoch": 1.0713461094435466, "grad_norm": 1.609375, "learning_rate": 1.548583102934234e-05, "loss": 0.2396, "step": 580 }, { "epoch": 1.0787347033017778, "grad_norm": 1.4453125, "learning_rate": 1.541665825561822e-05, "loss": 0.2248, "step": 584 }, { "epoch": 1.0861232971600092, "grad_norm": 1.421875, "learning_rate": 1.5347116717574367e-05, "loss": 0.2263, "step": 588 }, { "epoch": 1.0935118910182406, "grad_norm": 1.5625, "learning_rate": 1.5277211149575915e-05, "loss": 0.2355, "step": 592 }, { "epoch": 1.1009004848764719, "grad_norm": 1.5390625, "learning_rate": 1.5206946310771034e-05, "loss": 0.2197, "step": 596 }, { "epoch": 1.1082890787347033, "grad_norm": 1.5625, "learning_rate": 1.5136326984766934e-05, "loss": 0.2385, "step": 600 }, { "epoch": 1.1156776725929347, "grad_norm": 1.4921875, "learning_rate": 1.5065357979304191e-05, "loss": 0.2325, "step": 604 }, { "epoch": 1.123066266451166, "grad_norm": 1.421875, "learning_rate": 1.4994044125929458e-05, "loss": 0.233, "step": 608 }, { "epoch": 1.1304548603093973, "grad_norm": 1.4609375, "learning_rate": 1.492239027966651e-05, "loss": 0.2392, "step": 612 }, { "epoch": 1.1378434541676288, "grad_norm": 1.515625, "learning_rate": 1.4850401318685728e-05, "loss": 0.2245, "step": 616 }, { "epoch": 1.14523204802586, "grad_norm": 1.59375, "learning_rate": 1.4778082143971992e-05, "loss": 0.2314, "step": 620 }, { "epoch": 1.1526206418840914, "grad_norm": 1.515625, "learning_rate": 1.4705437678991034e-05, "loss": 0.2159, "step": 624 }, { "epoch": 1.1600092357423228, "grad_norm": 1.3828125, "learning_rate": 1.463247286935423e-05, "loss": 0.2289, "step": 628 }, { "epoch": 1.167397829600554, "grad_norm": 1.3828125, "learning_rate": 1.455919268248192e-05, "loss": 0.2445, "step": 632 }, { "epoch": 1.1747864234587855, "grad_norm": 1.3984375, "learning_rate": 1.4485602107265221e-05, "loss": 0.2343, "step": 636 }, { "epoch": 1.182175017317017, "grad_norm": 1.5234375, "learning_rate": 1.4411706153726391e-05, "loss": 0.2249, "step": 640 }, { "epoch": 1.1895636111752481, "grad_norm": 1.515625, "learning_rate": 1.4337509852677735e-05, "loss": 0.2241, "step": 644 }, { "epoch": 1.1969522050334795, "grad_norm": 1.515625, "learning_rate": 1.4263018255379132e-05, "loss": 0.2236, "step": 648 }, { "epoch": 1.204340798891711, "grad_norm": 1.4453125, "learning_rate": 1.4188236433194117e-05, "loss": 0.2215, "step": 652 }, { "epoch": 1.2117293927499422, "grad_norm": 1.53125, "learning_rate": 1.4113169477244659e-05, "loss": 0.2194, "step": 656 }, { "epoch": 1.2191179866081736, "grad_norm": 1.5546875, "learning_rate": 1.4037822498064517e-05, "loss": 0.2309, "step": 660 }, { "epoch": 1.226506580466405, "grad_norm": 1.4296875, "learning_rate": 1.3962200625251363e-05, "loss": 0.2195, "step": 664 }, { "epoch": 1.2338951743246362, "grad_norm": 1.5859375, "learning_rate": 1.3886309007117522e-05, "loss": 0.2402, "step": 668 }, { "epoch": 1.2412837681828677, "grad_norm": 1.53125, "learning_rate": 1.3810152810339496e-05, "loss": 0.2419, "step": 672 }, { "epoch": 1.248672362041099, "grad_norm": 1.4296875, "learning_rate": 1.3733737219606222e-05, "loss": 0.2321, "step": 676 }, { "epoch": 1.2560609558993305, "grad_norm": 1.3046875, "learning_rate": 1.3657067437266078e-05, "loss": 0.2086, "step": 680 }, { "epoch": 1.2634495497575617, "grad_norm": 1.5, "learning_rate": 1.3580148682972739e-05, "loss": 0.2302, "step": 684 }, { "epoch": 1.2708381436157932, "grad_norm": 1.453125, "learning_rate": 1.3502986193329793e-05, "loss": 0.2251, "step": 688 }, { "epoch": 1.2782267374740246, "grad_norm": 1.3515625, "learning_rate": 1.3425585221534266e-05, "loss": 0.2315, "step": 692 }, { "epoch": 1.2856153313322558, "grad_norm": 1.546875, "learning_rate": 1.3347951037018961e-05, "loss": 0.2234, "step": 696 }, { "epoch": 1.2930039251904872, "grad_norm": 1.4140625, "learning_rate": 1.3270088925093726e-05, "loss": 0.2236, "step": 700 }, { "epoch": 1.3003925190487187, "grad_norm": 1.65625, "learning_rate": 1.3192004186585643e-05, "loss": 0.2263, "step": 704 }, { "epoch": 1.3077811129069499, "grad_norm": 1.46875, "learning_rate": 1.311370213747813e-05, "loss": 0.2368, "step": 708 }, { "epoch": 1.3151697067651813, "grad_norm": 1.4609375, "learning_rate": 1.3035188108549035e-05, "loss": 0.2393, "step": 712 }, { "epoch": 1.3225583006234127, "grad_norm": 2.015625, "learning_rate": 1.2956467445007736e-05, "loss": 0.2355, "step": 716 }, { "epoch": 1.329946894481644, "grad_norm": 1.5078125, "learning_rate": 1.2877545506131219e-05, "loss": 0.2286, "step": 720 }, { "epoch": 1.3373354883398754, "grad_norm": 1.4921875, "learning_rate": 1.2798427664899232e-05, "loss": 0.2264, "step": 724 }, { "epoch": 1.3447240821981068, "grad_norm": 1.5703125, "learning_rate": 1.271911930762848e-05, "loss": 0.2223, "step": 728 }, { "epoch": 1.352112676056338, "grad_norm": 1.640625, "learning_rate": 1.2639625833605959e-05, "loss": 0.2247, "step": 732 }, { "epoch": 1.3595012699145694, "grad_norm": 1.59375, "learning_rate": 1.255995265472134e-05, "loss": 0.2314, "step": 736 }, { "epoch": 1.3668898637728009, "grad_norm": 1.5546875, "learning_rate": 1.2480105195098537e-05, "loss": 0.2381, "step": 740 }, { "epoch": 1.374278457631032, "grad_norm": 1.546875, "learning_rate": 1.2400088890726451e-05, "loss": 0.2292, "step": 744 }, { "epoch": 1.3816670514892635, "grad_norm": 1.515625, "learning_rate": 1.231990918908887e-05, "loss": 0.2222, "step": 748 }, { "epoch": 1.389055645347495, "grad_norm": 1.8125, "learning_rate": 1.2239571548793618e-05, "loss": 0.2149, "step": 752 }, { "epoch": 1.3964442392057261, "grad_norm": 1.515625, "learning_rate": 1.2159081439200933e-05, "loss": 0.2465, "step": 756 }, { "epoch": 1.4038328330639576, "grad_norm": 1.546875, "learning_rate": 1.2078444340051107e-05, "loss": 0.2329, "step": 760 }, { "epoch": 1.411221426922189, "grad_norm": 1.6328125, "learning_rate": 1.199766574109144e-05, "loss": 0.2284, "step": 764 }, { "epoch": 1.4186100207804202, "grad_norm": 1.4765625, "learning_rate": 1.1916751141702485e-05, "loss": 0.2453, "step": 768 }, { "epoch": 1.4259986146386516, "grad_norm": 1.640625, "learning_rate": 1.1835706050523663e-05, "loss": 0.2267, "step": 772 }, { "epoch": 1.433387208496883, "grad_norm": 1.59375, "learning_rate": 1.1754535985078236e-05, "loss": 0.2178, "step": 776 }, { "epoch": 1.4407758023551143, "grad_norm": 1.4765625, "learning_rate": 1.1673246471397672e-05, "loss": 0.2171, "step": 780 }, { "epoch": 1.4481643962133457, "grad_norm": 1.453125, "learning_rate": 1.1591843043645432e-05, "loss": 0.2279, "step": 784 }, { "epoch": 1.4555529900715771, "grad_norm": 1.6015625, "learning_rate": 1.1510331243740214e-05, "loss": 0.2298, "step": 788 }, { "epoch": 1.4629415839298083, "grad_norm": 1.53125, "learning_rate": 1.1428716620978654e-05, "loss": 0.2319, "step": 792 }, { "epoch": 1.4703301777880398, "grad_norm": 1.4609375, "learning_rate": 1.134700473165754e-05, "loss": 0.2392, "step": 796 }, { "epoch": 1.4777187716462712, "grad_norm": 1.5234375, "learning_rate": 1.1265201138695526e-05, "loss": 0.2313, "step": 800 }, { "epoch": 1.4851073655045024, "grad_norm": 1.421875, "learning_rate": 1.1183311411254425e-05, "loss": 0.2021, "step": 804 }, { "epoch": 1.4924959593627338, "grad_norm": 1.3984375, "learning_rate": 1.1101341124360056e-05, "loss": 0.2178, "step": 808 }, { "epoch": 1.4998845532209653, "grad_norm": 1.609375, "learning_rate": 1.1019295858522708e-05, "loss": 0.2268, "step": 812 }, { "epoch": 1.5072731470791965, "grad_norm": 1.6015625, "learning_rate": 1.0937181199357192e-05, "loss": 0.2244, "step": 816 }, { "epoch": 1.514661740937428, "grad_norm": 1.484375, "learning_rate": 1.0855002737202617e-05, "loss": 0.2311, "step": 820 }, { "epoch": 1.5220503347956593, "grad_norm": 1.59375, "learning_rate": 1.0772766066741763e-05, "loss": 0.2314, "step": 824 }, { "epoch": 1.5294389286538905, "grad_norm": 1.5234375, "learning_rate": 1.0690476786620214e-05, "loss": 0.2357, "step": 828 }, { "epoch": 1.536827522512122, "grad_norm": 1.4921875, "learning_rate": 1.060814049906521e-05, "loss": 0.2126, "step": 832 }, { "epoch": 1.5442161163703534, "grad_norm": 1.4453125, "learning_rate": 1.0525762809504234e-05, "loss": 0.235, "step": 836 }, { "epoch": 1.5516047102285846, "grad_norm": 1.453125, "learning_rate": 1.0443349326183409e-05, "loss": 0.2274, "step": 840 }, { "epoch": 1.558993304086816, "grad_norm": 1.640625, "learning_rate": 1.0360905659785682e-05, "loss": 0.2404, "step": 844 }, { "epoch": 1.5663818979450475, "grad_norm": 1.5625, "learning_rate": 1.0278437423048862e-05, "loss": 0.2124, "step": 848 }, { "epoch": 1.5737704918032787, "grad_norm": 1.578125, "learning_rate": 1.0195950230383482e-05, "loss": 0.2218, "step": 852 }, { "epoch": 1.58115908566151, "grad_norm": 1.671875, "learning_rate": 1.011344969749061e-05, "loss": 0.2298, "step": 856 }, { "epoch": 1.5885476795197415, "grad_norm": 1.359375, "learning_rate": 1.0030941440979489e-05, "loss": 0.2238, "step": 860 }, { "epoch": 1.5959362733779727, "grad_norm": 1.390625, "learning_rate": 9.948431077985203e-06, "loss": 0.2225, "step": 864 }, { "epoch": 1.6033248672362042, "grad_norm": 1.515625, "learning_rate": 9.865924225786241e-06, "loss": 0.227, "step": 868 }, { "epoch": 1.6107134610944356, "grad_norm": 1.453125, "learning_rate": 9.783426501422063e-06, "loss": 0.2291, "step": 872 }, { "epoch": 1.6181020549526668, "grad_norm": 1.6171875, "learning_rate": 9.700943521310727e-06, "loss": 0.2179, "step": 876 }, { "epoch": 1.6254906488108982, "grad_norm": 1.390625, "learning_rate": 9.618480900866502e-06, "loss": 0.2161, "step": 880 }, { "epoch": 1.6328792426691296, "grad_norm": 1.4765625, "learning_rate": 9.536044254117572e-06, "loss": 0.2139, "step": 884 }, { "epoch": 1.6402678365273609, "grad_norm": 1.4609375, "learning_rate": 9.453639193323844e-06, "loss": 0.226, "step": 888 }, { "epoch": 1.6476564303855923, "grad_norm": 1.390625, "learning_rate": 9.371271328594867e-06, "loss": 0.2072, "step": 892 }, { "epoch": 1.6550450242438237, "grad_norm": 1.484375, "learning_rate": 9.28894626750789e-06, "loss": 0.2195, "step": 896 }, { "epoch": 1.662433618102055, "grad_norm": 1.609375, "learning_rate": 9.2066696147261e-06, "loss": 0.2293, "step": 900 }, { "epoch": 1.6698222119602864, "grad_norm": 1.5859375, "learning_rate": 9.124446971617078e-06, "loss": 0.2313, "step": 904 }, { "epoch": 1.6772108058185178, "grad_norm": 1.4765625, "learning_rate": 9.042283935871425e-06, "loss": 0.2237, "step": 908 }, { "epoch": 1.684599399676749, "grad_norm": 1.671875, "learning_rate": 8.9601861011217e-06, "loss": 0.2418, "step": 912 }, { "epoch": 1.6919879935349804, "grad_norm": 1.4375, "learning_rate": 8.878159056561605e-06, "loss": 0.2304, "step": 916 }, { "epoch": 1.6993765873932118, "grad_norm": 1.4921875, "learning_rate": 8.796208386565465e-06, "loss": 0.2202, "step": 920 }, { "epoch": 1.706765181251443, "grad_norm": 1.3984375, "learning_rate": 8.714339670308045e-06, "loss": 0.2104, "step": 924 }, { "epoch": 1.7141537751096745, "grad_norm": 1.5234375, "learning_rate": 8.632558481384729e-06, "loss": 0.2206, "step": 928 }, { "epoch": 1.721542368967906, "grad_norm": 1.4765625, "learning_rate": 8.550870387432075e-06, "loss": 0.2187, "step": 932 }, { "epoch": 1.7289309628261371, "grad_norm": 1.4453125, "learning_rate": 8.469280949748764e-06, "loss": 0.2283, "step": 936 }, { "epoch": 1.7363195566843685, "grad_norm": 1.5234375, "learning_rate": 8.38779572291698e-06, "loss": 0.2195, "step": 940 }, { "epoch": 1.7437081505426, "grad_norm": 1.5078125, "learning_rate": 8.306420254424274e-06, "loss": 0.2237, "step": 944 }, { "epoch": 1.7510967444008312, "grad_norm": 1.40625, "learning_rate": 8.225160084285893e-06, "loss": 0.2109, "step": 948 }, { "epoch": 1.7584853382590626, "grad_norm": 1.75, "learning_rate": 8.144020744667596e-06, "loss": 0.2244, "step": 952 }, { "epoch": 1.765873932117294, "grad_norm": 1.640625, "learning_rate": 8.063007759509044e-06, "loss": 0.2318, "step": 956 }, { "epoch": 1.7732625259755253, "grad_norm": 1.4765625, "learning_rate": 7.982126644147733e-06, "loss": 0.2318, "step": 960 }, { "epoch": 1.7806511198337567, "grad_norm": 1.515625, "learning_rate": 7.901382904943505e-06, "loss": 0.2226, "step": 964 }, { "epoch": 1.788039713691988, "grad_norm": 1.4453125, "learning_rate": 7.82078203890367e-06, "loss": 0.2323, "step": 968 }, { "epoch": 1.7954283075502193, "grad_norm": 1.609375, "learning_rate": 7.740329533308793e-06, "loss": 0.2209, "step": 972 }, { "epoch": 1.8028169014084507, "grad_norm": 1.609375, "learning_rate": 7.660030865339096e-06, "loss": 0.2228, "step": 976 }, { "epoch": 1.8102054952666822, "grad_norm": 1.5859375, "learning_rate": 7.57989150170159e-06, "loss": 0.2378, "step": 980 }, { "epoch": 1.8175940891249134, "grad_norm": 1.5703125, "learning_rate": 7.499916898257906e-06, "loss": 0.2257, "step": 984 }, { "epoch": 1.8249826829831448, "grad_norm": 1.578125, "learning_rate": 7.420112499652851e-06, "loss": 0.2138, "step": 988 }, { "epoch": 1.8323712768413762, "grad_norm": 1.46875, "learning_rate": 7.3404837389437314e-06, "loss": 0.2219, "step": 992 }, { "epoch": 1.8397598706996074, "grad_norm": 1.625, "learning_rate": 7.261036037230499e-06, "loss": 0.2291, "step": 996 }, { "epoch": 1.8471484645578389, "grad_norm": 1.6015625, "learning_rate": 7.181774803286661e-06, "loss": 0.2437, "step": 1000 }, { "epoch": 1.8545370584160703, "grad_norm": 1.5, "learning_rate": 7.102705433191066e-06, "loss": 0.2206, "step": 1004 }, { "epoch": 1.8619256522743015, "grad_norm": 1.5, "learning_rate": 7.02383330996052e-06, "loss": 0.228, "step": 1008 }, { "epoch": 1.869314246132533, "grad_norm": 1.4921875, "learning_rate": 6.945163803183345e-06, "loss": 0.2234, "step": 1012 }, { "epoch": 1.8767028399907644, "grad_norm": 1.65625, "learning_rate": 6.866702268653787e-06, "loss": 0.2359, "step": 1016 }, { "epoch": 1.8840914338489956, "grad_norm": 1.625, "learning_rate": 6.788454048007418e-06, "loss": 0.2411, "step": 1020 }, { "epoch": 1.891480027707227, "grad_norm": 1.4921875, "learning_rate": 6.710424468357471e-06, "loss": 0.2315, "step": 1024 }, { "epoch": 1.8988686215654584, "grad_norm": 1.4375, "learning_rate": 6.632618841932164e-06, "loss": 0.2092, "step": 1028 }, { "epoch": 1.9062572154236896, "grad_norm": 1.5859375, "learning_rate": 6.555042465713059e-06, "loss": 0.2192, "step": 1032 }, { "epoch": 1.913645809281921, "grad_norm": 1.6015625, "learning_rate": 6.4777006210744375e-06, "loss": 0.2158, "step": 1036 }, { "epoch": 1.9210344031401525, "grad_norm": 1.5078125, "learning_rate": 6.400598573423753e-06, "loss": 0.2213, "step": 1040 }, { "epoch": 1.9284229969983837, "grad_norm": 1.5390625, "learning_rate": 6.323741571843145e-06, "loss": 0.2106, "step": 1044 }, { "epoch": 1.9358115908566151, "grad_norm": 1.609375, "learning_rate": 6.24713484873211e-06, "loss": 0.2225, "step": 1048 }, { "epoch": 1.9432001847148466, "grad_norm": 1.5625, "learning_rate": 6.170783619451264e-06, "loss": 0.2151, "step": 1052 }, { "epoch": 1.9505887785730778, "grad_norm": 1.4921875, "learning_rate": 6.094693081967289e-06, "loss": 0.2305, "step": 1056 }, { "epoch": 1.9579773724313092, "grad_norm": 1.609375, "learning_rate": 6.018868416499046e-06, "loss": 0.2196, "step": 1060 }, { "epoch": 1.9653659662895406, "grad_norm": 1.625, "learning_rate": 5.943314785164924e-06, "loss": 0.2395, "step": 1064 }, { "epoch": 1.9727545601477718, "grad_norm": 1.546875, "learning_rate": 5.8680373316314e-06, "loss": 0.2275, "step": 1068 }, { "epoch": 1.9801431540060033, "grad_norm": 1.609375, "learning_rate": 5.793041180762845e-06, "loss": 0.2246, "step": 1072 }, { "epoch": 1.9875317478642347, "grad_norm": 1.453125, "learning_rate": 5.7183314382726555e-06, "loss": 0.2268, "step": 1076 }, { "epoch": 1.994920341722466, "grad_norm": 1.65625, "learning_rate": 5.643913190375614e-06, "loss": 0.2132, "step": 1080 } ], "logging_steps": 4, "max_steps": 1623, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.3964435361415823e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }