|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.9986146386515817,
  "eval_steps": 500,
  "global_step": 1082,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0073885938582313555,
      "grad_norm": 30.625,
      "learning_rate": 8.000000000000001e-07,
      "loss": 3.171,
      "step": 4
    },
    {
      "epoch": 0.014777187716462711,
      "grad_norm": 30.625,
      "learning_rate": 1.6000000000000001e-06,
      "loss": 3.1417,
      "step": 8
    },
    {
      "epoch": 0.022165781574694066,
      "grad_norm": 26.375,
      "learning_rate": 2.4000000000000003e-06,
      "loss": 2.9572,
      "step": 12
    },
    {
      "epoch": 0.029554375432925422,
      "grad_norm": 19.75,
      "learning_rate": 3.2000000000000003e-06,
      "loss": 2.4271,
      "step": 16
    },
    {
      "epoch": 0.03694296929115678,
      "grad_norm": 16.25,
      "learning_rate": 4.000000000000001e-06,
      "loss": 1.9425,
      "step": 20
    },
    {
      "epoch": 0.04433156314938813,
      "grad_norm": 10.8125,
      "learning_rate": 4.800000000000001e-06,
      "loss": 1.2272,
      "step": 24
    },
    {
      "epoch": 0.051720157007619484,
      "grad_norm": 6.21875,
      "learning_rate": 5.600000000000001e-06,
      "loss": 0.7596,
      "step": 28
    },
    {
      "epoch": 0.059108750865850844,
      "grad_norm": 4.6875,
      "learning_rate": 6.4000000000000006e-06,
      "loss": 0.5928,
      "step": 32
    },
    {
      "epoch": 0.0664973447240822,
      "grad_norm": 3.234375,
      "learning_rate": 7.2000000000000005e-06,
      "loss": 0.5097,
      "step": 36
    },
    {
      "epoch": 0.07388593858231356,
      "grad_norm": 2.765625,
      "learning_rate": 8.000000000000001e-06,
      "loss": 0.4897,
      "step": 40
    },
    {
      "epoch": 0.0812745324405449,
      "grad_norm": 2.609375,
      "learning_rate": 8.8e-06,
      "loss": 0.4433,
      "step": 44
    },
    {
      "epoch": 0.08866312629877626,
      "grad_norm": 2.515625,
      "learning_rate": 9.600000000000001e-06,
      "loss": 0.4597,
      "step": 48
    },
    {
      "epoch": 0.09605172015700762,
      "grad_norm": 2.109375,
      "learning_rate": 1.04e-05,
      "loss": 0.4418,
      "step": 52
    },
    {
      "epoch": 0.10344031401523897,
      "grad_norm": 2.109375,
      "learning_rate": 1.1200000000000001e-05,
      "loss": 0.416,
      "step": 56
    },
    {
      "epoch": 0.11082890787347033,
      "grad_norm": 2.921875,
      "learning_rate": 1.2e-05,
      "loss": 0.4305,
      "step": 60
    },
    {
      "epoch": 0.11821750173170169,
      "grad_norm": 2.21875,
      "learning_rate": 1.2800000000000001e-05,
      "loss": 0.3997,
      "step": 64
    },
    {
      "epoch": 0.12560609558993305,
      "grad_norm": 2.15625,
      "learning_rate": 1.3600000000000002e-05,
      "loss": 0.3958,
      "step": 68
    },
    {
      "epoch": 0.1329946894481644,
      "grad_norm": 2.484375,
      "learning_rate": 1.4400000000000001e-05,
      "loss": 0.4051,
      "step": 72
    },
    {
      "epoch": 0.14038328330639574,
      "grad_norm": 2.28125,
      "learning_rate": 1.5200000000000002e-05,
      "loss": 0.4058,
      "step": 76
    },
    {
      "epoch": 0.1477718771646271,
      "grad_norm": 2.046875,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 0.4084,
      "step": 80
    },
    {
      "epoch": 0.15516047102285846,
      "grad_norm": 2.203125,
      "learning_rate": 1.6800000000000002e-05,
      "loss": 0.402,
      "step": 84
    },
    {
      "epoch": 0.1625490648810898,
      "grad_norm": 1.9921875,
      "learning_rate": 1.76e-05,
      "loss": 0.3955,
      "step": 88
    },
    {
      "epoch": 0.16993765873932118,
      "grad_norm": 1.96875,
      "learning_rate": 1.8400000000000003e-05,
      "loss": 0.395,
      "step": 92
    },
    {
      "epoch": 0.17732625259755253,
      "grad_norm": 2.0625,
      "learning_rate": 1.9200000000000003e-05,
      "loss": 0.4021,
      "step": 96
    },
    {
      "epoch": 0.18471484645578387,
      "grad_norm": 2.0,
      "learning_rate": 2e-05,
      "loss": 0.3918,
      "step": 100
    },
    {
      "epoch": 0.19210344031401524,
      "grad_norm": 1.75,
      "learning_rate": 1.9999659601637828e-05,
      "loss": 0.3943,
      "step": 104
    },
    {
      "epoch": 0.1994920341722466,
      "grad_norm": 1.8046875,
      "learning_rate": 1.9998638429725526e-05,
      "loss": 0.3955,
      "step": 108
    },
    {
      "epoch": 0.20688062803047794,
      "grad_norm": 1.9296875,
      "learning_rate": 1.9996936553784137e-05,
      "loss": 0.4017,
      "step": 112
    },
    {
      "epoch": 0.2142692218887093,
      "grad_norm": 2.03125,
      "learning_rate": 1.999455408967682e-05,
      "loss": 0.4023,
      "step": 116
    },
    {
      "epoch": 0.22165781574694066,
      "grad_norm": 2.015625,
      "learning_rate": 1.999149119960095e-05,
      "loss": 0.4046,
      "step": 120
    },
    {
      "epoch": 0.229046409605172,
      "grad_norm": 1.734375,
      "learning_rate": 1.9987748092077082e-05,
      "loss": 0.3783,
      "step": 124
    },
    {
      "epoch": 0.23643500346340338,
      "grad_norm": 1.671875,
      "learning_rate": 1.998332502193475e-05,
      "loss": 0.3965,
      "step": 128
    },
    {
      "epoch": 0.24382359732163472,
      "grad_norm": 2.1875,
      "learning_rate": 1.9978222290295116e-05,
      "loss": 0.4031,
      "step": 132
    },
    {
      "epoch": 0.2512121911798661,
      "grad_norm": 1.7265625,
      "learning_rate": 1.9972440244550485e-05,
      "loss": 0.3685,
      "step": 136
    },
    {
      "epoch": 0.2586007850380974,
      "grad_norm": 1.78125,
      "learning_rate": 1.9965979278340638e-05,
      "loss": 0.4004,
      "step": 140
    },
    {
      "epoch": 0.2659893788963288,
      "grad_norm": 1.8203125,
      "learning_rate": 1.995883983152603e-05,
      "loss": 0.3881,
      "step": 144
    },
    {
      "epoch": 0.27337797275456016,
      "grad_norm": 1.6171875,
      "learning_rate": 1.995102239015787e-05,
      "loss": 0.4091,
      "step": 148
    },
    {
      "epoch": 0.2807665666127915,
      "grad_norm": 1.84375,
      "learning_rate": 1.9942527486445e-05,
      "loss": 0.3785,
      "step": 152
    },
    {
      "epoch": 0.28815516047102285,
      "grad_norm": 1.875,
      "learning_rate": 1.9933355698717683e-05,
      "loss": 0.3999,
      "step": 156
    },
    {
      "epoch": 0.2955437543292542,
      "grad_norm": 1.75,
      "learning_rate": 1.9923507651388224e-05,
      "loss": 0.3682,
      "step": 160
    },
    {
      "epoch": 0.30293234818748555,
      "grad_norm": 1.7734375,
      "learning_rate": 1.991298401490846e-05,
      "loss": 0.3975,
      "step": 164
    },
    {
      "epoch": 0.3103209420457169,
      "grad_norm": 1.5625,
      "learning_rate": 1.9901785505724117e-05,
      "loss": 0.382,
      "step": 168
    },
    {
      "epoch": 0.3177095359039483,
      "grad_norm": 1.6640625,
      "learning_rate": 1.988991288622603e-05,
      "loss": 0.364,
      "step": 172
    },
    {
      "epoch": 0.3250981297621796,
      "grad_norm": 1.8515625,
      "learning_rate": 1.9877366964698242e-05,
      "loss": 0.375,
      "step": 176
    },
    {
      "epoch": 0.332486723620411,
      "grad_norm": 1.8359375,
      "learning_rate": 1.9864148595262988e-05,
      "loss": 0.3637,
      "step": 180
    },
    {
      "epoch": 0.33987531747864236,
      "grad_norm": 1.7265625,
      "learning_rate": 1.985025867782252e-05,
      "loss": 0.3655,
      "step": 184
    },
    {
      "epoch": 0.3472639113368737,
      "grad_norm": 1.484375,
      "learning_rate": 1.9835698157997877e-05,
      "loss": 0.3498,
      "step": 188
    },
    {
      "epoch": 0.35465250519510505,
      "grad_norm": 1.7109375,
      "learning_rate": 1.9820468027064478e-05,
      "loss": 0.3802,
      "step": 192
    },
    {
      "epoch": 0.3620410990533364,
      "grad_norm": 1.5390625,
      "learning_rate": 1.9804569321884638e-05,
      "loss": 0.3692,
      "step": 196
    },
    {
      "epoch": 0.36942969291156774,
      "grad_norm": 1.625,
      "learning_rate": 1.978800312483701e-05,
      "loss": 0.3667,
      "step": 200
    },
    {
      "epoch": 0.3768182867697991,
      "grad_norm": 1.7109375,
      "learning_rate": 1.9770770563742854e-05,
      "loss": 0.3677,
      "step": 204
    },
    {
      "epoch": 0.3842068806280305,
      "grad_norm": 2.0,
      "learning_rate": 1.975287281178929e-05,
      "loss": 0.3802,
      "step": 208
    },
    {
      "epoch": 0.3915954744862618,
      "grad_norm": 1.703125,
      "learning_rate": 1.9734311087449408e-05,
      "loss": 0.3543,
      "step": 212
    },
    {
      "epoch": 0.3989840683444932,
      "grad_norm": 1.6171875,
      "learning_rate": 1.9715086654399317e-05,
      "loss": 0.3738,
      "step": 216
    },
    {
      "epoch": 0.40637266220272455,
      "grad_norm": 1.4453125,
      "learning_rate": 1.9695200821432126e-05,
      "loss": 0.3568,
      "step": 220
    },
    {
      "epoch": 0.4137612560609559,
      "grad_norm": 1.6796875,
      "learning_rate": 1.9674654942368824e-05,
      "loss": 0.3534,
      "step": 224
    },
    {
      "epoch": 0.42114984991918725,
      "grad_norm": 1.890625,
      "learning_rate": 1.965345041596613e-05,
      "loss": 0.3793,
      "step": 228
    },
    {
      "epoch": 0.4285384437774186,
      "grad_norm": 2.203125,
      "learning_rate": 1.963158868582126e-05,
      "loss": 0.3739,
      "step": 232
    },
    {
      "epoch": 0.43592703763564994,
      "grad_norm": 1.625,
      "learning_rate": 1.9609071240273637e-05,
      "loss": 0.3701,
      "step": 236
    },
    {
      "epoch": 0.4433156314938813,
      "grad_norm": 1.578125,
      "learning_rate": 1.958589961230358e-05,
      "loss": 0.3648,
      "step": 240
    },
    {
      "epoch": 0.4507042253521127,
      "grad_norm": 1.59375,
      "learning_rate": 1.9562075379427926e-05,
      "loss": 0.3775,
      "step": 244
    },
    {
      "epoch": 0.458092819210344,
      "grad_norm": 1.9609375,
      "learning_rate": 1.953760016359265e-05,
      "loss": 0.3806,
      "step": 248
    },
    {
      "epoch": 0.4654814130685754,
      "grad_norm": 1.6171875,
      "learning_rate": 1.951247563106243e-05,
      "loss": 0.366,
      "step": 252
    },
    {
      "epoch": 0.47287000692680675,
      "grad_norm": 1.5703125,
      "learning_rate": 1.9486703492307207e-05,
      "loss": 0.3682,
      "step": 256
    },
    {
      "epoch": 0.48025860078503807,
      "grad_norm": 1.578125,
      "learning_rate": 1.9460285501885744e-05,
      "loss": 0.3661,
      "step": 260
    },
    {
      "epoch": 0.48764719464326944,
      "grad_norm": 1.609375,
      "learning_rate": 1.943322345832618e-05,
      "loss": 0.3713,
      "step": 264
    },
    {
      "epoch": 0.4950357885015008,
      "grad_norm": 1.5078125,
      "learning_rate": 1.9405519204003577e-05,
      "loss": 0.3459,
      "step": 268
    },
    {
      "epoch": 0.5024243823597322,
      "grad_norm": 1.5703125,
      "learning_rate": 1.9377174625014485e-05,
      "loss": 0.3871,
      "step": 272
    },
    {
      "epoch": 0.5098129762179635,
      "grad_norm": 1.5625,
      "learning_rate": 1.9348191651048565e-05,
      "loss": 0.3369,
      "step": 276
    },
    {
      "epoch": 0.5172015700761948,
      "grad_norm": 1.6015625,
      "learning_rate": 1.9318572255257192e-05,
      "loss": 0.3698,
      "step": 280
    },
    {
      "epoch": 0.5245901639344263,
      "grad_norm": 1.8125,
      "learning_rate": 1.9288318454119122e-05,
      "loss": 0.3649,
      "step": 284
    },
    {
      "epoch": 0.5319787577926576,
      "grad_norm": 1.4609375,
      "learning_rate": 1.9257432307303232e-05,
      "loss": 0.3692,
      "step": 288
    },
    {
      "epoch": 0.5393673516508889,
      "grad_norm": 1.6171875,
      "learning_rate": 1.9225915917528277e-05,
      "loss": 0.3789,
      "step": 292
    },
    {
      "epoch": 0.5467559455091203,
      "grad_norm": 1.59375,
      "learning_rate": 1.919377143041975e-05,
      "loss": 0.3755,
      "step": 296
    },
    {
      "epoch": 0.5541445393673516,
      "grad_norm": 1.46875,
      "learning_rate": 1.916100103436381e-05,
      "loss": 0.3482,
      "step": 300
    },
    {
      "epoch": 0.561533133225583,
      "grad_norm": 1.84375,
      "learning_rate": 1.9127606960358274e-05,
      "loss": 0.3644,
      "step": 304
    },
    {
      "epoch": 0.5689217270838144,
      "grad_norm": 1.5078125,
      "learning_rate": 1.9093591481860772e-05,
      "loss": 0.3708,
      "step": 308
    },
    {
      "epoch": 0.5763103209420457,
      "grad_norm": 1.46875,
      "learning_rate": 1.9058956914633933e-05,
      "loss": 0.3814,
      "step": 312
    },
    {
      "epoch": 0.583698914800277,
      "grad_norm": 1.34375,
      "learning_rate": 1.9023705616587747e-05,
      "loss": 0.343,
      "step": 316
    },
    {
      "epoch": 0.5910875086585085,
      "grad_norm": 1.515625,
      "learning_rate": 1.8987839987619042e-05,
      "loss": 0.3793,
      "step": 320
    },
    {
      "epoch": 0.5984761025167398,
      "grad_norm": 1.703125,
      "learning_rate": 1.895136246944809e-05,
      "loss": 0.3553,
      "step": 324
    },
    {
      "epoch": 0.6058646963749711,
      "grad_norm": 1.59375,
      "learning_rate": 1.8914275545452377e-05,
      "loss": 0.3714,
      "step": 328
    },
    {
      "epoch": 0.6132532902332025,
      "grad_norm": 1.5859375,
      "learning_rate": 1.887658174049754e-05,
      "loss": 0.3691,
      "step": 332
    },
    {
      "epoch": 0.6206418840914338,
      "grad_norm": 1.484375,
      "learning_rate": 1.8838283620765476e-05,
      "loss": 0.341,
      "step": 336
    },
    {
      "epoch": 0.6280304779496652,
      "grad_norm": 1.546875,
      "learning_rate": 1.879938379357962e-05,
      "loss": 0.3559,
      "step": 340
    },
    {
      "epoch": 0.6354190718078966,
      "grad_norm": 1.859375,
      "learning_rate": 1.8759884907227485e-05,
      "loss": 0.344,
      "step": 344
    },
    {
      "epoch": 0.6428076656661279,
      "grad_norm": 1.5,
      "learning_rate": 1.87197896507803e-05,
      "loss": 0.3518,
      "step": 348
    },
    {
      "epoch": 0.6501962595243592,
      "grad_norm": 1.5,
      "learning_rate": 1.867910075390999e-05,
      "loss": 0.3399,
      "step": 352
    },
    {
      "epoch": 0.6575848533825907,
      "grad_norm": 1.5859375,
      "learning_rate": 1.8637820986703335e-05,
      "loss": 0.3468,
      "step": 356
    },
    {
      "epoch": 0.664973447240822,
      "grad_norm": 1.5625,
      "learning_rate": 1.8595953159473357e-05,
      "loss": 0.3479,
      "step": 360
    },
    {
      "epoch": 0.6723620410990533,
      "grad_norm": 1.6328125,
      "learning_rate": 1.8553500122568023e-05,
      "loss": 0.3497,
      "step": 364
    },
    {
      "epoch": 0.6797506349572847,
      "grad_norm": 1.546875,
      "learning_rate": 1.8510464766176178e-05,
      "loss": 0.3544,
      "step": 368
    },
    {
      "epoch": 0.687139228815516,
      "grad_norm": 1.578125,
      "learning_rate": 1.8466850020130786e-05,
      "loss": 0.344,
      "step": 372
    },
    {
      "epoch": 0.6945278226737474,
      "grad_norm": 1.65625,
      "learning_rate": 1.8422658853709475e-05,
      "loss": 0.3508,
      "step": 376
    },
    {
      "epoch": 0.7019164165319788,
      "grad_norm": 1.5625,
      "learning_rate": 1.8377894275432376e-05,
      "loss": 0.3452,
      "step": 380
    },
    {
      "epoch": 0.7093050103902101,
      "grad_norm": 1.5390625,
      "learning_rate": 1.833255933285732e-05,
      "loss": 0.3604,
      "step": 384
    },
    {
      "epoch": 0.7166936042484414,
      "grad_norm": 1.4765625,
      "learning_rate": 1.8286657112372342e-05,
      "loss": 0.3719,
      "step": 388
    },
    {
      "epoch": 0.7240821981066728,
      "grad_norm": 1.4765625,
      "learning_rate": 1.8240190738985578e-05,
      "loss": 0.356,
      "step": 392
    },
    {
      "epoch": 0.7314707919649042,
      "grad_norm": 1.546875,
      "learning_rate": 1.819316337611251e-05,
      "loss": 0.3592,
      "step": 396
    },
    {
      "epoch": 0.7388593858231355,
      "grad_norm": 1.6328125,
      "learning_rate": 1.8145578225360594e-05,
      "loss": 0.3557,
      "step": 400
    },
    {
      "epoch": 0.7462479796813669,
      "grad_norm": 1.609375,
      "learning_rate": 1.809743852631131e-05,
      "loss": 0.3532,
      "step": 404
    },
    {
      "epoch": 0.7536365735395982,
      "grad_norm": 1.4921875,
      "learning_rate": 1.8048747556299595e-05,
      "loss": 0.3723,
      "step": 408
    },
    {
      "epoch": 0.7610251673978295,
      "grad_norm": 1.6796875,
      "learning_rate": 1.7999508630190742e-05,
      "loss": 0.3604,
      "step": 412
    },
    {
      "epoch": 0.768413761256061,
      "grad_norm": 1.515625,
      "learning_rate": 1.7949725100154707e-05,
      "loss": 0.36,
      "step": 416
    },
    {
      "epoch": 0.7758023551142923,
      "grad_norm": 1.453125,
      "learning_rate": 1.789940035543792e-05,
      "loss": 0.3497,
      "step": 420
    },
    {
      "epoch": 0.7831909489725236,
      "grad_norm": 1.6796875,
      "learning_rate": 1.7848537822132498e-05,
      "loss": 0.3487,
      "step": 424
    },
    {
      "epoch": 0.790579542830755,
      "grad_norm": 1.5390625,
      "learning_rate": 1.7797140962943062e-05,
      "loss": 0.3491,
      "step": 428
    },
    {
      "epoch": 0.7979681366889864,
      "grad_norm": 1.6015625,
      "learning_rate": 1.7745213276950947e-05,
      "loss": 0.3627,
      "step": 432
    },
    {
      "epoch": 0.8053567305472177,
      "grad_norm": 1.75,
      "learning_rate": 1.7692758299376002e-05,
      "loss": 0.3534,
      "step": 436
    },
    {
      "epoch": 0.8127453244054491,
      "grad_norm": 1.4140625,
      "learning_rate": 1.7639779601335916e-05,
      "loss": 0.3565,
      "step": 440
    },
    {
      "epoch": 0.8201339182636804,
      "grad_norm": 1.59375,
      "learning_rate": 1.7586280789603104e-05,
      "loss": 0.3627,
      "step": 444
    },
    {
      "epoch": 0.8275225121219117,
      "grad_norm": 1.53125,
      "learning_rate": 1.753226550635914e-05,
      "loss": 0.3352,
      "step": 448
    },
    {
      "epoch": 0.8349111059801432,
      "grad_norm": 1.484375,
      "learning_rate": 1.7477737428946814e-05,
      "loss": 0.3499,
      "step": 452
    },
    {
      "epoch": 0.8422996998383745,
      "grad_norm": 1.6875,
      "learning_rate": 1.742270026961978e-05,
      "loss": 0.343,
      "step": 456
    },
    {
      "epoch": 0.8496882936966058,
      "grad_norm": 1.5703125,
      "learning_rate": 1.736715777528981e-05,
      "loss": 0.345,
      "step": 460
    },
    {
      "epoch": 0.8570768875548372,
      "grad_norm": 1.421875,
      "learning_rate": 1.7311113727271728e-05,
      "loss": 0.3329,
      "step": 464
    },
    {
      "epoch": 0.8644654814130686,
      "grad_norm": 1.609375,
      "learning_rate": 1.725457194102596e-05,
      "loss": 0.3299,
      "step": 468
    },
    {
      "epoch": 0.8718540752712999,
      "grad_norm": 1.375,
      "learning_rate": 1.71975362658988e-05,
      "loss": 0.3381,
      "step": 472
    },
    {
      "epoch": 0.8792426691295313,
      "grad_norm": 1.53125,
      "learning_rate": 1.7140010584860322e-05,
      "loss": 0.3554,
      "step": 476
    },
    {
      "epoch": 0.8866312629877626,
      "grad_norm": 1.7890625,
      "learning_rate": 1.708199881424005e-05,
      "loss": 0.3442,
      "step": 480
    },
    {
      "epoch": 0.8940198568459939,
      "grad_norm": 1.4921875,
      "learning_rate": 1.7023504903460325e-05,
      "loss": 0.3501,
      "step": 484
    },
    {
      "epoch": 0.9014084507042254,
      "grad_norm": 1.5,
      "learning_rate": 1.696453283476743e-05,
      "loss": 0.3751,
      "step": 488
    },
    {
      "epoch": 0.9087970445624567,
      "grad_norm": 1.5234375,
      "learning_rate": 1.6905086622960488e-05,
      "loss": 0.3613,
      "step": 492
    },
    {
      "epoch": 0.916185638420688,
      "grad_norm": 1.453125,
      "learning_rate": 1.6845170315118124e-05,
      "loss": 0.3655,
      "step": 496
    },
    {
      "epoch": 0.9235742322789194,
      "grad_norm": 1.375,
      "learning_rate": 1.678478799032295e-05,
      "loss": 0.338,
      "step": 500
    },
    {
      "epoch": 0.9309628261371508,
      "grad_norm": 1.6015625,
      "learning_rate": 1.6723943759383857e-05,
      "loss": 0.3414,
      "step": 504
    },
    {
      "epoch": 0.9383514199953821,
      "grad_norm": 1.7109375,
      "learning_rate": 1.666264176455616e-05,
      "loss": 0.3453,
      "step": 508
    },
    {
      "epoch": 0.9457400138536135,
      "grad_norm": 1.8671875,
      "learning_rate": 1.660088617925959e-05,
      "loss": 0.3275,
      "step": 512
    },
    {
      "epoch": 0.9531286077118448,
      "grad_norm": 1.65625,
      "learning_rate": 1.6538681207794153e-05,
      "loss": 0.3584,
      "step": 516
    },
    {
      "epoch": 0.9605172015700761,
      "grad_norm": 1.3984375,
      "learning_rate": 1.6476031085053937e-05,
      "loss": 0.3594,
      "step": 520
    },
    {
      "epoch": 0.9679057954283076,
      "grad_norm": 1.609375,
      "learning_rate": 1.6412940076238782e-05,
      "loss": 0.3527,
      "step": 524
    },
    {
      "epoch": 0.9752943892865389,
      "grad_norm": 1.5703125,
      "learning_rate": 1.6349412476563893e-05,
      "loss": 0.3654,
      "step": 528
    },
    {
      "epoch": 0.9826829831447702,
      "grad_norm": 1.4765625,
      "learning_rate": 1.628545261096745e-05,
      "loss": 0.3611,
      "step": 532
    },
    {
      "epoch": 0.9900715770030016,
      "grad_norm": 1.5078125,
      "learning_rate": 1.622106483381615e-05,
      "loss": 0.332,
      "step": 536
    },
    {
      "epoch": 0.997460170861233,
      "grad_norm": 1.5234375,
      "learning_rate": 1.615625352860877e-05,
      "loss": 0.329,
      "step": 540
    },
    {
      "epoch": 1.0048487647194644,
      "grad_norm": 1.1328125,
      "learning_rate": 1.6091023107677747e-05,
      "loss": 0.2864,
      "step": 544
    },
    {
      "epoch": 1.0122373585776956,
      "grad_norm": 1.3828125,
      "learning_rate": 1.602537801188876e-05,
      "loss": 0.2337,
      "step": 548
    },
    {
      "epoch": 1.019625952435927,
      "grad_norm": 1.5078125,
      "learning_rate": 1.5959322710338426e-05,
      "loss": 0.2459,
      "step": 552
    },
    {
      "epoch": 1.0270145462941584,
      "grad_norm": 1.53125,
      "learning_rate": 1.589286170005005e-05,
      "loss": 0.2397,
      "step": 556
    },
    {
      "epoch": 1.0344031401523897,
      "grad_norm": 1.6015625,
      "learning_rate": 1.5825999505667426e-05,
      "loss": 0.255,
      "step": 560
    },
    {
      "epoch": 1.041791734010621,
      "grad_norm": 1.3125,
      "learning_rate": 1.5758740679146858e-05,
      "loss": 0.2221,
      "step": 564
    },
    {
      "epoch": 1.0491803278688525,
      "grad_norm": 1.3203125,
      "learning_rate": 1.569108979944722e-05,
      "loss": 0.218,
      "step": 568
    },
    {
      "epoch": 1.0565689217270837,
      "grad_norm": 1.5078125,
      "learning_rate": 1.562305147221824e-05,
      "loss": 0.2313,
      "step": 572
    },
    {
      "epoch": 1.0639575155853152,
      "grad_norm": 1.59375,
      "learning_rate": 1.5554630329486954e-05,
      "loss": 0.2245,
      "step": 576
    },
    {
      "epoch": 1.0713461094435466,
      "grad_norm": 1.609375,
      "learning_rate": 1.548583102934234e-05,
      "loss": 0.2396,
      "step": 580
    },
    {
      "epoch": 1.0787347033017778,
      "grad_norm": 1.4453125,
      "learning_rate": 1.541665825561822e-05,
      "loss": 0.2248,
      "step": 584
    },
    {
      "epoch": 1.0861232971600092,
      "grad_norm": 1.421875,
      "learning_rate": 1.5347116717574367e-05,
      "loss": 0.2263,
      "step": 588
    },
    {
      "epoch": 1.0935118910182406,
      "grad_norm": 1.5625,
      "learning_rate": 1.5277211149575915e-05,
      "loss": 0.2355,
      "step": 592
    },
    {
      "epoch": 1.1009004848764719,
      "grad_norm": 1.5390625,
      "learning_rate": 1.5206946310771034e-05,
      "loss": 0.2197,
      "step": 596
    },
    {
      "epoch": 1.1082890787347033,
      "grad_norm": 1.5625,
      "learning_rate": 1.5136326984766934e-05,
      "loss": 0.2385,
      "step": 600
    },
    {
      "epoch": 1.1156776725929347,
      "grad_norm": 1.4921875,
      "learning_rate": 1.5065357979304191e-05,
      "loss": 0.2325,
      "step": 604
    },
    {
      "epoch": 1.123066266451166,
      "grad_norm": 1.421875,
      "learning_rate": 1.4994044125929458e-05,
      "loss": 0.233,
      "step": 608
    },
    {
      "epoch": 1.1304548603093973,
      "grad_norm": 1.4609375,
      "learning_rate": 1.492239027966651e-05,
      "loss": 0.2392,
      "step": 612
    },
    {
      "epoch": 1.1378434541676288,
      "grad_norm": 1.515625,
      "learning_rate": 1.4850401318685728e-05,
      "loss": 0.2245,
      "step": 616
    },
    {
      "epoch": 1.14523204802586,
      "grad_norm": 1.59375,
      "learning_rate": 1.4778082143971992e-05,
      "loss": 0.2314,
      "step": 620
    },
    {
      "epoch": 1.1526206418840914,
      "grad_norm": 1.515625,
      "learning_rate": 1.4705437678991034e-05,
      "loss": 0.2159,
      "step": 624
    },
    {
      "epoch": 1.1600092357423228,
      "grad_norm": 1.3828125,
      "learning_rate": 1.463247286935423e-05,
      "loss": 0.2289,
      "step": 628
    },
    {
      "epoch": 1.167397829600554,
      "grad_norm": 1.3828125,
      "learning_rate": 1.455919268248192e-05,
      "loss": 0.2445,
      "step": 632
    },
    {
      "epoch": 1.1747864234587855,
      "grad_norm": 1.3984375,
      "learning_rate": 1.4485602107265221e-05,
      "loss": 0.2343,
      "step": 636
    },
    {
      "epoch": 1.182175017317017,
      "grad_norm": 1.5234375,
      "learning_rate": 1.4411706153726391e-05,
      "loss": 0.2249,
      "step": 640
    },
    {
      "epoch": 1.1895636111752481,
      "grad_norm": 1.515625,
      "learning_rate": 1.4337509852677735e-05,
      "loss": 0.2241,
      "step": 644
    },
    {
      "epoch": 1.1969522050334795,
      "grad_norm": 1.515625,
      "learning_rate": 1.4263018255379132e-05,
      "loss": 0.2236,
      "step": 648
    },
    {
      "epoch": 1.204340798891711,
      "grad_norm": 1.4453125,
      "learning_rate": 1.4188236433194117e-05,
      "loss": 0.2215,
      "step": 652
    },
    {
      "epoch": 1.2117293927499422,
      "grad_norm": 1.53125,
      "learning_rate": 1.4113169477244659e-05,
      "loss": 0.2194,
      "step": 656
    },
    {
      "epoch": 1.2191179866081736,
      "grad_norm": 1.5546875,
      "learning_rate": 1.4037822498064517e-05,
      "loss": 0.2309,
      "step": 660
    },
    {
      "epoch": 1.226506580466405,
      "grad_norm": 1.4296875,
      "learning_rate": 1.3962200625251363e-05,
      "loss": 0.2195,
      "step": 664
    },
    {
      "epoch": 1.2338951743246362,
      "grad_norm": 1.5859375,
      "learning_rate": 1.3886309007117522e-05,
      "loss": 0.2402,
      "step": 668
    },
    {
      "epoch": 1.2412837681828677,
      "grad_norm": 1.53125,
      "learning_rate": 1.3810152810339496e-05,
      "loss": 0.2419,
      "step": 672
    },
    {
      "epoch": 1.248672362041099,
      "grad_norm": 1.4296875,
      "learning_rate": 1.3733737219606222e-05,
      "loss": 0.2321,
      "step": 676
    },
    {
      "epoch": 1.2560609558993305,
      "grad_norm": 1.3046875,
      "learning_rate": 1.3657067437266078e-05,
      "loss": 0.2086,
      "step": 680
    },
    {
      "epoch": 1.2634495497575617,
      "grad_norm": 1.5,
      "learning_rate": 1.3580148682972739e-05,
      "loss": 0.2302,
      "step": 684
    },
    {
      "epoch": 1.2708381436157932,
      "grad_norm": 1.453125,
      "learning_rate": 1.3502986193329793e-05,
      "loss": 0.2251,
      "step": 688
    },
    {
      "epoch": 1.2782267374740246,
      "grad_norm": 1.3515625,
      "learning_rate": 1.3425585221534266e-05,
      "loss": 0.2315,
      "step": 692
    },
    {
      "epoch": 1.2856153313322558,
      "grad_norm": 1.546875,
      "learning_rate": 1.3347951037018961e-05,
      "loss": 0.2234,
      "step": 696
    },
    {
      "epoch": 1.2930039251904872,
      "grad_norm": 1.4140625,
      "learning_rate": 1.3270088925093726e-05,
      "loss": 0.2236,
      "step": 700
    },
    {
      "epoch": 1.3003925190487187,
      "grad_norm": 1.65625,
      "learning_rate": 1.3192004186585643e-05,
      "loss": 0.2263,
      "step": 704
    },
    {
      "epoch": 1.3077811129069499,
      "grad_norm": 1.46875,
      "learning_rate": 1.311370213747813e-05,
      "loss": 0.2368,
      "step": 708
    },
    {
      "epoch": 1.3151697067651813,
      "grad_norm": 1.4609375,
      "learning_rate": 1.3035188108549035e-05,
      "loss": 0.2393,
      "step": 712
    },
    {
      "epoch": 1.3225583006234127,
      "grad_norm": 2.015625,
      "learning_rate": 1.2956467445007736e-05,
      "loss": 0.2355,
      "step": 716
    },
    {
      "epoch": 1.329946894481644,
      "grad_norm": 1.5078125,
      "learning_rate": 1.2877545506131219e-05,
      "loss": 0.2286,
      "step": 720
    },
    {
      "epoch": 1.3373354883398754,
      "grad_norm": 1.4921875,
      "learning_rate": 1.2798427664899232e-05,
      "loss": 0.2264,
      "step": 724
    },
    {
      "epoch": 1.3447240821981068,
      "grad_norm": 1.5703125,
      "learning_rate": 1.271911930762848e-05,
      "loss": 0.2223,
      "step": 728
    },
    {
      "epoch": 1.352112676056338,
      "grad_norm": 1.640625,
      "learning_rate": 1.2639625833605959e-05,
      "loss": 0.2247,
      "step": 732
    },
    {
      "epoch": 1.3595012699145694,
      "grad_norm": 1.59375,
      "learning_rate": 1.255995265472134e-05,
      "loss": 0.2314,
      "step": 736
    },
    {
      "epoch": 1.3668898637728009,
      "grad_norm": 1.5546875,
      "learning_rate": 1.2480105195098537e-05,
      "loss": 0.2381,
      "step": 740
    },
    {
      "epoch": 1.374278457631032,
      "grad_norm": 1.546875,
      "learning_rate": 1.2400088890726451e-05,
      "loss": 0.2292,
      "step": 744
    },
    {
      "epoch": 1.3816670514892635,
      "grad_norm": 1.515625,
      "learning_rate": 1.231990918908887e-05,
      "loss": 0.2222,
      "step": 748
    },
    {
      "epoch": 1.389055645347495,
      "grad_norm": 1.8125,
      "learning_rate": 1.2239571548793618e-05,
      "loss": 0.2149,
      "step": 752
    },
    {
      "epoch": 1.3964442392057261,
      "grad_norm": 1.515625,
      "learning_rate": 1.2159081439200933e-05,
      "loss": 0.2465,
      "step": 756
    },
    {
      "epoch": 1.4038328330639576,
      "grad_norm": 1.546875,
      "learning_rate": 1.2078444340051107e-05,
      "loss": 0.2329,
      "step": 760
    },
    {
      "epoch": 1.411221426922189,
      "grad_norm": 1.6328125,
      "learning_rate": 1.199766574109144e-05,
      "loss": 0.2284,
      "step": 764
    },
    {
      "epoch": 1.4186100207804202,
      "grad_norm": 1.4765625,
      "learning_rate": 1.1916751141702485e-05,
      "loss": 0.2453,
      "step": 768
    },
    {
      "epoch": 1.4259986146386516,
      "grad_norm": 1.640625,
      "learning_rate": 1.1835706050523663e-05,
      "loss": 0.2267,
      "step": 772
    },
    {
      "epoch": 1.433387208496883,
      "grad_norm": 1.59375,
      "learning_rate": 1.1754535985078236e-05,
      "loss": 0.2178,
      "step": 776
    },
    {
      "epoch": 1.4407758023551143,
      "grad_norm": 1.4765625,
      "learning_rate": 1.1673246471397672e-05,
      "loss": 0.2171,
      "step": 780
    },
    {
      "epoch": 1.4481643962133457,
      "grad_norm": 1.453125,
      "learning_rate": 1.1591843043645432e-05,
      "loss": 0.2279,
      "step": 784
    },
    {
      "epoch": 1.4555529900715771,
      "grad_norm": 1.6015625,
      "learning_rate": 1.1510331243740214e-05,
      "loss": 0.2298,
      "step": 788
    },
    {
      "epoch": 1.4629415839298083,
      "grad_norm": 1.53125,
      "learning_rate": 1.1428716620978654e-05,
      "loss": 0.2319,
      "step": 792
    },
    {
      "epoch": 1.4703301777880398,
      "grad_norm": 1.4609375,
      "learning_rate": 1.134700473165754e-05,
      "loss": 0.2392,
      "step": 796
    },
    {
      "epoch": 1.4777187716462712,
      "grad_norm": 1.5234375,
      "learning_rate": 1.1265201138695526e-05,
      "loss": 0.2313,
      "step": 800
    },
    {
      "epoch": 1.4851073655045024,
      "grad_norm": 1.421875,
      "learning_rate": 1.1183311411254425e-05,
      "loss": 0.2021,
      "step": 804
    },
    {
      "epoch": 1.4924959593627338,
      "grad_norm": 1.3984375,
      "learning_rate": 1.1101341124360056e-05,
      "loss": 0.2178,
      "step": 808
    },
    {
      "epoch": 1.4998845532209653,
      "grad_norm": 1.609375,
      "learning_rate": 1.1019295858522708e-05,
      "loss": 0.2268,
      "step": 812
    },
    {
      "epoch": 1.5072731470791965,
      "grad_norm": 1.6015625,
      "learning_rate": 1.0937181199357192e-05,
      "loss": 0.2244,
      "step": 816
    },
    {
      "epoch": 1.514661740937428,
      "grad_norm": 1.484375,
      "learning_rate": 1.0855002737202617e-05,
      "loss": 0.2311,
      "step": 820
    },
    {
      "epoch": 1.5220503347956593,
      "grad_norm": 1.59375,
      "learning_rate": 1.0772766066741763e-05,
      "loss": 0.2314,
      "step": 824
    },
    {
      "epoch": 1.5294389286538905,
      "grad_norm": 1.5234375,
      "learning_rate": 1.0690476786620214e-05,
      "loss": 0.2357,
      "step": 828
    },
    {
      "epoch": 1.536827522512122,
      "grad_norm": 1.4921875,
      "learning_rate": 1.060814049906521e-05,
      "loss": 0.2126,
      "step": 832
    },
    {
      "epoch": 1.5442161163703534,
      "grad_norm": 1.4453125,
      "learning_rate": 1.0525762809504234e-05,
      "loss": 0.235,
      "step": 836
    },
    {
      "epoch": 1.5516047102285846,
      "grad_norm": 1.453125,
      "learning_rate": 1.0443349326183409e-05,
      "loss": 0.2274,
      "step": 840
    },
    {
      "epoch": 1.558993304086816,
      "grad_norm": 1.640625,
      "learning_rate": 1.0360905659785682e-05,
      "loss": 0.2404,
      "step": 844
    },
    {
      "epoch": 1.5663818979450475,
      "grad_norm": 1.5625,
      "learning_rate": 1.0278437423048862e-05,
      "loss": 0.2124,
      "step": 848
    },
    {
      "epoch": 1.5737704918032787,
      "grad_norm": 1.578125,
      "learning_rate": 1.0195950230383482e-05,
      "loss": 0.2218,
      "step": 852
    },
    {
      "epoch": 1.58115908566151,
      "grad_norm": 1.671875,
      "learning_rate": 1.011344969749061e-05,
      "loss": 0.2298,
      "step": 856
    },
    {
      "epoch": 1.5885476795197415,
      "grad_norm": 1.359375,
      "learning_rate": 1.0030941440979489e-05,
      "loss": 0.2238,
      "step": 860
    },
    {
      "epoch": 1.5959362733779727,
      "grad_norm": 1.390625,
      "learning_rate": 9.948431077985203e-06,
      "loss": 0.2225,
      "step": 864
    },
    {
      "epoch": 1.6033248672362042,
      "grad_norm": 1.515625,
      "learning_rate": 9.865924225786241e-06,
      "loss": 0.227,
      "step": 868
    },
    {
      "epoch": 1.6107134610944356,
      "grad_norm": 1.453125,
      "learning_rate": 9.783426501422063e-06,
      "loss": 0.2291,
      "step": 872
    },
    {
      "epoch": 1.6181020549526668,
      "grad_norm": 1.6171875,
      "learning_rate": 9.700943521310727e-06,
      "loss": 0.2179,
      "step": 876
    },
    {
      "epoch": 1.6254906488108982,
      "grad_norm": 1.390625,
      "learning_rate": 9.618480900866502e-06,
      "loss": 0.2161,
      "step": 880
    },
    {
      "epoch": 1.6328792426691296,
      "grad_norm": 1.4765625,
      "learning_rate": 9.536044254117572e-06,
      "loss": 0.2139,
      "step": 884
    },
    {
      "epoch": 1.6402678365273609,
      "grad_norm": 1.4609375,
      "learning_rate": 9.453639193323844e-06,
      "loss": 0.226,
      "step": 888
    },
    {
      "epoch": 1.6476564303855923,
      "grad_norm": 1.390625,
      "learning_rate": 9.371271328594867e-06,
      "loss": 0.2072,
      "step": 892
    },
    {
      "epoch": 1.6550450242438237,
      "grad_norm": 1.484375,
      "learning_rate": 9.28894626750789e-06,
      "loss": 0.2195,
      "step": 896
    },
    {
      "epoch": 1.662433618102055,
      "grad_norm": 1.609375,
      "learning_rate": 9.2066696147261e-06,
      "loss": 0.2293,
      "step": 900
    },
    {
      "epoch": 1.6698222119602864,
      "grad_norm": 1.5859375,
      "learning_rate": 9.124446971617078e-06,
      "loss": 0.2313,
      "step": 904
    },
    {
      "epoch": 1.6772108058185178,
      "grad_norm": 1.4765625,
      "learning_rate": 9.042283935871425e-06,
      "loss": 0.2237,
      "step": 908
    },
    {
      "epoch": 1.684599399676749,
      "grad_norm": 1.671875,
      "learning_rate": 8.9601861011217e-06,
      "loss": 0.2418,
      "step": 912
    },
    {
      "epoch": 1.6919879935349804,
      "grad_norm": 1.4375,
      "learning_rate": 8.878159056561605e-06,
      "loss": 0.2304,
      "step": 916
    },
    {
      "epoch": 1.6993765873932118,
      "grad_norm": 1.4921875,
      "learning_rate": 8.796208386565465e-06,
      "loss": 0.2202,
      "step": 920
    },
    {
      "epoch": 1.706765181251443,
      "grad_norm": 1.3984375,
      "learning_rate": 8.714339670308045e-06,
      "loss": 0.2104,
      "step": 924
    },
    {
      "epoch": 1.7141537751096745,
      "grad_norm": 1.5234375,
      "learning_rate": 8.632558481384729e-06,
      "loss": 0.2206,
      "step": 928
    },
    {
      "epoch": 1.721542368967906,
      "grad_norm": 1.4765625,
      "learning_rate": 8.550870387432075e-06,
      "loss": 0.2187,
      "step": 932
    },
    {
      "epoch": 1.7289309628261371,
      "grad_norm": 1.4453125,
      "learning_rate": 8.469280949748764e-06,
      "loss": 0.2283,
      "step": 936
    },
    {
      "epoch": 1.7363195566843685,
      "grad_norm": 1.5234375,
      "learning_rate": 8.38779572291698e-06,
      "loss": 0.2195,
      "step": 940
    },
    {
      "epoch": 1.7437081505426,
      "grad_norm": 1.5078125,
      "learning_rate": 8.306420254424274e-06,
      "loss": 0.2237,
      "step": 944
    },
    {
      "epoch": 1.7510967444008312,
      "grad_norm": 1.40625,
      "learning_rate": 8.225160084285893e-06,
      "loss": 0.2109,
      "step": 948
    },
    {
      "epoch": 1.7584853382590626,
      "grad_norm": 1.75,
      "learning_rate": 8.144020744667596e-06,
      "loss": 0.2244,
      "step": 952
    },
    {
      "epoch": 1.765873932117294,
      "grad_norm": 1.640625,
      "learning_rate": 8.063007759509044e-06,
      "loss": 0.2318,
      "step": 956
    },
    {
      "epoch": 1.7732625259755253,
      "grad_norm": 1.4765625,
      "learning_rate": 7.982126644147733e-06,
      "loss": 0.2318,
      "step": 960
    },
    {
      "epoch": 1.7806511198337567,
      "grad_norm": 1.515625,
      "learning_rate": 7.901382904943505e-06,
      "loss": 0.2226,
      "step": 964
    },
    {
      "epoch": 1.788039713691988,
      "grad_norm": 1.4453125,
      "learning_rate": 7.82078203890367e-06,
      "loss": 0.2323,
      "step": 968
    },
    {
      "epoch": 1.7954283075502193,
      "grad_norm": 1.609375,
      "learning_rate": 7.740329533308793e-06,
      "loss": 0.2209,
      "step": 972
    },
    {
      "epoch": 1.8028169014084507,
      "grad_norm": 1.609375,
      "learning_rate": 7.660030865339096e-06,
      "loss": 0.2228,
      "step": 976
    },
    {
      "epoch": 1.8102054952666822,
      "grad_norm": 1.5859375,
      "learning_rate": 7.57989150170159e-06,
      "loss": 0.2378,
      "step": 980
    },
    {
      "epoch": 1.8175940891249134,
      "grad_norm": 1.5703125,
      "learning_rate": 7.499916898257906e-06,
      "loss": 0.2257,
      "step": 984
    },
    {
      "epoch": 1.8249826829831448,
      "grad_norm": 1.578125,
      "learning_rate": 7.420112499652851e-06,
      "loss": 0.2138,
      "step": 988
    },
    {
      "epoch": 1.8323712768413762,
      "grad_norm": 1.46875,
      "learning_rate": 7.3404837389437314e-06,
      "loss": 0.2219,
      "step": 992
    },
    {
      "epoch": 1.8397598706996074,
      "grad_norm": 1.625,
      "learning_rate": 7.261036037230499e-06,
      "loss": 0.2291,
      "step": 996
    },
    {
      "epoch": 1.8471484645578389,
      "grad_norm": 1.6015625,
      "learning_rate": 7.181774803286661e-06,
      "loss": 0.2437,
      "step": 1000
    },
    {
      "epoch": 1.8545370584160703,
      "grad_norm": 1.5,
      "learning_rate": 7.102705433191066e-06,
      "loss": 0.2206,
      "step": 1004
    },
    {
      "epoch": 1.8619256522743015,
      "grad_norm": 1.5,
      "learning_rate": 7.02383330996052e-06,
      "loss": 0.228,
      "step": 1008
    },
    {
      "epoch": 1.869314246132533,
      "grad_norm": 1.4921875,
      "learning_rate": 6.945163803183345e-06,
      "loss": 0.2234,
      "step": 1012
    },
    {
      "epoch": 1.8767028399907644,
      "grad_norm": 1.65625,
      "learning_rate": 6.866702268653787e-06,
      "loss": 0.2359,
      "step": 1016
    },
    {
      "epoch": 1.8840914338489956,
      "grad_norm": 1.625,
      "learning_rate": 6.788454048007418e-06,
      "loss": 0.2411,
      "step": 1020
    },
    {
      "epoch": 1.891480027707227,
      "grad_norm": 1.4921875,
      "learning_rate": 6.710424468357471e-06,
      "loss": 0.2315,
      "step": 1024
    },
    {
      "epoch": 1.8988686215654584,
      "grad_norm": 1.4375,
      "learning_rate": 6.632618841932164e-06,
      "loss": 0.2092,
      "step": 1028
    },
    {
      "epoch": 1.9062572154236896,
      "grad_norm": 1.5859375,
      "learning_rate": 6.555042465713059e-06,
      "loss": 0.2192,
      "step": 1032
    },
    {
      "epoch": 1.913645809281921,
      "grad_norm": 1.6015625,
      "learning_rate": 6.4777006210744375e-06,
      "loss": 0.2158,
      "step": 1036
    },
    {
      "epoch": 1.9210344031401525,
      "grad_norm": 1.5078125,
      "learning_rate": 6.400598573423753e-06,
      "loss": 0.2213,
      "step": 1040
    },
    {
      "epoch": 1.9284229969983837,
      "grad_norm": 1.5390625,
      "learning_rate": 6.323741571843145e-06,
      "loss": 0.2106,
      "step": 1044
    },
    {
      "epoch": 1.9358115908566151,
      "grad_norm": 1.609375,
      "learning_rate": 6.24713484873211e-06,
      "loss": 0.2225,
      "step": 1048
    },
    {
      "epoch": 1.9432001847148466,
      "grad_norm": 1.5625,
      "learning_rate": 6.170783619451264e-06,
      "loss": 0.2151,
      "step": 1052
    },
    {
      "epoch": 1.9505887785730778,
      "grad_norm": 1.4921875,
      "learning_rate": 6.094693081967289e-06,
      "loss": 0.2305,
      "step": 1056
    },
    {
      "epoch": 1.9579773724313092,
      "grad_norm": 1.609375,
      "learning_rate": 6.018868416499046e-06,
      "loss": 0.2196,
      "step": 1060
    },
    {
      "epoch": 1.9653659662895406,
      "grad_norm": 1.625,
      "learning_rate": 5.943314785164924e-06,
      "loss": 0.2395,
      "step": 1064
    },
    {
      "epoch": 1.9727545601477718,
      "grad_norm": 1.546875,
      "learning_rate": 5.8680373316314e-06,
      "loss": 0.2275,
      "step": 1068
    },
    {
      "epoch": 1.9801431540060033,
      "grad_norm": 1.609375,
      "learning_rate": 5.793041180762845e-06,
      "loss": 0.2246,
      "step": 1072
    },
    {
      "epoch": 1.9875317478642347,
      "grad_norm": 1.453125,
      "learning_rate": 5.7183314382726555e-06,
      "loss": 0.2268,
      "step": 1076
    },
    {
      "epoch": 1.994920341722466,
      "grad_norm": 1.65625,
      "learning_rate": 5.643913190375614e-06,
      "loss": 0.2132,
      "step": 1080
    }
  ],
  "logging_steps": 4,
  "max_steps": 1623,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2.3964435361415823e+19,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}
|
|