|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9993073193257908, |
|
"eval_steps": 500, |
|
"global_step": 541, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0073885938582313555, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 1.6000000000000001e-06, |
|
"loss": 0.2922, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.014777187716462711, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 3.2000000000000003e-06, |
|
"loss": 0.3078, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.022165781574694066, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 4.800000000000001e-06, |
|
"loss": 0.2979, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.029554375432925422, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 6.4000000000000006e-06, |
|
"loss": 0.2788, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.03694296929115678, |
|
"grad_norm": 1.9921875, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 0.3001, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.04433156314938813, |
|
"grad_norm": 1.828125, |
|
"learning_rate": 9.600000000000001e-06, |
|
"loss": 0.304, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.051720157007619484, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 1.1200000000000001e-05, |
|
"loss": 0.2816, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.059108750865850844, |
|
"grad_norm": 1.9375, |
|
"learning_rate": 1.2800000000000001e-05, |
|
"loss": 0.277, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.0664973447240822, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 1.4400000000000001e-05, |
|
"loss": 0.2677, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.07388593858231356, |
|
"grad_norm": 1.96875, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 0.2739, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.0812745324405449, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 1.76e-05, |
|
"loss": 0.2485, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.08866312629877626, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 1.9200000000000003e-05, |
|
"loss": 0.2675, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.09605172015700762, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 1.9999181232057437e-05, |
|
"loss": 0.2547, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.10344031401523897, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 1.9992631892952108e-05, |
|
"loss": 0.2516, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.11082890787347033, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 1.9979537504476945e-05, |
|
"loss": 0.2558, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.11821750173170169, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 1.995990664329323e-05, |
|
"loss": 0.2382, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.12560609558993305, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 1.993375216737042e-05, |
|
"loss": 0.2376, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.1329946894481644, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 1.9901091207564326e-05, |
|
"loss": 0.2359, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.14038328330639574, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 1.986194515639662e-05, |
|
"loss": 0.2217, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.1477718771646271, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 1.981633965404302e-05, |
|
"loss": 0.2247, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.15516047102285846, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 1.9764304571539266e-05, |
|
"loss": 0.2082, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.1625490648810898, |
|
"grad_norm": 1.875, |
|
"learning_rate": 1.9705873991215973e-05, |
|
"loss": 0.2053, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.16993765873932118, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 1.9641086184375148e-05, |
|
"loss": 0.1989, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.17732625259755253, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 1.956998358622293e-05, |
|
"loss": 0.2069, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.18471484645578387, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 1.9492612768075094e-05, |
|
"loss": 0.2001, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.19210344031401524, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 1.940902440685339e-05, |
|
"loss": 0.1971, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.1994920341722466, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 1.9319273251892805e-05, |
|
"loss": 0.2056, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.20688062803047794, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 1.922341808908144e-05, |
|
"loss": 0.2129, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.2142692218887093, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 1.912152170235646e-05, |
|
"loss": 0.2068, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.22165781574694066, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 1.9013650832581424e-05, |
|
"loss": 0.203, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.229046409605172, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 1.8899876133831835e-05, |
|
"loss": 0.1934, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.23643500346340338, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 1.8780272127117606e-05, |
|
"loss": 0.1941, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.24382359732163472, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 1.865491715157273e-05, |
|
"loss": 0.2016, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.2512121911798661, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 1.852389331314411e-05, |
|
"loss": 0.1838, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.2586007850380974, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 1.838728643081321e-05, |
|
"loss": 0.2002, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.2659893788963288, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 1.8245185980385673e-05, |
|
"loss": 0.1916, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.27337797275456016, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 1.809768503588578e-05, |
|
"loss": 0.2103, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.2807665666127915, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 1.7944880208594156e-05, |
|
"loss": 0.1959, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.28815516047102285, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 1.7786871583768536e-05, |
|
"loss": 0.1975, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.2955437543292542, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 1.7623762655089208e-05, |
|
"loss": 0.1788, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.30293234818748555, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 1.745566025687193e-05, |
|
"loss": 0.1997, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.3103209420457169, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 1.728267449409278e-05, |
|
"loss": 0.1971, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.3177095359039483, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 1.7104918670270763e-05, |
|
"loss": 0.1835, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.3250981297621796, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 1.692250921325544e-05, |
|
"loss": 0.1883, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.332486723620411, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 1.6735565598968114e-05, |
|
"loss": 0.1825, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.33987531747864236, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 1.6544210273146608e-05, |
|
"loss": 0.1931, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.3472639113368737, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 1.6348568571144816e-05, |
|
"loss": 0.183, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.35465250519510505, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 1.6148768635839623e-05, |
|
"loss": 0.1966, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.3620410990533364, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 1.5944941333698912e-05, |
|
"loss": 0.188, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.36942969291156774, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 1.5737220169065656e-05, |
|
"loss": 0.1858, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.3768182867697991, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 1.552574119671423e-05, |
|
"loss": 0.1842, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.3842068806280305, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 1.5310642932736253e-05, |
|
"loss": 0.1899, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.3915954744862618, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 1.5092066263814245e-05, |
|
"loss": 0.1785, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.3989840683444932, |
|
"grad_norm": 1.875, |
|
"learning_rate": 1.487015435494263e-05, |
|
"loss": 0.1949, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.40637266220272455, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 1.464505255565643e-05, |
|
"loss": 0.1787, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.4137612560609559, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 1.4416908304829142e-05, |
|
"loss": 0.173, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.42114984991918725, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 1.4185871034102117e-05, |
|
"loss": 0.1945, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.4285384437774186, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 1.3952092070008669e-05, |
|
"loss": 0.1949, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.43592703763564994, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 1.3715724534857127e-05, |
|
"loss": 0.1824, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.4433156314938813, |
|
"grad_norm": 1.625, |
|
"learning_rate": 1.347692324643759e-05, |
|
"loss": 0.1844, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.4507042253521127, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 1.323584461661823e-05, |
|
"loss": 0.193, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.458092819210344, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 1.2992646548897442e-05, |
|
"loss": 0.195, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.4654814130685754, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 1.2747488334979064e-05, |
|
"loss": 0.1844, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.47287000692680675, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 1.2500530550438232e-05, |
|
"loss": 0.1919, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.48025860078503807, |
|
"grad_norm": 1.75, |
|
"learning_rate": 1.2251934949546446e-05, |
|
"loss": 0.1966, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.48764719464326944, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 1.200186435932449e-05, |
|
"loss": 0.1924, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.4950357885015008, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 1.1750482572892781e-05, |
|
"loss": 0.1744, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.5024243823597322, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 1.1497954242188913e-05, |
|
"loss": 0.2026, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.5098129762179635, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 1.1244444770122707e-05, |
|
"loss": 0.1669, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.5172015700761948, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 1.0990120202239324e-05, |
|
"loss": 0.187, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.5245901639344263, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 1.073514711796155e-05, |
|
"loss": 0.1832, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.5319787577926576, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 1.0479692521482316e-05, |
|
"loss": 0.1834, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.5393673516508889, |
|
"grad_norm": 1.828125, |
|
"learning_rate": 1.0223923732379049e-05, |
|
"loss": 0.1929, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.5467559455091203, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 9.96800827602143e-06, |
|
"loss": 0.1922, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.5541445393673516, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 9.712113773844361e-06, |
|
"loss": 0.1787, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.561533133225583, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 9.456407833558019e-06, |
|
"loss": 0.1795, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.5689217270838144, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 9.201057939366896e-06, |
|
"loss": 0.1902, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.5763103209420457, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 8.94623134226972e-06, |
|
"loss": 0.2048, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.583698914800277, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 8.692094950512145e-06, |
|
"loss": 0.1799, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.5910875086585085, |
|
"grad_norm": 1.75, |
|
"learning_rate": 8.438815220263942e-06, |
|
"loss": 0.1958, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.5984761025167398, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 8.186558046592247e-06, |
|
"loss": 0.1917, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.6058646963749711, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 7.935488654802395e-06, |
|
"loss": 0.2007, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.6132532902332025, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 7.685771492217387e-06, |
|
"loss": 0.1977, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.6206418840914338, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 7.437570120466943e-06, |
|
"loss": 0.173, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.6280304779496652, |
|
"grad_norm": 1.625, |
|
"learning_rate": 7.1910471083566725e-06, |
|
"loss": 0.1826, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.6354190718078966, |
|
"grad_norm": 1.875, |
|
"learning_rate": 6.946363925387546e-06, |
|
"loss": 0.1842, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.6428076656661279, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 6.7036808359953585e-06, |
|
"loss": 0.1834, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.6501962595243592, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 6.463156794579543e-06, |
|
"loss": 0.189, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.6575848533825907, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 6.224949341390017e-06, |
|
"loss": 0.1825, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.664973447240822, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 5.989214499340267e-06, |
|
"loss": 0.1802, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.6723620410990533, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 5.756106671814301e-06, |
|
"loss": 0.1877, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.6797506349572847, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 5.52577854153435e-06, |
|
"loss": 0.1963, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.687139228815516, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 5.298380970555584e-06, |
|
"loss": 0.1854, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.6945278226737474, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 5.074062901453352e-06, |
|
"loss": 0.1864, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.7019164165319788, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 4.852971259767642e-06, |
|
"loss": 0.1905, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.7093050103902101, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 4.635250857768696e-06, |
|
"loss": 0.1988, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.7166936042484414, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 4.4210442996067724e-06, |
|
"loss": 0.1995, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.7240821981066728, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 4.210491887908201e-06, |
|
"loss": 0.1927, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.7314707919649042, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 4.0037315318789e-06, |
|
"loss": 0.1926, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.7388593858231355, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 3.800898656975599e-06, |
|
"loss": 0.1961, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.7462479796813669, |
|
"grad_norm": 1.9921875, |
|
"learning_rate": 3.602126116203819e-06, |
|
"loss": 0.1981, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.7536365735395982, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 3.407544103100824e-06, |
|
"loss": 0.2058, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.7610251673978295, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 3.217280066460472e-06, |
|
"loss": 0.2083, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 0.768413761256061, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 3.0314586268558486e-06, |
|
"loss": 0.2052, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.7758023551142923, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 2.8502014950143376e-06, |
|
"loss": 0.1999, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.7831909489725236, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 2.6736273920986166e-06, |
|
"loss": 0.1983, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.790579542830755, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 2.5018519719457725e-06, |
|
"loss": 0.2031, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 0.7979681366889864, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 2.334987745315478e-06, |
|
"loss": 0.2108, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.8053567305472177, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 2.1731440061968536e-06, |
|
"loss": 0.2072, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 0.8127453244054491, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 2.016426760222259e-06, |
|
"loss": 0.2121, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.8201339182636804, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 1.8649386552349136e-06, |
|
"loss": 0.2142, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 0.8275225121219117, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 1.718778914055873e-06, |
|
"loss": 0.1974, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.8349111059801432, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 1.5780432694942815e-06, |
|
"loss": 0.2049, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 0.8422996998383745, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 1.4428239016435953e-06, |
|
"loss": 0.1979, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 0.8496882936966058, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 1.3132093775047616e-06, |
|
"loss": 0.2086, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.8570768875548372, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 1.1892845929759412e-06, |
|
"loss": 0.2019, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 0.8644654814130686, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 1.07113071724675e-06, |
|
"loss": 0.2005, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.8718540752712999, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 9.588251396334524e-07, |
|
"loss": 0.206, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 0.8792426691295313, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 8.524414188899266e-07, |
|
"loss": 0.2248, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 0.8866312629877626, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 7.520492350275876e-07, |
|
"loss": 0.207, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.8940198568459939, |
|
"grad_norm": 1.875, |
|
"learning_rate": 6.577143436758659e-07, |
|
"loss": 0.2118, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 0.9014084507042254, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 5.694985330130698e-07, |
|
"loss": 0.2358, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 0.9087970445624567, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 4.874595832959061e-07, |
|
"loss": 0.2224, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 0.916185638420688, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 4.1165122901414055e-07, |
|
"loss": 0.2375, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 0.9235742322789194, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 3.4212312369516496e-07, |
|
"loss": 0.2169, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.9309628261371508, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 2.789208073815608e-07, |
|
"loss": 0.2191, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 0.9383514199953821, |
|
"grad_norm": 2.125, |
|
"learning_rate": 2.220856768029367e-07, |
|
"loss": 0.2225, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 0.9457400138536135, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 1.7165495826158896e-07, |
|
"loss": 0.2143, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 0.9531286077118448, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 1.276616832497346e-07, |
|
"loss": 0.2324, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 0.9605172015700761, |
|
"grad_norm": 2.0, |
|
"learning_rate": 9.013466681429994e-08, |
|
"loss": 0.2411, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.9679057954283076, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 5.9098488683417834e-08, |
|
"loss": 0.234, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 0.9752943892865389, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 3.457347716701587e-08, |
|
"loss": 0.2513, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 0.9826829831447702, |
|
"grad_norm": 2.0, |
|
"learning_rate": 1.6575695842027116e-08, |
|
"loss": 0.2358, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 0.9900715770030016, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 5.116933030946403e-09, |
|
"loss": 0.2242, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 0.997460170861233, |
|
"grad_norm": 1.96875, |
|
"learning_rate": 2.0469408062440131e-10, |
|
"loss": 0.225, |
|
"step": 540 |
|
} |
|
], |
|
"logging_steps": 4, |
|
"max_steps": 541, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.1973917830354567e+19, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|