{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9993073193257908, "eval_steps": 500, "global_step": 541, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0073885938582313555, "grad_norm": 1.8984375, "learning_rate": 1.6000000000000001e-06, "loss": 0.2922, "step": 4 }, { "epoch": 0.014777187716462711, "grad_norm": 2.078125, "learning_rate": 3.2000000000000003e-06, "loss": 0.3078, "step": 8 }, { "epoch": 0.022165781574694066, "grad_norm": 2.390625, "learning_rate": 4.800000000000001e-06, "loss": 0.2979, "step": 12 }, { "epoch": 0.029554375432925422, "grad_norm": 2.078125, "learning_rate": 6.4000000000000006e-06, "loss": 0.2788, "step": 16 }, { "epoch": 0.03694296929115678, "grad_norm": 1.9921875, "learning_rate": 8.000000000000001e-06, "loss": 0.3001, "step": 20 }, { "epoch": 0.04433156314938813, "grad_norm": 1.828125, "learning_rate": 9.600000000000001e-06, "loss": 0.304, "step": 24 }, { "epoch": 0.051720157007619484, "grad_norm": 1.84375, "learning_rate": 1.1200000000000001e-05, "loss": 0.2816, "step": 28 }, { "epoch": 0.059108750865850844, "grad_norm": 1.9375, "learning_rate": 1.2800000000000001e-05, "loss": 0.277, "step": 32 }, { "epoch": 0.0664973447240822, "grad_norm": 1.859375, "learning_rate": 1.4400000000000001e-05, "loss": 0.2677, "step": 36 }, { "epoch": 0.07388593858231356, "grad_norm": 1.96875, "learning_rate": 1.6000000000000003e-05, "loss": 0.2739, "step": 40 }, { "epoch": 0.0812745324405449, "grad_norm": 2.03125, "learning_rate": 1.76e-05, "loss": 0.2485, "step": 44 }, { "epoch": 0.08866312629877626, "grad_norm": 1.953125, "learning_rate": 1.9200000000000003e-05, "loss": 0.2675, "step": 48 }, { "epoch": 0.09605172015700762, "grad_norm": 1.6640625, "learning_rate": 1.9999181232057437e-05, "loss": 0.2547, "step": 52 }, { "epoch": 0.10344031401523897, "grad_norm": 1.78125, "learning_rate": 1.9992631892952108e-05, "loss": 0.2516, "step": 56 }, { "epoch": 0.11082890787347033, "grad_norm": 1.78125, "learning_rate": 1.9979537504476945e-05, "loss": 0.2558, "step": 60 }, { "epoch": 0.11821750173170169, "grad_norm": 1.609375, "learning_rate": 1.995990664329323e-05, "loss": 0.2382, "step": 64 }, { "epoch": 0.12560609558993305, "grad_norm": 1.8515625, "learning_rate": 1.993375216737042e-05, "loss": 0.2376, "step": 68 }, { "epoch": 0.1329946894481644, "grad_norm": 1.8671875, "learning_rate": 1.9901091207564326e-05, "loss": 0.2359, "step": 72 }, { "epoch": 0.14038328330639574, "grad_norm": 1.640625, "learning_rate": 1.986194515639662e-05, "loss": 0.2217, "step": 76 }, { "epoch": 0.1477718771646271, "grad_norm": 1.71875, "learning_rate": 1.981633965404302e-05, "loss": 0.2247, "step": 80 }, { "epoch": 0.15516047102285846, "grad_norm": 1.984375, "learning_rate": 1.9764304571539266e-05, "loss": 0.2082, "step": 84 }, { "epoch": 0.1625490648810898, "grad_norm": 1.875, "learning_rate": 1.9705873991215973e-05, "loss": 0.2053, "step": 88 }, { "epoch": 0.16993765873932118, "grad_norm": 1.640625, "learning_rate": 1.9641086184375148e-05, "loss": 0.1989, "step": 92 }, { "epoch": 0.17732625259755253, "grad_norm": 1.578125, "learning_rate": 1.956998358622293e-05, "loss": 0.2069, "step": 96 }, { "epoch": 0.18471484645578387, "grad_norm": 1.7265625, "learning_rate": 1.9492612768075094e-05, "loss": 0.2001, "step": 100 }, { "epoch": 0.19210344031401524, "grad_norm": 1.6796875, "learning_rate": 1.940902440685339e-05, "loss": 0.1971, "step": 104 }, { "epoch": 0.1994920341722466, "grad_norm": 1.5859375, "learning_rate": 1.9319273251892805e-05, "loss": 0.2056, "step": 108 }, { "epoch": 0.20688062803047794, "grad_norm": 1.6328125, "learning_rate": 1.922341808908144e-05, "loss": 0.2129, "step": 112 }, { "epoch": 0.2142692218887093, "grad_norm": 1.7734375, "learning_rate": 1.912152170235646e-05, "loss": 0.2068, "step": 116 }, { "epoch": 0.22165781574694066, "grad_norm": 1.8203125, "learning_rate": 1.9013650832581424e-05, "loss": 0.203, "step": 120 }, { "epoch": 0.229046409605172, "grad_norm": 1.59375, "learning_rate": 1.8899876133831835e-05, "loss": 0.1934, "step": 124 }, { "epoch": 0.23643500346340338, "grad_norm": 1.9140625, "learning_rate": 1.8780272127117606e-05, "loss": 0.1941, "step": 128 }, { "epoch": 0.24382359732163472, "grad_norm": 1.6796875, "learning_rate": 1.865491715157273e-05, "loss": 0.2016, "step": 132 }, { "epoch": 0.2512121911798661, "grad_norm": 1.578125, "learning_rate": 1.852389331314411e-05, "loss": 0.1838, "step": 136 }, { "epoch": 0.2586007850380974, "grad_norm": 1.65625, "learning_rate": 1.838728643081321e-05, "loss": 0.2002, "step": 140 }, { "epoch": 0.2659893788963288, "grad_norm": 1.765625, "learning_rate": 1.8245185980385673e-05, "loss": 0.1916, "step": 144 }, { "epoch": 0.27337797275456016, "grad_norm": 1.6484375, "learning_rate": 1.809768503588578e-05, "loss": 0.2103, "step": 148 }, { "epoch": 0.2807665666127915, "grad_norm": 1.6796875, "learning_rate": 1.7944880208594156e-05, "loss": 0.1959, "step": 152 }, { "epoch": 0.28815516047102285, "grad_norm": 1.796875, "learning_rate": 1.7786871583768536e-05, "loss": 0.1975, "step": 156 }, { "epoch": 0.2955437543292542, "grad_norm": 1.6796875, "learning_rate": 1.7623762655089208e-05, "loss": 0.1788, "step": 160 }, { "epoch": 0.30293234818748555, "grad_norm": 1.8203125, "learning_rate": 1.745566025687193e-05, "loss": 0.1997, "step": 164 }, { "epoch": 0.3103209420457169, "grad_norm": 1.6640625, "learning_rate": 1.728267449409278e-05, "loss": 0.1971, "step": 168 }, { "epoch": 0.3177095359039483, "grad_norm": 1.5859375, "learning_rate": 1.7104918670270763e-05, "loss": 0.1835, "step": 172 }, { "epoch": 0.3250981297621796, "grad_norm": 1.7734375, "learning_rate": 1.692250921325544e-05, "loss": 0.1883, "step": 176 }, { "epoch": 0.332486723620411, "grad_norm": 1.6875, "learning_rate": 1.6735565598968114e-05, "loss": 0.1825, "step": 180 }, { "epoch": 0.33987531747864236, "grad_norm": 1.6953125, "learning_rate": 1.6544210273146608e-05, "loss": 0.1931, "step": 184 }, { "epoch": 0.3472639113368737, "grad_norm": 1.4765625, "learning_rate": 1.6348568571144816e-05, "loss": 0.183, "step": 188 }, { "epoch": 0.35465250519510505, "grad_norm": 1.6328125, "learning_rate": 1.6148768635839623e-05, "loss": 0.1966, "step": 192 }, { "epoch": 0.3620410990533364, "grad_norm": 1.4453125, "learning_rate": 1.5944941333698912e-05, "loss": 0.188, "step": 196 }, { "epoch": 0.36942969291156774, "grad_norm": 1.4921875, "learning_rate": 1.5737220169065656e-05, "loss": 0.1858, "step": 200 }, { "epoch": 0.3768182867697991, "grad_norm": 1.953125, "learning_rate": 1.552574119671423e-05, "loss": 0.1842, "step": 204 }, { "epoch": 0.3842068806280305, "grad_norm": 1.6640625, "learning_rate": 1.5310642932736253e-05, "loss": 0.1899, "step": 208 }, { "epoch": 0.3915954744862618, "grad_norm": 1.7109375, "learning_rate": 1.5092066263814245e-05, "loss": 0.1785, "step": 212 }, { "epoch": 0.3989840683444932, "grad_norm": 1.875, "learning_rate": 1.487015435494263e-05, "loss": 0.1949, "step": 216 }, { "epoch": 0.40637266220272455, "grad_norm": 1.5859375, "learning_rate": 1.464505255565643e-05, "loss": 0.1787, "step": 220 }, { "epoch": 0.4137612560609559, "grad_norm": 1.8125, "learning_rate": 1.4416908304829142e-05, "loss": 0.173, "step": 224 }, { "epoch": 0.42114984991918725, "grad_norm": 1.9453125, "learning_rate": 1.4185871034102117e-05, "loss": 0.1945, "step": 228 }, { "epoch": 0.4285384437774186, "grad_norm": 2.015625, "learning_rate": 1.3952092070008669e-05, "loss": 0.1949, "step": 232 }, { "epoch": 0.43592703763564994, "grad_norm": 1.5078125, "learning_rate": 1.3715724534857127e-05, "loss": 0.1824, "step": 236 }, { "epoch": 0.4433156314938813, "grad_norm": 1.625, "learning_rate": 1.347692324643759e-05, "loss": 0.1844, "step": 240 }, { "epoch": 0.4507042253521127, "grad_norm": 1.796875, "learning_rate": 1.323584461661823e-05, "loss": 0.193, "step": 244 }, { "epoch": 0.458092819210344, "grad_norm": 2.34375, "learning_rate": 1.2992646548897442e-05, "loss": 0.195, "step": 248 }, { "epoch": 0.4654814130685754, "grad_norm": 1.703125, "learning_rate": 1.2747488334979064e-05, "loss": 0.1844, "step": 252 }, { "epoch": 0.47287000692680675, "grad_norm": 1.8125, "learning_rate": 1.2500530550438232e-05, "loss": 0.1919, "step": 256 }, { "epoch": 0.48025860078503807, "grad_norm": 1.75, "learning_rate": 1.2251934949546446e-05, "loss": 0.1966, "step": 260 }, { "epoch": 0.48764719464326944, "grad_norm": 1.640625, "learning_rate": 1.200186435932449e-05, "loss": 0.1924, "step": 264 }, { "epoch": 0.4950357885015008, "grad_norm": 1.390625, "learning_rate": 1.1750482572892781e-05, "loss": 0.1744, "step": 268 }, { "epoch": 0.5024243823597322, "grad_norm": 1.6953125, "learning_rate": 1.1497954242188913e-05, "loss": 0.2026, "step": 272 }, { "epoch": 0.5098129762179635, "grad_norm": 1.4921875, "learning_rate": 1.1244444770122707e-05, "loss": 0.1669, "step": 276 }, { "epoch": 0.5172015700761948, "grad_norm": 1.640625, "learning_rate": 1.0990120202239324e-05, "loss": 0.187, "step": 280 }, { "epoch": 0.5245901639344263, "grad_norm": 1.8515625, "learning_rate": 1.073514711796155e-05, "loss": 0.1832, "step": 284 }, { "epoch": 0.5319787577926576, "grad_norm": 1.6640625, "learning_rate": 1.0479692521482316e-05, "loss": 0.1834, "step": 288 }, { "epoch": 0.5393673516508889, "grad_norm": 1.828125, "learning_rate": 1.0223923732379049e-05, "loss": 0.1929, "step": 292 }, { "epoch": 0.5467559455091203, "grad_norm": 1.71875, "learning_rate": 9.96800827602143e-06, "loss": 0.1922, "step": 296 }, { "epoch": 0.5541445393673516, "grad_norm": 1.484375, "learning_rate": 9.712113773844361e-06, "loss": 0.1787, "step": 300 }, { "epoch": 0.561533133225583, "grad_norm": 1.796875, "learning_rate": 9.456407833558019e-06, "loss": 0.1795, "step": 304 }, { "epoch": 0.5689217270838144, "grad_norm": 1.7265625, "learning_rate": 9.201057939366896e-06, "loss": 0.1902, "step": 308 }, { "epoch": 0.5763103209420457, "grad_norm": 1.9765625, "learning_rate": 8.94623134226972e-06, "loss": 0.2048, "step": 312 }, { "epoch": 0.583698914800277, "grad_norm": 1.6796875, "learning_rate": 8.692094950512145e-06, "loss": 0.1799, "step": 316 }, { "epoch": 0.5910875086585085, "grad_norm": 1.75, "learning_rate": 8.438815220263942e-06, "loss": 0.1958, "step": 320 }, { "epoch": 0.5984761025167398, "grad_norm": 1.640625, "learning_rate": 8.186558046592247e-06, "loss": 0.1917, "step": 324 }, { "epoch": 0.6058646963749711, "grad_norm": 1.7421875, "learning_rate": 7.935488654802395e-06, "loss": 0.2007, "step": 328 }, { "epoch": 0.6132532902332025, "grad_norm": 1.703125, "learning_rate": 7.685771492217387e-06, "loss": 0.1977, "step": 332 }, { "epoch": 0.6206418840914338, "grad_norm": 1.484375, "learning_rate": 7.437570120466943e-06, "loss": 0.173, "step": 336 }, { "epoch": 0.6280304779496652, "grad_norm": 1.625, "learning_rate": 7.1910471083566725e-06, "loss": 0.1826, "step": 340 }, { "epoch": 0.6354190718078966, "grad_norm": 1.875, "learning_rate": 6.946363925387546e-06, "loss": 0.1842, "step": 344 }, { "epoch": 0.6428076656661279, "grad_norm": 1.546875, "learning_rate": 6.7036808359953585e-06, "loss": 0.1834, "step": 348 }, { "epoch": 0.6501962595243592, "grad_norm": 1.6875, "learning_rate": 6.463156794579543e-06, "loss": 0.189, "step": 352 }, { "epoch": 0.6575848533825907, "grad_norm": 1.71875, "learning_rate": 6.224949341390017e-06, "loss": 0.1825, "step": 356 }, { "epoch": 0.664973447240822, "grad_norm": 1.8203125, "learning_rate": 5.989214499340267e-06, "loss": 0.1802, "step": 360 }, { "epoch": 0.6723620410990533, "grad_norm": 1.765625, "learning_rate": 5.756106671814301e-06, "loss": 0.1877, "step": 364 }, { "epoch": 0.6797506349572847, "grad_norm": 1.7421875, "learning_rate": 5.52577854153435e-06, "loss": 0.1963, "step": 368 }, { "epoch": 0.687139228815516, "grad_norm": 1.703125, "learning_rate": 5.298380970555584e-06, "loss": 0.1854, "step": 372 }, { "epoch": 0.6945278226737474, "grad_norm": 1.6796875, "learning_rate": 5.074062901453352e-06, "loss": 0.1864, "step": 376 }, { "epoch": 0.7019164165319788, "grad_norm": 1.7421875, "learning_rate": 4.852971259767642e-06, "loss": 0.1905, "step": 380 }, { "epoch": 0.7093050103902101, "grad_norm": 1.84375, "learning_rate": 4.635250857768696e-06, "loss": 0.1988, "step": 384 }, { "epoch": 0.7166936042484414, "grad_norm": 1.7265625, "learning_rate": 4.4210442996067724e-06, "loss": 0.1995, "step": 388 }, { "epoch": 0.7240821981066728, "grad_norm": 1.703125, "learning_rate": 4.210491887908201e-06, "loss": 0.1927, "step": 392 }, { "epoch": 0.7314707919649042, "grad_norm": 1.78125, "learning_rate": 4.0037315318789e-06, "loss": 0.1926, "step": 396 }, { "epoch": 0.7388593858231355, "grad_norm": 1.890625, "learning_rate": 3.800898656975599e-06, "loss": 0.1961, "step": 400 }, { "epoch": 0.7462479796813669, "grad_norm": 1.9921875, "learning_rate": 3.602126116203819e-06, "loss": 0.1981, "step": 404 }, { "epoch": 0.7536365735395982, "grad_norm": 1.6484375, "learning_rate": 3.407544103100824e-06, "loss": 0.2058, "step": 408 }, { "epoch": 0.7610251673978295, "grad_norm": 1.953125, "learning_rate": 3.217280066460472e-06, "loss": 0.2083, "step": 412 }, { "epoch": 0.768413761256061, "grad_norm": 1.84375, "learning_rate": 3.0314586268558486e-06, "loss": 0.2052, "step": 416 }, { "epoch": 0.7758023551142923, "grad_norm": 1.796875, "learning_rate": 2.8502014950143376e-06, "loss": 0.1999, "step": 420 }, { "epoch": 0.7831909489725236, "grad_norm": 1.890625, "learning_rate": 2.6736273920986166e-06, "loss": 0.1983, "step": 424 }, { "epoch": 0.790579542830755, "grad_norm": 1.78125, "learning_rate": 2.5018519719457725e-06, "loss": 0.2031, "step": 428 }, { "epoch": 0.7979681366889864, "grad_norm": 1.8828125, "learning_rate": 2.334987745315478e-06, "loss": 0.2108, "step": 432 }, { "epoch": 0.8053567305472177, "grad_norm": 1.8671875, "learning_rate": 2.1731440061968536e-06, "loss": 0.2072, "step": 436 }, { "epoch": 0.8127453244054491, "grad_norm": 2.28125, "learning_rate": 2.016426760222259e-06, "loss": 0.2121, "step": 440 }, { "epoch": 0.8201339182636804, "grad_norm": 1.890625, "learning_rate": 1.8649386552349136e-06, "loss": 0.2142, "step": 444 }, { "epoch": 0.8275225121219117, "grad_norm": 1.90625, "learning_rate": 1.718778914055873e-06, "loss": 0.1974, "step": 448 }, { "epoch": 0.8349111059801432, "grad_norm": 1.7734375, "learning_rate": 1.5780432694942815e-06, "loss": 0.2049, "step": 452 }, { "epoch": 0.8422996998383745, "grad_norm": 2.015625, "learning_rate": 1.4428239016435953e-06, "loss": 0.1979, "step": 456 }, { "epoch": 0.8496882936966058, "grad_norm": 1.953125, "learning_rate": 1.3132093775047616e-06, "loss": 0.2086, "step": 460 }, { "epoch": 0.8570768875548372, "grad_norm": 1.78125, "learning_rate": 1.1892845929759412e-06, "loss": 0.2019, "step": 464 }, { "epoch": 0.8644654814130686, "grad_norm": 1.8046875, "learning_rate": 1.07113071724675e-06, "loss": 0.2005, "step": 468 }, { "epoch": 0.8718540752712999, "grad_norm": 1.6171875, "learning_rate": 9.588251396334524e-07, "loss": 0.206, "step": 472 }, { "epoch": 0.8792426691295313, "grad_norm": 1.953125, "learning_rate": 8.524414188899266e-07, "loss": 0.2248, "step": 476 }, { "epoch": 0.8866312629877626, "grad_norm": 1.8984375, "learning_rate": 7.520492350275876e-07, "loss": 0.207, "step": 480 }, { "epoch": 0.8940198568459939, "grad_norm": 1.875, "learning_rate": 6.577143436758659e-07, "loss": 0.2118, "step": 484 }, { "epoch": 0.9014084507042254, "grad_norm": 1.8203125, "learning_rate": 5.694985330130698e-07, "loss": 0.2358, "step": 488 }, { "epoch": 0.9087970445624567, "grad_norm": 2.109375, "learning_rate": 4.874595832959061e-07, "loss": 0.2224, "step": 492 }, { "epoch": 0.916185638420688, "grad_norm": 1.9609375, "learning_rate": 4.1165122901414055e-07, "loss": 0.2375, "step": 496 }, { "epoch": 0.9235742322789194, "grad_norm": 1.859375, "learning_rate": 3.4212312369516496e-07, "loss": 0.2169, "step": 500 }, { "epoch": 0.9309628261371508, "grad_norm": 1.953125, "learning_rate": 2.789208073815608e-07, "loss": 0.2191, "step": 504 }, { "epoch": 0.9383514199953821, "grad_norm": 2.125, "learning_rate": 2.220856768029367e-07, "loss": 0.2225, "step": 508 }, { "epoch": 0.9457400138536135, "grad_norm": 2.078125, "learning_rate": 1.7165495826158896e-07, "loss": 0.2143, "step": 512 }, { "epoch": 0.9531286077118448, "grad_norm": 2.09375, "learning_rate": 1.276616832497346e-07, "loss": 0.2324, "step": 516 }, { "epoch": 0.9605172015700761, "grad_norm": 2.0, "learning_rate": 9.013466681429994e-08, "loss": 0.2411, "step": 520 }, { "epoch": 0.9679057954283076, "grad_norm": 2.03125, "learning_rate": 5.9098488683417834e-08, "loss": 0.234, "step": 524 }, { "epoch": 0.9752943892865389, "grad_norm": 2.234375, "learning_rate": 3.457347716701587e-08, "loss": 0.2513, "step": 528 }, { "epoch": 0.9826829831447702, "grad_norm": 2.0, "learning_rate": 1.6575695842027116e-08, "loss": 0.2358, "step": 532 }, { "epoch": 0.9900715770030016, "grad_norm": 2.109375, "learning_rate": 5.116933030946403e-09, "loss": 0.2242, "step": 536 }, { "epoch": 0.997460170861233, "grad_norm": 1.96875, "learning_rate": 2.0469408062440131e-10, "loss": 0.225, "step": 540 } ], "logging_steps": 4, "max_steps": 541, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.1973917830354567e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }