{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9996810207336523, "eval_steps": 500, "global_step": 1567, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006379585326953748, "grad_norm": 683.0049438476562, "learning_rate": 1.910828025477707e-06, "loss": 45.2944, "step": 1 }, { "epoch": 0.003189792663476874, "grad_norm": 519.6649780273438, "learning_rate": 9.554140127388534e-06, "loss": 45.6093, "step": 5 }, { "epoch": 0.006379585326953748, "grad_norm": 189.5433349609375, "learning_rate": 1.9108280254777068e-05, "loss": 32.9532, "step": 10 }, { "epoch": 0.009569377990430622, "grad_norm": 63.46455001831055, "learning_rate": 2.8662420382165606e-05, "loss": 23.1174, "step": 15 }, { "epoch": 0.012759170653907496, "grad_norm": 18.510099411010742, "learning_rate": 3.8216560509554137e-05, "loss": 20.6226, "step": 20 }, { "epoch": 0.01594896331738437, "grad_norm": 10.55661678314209, "learning_rate": 4.777070063694267e-05, "loss": 19.5559, "step": 25 }, { "epoch": 0.019138755980861243, "grad_norm": 14.438572883605957, "learning_rate": 5.732484076433121e-05, "loss": 18.0152, "step": 30 }, { "epoch": 0.022328548644338118, "grad_norm": 36.82223129272461, "learning_rate": 6.687898089171974e-05, "loss": 15.7287, "step": 35 }, { "epoch": 0.025518341307814992, "grad_norm": 75.36656951904297, "learning_rate": 7.643312101910827e-05, "loss": 10.9057, "step": 40 }, { "epoch": 0.028708133971291867, "grad_norm": 9.559320449829102, "learning_rate": 8.59872611464968e-05, "loss": 2.8213, "step": 45 }, { "epoch": 0.03189792663476874, "grad_norm": 4.798595428466797, "learning_rate": 9.554140127388533e-05, "loss": 2.1012, "step": 50 }, { "epoch": 0.03508771929824561, "grad_norm": 3.041489601135254, "learning_rate": 0.00010509554140127387, "loss": 1.751, "step": 55 }, { "epoch": 0.03827751196172249, "grad_norm": 10.074992179870605, "learning_rate": 0.00011464968152866242, "loss": 1.629, "step": 60 }, { "epoch": 0.04146730462519936, "grad_norm": 4.232330322265625, "learning_rate": 0.00012420382165605095, "loss": 1.5111, "step": 65 }, { "epoch": 0.044657097288676235, "grad_norm": 3.0990259647369385, "learning_rate": 0.00013375796178343948, "loss": 1.3541, "step": 70 }, { "epoch": 0.04784688995215311, "grad_norm": 3.949800968170166, "learning_rate": 0.00014331210191082802, "loss": 1.314, "step": 75 }, { "epoch": 0.051036682615629984, "grad_norm": 8.635440826416016, "learning_rate": 0.00015286624203821655, "loss": 1.28, "step": 80 }, { "epoch": 0.05422647527910686, "grad_norm": 2.358060359954834, "learning_rate": 0.00016242038216560508, "loss": 1.231, "step": 85 }, { "epoch": 0.05741626794258373, "grad_norm": 7.330495357513428, "learning_rate": 0.0001719745222929936, "loss": 1.1927, "step": 90 }, { "epoch": 0.06060606060606061, "grad_norm": 3.866813898086548, "learning_rate": 0.00018152866242038214, "loss": 1.1379, "step": 95 }, { "epoch": 0.06379585326953748, "grad_norm": 2.174020528793335, "learning_rate": 0.00019108280254777067, "loss": 1.1335, "step": 100 }, { "epoch": 0.06698564593301436, "grad_norm": 1.3060088157653809, "learning_rate": 0.0002006369426751592, "loss": 1.1068, "step": 105 }, { "epoch": 0.07017543859649122, "grad_norm": 4.537840843200684, "learning_rate": 0.00021019108280254773, "loss": 1.0773, "step": 110 }, { "epoch": 0.0733652312599681, "grad_norm": 51.75442886352539, "learning_rate": 0.00021974522292993626, "loss": 1.1051, "step": 115 }, { "epoch": 0.07655502392344497, "grad_norm": 1.6401653289794922, "learning_rate": 0.00022929936305732485, "loss": 1.0971, "step": 120 }, { "epoch": 0.07974481658692185, "grad_norm": 1.8242731094360352, "learning_rate": 0.00023885350318471338, "loss": 1.0969, "step": 125 }, { "epoch": 0.08293460925039872, "grad_norm": 3.504967212677002, "learning_rate": 0.0002484076433121019, "loss": 1.1196, "step": 130 }, { "epoch": 0.0861244019138756, "grad_norm": 11.742030143737793, "learning_rate": 0.00025796178343949044, "loss": 1.0801, "step": 135 }, { "epoch": 0.08931419457735247, "grad_norm": 8.536433219909668, "learning_rate": 0.00026751592356687897, "loss": 1.0728, "step": 140 }, { "epoch": 0.09250398724082935, "grad_norm": 2.490248441696167, "learning_rate": 0.0002770700636942675, "loss": 1.0875, "step": 145 }, { "epoch": 0.09569377990430622, "grad_norm": 2.524031639099121, "learning_rate": 0.00028662420382165603, "loss": 1.0371, "step": 150 }, { "epoch": 0.09888357256778309, "grad_norm": 3.3123767375946045, "learning_rate": 0.00029617834394904456, "loss": 1.049, "step": 155 }, { "epoch": 0.10207336523125997, "grad_norm": 1.9446417093276978, "learning_rate": 0.0002999966490829603, "loss": 1.0379, "step": 160 }, { "epoch": 0.10526315789473684, "grad_norm": 1.845177173614502, "learning_rate": 0.00029997617179878324, "loss": 1.0484, "step": 165 }, { "epoch": 0.10845295055821372, "grad_norm": 4.8916015625, "learning_rate": 0.0002999370813893047, "loss": 1.0585, "step": 170 }, { "epoch": 0.11164274322169059, "grad_norm": 23.282398223876953, "learning_rate": 0.00029987938270592676, "loss": 1.0488, "step": 175 }, { "epoch": 0.11483253588516747, "grad_norm": 4.743477821350098, "learning_rate": 0.0002998030829094724, "loss": 1.0479, "step": 180 }, { "epoch": 0.11802232854864433, "grad_norm": 2.9644861221313477, "learning_rate": 0.00029970819146929694, "loss": 1.0125, "step": 185 }, { "epoch": 0.12121212121212122, "grad_norm": 1.3036428689956665, "learning_rate": 0.0002995947201621131, "loss": 1.0429, "step": 190 }, { "epoch": 0.12440191387559808, "grad_norm": 12.80103588104248, "learning_rate": 0.00029946268307052886, "loss": 1.031, "step": 195 }, { "epoch": 0.12759170653907495, "grad_norm": 3.6281776428222656, "learning_rate": 0.0002993120965813003, "loss": 1.046, "step": 200 }, { "epoch": 0.13078149920255183, "grad_norm": 6.867593765258789, "learning_rate": 0.0002991429793832975, "loss": 1.0034, "step": 205 }, { "epoch": 0.1339712918660287, "grad_norm": 4.86781120300293, "learning_rate": 0.00029895535246518524, "loss": 1.0558, "step": 210 }, { "epoch": 0.1371610845295056, "grad_norm": 1.8913637399673462, "learning_rate": 0.0002987492391128182, "loss": 1.0404, "step": 215 }, { "epoch": 0.14035087719298245, "grad_norm": 1.4582362174987793, "learning_rate": 0.0002985246649063509, "loss": 1.0121, "step": 220 }, { "epoch": 0.14354066985645933, "grad_norm": 4.474733352661133, "learning_rate": 0.0002982816577170631, "loss": 0.9997, "step": 225 }, { "epoch": 0.1467304625199362, "grad_norm": 1.3253235816955566, "learning_rate": 0.00029802024770390087, "loss": 1.0034, "step": 230 }, { "epoch": 0.14992025518341306, "grad_norm": 1.452958106994629, "learning_rate": 0.00029774046730973334, "loss": 0.9983, "step": 235 }, { "epoch": 0.15311004784688995, "grad_norm": 2.8578474521636963, "learning_rate": 0.00029744235125732664, "loss": 1.0123, "step": 240 }, { "epoch": 0.15629984051036683, "grad_norm": 4.637836933135986, "learning_rate": 0.0002971259365450344, "loss": 1.0427, "step": 245 }, { "epoch": 0.1594896331738437, "grad_norm": 1.3336002826690674, "learning_rate": 0.00029679126244220596, "loss": 0.9828, "step": 250 }, { "epoch": 0.16267942583732056, "grad_norm": 2.1084749698638916, "learning_rate": 0.00029643837048431293, "loss": 0.9907, "step": 255 }, { "epoch": 0.16586921850079744, "grad_norm": 1.7289551496505737, "learning_rate": 0.0002960673044677939, "loss": 1.0017, "step": 260 }, { "epoch": 0.16905901116427433, "grad_norm": 1.0768038034439087, "learning_rate": 0.00029567811044461977, "loss": 0.9639, "step": 265 }, { "epoch": 0.1722488038277512, "grad_norm": 3.015089750289917, "learning_rate": 0.00029527083671657746, "loss": 0.9858, "step": 270 }, { "epoch": 0.17543859649122806, "grad_norm": 3.8819258213043213, "learning_rate": 0.00029484553382927594, "loss": 0.9907, "step": 275 }, { "epoch": 0.17862838915470494, "grad_norm": 1.3600151538848877, "learning_rate": 0.000294402254565873, "loss": 1.0016, "step": 280 }, { "epoch": 0.18181818181818182, "grad_norm": 2.409559488296509, "learning_rate": 0.00029394105394052434, "loss": 1.0042, "step": 285 }, { "epoch": 0.1850079744816587, "grad_norm": 2.906012773513794, "learning_rate": 0.00029346198919155616, "loss": 1.0192, "step": 290 }, { "epoch": 0.18819776714513556, "grad_norm": 3.697054386138916, "learning_rate": 0.00029296511977436107, "loss": 1.0157, "step": 295 }, { "epoch": 0.19138755980861244, "grad_norm": 2.336791753768921, "learning_rate": 0.0002924505073540198, "loss": 0.992, "step": 300 }, { "epoch": 0.19457735247208932, "grad_norm": 3.517256259918213, "learning_rate": 0.0002919182157976476, "loss": 0.9753, "step": 305 }, { "epoch": 0.19776714513556617, "grad_norm": 4.080871105194092, "learning_rate": 0.00029136831116646815, "loss": 0.9739, "step": 310 }, { "epoch": 0.20095693779904306, "grad_norm": 1.9315543174743652, "learning_rate": 0.000290800861707615, "loss": 0.959, "step": 315 }, { "epoch": 0.20414673046251994, "grad_norm": 4.471941947937012, "learning_rate": 0.00029021593784566113, "loss": 0.9718, "step": 320 }, { "epoch": 0.20733652312599682, "grad_norm": 4.151180744171143, "learning_rate": 0.0002896136121738793, "loss": 0.9705, "step": 325 }, { "epoch": 0.21052631578947367, "grad_norm": 2.165132999420166, "learning_rate": 0.0002889939594452323, "loss": 0.9575, "step": 330 }, { "epoch": 0.21371610845295055, "grad_norm": 1.3790881633758545, "learning_rate": 0.00028835705656309583, "loss": 0.9773, "step": 335 }, { "epoch": 0.21690590111642744, "grad_norm": 1.334600806236267, "learning_rate": 0.0002877029825717142, "loss": 0.9818, "step": 340 }, { "epoch": 0.22009569377990432, "grad_norm": 2.155672788619995, "learning_rate": 0.0002870318186463901, "loss": 0.9487, "step": 345 }, { "epoch": 0.22328548644338117, "grad_norm": 2.6333436965942383, "learning_rate": 0.0002863436480834105, "loss": 0.9999, "step": 350 }, { "epoch": 0.22647527910685805, "grad_norm": 1.2809535264968872, "learning_rate": 0.00028563855628970886, "loss": 0.967, "step": 355 }, { "epoch": 0.22966507177033493, "grad_norm": 2.015334367752075, "learning_rate": 0.0002849166307722653, "loss": 0.9965, "step": 360 }, { "epoch": 0.23285486443381181, "grad_norm": 1.6532715559005737, "learning_rate": 0.00028417796112724684, "loss": 0.9949, "step": 365 }, { "epoch": 0.23604465709728867, "grad_norm": 1.243087649345398, "learning_rate": 0.0002834226390288873, "loss": 0.9407, "step": 370 }, { "epoch": 0.23923444976076555, "grad_norm": 2.077500104904175, "learning_rate": 0.0002826507582181103, "loss": 0.9508, "step": 375 }, { "epoch": 0.24242424242424243, "grad_norm": 1.656862497329712, "learning_rate": 0.00028186241449089524, "loss": 0.9701, "step": 380 }, { "epoch": 0.24561403508771928, "grad_norm": 1.4174304008483887, "learning_rate": 0.000281057705686388, "loss": 0.9649, "step": 385 }, { "epoch": 0.24880382775119617, "grad_norm": 2.1187663078308105, "learning_rate": 0.0002802367316747589, "loss": 0.9655, "step": 390 }, { "epoch": 0.25199362041467305, "grad_norm": 2.494663953781128, "learning_rate": 0.0002793995943448078, "loss": 0.9422, "step": 395 }, { "epoch": 0.2551834130781499, "grad_norm": 2.778376579284668, "learning_rate": 0.00027854639759131893, "loss": 0.9518, "step": 400 }, { "epoch": 0.2583732057416268, "grad_norm": 1.4692327976226807, "learning_rate": 0.00027767724730216696, "loss": 0.9773, "step": 405 }, { "epoch": 0.26156299840510366, "grad_norm": 2.749265670776367, "learning_rate": 0.0002767922513451754, "loss": 0.9687, "step": 410 }, { "epoch": 0.2647527910685805, "grad_norm": 1.0992218255996704, "learning_rate": 0.00027589151955472965, "loss": 0.9535, "step": 415 }, { "epoch": 0.2679425837320574, "grad_norm": 1.787949800491333, "learning_rate": 0.00027497516371814543, "loss": 0.9435, "step": 420 }, { "epoch": 0.2711323763955343, "grad_norm": 5.4542412757873535, "learning_rate": 0.00027404329756179537, "loss": 0.9498, "step": 425 }, { "epoch": 0.2743221690590112, "grad_norm": 2.4093711376190186, "learning_rate": 0.0002730960367369949, "loss": 1.0197, "step": 430 }, { "epoch": 0.27751196172248804, "grad_norm": 2.159757375717163, "learning_rate": 0.00027213349880564873, "loss": 0.9865, "step": 435 }, { "epoch": 0.2807017543859649, "grad_norm": 2.393873691558838, "learning_rate": 0.0002711558032256607, "loss": 0.9587, "step": 440 }, { "epoch": 0.2838915470494418, "grad_norm": 2.645077705383301, "learning_rate": 0.0002701630713361085, "loss": 0.9645, "step": 445 }, { "epoch": 0.28708133971291866, "grad_norm": 1.9091484546661377, "learning_rate": 0.00026915542634218403, "loss": 0.9629, "step": 450 }, { "epoch": 0.2902711323763955, "grad_norm": 1.352075457572937, "learning_rate": 0.00026813299329990335, "loss": 0.9761, "step": 455 }, { "epoch": 0.2934609250398724, "grad_norm": 2.6942131519317627, "learning_rate": 0.0002670958991005859, "loss": 0.9531, "step": 460 }, { "epoch": 0.2966507177033493, "grad_norm": 2.5837950706481934, "learning_rate": 0.0002660442724551065, "loss": 0.9489, "step": 465 }, { "epoch": 0.29984051036682613, "grad_norm": 2.5745198726654053, "learning_rate": 0.00026497824387792146, "loss": 0.9677, "step": 470 }, { "epoch": 0.30303030303030304, "grad_norm": 1.8516491651535034, "learning_rate": 0.00026389794567087085, "loss": 0.9664, "step": 475 }, { "epoch": 0.3062200956937799, "grad_norm": 2.2351913452148438, "learning_rate": 0.0002628035119067586, "loss": 0.9425, "step": 480 }, { "epoch": 0.3094098883572568, "grad_norm": 1.51954185962677, "learning_rate": 0.0002616950784127135, "loss": 0.9261, "step": 485 }, { "epoch": 0.31259968102073366, "grad_norm": 1.722433090209961, "learning_rate": 0.00026057278275333165, "loss": 0.973, "step": 490 }, { "epoch": 0.3157894736842105, "grad_norm": 6.133448600769043, "learning_rate": 0.00025943676421360395, "loss": 0.9492, "step": 495 }, { "epoch": 0.3189792663476874, "grad_norm": 2.187849283218384, "learning_rate": 0.00025828716378163, "loss": 0.9612, "step": 500 }, { "epoch": 0.32216905901116427, "grad_norm": 2.13073468208313, "learning_rate": 0.00025712412413112006, "loss": 0.9485, "step": 505 }, { "epoch": 0.3253588516746411, "grad_norm": 13.925029754638672, "learning_rate": 0.00025594778960368844, "loss": 0.9697, "step": 510 }, { "epoch": 0.32854864433811803, "grad_norm": 1.2951607704162598, "learning_rate": 0.0002547583061909396, "loss": 0.9433, "step": 515 }, { "epoch": 0.3317384370015949, "grad_norm": 1.386167287826538, "learning_rate": 0.00025355582151634956, "loss": 0.9613, "step": 520 }, { "epoch": 0.3349282296650718, "grad_norm": 3.0757944583892822, "learning_rate": 0.00025234048481694477, "loss": 0.9238, "step": 525 }, { "epoch": 0.33811802232854865, "grad_norm": 2.6696460247039795, "learning_rate": 0.0002511124469247809, "loss": 0.956, "step": 530 }, { "epoch": 0.3413078149920255, "grad_norm": 1.5839197635650635, "learning_rate": 0.00024987186024822295, "loss": 0.9467, "step": 535 }, { "epoch": 0.3444976076555024, "grad_norm": 1.6664620637893677, "learning_rate": 0.0002486188787530309, "loss": 0.9578, "step": 540 }, { "epoch": 0.34768740031897927, "grad_norm": 1.2985708713531494, "learning_rate": 0.00024735365794325117, "loss": 0.9704, "step": 545 }, { "epoch": 0.3508771929824561, "grad_norm": 2.3431661128997803, "learning_rate": 0.0002460763548419172, "loss": 0.9408, "step": 550 }, { "epoch": 0.35406698564593303, "grad_norm": 1.8065603971481323, "learning_rate": 0.0002447871279715624, "loss": 0.9543, "step": 555 }, { "epoch": 0.3572567783094099, "grad_norm": 2.896728277206421, "learning_rate": 0.00024348613733454565, "loss": 0.948, "step": 560 }, { "epoch": 0.36044657097288674, "grad_norm": 2.205735206604004, "learning_rate": 0.00024217354439319427, "loss": 0.9263, "step": 565 }, { "epoch": 0.36363636363636365, "grad_norm": 2.1975135803222656, "learning_rate": 0.00024084951204976528, "loss": 0.9091, "step": 570 }, { "epoch": 0.3668261562998405, "grad_norm": 1.9090759754180908, "learning_rate": 0.0002395142046262281, "loss": 0.9426, "step": 575 }, { "epoch": 0.3700159489633174, "grad_norm": 2.041628360748291, "learning_rate": 0.00023816778784387094, "loss": 0.9296, "step": 580 }, { "epoch": 0.37320574162679426, "grad_norm": 1.0868958234786987, "learning_rate": 0.0002368104288027336, "loss": 0.9225, "step": 585 }, { "epoch": 0.3763955342902711, "grad_norm": 1.3771106004714966, "learning_rate": 0.0002354422959608692, "loss": 0.9267, "step": 590 }, { "epoch": 0.379585326953748, "grad_norm": 1.0883209705352783, "learning_rate": 0.00023406355911343717, "loss": 0.9482, "step": 595 }, { "epoch": 0.3827751196172249, "grad_norm": 4.788959980010986, "learning_rate": 0.00023267438937163077, "loss": 0.9281, "step": 600 }, { "epoch": 0.38596491228070173, "grad_norm": 1.3784050941467285, "learning_rate": 0.00023127495914144051, "loss": 0.9551, "step": 605 }, { "epoch": 0.38915470494417864, "grad_norm": 3.7434804439544678, "learning_rate": 0.00022986544210225774, "loss": 0.931, "step": 610 }, { "epoch": 0.3923444976076555, "grad_norm": 1.4201207160949707, "learning_rate": 0.00022844601318531955, "loss": 0.9154, "step": 615 }, { "epoch": 0.39553429027113235, "grad_norm": 2.0786445140838623, "learning_rate": 0.00022701684855199857, "loss": 0.9615, "step": 620 }, { "epoch": 0.39872408293460926, "grad_norm": 1.343225121498108, "learning_rate": 0.00022557812557194, "loss": 0.9164, "step": 625 }, { "epoch": 0.4019138755980861, "grad_norm": 1.6810039281845093, "learning_rate": 0.00022413002280104915, "loss": 0.9112, "step": 630 }, { "epoch": 0.405103668261563, "grad_norm": 1.2535432577133179, "learning_rate": 0.00022267271995933074, "loss": 0.9467, "step": 635 }, { "epoch": 0.4082934609250399, "grad_norm": 1.297658085823059, "learning_rate": 0.00022120639790858482, "loss": 0.927, "step": 640 }, { "epoch": 0.41148325358851673, "grad_norm": 1.5415024757385254, "learning_rate": 0.00021973123862996044, "loss": 0.9447, "step": 645 }, { "epoch": 0.41467304625199364, "grad_norm": 1.305788278579712, "learning_rate": 0.00021824742520137026, "loss": 0.9376, "step": 650 }, { "epoch": 0.4178628389154705, "grad_norm": 1.043959617614746, "learning_rate": 0.00021675514177476945, "loss": 0.9255, "step": 655 }, { "epoch": 0.42105263157894735, "grad_norm": 1.2613155841827393, "learning_rate": 0.0002152545735533012, "loss": 0.9493, "step": 660 }, { "epoch": 0.42424242424242425, "grad_norm": 2.3178012371063232, "learning_rate": 0.00021374590676831136, "loss": 0.9336, "step": 665 }, { "epoch": 0.4274322169059011, "grad_norm": 1.5422348976135254, "learning_rate": 0.00021222932865623605, "loss": 0.9411, "step": 670 }, { "epoch": 0.430622009569378, "grad_norm": 2.053426504135132, "learning_rate": 0.00021070502743536414, "loss": 0.9086, "step": 675 }, { "epoch": 0.43381180223285487, "grad_norm": 1.4567965269088745, "learning_rate": 0.00020917319228247805, "loss": 0.9135, "step": 680 }, { "epoch": 0.4370015948963317, "grad_norm": 1.7360142469406128, "learning_rate": 0.00020763401330937555, "loss": 0.9208, "step": 685 }, { "epoch": 0.44019138755980863, "grad_norm": 3.2345566749572754, "learning_rate": 0.00020608768153927546, "loss": 0.9281, "step": 690 }, { "epoch": 0.4433811802232855, "grad_norm": 2.1315088272094727, "learning_rate": 0.00020453438888311042, "loss": 0.9203, "step": 695 }, { "epoch": 0.44657097288676234, "grad_norm": 2.226001262664795, "learning_rate": 0.00020297432811570916, "loss": 0.9319, "step": 700 }, { "epoch": 0.44976076555023925, "grad_norm": 2.0687644481658936, "learning_rate": 0.00020140769285187187, "loss": 0.9079, "step": 705 }, { "epoch": 0.4529505582137161, "grad_norm": 1.8376455307006836, "learning_rate": 0.00019983467752234132, "loss": 0.898, "step": 710 }, { "epoch": 0.45614035087719296, "grad_norm": 1.3176559209823608, "learning_rate": 0.0001982554773496723, "loss": 0.9483, "step": 715 }, { "epoch": 0.45933014354066987, "grad_norm": 1.6489756107330322, "learning_rate": 0.00019667028832400345, "loss": 0.8925, "step": 720 }, { "epoch": 0.4625199362041467, "grad_norm": 1.6719319820404053, "learning_rate": 0.00019507930717873313, "loss": 0.9151, "step": 725 }, { "epoch": 0.46570972886762363, "grad_norm": 2.6439127922058105, "learning_rate": 0.00019348273136610364, "loss": 0.9222, "step": 730 }, { "epoch": 0.4688995215311005, "grad_norm": 2.219003677368164, "learning_rate": 0.00019188075903269587, "loss": 0.9322, "step": 735 }, { "epoch": 0.47208931419457734, "grad_norm": 3.740412712097168, "learning_rate": 0.00019027358899483776, "loss": 0.9102, "step": 740 }, { "epoch": 0.47527910685805425, "grad_norm": 1.2723886966705322, "learning_rate": 0.00018866142071393013, "loss": 0.9193, "step": 745 }, { "epoch": 0.4784688995215311, "grad_norm": 2.935451030731201, "learning_rate": 0.00018704445427169156, "loss": 0.9242, "step": 750 }, { "epoch": 0.48165869218500795, "grad_norm": 3.1072025299072266, "learning_rate": 0.00018542289034532733, "loss": 0.9203, "step": 755 }, { "epoch": 0.48484848484848486, "grad_norm": 1.3583980798721313, "learning_rate": 0.00018379693018262349, "loss": 0.9202, "step": 760 }, { "epoch": 0.4880382775119617, "grad_norm": 1.6865596771240234, "learning_rate": 0.00018216677557697083, "loss": 0.9088, "step": 765 }, { "epoch": 0.49122807017543857, "grad_norm": 1.3977638483047485, "learning_rate": 0.00018053262884232078, "loss": 0.8964, "step": 770 }, { "epoch": 0.4944178628389155, "grad_norm": 1.32778000831604, "learning_rate": 0.0001788946927880768, "loss": 0.9019, "step": 775 }, { "epoch": 0.49760765550239233, "grad_norm": 1.1595113277435303, "learning_rate": 0.00017725317069392418, "loss": 0.8887, "step": 780 }, { "epoch": 0.5007974481658692, "grad_norm": 1.3176358938217163, "learning_rate": 0.00017560826628460182, "loss": 0.8992, "step": 785 }, { "epoch": 0.5039872408293461, "grad_norm": 2.6832964420318604, "learning_rate": 0.00017396018370461808, "loss": 0.922, "step": 790 }, { "epoch": 0.507177033492823, "grad_norm": 2.5020227432250977, "learning_rate": 0.00017230912749291547, "loss": 0.9321, "step": 795 }, { "epoch": 0.5103668261562998, "grad_norm": 2.304370403289795, "learning_rate": 0.00017065530255748557, "loss": 0.9174, "step": 800 }, { "epoch": 0.5135566188197768, "grad_norm": 1.984249472618103, "learning_rate": 0.00016899891414993854, "loss": 0.9184, "step": 805 }, { "epoch": 0.5167464114832536, "grad_norm": 1.0693037509918213, "learning_rate": 0.00016734016784002994, "loss": 0.9239, "step": 810 }, { "epoch": 0.5199362041467305, "grad_norm": 1.1777863502502441, "learning_rate": 0.000165679269490148, "loss": 0.9004, "step": 815 }, { "epoch": 0.5231259968102073, "grad_norm": 1.2900274991989136, "learning_rate": 0.0001640164252297648, "loss": 0.9034, "step": 820 }, { "epoch": 0.5263157894736842, "grad_norm": 1.350598692893982, "learning_rate": 0.00016235184142985367, "loss": 0.8938, "step": 825 }, { "epoch": 0.529505582137161, "grad_norm": 1.7906914949417114, "learning_rate": 0.00016068572467727762, "loss": 0.93, "step": 830 }, { "epoch": 0.532695374800638, "grad_norm": 1.3412362337112427, "learning_rate": 0.00015901828174915005, "loss": 0.9181, "step": 835 }, { "epoch": 0.5358851674641149, "grad_norm": 2.151066780090332, "learning_rate": 0.00015734971958717228, "loss": 0.9143, "step": 840 }, { "epoch": 0.5390749601275917, "grad_norm": 1.6764332056045532, "learning_rate": 0.00015568024527195067, "loss": 0.9275, "step": 845 }, { "epoch": 0.5422647527910686, "grad_norm": 2.056030750274658, "learning_rate": 0.0001540100659972963, "loss": 0.8855, "step": 850 }, { "epoch": 0.5454545454545454, "grad_norm": 1.733733892440796, "learning_rate": 0.000152339389044511, "loss": 0.9206, "step": 855 }, { "epoch": 0.5486443381180224, "grad_norm": 1.7512574195861816, "learning_rate": 0.00015066842175666186, "loss": 0.9079, "step": 860 }, { "epoch": 0.5518341307814992, "grad_norm": 1.1346197128295898, "learning_rate": 0.0001489973715128487, "loss": 0.8834, "step": 865 }, { "epoch": 0.5550239234449761, "grad_norm": 1.2773138284683228, "learning_rate": 0.00014732644570246675, "loss": 0.889, "step": 870 }, { "epoch": 0.5582137161084529, "grad_norm": 1.3865187168121338, "learning_rate": 0.00014565585169946824, "loss": 0.904, "step": 875 }, { "epoch": 0.5614035087719298, "grad_norm": 1.5330930948257446, "learning_rate": 0.0001439857968366256, "loss": 0.903, "step": 880 }, { "epoch": 0.5645933014354066, "grad_norm": 1.0300610065460205, "learning_rate": 0.00014231648837980022, "loss": 0.8837, "step": 885 }, { "epoch": 0.5677830940988836, "grad_norm": 1.1675297021865845, "learning_rate": 0.00014064813350221894, "loss": 0.9344, "step": 890 }, { "epoch": 0.5709728867623605, "grad_norm": 1.1978455781936646, "learning_rate": 0.00013898093925876267, "loss": 0.8725, "step": 895 }, { "epoch": 0.5741626794258373, "grad_norm": 1.0207343101501465, "learning_rate": 0.00013731511256026913, "loss": 0.8891, "step": 900 }, { "epoch": 0.5773524720893142, "grad_norm": 1.192966103553772, "learning_rate": 0.00013565086014785406, "loss": 0.9068, "step": 905 }, { "epoch": 0.580542264752791, "grad_norm": 1.3042093515396118, "learning_rate": 0.00013398838856725257, "loss": 0.9001, "step": 910 }, { "epoch": 0.583732057416268, "grad_norm": 1.0508038997650146, "learning_rate": 0.00013232790414318608, "loss": 0.8838, "step": 915 }, { "epoch": 0.5869218500797448, "grad_norm": 1.1323915719985962, "learning_rate": 0.0001306696129537553, "loss": 0.9089, "step": 920 }, { "epoch": 0.5901116427432217, "grad_norm": 1.5706528425216675, "learning_rate": 0.00012901372080486472, "loss": 0.8854, "step": 925 }, { "epoch": 0.5933014354066986, "grad_norm": 0.9833298325538635, "learning_rate": 0.00012736043320468073, "loss": 0.8725, "step": 930 }, { "epoch": 0.5964912280701754, "grad_norm": 1.0308219194412231, "learning_rate": 0.0001257099553381262, "loss": 0.8864, "step": 935 }, { "epoch": 0.5996810207336523, "grad_norm": 1.3172023296356201, "learning_rate": 0.00012406249204141603, "loss": 0.8876, "step": 940 }, { "epoch": 0.6028708133971292, "grad_norm": 1.1759730577468872, "learning_rate": 0.000122418247776635, "loss": 0.8923, "step": 945 }, { "epoch": 0.6060606060606061, "grad_norm": 1.251922369003296, "learning_rate": 0.00012077742660636299, "loss": 0.8914, "step": 950 }, { "epoch": 0.6092503987240829, "grad_norm": 1.0379743576049805, "learning_rate": 0.00011914023216834904, "loss": 0.8637, "step": 955 }, { "epoch": 0.6124401913875598, "grad_norm": 1.1909719705581665, "learning_rate": 0.0001175068676502386, "loss": 0.8646, "step": 960 }, { "epoch": 0.6156299840510366, "grad_norm": 1.0347654819488525, "learning_rate": 0.00011587753576435634, "loss": 0.8648, "step": 965 }, { "epoch": 0.6188197767145136, "grad_norm": 1.340886116027832, "learning_rate": 0.00011425243872254835, "loss": 0.8753, "step": 970 }, { "epoch": 0.6220095693779905, "grad_norm": 1.0791716575622559, "learning_rate": 0.00011263177821108573, "loss": 0.8905, "step": 975 }, { "epoch": 0.6251993620414673, "grad_norm": 1.0844085216522217, "learning_rate": 0.00011101575536563433, "loss": 0.8742, "step": 980 }, { "epoch": 0.6283891547049442, "grad_norm": 1.2681643962860107, "learning_rate": 0.00010940457074629217, "loss": 0.8707, "step": 985 }, { "epoch": 0.631578947368421, "grad_norm": 1.3830182552337646, "learning_rate": 0.00010779842431269843, "loss": 0.8816, "step": 990 }, { "epoch": 0.6347687400318979, "grad_norm": 1.7231197357177734, "learning_rate": 0.0001061975153992172, "loss": 0.8691, "step": 995 }, { "epoch": 0.6379585326953748, "grad_norm": 1.4428848028182983, "learning_rate": 0.00010460204269019829, "loss": 0.8635, "step": 1000 }, { "epoch": 0.6411483253588517, "grad_norm": 1.5241450071334839, "learning_rate": 0.0001030122041953196, "loss": 0.8796, "step": 1005 }, { "epoch": 0.6443381180223285, "grad_norm": 1.2227340936660767, "learning_rate": 0.0001014281972250121, "loss": 0.8776, "step": 1010 }, { "epoch": 0.6475279106858054, "grad_norm": 1.110999584197998, "learning_rate": 9.985021836597273e-05, "loss": 0.8541, "step": 1015 }, { "epoch": 0.6507177033492823, "grad_norm": 1.0933470726013184, "learning_rate": 9.827846345676614e-05, "loss": 0.8668, "step": 1020 }, { "epoch": 0.6539074960127592, "grad_norm": 1.448918342590332, "learning_rate": 9.671312756351998e-05, "loss": 0.8672, "step": 1025 }, { "epoch": 0.6570972886762361, "grad_norm": 1.1408368349075317, "learning_rate": 9.515440495571569e-05, "loss": 0.8688, "step": 1030 }, { "epoch": 0.6602870813397129, "grad_norm": 0.9816156625747681, "learning_rate": 9.360248908207813e-05, "loss": 0.8623, "step": 1035 }, { "epoch": 0.6634768740031898, "grad_norm": 1.3135740756988525, "learning_rate": 9.205757254656755e-05, "loss": 0.8462, "step": 1040 }, { "epoch": 0.6666666666666666, "grad_norm": 1.1900348663330078, "learning_rate": 9.05198470844756e-05, "loss": 0.8489, "step": 1045 }, { "epoch": 0.6698564593301436, "grad_norm": 1.4275152683258057, "learning_rate": 8.898950353862998e-05, "loss": 0.8703, "step": 1050 }, { "epoch": 0.6730462519936204, "grad_norm": 1.202156901359558, "learning_rate": 8.746673183570923e-05, "loss": 0.8671, "step": 1055 }, { "epoch": 0.6762360446570973, "grad_norm": 1.4236383438110352, "learning_rate": 8.595172096267157e-05, "loss": 0.8948, "step": 1060 }, { "epoch": 0.6794258373205742, "grad_norm": 1.1701873540878296, "learning_rate": 8.444465894330024e-05, "loss": 0.8417, "step": 1065 }, { "epoch": 0.682615629984051, "grad_norm": 1.1106019020080566, "learning_rate": 8.294573281486828e-05, "loss": 0.8628, "step": 1070 }, { "epoch": 0.6858054226475279, "grad_norm": 1.0223338603973389, "learning_rate": 8.145512860492596e-05, "loss": 0.8571, "step": 1075 }, { "epoch": 0.6889952153110048, "grad_norm": 1.305443525314331, "learning_rate": 7.997303130821362e-05, "loss": 0.872, "step": 1080 }, { "epoch": 0.6921850079744817, "grad_norm": 0.9939827919006348, "learning_rate": 7.849962486370206e-05, "loss": 0.8614, "step": 1085 }, { "epoch": 0.6953748006379585, "grad_norm": 1.004202961921692, "learning_rate": 7.703509213176451e-05, "loss": 0.879, "step": 1090 }, { "epoch": 0.6985645933014354, "grad_norm": 0.9680610299110413, "learning_rate": 7.557961487148272e-05, "loss": 0.8788, "step": 1095 }, { "epoch": 0.7017543859649122, "grad_norm": 0.912745475769043, "learning_rate": 7.413337371808884e-05, "loss": 0.8599, "step": 1100 }, { "epoch": 0.7049441786283892, "grad_norm": 1.167737603187561, "learning_rate": 7.269654816054756e-05, "loss": 0.8488, "step": 1105 }, { "epoch": 0.7081339712918661, "grad_norm": 1.242891788482666, "learning_rate": 7.126931651928012e-05, "loss": 0.8472, "step": 1110 }, { "epoch": 0.7113237639553429, "grad_norm": 1.010082721710205, "learning_rate": 6.985185592403367e-05, "loss": 0.8675, "step": 1115 }, { "epoch": 0.7145135566188198, "grad_norm": 1.4250268936157227, "learning_rate": 6.844434229189787e-05, "loss": 0.8626, "step": 1120 }, { "epoch": 0.7177033492822966, "grad_norm": 1.2300432920455933, "learning_rate": 6.704695030547252e-05, "loss": 0.8422, "step": 1125 }, { "epoch": 0.7208931419457735, "grad_norm": 1.642844319343567, "learning_rate": 6.56598533911881e-05, "loss": 0.8562, "step": 1130 }, { "epoch": 0.7240829346092504, "grad_norm": 1.089853048324585, "learning_rate": 6.428322369778254e-05, "loss": 0.8725, "step": 1135 }, { "epoch": 0.7272727272727273, "grad_norm": 0.9274362921714783, "learning_rate": 6.291723207493577e-05, "loss": 0.8328, "step": 1140 }, { "epoch": 0.7304625199362041, "grad_norm": 1.2909477949142456, "learning_rate": 6.15620480520666e-05, "loss": 0.8529, "step": 1145 }, { "epoch": 0.733652312599681, "grad_norm": 1.4193487167358398, "learning_rate": 6.0217839817292765e-05, "loss": 0.8341, "step": 1150 }, { "epoch": 0.7368421052631579, "grad_norm": 1.0142663717269897, "learning_rate": 5.888477419655733e-05, "loss": 0.8574, "step": 1155 }, { "epoch": 0.7400318979266348, "grad_norm": 1.0169768333435059, "learning_rate": 5.7563016632924555e-05, "loss": 0.8384, "step": 1160 }, { "epoch": 0.7432216905901117, "grad_norm": 1.1476997137069702, "learning_rate": 5.625273116604728e-05, "loss": 0.8734, "step": 1165 }, { "epoch": 0.7464114832535885, "grad_norm": 1.1595078706741333, "learning_rate": 5.495408041180829e-05, "loss": 0.8681, "step": 1170 }, { "epoch": 0.7496012759170654, "grad_norm": 1.3419498205184937, "learning_rate": 5.3667225542138507e-05, "loss": 0.8247, "step": 1175 }, { "epoch": 0.7527910685805422, "grad_norm": 1.0906846523284912, "learning_rate": 5.239232626501464e-05, "loss": 0.8724, "step": 1180 }, { "epoch": 0.7559808612440191, "grad_norm": 1.4378533363342285, "learning_rate": 5.1129540804637906e-05, "loss": 0.8566, "step": 1185 }, { "epoch": 0.759170653907496, "grad_norm": 1.0742905139923096, "learning_rate": 4.9879025881797466e-05, "loss": 0.8414, "step": 1190 }, { "epoch": 0.7623604465709729, "grad_norm": 1.0578699111938477, "learning_rate": 4.8640936694420245e-05, "loss": 0.839, "step": 1195 }, { "epoch": 0.7655502392344498, "grad_norm": 1.0157151222229004, "learning_rate": 4.7415426898309704e-05, "loss": 0.8279, "step": 1200 }, { "epoch": 0.7687400318979266, "grad_norm": 1.1732484102249146, "learning_rate": 4.62026485880761e-05, "loss": 0.8406, "step": 1205 }, { "epoch": 0.7719298245614035, "grad_norm": 1.163568139076233, "learning_rate": 4.50027522782603e-05, "loss": 0.8474, "step": 1210 }, { "epoch": 0.7751196172248804, "grad_norm": 1.4089716672897339, "learning_rate": 4.3815886884654136e-05, "loss": 0.8321, "step": 1215 }, { "epoch": 0.7783094098883573, "grad_norm": 1.2893257141113281, "learning_rate": 4.264219970581854e-05, "loss": 0.8578, "step": 1220 }, { "epoch": 0.7814992025518341, "grad_norm": 1.2172292470932007, "learning_rate": 4.148183640480293e-05, "loss": 0.8627, "step": 1225 }, { "epoch": 0.784688995215311, "grad_norm": 1.0065255165100098, "learning_rate": 4.0334940991067276e-05, "loss": 0.8574, "step": 1230 }, { "epoch": 0.7878787878787878, "grad_norm": 0.9151318669319153, "learning_rate": 3.920165580260973e-05, "loss": 0.8571, "step": 1235 }, { "epoch": 0.7910685805422647, "grad_norm": 1.065956473350525, "learning_rate": 3.808212148830095e-05, "loss": 0.8367, "step": 1240 }, { "epoch": 0.7942583732057417, "grad_norm": 1.0603456497192383, "learning_rate": 3.697647699042918e-05, "loss": 0.8336, "step": 1245 }, { "epoch": 0.7974481658692185, "grad_norm": 1.1699525117874146, "learning_rate": 3.5884859527455995e-05, "loss": 0.8453, "step": 1250 }, { "epoch": 0.8006379585326954, "grad_norm": 1.0553467273712158, "learning_rate": 3.48074045769868e-05, "loss": 0.8519, "step": 1255 }, { "epoch": 0.8038277511961722, "grad_norm": 1.0572487115859985, "learning_rate": 3.3744245858956967e-05, "loss": 0.8773, "step": 1260 }, { "epoch": 0.8070175438596491, "grad_norm": 1.0170570611953735, "learning_rate": 3.2695515319036186e-05, "loss": 0.856, "step": 1265 }, { "epoch": 0.810207336523126, "grad_norm": 0.894329309463501, "learning_rate": 3.1661343112253304e-05, "loss": 0.8405, "step": 1270 }, { "epoch": 0.8133971291866029, "grad_norm": 1.1681292057037354, "learning_rate": 3.064185758684265e-05, "loss": 0.8217, "step": 1275 }, { "epoch": 0.8165869218500797, "grad_norm": 1.047298550605774, "learning_rate": 2.96371852683157e-05, "loss": 0.847, "step": 1280 }, { "epoch": 0.8197767145135566, "grad_norm": 1.0787161588668823, "learning_rate": 2.8647450843757897e-05, "loss": 0.8541, "step": 1285 }, { "epoch": 0.8229665071770335, "grad_norm": 1.1034142971038818, "learning_rate": 2.7672777146354246e-05, "loss": 0.8522, "step": 1290 }, { "epoch": 0.8261562998405104, "grad_norm": 1.006184458732605, "learning_rate": 2.6713285140144802e-05, "loss": 0.8551, "step": 1295 }, { "epoch": 0.8293460925039873, "grad_norm": 1.2398364543914795, "learning_rate": 2.5769093905012333e-05, "loss": 0.8363, "step": 1300 }, { "epoch": 0.8325358851674641, "grad_norm": 1.0368655920028687, "learning_rate": 2.4840320621903253e-05, "loss": 0.837, "step": 1305 }, { "epoch": 0.835725677830941, "grad_norm": 1.1005960702896118, "learning_rate": 2.392708055828495e-05, "loss": 0.8372, "step": 1310 }, { "epoch": 0.8389154704944178, "grad_norm": 1.031446099281311, "learning_rate": 2.3029487053840295e-05, "loss": 0.838, "step": 1315 }, { "epoch": 0.8421052631578947, "grad_norm": 0.9887619614601135, "learning_rate": 2.214765150640108e-05, "loss": 0.8521, "step": 1320 }, { "epoch": 0.8452950558213717, "grad_norm": 1.0324478149414062, "learning_rate": 2.1281683358122996e-05, "loss": 0.8391, "step": 1325 }, { "epoch": 0.8484848484848485, "grad_norm": 0.9141530394554138, "learning_rate": 2.043169008190289e-05, "loss": 0.8526, "step": 1330 }, { "epoch": 0.8516746411483254, "grad_norm": 0.9525683522224426, "learning_rate": 1.9597777168040872e-05, "loss": 0.8589, "step": 1335 }, { "epoch": 0.8548644338118022, "grad_norm": 0.9357597231864929, "learning_rate": 1.8780048111147776e-05, "loss": 0.8565, "step": 1340 }, { "epoch": 0.8580542264752791, "grad_norm": 0.989803671836853, "learning_rate": 1.797860439730126e-05, "loss": 0.8097, "step": 1345 }, { "epoch": 0.861244019138756, "grad_norm": 0.9911389946937561, "learning_rate": 1.7193545491450183e-05, "loss": 0.85, "step": 1350 }, { "epoch": 0.8644338118022329, "grad_norm": 0.9085144996643066, "learning_rate": 1.6424968825070567e-05, "loss": 0.83, "step": 1355 }, { "epoch": 0.8676236044657097, "grad_norm": 0.9999231696128845, "learning_rate": 1.567296978407353e-05, "loss": 0.8378, "step": 1360 }, { "epoch": 0.8708133971291866, "grad_norm": 0.9475440979003906, "learning_rate": 1.4937641696967245e-05, "loss": 0.8531, "step": 1365 }, { "epoch": 0.8740031897926634, "grad_norm": 0.985124945640564, "learning_rate": 1.4219075823274251e-05, "loss": 0.8431, "step": 1370 }, { "epoch": 0.8771929824561403, "grad_norm": 0.9646289348602295, "learning_rate": 1.3517361342205295e-05, "loss": 0.8361, "step": 1375 }, { "epoch": 0.8803827751196173, "grad_norm": 0.9125611186027527, "learning_rate": 1.2832585341591844e-05, "loss": 0.8106, "step": 1380 }, { "epoch": 0.8835725677830941, "grad_norm": 1.214565634727478, "learning_rate": 1.2164832807077585e-05, "loss": 0.8441, "step": 1385 }, { "epoch": 0.886762360446571, "grad_norm": 1.085919976234436, "learning_rate": 1.151418661157122e-05, "loss": 0.8271, "step": 1390 }, { "epoch": 0.8899521531100478, "grad_norm": 0.9682278633117676, "learning_rate": 1.0880727504961339e-05, "loss": 0.8293, "step": 1395 }, { "epoch": 0.8931419457735247, "grad_norm": 1.015742540359497, "learning_rate": 1.0264534104094812e-05, "loss": 0.8452, "step": 1400 }, { "epoch": 0.8963317384370016, "grad_norm": 0.9043238162994385, "learning_rate": 9.665682883019732e-06, "loss": 0.8274, "step": 1405 }, { "epoch": 0.8995215311004785, "grad_norm": 0.9109312891960144, "learning_rate": 9.084248163494511e-06, "loss": 0.8332, "step": 1410 }, { "epoch": 0.9027113237639554, "grad_norm": 1.0157884359359741, "learning_rate": 8.520302105764148e-06, "loss": 0.8304, "step": 1415 }, { "epoch": 0.9059011164274322, "grad_norm": 0.9898545742034912, "learning_rate": 7.973914699604367e-06, "loss": 0.8346, "step": 1420 }, { "epoch": 0.9090909090909091, "grad_norm": 0.914945662021637, "learning_rate": 7.445153755635569e-06, "loss": 0.8443, "step": 1425 }, { "epoch": 0.9122807017543859, "grad_norm": 0.9307130575180054, "learning_rate": 6.934084896906983e-06, "loss": 0.8429, "step": 1430 }, { "epoch": 0.9154704944178629, "grad_norm": 1.0683540105819702, "learning_rate": 6.440771550752377e-06, "loss": 0.8504, "step": 1435 }, { "epoch": 0.9186602870813397, "grad_norm": 0.9779647588729858, "learning_rate": 5.965274940918274e-06, "loss": 0.8422, "step": 1440 }, { "epoch": 0.9218500797448166, "grad_norm": 0.9050576686859131, "learning_rate": 5.507654079965612e-06, "loss": 0.8186, "step": 1445 }, { "epoch": 0.9250398724082934, "grad_norm": 1.057937502861023, "learning_rate": 5.067965761945869e-06, "loss": 0.8447, "step": 1450 }, { "epoch": 0.9282296650717703, "grad_norm": 0.9198818206787109, "learning_rate": 4.646264555352586e-06, "loss": 0.8252, "step": 1455 }, { "epoch": 0.9314194577352473, "grad_norm": 1.075323462486267, "learning_rate": 4.242602796348915e-06, "loss": 0.8197, "step": 1460 }, { "epoch": 0.9346092503987241, "grad_norm": 1.1147295236587524, "learning_rate": 3.857030582272369e-06, "loss": 0.8224, "step": 1465 }, { "epoch": 0.937799043062201, "grad_norm": 1.0000038146972656, "learning_rate": 3.489595765417441e-06, "loss": 0.7997, "step": 1470 }, { "epoch": 0.9409888357256778, "grad_norm": 0.9286321401596069, "learning_rate": 3.140343947096624e-06, "loss": 0.8055, "step": 1475 }, { "epoch": 0.9441786283891547, "grad_norm": 0.9910067915916443, "learning_rate": 2.80931847198117e-06, "loss": 0.8619, "step": 1480 }, { "epoch": 0.9473684210526315, "grad_norm": 1.0662553310394287, "learning_rate": 2.4965604227215774e-06, "loss": 0.8424, "step": 1485 }, { "epoch": 0.9505582137161085, "grad_norm": 0.9524323344230652, "learning_rate": 2.202108614848885e-06, "loss": 0.8281, "step": 1490 }, { "epoch": 0.9537480063795853, "grad_norm": 0.910805344581604, "learning_rate": 1.925999591957561e-06, "loss": 0.8206, "step": 1495 }, { "epoch": 0.9569377990430622, "grad_norm": 0.9802441596984863, "learning_rate": 1.6682676211700107e-06, "loss": 0.8394, "step": 1500 }, { "epoch": 0.960127591706539, "grad_norm": 0.907018780708313, "learning_rate": 1.4289446888838652e-06, "loss": 0.8453, "step": 1505 }, { "epoch": 0.9633173843700159, "grad_norm": 1.0577489137649536, "learning_rate": 1.2080604968022378e-06, "loss": 0.8119, "step": 1510 }, { "epoch": 0.9665071770334929, "grad_norm": 0.9329078197479248, "learning_rate": 1.0056424582474575e-06, "loss": 0.8241, "step": 1515 }, { "epoch": 0.9696969696969697, "grad_norm": 0.9109911322593689, "learning_rate": 8.217156947590064e-07, "loss": 0.8181, "step": 1520 }, { "epoch": 0.9728867623604466, "grad_norm": 0.9339520931243896, "learning_rate": 6.563030329755969e-07, "loss": 0.805, "step": 1525 }, { "epoch": 0.9760765550239234, "grad_norm": 1.0125936269760132, "learning_rate": 5.094250018023715e-07, "loss": 0.85, "step": 1530 }, { "epoch": 0.9792663476874003, "grad_norm": 0.9169021844863892, "learning_rate": 3.8109982986300747e-07, "loss": 0.8446, "step": 1535 }, { "epoch": 0.9824561403508771, "grad_norm": 0.9390758872032166, "learning_rate": 2.7134344323747616e-07, "loss": 0.8234, "step": 1540 }, { "epoch": 0.9856459330143541, "grad_norm": 0.9467016458511353, "learning_rate": 1.801694634854578e-07, "loss": 0.8461, "step": 1545 }, { "epoch": 0.988835725677831, "grad_norm": 1.0382705926895142, "learning_rate": 1.075892059558603e-07, "loss": 0.8478, "step": 1550 }, { "epoch": 0.9920255183413078, "grad_norm": 0.9475221633911133, "learning_rate": 5.3611678382442516e-08, "loss": 0.8307, "step": 1555 }, { "epoch": 0.9952153110047847, "grad_norm": 0.9038047790527344, "learning_rate": 1.824357976594193e-08, "loss": 0.8018, "step": 1560 }, { "epoch": 0.9984051036682615, "grad_norm": 0.977640688419342, "learning_rate": 1.4892995426396548e-09, "loss": 0.8292, "step": 1565 }, { "epoch": 0.9996810207336523, "eval_loss": 2.336217164993286, "eval_runtime": 2.955, "eval_samples_per_second": 3.384, "eval_steps_per_second": 0.677, "step": 1567 }, { "epoch": 0.9996810207336523, "step": 1567, "total_flos": 6.008655759456338e+17, "train_loss": 1.5068606398405355, "train_runtime": 9641.8347, "train_samples_per_second": 1.3, "train_steps_per_second": 0.163 } ], "logging_steps": 5, "max_steps": 1567, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.008655759456338e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }