{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.762437640614301, "eval_steps": 100, "global_step": 35300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0019563728846718185, "grad_norm": NaN, "learning_rate": 4.999021756143372e-05, "loss": 16559.3187, "step": 25 }, { "epoch": 0.003912745769343637, "grad_norm": NaN, "learning_rate": 4.998043512286743e-05, "loss": 0.0, "step": 50 }, { "epoch": 0.005869118654015455, "grad_norm": NaN, "learning_rate": 4.997065268430115e-05, "loss": 0.0, "step": 75 }, { "epoch": 0.007825491538687274, "grad_norm": NaN, "learning_rate": 4.996087024573486e-05, "loss": 0.0, "step": 100 }, { "epoch": 0.007825491538687274, "eval_loss": NaN, "eval_runtime": 295.7989, "eval_samples_per_second": 406.746, "eval_steps_per_second": 6.356, "step": 100 }, { "epoch": 0.009781864423359092, "grad_norm": NaN, "learning_rate": 4.9951087807168576e-05, "loss": 0.0, "step": 125 }, { "epoch": 0.01173823730803091, "grad_norm": NaN, "learning_rate": 4.994130536860229e-05, "loss": 0.0, "step": 150 }, { "epoch": 0.01369461019270273, "grad_norm": NaN, "learning_rate": 4.9931522930036004e-05, "loss": 0.0, "step": 175 }, { "epoch": 0.015650983077374548, "grad_norm": NaN, "learning_rate": 4.9921740491469715e-05, "loss": 0.0, "step": 200 }, { "epoch": 0.015650983077374548, "eval_loss": NaN, "eval_runtime": 294.9422, "eval_samples_per_second": 407.927, "eval_steps_per_second": 6.374, "step": 200 }, { "epoch": 0.017607355962046364, "grad_norm": NaN, "learning_rate": 4.991195805290343e-05, "loss": 0.0, "step": 225 }, { "epoch": 0.019563728846718184, "grad_norm": NaN, "learning_rate": 4.990217561433714e-05, "loss": 0.0, "step": 250 }, { "epoch": 0.021520101731390004, "grad_norm": NaN, "learning_rate": 4.989239317577086e-05, "loss": 0.0, "step": 275 }, { "epoch": 0.02347647461606182, "grad_norm": NaN, "learning_rate": 4.988261073720457e-05, "loss": 0.0, "step": 300 }, { "epoch": 0.02347647461606182, "eval_loss": NaN, "eval_runtime": 293.577, "eval_samples_per_second": 409.824, "eval_steps_per_second": 6.404, "step": 300 }, { "epoch": 0.02543284750073364, "grad_norm": NaN, "learning_rate": 4.987282829863829e-05, "loss": 0.0, "step": 325 }, { "epoch": 0.02738922038540546, "grad_norm": NaN, "learning_rate": 4.9863045860072e-05, "loss": 0.0, "step": 350 }, { "epoch": 0.029345593270077276, "grad_norm": NaN, "learning_rate": 4.985326342150572e-05, "loss": 0.0, "step": 375 }, { "epoch": 0.031301966154749096, "grad_norm": NaN, "learning_rate": 4.984348098293943e-05, "loss": 0.0, "step": 400 }, { "epoch": 0.031301966154749096, "eval_loss": NaN, "eval_runtime": 295.5517, "eval_samples_per_second": 407.086, "eval_steps_per_second": 6.361, "step": 400 }, { "epoch": 0.033258339039420916, "grad_norm": NaN, "learning_rate": 4.9833698544373145e-05, "loss": 0.0, "step": 425 }, { "epoch": 0.03521471192409273, "grad_norm": NaN, "learning_rate": 4.9823916105806856e-05, "loss": 0.0, "step": 450 }, { "epoch": 0.03717108480876455, "grad_norm": NaN, "learning_rate": 4.981413366724057e-05, "loss": 0.0, "step": 475 }, { "epoch": 0.03912745769343637, "grad_norm": NaN, "learning_rate": 4.9804351228674284e-05, "loss": 0.0, "step": 500 }, { "epoch": 0.03912745769343637, "eval_loss": NaN, "eval_runtime": 295.0369, "eval_samples_per_second": 407.796, "eval_steps_per_second": 6.372, "step": 500 }, { "epoch": 0.04108383057810819, "grad_norm": NaN, "learning_rate": 4.9794568790108e-05, "loss": 0.0, "step": 525 }, { "epoch": 0.04304020346278001, "grad_norm": NaN, "learning_rate": 4.978478635154171e-05, "loss": 0.0, "step": 550 }, { "epoch": 0.04499657634745183, "grad_norm": NaN, "learning_rate": 4.977500391297543e-05, "loss": 0.0, "step": 575 }, { "epoch": 0.04695294923212364, "grad_norm": NaN, "learning_rate": 4.976522147440914e-05, "loss": 0.0, "step": 600 }, { "epoch": 0.04695294923212364, "eval_loss": NaN, "eval_runtime": 295.3378, "eval_samples_per_second": 407.381, "eval_steps_per_second": 6.366, "step": 600 }, { "epoch": 0.04890932211679546, "grad_norm": NaN, "learning_rate": 4.975543903584286e-05, "loss": 0.0, "step": 625 }, { "epoch": 0.05086569500146728, "grad_norm": NaN, "learning_rate": 4.9745656597276575e-05, "loss": 0.0, "step": 650 }, { "epoch": 0.0528220678861391, "grad_norm": NaN, "learning_rate": 4.9735874158710286e-05, "loss": 0.0, "step": 675 }, { "epoch": 0.05477844077081092, "grad_norm": NaN, "learning_rate": 4.9726091720144003e-05, "loss": 0.0, "step": 700 }, { "epoch": 0.05477844077081092, "eval_loss": NaN, "eval_runtime": 294.0048, "eval_samples_per_second": 409.228, "eval_steps_per_second": 6.394, "step": 700 }, { "epoch": 0.05673481365548273, "grad_norm": NaN, "learning_rate": 4.9716309281577714e-05, "loss": 0.0, "step": 725 }, { "epoch": 0.05869118654015455, "grad_norm": NaN, "learning_rate": 4.970652684301143e-05, "loss": 0.0, "step": 750 }, { "epoch": 0.06064755942482637, "grad_norm": NaN, "learning_rate": 4.969674440444514e-05, "loss": 0.0, "step": 775 }, { "epoch": 0.06260393230949819, "grad_norm": NaN, "learning_rate": 4.968696196587886e-05, "loss": 0.0, "step": 800 }, { "epoch": 0.06260393230949819, "eval_loss": NaN, "eval_runtime": 295.8848, "eval_samples_per_second": 406.628, "eval_steps_per_second": 6.354, "step": 800 }, { "epoch": 0.06456030519417001, "grad_norm": NaN, "learning_rate": 4.967717952731257e-05, "loss": 0.0, "step": 825 }, { "epoch": 0.06651667807884183, "grad_norm": NaN, "learning_rate": 4.966739708874629e-05, "loss": 0.0, "step": 850 }, { "epoch": 0.06847305096351365, "grad_norm": NaN, "learning_rate": 4.965761465018e-05, "loss": 0.0, "step": 875 }, { "epoch": 0.07042942384818546, "grad_norm": NaN, "learning_rate": 4.9647832211613716e-05, "loss": 0.0, "step": 900 }, { "epoch": 0.07042942384818546, "eval_loss": NaN, "eval_runtime": 295.5382, "eval_samples_per_second": 407.105, "eval_steps_per_second": 6.361, "step": 900 }, { "epoch": 0.07238579673285728, "grad_norm": NaN, "learning_rate": 4.963804977304743e-05, "loss": 0.0, "step": 925 }, { "epoch": 0.0743421696175291, "grad_norm": NaN, "learning_rate": 4.9628267334481144e-05, "loss": 0.0, "step": 950 }, { "epoch": 0.07629854250220092, "grad_norm": NaN, "learning_rate": 4.9618484895914855e-05, "loss": 0.0, "step": 975 }, { "epoch": 0.07825491538687274, "grad_norm": NaN, "learning_rate": 4.960870245734857e-05, "loss": 0.0, "step": 1000 }, { "epoch": 0.07825491538687274, "eval_loss": NaN, "eval_runtime": 296.9914, "eval_samples_per_second": 405.113, "eval_steps_per_second": 6.33, "step": 1000 }, { "epoch": 0.08021128827154456, "grad_norm": NaN, "learning_rate": 4.959892001878228e-05, "loss": 0.0, "step": 1025 }, { "epoch": 0.08216766115621638, "grad_norm": NaN, "learning_rate": 4.9589137580216e-05, "loss": 0.0, "step": 1050 }, { "epoch": 0.0841240340408882, "grad_norm": NaN, "learning_rate": 4.957935514164971e-05, "loss": 0.0, "step": 1075 }, { "epoch": 0.08608040692556002, "grad_norm": NaN, "learning_rate": 4.956957270308343e-05, "loss": 0.0, "step": 1100 }, { "epoch": 0.08608040692556002, "eval_loss": NaN, "eval_runtime": 296.9879, "eval_samples_per_second": 405.117, "eval_steps_per_second": 6.33, "step": 1100 }, { "epoch": 0.08803677981023184, "grad_norm": NaN, "learning_rate": 4.955979026451714e-05, "loss": 0.0, "step": 1125 }, { "epoch": 0.08999315269490366, "grad_norm": NaN, "learning_rate": 4.955000782595086e-05, "loss": 0.0, "step": 1150 }, { "epoch": 0.09194952557957546, "grad_norm": NaN, "learning_rate": 4.954022538738457e-05, "loss": 0.0, "step": 1175 }, { "epoch": 0.09390589846424728, "grad_norm": NaN, "learning_rate": 4.9530442948818285e-05, "loss": 0.0, "step": 1200 }, { "epoch": 0.09390589846424728, "eval_loss": NaN, "eval_runtime": 295.7662, "eval_samples_per_second": 406.791, "eval_steps_per_second": 6.356, "step": 1200 }, { "epoch": 0.0958622713489191, "grad_norm": NaN, "learning_rate": 4.9520660510251996e-05, "loss": 0.0, "step": 1225 }, { "epoch": 0.09781864423359092, "grad_norm": NaN, "learning_rate": 4.951087807168571e-05, "loss": 0.0, "step": 1250 }, { "epoch": 0.09977501711826274, "grad_norm": NaN, "learning_rate": 4.9501095633119424e-05, "loss": 0.0, "step": 1275 }, { "epoch": 0.10173139000293456, "grad_norm": NaN, "learning_rate": 4.949131319455314e-05, "loss": 0.0, "step": 1300 }, { "epoch": 0.10173139000293456, "eval_loss": NaN, "eval_runtime": 296.6574, "eval_samples_per_second": 405.569, "eval_steps_per_second": 6.337, "step": 1300 }, { "epoch": 0.10368776288760638, "grad_norm": NaN, "learning_rate": 4.948153075598685e-05, "loss": 0.0, "step": 1325 }, { "epoch": 0.1056441357722782, "grad_norm": NaN, "learning_rate": 4.947174831742057e-05, "loss": 0.0, "step": 1350 }, { "epoch": 0.10760050865695002, "grad_norm": NaN, "learning_rate": 4.946196587885428e-05, "loss": 0.0, "step": 1375 }, { "epoch": 0.10955688154162184, "grad_norm": NaN, "learning_rate": 4.9452183440288e-05, "loss": 0.0, "step": 1400 }, { "epoch": 0.10955688154162184, "eval_loss": NaN, "eval_runtime": 297.0487, "eval_samples_per_second": 405.035, "eval_steps_per_second": 6.329, "step": 1400 }, { "epoch": 0.11151325442629365, "grad_norm": NaN, "learning_rate": 4.944240100172171e-05, "loss": 0.0, "step": 1425 }, { "epoch": 0.11346962731096547, "grad_norm": NaN, "learning_rate": 4.9432618563155426e-05, "loss": 0.0, "step": 1450 }, { "epoch": 0.11542600019563728, "grad_norm": NaN, "learning_rate": 4.942283612458914e-05, "loss": 0.0, "step": 1475 }, { "epoch": 0.1173823730803091, "grad_norm": NaN, "learning_rate": 4.9413053686022854e-05, "loss": 0.0, "step": 1500 }, { "epoch": 0.1173823730803091, "eval_loss": NaN, "eval_runtime": 294.7911, "eval_samples_per_second": 408.136, "eval_steps_per_second": 6.377, "step": 1500 }, { "epoch": 0.11933874596498092, "grad_norm": NaN, "learning_rate": 4.9403271247456565e-05, "loss": 0.0, "step": 1525 }, { "epoch": 0.12129511884965274, "grad_norm": NaN, "learning_rate": 4.939348880889028e-05, "loss": 0.0, "step": 1550 }, { "epoch": 0.12325149173432456, "grad_norm": NaN, "learning_rate": 4.938370637032399e-05, "loss": 0.0, "step": 1575 }, { "epoch": 0.12520786461899638, "grad_norm": NaN, "learning_rate": 4.937392393175771e-05, "loss": 0.0, "step": 1600 }, { "epoch": 0.12520786461899638, "eval_loss": NaN, "eval_runtime": 317.7116, "eval_samples_per_second": 378.692, "eval_steps_per_second": 5.917, "step": 1600 }, { "epoch": 0.1271642375036682, "grad_norm": NaN, "learning_rate": 4.936414149319142e-05, "loss": 0.0, "step": 1625 }, { "epoch": 0.12912061038834002, "grad_norm": NaN, "learning_rate": 4.935435905462514e-05, "loss": 0.0, "step": 1650 }, { "epoch": 0.13107698327301184, "grad_norm": NaN, "learning_rate": 4.934457661605885e-05, "loss": 0.0, "step": 1675 }, { "epoch": 0.13303335615768366, "grad_norm": NaN, "learning_rate": 4.933479417749257e-05, "loss": 0.0, "step": 1700 }, { "epoch": 0.13303335615768366, "eval_loss": NaN, "eval_runtime": 296.8557, "eval_samples_per_second": 405.298, "eval_steps_per_second": 6.333, "step": 1700 }, { "epoch": 0.13498972904235548, "grad_norm": NaN, "learning_rate": 4.932501173892628e-05, "loss": 0.0, "step": 1725 }, { "epoch": 0.1369461019270273, "grad_norm": NaN, "learning_rate": 4.9315229300359995e-05, "loss": 0.0, "step": 1750 }, { "epoch": 0.13890247481169912, "grad_norm": NaN, "learning_rate": 4.9305446861793706e-05, "loss": 0.0, "step": 1775 }, { "epoch": 0.14085884769637091, "grad_norm": NaN, "learning_rate": 4.929566442322742e-05, "loss": 0.0, "step": 1800 }, { "epoch": 0.14085884769637091, "eval_loss": NaN, "eval_runtime": 465.8904, "eval_samples_per_second": 258.247, "eval_steps_per_second": 4.035, "step": 1800 }, { "epoch": 0.14281522058104273, "grad_norm": NaN, "learning_rate": 4.9285881984661134e-05, "loss": 0.0, "step": 1825 }, { "epoch": 0.14477159346571455, "grad_norm": NaN, "learning_rate": 4.927609954609485e-05, "loss": 0.0, "step": 1850 }, { "epoch": 0.14672796635038637, "grad_norm": NaN, "learning_rate": 4.926631710752856e-05, "loss": 0.0, "step": 1875 }, { "epoch": 0.1486843392350582, "grad_norm": NaN, "learning_rate": 4.925653466896228e-05, "loss": 0.0, "step": 1900 }, { "epoch": 0.1486843392350582, "eval_loss": NaN, "eval_runtime": 296.3111, "eval_samples_per_second": 406.043, "eval_steps_per_second": 6.345, "step": 1900 }, { "epoch": 0.15064071211973, "grad_norm": NaN, "learning_rate": 4.924675223039599e-05, "loss": 0.0, "step": 1925 }, { "epoch": 0.15259708500440183, "grad_norm": NaN, "learning_rate": 4.9236969791829714e-05, "loss": 0.0, "step": 1950 }, { "epoch": 0.15455345788907365, "grad_norm": NaN, "learning_rate": 4.9227187353263425e-05, "loss": 0.0, "step": 1975 }, { "epoch": 0.15650983077374547, "grad_norm": NaN, "learning_rate": 4.921740491469714e-05, "loss": 0.0, "step": 2000 }, { "epoch": 0.15650983077374547, "eval_loss": NaN, "eval_runtime": 295.9552, "eval_samples_per_second": 406.531, "eval_steps_per_second": 6.352, "step": 2000 }, { "epoch": 0.1584662036584173, "grad_norm": NaN, "learning_rate": 4.920762247613085e-05, "loss": 0.0, "step": 2025 }, { "epoch": 0.1604225765430891, "grad_norm": NaN, "learning_rate": 4.919784003756457e-05, "loss": 0.0, "step": 2050 }, { "epoch": 0.16237894942776093, "grad_norm": NaN, "learning_rate": 4.918805759899828e-05, "loss": 0.0, "step": 2075 }, { "epoch": 0.16433532231243275, "grad_norm": NaN, "learning_rate": 4.9178275160432e-05, "loss": 0.0, "step": 2100 }, { "epoch": 0.16433532231243275, "eval_loss": NaN, "eval_runtime": 294.8614, "eval_samples_per_second": 408.039, "eval_steps_per_second": 6.376, "step": 2100 }, { "epoch": 0.16629169519710457, "grad_norm": NaN, "learning_rate": 4.916849272186571e-05, "loss": 0.0, "step": 2125 }, { "epoch": 0.1682480680817764, "grad_norm": NaN, "learning_rate": 4.915871028329943e-05, "loss": 0.0, "step": 2150 }, { "epoch": 0.1702044409664482, "grad_norm": NaN, "learning_rate": 4.914892784473314e-05, "loss": 0.0, "step": 2175 }, { "epoch": 0.17216081385112003, "grad_norm": NaN, "learning_rate": 4.9139145406166855e-05, "loss": 0.0, "step": 2200 }, { "epoch": 0.17216081385112003, "eval_loss": NaN, "eval_runtime": 296.3624, "eval_samples_per_second": 405.973, "eval_steps_per_second": 6.344, "step": 2200 }, { "epoch": 0.17411718673579185, "grad_norm": NaN, "learning_rate": 4.9129362967600566e-05, "loss": 0.0, "step": 2225 }, { "epoch": 0.17607355962046367, "grad_norm": NaN, "learning_rate": 4.9119580529034283e-05, "loss": 0.0, "step": 2250 }, { "epoch": 0.1780299325051355, "grad_norm": NaN, "learning_rate": 4.9109798090467994e-05, "loss": 0.0, "step": 2275 }, { "epoch": 0.1799863053898073, "grad_norm": NaN, "learning_rate": 4.910001565190171e-05, "loss": 0.0, "step": 2300 }, { "epoch": 0.1799863053898073, "eval_loss": NaN, "eval_runtime": 299.4977, "eval_samples_per_second": 401.723, "eval_steps_per_second": 6.277, "step": 2300 }, { "epoch": 0.1819426782744791, "grad_norm": NaN, "learning_rate": 4.909023321333542e-05, "loss": 0.0, "step": 2325 }, { "epoch": 0.18389905115915092, "grad_norm": NaN, "learning_rate": 4.908045077476914e-05, "loss": 0.0, "step": 2350 }, { "epoch": 0.18585542404382274, "grad_norm": NaN, "learning_rate": 4.907066833620285e-05, "loss": 0.0, "step": 2375 }, { "epoch": 0.18781179692849456, "grad_norm": NaN, "learning_rate": 4.906088589763657e-05, "loss": 0.0, "step": 2400 }, { "epoch": 0.18781179692849456, "eval_loss": NaN, "eval_runtime": 296.149, "eval_samples_per_second": 406.265, "eval_steps_per_second": 6.348, "step": 2400 }, { "epoch": 0.18976816981316638, "grad_norm": NaN, "learning_rate": 4.905110345907028e-05, "loss": 0.0, "step": 2425 }, { "epoch": 0.1917245426978382, "grad_norm": NaN, "learning_rate": 4.9041321020503996e-05, "loss": 0.0, "step": 2450 }, { "epoch": 0.19368091558251002, "grad_norm": NaN, "learning_rate": 4.903153858193771e-05, "loss": 0.0, "step": 2475 }, { "epoch": 0.19563728846718184, "grad_norm": NaN, "learning_rate": 4.9021756143371424e-05, "loss": 0.0, "step": 2500 }, { "epoch": 0.19563728846718184, "eval_loss": NaN, "eval_runtime": 298.9773, "eval_samples_per_second": 402.422, "eval_steps_per_second": 6.288, "step": 2500 }, { "epoch": 0.19759366135185366, "grad_norm": NaN, "learning_rate": 4.9011973704805135e-05, "loss": 0.0, "step": 2525 }, { "epoch": 0.19955003423652548, "grad_norm": NaN, "learning_rate": 4.900219126623885e-05, "loss": 0.0, "step": 2550 }, { "epoch": 0.2015064071211973, "grad_norm": NaN, "learning_rate": 4.899240882767256e-05, "loss": 0.0, "step": 2575 }, { "epoch": 0.20346278000586912, "grad_norm": NaN, "learning_rate": 4.898262638910628e-05, "loss": 0.0, "step": 2600 }, { "epoch": 0.20346278000586912, "eval_loss": NaN, "eval_runtime": 295.4699, "eval_samples_per_second": 407.199, "eval_steps_per_second": 6.363, "step": 2600 }, { "epoch": 0.20541915289054094, "grad_norm": NaN, "learning_rate": 4.897284395053999e-05, "loss": 0.0, "step": 2625 }, { "epoch": 0.20737552577521276, "grad_norm": NaN, "learning_rate": 4.896306151197371e-05, "loss": 0.0, "step": 2650 }, { "epoch": 0.20933189865988458, "grad_norm": NaN, "learning_rate": 4.895327907340742e-05, "loss": 0.0, "step": 2675 }, { "epoch": 0.2112882715445564, "grad_norm": NaN, "learning_rate": 4.894349663484114e-05, "loss": 0.0, "step": 2700 }, { "epoch": 0.2112882715445564, "eval_loss": NaN, "eval_runtime": 295.2782, "eval_samples_per_second": 407.463, "eval_steps_per_second": 6.367, "step": 2700 }, { "epoch": 0.21324464442922822, "grad_norm": NaN, "learning_rate": 4.893371419627485e-05, "loss": 0.0, "step": 2725 }, { "epoch": 0.21520101731390004, "grad_norm": NaN, "learning_rate": 4.8923931757708565e-05, "loss": 0.0, "step": 2750 }, { "epoch": 0.21715739019857186, "grad_norm": NaN, "learning_rate": 4.8914149319142276e-05, "loss": 0.0, "step": 2775 }, { "epoch": 0.21911376308324368, "grad_norm": NaN, "learning_rate": 4.890436688057599e-05, "loss": 0.0, "step": 2800 }, { "epoch": 0.21911376308324368, "eval_loss": NaN, "eval_runtime": 295.1417, "eval_samples_per_second": 407.652, "eval_steps_per_second": 6.37, "step": 2800 }, { "epoch": 0.22107013596791547, "grad_norm": NaN, "learning_rate": 4.8894584442009704e-05, "loss": 0.0, "step": 2825 }, { "epoch": 0.2230265088525873, "grad_norm": NaN, "learning_rate": 4.888480200344342e-05, "loss": 0.0, "step": 2850 }, { "epoch": 0.2249828817372591, "grad_norm": NaN, "learning_rate": 4.887501956487713e-05, "loss": 0.0, "step": 2875 }, { "epoch": 0.22693925462193093, "grad_norm": NaN, "learning_rate": 4.886523712631085e-05, "loss": 0.0, "step": 2900 }, { "epoch": 0.22693925462193093, "eval_loss": NaN, "eval_runtime": 296.3389, "eval_samples_per_second": 406.005, "eval_steps_per_second": 6.344, "step": 2900 }, { "epoch": 0.22889562750660275, "grad_norm": NaN, "learning_rate": 4.885545468774456e-05, "loss": 0.0, "step": 2925 }, { "epoch": 0.23085200039127457, "grad_norm": NaN, "learning_rate": 4.884567224917828e-05, "loss": 0.0, "step": 2950 }, { "epoch": 0.2328083732759464, "grad_norm": NaN, "learning_rate": 4.883588981061199e-05, "loss": 0.0, "step": 2975 }, { "epoch": 0.2347647461606182, "grad_norm": NaN, "learning_rate": 4.8826107372045706e-05, "loss": 0.0, "step": 3000 }, { "epoch": 0.2347647461606182, "eval_loss": NaN, "eval_runtime": 296.0382, "eval_samples_per_second": 406.417, "eval_steps_per_second": 6.351, "step": 3000 }, { "epoch": 0.23672111904529003, "grad_norm": NaN, "learning_rate": 4.881632493347942e-05, "loss": 0.0, "step": 3025 }, { "epoch": 0.23867749192996185, "grad_norm": NaN, "learning_rate": 4.8806542494913134e-05, "loss": 0.0, "step": 3050 }, { "epoch": 0.24063386481463367, "grad_norm": NaN, "learning_rate": 4.8796760056346845e-05, "loss": 0.0, "step": 3075 }, { "epoch": 0.2425902376993055, "grad_norm": NaN, "learning_rate": 4.878697761778056e-05, "loss": 0.0, "step": 3100 }, { "epoch": 0.2425902376993055, "eval_loss": NaN, "eval_runtime": 297.342, "eval_samples_per_second": 404.635, "eval_steps_per_second": 6.323, "step": 3100 }, { "epoch": 0.2445466105839773, "grad_norm": NaN, "learning_rate": 4.877719517921427e-05, "loss": 0.0, "step": 3125 }, { "epoch": 0.24650298346864913, "grad_norm": NaN, "learning_rate": 4.876741274064799e-05, "loss": 0.0, "step": 3150 }, { "epoch": 0.24845935635332095, "grad_norm": NaN, "learning_rate": 4.87576303020817e-05, "loss": 0.0, "step": 3175 }, { "epoch": 0.25041572923799277, "grad_norm": NaN, "learning_rate": 4.874784786351542e-05, "loss": 0.0, "step": 3200 }, { "epoch": 0.25041572923799277, "eval_loss": NaN, "eval_runtime": 296.9861, "eval_samples_per_second": 405.12, "eval_steps_per_second": 6.33, "step": 3200 }, { "epoch": 0.2523721021226646, "grad_norm": NaN, "learning_rate": 4.8738065424949136e-05, "loss": 0.0, "step": 3225 }, { "epoch": 0.2543284750073364, "grad_norm": NaN, "learning_rate": 4.8728282986382854e-05, "loss": 0.0, "step": 3250 }, { "epoch": 0.2562848478920082, "grad_norm": NaN, "learning_rate": 4.8718500547816564e-05, "loss": 0.0, "step": 3275 }, { "epoch": 0.25824122077668005, "grad_norm": NaN, "learning_rate": 4.870871810925028e-05, "loss": 0.0, "step": 3300 }, { "epoch": 0.25824122077668005, "eval_loss": NaN, "eval_runtime": 297.7569, "eval_samples_per_second": 404.071, "eval_steps_per_second": 6.314, "step": 3300 }, { "epoch": 0.26019759366135187, "grad_norm": NaN, "learning_rate": 4.869893567068399e-05, "loss": 0.0, "step": 3325 }, { "epoch": 0.2621539665460237, "grad_norm": NaN, "learning_rate": 4.868915323211771e-05, "loss": 0.0, "step": 3350 }, { "epoch": 0.2641103394306955, "grad_norm": NaN, "learning_rate": 4.867937079355142e-05, "loss": 0.0, "step": 3375 }, { "epoch": 0.2660667123153673, "grad_norm": NaN, "learning_rate": 4.866958835498514e-05, "loss": 0.0, "step": 3400 }, { "epoch": 0.2660667123153673, "eval_loss": NaN, "eval_runtime": 297.6148, "eval_samples_per_second": 404.264, "eval_steps_per_second": 6.317, "step": 3400 }, { "epoch": 0.26802308520003915, "grad_norm": NaN, "learning_rate": 4.865980591641885e-05, "loss": 0.0, "step": 3425 }, { "epoch": 0.26997945808471097, "grad_norm": NaN, "learning_rate": 4.8650023477852566e-05, "loss": 0.0, "step": 3450 }, { "epoch": 0.2719358309693828, "grad_norm": NaN, "learning_rate": 4.864024103928628e-05, "loss": 0.0, "step": 3475 }, { "epoch": 0.2738922038540546, "grad_norm": NaN, "learning_rate": 4.8630458600719994e-05, "loss": 0.0, "step": 3500 }, { "epoch": 0.2738922038540546, "eval_loss": NaN, "eval_runtime": 296.9235, "eval_samples_per_second": 405.205, "eval_steps_per_second": 6.332, "step": 3500 }, { "epoch": 0.2758485767387264, "grad_norm": NaN, "learning_rate": 4.8620676162153705e-05, "loss": 0.0, "step": 3525 }, { "epoch": 0.27780494962339825, "grad_norm": NaN, "learning_rate": 4.861089372358742e-05, "loss": 0.0, "step": 3550 }, { "epoch": 0.27976132250807006, "grad_norm": NaN, "learning_rate": 4.860111128502113e-05, "loss": 0.0, "step": 3575 }, { "epoch": 0.28171769539274183, "grad_norm": NaN, "learning_rate": 4.859132884645485e-05, "loss": 0.0, "step": 3600 }, { "epoch": 0.28171769539274183, "eval_loss": NaN, "eval_runtime": 297.3479, "eval_samples_per_second": 404.627, "eval_steps_per_second": 6.323, "step": 3600 }, { "epoch": 0.28367406827741365, "grad_norm": NaN, "learning_rate": 4.858154640788856e-05, "loss": 0.0, "step": 3625 }, { "epoch": 0.28563044116208547, "grad_norm": NaN, "learning_rate": 4.857176396932228e-05, "loss": 0.0, "step": 3650 }, { "epoch": 0.2875868140467573, "grad_norm": NaN, "learning_rate": 4.856198153075599e-05, "loss": 0.0, "step": 3675 }, { "epoch": 0.2895431869314291, "grad_norm": NaN, "learning_rate": 4.855219909218971e-05, "loss": 0.0, "step": 3700 }, { "epoch": 0.2895431869314291, "eval_loss": NaN, "eval_runtime": 297.174, "eval_samples_per_second": 404.864, "eval_steps_per_second": 6.326, "step": 3700 }, { "epoch": 0.29149955981610093, "grad_norm": NaN, "learning_rate": 4.854241665362342e-05, "loss": 0.0, "step": 3725 }, { "epoch": 0.29345593270077275, "grad_norm": NaN, "learning_rate": 4.8532634215057135e-05, "loss": 0.0, "step": 3750 }, { "epoch": 0.29541230558544457, "grad_norm": NaN, "learning_rate": 4.8522851776490846e-05, "loss": 0.0, "step": 3775 }, { "epoch": 0.2973686784701164, "grad_norm": NaN, "learning_rate": 4.851306933792456e-05, "loss": 0.0, "step": 3800 }, { "epoch": 0.2973686784701164, "eval_loss": NaN, "eval_runtime": 296.1374, "eval_samples_per_second": 406.281, "eval_steps_per_second": 6.348, "step": 3800 }, { "epoch": 0.2993250513547882, "grad_norm": NaN, "learning_rate": 4.8503286899358274e-05, "loss": 0.0, "step": 3825 }, { "epoch": 0.30128142423946, "grad_norm": NaN, "learning_rate": 4.849350446079199e-05, "loss": 0.0, "step": 3850 }, { "epoch": 0.30323779712413185, "grad_norm": NaN, "learning_rate": 4.84837220222257e-05, "loss": 0.0, "step": 3875 }, { "epoch": 0.30519417000880367, "grad_norm": NaN, "learning_rate": 4.847393958365942e-05, "loss": 0.0, "step": 3900 }, { "epoch": 0.30519417000880367, "eval_loss": NaN, "eval_runtime": 298.187, "eval_samples_per_second": 403.488, "eval_steps_per_second": 6.305, "step": 3900 }, { "epoch": 0.3071505428934755, "grad_norm": NaN, "learning_rate": 4.846415714509313e-05, "loss": 0.0, "step": 3925 }, { "epoch": 0.3091069157781473, "grad_norm": NaN, "learning_rate": 4.845437470652685e-05, "loss": 0.0, "step": 3950 }, { "epoch": 0.3110632886628191, "grad_norm": NaN, "learning_rate": 4.844459226796056e-05, "loss": 0.0, "step": 3975 }, { "epoch": 0.31301966154749095, "grad_norm": NaN, "learning_rate": 4.8434809829394276e-05, "loss": 0.0, "step": 4000 }, { "epoch": 0.31301966154749095, "eval_loss": NaN, "eval_runtime": 296.8393, "eval_samples_per_second": 405.32, "eval_steps_per_second": 6.333, "step": 4000 }, { "epoch": 0.31497603443216277, "grad_norm": NaN, "learning_rate": 4.842502739082799e-05, "loss": 0.0, "step": 4025 }, { "epoch": 0.3169324073168346, "grad_norm": NaN, "learning_rate": 4.8415244952261704e-05, "loss": 0.0, "step": 4050 }, { "epoch": 0.3188887802015064, "grad_norm": NaN, "learning_rate": 4.8405462513695415e-05, "loss": 0.0, "step": 4075 }, { "epoch": 0.3208451530861782, "grad_norm": NaN, "learning_rate": 4.839568007512913e-05, "loss": 0.0, "step": 4100 }, { "epoch": 0.3208451530861782, "eval_loss": NaN, "eval_runtime": 297.1517, "eval_samples_per_second": 404.894, "eval_steps_per_second": 6.327, "step": 4100 }, { "epoch": 0.32280152597085005, "grad_norm": NaN, "learning_rate": 4.838589763656284e-05, "loss": 0.0, "step": 4125 }, { "epoch": 0.32475789885552186, "grad_norm": NaN, "learning_rate": 4.837611519799656e-05, "loss": 0.0, "step": 4150 }, { "epoch": 0.3267142717401937, "grad_norm": NaN, "learning_rate": 4.836633275943027e-05, "loss": 0.0, "step": 4175 }, { "epoch": 0.3286706446248655, "grad_norm": NaN, "learning_rate": 4.835655032086399e-05, "loss": 0.0, "step": 4200 }, { "epoch": 0.3286706446248655, "eval_loss": NaN, "eval_runtime": 296.3488, "eval_samples_per_second": 405.991, "eval_steps_per_second": 6.344, "step": 4200 }, { "epoch": 0.3306270175095373, "grad_norm": NaN, "learning_rate": 4.83467678822977e-05, "loss": 0.0, "step": 4225 }, { "epoch": 0.33258339039420914, "grad_norm": NaN, "learning_rate": 4.833698544373142e-05, "loss": 0.0, "step": 4250 }, { "epoch": 0.33453976327888096, "grad_norm": NaN, "learning_rate": 4.832720300516513e-05, "loss": 0.0, "step": 4275 }, { "epoch": 0.3364961361635528, "grad_norm": NaN, "learning_rate": 4.8317420566598845e-05, "loss": 0.0, "step": 4300 }, { "epoch": 0.3364961361635528, "eval_loss": NaN, "eval_runtime": 295.9214, "eval_samples_per_second": 406.578, "eval_steps_per_second": 6.353, "step": 4300 }, { "epoch": 0.3384525090482246, "grad_norm": NaN, "learning_rate": 4.8307638128032556e-05, "loss": 0.0, "step": 4325 }, { "epoch": 0.3404088819328964, "grad_norm": NaN, "learning_rate": 4.829785568946627e-05, "loss": 0.0, "step": 4350 }, { "epoch": 0.34236525481756824, "grad_norm": NaN, "learning_rate": 4.8288073250899984e-05, "loss": 0.0, "step": 4375 }, { "epoch": 0.34432162770224006, "grad_norm": NaN, "learning_rate": 4.82782908123337e-05, "loss": 0.0, "step": 4400 }, { "epoch": 0.34432162770224006, "eval_loss": NaN, "eval_runtime": 296.0167, "eval_samples_per_second": 406.447, "eval_steps_per_second": 6.351, "step": 4400 }, { "epoch": 0.3462780005869119, "grad_norm": NaN, "learning_rate": 4.826850837376741e-05, "loss": 0.0, "step": 4425 }, { "epoch": 0.3482343734715837, "grad_norm": NaN, "learning_rate": 4.825872593520113e-05, "loss": 0.0, "step": 4450 }, { "epoch": 0.3501907463562555, "grad_norm": NaN, "learning_rate": 4.824894349663484e-05, "loss": 0.0, "step": 4475 }, { "epoch": 0.35214711924092734, "grad_norm": NaN, "learning_rate": 4.823916105806856e-05, "loss": 0.0, "step": 4500 }, { "epoch": 0.35214711924092734, "eval_loss": NaN, "eval_runtime": 296.0133, "eval_samples_per_second": 406.451, "eval_steps_per_second": 6.351, "step": 4500 }, { "epoch": 0.35410349212559916, "grad_norm": NaN, "learning_rate": 4.8229378619502275e-05, "loss": 0.0, "step": 4525 }, { "epoch": 0.356059865010271, "grad_norm": NaN, "learning_rate": 4.8219596180935986e-05, "loss": 0.0, "step": 4550 }, { "epoch": 0.3580162378949428, "grad_norm": NaN, "learning_rate": 4.82098137423697e-05, "loss": 0.0, "step": 4575 }, { "epoch": 0.3599726107796146, "grad_norm": NaN, "learning_rate": 4.8200031303803414e-05, "loss": 0.0, "step": 4600 }, { "epoch": 0.3599726107796146, "eval_loss": NaN, "eval_runtime": 298.5818, "eval_samples_per_second": 402.955, "eval_steps_per_second": 6.296, "step": 4600 }, { "epoch": 0.3619289836642864, "grad_norm": NaN, "learning_rate": 4.819024886523713e-05, "loss": 0.0, "step": 4625 }, { "epoch": 0.3638853565489582, "grad_norm": NaN, "learning_rate": 4.818046642667084e-05, "loss": 0.0, "step": 4650 }, { "epoch": 0.36584172943363, "grad_norm": NaN, "learning_rate": 4.817068398810456e-05, "loss": 0.0, "step": 4675 }, { "epoch": 0.36779810231830184, "grad_norm": NaN, "learning_rate": 4.816090154953827e-05, "loss": 0.0, "step": 4700 }, { "epoch": 0.36779810231830184, "eval_loss": NaN, "eval_runtime": 296.7043, "eval_samples_per_second": 405.505, "eval_steps_per_second": 6.336, "step": 4700 }, { "epoch": 0.36975447520297366, "grad_norm": NaN, "learning_rate": 4.815111911097199e-05, "loss": 0.0, "step": 4725 }, { "epoch": 0.3717108480876455, "grad_norm": NaN, "learning_rate": 4.81413366724057e-05, "loss": 0.0, "step": 4750 }, { "epoch": 0.3736672209723173, "grad_norm": NaN, "learning_rate": 4.8131554233839416e-05, "loss": 0.0, "step": 4775 }, { "epoch": 0.3756235938569891, "grad_norm": NaN, "learning_rate": 4.812177179527313e-05, "loss": 0.0, "step": 4800 }, { "epoch": 0.3756235938569891, "eval_loss": NaN, "eval_runtime": 296.2936, "eval_samples_per_second": 406.067, "eval_steps_per_second": 6.345, "step": 4800 }, { "epoch": 0.37757996674166094, "grad_norm": NaN, "learning_rate": 4.8111989356706844e-05, "loss": 0.0, "step": 4825 }, { "epoch": 0.37953633962633276, "grad_norm": NaN, "learning_rate": 4.8102206918140555e-05, "loss": 0.0, "step": 4850 }, { "epoch": 0.3814927125110046, "grad_norm": NaN, "learning_rate": 4.809242447957427e-05, "loss": 0.0, "step": 4875 }, { "epoch": 0.3834490853956764, "grad_norm": NaN, "learning_rate": 4.808264204100798e-05, "loss": 0.0, "step": 4900 }, { "epoch": 0.3834490853956764, "eval_loss": NaN, "eval_runtime": 295.9464, "eval_samples_per_second": 406.543, "eval_steps_per_second": 6.353, "step": 4900 }, { "epoch": 0.3854054582803482, "grad_norm": NaN, "learning_rate": 4.80728596024417e-05, "loss": 0.0, "step": 4925 }, { "epoch": 0.38736183116502004, "grad_norm": NaN, "learning_rate": 4.806307716387541e-05, "loss": 0.0, "step": 4950 }, { "epoch": 0.38931820404969186, "grad_norm": NaN, "learning_rate": 4.805329472530913e-05, "loss": 0.0, "step": 4975 }, { "epoch": 0.3912745769343637, "grad_norm": NaN, "learning_rate": 4.804351228674284e-05, "loss": 0.0, "step": 5000 }, { "epoch": 0.3912745769343637, "eval_loss": NaN, "eval_runtime": 296.0731, "eval_samples_per_second": 406.369, "eval_steps_per_second": 6.35, "step": 5000 }, { "epoch": 0.3932309498190355, "grad_norm": NaN, "learning_rate": 4.803372984817656e-05, "loss": 0.0, "step": 5025 }, { "epoch": 0.3951873227037073, "grad_norm": NaN, "learning_rate": 4.802394740961027e-05, "loss": 0.0, "step": 5050 }, { "epoch": 0.39714369558837914, "grad_norm": NaN, "learning_rate": 4.8014164971043985e-05, "loss": 0.0, "step": 5075 }, { "epoch": 0.39910006847305096, "grad_norm": NaN, "learning_rate": 4.8004382532477696e-05, "loss": 0.0, "step": 5100 }, { "epoch": 0.39910006847305096, "eval_loss": NaN, "eval_runtime": 295.9175, "eval_samples_per_second": 406.583, "eval_steps_per_second": 6.353, "step": 5100 }, { "epoch": 0.4010564413577228, "grad_norm": NaN, "learning_rate": 4.799460009391141e-05, "loss": 0.0, "step": 5125 }, { "epoch": 0.4030128142423946, "grad_norm": NaN, "learning_rate": 4.7984817655345124e-05, "loss": 0.0, "step": 5150 }, { "epoch": 0.4049691871270664, "grad_norm": NaN, "learning_rate": 4.797503521677884e-05, "loss": 0.0, "step": 5175 }, { "epoch": 0.40692556001173824, "grad_norm": NaN, "learning_rate": 4.796525277821255e-05, "loss": 0.0, "step": 5200 }, { "epoch": 0.40692556001173824, "eval_loss": NaN, "eval_runtime": 295.8779, "eval_samples_per_second": 406.637, "eval_steps_per_second": 6.354, "step": 5200 }, { "epoch": 0.40888193289641006, "grad_norm": NaN, "learning_rate": 4.795547033964627e-05, "loss": 0.0, "step": 5225 }, { "epoch": 0.4108383057810819, "grad_norm": NaN, "learning_rate": 4.794568790107998e-05, "loss": 0.0, "step": 5250 }, { "epoch": 0.4127946786657537, "grad_norm": NaN, "learning_rate": 4.79359054625137e-05, "loss": 0.0, "step": 5275 }, { "epoch": 0.4147510515504255, "grad_norm": NaN, "learning_rate": 4.792612302394741e-05, "loss": 0.0, "step": 5300 }, { "epoch": 0.4147510515504255, "eval_loss": NaN, "eval_runtime": 296.7831, "eval_samples_per_second": 405.397, "eval_steps_per_second": 6.335, "step": 5300 }, { "epoch": 0.41670742443509734, "grad_norm": NaN, "learning_rate": 4.7916340585381126e-05, "loss": 0.0, "step": 5325 }, { "epoch": 0.41866379731976916, "grad_norm": NaN, "learning_rate": 4.7906558146814837e-05, "loss": 0.0, "step": 5350 }, { "epoch": 0.420620170204441, "grad_norm": NaN, "learning_rate": 4.7896775708248554e-05, "loss": 0.0, "step": 5375 }, { "epoch": 0.4225765430891128, "grad_norm": NaN, "learning_rate": 4.7886993269682265e-05, "loss": 0.0, "step": 5400 }, { "epoch": 0.4225765430891128, "eval_loss": NaN, "eval_runtime": 298.1152, "eval_samples_per_second": 403.586, "eval_steps_per_second": 6.306, "step": 5400 }, { "epoch": 0.4245329159737846, "grad_norm": NaN, "learning_rate": 4.787721083111598e-05, "loss": 0.0, "step": 5425 }, { "epoch": 0.42648928885845644, "grad_norm": NaN, "learning_rate": 4.786742839254969e-05, "loss": 0.0, "step": 5450 }, { "epoch": 0.42844566174312826, "grad_norm": NaN, "learning_rate": 4.785764595398341e-05, "loss": 0.0, "step": 5475 }, { "epoch": 0.4304020346278001, "grad_norm": NaN, "learning_rate": 4.784786351541712e-05, "loss": 0.0, "step": 5500 }, { "epoch": 0.4304020346278001, "eval_loss": NaN, "eval_runtime": 298.3055, "eval_samples_per_second": 403.328, "eval_steps_per_second": 6.302, "step": 5500 }, { "epoch": 0.4323584075124719, "grad_norm": NaN, "learning_rate": 4.783808107685084e-05, "loss": 0.0, "step": 5525 }, { "epoch": 0.4343147803971437, "grad_norm": NaN, "learning_rate": 4.782829863828455e-05, "loss": 0.0, "step": 5550 }, { "epoch": 0.43627115328181554, "grad_norm": NaN, "learning_rate": 4.781851619971827e-05, "loss": 0.0, "step": 5575 }, { "epoch": 0.43822752616648736, "grad_norm": NaN, "learning_rate": 4.780873376115198e-05, "loss": 0.0, "step": 5600 }, { "epoch": 0.43822752616648736, "eval_loss": NaN, "eval_runtime": 299.1887, "eval_samples_per_second": 402.138, "eval_steps_per_second": 6.284, "step": 5600 }, { "epoch": 0.4401838990511592, "grad_norm": NaN, "learning_rate": 4.7798951322585695e-05, "loss": 0.0, "step": 5625 }, { "epoch": 0.44214027193583094, "grad_norm": NaN, "learning_rate": 4.7789168884019406e-05, "loss": 0.0, "step": 5650 }, { "epoch": 0.44409664482050276, "grad_norm": NaN, "learning_rate": 4.777938644545312e-05, "loss": 0.0, "step": 5675 }, { "epoch": 0.4460530177051746, "grad_norm": NaN, "learning_rate": 4.7769604006886834e-05, "loss": 0.0, "step": 5700 }, { "epoch": 0.4460530177051746, "eval_loss": NaN, "eval_runtime": 298.7665, "eval_samples_per_second": 402.706, "eval_steps_per_second": 6.293, "step": 5700 }, { "epoch": 0.4480093905898464, "grad_norm": NaN, "learning_rate": 4.775982156832055e-05, "loss": 0.0, "step": 5725 }, { "epoch": 0.4499657634745182, "grad_norm": NaN, "learning_rate": 4.775003912975426e-05, "loss": 0.0, "step": 5750 }, { "epoch": 0.45192213635919004, "grad_norm": NaN, "learning_rate": 4.774025669118798e-05, "loss": 0.0, "step": 5775 }, { "epoch": 0.45387850924386186, "grad_norm": NaN, "learning_rate": 4.77304742526217e-05, "loss": 0.0, "step": 5800 }, { "epoch": 0.45387850924386186, "eval_loss": NaN, "eval_runtime": 299.5371, "eval_samples_per_second": 401.67, "eval_steps_per_second": 6.276, "step": 5800 }, { "epoch": 0.4558348821285337, "grad_norm": NaN, "learning_rate": 4.7720691814055414e-05, "loss": 0.0, "step": 5825 }, { "epoch": 0.4577912550132055, "grad_norm": NaN, "learning_rate": 4.7710909375489125e-05, "loss": 0.0, "step": 5850 }, { "epoch": 0.4597476278978773, "grad_norm": NaN, "learning_rate": 4.770112693692284e-05, "loss": 0.0, "step": 5875 }, { "epoch": 0.46170400078254914, "grad_norm": NaN, "learning_rate": 4.769134449835655e-05, "loss": 0.0, "step": 5900 }, { "epoch": 0.46170400078254914, "eval_loss": NaN, "eval_runtime": 300.5037, "eval_samples_per_second": 400.378, "eval_steps_per_second": 6.256, "step": 5900 }, { "epoch": 0.46366037366722096, "grad_norm": NaN, "learning_rate": 4.768156205979027e-05, "loss": 0.0, "step": 5925 }, { "epoch": 0.4656167465518928, "grad_norm": NaN, "learning_rate": 4.767177962122398e-05, "loss": 0.0, "step": 5950 }, { "epoch": 0.4675731194365646, "grad_norm": NaN, "learning_rate": 4.76619971826577e-05, "loss": 0.0, "step": 5975 }, { "epoch": 0.4695294923212364, "grad_norm": NaN, "learning_rate": 4.765221474409141e-05, "loss": 0.0, "step": 6000 }, { "epoch": 0.4695294923212364, "eval_loss": NaN, "eval_runtime": 298.9524, "eval_samples_per_second": 402.455, "eval_steps_per_second": 6.289, "step": 6000 }, { "epoch": 0.47148586520590824, "grad_norm": NaN, "learning_rate": 4.764243230552513e-05, "loss": 0.0, "step": 6025 }, { "epoch": 0.47344223809058006, "grad_norm": NaN, "learning_rate": 4.763264986695884e-05, "loss": 0.0, "step": 6050 }, { "epoch": 0.4753986109752519, "grad_norm": NaN, "learning_rate": 4.7622867428392555e-05, "loss": 0.0, "step": 6075 }, { "epoch": 0.4773549838599237, "grad_norm": NaN, "learning_rate": 4.7613084989826266e-05, "loss": 0.0, "step": 6100 }, { "epoch": 0.4773549838599237, "eval_loss": NaN, "eval_runtime": 301.123, "eval_samples_per_second": 399.554, "eval_steps_per_second": 6.243, "step": 6100 }, { "epoch": 0.4793113567445955, "grad_norm": NaN, "learning_rate": 4.760330255125998e-05, "loss": 0.0, "step": 6125 }, { "epoch": 0.48126772962926734, "grad_norm": NaN, "learning_rate": 4.7593520112693694e-05, "loss": 0.0, "step": 6150 }, { "epoch": 0.48322410251393916, "grad_norm": NaN, "learning_rate": 4.758373767412741e-05, "loss": 0.0, "step": 6175 }, { "epoch": 0.485180475398611, "grad_norm": NaN, "learning_rate": 4.757395523556112e-05, "loss": 0.0, "step": 6200 }, { "epoch": 0.485180475398611, "eval_loss": NaN, "eval_runtime": 300.3739, "eval_samples_per_second": 400.551, "eval_steps_per_second": 6.259, "step": 6200 }, { "epoch": 0.4871368482832828, "grad_norm": NaN, "learning_rate": 4.756417279699484e-05, "loss": 0.0, "step": 6225 }, { "epoch": 0.4890932211679546, "grad_norm": NaN, "learning_rate": 4.755439035842855e-05, "loss": 0.0, "step": 6250 }, { "epoch": 0.49104959405262644, "grad_norm": NaN, "learning_rate": 4.754460791986227e-05, "loss": 0.0, "step": 6275 }, { "epoch": 0.49300596693729826, "grad_norm": NaN, "learning_rate": 4.753482548129598e-05, "loss": 0.0, "step": 6300 }, { "epoch": 0.49300596693729826, "eval_loss": NaN, "eval_runtime": 299.3964, "eval_samples_per_second": 401.859, "eval_steps_per_second": 6.279, "step": 6300 }, { "epoch": 0.4949623398219701, "grad_norm": NaN, "learning_rate": 4.7525043042729696e-05, "loss": 0.0, "step": 6325 }, { "epoch": 0.4969187127066419, "grad_norm": NaN, "learning_rate": 4.751526060416341e-05, "loss": 0.0, "step": 6350 }, { "epoch": 0.4988750855913137, "grad_norm": NaN, "learning_rate": 4.7505478165597124e-05, "loss": 0.0, "step": 6375 }, { "epoch": 0.5008314584759855, "grad_norm": NaN, "learning_rate": 4.7495695727030835e-05, "loss": 0.0, "step": 6400 }, { "epoch": 0.5008314584759855, "eval_loss": NaN, "eval_runtime": 298.7919, "eval_samples_per_second": 402.672, "eval_steps_per_second": 6.292, "step": 6400 }, { "epoch": 0.5027878313606573, "grad_norm": NaN, "learning_rate": 4.748591328846455e-05, "loss": 0.0, "step": 6425 }, { "epoch": 0.5047442042453292, "grad_norm": NaN, "learning_rate": 4.747613084989826e-05, "loss": 0.0, "step": 6450 }, { "epoch": 0.5067005771300009, "grad_norm": NaN, "learning_rate": 4.746634841133198e-05, "loss": 0.0, "step": 6475 }, { "epoch": 0.5086569500146728, "grad_norm": NaN, "learning_rate": 4.745656597276569e-05, "loss": 0.0, "step": 6500 }, { "epoch": 0.5086569500146728, "eval_loss": NaN, "eval_runtime": 298.2846, "eval_samples_per_second": 403.356, "eval_steps_per_second": 6.303, "step": 6500 }, { "epoch": 0.5106133228993446, "grad_norm": NaN, "learning_rate": 4.744678353419941e-05, "loss": 0.0, "step": 6525 }, { "epoch": 0.5125696957840165, "grad_norm": NaN, "learning_rate": 4.743700109563312e-05, "loss": 0.0, "step": 6550 }, { "epoch": 0.5145260686686882, "grad_norm": NaN, "learning_rate": 4.742721865706684e-05, "loss": 0.0, "step": 6575 }, { "epoch": 0.5164824415533601, "grad_norm": NaN, "learning_rate": 4.741743621850055e-05, "loss": 0.0, "step": 6600 }, { "epoch": 0.5164824415533601, "eval_loss": NaN, "eval_runtime": 298.9561, "eval_samples_per_second": 402.45, "eval_steps_per_second": 6.289, "step": 6600 }, { "epoch": 0.5184388144380319, "grad_norm": NaN, "learning_rate": 4.7407653779934265e-05, "loss": 0.0, "step": 6625 }, { "epoch": 0.5203951873227037, "grad_norm": NaN, "learning_rate": 4.7397871341367976e-05, "loss": 0.0, "step": 6650 }, { "epoch": 0.5223515602073755, "grad_norm": NaN, "learning_rate": 4.738808890280169e-05, "loss": 0.0, "step": 6675 }, { "epoch": 0.5243079330920474, "grad_norm": NaN, "learning_rate": 4.7378306464235404e-05, "loss": 0.0, "step": 6700 }, { "epoch": 0.5243079330920474, "eval_loss": NaN, "eval_runtime": 299.521, "eval_samples_per_second": 401.691, "eval_steps_per_second": 6.277, "step": 6700 }, { "epoch": 0.5262643059767191, "grad_norm": NaN, "learning_rate": 4.736852402566912e-05, "loss": 0.0, "step": 6725 }, { "epoch": 0.528220678861391, "grad_norm": NaN, "learning_rate": 4.735874158710283e-05, "loss": 0.0, "step": 6750 }, { "epoch": 0.5301770517460628, "grad_norm": NaN, "learning_rate": 4.734895914853655e-05, "loss": 0.0, "step": 6775 }, { "epoch": 0.5321334246307347, "grad_norm": NaN, "learning_rate": 4.733917670997026e-05, "loss": 0.0, "step": 6800 }, { "epoch": 0.5321334246307347, "eval_loss": NaN, "eval_runtime": 298.94, "eval_samples_per_second": 402.472, "eval_steps_per_second": 6.289, "step": 6800 }, { "epoch": 0.5340897975154064, "grad_norm": NaN, "learning_rate": 4.732939427140398e-05, "loss": 0.0, "step": 6825 }, { "epoch": 0.5360461704000783, "grad_norm": NaN, "learning_rate": 4.731961183283769e-05, "loss": 0.0, "step": 6850 }, { "epoch": 0.5380025432847501, "grad_norm": NaN, "learning_rate": 4.7309829394271406e-05, "loss": 0.0, "step": 6875 }, { "epoch": 0.5399589161694219, "grad_norm": NaN, "learning_rate": 4.7300046955705116e-05, "loss": 0.0, "step": 6900 }, { "epoch": 0.5399589161694219, "eval_loss": NaN, "eval_runtime": 300.3444, "eval_samples_per_second": 400.59, "eval_steps_per_second": 6.259, "step": 6900 }, { "epoch": 0.5419152890540937, "grad_norm": NaN, "learning_rate": 4.7290264517138834e-05, "loss": 0.0, "step": 6925 }, { "epoch": 0.5438716619387656, "grad_norm": NaN, "learning_rate": 4.7280482078572545e-05, "loss": 0.0, "step": 6950 }, { "epoch": 0.5458280348234373, "grad_norm": NaN, "learning_rate": 4.727069964000626e-05, "loss": 0.0, "step": 6975 }, { "epoch": 0.5477844077081092, "grad_norm": NaN, "learning_rate": 4.726091720143997e-05, "loss": 0.0, "step": 7000 }, { "epoch": 0.5477844077081092, "eval_loss": NaN, "eval_runtime": 300.6597, "eval_samples_per_second": 400.17, "eval_steps_per_second": 6.253, "step": 7000 }, { "epoch": 0.549740780592781, "grad_norm": NaN, "learning_rate": 4.725113476287369e-05, "loss": 0.0, "step": 7025 }, { "epoch": 0.5516971534774529, "grad_norm": NaN, "learning_rate": 4.72413523243074e-05, "loss": 0.0, "step": 7050 }, { "epoch": 0.5536535263621246, "grad_norm": NaN, "learning_rate": 4.723156988574112e-05, "loss": 0.0, "step": 7075 }, { "epoch": 0.5556098992467965, "grad_norm": NaN, "learning_rate": 4.7221787447174836e-05, "loss": 0.0, "step": 7100 }, { "epoch": 0.5556098992467965, "eval_loss": NaN, "eval_runtime": 297.6725, "eval_samples_per_second": 404.186, "eval_steps_per_second": 6.316, "step": 7100 }, { "epoch": 0.5575662721314683, "grad_norm": NaN, "learning_rate": 4.721200500860855e-05, "loss": 0.0, "step": 7125 }, { "epoch": 0.5595226450161401, "grad_norm": NaN, "learning_rate": 4.7202222570042264e-05, "loss": 0.0, "step": 7150 }, { "epoch": 0.5614790179008119, "grad_norm": NaN, "learning_rate": 4.719244013147598e-05, "loss": 0.0, "step": 7175 }, { "epoch": 0.5634353907854837, "grad_norm": NaN, "learning_rate": 4.718265769290969e-05, "loss": 0.0, "step": 7200 }, { "epoch": 0.5634353907854837, "eval_loss": NaN, "eval_runtime": 295.947, "eval_samples_per_second": 406.542, "eval_steps_per_second": 6.352, "step": 7200 }, { "epoch": 0.5653917636701555, "grad_norm": NaN, "learning_rate": 4.717287525434341e-05, "loss": 0.0, "step": 7225 }, { "epoch": 0.5673481365548273, "grad_norm": NaN, "learning_rate": 4.716309281577712e-05, "loss": 0.0, "step": 7250 }, { "epoch": 0.5693045094394992, "grad_norm": NaN, "learning_rate": 4.715331037721084e-05, "loss": 0.0, "step": 7275 }, { "epoch": 0.5712608823241709, "grad_norm": NaN, "learning_rate": 4.714352793864455e-05, "loss": 0.0, "step": 7300 }, { "epoch": 0.5712608823241709, "eval_loss": NaN, "eval_runtime": 296.5153, "eval_samples_per_second": 405.763, "eval_steps_per_second": 6.34, "step": 7300 }, { "epoch": 0.5732172552088428, "grad_norm": NaN, "learning_rate": 4.7133745500078266e-05, "loss": 0.0, "step": 7325 }, { "epoch": 0.5751736280935146, "grad_norm": NaN, "learning_rate": 4.712396306151198e-05, "loss": 0.0, "step": 7350 }, { "epoch": 0.5771300009781865, "grad_norm": NaN, "learning_rate": 4.7114180622945694e-05, "loss": 0.0, "step": 7375 }, { "epoch": 0.5790863738628582, "grad_norm": NaN, "learning_rate": 4.7104398184379405e-05, "loss": 0.0, "step": 7400 }, { "epoch": 0.5790863738628582, "eval_loss": NaN, "eval_runtime": 296.5888, "eval_samples_per_second": 405.663, "eval_steps_per_second": 6.339, "step": 7400 }, { "epoch": 0.5810427467475301, "grad_norm": NaN, "learning_rate": 4.709461574581312e-05, "loss": 0.0, "step": 7425 }, { "epoch": 0.5829991196322019, "grad_norm": NaN, "learning_rate": 4.708483330724683e-05, "loss": 0.0, "step": 7450 }, { "epoch": 0.5849554925168737, "grad_norm": NaN, "learning_rate": 4.707505086868055e-05, "loss": 0.0, "step": 7475 }, { "epoch": 0.5869118654015455, "grad_norm": NaN, "learning_rate": 4.706526843011426e-05, "loss": 0.0, "step": 7500 }, { "epoch": 0.5869118654015455, "eval_loss": NaN, "eval_runtime": 295.5109, "eval_samples_per_second": 407.142, "eval_steps_per_second": 6.362, "step": 7500 }, { "epoch": 0.5888682382862174, "grad_norm": NaN, "learning_rate": 4.705548599154798e-05, "loss": 0.0, "step": 7525 }, { "epoch": 0.5908246111708891, "grad_norm": NaN, "learning_rate": 4.704570355298169e-05, "loss": 0.0, "step": 7550 }, { "epoch": 0.592780984055561, "grad_norm": NaN, "learning_rate": 4.703592111441541e-05, "loss": 0.0, "step": 7575 }, { "epoch": 0.5947373569402328, "grad_norm": NaN, "learning_rate": 4.702613867584912e-05, "loss": 0.0, "step": 7600 }, { "epoch": 0.5947373569402328, "eval_loss": NaN, "eval_runtime": 295.472, "eval_samples_per_second": 407.196, "eval_steps_per_second": 6.363, "step": 7600 }, { "epoch": 0.5966937298249047, "grad_norm": NaN, "learning_rate": 4.7016356237282835e-05, "loss": 0.0, "step": 7625 }, { "epoch": 0.5986501027095764, "grad_norm": NaN, "learning_rate": 4.7006573798716546e-05, "loss": 0.0, "step": 7650 }, { "epoch": 0.6006064755942483, "grad_norm": NaN, "learning_rate": 4.699679136015026e-05, "loss": 0.0, "step": 7675 }, { "epoch": 0.60256284847892, "grad_norm": NaN, "learning_rate": 4.6987008921583974e-05, "loss": 0.0, "step": 7700 }, { "epoch": 0.60256284847892, "eval_loss": NaN, "eval_runtime": 298.86, "eval_samples_per_second": 402.58, "eval_steps_per_second": 6.291, "step": 7700 }, { "epoch": 0.6045192213635919, "grad_norm": NaN, "learning_rate": 4.697722648301769e-05, "loss": 0.0, "step": 7725 }, { "epoch": 0.6064755942482637, "grad_norm": NaN, "learning_rate": 4.69674440444514e-05, "loss": 0.0, "step": 7750 }, { "epoch": 0.6084319671329356, "grad_norm": NaN, "learning_rate": 4.695766160588512e-05, "loss": 0.0, "step": 7775 }, { "epoch": 0.6103883400176073, "grad_norm": NaN, "learning_rate": 4.694787916731883e-05, "loss": 0.0, "step": 7800 }, { "epoch": 0.6103883400176073, "eval_loss": NaN, "eval_runtime": 297.6492, "eval_samples_per_second": 404.217, "eval_steps_per_second": 6.316, "step": 7800 }, { "epoch": 0.6123447129022792, "grad_norm": NaN, "learning_rate": 4.693809672875255e-05, "loss": 0.0, "step": 7825 }, { "epoch": 0.614301085786951, "grad_norm": NaN, "learning_rate": 4.692831429018626e-05, "loss": 0.0, "step": 7850 }, { "epoch": 0.6162574586716228, "grad_norm": NaN, "learning_rate": 4.6918531851619976e-05, "loss": 0.0, "step": 7875 }, { "epoch": 0.6182138315562946, "grad_norm": NaN, "learning_rate": 4.6908749413053687e-05, "loss": 0.0, "step": 7900 }, { "epoch": 0.6182138315562946, "eval_loss": NaN, "eval_runtime": 296.1804, "eval_samples_per_second": 406.222, "eval_steps_per_second": 6.347, "step": 7900 }, { "epoch": 0.6201702044409665, "grad_norm": NaN, "learning_rate": 4.6898966974487404e-05, "loss": 0.0, "step": 7925 }, { "epoch": 0.6221265773256383, "grad_norm": NaN, "learning_rate": 4.6889184535921115e-05, "loss": 0.0, "step": 7950 }, { "epoch": 0.6240829502103101, "grad_norm": NaN, "learning_rate": 4.687940209735483e-05, "loss": 0.0, "step": 7975 }, { "epoch": 0.6260393230949819, "grad_norm": NaN, "learning_rate": 4.686961965878854e-05, "loss": 0.0, "step": 8000 }, { "epoch": 0.6260393230949819, "eval_loss": NaN, "eval_runtime": 295.8946, "eval_samples_per_second": 406.614, "eval_steps_per_second": 6.354, "step": 8000 }, { "epoch": 0.6279956959796538, "grad_norm": NaN, "learning_rate": 4.685983722022226e-05, "loss": 0.0, "step": 8025 }, { "epoch": 0.6299520688643255, "grad_norm": NaN, "learning_rate": 4.685005478165597e-05, "loss": 0.0, "step": 8050 }, { "epoch": 0.6319084417489974, "grad_norm": NaN, "learning_rate": 4.684027234308969e-05, "loss": 0.0, "step": 8075 }, { "epoch": 0.6338648146336692, "grad_norm": NaN, "learning_rate": 4.68304899045234e-05, "loss": 0.0, "step": 8100 }, { "epoch": 0.6338648146336692, "eval_loss": NaN, "eval_runtime": 294.801, "eval_samples_per_second": 408.123, "eval_steps_per_second": 6.377, "step": 8100 }, { "epoch": 0.635821187518341, "grad_norm": NaN, "learning_rate": 4.682070746595712e-05, "loss": 0.0, "step": 8125 }, { "epoch": 0.6377775604030128, "grad_norm": NaN, "learning_rate": 4.681092502739083e-05, "loss": 0.0, "step": 8150 }, { "epoch": 0.6397339332876847, "grad_norm": NaN, "learning_rate": 4.6801142588824545e-05, "loss": 0.0, "step": 8175 }, { "epoch": 0.6416903061723565, "grad_norm": NaN, "learning_rate": 4.6791360150258256e-05, "loss": 0.0, "step": 8200 }, { "epoch": 0.6416903061723565, "eval_loss": NaN, "eval_runtime": 295.9161, "eval_samples_per_second": 406.585, "eval_steps_per_second": 6.353, "step": 8200 }, { "epoch": 0.6436466790570282, "grad_norm": NaN, "learning_rate": 4.678157771169197e-05, "loss": 0.0, "step": 8225 }, { "epoch": 0.6456030519417001, "grad_norm": NaN, "learning_rate": 4.6771795273125684e-05, "loss": 0.0, "step": 8250 }, { "epoch": 0.6475594248263719, "grad_norm": NaN, "learning_rate": 4.67620128345594e-05, "loss": 0.0, "step": 8275 }, { "epoch": 0.6495157977110437, "grad_norm": NaN, "learning_rate": 4.675223039599311e-05, "loss": 0.0, "step": 8300 }, { "epoch": 0.6495157977110437, "eval_loss": NaN, "eval_runtime": 296.9022, "eval_samples_per_second": 405.234, "eval_steps_per_second": 6.332, "step": 8300 }, { "epoch": 0.6514721705957155, "grad_norm": NaN, "learning_rate": 4.674244795742683e-05, "loss": 0.0, "step": 8325 }, { "epoch": 0.6534285434803874, "grad_norm": NaN, "learning_rate": 4.673266551886054e-05, "loss": 0.0, "step": 8350 }, { "epoch": 0.6553849163650591, "grad_norm": NaN, "learning_rate": 4.672288308029426e-05, "loss": 0.0, "step": 8375 }, { "epoch": 0.657341289249731, "grad_norm": NaN, "learning_rate": 4.6713100641727975e-05, "loss": 0.0, "step": 8400 }, { "epoch": 0.657341289249731, "eval_loss": NaN, "eval_runtime": 297.0723, "eval_samples_per_second": 405.002, "eval_steps_per_second": 6.328, "step": 8400 }, { "epoch": 0.6592976621344028, "grad_norm": NaN, "learning_rate": 4.6703318203161686e-05, "loss": 0.0, "step": 8425 }, { "epoch": 0.6612540350190746, "grad_norm": NaN, "learning_rate": 4.66935357645954e-05, "loss": 0.0, "step": 8450 }, { "epoch": 0.6632104079037464, "grad_norm": NaN, "learning_rate": 4.6683753326029114e-05, "loss": 0.0, "step": 8475 }, { "epoch": 0.6651667807884183, "grad_norm": NaN, "learning_rate": 4.667397088746283e-05, "loss": 0.0, "step": 8500 }, { "epoch": 0.6651667807884183, "eval_loss": NaN, "eval_runtime": 296.5474, "eval_samples_per_second": 405.719, "eval_steps_per_second": 6.34, "step": 8500 }, { "epoch": 0.66712315367309, "grad_norm": NaN, "learning_rate": 4.666418844889654e-05, "loss": 0.0, "step": 8525 }, { "epoch": 0.6690795265577619, "grad_norm": NaN, "learning_rate": 4.665440601033026e-05, "loss": 0.0, "step": 8550 }, { "epoch": 0.6710358994424337, "grad_norm": NaN, "learning_rate": 4.664462357176397e-05, "loss": 0.0, "step": 8575 }, { "epoch": 0.6729922723271056, "grad_norm": NaN, "learning_rate": 4.663484113319769e-05, "loss": 0.0, "step": 8600 }, { "epoch": 0.6729922723271056, "eval_loss": NaN, "eval_runtime": 295.0741, "eval_samples_per_second": 407.745, "eval_steps_per_second": 6.371, "step": 8600 }, { "epoch": 0.6749486452117773, "grad_norm": NaN, "learning_rate": 4.66250586946314e-05, "loss": 0.0, "step": 8625 }, { "epoch": 0.6769050180964492, "grad_norm": NaN, "learning_rate": 4.6615276256065116e-05, "loss": 0.0, "step": 8650 }, { "epoch": 0.678861390981121, "grad_norm": NaN, "learning_rate": 4.6605493817498827e-05, "loss": 0.0, "step": 8675 }, { "epoch": 0.6808177638657928, "grad_norm": NaN, "learning_rate": 4.6595711378932544e-05, "loss": 0.0, "step": 8700 }, { "epoch": 0.6808177638657928, "eval_loss": NaN, "eval_runtime": 295.5794, "eval_samples_per_second": 407.048, "eval_steps_per_second": 6.36, "step": 8700 }, { "epoch": 0.6827741367504646, "grad_norm": NaN, "learning_rate": 4.6585928940366255e-05, "loss": 0.0, "step": 8725 }, { "epoch": 0.6847305096351365, "grad_norm": NaN, "learning_rate": 4.657614650179997e-05, "loss": 0.0, "step": 8750 }, { "epoch": 0.6866868825198083, "grad_norm": NaN, "learning_rate": 4.656636406323368e-05, "loss": 0.0, "step": 8775 }, { "epoch": 0.6886432554044801, "grad_norm": NaN, "learning_rate": 4.65565816246674e-05, "loss": 0.0, "step": 8800 }, { "epoch": 0.6886432554044801, "eval_loss": NaN, "eval_runtime": 295.9977, "eval_samples_per_second": 406.473, "eval_steps_per_second": 6.351, "step": 8800 }, { "epoch": 0.6905996282891519, "grad_norm": NaN, "learning_rate": 4.654679918610111e-05, "loss": 0.0, "step": 8825 }, { "epoch": 0.6925560011738238, "grad_norm": NaN, "learning_rate": 4.653701674753483e-05, "loss": 0.0, "step": 8850 }, { "epoch": 0.6945123740584955, "grad_norm": NaN, "learning_rate": 4.652723430896854e-05, "loss": 0.0, "step": 8875 }, { "epoch": 0.6964687469431674, "grad_norm": NaN, "learning_rate": 4.651745187040226e-05, "loss": 0.0, "step": 8900 }, { "epoch": 0.6964687469431674, "eval_loss": NaN, "eval_runtime": 295.8095, "eval_samples_per_second": 406.731, "eval_steps_per_second": 6.355, "step": 8900 }, { "epoch": 0.6984251198278392, "grad_norm": NaN, "learning_rate": 4.650766943183597e-05, "loss": 0.0, "step": 8925 }, { "epoch": 0.700381492712511, "grad_norm": NaN, "learning_rate": 4.6497886993269685e-05, "loss": 0.0, "step": 8950 }, { "epoch": 0.7023378655971828, "grad_norm": NaN, "learning_rate": 4.6488104554703396e-05, "loss": 0.0, "step": 8975 }, { "epoch": 0.7042942384818547, "grad_norm": NaN, "learning_rate": 4.647832211613711e-05, "loss": 0.0, "step": 9000 }, { "epoch": 0.7042942384818547, "eval_loss": NaN, "eval_runtime": 297.141, "eval_samples_per_second": 404.909, "eval_steps_per_second": 6.327, "step": 9000 }, { "epoch": 0.7062506113665264, "grad_norm": NaN, "learning_rate": 4.6468539677570824e-05, "loss": 0.0, "step": 9025 }, { "epoch": 0.7082069842511983, "grad_norm": NaN, "learning_rate": 4.645875723900454e-05, "loss": 0.0, "step": 9050 }, { "epoch": 0.7101633571358701, "grad_norm": NaN, "learning_rate": 4.644897480043825e-05, "loss": 0.0, "step": 9075 }, { "epoch": 0.712119730020542, "grad_norm": NaN, "learning_rate": 4.643919236187197e-05, "loss": 0.0, "step": 9100 }, { "epoch": 0.712119730020542, "eval_loss": NaN, "eval_runtime": 294.9591, "eval_samples_per_second": 407.904, "eval_steps_per_second": 6.374, "step": 9100 }, { "epoch": 0.7140761029052137, "grad_norm": NaN, "learning_rate": 4.642940992330569e-05, "loss": 0.0, "step": 9125 }, { "epoch": 0.7160324757898856, "grad_norm": NaN, "learning_rate": 4.64196274847394e-05, "loss": 0.0, "step": 9150 }, { "epoch": 0.7179888486745574, "grad_norm": NaN, "learning_rate": 4.6409845046173115e-05, "loss": 0.0, "step": 9175 }, { "epoch": 0.7199452215592292, "grad_norm": NaN, "learning_rate": 4.6400062607606826e-05, "loss": 0.0, "step": 9200 }, { "epoch": 0.7199452215592292, "eval_loss": NaN, "eval_runtime": 295.3742, "eval_samples_per_second": 407.331, "eval_steps_per_second": 6.365, "step": 9200 }, { "epoch": 0.721901594443901, "grad_norm": NaN, "learning_rate": 4.639028016904054e-05, "loss": 0.0, "step": 9225 }, { "epoch": 0.7238579673285728, "grad_norm": NaN, "learning_rate": 4.6380497730474254e-05, "loss": 0.0, "step": 9250 }, { "epoch": 0.7258143402132446, "grad_norm": NaN, "learning_rate": 4.637071529190797e-05, "loss": 0.0, "step": 9275 }, { "epoch": 0.7277707130979164, "grad_norm": NaN, "learning_rate": 4.636093285334168e-05, "loss": 0.0, "step": 9300 }, { "epoch": 0.7277707130979164, "eval_loss": NaN, "eval_runtime": 295.611, "eval_samples_per_second": 407.005, "eval_steps_per_second": 6.36, "step": 9300 }, { "epoch": 0.7297270859825883, "grad_norm": NaN, "learning_rate": 4.63511504147754e-05, "loss": 0.0, "step": 9325 }, { "epoch": 0.73168345886726, "grad_norm": NaN, "learning_rate": 4.634136797620911e-05, "loss": 0.0, "step": 9350 }, { "epoch": 0.7336398317519319, "grad_norm": NaN, "learning_rate": 4.633158553764283e-05, "loss": 0.0, "step": 9375 }, { "epoch": 0.7355962046366037, "grad_norm": NaN, "learning_rate": 4.632180309907654e-05, "loss": 0.0, "step": 9400 }, { "epoch": 0.7355962046366037, "eval_loss": NaN, "eval_runtime": 296.4867, "eval_samples_per_second": 405.802, "eval_steps_per_second": 6.341, "step": 9400 }, { "epoch": 0.7375525775212756, "grad_norm": NaN, "learning_rate": 4.6312020660510256e-05, "loss": 0.0, "step": 9425 }, { "epoch": 0.7395089504059473, "grad_norm": NaN, "learning_rate": 4.6302238221943966e-05, "loss": 0.0, "step": 9450 }, { "epoch": 0.7414653232906192, "grad_norm": NaN, "learning_rate": 4.6292455783377684e-05, "loss": 0.0, "step": 9475 }, { "epoch": 0.743421696175291, "grad_norm": NaN, "learning_rate": 4.6282673344811395e-05, "loss": 0.0, "step": 9500 }, { "epoch": 0.743421696175291, "eval_loss": NaN, "eval_runtime": 295.7695, "eval_samples_per_second": 406.786, "eval_steps_per_second": 6.356, "step": 9500 }, { "epoch": 0.7453780690599628, "grad_norm": NaN, "learning_rate": 4.627289090624511e-05, "loss": 0.0, "step": 9525 }, { "epoch": 0.7473344419446346, "grad_norm": NaN, "learning_rate": 4.626310846767882e-05, "loss": 0.0, "step": 9550 }, { "epoch": 0.7492908148293065, "grad_norm": NaN, "learning_rate": 4.625332602911254e-05, "loss": 0.0, "step": 9575 }, { "epoch": 0.7512471877139782, "grad_norm": NaN, "learning_rate": 4.624354359054625e-05, "loss": 0.0, "step": 9600 }, { "epoch": 0.7512471877139782, "eval_loss": NaN, "eval_runtime": 296.8578, "eval_samples_per_second": 405.295, "eval_steps_per_second": 6.333, "step": 9600 }, { "epoch": 0.7532035605986501, "grad_norm": NaN, "learning_rate": 4.623376115197997e-05, "loss": 0.0, "step": 9625 }, { "epoch": 0.7551599334833219, "grad_norm": NaN, "learning_rate": 4.622397871341368e-05, "loss": 0.0, "step": 9650 }, { "epoch": 0.7571163063679938, "grad_norm": NaN, "learning_rate": 4.6214196274847397e-05, "loss": 0.0, "step": 9675 }, { "epoch": 0.7590726792526655, "grad_norm": NaN, "learning_rate": 4.6204413836281114e-05, "loss": 0.0, "step": 9700 }, { "epoch": 0.7590726792526655, "eval_loss": NaN, "eval_runtime": 296.2813, "eval_samples_per_second": 406.084, "eval_steps_per_second": 6.345, "step": 9700 }, { "epoch": 0.7610290521373374, "grad_norm": NaN, "learning_rate": 4.6194631397714825e-05, "loss": 0.0, "step": 9725 }, { "epoch": 0.7629854250220092, "grad_norm": NaN, "learning_rate": 4.618484895914854e-05, "loss": 0.0, "step": 9750 }, { "epoch": 0.764941797906681, "grad_norm": NaN, "learning_rate": 4.617506652058225e-05, "loss": 0.0, "step": 9775 }, { "epoch": 0.7668981707913528, "grad_norm": NaN, "learning_rate": 4.616528408201597e-05, "loss": 0.0, "step": 9800 }, { "epoch": 0.7668981707913528, "eval_loss": NaN, "eval_runtime": 296.8443, "eval_samples_per_second": 405.313, "eval_steps_per_second": 6.333, "step": 9800 }, { "epoch": 0.7688545436760247, "grad_norm": NaN, "learning_rate": 4.615550164344968e-05, "loss": 0.0, "step": 9825 }, { "epoch": 0.7708109165606964, "grad_norm": NaN, "learning_rate": 4.61457192048834e-05, "loss": 0.0, "step": 9850 }, { "epoch": 0.7727672894453683, "grad_norm": NaN, "learning_rate": 4.613593676631711e-05, "loss": 0.0, "step": 9875 }, { "epoch": 0.7747236623300401, "grad_norm": NaN, "learning_rate": 4.612615432775083e-05, "loss": 0.0, "step": 9900 }, { "epoch": 0.7747236623300401, "eval_loss": NaN, "eval_runtime": 296.0536, "eval_samples_per_second": 406.396, "eval_steps_per_second": 6.35, "step": 9900 }, { "epoch": 0.776680035214712, "grad_norm": NaN, "learning_rate": 4.611637188918454e-05, "loss": 0.0, "step": 9925 }, { "epoch": 0.7786364080993837, "grad_norm": NaN, "learning_rate": 4.6106589450618255e-05, "loss": 0.0, "step": 9950 }, { "epoch": 0.7805927809840556, "grad_norm": NaN, "learning_rate": 4.6096807012051966e-05, "loss": 0.0, "step": 9975 }, { "epoch": 0.7825491538687274, "grad_norm": NaN, "learning_rate": 4.608702457348568e-05, "loss": 0.0, "step": 10000 }, { "epoch": 0.7825491538687274, "eval_loss": NaN, "eval_runtime": 295.9018, "eval_samples_per_second": 406.604, "eval_steps_per_second": 6.353, "step": 10000 }, { "epoch": 0.7845055267533992, "grad_norm": NaN, "learning_rate": 4.6077242134919394e-05, "loss": 0.0, "step": 10025 }, { "epoch": 0.786461899638071, "grad_norm": NaN, "learning_rate": 4.606745969635311e-05, "loss": 0.0, "step": 10050 }, { "epoch": 0.7884182725227429, "grad_norm": NaN, "learning_rate": 4.605767725778682e-05, "loss": 0.0, "step": 10075 }, { "epoch": 0.7903746454074146, "grad_norm": NaN, "learning_rate": 4.604789481922054e-05, "loss": 0.0, "step": 10100 }, { "epoch": 0.7903746454074146, "eval_loss": NaN, "eval_runtime": 296.6121, "eval_samples_per_second": 405.631, "eval_steps_per_second": 6.338, "step": 10100 }, { "epoch": 0.7923310182920865, "grad_norm": NaN, "learning_rate": 4.603811238065425e-05, "loss": 0.0, "step": 10125 }, { "epoch": 0.7942873911767583, "grad_norm": NaN, "learning_rate": 4.602832994208797e-05, "loss": 0.0, "step": 10150 }, { "epoch": 0.7962437640614302, "grad_norm": NaN, "learning_rate": 4.601854750352168e-05, "loss": 0.0, "step": 10175 }, { "epoch": 0.7982001369461019, "grad_norm": NaN, "learning_rate": 4.6008765064955396e-05, "loss": 0.0, "step": 10200 }, { "epoch": 0.7982001369461019, "eval_loss": NaN, "eval_runtime": 296.7627, "eval_samples_per_second": 405.425, "eval_steps_per_second": 6.335, "step": 10200 }, { "epoch": 0.8001565098307738, "grad_norm": NaN, "learning_rate": 4.5998982626389106e-05, "loss": 0.0, "step": 10225 }, { "epoch": 0.8021128827154456, "grad_norm": NaN, "learning_rate": 4.5989200187822824e-05, "loss": 0.0, "step": 10250 }, { "epoch": 0.8040692556001173, "grad_norm": NaN, "learning_rate": 4.5979417749256535e-05, "loss": 0.0, "step": 10275 }, { "epoch": 0.8060256284847892, "grad_norm": NaN, "learning_rate": 4.596963531069025e-05, "loss": 0.0, "step": 10300 }, { "epoch": 0.8060256284847892, "eval_loss": NaN, "eval_runtime": 295.507, "eval_samples_per_second": 407.148, "eval_steps_per_second": 6.362, "step": 10300 }, { "epoch": 0.807982001369461, "grad_norm": NaN, "learning_rate": 4.595985287212396e-05, "loss": 0.0, "step": 10325 }, { "epoch": 0.8099383742541328, "grad_norm": NaN, "learning_rate": 4.595007043355768e-05, "loss": 0.0, "step": 10350 }, { "epoch": 0.8118947471388046, "grad_norm": NaN, "learning_rate": 4.594028799499139e-05, "loss": 0.0, "step": 10375 }, { "epoch": 0.8138511200234765, "grad_norm": NaN, "learning_rate": 4.593050555642511e-05, "loss": 0.0, "step": 10400 }, { "epoch": 0.8138511200234765, "eval_loss": NaN, "eval_runtime": 295.5311, "eval_samples_per_second": 407.115, "eval_steps_per_second": 6.361, "step": 10400 }, { "epoch": 0.8158074929081482, "grad_norm": NaN, "learning_rate": 4.592072311785882e-05, "loss": 0.0, "step": 10425 }, { "epoch": 0.8177638657928201, "grad_norm": NaN, "learning_rate": 4.5910940679292537e-05, "loss": 0.0, "step": 10450 }, { "epoch": 0.8197202386774919, "grad_norm": NaN, "learning_rate": 4.590115824072625e-05, "loss": 0.0, "step": 10475 }, { "epoch": 0.8216766115621638, "grad_norm": NaN, "learning_rate": 4.5891375802159965e-05, "loss": 0.0, "step": 10500 }, { "epoch": 0.8216766115621638, "eval_loss": NaN, "eval_runtime": 296.4125, "eval_samples_per_second": 405.904, "eval_steps_per_second": 6.343, "step": 10500 }, { "epoch": 0.8236329844468355, "grad_norm": NaN, "learning_rate": 4.5881593363593675e-05, "loss": 0.0, "step": 10525 }, { "epoch": 0.8255893573315074, "grad_norm": NaN, "learning_rate": 4.587181092502739e-05, "loss": 0.0, "step": 10550 }, { "epoch": 0.8275457302161792, "grad_norm": NaN, "learning_rate": 4.5862028486461104e-05, "loss": 0.0, "step": 10575 }, { "epoch": 0.829502103100851, "grad_norm": NaN, "learning_rate": 4.585224604789482e-05, "loss": 0.0, "step": 10600 }, { "epoch": 0.829502103100851, "eval_loss": NaN, "eval_runtime": 295.9683, "eval_samples_per_second": 406.513, "eval_steps_per_second": 6.352, "step": 10600 }, { "epoch": 0.8314584759855228, "grad_norm": NaN, "learning_rate": 4.584246360932853e-05, "loss": 0.0, "step": 10625 }, { "epoch": 0.8334148488701947, "grad_norm": NaN, "learning_rate": 4.583268117076225e-05, "loss": 0.0, "step": 10650 }, { "epoch": 0.8353712217548664, "grad_norm": NaN, "learning_rate": 4.582289873219596e-05, "loss": 0.0, "step": 10675 }, { "epoch": 0.8373275946395383, "grad_norm": NaN, "learning_rate": 4.581311629362968e-05, "loss": 0.0, "step": 10700 }, { "epoch": 0.8373275946395383, "eval_loss": NaN, "eval_runtime": 295.3452, "eval_samples_per_second": 407.371, "eval_steps_per_second": 6.365, "step": 10700 }, { "epoch": 0.8392839675242101, "grad_norm": NaN, "learning_rate": 4.580333385506339e-05, "loss": 0.0, "step": 10725 }, { "epoch": 0.841240340408882, "grad_norm": NaN, "learning_rate": 4.5793551416497106e-05, "loss": 0.0, "step": 10750 }, { "epoch": 0.8431967132935537, "grad_norm": NaN, "learning_rate": 4.5783768977930816e-05, "loss": 0.0, "step": 10775 }, { "epoch": 0.8451530861782256, "grad_norm": NaN, "learning_rate": 4.5773986539364534e-05, "loss": 0.0, "step": 10800 }, { "epoch": 0.8451530861782256, "eval_loss": NaN, "eval_runtime": 295.9389, "eval_samples_per_second": 406.554, "eval_steps_per_second": 6.353, "step": 10800 }, { "epoch": 0.8471094590628974, "grad_norm": NaN, "learning_rate": 4.5764204100798244e-05, "loss": 0.0, "step": 10825 }, { "epoch": 0.8490658319475692, "grad_norm": NaN, "learning_rate": 4.575442166223196e-05, "loss": 0.0, "step": 10850 }, { "epoch": 0.851022204832241, "grad_norm": NaN, "learning_rate": 4.574463922366567e-05, "loss": 0.0, "step": 10875 }, { "epoch": 0.8529785777169129, "grad_norm": NaN, "learning_rate": 4.573485678509939e-05, "loss": 0.0, "step": 10900 }, { "epoch": 0.8529785777169129, "eval_loss": NaN, "eval_runtime": 295.7355, "eval_samples_per_second": 406.833, "eval_steps_per_second": 6.357, "step": 10900 }, { "epoch": 0.8549349506015846, "grad_norm": NaN, "learning_rate": 4.57250743465331e-05, "loss": 0.0, "step": 10925 }, { "epoch": 0.8568913234862565, "grad_norm": NaN, "learning_rate": 4.5715291907966825e-05, "loss": 0.0, "step": 10950 }, { "epoch": 0.8588476963709283, "grad_norm": NaN, "learning_rate": 4.5705509469400536e-05, "loss": 0.0, "step": 10975 }, { "epoch": 0.8608040692556002, "grad_norm": NaN, "learning_rate": 4.569572703083425e-05, "loss": 0.0, "step": 11000 }, { "epoch": 0.8608040692556002, "eval_loss": NaN, "eval_runtime": 296.6812, "eval_samples_per_second": 405.536, "eval_steps_per_second": 6.337, "step": 11000 }, { "epoch": 0.8627604421402719, "grad_norm": NaN, "learning_rate": 4.5685944592267964e-05, "loss": 0.0, "step": 11025 }, { "epoch": 0.8647168150249438, "grad_norm": NaN, "learning_rate": 4.567616215370168e-05, "loss": 0.0, "step": 11050 }, { "epoch": 0.8666731879096156, "grad_norm": NaN, "learning_rate": 4.566637971513539e-05, "loss": 0.0, "step": 11075 }, { "epoch": 0.8686295607942874, "grad_norm": NaN, "learning_rate": 4.565659727656911e-05, "loss": 0.0, "step": 11100 }, { "epoch": 0.8686295607942874, "eval_loss": NaN, "eval_runtime": 295.7845, "eval_samples_per_second": 406.766, "eval_steps_per_second": 6.356, "step": 11100 }, { "epoch": 0.8705859336789592, "grad_norm": NaN, "learning_rate": 4.564681483800282e-05, "loss": 0.0, "step": 11125 }, { "epoch": 0.8725423065636311, "grad_norm": NaN, "learning_rate": 4.563703239943654e-05, "loss": 0.0, "step": 11150 }, { "epoch": 0.8744986794483028, "grad_norm": NaN, "learning_rate": 4.562724996087025e-05, "loss": 0.0, "step": 11175 }, { "epoch": 0.8764550523329747, "grad_norm": NaN, "learning_rate": 4.5617467522303966e-05, "loss": 0.0, "step": 11200 }, { "epoch": 0.8764550523329747, "eval_loss": NaN, "eval_runtime": 294.9026, "eval_samples_per_second": 407.982, "eval_steps_per_second": 6.375, "step": 11200 }, { "epoch": 0.8784114252176465, "grad_norm": NaN, "learning_rate": 4.5607685083737677e-05, "loss": 0.0, "step": 11225 }, { "epoch": 0.8803677981023184, "grad_norm": NaN, "learning_rate": 4.5597902645171394e-05, "loss": 0.0, "step": 11250 }, { "epoch": 0.8823241709869901, "grad_norm": NaN, "learning_rate": 4.5588120206605105e-05, "loss": 0.0, "step": 11275 }, { "epoch": 0.8842805438716619, "grad_norm": NaN, "learning_rate": 4.557833776803882e-05, "loss": 0.0, "step": 11300 }, { "epoch": 0.8842805438716619, "eval_loss": NaN, "eval_runtime": 296.1399, "eval_samples_per_second": 406.278, "eval_steps_per_second": 6.348, "step": 11300 }, { "epoch": 0.8862369167563338, "grad_norm": NaN, "learning_rate": 4.556855532947253e-05, "loss": 0.0, "step": 11325 }, { "epoch": 0.8881932896410055, "grad_norm": NaN, "learning_rate": 4.555877289090625e-05, "loss": 0.0, "step": 11350 }, { "epoch": 0.8901496625256774, "grad_norm": NaN, "learning_rate": 4.554899045233996e-05, "loss": 0.0, "step": 11375 }, { "epoch": 0.8921060354103492, "grad_norm": NaN, "learning_rate": 4.553920801377368e-05, "loss": 0.0, "step": 11400 }, { "epoch": 0.8921060354103492, "eval_loss": NaN, "eval_runtime": 296.7598, "eval_samples_per_second": 405.429, "eval_steps_per_second": 6.335, "step": 11400 }, { "epoch": 0.894062408295021, "grad_norm": NaN, "learning_rate": 4.552942557520739e-05, "loss": 0.0, "step": 11425 }, { "epoch": 0.8960187811796928, "grad_norm": NaN, "learning_rate": 4.551964313664111e-05, "loss": 0.0, "step": 11450 }, { "epoch": 0.8979751540643647, "grad_norm": NaN, "learning_rate": 4.550986069807482e-05, "loss": 0.0, "step": 11475 }, { "epoch": 0.8999315269490364, "grad_norm": NaN, "learning_rate": 4.5500078259508535e-05, "loss": 0.0, "step": 11500 }, { "epoch": 0.8999315269490364, "eval_loss": NaN, "eval_runtime": 294.9325, "eval_samples_per_second": 407.941, "eval_steps_per_second": 6.374, "step": 11500 }, { "epoch": 0.9018878998337083, "grad_norm": NaN, "learning_rate": 4.5490295820942246e-05, "loss": 0.0, "step": 11525 }, { "epoch": 0.9038442727183801, "grad_norm": NaN, "learning_rate": 4.548051338237596e-05, "loss": 0.0, "step": 11550 }, { "epoch": 0.905800645603052, "grad_norm": NaN, "learning_rate": 4.5470730943809674e-05, "loss": 0.0, "step": 11575 }, { "epoch": 0.9077570184877237, "grad_norm": NaN, "learning_rate": 4.546094850524339e-05, "loss": 0.0, "step": 11600 }, { "epoch": 0.9077570184877237, "eval_loss": NaN, "eval_runtime": 297.0162, "eval_samples_per_second": 405.079, "eval_steps_per_second": 6.33, "step": 11600 }, { "epoch": 0.9097133913723956, "grad_norm": NaN, "learning_rate": 4.54511660666771e-05, "loss": 0.0, "step": 11625 }, { "epoch": 0.9116697642570674, "grad_norm": NaN, "learning_rate": 4.544138362811082e-05, "loss": 0.0, "step": 11650 }, { "epoch": 0.9136261371417392, "grad_norm": NaN, "learning_rate": 4.543160118954453e-05, "loss": 0.0, "step": 11675 }, { "epoch": 0.915582510026411, "grad_norm": NaN, "learning_rate": 4.542181875097825e-05, "loss": 0.0, "step": 11700 }, { "epoch": 0.915582510026411, "eval_loss": NaN, "eval_runtime": 295.3778, "eval_samples_per_second": 407.326, "eval_steps_per_second": 6.365, "step": 11700 }, { "epoch": 0.9175388829110829, "grad_norm": NaN, "learning_rate": 4.541203631241196e-05, "loss": 0.0, "step": 11725 }, { "epoch": 0.9194952557957546, "grad_norm": NaN, "learning_rate": 4.5402253873845676e-05, "loss": 0.0, "step": 11750 }, { "epoch": 0.9214516286804265, "grad_norm": NaN, "learning_rate": 4.5392471435279386e-05, "loss": 0.0, "step": 11775 }, { "epoch": 0.9234080015650983, "grad_norm": NaN, "learning_rate": 4.5382688996713104e-05, "loss": 0.0, "step": 11800 }, { "epoch": 0.9234080015650983, "eval_loss": NaN, "eval_runtime": 296.2348, "eval_samples_per_second": 406.147, "eval_steps_per_second": 6.346, "step": 11800 }, { "epoch": 0.9253643744497702, "grad_norm": NaN, "learning_rate": 4.5372906558146815e-05, "loss": 0.0, "step": 11825 }, { "epoch": 0.9273207473344419, "grad_norm": NaN, "learning_rate": 4.536312411958053e-05, "loss": 0.0, "step": 11850 }, { "epoch": 0.9292771202191138, "grad_norm": NaN, "learning_rate": 4.535334168101424e-05, "loss": 0.0, "step": 11875 }, { "epoch": 0.9312334931037856, "grad_norm": NaN, "learning_rate": 4.534355924244796e-05, "loss": 0.0, "step": 11900 }, { "epoch": 0.9312334931037856, "eval_loss": NaN, "eval_runtime": 297.1373, "eval_samples_per_second": 404.914, "eval_steps_per_second": 6.327, "step": 11900 }, { "epoch": 0.9331898659884574, "grad_norm": NaN, "learning_rate": 4.533377680388167e-05, "loss": 0.0, "step": 11925 }, { "epoch": 0.9351462388731292, "grad_norm": NaN, "learning_rate": 4.532399436531539e-05, "loss": 0.0, "step": 11950 }, { "epoch": 0.9371026117578011, "grad_norm": NaN, "learning_rate": 4.53142119267491e-05, "loss": 0.0, "step": 11975 }, { "epoch": 0.9390589846424728, "grad_norm": NaN, "learning_rate": 4.5304429488182817e-05, "loss": 0.0, "step": 12000 }, { "epoch": 0.9390589846424728, "eval_loss": NaN, "eval_runtime": 295.6734, "eval_samples_per_second": 406.919, "eval_steps_per_second": 6.358, "step": 12000 }, { "epoch": 0.9410153575271447, "grad_norm": NaN, "learning_rate": 4.529464704961653e-05, "loss": 0.0, "step": 12025 }, { "epoch": 0.9429717304118165, "grad_norm": NaN, "learning_rate": 4.5284864611050245e-05, "loss": 0.0, "step": 12050 }, { "epoch": 0.9449281032964884, "grad_norm": NaN, "learning_rate": 4.5275082172483955e-05, "loss": 0.0, "step": 12075 }, { "epoch": 0.9468844761811601, "grad_norm": NaN, "learning_rate": 4.526529973391767e-05, "loss": 0.0, "step": 12100 }, { "epoch": 0.9468844761811601, "eval_loss": NaN, "eval_runtime": 295.7189, "eval_samples_per_second": 406.856, "eval_steps_per_second": 6.357, "step": 12100 }, { "epoch": 0.948840849065832, "grad_norm": NaN, "learning_rate": 4.5255517295351384e-05, "loss": 0.0, "step": 12125 }, { "epoch": 0.9507972219505038, "grad_norm": NaN, "learning_rate": 4.52457348567851e-05, "loss": 0.0, "step": 12150 }, { "epoch": 0.9527535948351756, "grad_norm": NaN, "learning_rate": 4.523595241821881e-05, "loss": 0.0, "step": 12175 }, { "epoch": 0.9547099677198474, "grad_norm": NaN, "learning_rate": 4.522616997965253e-05, "loss": 0.0, "step": 12200 }, { "epoch": 0.9547099677198474, "eval_loss": NaN, "eval_runtime": 296.7137, "eval_samples_per_second": 405.492, "eval_steps_per_second": 6.336, "step": 12200 }, { "epoch": 0.9566663406045193, "grad_norm": NaN, "learning_rate": 4.521638754108624e-05, "loss": 0.0, "step": 12225 }, { "epoch": 0.958622713489191, "grad_norm": NaN, "learning_rate": 4.5206605102519964e-05, "loss": 0.0, "step": 12250 }, { "epoch": 0.9605790863738629, "grad_norm": NaN, "learning_rate": 4.5196822663953675e-05, "loss": 0.0, "step": 12275 }, { "epoch": 0.9625354592585347, "grad_norm": NaN, "learning_rate": 4.518704022538739e-05, "loss": 0.0, "step": 12300 }, { "epoch": 0.9625354592585347, "eval_loss": NaN, "eval_runtime": 297.5793, "eval_samples_per_second": 404.312, "eval_steps_per_second": 6.318, "step": 12300 }, { "epoch": 0.9644918321432064, "grad_norm": NaN, "learning_rate": 4.51772577868211e-05, "loss": 0.0, "step": 12325 }, { "epoch": 0.9664482050278783, "grad_norm": NaN, "learning_rate": 4.516747534825482e-05, "loss": 0.0, "step": 12350 }, { "epoch": 0.9684045779125501, "grad_norm": NaN, "learning_rate": 4.515769290968853e-05, "loss": 0.0, "step": 12375 }, { "epoch": 0.970360950797222, "grad_norm": NaN, "learning_rate": 4.514791047112225e-05, "loss": 0.0, "step": 12400 }, { "epoch": 0.970360950797222, "eval_loss": NaN, "eval_runtime": 295.6162, "eval_samples_per_second": 406.997, "eval_steps_per_second": 6.36, "step": 12400 }, { "epoch": 0.9723173236818937, "grad_norm": NaN, "learning_rate": 4.513812803255596e-05, "loss": 0.0, "step": 12425 }, { "epoch": 0.9742736965665656, "grad_norm": NaN, "learning_rate": 4.512834559398968e-05, "loss": 0.0, "step": 12450 }, { "epoch": 0.9762300694512374, "grad_norm": NaN, "learning_rate": 4.511856315542339e-05, "loss": 0.0, "step": 12475 }, { "epoch": 0.9781864423359092, "grad_norm": NaN, "learning_rate": 4.5108780716857105e-05, "loss": 0.0, "step": 12500 }, { "epoch": 0.9781864423359092, "eval_loss": NaN, "eval_runtime": 297.0502, "eval_samples_per_second": 405.033, "eval_steps_per_second": 6.329, "step": 12500 }, { "epoch": 0.980142815220581, "grad_norm": NaN, "learning_rate": 4.5098998278290816e-05, "loss": 0.0, "step": 12525 }, { "epoch": 0.9820991881052529, "grad_norm": NaN, "learning_rate": 4.508921583972453e-05, "loss": 0.0, "step": 12550 }, { "epoch": 0.9840555609899246, "grad_norm": NaN, "learning_rate": 4.5079433401158244e-05, "loss": 0.0, "step": 12575 }, { "epoch": 0.9860119338745965, "grad_norm": NaN, "learning_rate": 4.506965096259196e-05, "loss": 0.0, "step": 12600 }, { "epoch": 0.9860119338745965, "eval_loss": NaN, "eval_runtime": 296.8312, "eval_samples_per_second": 405.331, "eval_steps_per_second": 6.334, "step": 12600 }, { "epoch": 0.9879683067592683, "grad_norm": NaN, "learning_rate": 4.505986852402567e-05, "loss": 0.0, "step": 12625 }, { "epoch": 0.9899246796439402, "grad_norm": NaN, "learning_rate": 4.505008608545939e-05, "loss": 0.0, "step": 12650 }, { "epoch": 0.9918810525286119, "grad_norm": NaN, "learning_rate": 4.50403036468931e-05, "loss": 0.0, "step": 12675 }, { "epoch": 0.9938374254132838, "grad_norm": NaN, "learning_rate": 4.503052120832682e-05, "loss": 0.0, "step": 12700 }, { "epoch": 0.9938374254132838, "eval_loss": NaN, "eval_runtime": 295.9271, "eval_samples_per_second": 406.57, "eval_steps_per_second": 6.353, "step": 12700 }, { "epoch": 0.9957937982979556, "grad_norm": NaN, "learning_rate": 4.502073876976053e-05, "loss": 0.0, "step": 12725 }, { "epoch": 0.9977501711826274, "grad_norm": NaN, "learning_rate": 4.5010956331194246e-05, "loss": 0.0, "step": 12750 }, { "epoch": 0.9997065440672992, "grad_norm": NaN, "learning_rate": 4.5001173892627956e-05, "loss": 0.0, "step": 12775 }, { "epoch": 1.0016824806808178, "grad_norm": NaN, "learning_rate": 4.4991391454061674e-05, "loss": 0.0, "step": 12800 }, { "epoch": 1.0016824806808178, "eval_loss": NaN, "eval_runtime": 296.3821, "eval_samples_per_second": 405.946, "eval_steps_per_second": 6.343, "step": 12800 }, { "epoch": 1.0036388535654897, "grad_norm": NaN, "learning_rate": 4.4981609015495385e-05, "loss": 0.0, "step": 12825 }, { "epoch": 1.0055952264501613, "grad_norm": NaN, "learning_rate": 4.49718265769291e-05, "loss": 0.0, "step": 12850 }, { "epoch": 1.0075515993348332, "grad_norm": NaN, "learning_rate": 4.496204413836281e-05, "loss": 0.0, "step": 12875 }, { "epoch": 1.009507972219505, "grad_norm": NaN, "learning_rate": 4.495226169979653e-05, "loss": 0.0, "step": 12900 }, { "epoch": 1.009507972219505, "eval_loss": NaN, "eval_runtime": 295.1129, "eval_samples_per_second": 407.691, "eval_steps_per_second": 6.37, "step": 12900 }, { "epoch": 1.011464345104177, "grad_norm": NaN, "learning_rate": 4.494247926123024e-05, "loss": 0.0, "step": 12925 }, { "epoch": 1.0134207179888486, "grad_norm": NaN, "learning_rate": 4.493269682266396e-05, "loss": 0.0, "step": 12950 }, { "epoch": 1.0153770908735205, "grad_norm": NaN, "learning_rate": 4.492291438409767e-05, "loss": 0.0, "step": 12975 }, { "epoch": 1.0173334637581923, "grad_norm": NaN, "learning_rate": 4.4913131945531387e-05, "loss": 0.0, "step": 13000 }, { "epoch": 1.0173334637581923, "eval_loss": NaN, "eval_runtime": 295.3676, "eval_samples_per_second": 407.34, "eval_steps_per_second": 6.365, "step": 13000 }, { "epoch": 1.0192898366428642, "grad_norm": NaN, "learning_rate": 4.49033495069651e-05, "loss": 0.0, "step": 13025 }, { "epoch": 1.0212462095275359, "grad_norm": NaN, "learning_rate": 4.4893567068398815e-05, "loss": 0.0, "step": 13050 }, { "epoch": 1.0232025824122077, "grad_norm": NaN, "learning_rate": 4.4883784629832525e-05, "loss": 0.0, "step": 13075 }, { "epoch": 1.0251589552968796, "grad_norm": NaN, "learning_rate": 4.487400219126624e-05, "loss": 0.0, "step": 13100 }, { "epoch": 1.0251589552968796, "eval_loss": NaN, "eval_runtime": 294.7591, "eval_samples_per_second": 408.181, "eval_steps_per_second": 6.378, "step": 13100 }, { "epoch": 1.0271153281815515, "grad_norm": NaN, "learning_rate": 4.4864219752699954e-05, "loss": 0.0, "step": 13125 }, { "epoch": 1.0290717010662231, "grad_norm": NaN, "learning_rate": 4.485443731413367e-05, "loss": 0.0, "step": 13150 }, { "epoch": 1.031028073950895, "grad_norm": NaN, "learning_rate": 4.484465487556738e-05, "loss": 0.0, "step": 13175 }, { "epoch": 1.032984446835567, "grad_norm": NaN, "learning_rate": 4.48348724370011e-05, "loss": 0.0, "step": 13200 }, { "epoch": 1.032984446835567, "eval_loss": NaN, "eval_runtime": 295.3292, "eval_samples_per_second": 407.393, "eval_steps_per_second": 6.366, "step": 13200 }, { "epoch": 1.0349408197202388, "grad_norm": NaN, "learning_rate": 4.482508999843481e-05, "loss": 0.0, "step": 13225 }, { "epoch": 1.0368971926049104, "grad_norm": NaN, "learning_rate": 4.481530755986853e-05, "loss": 0.0, "step": 13250 }, { "epoch": 1.0388535654895823, "grad_norm": NaN, "learning_rate": 4.480552512130224e-05, "loss": 0.0, "step": 13275 }, { "epoch": 1.0408099383742542, "grad_norm": NaN, "learning_rate": 4.4795742682735956e-05, "loss": 0.0, "step": 13300 }, { "epoch": 1.0408099383742542, "eval_loss": NaN, "eval_runtime": 295.7767, "eval_samples_per_second": 406.777, "eval_steps_per_second": 6.356, "step": 13300 }, { "epoch": 1.042766311258926, "grad_norm": NaN, "learning_rate": 4.4785960244169666e-05, "loss": 0.0, "step": 13325 }, { "epoch": 1.0447226841435977, "grad_norm": NaN, "learning_rate": 4.4776177805603384e-05, "loss": 0.0, "step": 13350 }, { "epoch": 1.0466790570282696, "grad_norm": NaN, "learning_rate": 4.4766395367037094e-05, "loss": 0.0, "step": 13375 }, { "epoch": 1.0486354299129415, "grad_norm": NaN, "learning_rate": 4.475661292847081e-05, "loss": 0.0, "step": 13400 }, { "epoch": 1.0486354299129415, "eval_loss": NaN, "eval_runtime": 295.3509, "eval_samples_per_second": 407.363, "eval_steps_per_second": 6.365, "step": 13400 }, { "epoch": 1.0505918027976133, "grad_norm": NaN, "learning_rate": 4.474683048990452e-05, "loss": 0.0, "step": 13425 }, { "epoch": 1.052548175682285, "grad_norm": NaN, "learning_rate": 4.473704805133824e-05, "loss": 0.0, "step": 13450 }, { "epoch": 1.0545045485669569, "grad_norm": NaN, "learning_rate": 4.472726561277195e-05, "loss": 0.0, "step": 13475 }, { "epoch": 1.0564609214516287, "grad_norm": NaN, "learning_rate": 4.471748317420567e-05, "loss": 0.0, "step": 13500 }, { "epoch": 1.0564609214516287, "eval_loss": NaN, "eval_runtime": 296.3735, "eval_samples_per_second": 405.957, "eval_steps_per_second": 6.343, "step": 13500 }, { "epoch": 1.0584172943363006, "grad_norm": NaN, "learning_rate": 4.470770073563938e-05, "loss": 0.0, "step": 13525 }, { "epoch": 1.0603736672209723, "grad_norm": NaN, "learning_rate": 4.4697918297073096e-05, "loss": 0.0, "step": 13550 }, { "epoch": 1.0623300401056441, "grad_norm": NaN, "learning_rate": 4.4688135858506814e-05, "loss": 0.0, "step": 13575 }, { "epoch": 1.064286412990316, "grad_norm": NaN, "learning_rate": 4.4678353419940525e-05, "loss": 0.0, "step": 13600 }, { "epoch": 1.064286412990316, "eval_loss": NaN, "eval_runtime": 296.0046, "eval_samples_per_second": 406.463, "eval_steps_per_second": 6.351, "step": 13600 }, { "epoch": 1.0662427858749877, "grad_norm": NaN, "learning_rate": 4.466857098137424e-05, "loss": 0.0, "step": 13625 }, { "epoch": 1.0681991587596595, "grad_norm": NaN, "learning_rate": 4.465878854280795e-05, "loss": 0.0, "step": 13650 }, { "epoch": 1.0701555316443314, "grad_norm": NaN, "learning_rate": 4.464900610424167e-05, "loss": 0.0, "step": 13675 }, { "epoch": 1.0721119045290033, "grad_norm": NaN, "learning_rate": 4.463922366567538e-05, "loss": 0.0, "step": 13700 }, { "epoch": 1.0721119045290033, "eval_loss": NaN, "eval_runtime": 295.087, "eval_samples_per_second": 407.727, "eval_steps_per_second": 6.371, "step": 13700 }, { "epoch": 1.074068277413675, "grad_norm": NaN, "learning_rate": 4.46294412271091e-05, "loss": 0.0, "step": 13725 }, { "epoch": 1.0760246502983468, "grad_norm": NaN, "learning_rate": 4.461965878854281e-05, "loss": 0.0, "step": 13750 }, { "epoch": 1.0779810231830187, "grad_norm": NaN, "learning_rate": 4.4609876349976527e-05, "loss": 0.0, "step": 13775 }, { "epoch": 1.0799373960676906, "grad_norm": NaN, "learning_rate": 4.460009391141024e-05, "loss": 0.0, "step": 13800 }, { "epoch": 1.0799373960676906, "eval_loss": NaN, "eval_runtime": 295.5178, "eval_samples_per_second": 407.133, "eval_steps_per_second": 6.362, "step": 13800 }, { "epoch": 1.0818937689523622, "grad_norm": NaN, "learning_rate": 4.4590311472843955e-05, "loss": 0.0, "step": 13825 }, { "epoch": 1.083850141837034, "grad_norm": NaN, "learning_rate": 4.4580529034277665e-05, "loss": 0.0, "step": 13850 }, { "epoch": 1.085806514721706, "grad_norm": NaN, "learning_rate": 4.457074659571138e-05, "loss": 0.0, "step": 13875 }, { "epoch": 1.0877628876063778, "grad_norm": NaN, "learning_rate": 4.4560964157145094e-05, "loss": 0.0, "step": 13900 }, { "epoch": 1.0877628876063778, "eval_loss": NaN, "eval_runtime": 296.1837, "eval_samples_per_second": 406.218, "eval_steps_per_second": 6.347, "step": 13900 }, { "epoch": 1.0897192604910495, "grad_norm": NaN, "learning_rate": 4.455118171857881e-05, "loss": 0.0, "step": 13925 }, { "epoch": 1.0916756333757214, "grad_norm": NaN, "learning_rate": 4.454139928001252e-05, "loss": 0.0, "step": 13950 }, { "epoch": 1.0936320062603933, "grad_norm": NaN, "learning_rate": 4.453161684144624e-05, "loss": 0.0, "step": 13975 }, { "epoch": 1.0955883791450651, "grad_norm": NaN, "learning_rate": 4.452183440287995e-05, "loss": 0.0, "step": 14000 }, { "epoch": 1.0955883791450651, "eval_loss": NaN, "eval_runtime": 296.7074, "eval_samples_per_second": 405.501, "eval_steps_per_second": 6.336, "step": 14000 }, { "epoch": 1.0975447520297368, "grad_norm": NaN, "learning_rate": 4.451205196431367e-05, "loss": 0.0, "step": 14025 }, { "epoch": 1.0995011249144087, "grad_norm": NaN, "learning_rate": 4.450226952574738e-05, "loss": 0.0, "step": 14050 }, { "epoch": 1.1014574977990805, "grad_norm": NaN, "learning_rate": 4.4492487087181096e-05, "loss": 0.0, "step": 14075 }, { "epoch": 1.1034138706837524, "grad_norm": NaN, "learning_rate": 4.4482704648614806e-05, "loss": 0.0, "step": 14100 }, { "epoch": 1.1034138706837524, "eval_loss": NaN, "eval_runtime": 295.4672, "eval_samples_per_second": 407.203, "eval_steps_per_second": 6.363, "step": 14100 }, { "epoch": 1.105370243568424, "grad_norm": NaN, "learning_rate": 4.4472922210048524e-05, "loss": 0.0, "step": 14125 }, { "epoch": 1.107326616453096, "grad_norm": NaN, "learning_rate": 4.4463139771482234e-05, "loss": 0.0, "step": 14150 }, { "epoch": 1.1092829893377678, "grad_norm": NaN, "learning_rate": 4.445335733291595e-05, "loss": 0.0, "step": 14175 }, { "epoch": 1.1112393622224397, "grad_norm": NaN, "learning_rate": 4.444357489434966e-05, "loss": 0.0, "step": 14200 }, { "epoch": 1.1112393622224397, "eval_loss": NaN, "eval_runtime": 295.492, "eval_samples_per_second": 407.168, "eval_steps_per_second": 6.362, "step": 14200 }, { "epoch": 1.1131957351071113, "grad_norm": NaN, "learning_rate": 4.443379245578338e-05, "loss": 0.0, "step": 14225 }, { "epoch": 1.1151521079917832, "grad_norm": NaN, "learning_rate": 4.442401001721709e-05, "loss": 0.0, "step": 14250 }, { "epoch": 1.117108480876455, "grad_norm": NaN, "learning_rate": 4.441422757865081e-05, "loss": 0.0, "step": 14275 }, { "epoch": 1.119064853761127, "grad_norm": NaN, "learning_rate": 4.440444514008452e-05, "loss": 0.0, "step": 14300 }, { "epoch": 1.119064853761127, "eval_loss": NaN, "eval_runtime": 296.0115, "eval_samples_per_second": 406.454, "eval_steps_per_second": 6.351, "step": 14300 }, { "epoch": 1.1210212266457986, "grad_norm": NaN, "learning_rate": 4.4394662701518236e-05, "loss": 0.0, "step": 14325 }, { "epoch": 1.1229775995304705, "grad_norm": NaN, "learning_rate": 4.438488026295195e-05, "loss": 0.0, "step": 14350 }, { "epoch": 1.1249339724151424, "grad_norm": NaN, "learning_rate": 4.4375097824385665e-05, "loss": 0.0, "step": 14375 }, { "epoch": 1.126890345299814, "grad_norm": NaN, "learning_rate": 4.4365315385819375e-05, "loss": 0.0, "step": 14400 }, { "epoch": 1.126890345299814, "eval_loss": NaN, "eval_runtime": 295.9043, "eval_samples_per_second": 406.601, "eval_steps_per_second": 6.353, "step": 14400 }, { "epoch": 1.128846718184486, "grad_norm": NaN, "learning_rate": 4.435553294725309e-05, "loss": 0.0, "step": 14425 }, { "epoch": 1.1308030910691578, "grad_norm": NaN, "learning_rate": 4.4345750508686803e-05, "loss": 0.0, "step": 14450 }, { "epoch": 1.1327594639538296, "grad_norm": NaN, "learning_rate": 4.433596807012052e-05, "loss": 0.0, "step": 14475 }, { "epoch": 1.1347158368385015, "grad_norm": NaN, "learning_rate": 4.432618563155423e-05, "loss": 0.0, "step": 14500 }, { "epoch": 1.1347158368385015, "eval_loss": NaN, "eval_runtime": 296.4025, "eval_samples_per_second": 405.918, "eval_steps_per_second": 6.343, "step": 14500 }, { "epoch": 1.1366722097231732, "grad_norm": NaN, "learning_rate": 4.431640319298795e-05, "loss": 0.0, "step": 14525 }, { "epoch": 1.138628582607845, "grad_norm": NaN, "learning_rate": 4.430662075442166e-05, "loss": 0.0, "step": 14550 }, { "epoch": 1.140584955492517, "grad_norm": NaN, "learning_rate": 4.429683831585538e-05, "loss": 0.0, "step": 14575 }, { "epoch": 1.1425413283771886, "grad_norm": NaN, "learning_rate": 4.428705587728909e-05, "loss": 0.0, "step": 14600 }, { "epoch": 1.1425413283771886, "eval_loss": NaN, "eval_runtime": 296.7388, "eval_samples_per_second": 405.458, "eval_steps_per_second": 6.336, "step": 14600 }, { "epoch": 1.1444977012618605, "grad_norm": NaN, "learning_rate": 4.4277273438722805e-05, "loss": 0.0, "step": 14625 }, { "epoch": 1.1464540741465323, "grad_norm": NaN, "learning_rate": 4.4267491000156516e-05, "loss": 0.0, "step": 14650 }, { "epoch": 1.1484104470312042, "grad_norm": NaN, "learning_rate": 4.4257708561590234e-05, "loss": 0.0, "step": 14675 }, { "epoch": 1.150366819915876, "grad_norm": NaN, "learning_rate": 4.4247926123023944e-05, "loss": 0.0, "step": 14700 }, { "epoch": 1.150366819915876, "eval_loss": NaN, "eval_runtime": 294.8498, "eval_samples_per_second": 408.055, "eval_steps_per_second": 6.376, "step": 14700 }, { "epoch": 1.1523231928005477, "grad_norm": NaN, "learning_rate": 4.423814368445766e-05, "loss": 0.0, "step": 14725 }, { "epoch": 1.1542795656852196, "grad_norm": NaN, "learning_rate": 4.422836124589137e-05, "loss": 0.0, "step": 14750 }, { "epoch": 1.1562359385698915, "grad_norm": NaN, "learning_rate": 4.421857880732509e-05, "loss": 0.0, "step": 14775 }, { "epoch": 1.1581923114545631, "grad_norm": NaN, "learning_rate": 4.42087963687588e-05, "loss": 0.0, "step": 14800 }, { "epoch": 1.1581923114545631, "eval_loss": NaN, "eval_runtime": 295.5386, "eval_samples_per_second": 407.104, "eval_steps_per_second": 6.361, "step": 14800 }, { "epoch": 1.160148684339235, "grad_norm": NaN, "learning_rate": 4.4199013930192525e-05, "loss": 0.0, "step": 14825 }, { "epoch": 1.162105057223907, "grad_norm": NaN, "learning_rate": 4.4189231491626236e-05, "loss": 0.0, "step": 14850 }, { "epoch": 1.1640614301085788, "grad_norm": NaN, "learning_rate": 4.417944905305995e-05, "loss": 0.0, "step": 14875 }, { "epoch": 1.1660178029932504, "grad_norm": NaN, "learning_rate": 4.4169666614493664e-05, "loss": 0.0, "step": 14900 }, { "epoch": 1.1660178029932504, "eval_loss": NaN, "eval_runtime": 295.9771, "eval_samples_per_second": 406.501, "eval_steps_per_second": 6.352, "step": 14900 }, { "epoch": 1.1679741758779223, "grad_norm": NaN, "learning_rate": 4.415988417592738e-05, "loss": 0.0, "step": 14925 }, { "epoch": 1.1699305487625942, "grad_norm": NaN, "learning_rate": 4.415010173736109e-05, "loss": 0.0, "step": 14950 }, { "epoch": 1.171886921647266, "grad_norm": NaN, "learning_rate": 4.414031929879481e-05, "loss": 0.0, "step": 14975 }, { "epoch": 1.1738432945319377, "grad_norm": NaN, "learning_rate": 4.413053686022852e-05, "loss": 0.0, "step": 15000 }, { "epoch": 1.1738432945319377, "eval_loss": NaN, "eval_runtime": 295.7892, "eval_samples_per_second": 406.759, "eval_steps_per_second": 6.356, "step": 15000 }, { "epoch": 1.1757996674166096, "grad_norm": NaN, "learning_rate": 4.412075442166224e-05, "loss": 0.0, "step": 15025 }, { "epoch": 1.1777560403012814, "grad_norm": NaN, "learning_rate": 4.411097198309595e-05, "loss": 0.0, "step": 15050 }, { "epoch": 1.1797124131859533, "grad_norm": NaN, "learning_rate": 4.4101189544529666e-05, "loss": 0.0, "step": 15075 }, { "epoch": 1.181668786070625, "grad_norm": NaN, "learning_rate": 4.4091407105963376e-05, "loss": 0.0, "step": 15100 }, { "epoch": 1.181668786070625, "eval_loss": NaN, "eval_runtime": 295.9788, "eval_samples_per_second": 406.499, "eval_steps_per_second": 6.352, "step": 15100 }, { "epoch": 1.1836251589552969, "grad_norm": NaN, "learning_rate": 4.4081624667397094e-05, "loss": 0.0, "step": 15125 }, { "epoch": 1.1855815318399687, "grad_norm": NaN, "learning_rate": 4.4071842228830805e-05, "loss": 0.0, "step": 15150 }, { "epoch": 1.1875379047246406, "grad_norm": NaN, "learning_rate": 4.406205979026452e-05, "loss": 0.0, "step": 15175 }, { "epoch": 1.1894942776093123, "grad_norm": NaN, "learning_rate": 4.405227735169823e-05, "loss": 0.0, "step": 15200 }, { "epoch": 1.1894942776093123, "eval_loss": NaN, "eval_runtime": 294.3991, "eval_samples_per_second": 408.68, "eval_steps_per_second": 6.386, "step": 15200 }, { "epoch": 1.1914506504939841, "grad_norm": NaN, "learning_rate": 4.404249491313195e-05, "loss": 0.0, "step": 15225 }, { "epoch": 1.193407023378656, "grad_norm": NaN, "learning_rate": 4.403271247456566e-05, "loss": 0.0, "step": 15250 }, { "epoch": 1.1953633962633279, "grad_norm": NaN, "learning_rate": 4.402293003599938e-05, "loss": 0.0, "step": 15275 }, { "epoch": 1.1973197691479995, "grad_norm": NaN, "learning_rate": 4.401314759743309e-05, "loss": 0.0, "step": 15300 }, { "epoch": 1.1973197691479995, "eval_loss": NaN, "eval_runtime": 296.6273, "eval_samples_per_second": 405.61, "eval_steps_per_second": 6.338, "step": 15300 }, { "epoch": 1.1992761420326714, "grad_norm": NaN, "learning_rate": 4.4003365158866806e-05, "loss": 0.0, "step": 15325 }, { "epoch": 1.2012325149173433, "grad_norm": NaN, "learning_rate": 4.399358272030052e-05, "loss": 0.0, "step": 15350 }, { "epoch": 1.2031888878020152, "grad_norm": NaN, "learning_rate": 4.3983800281734235e-05, "loss": 0.0, "step": 15375 }, { "epoch": 1.2051452606866868, "grad_norm": NaN, "learning_rate": 4.3974017843167945e-05, "loss": 0.0, "step": 15400 }, { "epoch": 1.2051452606866868, "eval_loss": NaN, "eval_runtime": 295.9835, "eval_samples_per_second": 406.492, "eval_steps_per_second": 6.352, "step": 15400 }, { "epoch": 1.2071016335713587, "grad_norm": NaN, "learning_rate": 4.396423540460166e-05, "loss": 0.0, "step": 15425 }, { "epoch": 1.2090580064560306, "grad_norm": NaN, "learning_rate": 4.3954452966035374e-05, "loss": 0.0, "step": 15450 }, { "epoch": 1.2110143793407024, "grad_norm": NaN, "learning_rate": 4.394467052746909e-05, "loss": 0.0, "step": 15475 }, { "epoch": 1.212970752225374, "grad_norm": NaN, "learning_rate": 4.39348880889028e-05, "loss": 0.0, "step": 15500 }, { "epoch": 1.212970752225374, "eval_loss": NaN, "eval_runtime": 296.0429, "eval_samples_per_second": 406.411, "eval_steps_per_second": 6.35, "step": 15500 }, { "epoch": 1.214927125110046, "grad_norm": NaN, "learning_rate": 4.392510565033652e-05, "loss": 0.0, "step": 15525 }, { "epoch": 1.2168834979947178, "grad_norm": NaN, "learning_rate": 4.391532321177023e-05, "loss": 0.0, "step": 15550 }, { "epoch": 1.2188398708793895, "grad_norm": NaN, "learning_rate": 4.390554077320395e-05, "loss": 0.0, "step": 15575 }, { "epoch": 1.2207962437640614, "grad_norm": NaN, "learning_rate": 4.389575833463766e-05, "loss": 0.0, "step": 15600 }, { "epoch": 1.2207962437640614, "eval_loss": NaN, "eval_runtime": 297.417, "eval_samples_per_second": 404.533, "eval_steps_per_second": 6.321, "step": 15600 }, { "epoch": 1.2227526166487332, "grad_norm": NaN, "learning_rate": 4.3885975896071375e-05, "loss": 0.0, "step": 15625 }, { "epoch": 1.2247089895334051, "grad_norm": NaN, "learning_rate": 4.3876193457505086e-05, "loss": 0.0, "step": 15650 }, { "epoch": 1.226665362418077, "grad_norm": NaN, "learning_rate": 4.3866411018938804e-05, "loss": 0.0, "step": 15675 }, { "epoch": 1.2286217353027487, "grad_norm": NaN, "learning_rate": 4.3856628580372514e-05, "loss": 0.0, "step": 15700 }, { "epoch": 1.2286217353027487, "eval_loss": NaN, "eval_runtime": 295.1591, "eval_samples_per_second": 407.628, "eval_steps_per_second": 6.369, "step": 15700 }, { "epoch": 1.2305781081874205, "grad_norm": NaN, "learning_rate": 4.384684614180623e-05, "loss": 0.0, "step": 15725 }, { "epoch": 1.2325344810720924, "grad_norm": NaN, "learning_rate": 4.383706370323994e-05, "loss": 0.0, "step": 15750 }, { "epoch": 1.234490853956764, "grad_norm": NaN, "learning_rate": 4.382728126467366e-05, "loss": 0.0, "step": 15775 }, { "epoch": 1.236447226841436, "grad_norm": NaN, "learning_rate": 4.381749882610737e-05, "loss": 0.0, "step": 15800 }, { "epoch": 1.236447226841436, "eval_loss": NaN, "eval_runtime": 295.6761, "eval_samples_per_second": 406.915, "eval_steps_per_second": 6.358, "step": 15800 }, { "epoch": 1.2384035997261078, "grad_norm": NaN, "learning_rate": 4.380771638754109e-05, "loss": 0.0, "step": 15825 }, { "epoch": 1.2403599726107797, "grad_norm": NaN, "learning_rate": 4.37979339489748e-05, "loss": 0.0, "step": 15850 }, { "epoch": 1.2423163454954516, "grad_norm": NaN, "learning_rate": 4.3788151510408516e-05, "loss": 0.0, "step": 15875 }, { "epoch": 1.2442727183801232, "grad_norm": NaN, "learning_rate": 4.377836907184223e-05, "loss": 0.0, "step": 15900 }, { "epoch": 1.2442727183801232, "eval_loss": NaN, "eval_runtime": 295.387, "eval_samples_per_second": 407.313, "eval_steps_per_second": 6.365, "step": 15900 }, { "epoch": 1.246229091264795, "grad_norm": NaN, "learning_rate": 4.3768586633275944e-05, "loss": 0.0, "step": 15925 }, { "epoch": 1.248185464149467, "grad_norm": NaN, "learning_rate": 4.3758804194709655e-05, "loss": 0.0, "step": 15950 }, { "epoch": 1.2501418370341386, "grad_norm": NaN, "learning_rate": 4.374902175614337e-05, "loss": 0.0, "step": 15975 }, { "epoch": 1.2520982099188105, "grad_norm": NaN, "learning_rate": 4.373923931757708e-05, "loss": 0.0, "step": 16000 }, { "epoch": 1.2520982099188105, "eval_loss": NaN, "eval_runtime": 295.6354, "eval_samples_per_second": 406.971, "eval_steps_per_second": 6.359, "step": 16000 }, { "epoch": 1.2540545828034824, "grad_norm": NaN, "learning_rate": 4.37294568790108e-05, "loss": 0.0, "step": 16025 }, { "epoch": 1.2560109556881542, "grad_norm": NaN, "learning_rate": 4.371967444044451e-05, "loss": 0.0, "step": 16050 }, { "epoch": 1.2579673285728261, "grad_norm": NaN, "learning_rate": 4.370989200187823e-05, "loss": 0.0, "step": 16075 }, { "epoch": 1.2599237014574978, "grad_norm": NaN, "learning_rate": 4.370010956331194e-05, "loss": 0.0, "step": 16100 }, { "epoch": 1.2599237014574978, "eval_loss": NaN, "eval_runtime": 296.5334, "eval_samples_per_second": 405.738, "eval_steps_per_second": 6.34, "step": 16100 }, { "epoch": 1.2618800743421696, "grad_norm": NaN, "learning_rate": 4.3690327124745664e-05, "loss": 0.0, "step": 16125 }, { "epoch": 1.2638364472268413, "grad_norm": NaN, "learning_rate": 4.3680544686179375e-05, "loss": 0.0, "step": 16150 }, { "epoch": 1.2657928201115132, "grad_norm": NaN, "learning_rate": 4.367076224761309e-05, "loss": 0.0, "step": 16175 }, { "epoch": 1.267749192996185, "grad_norm": NaN, "learning_rate": 4.36609798090468e-05, "loss": 0.0, "step": 16200 }, { "epoch": 1.267749192996185, "eval_loss": NaN, "eval_runtime": 296.4742, "eval_samples_per_second": 405.819, "eval_steps_per_second": 6.341, "step": 16200 }, { "epoch": 1.269705565880857, "grad_norm": NaN, "learning_rate": 4.365119737048052e-05, "loss": 0.0, "step": 16225 }, { "epoch": 1.2716619387655288, "grad_norm": NaN, "learning_rate": 4.364141493191423e-05, "loss": 0.0, "step": 16250 }, { "epoch": 1.2736183116502005, "grad_norm": NaN, "learning_rate": 4.363163249334795e-05, "loss": 0.0, "step": 16275 }, { "epoch": 1.2755746845348723, "grad_norm": NaN, "learning_rate": 4.362185005478166e-05, "loss": 0.0, "step": 16300 }, { "epoch": 1.2755746845348723, "eval_loss": NaN, "eval_runtime": 296.4176, "eval_samples_per_second": 405.897, "eval_steps_per_second": 6.342, "step": 16300 }, { "epoch": 1.2775310574195442, "grad_norm": NaN, "learning_rate": 4.3612067616215377e-05, "loss": 0.0, "step": 16325 }, { "epoch": 1.2794874303042159, "grad_norm": NaN, "learning_rate": 4.360228517764909e-05, "loss": 0.0, "step": 16350 }, { "epoch": 1.2814438031888877, "grad_norm": NaN, "learning_rate": 4.3592502739082805e-05, "loss": 0.0, "step": 16375 }, { "epoch": 1.2834001760735596, "grad_norm": NaN, "learning_rate": 4.3582720300516515e-05, "loss": 0.0, "step": 16400 }, { "epoch": 1.2834001760735596, "eval_loss": NaN, "eval_runtime": 295.5087, "eval_samples_per_second": 407.145, "eval_steps_per_second": 6.362, "step": 16400 }, { "epoch": 1.2853565489582315, "grad_norm": NaN, "learning_rate": 4.357293786195023e-05, "loss": 0.0, "step": 16425 }, { "epoch": 1.2873129218429034, "grad_norm": NaN, "learning_rate": 4.3563155423383944e-05, "loss": 0.0, "step": 16450 }, { "epoch": 1.289269294727575, "grad_norm": NaN, "learning_rate": 4.355337298481766e-05, "loss": 0.0, "step": 16475 }, { "epoch": 1.2912256676122469, "grad_norm": NaN, "learning_rate": 4.354359054625137e-05, "loss": 0.0, "step": 16500 }, { "epoch": 1.2912256676122469, "eval_loss": NaN, "eval_runtime": 296.2902, "eval_samples_per_second": 406.071, "eval_steps_per_second": 6.345, "step": 16500 }, { "epoch": 1.2931820404969188, "grad_norm": NaN, "learning_rate": 4.353380810768509e-05, "loss": 0.0, "step": 16525 }, { "epoch": 1.2951384133815904, "grad_norm": NaN, "learning_rate": 4.35240256691188e-05, "loss": 0.0, "step": 16550 }, { "epoch": 1.2970947862662623, "grad_norm": NaN, "learning_rate": 4.351424323055252e-05, "loss": 0.0, "step": 16575 }, { "epoch": 1.2990511591509342, "grad_norm": NaN, "learning_rate": 4.350446079198623e-05, "loss": 0.0, "step": 16600 }, { "epoch": 1.2990511591509342, "eval_loss": NaN, "eval_runtime": 295.5638, "eval_samples_per_second": 407.07, "eval_steps_per_second": 6.361, "step": 16600 }, { "epoch": 1.301007532035606, "grad_norm": NaN, "learning_rate": 4.3494678353419946e-05, "loss": 0.0, "step": 16625 }, { "epoch": 1.302963904920278, "grad_norm": NaN, "learning_rate": 4.3484895914853656e-05, "loss": 0.0, "step": 16650 }, { "epoch": 1.3049202778049496, "grad_norm": NaN, "learning_rate": 4.3475113476287374e-05, "loss": 0.0, "step": 16675 }, { "epoch": 1.3068766506896214, "grad_norm": NaN, "learning_rate": 4.3465331037721084e-05, "loss": 0.0, "step": 16700 }, { "epoch": 1.3068766506896214, "eval_loss": NaN, "eval_runtime": 296.2743, "eval_samples_per_second": 406.093, "eval_steps_per_second": 6.345, "step": 16700 }, { "epoch": 1.3088330235742933, "grad_norm": NaN, "learning_rate": 4.34555485991548e-05, "loss": 0.0, "step": 16725 }, { "epoch": 1.310789396458965, "grad_norm": NaN, "learning_rate": 4.344576616058851e-05, "loss": 0.0, "step": 16750 }, { "epoch": 1.3127457693436368, "grad_norm": NaN, "learning_rate": 4.343598372202223e-05, "loss": 0.0, "step": 16775 }, { "epoch": 1.3147021422283087, "grad_norm": NaN, "learning_rate": 4.342620128345594e-05, "loss": 0.0, "step": 16800 }, { "epoch": 1.3147021422283087, "eval_loss": NaN, "eval_runtime": 296.8189, "eval_samples_per_second": 405.348, "eval_steps_per_second": 6.334, "step": 16800 }, { "epoch": 1.3166585151129806, "grad_norm": NaN, "learning_rate": 4.341641884488966e-05, "loss": 0.0, "step": 16825 }, { "epoch": 1.3186148879976525, "grad_norm": NaN, "learning_rate": 4.340663640632337e-05, "loss": 0.0, "step": 16850 }, { "epoch": 1.3205712608823241, "grad_norm": NaN, "learning_rate": 4.3396853967757086e-05, "loss": 0.0, "step": 16875 }, { "epoch": 1.322527633766996, "grad_norm": NaN, "learning_rate": 4.33870715291908e-05, "loss": 0.0, "step": 16900 }, { "epoch": 1.322527633766996, "eval_loss": NaN, "eval_runtime": 295.3064, "eval_samples_per_second": 407.424, "eval_steps_per_second": 6.366, "step": 16900 }, { "epoch": 1.3244840066516679, "grad_norm": NaN, "learning_rate": 4.3377289090624515e-05, "loss": 0.0, "step": 16925 }, { "epoch": 1.3264403795363395, "grad_norm": NaN, "learning_rate": 4.3367506652058225e-05, "loss": 0.0, "step": 16950 }, { "epoch": 1.3283967524210114, "grad_norm": NaN, "learning_rate": 4.335772421349194e-05, "loss": 0.0, "step": 16975 }, { "epoch": 1.3303531253056833, "grad_norm": NaN, "learning_rate": 4.3347941774925653e-05, "loss": 0.0, "step": 17000 }, { "epoch": 1.3303531253056833, "eval_loss": NaN, "eval_runtime": 297.0564, "eval_samples_per_second": 405.024, "eval_steps_per_second": 6.329, "step": 17000 }, { "epoch": 1.3323094981903552, "grad_norm": NaN, "learning_rate": 4.333815933635937e-05, "loss": 0.0, "step": 17025 }, { "epoch": 1.334265871075027, "grad_norm": NaN, "learning_rate": 4.332837689779308e-05, "loss": 0.0, "step": 17050 }, { "epoch": 1.3362222439596987, "grad_norm": NaN, "learning_rate": 4.33185944592268e-05, "loss": 0.0, "step": 17075 }, { "epoch": 1.3381786168443706, "grad_norm": NaN, "learning_rate": 4.330881202066051e-05, "loss": 0.0, "step": 17100 }, { "epoch": 1.3381786168443706, "eval_loss": NaN, "eval_runtime": 295.9561, "eval_samples_per_second": 406.53, "eval_steps_per_second": 6.352, "step": 17100 }, { "epoch": 1.3401349897290424, "grad_norm": NaN, "learning_rate": 4.329902958209423e-05, "loss": 0.0, "step": 17125 }, { "epoch": 1.342091362613714, "grad_norm": NaN, "learning_rate": 4.328924714352794e-05, "loss": 0.0, "step": 17150 }, { "epoch": 1.344047735498386, "grad_norm": NaN, "learning_rate": 4.3279464704961655e-05, "loss": 0.0, "step": 17175 }, { "epoch": 1.3460041083830578, "grad_norm": NaN, "learning_rate": 4.3269682266395366e-05, "loss": 0.0, "step": 17200 }, { "epoch": 1.3460041083830578, "eval_loss": NaN, "eval_runtime": 295.8326, "eval_samples_per_second": 406.7, "eval_steps_per_second": 6.355, "step": 17200 }, { "epoch": 1.3479604812677297, "grad_norm": NaN, "learning_rate": 4.3259899827829084e-05, "loss": 0.0, "step": 17225 }, { "epoch": 1.3499168541524014, "grad_norm": NaN, "learning_rate": 4.3250117389262794e-05, "loss": 0.0, "step": 17250 }, { "epoch": 1.3518732270370732, "grad_norm": NaN, "learning_rate": 4.324033495069651e-05, "loss": 0.0, "step": 17275 }, { "epoch": 1.3538295999217451, "grad_norm": NaN, "learning_rate": 4.323055251213022e-05, "loss": 0.0, "step": 17300 }, { "epoch": 1.3538295999217451, "eval_loss": NaN, "eval_runtime": 297.1846, "eval_samples_per_second": 404.849, "eval_steps_per_second": 6.326, "step": 17300 }, { "epoch": 1.3557859728064168, "grad_norm": NaN, "learning_rate": 4.322077007356394e-05, "loss": 0.0, "step": 17325 }, { "epoch": 1.3577423456910886, "grad_norm": NaN, "learning_rate": 4.321098763499765e-05, "loss": 0.0, "step": 17350 }, { "epoch": 1.3596987185757605, "grad_norm": NaN, "learning_rate": 4.320120519643137e-05, "loss": 0.0, "step": 17375 }, { "epoch": 1.3616550914604324, "grad_norm": NaN, "learning_rate": 4.3191422757865086e-05, "loss": 0.0, "step": 17400 }, { "epoch": 1.3616550914604324, "eval_loss": NaN, "eval_runtime": 297.4389, "eval_samples_per_second": 404.503, "eval_steps_per_second": 6.321, "step": 17400 }, { "epoch": 1.3636114643451043, "grad_norm": NaN, "learning_rate": 4.3181640319298796e-05, "loss": 0.0, "step": 17425 }, { "epoch": 1.365567837229776, "grad_norm": NaN, "learning_rate": 4.3171857880732514e-05, "loss": 0.0, "step": 17450 }, { "epoch": 1.3675242101144478, "grad_norm": NaN, "learning_rate": 4.3162075442166224e-05, "loss": 0.0, "step": 17475 }, { "epoch": 1.3694805829991197, "grad_norm": NaN, "learning_rate": 4.315229300359994e-05, "loss": 0.0, "step": 17500 }, { "epoch": 1.3694805829991197, "eval_loss": NaN, "eval_runtime": 295.6139, "eval_samples_per_second": 407.001, "eval_steps_per_second": 6.36, "step": 17500 }, { "epoch": 1.3714369558837913, "grad_norm": NaN, "learning_rate": 4.314251056503365e-05, "loss": 0.0, "step": 17525 }, { "epoch": 1.3733933287684632, "grad_norm": NaN, "learning_rate": 4.313272812646737e-05, "loss": 0.0, "step": 17550 }, { "epoch": 1.375349701653135, "grad_norm": NaN, "learning_rate": 4.312294568790108e-05, "loss": 0.0, "step": 17575 }, { "epoch": 1.377306074537807, "grad_norm": NaN, "learning_rate": 4.31131632493348e-05, "loss": 0.0, "step": 17600 }, { "epoch": 1.377306074537807, "eval_loss": NaN, "eval_runtime": 295.8702, "eval_samples_per_second": 406.648, "eval_steps_per_second": 6.354, "step": 17600 }, { "epoch": 1.3792624474224788, "grad_norm": NaN, "learning_rate": 4.310338081076851e-05, "loss": 0.0, "step": 17625 }, { "epoch": 1.3812188203071505, "grad_norm": NaN, "learning_rate": 4.3093598372202226e-05, "loss": 0.0, "step": 17650 }, { "epoch": 1.3831751931918224, "grad_norm": NaN, "learning_rate": 4.308381593363594e-05, "loss": 0.0, "step": 17675 }, { "epoch": 1.3851315660764942, "grad_norm": NaN, "learning_rate": 4.3074033495069655e-05, "loss": 0.0, "step": 17700 }, { "epoch": 1.3851315660764942, "eval_loss": NaN, "eval_runtime": 296.1659, "eval_samples_per_second": 406.242, "eval_steps_per_second": 6.348, "step": 17700 }, { "epoch": 1.387087938961166, "grad_norm": NaN, "learning_rate": 4.3064251056503365e-05, "loss": 0.0, "step": 17725 }, { "epoch": 1.3890443118458378, "grad_norm": NaN, "learning_rate": 4.305446861793708e-05, "loss": 0.0, "step": 17750 }, { "epoch": 1.3910006847305096, "grad_norm": NaN, "learning_rate": 4.3044686179370793e-05, "loss": 0.0, "step": 17775 }, { "epoch": 1.3929570576151815, "grad_norm": NaN, "learning_rate": 4.303490374080451e-05, "loss": 0.0, "step": 17800 }, { "epoch": 1.3929570576151815, "eval_loss": NaN, "eval_runtime": 295.343, "eval_samples_per_second": 407.374, "eval_steps_per_second": 6.365, "step": 17800 }, { "epoch": 1.3949134304998534, "grad_norm": NaN, "learning_rate": 4.302512130223822e-05, "loss": 0.0, "step": 17825 }, { "epoch": 1.396869803384525, "grad_norm": NaN, "learning_rate": 4.301533886367194e-05, "loss": 0.0, "step": 17850 }, { "epoch": 1.398826176269197, "grad_norm": NaN, "learning_rate": 4.300555642510565e-05, "loss": 0.0, "step": 17875 }, { "epoch": 1.4007825491538688, "grad_norm": NaN, "learning_rate": 4.299577398653937e-05, "loss": 0.0, "step": 17900 }, { "epoch": 1.4007825491538688, "eval_loss": NaN, "eval_runtime": 295.8221, "eval_samples_per_second": 406.714, "eval_steps_per_second": 6.355, "step": 17900 }, { "epoch": 1.4027389220385404, "grad_norm": NaN, "learning_rate": 4.298599154797308e-05, "loss": 0.0, "step": 17925 }, { "epoch": 1.4046952949232123, "grad_norm": NaN, "learning_rate": 4.2976209109406795e-05, "loss": 0.0, "step": 17950 }, { "epoch": 1.4066516678078842, "grad_norm": NaN, "learning_rate": 4.2966426670840506e-05, "loss": 0.0, "step": 17975 }, { "epoch": 1.408608040692556, "grad_norm": NaN, "learning_rate": 4.2956644232274224e-05, "loss": 0.0, "step": 18000 }, { "epoch": 1.408608040692556, "eval_loss": NaN, "eval_runtime": 294.1824, "eval_samples_per_second": 408.981, "eval_steps_per_second": 6.391, "step": 18000 }, { "epoch": 1.410564413577228, "grad_norm": NaN, "learning_rate": 4.2946861793707934e-05, "loss": 0.0, "step": 18025 }, { "epoch": 1.4125207864618996, "grad_norm": NaN, "learning_rate": 4.293707935514165e-05, "loss": 0.0, "step": 18050 }, { "epoch": 1.4144771593465715, "grad_norm": NaN, "learning_rate": 4.292729691657536e-05, "loss": 0.0, "step": 18075 }, { "epoch": 1.4164335322312434, "grad_norm": NaN, "learning_rate": 4.291751447800908e-05, "loss": 0.0, "step": 18100 }, { "epoch": 1.4164335322312434, "eval_loss": NaN, "eval_runtime": 295.2392, "eval_samples_per_second": 407.517, "eval_steps_per_second": 6.368, "step": 18100 }, { "epoch": 1.418389905115915, "grad_norm": NaN, "learning_rate": 4.290773203944279e-05, "loss": 0.0, "step": 18125 }, { "epoch": 1.4203462780005869, "grad_norm": NaN, "learning_rate": 4.289794960087651e-05, "loss": 0.0, "step": 18150 }, { "epoch": 1.4223026508852588, "grad_norm": NaN, "learning_rate": 4.2888167162310226e-05, "loss": 0.0, "step": 18175 }, { "epoch": 1.4242590237699306, "grad_norm": NaN, "learning_rate": 4.2878384723743936e-05, "loss": 0.0, "step": 18200 }, { "epoch": 1.4242590237699306, "eval_loss": NaN, "eval_runtime": 296.7309, "eval_samples_per_second": 405.468, "eval_steps_per_second": 6.336, "step": 18200 }, { "epoch": 1.4262153966546023, "grad_norm": NaN, "learning_rate": 4.2868602285177654e-05, "loss": 0.0, "step": 18225 }, { "epoch": 1.4281717695392742, "grad_norm": NaN, "learning_rate": 4.2858819846611364e-05, "loss": 0.0, "step": 18250 }, { "epoch": 1.430128142423946, "grad_norm": NaN, "learning_rate": 4.284903740804508e-05, "loss": 0.0, "step": 18275 }, { "epoch": 1.4320845153086177, "grad_norm": NaN, "learning_rate": 4.283925496947879e-05, "loss": 0.0, "step": 18300 }, { "epoch": 1.4320845153086177, "eval_loss": NaN, "eval_runtime": 296.3822, "eval_samples_per_second": 405.945, "eval_steps_per_second": 6.343, "step": 18300 }, { "epoch": 1.4340408881932896, "grad_norm": NaN, "learning_rate": 4.282947253091251e-05, "loss": 0.0, "step": 18325 }, { "epoch": 1.4359972610779614, "grad_norm": NaN, "learning_rate": 4.281969009234622e-05, "loss": 0.0, "step": 18350 }, { "epoch": 1.4379536339626333, "grad_norm": NaN, "learning_rate": 4.280990765377994e-05, "loss": 0.0, "step": 18375 }, { "epoch": 1.4399100068473052, "grad_norm": NaN, "learning_rate": 4.280012521521365e-05, "loss": 0.0, "step": 18400 }, { "epoch": 1.4399100068473052, "eval_loss": NaN, "eval_runtime": 295.1574, "eval_samples_per_second": 407.63, "eval_steps_per_second": 6.369, "step": 18400 }, { "epoch": 1.4418663797319768, "grad_norm": NaN, "learning_rate": 4.2790342776647366e-05, "loss": 0.0, "step": 18425 }, { "epoch": 1.4438227526166487, "grad_norm": NaN, "learning_rate": 4.278056033808108e-05, "loss": 0.0, "step": 18450 }, { "epoch": 1.4457791255013206, "grad_norm": NaN, "learning_rate": 4.2770777899514795e-05, "loss": 0.0, "step": 18475 }, { "epoch": 1.4477354983859922, "grad_norm": NaN, "learning_rate": 4.2760995460948505e-05, "loss": 0.0, "step": 18500 }, { "epoch": 1.4477354983859922, "eval_loss": NaN, "eval_runtime": 295.2626, "eval_samples_per_second": 407.485, "eval_steps_per_second": 6.367, "step": 18500 }, { "epoch": 1.4496918712706641, "grad_norm": NaN, "learning_rate": 4.275121302238222e-05, "loss": 0.0, "step": 18525 }, { "epoch": 1.451648244155336, "grad_norm": NaN, "learning_rate": 4.274143058381593e-05, "loss": 0.0, "step": 18550 }, { "epoch": 1.4536046170400079, "grad_norm": NaN, "learning_rate": 4.273164814524965e-05, "loss": 0.0, "step": 18575 }, { "epoch": 1.4555609899246797, "grad_norm": NaN, "learning_rate": 4.272186570668336e-05, "loss": 0.0, "step": 18600 }, { "epoch": 1.4555609899246797, "eval_loss": NaN, "eval_runtime": 294.9072, "eval_samples_per_second": 407.976, "eval_steps_per_second": 6.375, "step": 18600 }, { "epoch": 1.4575173628093514, "grad_norm": NaN, "learning_rate": 4.271208326811708e-05, "loss": 0.0, "step": 18625 }, { "epoch": 1.4594737356940233, "grad_norm": NaN, "learning_rate": 4.270230082955079e-05, "loss": 0.0, "step": 18650 }, { "epoch": 1.4614301085786952, "grad_norm": NaN, "learning_rate": 4.269251839098451e-05, "loss": 0.0, "step": 18675 }, { "epoch": 1.4633864814633668, "grad_norm": NaN, "learning_rate": 4.2682735952418225e-05, "loss": 0.0, "step": 18700 }, { "epoch": 1.4633864814633668, "eval_loss": NaN, "eval_runtime": 295.7107, "eval_samples_per_second": 406.867, "eval_steps_per_second": 6.358, "step": 18700 }, { "epoch": 1.4653428543480387, "grad_norm": NaN, "learning_rate": 4.2672953513851935e-05, "loss": 0.0, "step": 18725 }, { "epoch": 1.4672992272327106, "grad_norm": NaN, "learning_rate": 4.266317107528565e-05, "loss": 0.0, "step": 18750 }, { "epoch": 1.4692556001173824, "grad_norm": NaN, "learning_rate": 4.2653388636719364e-05, "loss": 0.0, "step": 18775 }, { "epoch": 1.4712119730020543, "grad_norm": NaN, "learning_rate": 4.264360619815308e-05, "loss": 0.0, "step": 18800 }, { "epoch": 1.4712119730020543, "eval_loss": NaN, "eval_runtime": 296.7301, "eval_samples_per_second": 405.469, "eval_steps_per_second": 6.336, "step": 18800 }, { "epoch": 1.473168345886726, "grad_norm": NaN, "learning_rate": 4.263382375958679e-05, "loss": 0.0, "step": 18825 }, { "epoch": 1.4751247187713978, "grad_norm": NaN, "learning_rate": 4.262404132102051e-05, "loss": 0.0, "step": 18850 }, { "epoch": 1.4770810916560697, "grad_norm": NaN, "learning_rate": 4.261425888245422e-05, "loss": 0.0, "step": 18875 }, { "epoch": 1.4790374645407414, "grad_norm": NaN, "learning_rate": 4.260447644388794e-05, "loss": 0.0, "step": 18900 }, { "epoch": 1.4790374645407414, "eval_loss": NaN, "eval_runtime": 295.3576, "eval_samples_per_second": 407.354, "eval_steps_per_second": 6.365, "step": 18900 }, { "epoch": 1.4809938374254132, "grad_norm": NaN, "learning_rate": 4.259469400532165e-05, "loss": 0.0, "step": 18925 }, { "epoch": 1.4829502103100851, "grad_norm": NaN, "learning_rate": 4.2584911566755365e-05, "loss": 0.0, "step": 18950 }, { "epoch": 1.484906583194757, "grad_norm": NaN, "learning_rate": 4.2575129128189076e-05, "loss": 0.0, "step": 18975 }, { "epoch": 1.4868629560794289, "grad_norm": NaN, "learning_rate": 4.2565346689622794e-05, "loss": 0.0, "step": 19000 }, { "epoch": 1.4868629560794289, "eval_loss": NaN, "eval_runtime": 295.6446, "eval_samples_per_second": 406.958, "eval_steps_per_second": 6.359, "step": 19000 }, { "epoch": 1.4888193289641005, "grad_norm": NaN, "learning_rate": 4.2555564251056504e-05, "loss": 0.0, "step": 19025 }, { "epoch": 1.4907757018487724, "grad_norm": NaN, "learning_rate": 4.254578181249022e-05, "loss": 0.0, "step": 19050 }, { "epoch": 1.4927320747334443, "grad_norm": NaN, "learning_rate": 4.253599937392393e-05, "loss": 0.0, "step": 19075 }, { "epoch": 1.494688447618116, "grad_norm": NaN, "learning_rate": 4.252621693535765e-05, "loss": 0.0, "step": 19100 }, { "epoch": 1.494688447618116, "eval_loss": NaN, "eval_runtime": 294.8521, "eval_samples_per_second": 408.052, "eval_steps_per_second": 6.376, "step": 19100 }, { "epoch": 1.4966448205027878, "grad_norm": NaN, "learning_rate": 4.251643449679136e-05, "loss": 0.0, "step": 19125 }, { "epoch": 1.4986011933874597, "grad_norm": NaN, "learning_rate": 4.250665205822508e-05, "loss": 0.0, "step": 19150 }, { "epoch": 1.5005575662721315, "grad_norm": NaN, "learning_rate": 4.249686961965879e-05, "loss": 0.0, "step": 19175 }, { "epoch": 1.5025139391568034, "grad_norm": NaN, "learning_rate": 4.2487087181092506e-05, "loss": 0.0, "step": 19200 }, { "epoch": 1.5025139391568034, "eval_loss": NaN, "eval_runtime": 296.7214, "eval_samples_per_second": 405.481, "eval_steps_per_second": 6.336, "step": 19200 }, { "epoch": 1.504470312041475, "grad_norm": NaN, "learning_rate": 4.247730474252622e-05, "loss": 0.0, "step": 19225 }, { "epoch": 1.506426684926147, "grad_norm": NaN, "learning_rate": 4.2467522303959934e-05, "loss": 0.0, "step": 19250 }, { "epoch": 1.5083830578108186, "grad_norm": NaN, "learning_rate": 4.2457739865393645e-05, "loss": 0.0, "step": 19275 }, { "epoch": 1.5103394306954905, "grad_norm": NaN, "learning_rate": 4.244795742682736e-05, "loss": 0.0, "step": 19300 }, { "epoch": 1.5103394306954905, "eval_loss": NaN, "eval_runtime": 297.6589, "eval_samples_per_second": 404.204, "eval_steps_per_second": 6.316, "step": 19300 }, { "epoch": 1.5122958035801624, "grad_norm": NaN, "learning_rate": 4.243817498826107e-05, "loss": 0.0, "step": 19325 }, { "epoch": 1.5142521764648342, "grad_norm": NaN, "learning_rate": 4.242839254969479e-05, "loss": 0.0, "step": 19350 }, { "epoch": 1.516208549349506, "grad_norm": NaN, "learning_rate": 4.24186101111285e-05, "loss": 0.0, "step": 19375 }, { "epoch": 1.518164922234178, "grad_norm": NaN, "learning_rate": 4.240882767256222e-05, "loss": 0.0, "step": 19400 }, { "epoch": 1.518164922234178, "eval_loss": NaN, "eval_runtime": 294.7476, "eval_samples_per_second": 408.197, "eval_steps_per_second": 6.378, "step": 19400 }, { "epoch": 1.5201212951188496, "grad_norm": NaN, "learning_rate": 4.239904523399593e-05, "loss": 0.0, "step": 19425 }, { "epoch": 1.5220776680035215, "grad_norm": NaN, "learning_rate": 4.238926279542965e-05, "loss": 0.0, "step": 19450 }, { "epoch": 1.5240340408881932, "grad_norm": NaN, "learning_rate": 4.237948035686336e-05, "loss": 0.0, "step": 19475 }, { "epoch": 1.525990413772865, "grad_norm": NaN, "learning_rate": 4.2369697918297075e-05, "loss": 0.0, "step": 19500 }, { "epoch": 1.525990413772865, "eval_loss": NaN, "eval_runtime": 295.5406, "eval_samples_per_second": 407.101, "eval_steps_per_second": 6.361, "step": 19500 }, { "epoch": 1.527946786657537, "grad_norm": NaN, "learning_rate": 4.2359915479730786e-05, "loss": 0.0, "step": 19525 }, { "epoch": 1.5299031595422088, "grad_norm": NaN, "learning_rate": 4.2350133041164503e-05, "loss": 0.0, "step": 19550 }, { "epoch": 1.5318595324268807, "grad_norm": NaN, "learning_rate": 4.2340350602598214e-05, "loss": 0.0, "step": 19575 }, { "epoch": 1.5338159053115525, "grad_norm": NaN, "learning_rate": 4.233056816403193e-05, "loss": 0.0, "step": 19600 }, { "epoch": 1.5338159053115525, "eval_loss": NaN, "eval_runtime": 295.3261, "eval_samples_per_second": 407.397, "eval_steps_per_second": 6.366, "step": 19600 }, { "epoch": 1.5357722781962242, "grad_norm": NaN, "learning_rate": 4.232078572546564e-05, "loss": 0.0, "step": 19625 }, { "epoch": 1.537728651080896, "grad_norm": NaN, "learning_rate": 4.231100328689936e-05, "loss": 0.0, "step": 19650 }, { "epoch": 1.5396850239655677, "grad_norm": NaN, "learning_rate": 4.230122084833307e-05, "loss": 0.0, "step": 19675 }, { "epoch": 1.5416413968502396, "grad_norm": NaN, "learning_rate": 4.229143840976679e-05, "loss": 0.0, "step": 19700 }, { "epoch": 1.5416413968502396, "eval_loss": NaN, "eval_runtime": 296.5725, "eval_samples_per_second": 405.685, "eval_steps_per_second": 6.339, "step": 19700 }, { "epoch": 1.5435977697349115, "grad_norm": NaN, "learning_rate": 4.22816559712005e-05, "loss": 0.0, "step": 19725 }, { "epoch": 1.5455541426195833, "grad_norm": NaN, "learning_rate": 4.2271873532634216e-05, "loss": 0.0, "step": 19750 }, { "epoch": 1.5475105155042552, "grad_norm": NaN, "learning_rate": 4.226209109406793e-05, "loss": 0.0, "step": 19775 }, { "epoch": 1.5494668883889269, "grad_norm": NaN, "learning_rate": 4.2252308655501644e-05, "loss": 0.0, "step": 19800 }, { "epoch": 1.5494668883889269, "eval_loss": NaN, "eval_runtime": 297.1314, "eval_samples_per_second": 404.922, "eval_steps_per_second": 6.327, "step": 19800 }, { "epoch": 1.5514232612735988, "grad_norm": NaN, "learning_rate": 4.2242526216935355e-05, "loss": 0.0, "step": 19825 }, { "epoch": 1.5533796341582704, "grad_norm": NaN, "learning_rate": 4.223274377836907e-05, "loss": 0.0, "step": 19850 }, { "epoch": 1.5553360070429423, "grad_norm": NaN, "learning_rate": 4.222296133980278e-05, "loss": 0.0, "step": 19875 }, { "epoch": 1.5572923799276142, "grad_norm": NaN, "learning_rate": 4.22131789012365e-05, "loss": 0.0, "step": 19900 }, { "epoch": 1.5572923799276142, "eval_loss": NaN, "eval_runtime": 295.8073, "eval_samples_per_second": 406.734, "eval_steps_per_second": 6.355, "step": 19900 }, { "epoch": 1.559248752812286, "grad_norm": NaN, "learning_rate": 4.220339646267021e-05, "loss": 0.0, "step": 19925 }, { "epoch": 1.561205125696958, "grad_norm": NaN, "learning_rate": 4.219361402410393e-05, "loss": 0.0, "step": 19950 }, { "epoch": 1.5631614985816298, "grad_norm": NaN, "learning_rate": 4.2183831585537646e-05, "loss": 0.0, "step": 19975 }, { "epoch": 1.5651178714663014, "grad_norm": NaN, "learning_rate": 4.2174049146971364e-05, "loss": 0.0, "step": 20000 }, { "epoch": 1.5651178714663014, "eval_loss": NaN, "eval_runtime": 297.3544, "eval_samples_per_second": 404.618, "eval_steps_per_second": 6.322, "step": 20000 }, { "epoch": 1.5670742443509733, "grad_norm": NaN, "learning_rate": 4.2164266708405074e-05, "loss": 0.0, "step": 20025 }, { "epoch": 1.569030617235645, "grad_norm": NaN, "learning_rate": 4.215448426983879e-05, "loss": 0.0, "step": 20050 }, { "epoch": 1.5709869901203168, "grad_norm": NaN, "learning_rate": 4.21447018312725e-05, "loss": 0.0, "step": 20075 }, { "epoch": 1.5729433630049887, "grad_norm": NaN, "learning_rate": 4.213491939270622e-05, "loss": 0.0, "step": 20100 }, { "epoch": 1.5729433630049887, "eval_loss": NaN, "eval_runtime": 299.4288, "eval_samples_per_second": 401.815, "eval_steps_per_second": 6.279, "step": 20100 }, { "epoch": 1.5748997358896606, "grad_norm": NaN, "learning_rate": 4.212513695413993e-05, "loss": 0.0, "step": 20125 }, { "epoch": 1.5768561087743325, "grad_norm": NaN, "learning_rate": 4.211535451557365e-05, "loss": 0.0, "step": 20150 }, { "epoch": 1.5788124816590043, "grad_norm": NaN, "learning_rate": 4.210557207700736e-05, "loss": 0.0, "step": 20175 }, { "epoch": 1.580768854543676, "grad_norm": NaN, "learning_rate": 4.2095789638441076e-05, "loss": 0.0, "step": 20200 }, { "epoch": 1.580768854543676, "eval_loss": NaN, "eval_runtime": 302.5612, "eval_samples_per_second": 397.655, "eval_steps_per_second": 6.214, "step": 20200 }, { "epoch": 1.5827252274283479, "grad_norm": NaN, "learning_rate": 4.208600719987479e-05, "loss": 0.0, "step": 20225 }, { "epoch": 1.5846816003130195, "grad_norm": NaN, "learning_rate": 4.2076224761308505e-05, "loss": 0.0, "step": 20250 }, { "epoch": 1.5866379731976914, "grad_norm": NaN, "learning_rate": 4.2066442322742215e-05, "loss": 0.0, "step": 20275 }, { "epoch": 1.5885943460823633, "grad_norm": NaN, "learning_rate": 4.205665988417593e-05, "loss": 0.0, "step": 20300 }, { "epoch": 1.5885943460823633, "eval_loss": NaN, "eval_runtime": 303.1748, "eval_samples_per_second": 396.85, "eval_steps_per_second": 6.201, "step": 20300 }, { "epoch": 1.5905507189670351, "grad_norm": NaN, "learning_rate": 4.2046877445609643e-05, "loss": 0.0, "step": 20325 }, { "epoch": 1.592507091851707, "grad_norm": NaN, "learning_rate": 4.203709500704336e-05, "loss": 0.0, "step": 20350 }, { "epoch": 1.594463464736379, "grad_norm": NaN, "learning_rate": 4.202731256847707e-05, "loss": 0.0, "step": 20375 }, { "epoch": 1.5964198376210506, "grad_norm": NaN, "learning_rate": 4.201753012991079e-05, "loss": 0.0, "step": 20400 }, { "epoch": 1.5964198376210506, "eval_loss": NaN, "eval_runtime": 299.4371, "eval_samples_per_second": 401.804, "eval_steps_per_second": 6.278, "step": 20400 }, { "epoch": 1.5983762105057224, "grad_norm": NaN, "learning_rate": 4.20077476913445e-05, "loss": 0.0, "step": 20425 }, { "epoch": 1.600332583390394, "grad_norm": NaN, "learning_rate": 4.199796525277822e-05, "loss": 0.0, "step": 20450 }, { "epoch": 1.602288956275066, "grad_norm": NaN, "learning_rate": 4.198818281421193e-05, "loss": 0.0, "step": 20475 }, { "epoch": 1.6042453291597378, "grad_norm": NaN, "learning_rate": 4.1978400375645645e-05, "loss": 0.0, "step": 20500 }, { "epoch": 1.6042453291597378, "eval_loss": NaN, "eval_runtime": 298.7282, "eval_samples_per_second": 402.757, "eval_steps_per_second": 6.293, "step": 20500 }, { "epoch": 1.6062017020444097, "grad_norm": NaN, "learning_rate": 4.1968617937079356e-05, "loss": 0.0, "step": 20525 }, { "epoch": 1.6081580749290816, "grad_norm": NaN, "learning_rate": 4.1958835498513074e-05, "loss": 0.0, "step": 20550 }, { "epoch": 1.6101144478137535, "grad_norm": NaN, "learning_rate": 4.1949053059946784e-05, "loss": 0.0, "step": 20575 }, { "epoch": 1.612070820698425, "grad_norm": NaN, "learning_rate": 4.19392706213805e-05, "loss": 0.0, "step": 20600 }, { "epoch": 1.612070820698425, "eval_loss": NaN, "eval_runtime": 298.8682, "eval_samples_per_second": 402.569, "eval_steps_per_second": 6.29, "step": 20600 }, { "epoch": 1.614027193583097, "grad_norm": NaN, "learning_rate": 4.192948818281421e-05, "loss": 0.0, "step": 20625 }, { "epoch": 1.6159835664677686, "grad_norm": NaN, "learning_rate": 4.191970574424793e-05, "loss": 0.0, "step": 20650 }, { "epoch": 1.6179399393524405, "grad_norm": NaN, "learning_rate": 4.190992330568164e-05, "loss": 0.0, "step": 20675 }, { "epoch": 1.6198963122371124, "grad_norm": NaN, "learning_rate": 4.190014086711536e-05, "loss": 0.0, "step": 20700 }, { "epoch": 1.6198963122371124, "eval_loss": NaN, "eval_runtime": 298.5905, "eval_samples_per_second": 402.943, "eval_steps_per_second": 6.296, "step": 20700 }, { "epoch": 1.6218526851217843, "grad_norm": NaN, "learning_rate": 4.189035842854907e-05, "loss": 0.0, "step": 20725 }, { "epoch": 1.6238090580064561, "grad_norm": NaN, "learning_rate": 4.1880575989982786e-05, "loss": 0.0, "step": 20750 }, { "epoch": 1.6257654308911278, "grad_norm": NaN, "learning_rate": 4.18707935514165e-05, "loss": 0.0, "step": 20775 }, { "epoch": 1.6277218037757997, "grad_norm": NaN, "learning_rate": 4.1861011112850214e-05, "loss": 0.0, "step": 20800 }, { "epoch": 1.6277218037757997, "eval_loss": NaN, "eval_runtime": 302.1744, "eval_samples_per_second": 398.164, "eval_steps_per_second": 6.222, "step": 20800 }, { "epoch": 1.6296781766604713, "grad_norm": NaN, "learning_rate": 4.1851228674283925e-05, "loss": 0.0, "step": 20825 }, { "epoch": 1.6316345495451432, "grad_norm": NaN, "learning_rate": 4.184144623571764e-05, "loss": 0.0, "step": 20850 }, { "epoch": 1.633590922429815, "grad_norm": NaN, "learning_rate": 4.183166379715135e-05, "loss": 0.0, "step": 20875 }, { "epoch": 1.635547295314487, "grad_norm": NaN, "learning_rate": 4.182188135858507e-05, "loss": 0.0, "step": 20900 }, { "epoch": 1.635547295314487, "eval_loss": NaN, "eval_runtime": 302.1805, "eval_samples_per_second": 398.156, "eval_steps_per_second": 6.221, "step": 20900 }, { "epoch": 1.6375036681991588, "grad_norm": NaN, "learning_rate": 4.181209892001878e-05, "loss": 0.0, "step": 20925 }, { "epoch": 1.6394600410838307, "grad_norm": NaN, "learning_rate": 4.18023164814525e-05, "loss": 0.0, "step": 20950 }, { "epoch": 1.6414164139685024, "grad_norm": NaN, "learning_rate": 4.179253404288621e-05, "loss": 0.0, "step": 20975 }, { "epoch": 1.6433727868531742, "grad_norm": NaN, "learning_rate": 4.178275160431993e-05, "loss": 0.0, "step": 21000 }, { "epoch": 1.6433727868531742, "eval_loss": NaN, "eval_runtime": 299.0291, "eval_samples_per_second": 402.352, "eval_steps_per_second": 6.287, "step": 21000 }, { "epoch": 1.6453291597378459, "grad_norm": NaN, "learning_rate": 4.177296916575364e-05, "loss": 0.0, "step": 21025 }, { "epoch": 1.6472855326225178, "grad_norm": NaN, "learning_rate": 4.1763186727187355e-05, "loss": 0.0, "step": 21050 }, { "epoch": 1.6492419055071896, "grad_norm": NaN, "learning_rate": 4.1753404288621066e-05, "loss": 0.0, "step": 21075 }, { "epoch": 1.6511982783918615, "grad_norm": NaN, "learning_rate": 4.1743621850054783e-05, "loss": 0.0, "step": 21100 }, { "epoch": 1.6511982783918615, "eval_loss": NaN, "eval_runtime": 300.401, "eval_samples_per_second": 400.515, "eval_steps_per_second": 6.258, "step": 21100 }, { "epoch": 1.6531546512765334, "grad_norm": NaN, "learning_rate": 4.1733839411488494e-05, "loss": 0.0, "step": 21125 }, { "epoch": 1.6551110241612053, "grad_norm": NaN, "learning_rate": 4.172405697292221e-05, "loss": 0.0, "step": 21150 }, { "epoch": 1.657067397045877, "grad_norm": NaN, "learning_rate": 4.171427453435592e-05, "loss": 0.0, "step": 21175 }, { "epoch": 1.6590237699305488, "grad_norm": NaN, "learning_rate": 4.170449209578964e-05, "loss": 0.0, "step": 21200 }, { "epoch": 1.6590237699305488, "eval_loss": NaN, "eval_runtime": 299.9297, "eval_samples_per_second": 401.144, "eval_steps_per_second": 6.268, "step": 21200 }, { "epoch": 1.6609801428152204, "grad_norm": NaN, "learning_rate": 4.169470965722335e-05, "loss": 0.0, "step": 21225 }, { "epoch": 1.6629365156998923, "grad_norm": NaN, "learning_rate": 4.168492721865707e-05, "loss": 0.0, "step": 21250 }, { "epoch": 1.6648928885845642, "grad_norm": NaN, "learning_rate": 4.1675144780090785e-05, "loss": 0.0, "step": 21275 }, { "epoch": 1.666849261469236, "grad_norm": NaN, "learning_rate": 4.16653623415245e-05, "loss": 0.0, "step": 21300 }, { "epoch": 1.666849261469236, "eval_loss": NaN, "eval_runtime": 300.874, "eval_samples_per_second": 399.885, "eval_steps_per_second": 6.248, "step": 21300 }, { "epoch": 1.668805634353908, "grad_norm": NaN, "learning_rate": 4.1655579902958214e-05, "loss": 0.0, "step": 21325 }, { "epoch": 1.6707620072385798, "grad_norm": NaN, "learning_rate": 4.164579746439193e-05, "loss": 0.0, "step": 21350 }, { "epoch": 1.6727183801232515, "grad_norm": NaN, "learning_rate": 4.163601502582564e-05, "loss": 0.0, "step": 21375 }, { "epoch": 1.6746747530079233, "grad_norm": NaN, "learning_rate": 4.162623258725936e-05, "loss": 0.0, "step": 21400 }, { "epoch": 1.6746747530079233, "eval_loss": NaN, "eval_runtime": 301.8564, "eval_samples_per_second": 398.584, "eval_steps_per_second": 6.228, "step": 21400 }, { "epoch": 1.676631125892595, "grad_norm": NaN, "learning_rate": 4.161645014869307e-05, "loss": 0.0, "step": 21425 }, { "epoch": 1.6785874987772669, "grad_norm": NaN, "learning_rate": 4.160666771012679e-05, "loss": 0.0, "step": 21450 }, { "epoch": 1.6805438716619387, "grad_norm": NaN, "learning_rate": 4.15968852715605e-05, "loss": 0.0, "step": 21475 }, { "epoch": 1.6825002445466106, "grad_norm": NaN, "learning_rate": 4.1587102832994216e-05, "loss": 0.0, "step": 21500 }, { "epoch": 1.6825002445466106, "eval_loss": NaN, "eval_runtime": 299.1519, "eval_samples_per_second": 402.187, "eval_steps_per_second": 6.284, "step": 21500 }, { "epoch": 1.6844566174312825, "grad_norm": NaN, "learning_rate": 4.1577320394427926e-05, "loss": 0.0, "step": 21525 }, { "epoch": 1.6864129903159544, "grad_norm": NaN, "learning_rate": 4.1567537955861644e-05, "loss": 0.0, "step": 21550 }, { "epoch": 1.688369363200626, "grad_norm": NaN, "learning_rate": 4.1557755517295354e-05, "loss": 0.0, "step": 21575 }, { "epoch": 1.690325736085298, "grad_norm": NaN, "learning_rate": 4.154797307872907e-05, "loss": 0.0, "step": 21600 }, { "epoch": 1.690325736085298, "eval_loss": NaN, "eval_runtime": 298.5661, "eval_samples_per_second": 402.976, "eval_steps_per_second": 6.297, "step": 21600 }, { "epoch": 1.6922821089699696, "grad_norm": NaN, "learning_rate": 4.153819064016278e-05, "loss": 0.0, "step": 21625 }, { "epoch": 1.6942384818546414, "grad_norm": NaN, "learning_rate": 4.15284082015965e-05, "loss": 0.0, "step": 21650 }, { "epoch": 1.6961948547393133, "grad_norm": NaN, "learning_rate": 4.151862576303021e-05, "loss": 0.0, "step": 21675 }, { "epoch": 1.6981512276239852, "grad_norm": NaN, "learning_rate": 4.150884332446393e-05, "loss": 0.0, "step": 21700 }, { "epoch": 1.6981512276239852, "eval_loss": NaN, "eval_runtime": 303.0589, "eval_samples_per_second": 397.002, "eval_steps_per_second": 6.203, "step": 21700 }, { "epoch": 1.700107600508657, "grad_norm": NaN, "learning_rate": 4.149906088589764e-05, "loss": 0.0, "step": 21725 }, { "epoch": 1.702063973393329, "grad_norm": NaN, "learning_rate": 4.1489278447331356e-05, "loss": 0.0, "step": 21750 }, { "epoch": 1.7040203462780006, "grad_norm": NaN, "learning_rate": 4.147949600876507e-05, "loss": 0.0, "step": 21775 }, { "epoch": 1.7059767191626722, "grad_norm": NaN, "learning_rate": 4.1469713570198785e-05, "loss": 0.0, "step": 21800 }, { "epoch": 1.7059767191626722, "eval_loss": NaN, "eval_runtime": 303.9004, "eval_samples_per_second": 395.903, "eval_steps_per_second": 6.186, "step": 21800 }, { "epoch": 1.7079330920473441, "grad_norm": NaN, "learning_rate": 4.1459931131632495e-05, "loss": 0.0, "step": 21825 }, { "epoch": 1.709889464932016, "grad_norm": NaN, "learning_rate": 4.145014869306621e-05, "loss": 0.0, "step": 21850 }, { "epoch": 1.7118458378166879, "grad_norm": NaN, "learning_rate": 4.144036625449992e-05, "loss": 0.0, "step": 21875 }, { "epoch": 1.7138022107013597, "grad_norm": NaN, "learning_rate": 4.143058381593364e-05, "loss": 0.0, "step": 21900 }, { "epoch": 1.7138022107013597, "eval_loss": NaN, "eval_runtime": 306.5043, "eval_samples_per_second": 392.539, "eval_steps_per_second": 6.134, "step": 21900 }, { "epoch": 1.7157585835860316, "grad_norm": NaN, "learning_rate": 4.142080137736735e-05, "loss": 0.0, "step": 21925 }, { "epoch": 1.7177149564707033, "grad_norm": NaN, "learning_rate": 4.141101893880107e-05, "loss": 0.0, "step": 21950 }, { "epoch": 1.7196713293553751, "grad_norm": NaN, "learning_rate": 4.140123650023478e-05, "loss": 0.0, "step": 21975 }, { "epoch": 1.7216277022400468, "grad_norm": NaN, "learning_rate": 4.13914540616685e-05, "loss": 0.0, "step": 22000 }, { "epoch": 1.7216277022400468, "eval_loss": NaN, "eval_runtime": 309.4381, "eval_samples_per_second": 388.818, "eval_steps_per_second": 6.076, "step": 22000 }, { "epoch": 1.7235840751247187, "grad_norm": NaN, "learning_rate": 4.138167162310221e-05, "loss": 0.0, "step": 22025 }, { "epoch": 1.7255404480093905, "grad_norm": NaN, "learning_rate": 4.1371889184535925e-05, "loss": 0.0, "step": 22050 }, { "epoch": 1.7274968208940624, "grad_norm": NaN, "learning_rate": 4.1362106745969636e-05, "loss": 0.0, "step": 22075 }, { "epoch": 1.7294531937787343, "grad_norm": NaN, "learning_rate": 4.1352324307403354e-05, "loss": 0.0, "step": 22100 }, { "epoch": 1.7294531937787343, "eval_loss": NaN, "eval_runtime": 338.6981, "eval_samples_per_second": 355.228, "eval_steps_per_second": 5.551, "step": 22100 }, { "epoch": 1.7314095666634062, "grad_norm": NaN, "learning_rate": 4.1342541868837064e-05, "loss": 0.0, "step": 22125 }, { "epoch": 1.7333659395480778, "grad_norm": NaN, "learning_rate": 4.133275943027078e-05, "loss": 0.0, "step": 22150 }, { "epoch": 1.7353223124327497, "grad_norm": NaN, "learning_rate": 4.132297699170449e-05, "loss": 0.0, "step": 22175 }, { "epoch": 1.7372786853174214, "grad_norm": NaN, "learning_rate": 4.131319455313821e-05, "loss": 0.0, "step": 22200 }, { "epoch": 1.7372786853174214, "eval_loss": NaN, "eval_runtime": 339.9636, "eval_samples_per_second": 353.906, "eval_steps_per_second": 5.53, "step": 22200 }, { "epoch": 1.7392350582020932, "grad_norm": NaN, "learning_rate": 4.130341211457192e-05, "loss": 0.0, "step": 22225 }, { "epoch": 1.741191431086765, "grad_norm": NaN, "learning_rate": 4.129362967600564e-05, "loss": 0.0, "step": 22250 }, { "epoch": 1.743147803971437, "grad_norm": NaN, "learning_rate": 4.128384723743935e-05, "loss": 0.0, "step": 22275 }, { "epoch": 1.7451041768561089, "grad_norm": NaN, "learning_rate": 4.1274064798873066e-05, "loss": 0.0, "step": 22300 }, { "epoch": 1.7451041768561089, "eval_loss": NaN, "eval_runtime": 299.971, "eval_samples_per_second": 401.089, "eval_steps_per_second": 6.267, "step": 22300 }, { "epoch": 1.7470605497407807, "grad_norm": NaN, "learning_rate": 4.126428236030678e-05, "loss": 0.0, "step": 22325 }, { "epoch": 1.7490169226254524, "grad_norm": NaN, "learning_rate": 4.1254499921740494e-05, "loss": 0.0, "step": 22350 }, { "epoch": 1.7509732955101243, "grad_norm": NaN, "learning_rate": 4.1244717483174205e-05, "loss": 0.0, "step": 22375 }, { "epoch": 1.752929668394796, "grad_norm": NaN, "learning_rate": 4.123493504460792e-05, "loss": 0.0, "step": 22400 }, { "epoch": 1.752929668394796, "eval_loss": NaN, "eval_runtime": 298.3426, "eval_samples_per_second": 403.278, "eval_steps_per_second": 6.301, "step": 22400 }, { "epoch": 1.7548860412794678, "grad_norm": NaN, "learning_rate": 4.122515260604163e-05, "loss": 0.0, "step": 22425 }, { "epoch": 1.7568424141641397, "grad_norm": NaN, "learning_rate": 4.121537016747535e-05, "loss": 0.0, "step": 22450 }, { "epoch": 1.7587987870488115, "grad_norm": NaN, "learning_rate": 4.120558772890906e-05, "loss": 0.0, "step": 22475 }, { "epoch": 1.7607551599334834, "grad_norm": NaN, "learning_rate": 4.119580529034278e-05, "loss": 0.0, "step": 22500 }, { "epoch": 1.7607551599334834, "eval_loss": NaN, "eval_runtime": 312.7367, "eval_samples_per_second": 384.717, "eval_steps_per_second": 6.011, "step": 22500 }, { "epoch": 1.7627115328181553, "grad_norm": NaN, "learning_rate": 4.118602285177649e-05, "loss": 0.0, "step": 22525 }, { "epoch": 1.764667905702827, "grad_norm": NaN, "learning_rate": 4.117624041321021e-05, "loss": 0.0, "step": 22550 }, { "epoch": 1.7666242785874988, "grad_norm": NaN, "learning_rate": 4.1166457974643924e-05, "loss": 0.0, "step": 22575 }, { "epoch": 1.7685806514721705, "grad_norm": NaN, "learning_rate": 4.1156675536077635e-05, "loss": 0.0, "step": 22600 }, { "epoch": 1.7685806514721705, "eval_loss": NaN, "eval_runtime": 295.3403, "eval_samples_per_second": 407.378, "eval_steps_per_second": 6.366, "step": 22600 }, { "epoch": 1.7705370243568423, "grad_norm": NaN, "learning_rate": 4.114689309751135e-05, "loss": 0.0, "step": 22625 }, { "epoch": 1.7724933972415142, "grad_norm": NaN, "learning_rate": 4.113711065894506e-05, "loss": 0.0, "step": 22650 }, { "epoch": 1.774449770126186, "grad_norm": NaN, "learning_rate": 4.112732822037878e-05, "loss": 0.0, "step": 22675 }, { "epoch": 1.776406143010858, "grad_norm": NaN, "learning_rate": 4.111754578181249e-05, "loss": 0.0, "step": 22700 }, { "epoch": 1.776406143010858, "eval_loss": NaN, "eval_runtime": 296.0255, "eval_samples_per_second": 406.435, "eval_steps_per_second": 6.351, "step": 22700 }, { "epoch": 1.7783625158955298, "grad_norm": NaN, "learning_rate": 4.110776334324621e-05, "loss": 0.0, "step": 22725 }, { "epoch": 1.7803188887802015, "grad_norm": NaN, "learning_rate": 4.109798090467992e-05, "loss": 0.0, "step": 22750 }, { "epoch": 1.7822752616648734, "grad_norm": NaN, "learning_rate": 4.108819846611364e-05, "loss": 0.0, "step": 22775 }, { "epoch": 1.784231634549545, "grad_norm": NaN, "learning_rate": 4.107841602754735e-05, "loss": 0.0, "step": 22800 }, { "epoch": 1.784231634549545, "eval_loss": NaN, "eval_runtime": 295.6657, "eval_samples_per_second": 406.929, "eval_steps_per_second": 6.359, "step": 22800 }, { "epoch": 1.786188007434217, "grad_norm": NaN, "learning_rate": 4.1068633588981065e-05, "loss": 0.0, "step": 22825 }, { "epoch": 1.7881443803188888, "grad_norm": NaN, "learning_rate": 4.1058851150414776e-05, "loss": 0.0, "step": 22850 }, { "epoch": 1.7901007532035607, "grad_norm": NaN, "learning_rate": 4.1049068711848493e-05, "loss": 0.0, "step": 22875 }, { "epoch": 1.7920571260882325, "grad_norm": NaN, "learning_rate": 4.1039286273282204e-05, "loss": 0.0, "step": 22900 }, { "epoch": 1.7920571260882325, "eval_loss": NaN, "eval_runtime": 295.618, "eval_samples_per_second": 406.995, "eval_steps_per_second": 6.36, "step": 22900 }, { "epoch": 1.7940134989729042, "grad_norm": NaN, "learning_rate": 4.102950383471592e-05, "loss": 0.0, "step": 22925 }, { "epoch": 1.795969871857576, "grad_norm": NaN, "learning_rate": 4.101972139614963e-05, "loss": 0.0, "step": 22950 }, { "epoch": 1.7979262447422477, "grad_norm": NaN, "learning_rate": 4.100993895758335e-05, "loss": 0.0, "step": 22975 }, { "epoch": 1.7998826176269196, "grad_norm": NaN, "learning_rate": 4.100015651901706e-05, "loss": 0.0, "step": 23000 }, { "epoch": 1.7998826176269196, "eval_loss": NaN, "eval_runtime": 295.9738, "eval_samples_per_second": 406.506, "eval_steps_per_second": 6.352, "step": 23000 }, { "epoch": 1.8018389905115915, "grad_norm": NaN, "learning_rate": 4.099037408045078e-05, "loss": 0.0, "step": 23025 }, { "epoch": 1.8037953633962633, "grad_norm": NaN, "learning_rate": 4.098059164188449e-05, "loss": 0.0, "step": 23050 }, { "epoch": 1.8057517362809352, "grad_norm": NaN, "learning_rate": 4.0970809203318206e-05, "loss": 0.0, "step": 23075 }, { "epoch": 1.807708109165607, "grad_norm": NaN, "learning_rate": 4.096102676475192e-05, "loss": 0.0, "step": 23100 }, { "epoch": 1.807708109165607, "eval_loss": NaN, "eval_runtime": 295.8381, "eval_samples_per_second": 406.692, "eval_steps_per_second": 6.355, "step": 23100 }, { "epoch": 1.8096644820502787, "grad_norm": NaN, "learning_rate": 4.0951244326185634e-05, "loss": 0.0, "step": 23125 }, { "epoch": 1.8116208549349506, "grad_norm": NaN, "learning_rate": 4.0941461887619345e-05, "loss": 0.0, "step": 23150 }, { "epoch": 1.8135772278196223, "grad_norm": NaN, "learning_rate": 4.093167944905306e-05, "loss": 0.0, "step": 23175 }, { "epoch": 1.8155336007042941, "grad_norm": NaN, "learning_rate": 4.092189701048677e-05, "loss": 0.0, "step": 23200 }, { "epoch": 1.8155336007042941, "eval_loss": NaN, "eval_runtime": 296.114, "eval_samples_per_second": 406.313, "eval_steps_per_second": 6.349, "step": 23200 }, { "epoch": 1.817489973588966, "grad_norm": NaN, "learning_rate": 4.091211457192049e-05, "loss": 0.0, "step": 23225 }, { "epoch": 1.819446346473638, "grad_norm": NaN, "learning_rate": 4.09023321333542e-05, "loss": 0.0, "step": 23250 }, { "epoch": 1.8214027193583098, "grad_norm": NaN, "learning_rate": 4.089254969478792e-05, "loss": 0.0, "step": 23275 }, { "epoch": 1.8233590922429816, "grad_norm": NaN, "learning_rate": 4.088276725622163e-05, "loss": 0.0, "step": 23300 }, { "epoch": 1.8233590922429816, "eval_loss": NaN, "eval_runtime": 296.9146, "eval_samples_per_second": 405.217, "eval_steps_per_second": 6.332, "step": 23300 }, { "epoch": 1.8253154651276533, "grad_norm": NaN, "learning_rate": 4.087298481765535e-05, "loss": 0.0, "step": 23325 }, { "epoch": 1.8272718380123252, "grad_norm": NaN, "learning_rate": 4.086320237908906e-05, "loss": 0.0, "step": 23350 }, { "epoch": 1.8292282108969968, "grad_norm": NaN, "learning_rate": 4.0853419940522775e-05, "loss": 0.0, "step": 23375 }, { "epoch": 1.8311845837816687, "grad_norm": NaN, "learning_rate": 4.0843637501956486e-05, "loss": 0.0, "step": 23400 }, { "epoch": 1.8311845837816687, "eval_loss": NaN, "eval_runtime": 295.339, "eval_samples_per_second": 407.379, "eval_steps_per_second": 6.366, "step": 23400 }, { "epoch": 1.8331409566663406, "grad_norm": NaN, "learning_rate": 4.08338550633902e-05, "loss": 0.0, "step": 23425 }, { "epoch": 1.8350973295510125, "grad_norm": NaN, "learning_rate": 4.0824072624823914e-05, "loss": 0.0, "step": 23450 }, { "epoch": 1.8370537024356843, "grad_norm": NaN, "learning_rate": 4.081429018625763e-05, "loss": 0.0, "step": 23475 }, { "epoch": 1.8390100753203562, "grad_norm": NaN, "learning_rate": 4.080450774769134e-05, "loss": 0.0, "step": 23500 }, { "epoch": 1.8390100753203562, "eval_loss": NaN, "eval_runtime": 295.8804, "eval_samples_per_second": 406.634, "eval_steps_per_second": 6.354, "step": 23500 }, { "epoch": 1.8409664482050279, "grad_norm": NaN, "learning_rate": 4.079472530912506e-05, "loss": 0.0, "step": 23525 }, { "epoch": 1.8429228210896997, "grad_norm": NaN, "learning_rate": 4.078494287055877e-05, "loss": 0.0, "step": 23550 }, { "epoch": 1.8448791939743714, "grad_norm": NaN, "learning_rate": 4.077516043199249e-05, "loss": 0.0, "step": 23575 }, { "epoch": 1.8468355668590433, "grad_norm": NaN, "learning_rate": 4.07653779934262e-05, "loss": 0.0, "step": 23600 }, { "epoch": 1.8468355668590433, "eval_loss": NaN, "eval_runtime": 297.5251, "eval_samples_per_second": 404.386, "eval_steps_per_second": 6.319, "step": 23600 }, { "epoch": 1.8487919397437151, "grad_norm": NaN, "learning_rate": 4.0755595554859916e-05, "loss": 0.0, "step": 23625 }, { "epoch": 1.850748312628387, "grad_norm": NaN, "learning_rate": 4.074581311629363e-05, "loss": 0.0, "step": 23650 }, { "epoch": 1.852704685513059, "grad_norm": NaN, "learning_rate": 4.0736030677727344e-05, "loss": 0.0, "step": 23675 }, { "epoch": 1.8546610583977308, "grad_norm": NaN, "learning_rate": 4.0726248239161055e-05, "loss": 0.0, "step": 23700 }, { "epoch": 1.8546610583977308, "eval_loss": NaN, "eval_runtime": 296.379, "eval_samples_per_second": 405.95, "eval_steps_per_second": 6.343, "step": 23700 }, { "epoch": 1.8566174312824024, "grad_norm": NaN, "learning_rate": 4.071646580059477e-05, "loss": 0.0, "step": 23725 }, { "epoch": 1.8585738041670743, "grad_norm": NaN, "learning_rate": 4.070668336202848e-05, "loss": 0.0, "step": 23750 }, { "epoch": 1.860530177051746, "grad_norm": NaN, "learning_rate": 4.06969009234622e-05, "loss": 0.0, "step": 23775 }, { "epoch": 1.8624865499364178, "grad_norm": NaN, "learning_rate": 4.068711848489591e-05, "loss": 0.0, "step": 23800 }, { "epoch": 1.8624865499364178, "eval_loss": NaN, "eval_runtime": 297.3646, "eval_samples_per_second": 404.604, "eval_steps_per_second": 6.322, "step": 23800 }, { "epoch": 1.8644429228210897, "grad_norm": NaN, "learning_rate": 4.067733604632963e-05, "loss": 0.0, "step": 23825 }, { "epoch": 1.8663992957057616, "grad_norm": NaN, "learning_rate": 4.0667553607763346e-05, "loss": 0.0, "step": 23850 }, { "epoch": 1.8683556685904334, "grad_norm": NaN, "learning_rate": 4.0657771169197064e-05, "loss": 0.0, "step": 23875 }, { "epoch": 1.870312041475105, "grad_norm": NaN, "learning_rate": 4.0647988730630774e-05, "loss": 0.0, "step": 23900 }, { "epoch": 1.870312041475105, "eval_loss": NaN, "eval_runtime": 302.0188, "eval_samples_per_second": 398.369, "eval_steps_per_second": 6.225, "step": 23900 }, { "epoch": 1.872268414359777, "grad_norm": NaN, "learning_rate": 4.063820629206449e-05, "loss": 0.0, "step": 23925 }, { "epoch": 1.8742247872444486, "grad_norm": NaN, "learning_rate": 4.06284238534982e-05, "loss": 0.0, "step": 23950 }, { "epoch": 1.8761811601291205, "grad_norm": NaN, "learning_rate": 4.061864141493192e-05, "loss": 0.0, "step": 23975 }, { "epoch": 1.8781375330137924, "grad_norm": NaN, "learning_rate": 4.060885897636563e-05, "loss": 0.0, "step": 24000 }, { "epoch": 1.8781375330137924, "eval_loss": NaN, "eval_runtime": 297.575, "eval_samples_per_second": 404.318, "eval_steps_per_second": 6.318, "step": 24000 }, { "epoch": 1.8800939058984643, "grad_norm": NaN, "learning_rate": 4.059907653779935e-05, "loss": 0.0, "step": 24025 }, { "epoch": 1.8820502787831361, "grad_norm": NaN, "learning_rate": 4.058929409923306e-05, "loss": 0.0, "step": 24050 }, { "epoch": 1.884006651667808, "grad_norm": NaN, "learning_rate": 4.0579511660666776e-05, "loss": 0.0, "step": 24075 }, { "epoch": 1.8859630245524797, "grad_norm": NaN, "learning_rate": 4.056972922210049e-05, "loss": 0.0, "step": 24100 }, { "epoch": 1.8859630245524797, "eval_loss": NaN, "eval_runtime": 296.238, "eval_samples_per_second": 406.143, "eval_steps_per_second": 6.346, "step": 24100 }, { "epoch": 1.8879193974371515, "grad_norm": NaN, "learning_rate": 4.0559946783534204e-05, "loss": 0.0, "step": 24125 }, { "epoch": 1.8898757703218232, "grad_norm": NaN, "learning_rate": 4.0550164344967915e-05, "loss": 0.0, "step": 24150 }, { "epoch": 1.891832143206495, "grad_norm": NaN, "learning_rate": 4.054038190640163e-05, "loss": 0.0, "step": 24175 }, { "epoch": 1.893788516091167, "grad_norm": NaN, "learning_rate": 4.053059946783534e-05, "loss": 0.0, "step": 24200 }, { "epoch": 1.893788516091167, "eval_loss": NaN, "eval_runtime": 296.0845, "eval_samples_per_second": 406.354, "eval_steps_per_second": 6.35, "step": 24200 }, { "epoch": 1.8957448889758388, "grad_norm": NaN, "learning_rate": 4.052081702926906e-05, "loss": 0.0, "step": 24225 }, { "epoch": 1.8977012618605107, "grad_norm": NaN, "learning_rate": 4.051103459070277e-05, "loss": 0.0, "step": 24250 }, { "epoch": 1.8996576347451826, "grad_norm": NaN, "learning_rate": 4.050125215213649e-05, "loss": 0.0, "step": 24275 }, { "epoch": 1.9016140076298542, "grad_norm": NaN, "learning_rate": 4.04914697135702e-05, "loss": 0.0, "step": 24300 }, { "epoch": 1.9016140076298542, "eval_loss": NaN, "eval_runtime": 304.2323, "eval_samples_per_second": 395.471, "eval_steps_per_second": 6.179, "step": 24300 }, { "epoch": 1.903570380514526, "grad_norm": NaN, "learning_rate": 4.048168727500392e-05, "loss": 0.0, "step": 24325 }, { "epoch": 1.9055267533991977, "grad_norm": NaN, "learning_rate": 4.047190483643763e-05, "loss": 0.0, "step": 24350 }, { "epoch": 1.9074831262838696, "grad_norm": NaN, "learning_rate": 4.0462122397871345e-05, "loss": 0.0, "step": 24375 }, { "epoch": 1.9094394991685415, "grad_norm": NaN, "learning_rate": 4.0452339959305056e-05, "loss": 0.0, "step": 24400 }, { "epoch": 1.9094394991685415, "eval_loss": NaN, "eval_runtime": 296.3987, "eval_samples_per_second": 405.923, "eval_steps_per_second": 6.343, "step": 24400 }, { "epoch": 1.9113958720532134, "grad_norm": NaN, "learning_rate": 4.0442557520738773e-05, "loss": 0.0, "step": 24425 }, { "epoch": 1.9133522449378852, "grad_norm": NaN, "learning_rate": 4.0432775082172484e-05, "loss": 0.0, "step": 24450 }, { "epoch": 1.9153086178225571, "grad_norm": NaN, "learning_rate": 4.04229926436062e-05, "loss": 0.0, "step": 24475 }, { "epoch": 1.9172649907072288, "grad_norm": NaN, "learning_rate": 4.041321020503991e-05, "loss": 0.0, "step": 24500 }, { "epoch": 1.9172649907072288, "eval_loss": NaN, "eval_runtime": 294.9936, "eval_samples_per_second": 407.856, "eval_steps_per_second": 6.373, "step": 24500 }, { "epoch": 1.9192213635919007, "grad_norm": NaN, "learning_rate": 4.040342776647363e-05, "loss": 0.0, "step": 24525 }, { "epoch": 1.9211777364765723, "grad_norm": NaN, "learning_rate": 4.039364532790734e-05, "loss": 0.0, "step": 24550 }, { "epoch": 1.9231341093612442, "grad_norm": NaN, "learning_rate": 4.038386288934106e-05, "loss": 0.0, "step": 24575 }, { "epoch": 1.925090482245916, "grad_norm": NaN, "learning_rate": 4.037408045077477e-05, "loss": 0.0, "step": 24600 }, { "epoch": 1.925090482245916, "eval_loss": NaN, "eval_runtime": 296.4562, "eval_samples_per_second": 405.844, "eval_steps_per_second": 6.342, "step": 24600 }, { "epoch": 1.927046855130588, "grad_norm": NaN, "learning_rate": 4.0364298012208486e-05, "loss": 0.0, "step": 24625 }, { "epoch": 1.9290032280152598, "grad_norm": NaN, "learning_rate": 4.03545155736422e-05, "loss": 0.0, "step": 24650 }, { "epoch": 1.9309596008999317, "grad_norm": NaN, "learning_rate": 4.0344733135075914e-05, "loss": 0.0, "step": 24675 }, { "epoch": 1.9329159737846033, "grad_norm": NaN, "learning_rate": 4.0334950696509625e-05, "loss": 0.0, "step": 24700 }, { "epoch": 1.9329159737846033, "eval_loss": NaN, "eval_runtime": 295.4939, "eval_samples_per_second": 407.166, "eval_steps_per_second": 6.362, "step": 24700 }, { "epoch": 1.9348723466692752, "grad_norm": NaN, "learning_rate": 4.032516825794334e-05, "loss": 0.0, "step": 24725 }, { "epoch": 1.9368287195539469, "grad_norm": NaN, "learning_rate": 4.031538581937705e-05, "loss": 0.0, "step": 24750 }, { "epoch": 1.9387850924386187, "grad_norm": NaN, "learning_rate": 4.030560338081077e-05, "loss": 0.0, "step": 24775 }, { "epoch": 1.9407414653232906, "grad_norm": NaN, "learning_rate": 4.029582094224448e-05, "loss": 0.0, "step": 24800 }, { "epoch": 1.9407414653232906, "eval_loss": NaN, "eval_runtime": 295.9281, "eval_samples_per_second": 406.568, "eval_steps_per_second": 6.353, "step": 24800 }, { "epoch": 1.9426978382079625, "grad_norm": NaN, "learning_rate": 4.02860385036782e-05, "loss": 0.0, "step": 24825 }, { "epoch": 1.9446542110926344, "grad_norm": NaN, "learning_rate": 4.027625606511191e-05, "loss": 0.0, "step": 24850 }, { "epoch": 1.946610583977306, "grad_norm": NaN, "learning_rate": 4.026647362654563e-05, "loss": 0.0, "step": 24875 }, { "epoch": 1.948566956861978, "grad_norm": NaN, "learning_rate": 4.025669118797934e-05, "loss": 0.0, "step": 24900 }, { "epoch": 1.948566956861978, "eval_loss": NaN, "eval_runtime": 296.1341, "eval_samples_per_second": 406.286, "eval_steps_per_second": 6.348, "step": 24900 }, { "epoch": 1.9505233297466495, "grad_norm": NaN, "learning_rate": 4.0246908749413055e-05, "loss": 0.0, "step": 24925 }, { "epoch": 1.9524797026313214, "grad_norm": NaN, "learning_rate": 4.0237126310846766e-05, "loss": 0.0, "step": 24950 }, { "epoch": 1.9544360755159933, "grad_norm": NaN, "learning_rate": 4.022734387228048e-05, "loss": 0.0, "step": 24975 }, { "epoch": 1.9563924484006652, "grad_norm": NaN, "learning_rate": 4.0217561433714194e-05, "loss": 0.0, "step": 25000 }, { "epoch": 1.9563924484006652, "eval_loss": NaN, "eval_runtime": 295.92, "eval_samples_per_second": 406.579, "eval_steps_per_second": 6.353, "step": 25000 }, { "epoch": 1.958348821285337, "grad_norm": NaN, "learning_rate": 4.020777899514791e-05, "loss": 0.0, "step": 25025 }, { "epoch": 1.960305194170009, "grad_norm": NaN, "learning_rate": 4.019799655658162e-05, "loss": 0.0, "step": 25050 }, { "epoch": 1.9622615670546806, "grad_norm": NaN, "learning_rate": 4.018821411801534e-05, "loss": 0.0, "step": 25075 }, { "epoch": 1.9642179399393525, "grad_norm": NaN, "learning_rate": 4.017843167944905e-05, "loss": 0.0, "step": 25100 }, { "epoch": 1.9642179399393525, "eval_loss": NaN, "eval_runtime": 295.5565, "eval_samples_per_second": 407.079, "eval_steps_per_second": 6.361, "step": 25100 }, { "epoch": 1.966174312824024, "grad_norm": NaN, "learning_rate": 4.0168649240882775e-05, "loss": 0.0, "step": 25125 }, { "epoch": 1.968130685708696, "grad_norm": NaN, "learning_rate": 4.0158866802316485e-05, "loss": 0.0, "step": 25150 }, { "epoch": 1.9700870585933679, "grad_norm": NaN, "learning_rate": 4.01490843637502e-05, "loss": 0.0, "step": 25175 }, { "epoch": 1.9720434314780397, "grad_norm": NaN, "learning_rate": 4.013930192518391e-05, "loss": 0.0, "step": 25200 }, { "epoch": 1.9720434314780397, "eval_loss": NaN, "eval_runtime": 296.4126, "eval_samples_per_second": 405.904, "eval_steps_per_second": 6.343, "step": 25200 }, { "epoch": 1.9739998043627116, "grad_norm": NaN, "learning_rate": 4.012951948661763e-05, "loss": 0.0, "step": 25225 }, { "epoch": 1.9759561772473835, "grad_norm": NaN, "learning_rate": 4.011973704805134e-05, "loss": 0.0, "step": 25250 }, { "epoch": 1.9779125501320551, "grad_norm": NaN, "learning_rate": 4.010995460948506e-05, "loss": 0.0, "step": 25275 }, { "epoch": 1.979868923016727, "grad_norm": NaN, "learning_rate": 4.010017217091877e-05, "loss": 0.0, "step": 25300 }, { "epoch": 1.979868923016727, "eval_loss": NaN, "eval_runtime": 295.9277, "eval_samples_per_second": 406.569, "eval_steps_per_second": 6.353, "step": 25300 }, { "epoch": 1.9818252959013987, "grad_norm": NaN, "learning_rate": 4.009038973235249e-05, "loss": 0.0, "step": 25325 }, { "epoch": 1.9837816687860705, "grad_norm": NaN, "learning_rate": 4.00806072937862e-05, "loss": 0.0, "step": 25350 }, { "epoch": 1.9857380416707424, "grad_norm": NaN, "learning_rate": 4.0070824855219915e-05, "loss": 0.0, "step": 25375 }, { "epoch": 1.9876944145554143, "grad_norm": NaN, "learning_rate": 4.0061042416653626e-05, "loss": 0.0, "step": 25400 }, { "epoch": 1.9876944145554143, "eval_loss": NaN, "eval_runtime": 296.4241, "eval_samples_per_second": 405.888, "eval_steps_per_second": 6.342, "step": 25400 }, { "epoch": 1.9896507874400862, "grad_norm": NaN, "learning_rate": 4.0051259978087344e-05, "loss": 0.0, "step": 25425 }, { "epoch": 1.991607160324758, "grad_norm": NaN, "learning_rate": 4.0041477539521054e-05, "loss": 0.0, "step": 25450 }, { "epoch": 1.9935635332094297, "grad_norm": NaN, "learning_rate": 4.003169510095477e-05, "loss": 0.0, "step": 25475 }, { "epoch": 1.9955199060941016, "grad_norm": NaN, "learning_rate": 4.002191266238848e-05, "loss": 0.0, "step": 25500 }, { "epoch": 1.9955199060941016, "eval_loss": NaN, "eval_runtime": 295.9282, "eval_samples_per_second": 406.568, "eval_steps_per_second": 6.353, "step": 25500 }, { "epoch": 1.9974762789787732, "grad_norm": NaN, "learning_rate": 4.00121302238222e-05, "loss": 0.0, "step": 25525 }, { "epoch": 1.999432651863445, "grad_norm": NaN, "learning_rate": 4.000234778525591e-05, "loss": 0.0, "step": 25550 }, { "epoch": 2.0014085884769637, "grad_norm": NaN, "learning_rate": 3.999256534668963e-05, "loss": 0.0, "step": 25575 }, { "epoch": 2.0033649613616356, "grad_norm": NaN, "learning_rate": 3.998278290812334e-05, "loss": 0.0, "step": 25600 }, { "epoch": 2.0033649613616356, "eval_loss": NaN, "eval_runtime": 296.3164, "eval_samples_per_second": 406.036, "eval_steps_per_second": 6.345, "step": 25600 }, { "epoch": 2.0053213342463074, "grad_norm": NaN, "learning_rate": 3.9973000469557056e-05, "loss": 0.0, "step": 25625 }, { "epoch": 2.0072777071309793, "grad_norm": NaN, "learning_rate": 3.996321803099077e-05, "loss": 0.0, "step": 25650 }, { "epoch": 2.009234080015651, "grad_norm": NaN, "learning_rate": 3.9953435592424484e-05, "loss": 0.0, "step": 25675 }, { "epoch": 2.0111904529003226, "grad_norm": NaN, "learning_rate": 3.9943653153858195e-05, "loss": 0.0, "step": 25700 }, { "epoch": 2.0111904529003226, "eval_loss": NaN, "eval_runtime": 296.0597, "eval_samples_per_second": 406.388, "eval_steps_per_second": 6.35, "step": 25700 }, { "epoch": 2.0131468257849945, "grad_norm": NaN, "learning_rate": 3.993387071529191e-05, "loss": 0.0, "step": 25725 }, { "epoch": 2.0151031986696664, "grad_norm": NaN, "learning_rate": 3.992408827672562e-05, "loss": 0.0, "step": 25750 }, { "epoch": 2.0170595715543382, "grad_norm": NaN, "learning_rate": 3.991430583815934e-05, "loss": 0.0, "step": 25775 }, { "epoch": 2.01901594443901, "grad_norm": NaN, "learning_rate": 3.990452339959305e-05, "loss": 0.0, "step": 25800 }, { "epoch": 2.01901594443901, "eval_loss": NaN, "eval_runtime": 295.3246, "eval_samples_per_second": 407.399, "eval_steps_per_second": 6.366, "step": 25800 }, { "epoch": 2.020972317323682, "grad_norm": NaN, "learning_rate": 3.989474096102677e-05, "loss": 0.0, "step": 25825 }, { "epoch": 2.022928690208354, "grad_norm": NaN, "learning_rate": 3.988495852246048e-05, "loss": 0.0, "step": 25850 }, { "epoch": 2.0248850630930257, "grad_norm": NaN, "learning_rate": 3.98751760838942e-05, "loss": 0.0, "step": 25875 }, { "epoch": 2.026841435977697, "grad_norm": NaN, "learning_rate": 3.986539364532791e-05, "loss": 0.0, "step": 25900 }, { "epoch": 2.026841435977697, "eval_loss": NaN, "eval_runtime": 296.1742, "eval_samples_per_second": 406.231, "eval_steps_per_second": 6.348, "step": 25900 }, { "epoch": 2.028797808862369, "grad_norm": NaN, "learning_rate": 3.9855611206761625e-05, "loss": 0.0, "step": 25925 }, { "epoch": 2.030754181747041, "grad_norm": NaN, "learning_rate": 3.9845828768195336e-05, "loss": 0.0, "step": 25950 }, { "epoch": 2.032710554631713, "grad_norm": NaN, "learning_rate": 3.983604632962905e-05, "loss": 0.0, "step": 25975 }, { "epoch": 2.0346669275163847, "grad_norm": NaN, "learning_rate": 3.9826263891062764e-05, "loss": 0.0, "step": 26000 }, { "epoch": 2.0346669275163847, "eval_loss": NaN, "eval_runtime": 296.6015, "eval_samples_per_second": 405.645, "eval_steps_per_second": 6.338, "step": 26000 }, { "epoch": 2.0366233004010565, "grad_norm": NaN, "learning_rate": 3.981648145249648e-05, "loss": 0.0, "step": 26025 }, { "epoch": 2.0385796732857284, "grad_norm": NaN, "learning_rate": 3.980669901393019e-05, "loss": 0.0, "step": 26050 }, { "epoch": 2.0405360461704003, "grad_norm": NaN, "learning_rate": 3.979691657536391e-05, "loss": 0.0, "step": 26075 }, { "epoch": 2.0424924190550717, "grad_norm": NaN, "learning_rate": 3.978713413679762e-05, "loss": 0.0, "step": 26100 }, { "epoch": 2.0424924190550717, "eval_loss": NaN, "eval_runtime": 295.808, "eval_samples_per_second": 406.733, "eval_steps_per_second": 6.355, "step": 26100 }, { "epoch": 2.0444487919397436, "grad_norm": NaN, "learning_rate": 3.977735169823134e-05, "loss": 0.0, "step": 26125 }, { "epoch": 2.0464051648244155, "grad_norm": NaN, "learning_rate": 3.976756925966505e-05, "loss": 0.0, "step": 26150 }, { "epoch": 2.0483615377090874, "grad_norm": NaN, "learning_rate": 3.9757786821098766e-05, "loss": 0.0, "step": 26175 }, { "epoch": 2.0503179105937592, "grad_norm": NaN, "learning_rate": 3.974800438253248e-05, "loss": 0.0, "step": 26200 }, { "epoch": 2.0503179105937592, "eval_loss": NaN, "eval_runtime": 295.7139, "eval_samples_per_second": 406.863, "eval_steps_per_second": 6.357, "step": 26200 }, { "epoch": 2.052274283478431, "grad_norm": NaN, "learning_rate": 3.9738221943966194e-05, "loss": 0.0, "step": 26225 }, { "epoch": 2.054230656363103, "grad_norm": NaN, "learning_rate": 3.9728439505399905e-05, "loss": 0.0, "step": 26250 }, { "epoch": 2.0561870292477744, "grad_norm": NaN, "learning_rate": 3.971865706683362e-05, "loss": 0.0, "step": 26275 }, { "epoch": 2.0581434021324463, "grad_norm": NaN, "learning_rate": 3.970887462826733e-05, "loss": 0.0, "step": 26300 }, { "epoch": 2.0581434021324463, "eval_loss": NaN, "eval_runtime": 296.2958, "eval_samples_per_second": 406.064, "eval_steps_per_second": 6.345, "step": 26300 }, { "epoch": 2.060099775017118, "grad_norm": NaN, "learning_rate": 3.969909218970105e-05, "loss": 0.0, "step": 26325 }, { "epoch": 2.06205614790179, "grad_norm": NaN, "learning_rate": 3.968930975113476e-05, "loss": 0.0, "step": 26350 }, { "epoch": 2.064012520786462, "grad_norm": NaN, "learning_rate": 3.967952731256848e-05, "loss": 0.0, "step": 26375 }, { "epoch": 2.065968893671134, "grad_norm": NaN, "learning_rate": 3.966974487400219e-05, "loss": 0.0, "step": 26400 }, { "epoch": 2.065968893671134, "eval_loss": NaN, "eval_runtime": 296.5503, "eval_samples_per_second": 405.715, "eval_steps_per_second": 6.34, "step": 26400 }, { "epoch": 2.0679252665558057, "grad_norm": NaN, "learning_rate": 3.965996243543591e-05, "loss": 0.0, "step": 26425 }, { "epoch": 2.0698816394404775, "grad_norm": NaN, "learning_rate": 3.9650179996869624e-05, "loss": 0.0, "step": 26450 }, { "epoch": 2.071838012325149, "grad_norm": NaN, "learning_rate": 3.9640397558303335e-05, "loss": 0.0, "step": 26475 }, { "epoch": 2.073794385209821, "grad_norm": NaN, "learning_rate": 3.963061511973705e-05, "loss": 0.0, "step": 26500 }, { "epoch": 2.073794385209821, "eval_loss": NaN, "eval_runtime": 294.9122, "eval_samples_per_second": 407.969, "eval_steps_per_second": 6.375, "step": 26500 }, { "epoch": 2.0757507580944927, "grad_norm": NaN, "learning_rate": 3.962083268117076e-05, "loss": 0.0, "step": 26525 }, { "epoch": 2.0777071309791646, "grad_norm": NaN, "learning_rate": 3.961105024260448e-05, "loss": 0.0, "step": 26550 }, { "epoch": 2.0796635038638365, "grad_norm": NaN, "learning_rate": 3.960126780403819e-05, "loss": 0.0, "step": 26575 }, { "epoch": 2.0816198767485083, "grad_norm": NaN, "learning_rate": 3.959148536547191e-05, "loss": 0.0, "step": 26600 }, { "epoch": 2.0816198767485083, "eval_loss": NaN, "eval_runtime": 296.3271, "eval_samples_per_second": 406.021, "eval_steps_per_second": 6.344, "step": 26600 }, { "epoch": 2.08357624963318, "grad_norm": NaN, "learning_rate": 3.958170292690562e-05, "loss": 0.0, "step": 26625 }, { "epoch": 2.085532622517852, "grad_norm": NaN, "learning_rate": 3.957192048833934e-05, "loss": 0.0, "step": 26650 }, { "epoch": 2.0874889954025235, "grad_norm": NaN, "learning_rate": 3.956213804977305e-05, "loss": 0.0, "step": 26675 }, { "epoch": 2.0894453682871954, "grad_norm": NaN, "learning_rate": 3.9552355611206765e-05, "loss": 0.0, "step": 26700 }, { "epoch": 2.0894453682871954, "eval_loss": NaN, "eval_runtime": 296.1149, "eval_samples_per_second": 406.312, "eval_steps_per_second": 6.349, "step": 26700 }, { "epoch": 2.0914017411718673, "grad_norm": NaN, "learning_rate": 3.9542573172640476e-05, "loss": 0.0, "step": 26725 }, { "epoch": 2.093358114056539, "grad_norm": NaN, "learning_rate": 3.953279073407419e-05, "loss": 0.0, "step": 26750 }, { "epoch": 2.095314486941211, "grad_norm": NaN, "learning_rate": 3.9523008295507904e-05, "loss": 0.0, "step": 26775 }, { "epoch": 2.097270859825883, "grad_norm": NaN, "learning_rate": 3.951322585694162e-05, "loss": 0.0, "step": 26800 }, { "epoch": 2.097270859825883, "eval_loss": NaN, "eval_runtime": 295.5172, "eval_samples_per_second": 407.134, "eval_steps_per_second": 6.362, "step": 26800 }, { "epoch": 2.099227232710555, "grad_norm": NaN, "learning_rate": 3.950344341837533e-05, "loss": 0.0, "step": 26825 }, { "epoch": 2.1011836055952267, "grad_norm": NaN, "learning_rate": 3.949366097980905e-05, "loss": 0.0, "step": 26850 }, { "epoch": 2.103139978479898, "grad_norm": NaN, "learning_rate": 3.948387854124276e-05, "loss": 0.0, "step": 26875 }, { "epoch": 2.10509635136457, "grad_norm": NaN, "learning_rate": 3.947409610267648e-05, "loss": 0.0, "step": 26900 }, { "epoch": 2.10509635136457, "eval_loss": NaN, "eval_runtime": 295.6988, "eval_samples_per_second": 406.884, "eval_steps_per_second": 6.358, "step": 26900 }, { "epoch": 2.107052724249242, "grad_norm": NaN, "learning_rate": 3.946431366411019e-05, "loss": 0.0, "step": 26925 }, { "epoch": 2.1090090971339137, "grad_norm": NaN, "learning_rate": 3.9454531225543906e-05, "loss": 0.0, "step": 26950 }, { "epoch": 2.1109654700185856, "grad_norm": NaN, "learning_rate": 3.944474878697762e-05, "loss": 0.0, "step": 26975 }, { "epoch": 2.1129218429032575, "grad_norm": NaN, "learning_rate": 3.9434966348411334e-05, "loss": 0.0, "step": 27000 }, { "epoch": 2.1129218429032575, "eval_loss": NaN, "eval_runtime": 296.3976, "eval_samples_per_second": 405.924, "eval_steps_per_second": 6.343, "step": 27000 }, { "epoch": 2.1148782157879293, "grad_norm": NaN, "learning_rate": 3.9425183909845045e-05, "loss": 0.0, "step": 27025 }, { "epoch": 2.116834588672601, "grad_norm": NaN, "learning_rate": 3.941540147127876e-05, "loss": 0.0, "step": 27050 }, { "epoch": 2.1187909615572726, "grad_norm": NaN, "learning_rate": 3.940561903271247e-05, "loss": 0.0, "step": 27075 }, { "epoch": 2.1207473344419445, "grad_norm": NaN, "learning_rate": 3.939583659414619e-05, "loss": 0.0, "step": 27100 }, { "epoch": 2.1207473344419445, "eval_loss": NaN, "eval_runtime": 295.7013, "eval_samples_per_second": 406.88, "eval_steps_per_second": 6.358, "step": 27100 }, { "epoch": 2.1227037073266164, "grad_norm": NaN, "learning_rate": 3.93860541555799e-05, "loss": 0.0, "step": 27125 }, { "epoch": 2.1246600802112883, "grad_norm": NaN, "learning_rate": 3.937627171701362e-05, "loss": 0.0, "step": 27150 }, { "epoch": 2.12661645309596, "grad_norm": NaN, "learning_rate": 3.936648927844733e-05, "loss": 0.0, "step": 27175 }, { "epoch": 2.128572825980632, "grad_norm": NaN, "learning_rate": 3.935670683988105e-05, "loss": 0.0, "step": 27200 }, { "epoch": 2.128572825980632, "eval_loss": NaN, "eval_runtime": 297.4385, "eval_samples_per_second": 404.504, "eval_steps_per_second": 6.321, "step": 27200 }, { "epoch": 2.130529198865304, "grad_norm": NaN, "learning_rate": 3.934692440131476e-05, "loss": 0.0, "step": 27225 }, { "epoch": 2.1324855717499753, "grad_norm": NaN, "learning_rate": 3.9337141962748475e-05, "loss": 0.0, "step": 27250 }, { "epoch": 2.134441944634647, "grad_norm": NaN, "learning_rate": 3.932735952418219e-05, "loss": 0.0, "step": 27275 }, { "epoch": 2.136398317519319, "grad_norm": NaN, "learning_rate": 3.93175770856159e-05, "loss": 0.0, "step": 27300 }, { "epoch": 2.136398317519319, "eval_loss": NaN, "eval_runtime": 295.9286, "eval_samples_per_second": 406.568, "eval_steps_per_second": 6.353, "step": 27300 }, { "epoch": 2.138354690403991, "grad_norm": NaN, "learning_rate": 3.930779464704962e-05, "loss": 0.0, "step": 27325 }, { "epoch": 2.140311063288663, "grad_norm": NaN, "learning_rate": 3.929801220848333e-05, "loss": 0.0, "step": 27350 }, { "epoch": 2.1422674361733347, "grad_norm": NaN, "learning_rate": 3.928822976991705e-05, "loss": 0.0, "step": 27375 }, { "epoch": 2.1442238090580066, "grad_norm": NaN, "learning_rate": 3.927844733135076e-05, "loss": 0.0, "step": 27400 }, { "epoch": 2.1442238090580066, "eval_loss": NaN, "eval_runtime": 295.3647, "eval_samples_per_second": 407.344, "eval_steps_per_second": 6.365, "step": 27400 }, { "epoch": 2.1461801819426785, "grad_norm": NaN, "learning_rate": 3.926866489278448e-05, "loss": 0.0, "step": 27425 }, { "epoch": 2.14813655482735, "grad_norm": NaN, "learning_rate": 3.925888245421819e-05, "loss": 0.0, "step": 27450 }, { "epoch": 2.1500929277120218, "grad_norm": NaN, "learning_rate": 3.9249100015651905e-05, "loss": 0.0, "step": 27475 }, { "epoch": 2.1520493005966936, "grad_norm": NaN, "learning_rate": 3.9239317577085616e-05, "loss": 0.0, "step": 27500 }, { "epoch": 2.1520493005966936, "eval_loss": NaN, "eval_runtime": 296.6433, "eval_samples_per_second": 405.588, "eval_steps_per_second": 6.338, "step": 27500 }, { "epoch": 2.1540056734813655, "grad_norm": NaN, "learning_rate": 3.922953513851933e-05, "loss": 0.0, "step": 27525 }, { "epoch": 2.1559620463660374, "grad_norm": NaN, "learning_rate": 3.9219752699953044e-05, "loss": 0.0, "step": 27550 }, { "epoch": 2.1579184192507093, "grad_norm": NaN, "learning_rate": 3.920997026138676e-05, "loss": 0.0, "step": 27575 }, { "epoch": 2.159874792135381, "grad_norm": NaN, "learning_rate": 3.920018782282047e-05, "loss": 0.0, "step": 27600 }, { "epoch": 2.159874792135381, "eval_loss": NaN, "eval_runtime": 296.0283, "eval_samples_per_second": 406.431, "eval_steps_per_second": 6.351, "step": 27600 }, { "epoch": 2.161831165020053, "grad_norm": NaN, "learning_rate": 3.919040538425419e-05, "loss": 0.0, "step": 27625 }, { "epoch": 2.1637875379047244, "grad_norm": NaN, "learning_rate": 3.91806229456879e-05, "loss": 0.0, "step": 27650 }, { "epoch": 2.1657439107893963, "grad_norm": NaN, "learning_rate": 3.917084050712162e-05, "loss": 0.0, "step": 27675 }, { "epoch": 2.167700283674068, "grad_norm": NaN, "learning_rate": 3.9161058068555335e-05, "loss": 0.0, "step": 27700 }, { "epoch": 2.167700283674068, "eval_loss": NaN, "eval_runtime": 297.1062, "eval_samples_per_second": 404.956, "eval_steps_per_second": 6.328, "step": 27700 }, { "epoch": 2.16965665655874, "grad_norm": NaN, "learning_rate": 3.9151275629989046e-05, "loss": 0.0, "step": 27725 }, { "epoch": 2.171613029443412, "grad_norm": NaN, "learning_rate": 3.9141493191422763e-05, "loss": 0.0, "step": 27750 }, { "epoch": 2.173569402328084, "grad_norm": NaN, "learning_rate": 3.9131710752856474e-05, "loss": 0.0, "step": 27775 }, { "epoch": 2.1755257752127557, "grad_norm": NaN, "learning_rate": 3.912192831429019e-05, "loss": 0.0, "step": 27800 }, { "epoch": 2.1755257752127557, "eval_loss": NaN, "eval_runtime": 295.4641, "eval_samples_per_second": 407.207, "eval_steps_per_second": 6.363, "step": 27800 }, { "epoch": 2.1774821480974276, "grad_norm": NaN, "learning_rate": 3.91121458757239e-05, "loss": 0.0, "step": 27825 }, { "epoch": 2.179438520982099, "grad_norm": NaN, "learning_rate": 3.910236343715762e-05, "loss": 0.0, "step": 27850 }, { "epoch": 2.181394893866771, "grad_norm": NaN, "learning_rate": 3.909258099859133e-05, "loss": 0.0, "step": 27875 }, { "epoch": 2.1833512667514428, "grad_norm": NaN, "learning_rate": 3.908279856002505e-05, "loss": 0.0, "step": 27900 }, { "epoch": 2.1833512667514428, "eval_loss": NaN, "eval_runtime": 295.4111, "eval_samples_per_second": 407.28, "eval_steps_per_second": 6.364, "step": 27900 }, { "epoch": 2.1853076396361146, "grad_norm": NaN, "learning_rate": 3.907301612145876e-05, "loss": 0.0, "step": 27925 }, { "epoch": 2.1872640125207865, "grad_norm": NaN, "learning_rate": 3.9063233682892476e-05, "loss": 0.0, "step": 27950 }, { "epoch": 2.1892203854054584, "grad_norm": NaN, "learning_rate": 3.905345124432619e-05, "loss": 0.0, "step": 27975 }, { "epoch": 2.1911767582901303, "grad_norm": NaN, "learning_rate": 3.9043668805759904e-05, "loss": 0.0, "step": 28000 }, { "epoch": 2.1911767582901303, "eval_loss": NaN, "eval_runtime": 297.3446, "eval_samples_per_second": 404.631, "eval_steps_per_second": 6.323, "step": 28000 }, { "epoch": 2.193133131174802, "grad_norm": NaN, "learning_rate": 3.9033886367193615e-05, "loss": 0.0, "step": 28025 }, { "epoch": 2.1950895040594736, "grad_norm": NaN, "learning_rate": 3.902410392862733e-05, "loss": 0.0, "step": 28050 }, { "epoch": 2.1970458769441454, "grad_norm": NaN, "learning_rate": 3.901432149006104e-05, "loss": 0.0, "step": 28075 }, { "epoch": 2.1990022498288173, "grad_norm": NaN, "learning_rate": 3.900453905149476e-05, "loss": 0.0, "step": 28100 }, { "epoch": 2.1990022498288173, "eval_loss": NaN, "eval_runtime": 295.7188, "eval_samples_per_second": 406.856, "eval_steps_per_second": 6.357, "step": 28100 }, { "epoch": 2.200958622713489, "grad_norm": NaN, "learning_rate": 3.899475661292847e-05, "loss": 0.0, "step": 28125 }, { "epoch": 2.202914995598161, "grad_norm": NaN, "learning_rate": 3.898497417436219e-05, "loss": 0.0, "step": 28150 }, { "epoch": 2.204871368482833, "grad_norm": NaN, "learning_rate": 3.89751917357959e-05, "loss": 0.0, "step": 28175 }, { "epoch": 2.206827741367505, "grad_norm": NaN, "learning_rate": 3.896540929722962e-05, "loss": 0.0, "step": 28200 }, { "epoch": 2.206827741367505, "eval_loss": NaN, "eval_runtime": 296.2334, "eval_samples_per_second": 406.149, "eval_steps_per_second": 6.346, "step": 28200 }, { "epoch": 2.2087841142521762, "grad_norm": NaN, "learning_rate": 3.895562685866333e-05, "loss": 0.0, "step": 28225 }, { "epoch": 2.210740487136848, "grad_norm": NaN, "learning_rate": 3.8945844420097045e-05, "loss": 0.0, "step": 28250 }, { "epoch": 2.21269686002152, "grad_norm": NaN, "learning_rate": 3.8936061981530756e-05, "loss": 0.0, "step": 28275 }, { "epoch": 2.214653232906192, "grad_norm": NaN, "learning_rate": 3.892627954296447e-05, "loss": 0.0, "step": 28300 }, { "epoch": 2.214653232906192, "eval_loss": NaN, "eval_runtime": 296.4053, "eval_samples_per_second": 405.914, "eval_steps_per_second": 6.343, "step": 28300 }, { "epoch": 2.2166096057908637, "grad_norm": NaN, "learning_rate": 3.8916497104398184e-05, "loss": 0.0, "step": 28325 }, { "epoch": 2.2185659786755356, "grad_norm": NaN, "learning_rate": 3.89067146658319e-05, "loss": 0.0, "step": 28350 }, { "epoch": 2.2205223515602075, "grad_norm": NaN, "learning_rate": 3.889693222726561e-05, "loss": 0.0, "step": 28375 }, { "epoch": 2.2224787244448794, "grad_norm": NaN, "learning_rate": 3.888714978869933e-05, "loss": 0.0, "step": 28400 }, { "epoch": 2.2224787244448794, "eval_loss": NaN, "eval_runtime": 297.2769, "eval_samples_per_second": 404.724, "eval_steps_per_second": 6.324, "step": 28400 }, { "epoch": 2.2244350973295512, "grad_norm": NaN, "learning_rate": 3.887736735013304e-05, "loss": 0.0, "step": 28425 }, { "epoch": 2.2263914702142227, "grad_norm": NaN, "learning_rate": 3.886758491156676e-05, "loss": 0.0, "step": 28450 }, { "epoch": 2.2283478430988946, "grad_norm": NaN, "learning_rate": 3.885780247300047e-05, "loss": 0.0, "step": 28475 }, { "epoch": 2.2303042159835664, "grad_norm": NaN, "learning_rate": 3.8848020034434186e-05, "loss": 0.0, "step": 28500 }, { "epoch": 2.2303042159835664, "eval_loss": NaN, "eval_runtime": 297.2736, "eval_samples_per_second": 404.728, "eval_steps_per_second": 6.324, "step": 28500 }, { "epoch": 2.2322605888682383, "grad_norm": NaN, "learning_rate": 3.8838237595867897e-05, "loss": 0.0, "step": 28525 }, { "epoch": 2.23421696175291, "grad_norm": NaN, "learning_rate": 3.8828455157301614e-05, "loss": 0.0, "step": 28550 }, { "epoch": 2.236173334637582, "grad_norm": NaN, "learning_rate": 3.8818672718735325e-05, "loss": 0.0, "step": 28575 }, { "epoch": 2.238129707522254, "grad_norm": NaN, "learning_rate": 3.880889028016904e-05, "loss": 0.0, "step": 28600 }, { "epoch": 2.238129707522254, "eval_loss": NaN, "eval_runtime": 296.1847, "eval_samples_per_second": 406.216, "eval_steps_per_second": 6.347, "step": 28600 }, { "epoch": 2.2400860804069254, "grad_norm": NaN, "learning_rate": 3.879910784160275e-05, "loss": 0.0, "step": 28625 }, { "epoch": 2.2420424532915972, "grad_norm": NaN, "learning_rate": 3.878932540303647e-05, "loss": 0.0, "step": 28650 }, { "epoch": 2.243998826176269, "grad_norm": NaN, "learning_rate": 3.877954296447018e-05, "loss": 0.0, "step": 28675 }, { "epoch": 2.245955199060941, "grad_norm": NaN, "learning_rate": 3.87697605259039e-05, "loss": 0.0, "step": 28700 }, { "epoch": 2.245955199060941, "eval_loss": NaN, "eval_runtime": 296.4748, "eval_samples_per_second": 405.819, "eval_steps_per_second": 6.341, "step": 28700 }, { "epoch": 2.247911571945613, "grad_norm": NaN, "learning_rate": 3.875997808733761e-05, "loss": 0.0, "step": 28725 }, { "epoch": 2.2498679448302847, "grad_norm": NaN, "learning_rate": 3.875019564877133e-05, "loss": 0.0, "step": 28750 }, { "epoch": 2.2518243177149566, "grad_norm": NaN, "learning_rate": 3.874041321020504e-05, "loss": 0.0, "step": 28775 }, { "epoch": 2.253780690599628, "grad_norm": NaN, "learning_rate": 3.8730630771638755e-05, "loss": 0.0, "step": 28800 }, { "epoch": 2.253780690599628, "eval_loss": NaN, "eval_runtime": 296.2115, "eval_samples_per_second": 406.179, "eval_steps_per_second": 6.347, "step": 28800 }, { "epoch": 2.2557370634843, "grad_norm": NaN, "learning_rate": 3.8720848333072466e-05, "loss": 0.0, "step": 28825 }, { "epoch": 2.257693436368972, "grad_norm": NaN, "learning_rate": 3.871106589450618e-05, "loss": 0.0, "step": 28850 }, { "epoch": 2.2596498092536437, "grad_norm": NaN, "learning_rate": 3.8701283455939894e-05, "loss": 0.0, "step": 28875 }, { "epoch": 2.2616061821383155, "grad_norm": NaN, "learning_rate": 3.869150101737361e-05, "loss": 0.0, "step": 28900 }, { "epoch": 2.2616061821383155, "eval_loss": NaN, "eval_runtime": 296.8275, "eval_samples_per_second": 405.336, "eval_steps_per_second": 6.334, "step": 28900 }, { "epoch": 2.2635625550229874, "grad_norm": NaN, "learning_rate": 3.868171857880732e-05, "loss": 0.0, "step": 28925 }, { "epoch": 2.2655189279076593, "grad_norm": NaN, "learning_rate": 3.867193614024104e-05, "loss": 0.0, "step": 28950 }, { "epoch": 2.267475300792331, "grad_norm": NaN, "learning_rate": 3.866215370167475e-05, "loss": 0.0, "step": 28975 }, { "epoch": 2.269431673677003, "grad_norm": NaN, "learning_rate": 3.8652371263108474e-05, "loss": 0.0, "step": 29000 }, { "epoch": 2.269431673677003, "eval_loss": NaN, "eval_runtime": 295.8628, "eval_samples_per_second": 406.658, "eval_steps_per_second": 6.354, "step": 29000 }, { "epoch": 2.2713880465616745, "grad_norm": NaN, "learning_rate": 3.8642588824542185e-05, "loss": 0.0, "step": 29025 }, { "epoch": 2.2733444194463464, "grad_norm": NaN, "learning_rate": 3.86328063859759e-05, "loss": 0.0, "step": 29050 }, { "epoch": 2.2753007923310182, "grad_norm": NaN, "learning_rate": 3.862302394740961e-05, "loss": 0.0, "step": 29075 }, { "epoch": 2.27725716521569, "grad_norm": NaN, "learning_rate": 3.861324150884333e-05, "loss": 0.0, "step": 29100 }, { "epoch": 2.27725716521569, "eval_loss": NaN, "eval_runtime": 296.2143, "eval_samples_per_second": 406.176, "eval_steps_per_second": 6.347, "step": 29100 }, { "epoch": 2.279213538100362, "grad_norm": NaN, "learning_rate": 3.860345907027704e-05, "loss": 0.0, "step": 29125 }, { "epoch": 2.281169910985034, "grad_norm": NaN, "learning_rate": 3.859367663171076e-05, "loss": 0.0, "step": 29150 }, { "epoch": 2.2831262838697057, "grad_norm": NaN, "learning_rate": 3.858389419314447e-05, "loss": 0.0, "step": 29175 }, { "epoch": 2.285082656754377, "grad_norm": NaN, "learning_rate": 3.857411175457819e-05, "loss": 0.0, "step": 29200 }, { "epoch": 2.285082656754377, "eval_loss": NaN, "eval_runtime": 296.6714, "eval_samples_per_second": 405.55, "eval_steps_per_second": 6.337, "step": 29200 }, { "epoch": 2.287039029639049, "grad_norm": NaN, "learning_rate": 3.85643293160119e-05, "loss": 0.0, "step": 29225 }, { "epoch": 2.288995402523721, "grad_norm": NaN, "learning_rate": 3.8554546877445615e-05, "loss": 0.0, "step": 29250 }, { "epoch": 2.290951775408393, "grad_norm": NaN, "learning_rate": 3.8544764438879326e-05, "loss": 0.0, "step": 29275 }, { "epoch": 2.2929081482930647, "grad_norm": NaN, "learning_rate": 3.853498200031304e-05, "loss": 0.0, "step": 29300 }, { "epoch": 2.2929081482930647, "eval_loss": NaN, "eval_runtime": 296.4259, "eval_samples_per_second": 405.886, "eval_steps_per_second": 6.342, "step": 29300 }, { "epoch": 2.2948645211777365, "grad_norm": NaN, "learning_rate": 3.8525199561746754e-05, "loss": 0.0, "step": 29325 }, { "epoch": 2.2968208940624084, "grad_norm": NaN, "learning_rate": 3.851541712318047e-05, "loss": 0.0, "step": 29350 }, { "epoch": 2.2987772669470803, "grad_norm": NaN, "learning_rate": 3.850563468461418e-05, "loss": 0.0, "step": 29375 }, { "epoch": 2.300733639831752, "grad_norm": NaN, "learning_rate": 3.84958522460479e-05, "loss": 0.0, "step": 29400 }, { "epoch": 2.300733639831752, "eval_loss": NaN, "eval_runtime": 295.8374, "eval_samples_per_second": 406.693, "eval_steps_per_second": 6.355, "step": 29400 }, { "epoch": 2.3026900127164236, "grad_norm": NaN, "learning_rate": 3.848606980748161e-05, "loss": 0.0, "step": 29425 }, { "epoch": 2.3046463856010955, "grad_norm": NaN, "learning_rate": 3.847628736891533e-05, "loss": 0.0, "step": 29450 }, { "epoch": 2.3066027584857673, "grad_norm": NaN, "learning_rate": 3.846650493034904e-05, "loss": 0.0, "step": 29475 }, { "epoch": 2.308559131370439, "grad_norm": NaN, "learning_rate": 3.8456722491782756e-05, "loss": 0.0, "step": 29500 }, { "epoch": 2.308559131370439, "eval_loss": NaN, "eval_runtime": 296.6853, "eval_samples_per_second": 405.531, "eval_steps_per_second": 6.337, "step": 29500 }, { "epoch": 2.310515504255111, "grad_norm": NaN, "learning_rate": 3.844694005321647e-05, "loss": 0.0, "step": 29525 }, { "epoch": 2.312471877139783, "grad_norm": NaN, "learning_rate": 3.8437157614650184e-05, "loss": 0.0, "step": 29550 }, { "epoch": 2.314428250024455, "grad_norm": NaN, "learning_rate": 3.8427375176083895e-05, "loss": 0.0, "step": 29575 }, { "epoch": 2.3163846229091263, "grad_norm": NaN, "learning_rate": 3.841759273751761e-05, "loss": 0.0, "step": 29600 }, { "epoch": 2.3163846229091263, "eval_loss": NaN, "eval_runtime": 297.6677, "eval_samples_per_second": 404.192, "eval_steps_per_second": 6.316, "step": 29600 }, { "epoch": 2.318340995793798, "grad_norm": NaN, "learning_rate": 3.840781029895132e-05, "loss": 0.0, "step": 29625 }, { "epoch": 2.32029736867847, "grad_norm": NaN, "learning_rate": 3.839802786038504e-05, "loss": 0.0, "step": 29650 }, { "epoch": 2.322253741563142, "grad_norm": NaN, "learning_rate": 3.838824542181875e-05, "loss": 0.0, "step": 29675 }, { "epoch": 2.324210114447814, "grad_norm": NaN, "learning_rate": 3.837846298325247e-05, "loss": 0.0, "step": 29700 }, { "epoch": 2.324210114447814, "eval_loss": NaN, "eval_runtime": 297.4734, "eval_samples_per_second": 404.456, "eval_steps_per_second": 6.32, "step": 29700 }, { "epoch": 2.3261664873324857, "grad_norm": NaN, "learning_rate": 3.836868054468618e-05, "loss": 0.0, "step": 29725 }, { "epoch": 2.3281228602171575, "grad_norm": NaN, "learning_rate": 3.83588981061199e-05, "loss": 0.0, "step": 29750 }, { "epoch": 2.330079233101829, "grad_norm": NaN, "learning_rate": 3.834911566755361e-05, "loss": 0.0, "step": 29775 }, { "epoch": 2.332035605986501, "grad_norm": NaN, "learning_rate": 3.8339333228987325e-05, "loss": 0.0, "step": 29800 }, { "epoch": 2.332035605986501, "eval_loss": NaN, "eval_runtime": 296.421, "eval_samples_per_second": 405.892, "eval_steps_per_second": 6.342, "step": 29800 }, { "epoch": 2.3339919788711727, "grad_norm": NaN, "learning_rate": 3.8329550790421036e-05, "loss": 0.0, "step": 29825 }, { "epoch": 2.3359483517558446, "grad_norm": NaN, "learning_rate": 3.831976835185475e-05, "loss": 0.0, "step": 29850 }, { "epoch": 2.3379047246405165, "grad_norm": NaN, "learning_rate": 3.8309985913288464e-05, "loss": 0.0, "step": 29875 }, { "epoch": 2.3398610975251883, "grad_norm": NaN, "learning_rate": 3.830020347472218e-05, "loss": 0.0, "step": 29900 }, { "epoch": 2.3398610975251883, "eval_loss": NaN, "eval_runtime": 296.6864, "eval_samples_per_second": 405.529, "eval_steps_per_second": 6.337, "step": 29900 }, { "epoch": 2.34181747040986, "grad_norm": NaN, "learning_rate": 3.829042103615589e-05, "loss": 0.0, "step": 29925 }, { "epoch": 2.343773843294532, "grad_norm": NaN, "learning_rate": 3.828063859758961e-05, "loss": 0.0, "step": 29950 }, { "epoch": 2.345730216179204, "grad_norm": NaN, "learning_rate": 3.827085615902332e-05, "loss": 0.0, "step": 29975 }, { "epoch": 2.3476865890638754, "grad_norm": NaN, "learning_rate": 3.826107372045704e-05, "loss": 0.0, "step": 30000 }, { "epoch": 2.3476865890638754, "eval_loss": NaN, "eval_runtime": 296.1668, "eval_samples_per_second": 406.241, "eval_steps_per_second": 6.348, "step": 30000 }, { "epoch": 2.3496429619485473, "grad_norm": NaN, "learning_rate": 3.825129128189075e-05, "loss": 0.0, "step": 30025 }, { "epoch": 2.351599334833219, "grad_norm": NaN, "learning_rate": 3.8241508843324466e-05, "loss": 0.0, "step": 30050 }, { "epoch": 2.353555707717891, "grad_norm": NaN, "learning_rate": 3.8231726404758177e-05, "loss": 0.0, "step": 30075 }, { "epoch": 2.355512080602563, "grad_norm": NaN, "learning_rate": 3.8221943966191894e-05, "loss": 0.0, "step": 30100 }, { "epoch": 2.355512080602563, "eval_loss": NaN, "eval_runtime": 297.3596, "eval_samples_per_second": 404.611, "eval_steps_per_second": 6.322, "step": 30100 }, { "epoch": 2.3574684534872348, "grad_norm": NaN, "learning_rate": 3.8212161527625605e-05, "loss": 0.0, "step": 30125 }, { "epoch": 2.3594248263719066, "grad_norm": NaN, "learning_rate": 3.820237908905932e-05, "loss": 0.0, "step": 30150 }, { "epoch": 2.361381199256578, "grad_norm": NaN, "learning_rate": 3.819259665049303e-05, "loss": 0.0, "step": 30175 }, { "epoch": 2.36333757214125, "grad_norm": NaN, "learning_rate": 3.818281421192675e-05, "loss": 0.0, "step": 30200 }, { "epoch": 2.36333757214125, "eval_loss": NaN, "eval_runtime": 296.0932, "eval_samples_per_second": 406.342, "eval_steps_per_second": 6.349, "step": 30200 }, { "epoch": 2.365293945025922, "grad_norm": NaN, "learning_rate": 3.817303177336046e-05, "loss": 0.0, "step": 30225 }, { "epoch": 2.3672503179105937, "grad_norm": NaN, "learning_rate": 3.816324933479418e-05, "loss": 0.0, "step": 30250 }, { "epoch": 2.3692066907952656, "grad_norm": NaN, "learning_rate": 3.8153466896227896e-05, "loss": 0.0, "step": 30275 }, { "epoch": 2.3711630636799375, "grad_norm": NaN, "learning_rate": 3.8143684457661613e-05, "loss": 0.0, "step": 30300 }, { "epoch": 2.3711630636799375, "eval_loss": NaN, "eval_runtime": 296.3021, "eval_samples_per_second": 406.055, "eval_steps_per_second": 6.345, "step": 30300 }, { "epoch": 2.3731194365646093, "grad_norm": NaN, "learning_rate": 3.8133902019095324e-05, "loss": 0.0, "step": 30325 }, { "epoch": 2.375075809449281, "grad_norm": NaN, "learning_rate": 3.812411958052904e-05, "loss": 0.0, "step": 30350 }, { "epoch": 2.377032182333953, "grad_norm": NaN, "learning_rate": 3.811433714196275e-05, "loss": 0.0, "step": 30375 }, { "epoch": 2.3789885552186245, "grad_norm": NaN, "learning_rate": 3.810455470339647e-05, "loss": 0.0, "step": 30400 }, { "epoch": 2.3789885552186245, "eval_loss": NaN, "eval_runtime": 296.153, "eval_samples_per_second": 406.26, "eval_steps_per_second": 6.348, "step": 30400 }, { "epoch": 2.3809449281032964, "grad_norm": NaN, "learning_rate": 3.809477226483018e-05, "loss": 0.0, "step": 30425 }, { "epoch": 2.3829013009879683, "grad_norm": NaN, "learning_rate": 3.80849898262639e-05, "loss": 0.0, "step": 30450 }, { "epoch": 2.38485767387264, "grad_norm": NaN, "learning_rate": 3.807520738769761e-05, "loss": 0.0, "step": 30475 }, { "epoch": 2.386814046757312, "grad_norm": NaN, "learning_rate": 3.8065424949131326e-05, "loss": 0.0, "step": 30500 }, { "epoch": 2.386814046757312, "eval_loss": NaN, "eval_runtime": 296.1743, "eval_samples_per_second": 406.23, "eval_steps_per_second": 6.348, "step": 30500 }, { "epoch": 2.388770419641984, "grad_norm": NaN, "learning_rate": 3.805564251056504e-05, "loss": 0.0, "step": 30525 }, { "epoch": 2.3907267925266558, "grad_norm": NaN, "learning_rate": 3.8045860071998754e-05, "loss": 0.0, "step": 30550 }, { "epoch": 2.392683165411327, "grad_norm": NaN, "learning_rate": 3.8036077633432465e-05, "loss": 0.0, "step": 30575 }, { "epoch": 2.394639538295999, "grad_norm": NaN, "learning_rate": 3.802629519486618e-05, "loss": 0.0, "step": 30600 }, { "epoch": 2.394639538295999, "eval_loss": NaN, "eval_runtime": 296.449, "eval_samples_per_second": 405.854, "eval_steps_per_second": 6.342, "step": 30600 }, { "epoch": 2.396595911180671, "grad_norm": NaN, "learning_rate": 3.801651275629989e-05, "loss": 0.0, "step": 30625 }, { "epoch": 2.398552284065343, "grad_norm": NaN, "learning_rate": 3.800673031773361e-05, "loss": 0.0, "step": 30650 }, { "epoch": 2.4005086569500147, "grad_norm": NaN, "learning_rate": 3.799694787916732e-05, "loss": 0.0, "step": 30675 }, { "epoch": 2.4024650298346866, "grad_norm": NaN, "learning_rate": 3.798716544060104e-05, "loss": 0.0, "step": 30700 }, { "epoch": 2.4024650298346866, "eval_loss": NaN, "eval_runtime": 297.2008, "eval_samples_per_second": 404.827, "eval_steps_per_second": 6.326, "step": 30700 }, { "epoch": 2.4044214027193584, "grad_norm": NaN, "learning_rate": 3.797738300203475e-05, "loss": 0.0, "step": 30725 }, { "epoch": 2.4063777756040303, "grad_norm": NaN, "learning_rate": 3.796760056346847e-05, "loss": 0.0, "step": 30750 }, { "epoch": 2.4083341484887018, "grad_norm": NaN, "learning_rate": 3.795781812490218e-05, "loss": 0.0, "step": 30775 }, { "epoch": 2.4102905213733736, "grad_norm": NaN, "learning_rate": 3.7948035686335895e-05, "loss": 0.0, "step": 30800 }, { "epoch": 2.4102905213733736, "eval_loss": NaN, "eval_runtime": 297.1122, "eval_samples_per_second": 404.948, "eval_steps_per_second": 6.328, "step": 30800 }, { "epoch": 2.4122468942580455, "grad_norm": NaN, "learning_rate": 3.7938253247769606e-05, "loss": 0.0, "step": 30825 }, { "epoch": 2.4142032671427174, "grad_norm": NaN, "learning_rate": 3.792847080920332e-05, "loss": 0.0, "step": 30850 }, { "epoch": 2.4161596400273893, "grad_norm": NaN, "learning_rate": 3.7918688370637034e-05, "loss": 0.0, "step": 30875 }, { "epoch": 2.418116012912061, "grad_norm": NaN, "learning_rate": 3.790890593207075e-05, "loss": 0.0, "step": 30900 }, { "epoch": 2.418116012912061, "eval_loss": NaN, "eval_runtime": 295.0876, "eval_samples_per_second": 407.726, "eval_steps_per_second": 6.371, "step": 30900 }, { "epoch": 2.420072385796733, "grad_norm": NaN, "learning_rate": 3.789912349350446e-05, "loss": 0.0, "step": 30925 }, { "epoch": 2.422028758681405, "grad_norm": NaN, "learning_rate": 3.788934105493818e-05, "loss": 0.0, "step": 30950 }, { "epoch": 2.4239851315660763, "grad_norm": NaN, "learning_rate": 3.787955861637189e-05, "loss": 0.0, "step": 30975 }, { "epoch": 2.425941504450748, "grad_norm": NaN, "learning_rate": 3.786977617780561e-05, "loss": 0.0, "step": 31000 }, { "epoch": 2.425941504450748, "eval_loss": NaN, "eval_runtime": 297.9706, "eval_samples_per_second": 403.782, "eval_steps_per_second": 6.309, "step": 31000 }, { "epoch": 2.42789787733542, "grad_norm": NaN, "learning_rate": 3.785999373923932e-05, "loss": 0.0, "step": 31025 }, { "epoch": 2.429854250220092, "grad_norm": NaN, "learning_rate": 3.7850211300673036e-05, "loss": 0.0, "step": 31050 }, { "epoch": 2.431810623104764, "grad_norm": NaN, "learning_rate": 3.784042886210675e-05, "loss": 0.0, "step": 31075 }, { "epoch": 2.4337669959894357, "grad_norm": NaN, "learning_rate": 3.7830646423540464e-05, "loss": 0.0, "step": 31100 }, { "epoch": 2.4337669959894357, "eval_loss": NaN, "eval_runtime": 296.973, "eval_samples_per_second": 405.138, "eval_steps_per_second": 6.331, "step": 31100 }, { "epoch": 2.4357233688741076, "grad_norm": NaN, "learning_rate": 3.7820863984974175e-05, "loss": 0.0, "step": 31125 }, { "epoch": 2.437679741758779, "grad_norm": NaN, "learning_rate": 3.781108154640789e-05, "loss": 0.0, "step": 31150 }, { "epoch": 2.439636114643451, "grad_norm": NaN, "learning_rate": 3.78012991078416e-05, "loss": 0.0, "step": 31175 }, { "epoch": 2.4415924875281227, "grad_norm": NaN, "learning_rate": 3.779151666927532e-05, "loss": 0.0, "step": 31200 }, { "epoch": 2.4415924875281227, "eval_loss": NaN, "eval_runtime": 295.7571, "eval_samples_per_second": 406.803, "eval_steps_per_second": 6.357, "step": 31200 }, { "epoch": 2.4435488604127946, "grad_norm": NaN, "learning_rate": 3.778173423070903e-05, "loss": 0.0, "step": 31225 }, { "epoch": 2.4455052332974665, "grad_norm": NaN, "learning_rate": 3.777195179214275e-05, "loss": 0.0, "step": 31250 }, { "epoch": 2.4474616061821384, "grad_norm": NaN, "learning_rate": 3.776216935357646e-05, "loss": 0.0, "step": 31275 }, { "epoch": 2.4494179790668102, "grad_norm": NaN, "learning_rate": 3.775238691501018e-05, "loss": 0.0, "step": 31300 }, { "epoch": 2.4494179790668102, "eval_loss": NaN, "eval_runtime": 296.5757, "eval_samples_per_second": 405.681, "eval_steps_per_second": 6.339, "step": 31300 }, { "epoch": 2.451374351951482, "grad_norm": NaN, "learning_rate": 3.774260447644389e-05, "loss": 0.0, "step": 31325 }, { "epoch": 2.453330724836154, "grad_norm": NaN, "learning_rate": 3.7732822037877605e-05, "loss": 0.0, "step": 31350 }, { "epoch": 2.4552870977208254, "grad_norm": NaN, "learning_rate": 3.7723039599311316e-05, "loss": 0.0, "step": 31375 }, { "epoch": 2.4572434706054973, "grad_norm": NaN, "learning_rate": 3.771325716074503e-05, "loss": 0.0, "step": 31400 }, { "epoch": 2.4572434706054973, "eval_loss": NaN, "eval_runtime": 296.9307, "eval_samples_per_second": 405.196, "eval_steps_per_second": 6.331, "step": 31400 }, { "epoch": 2.459199843490169, "grad_norm": NaN, "learning_rate": 3.7703474722178744e-05, "loss": 0.0, "step": 31425 }, { "epoch": 2.461156216374841, "grad_norm": NaN, "learning_rate": 3.769369228361246e-05, "loss": 0.0, "step": 31450 }, { "epoch": 2.463112589259513, "grad_norm": NaN, "learning_rate": 3.768390984504617e-05, "loss": 0.0, "step": 31475 }, { "epoch": 2.465068962144185, "grad_norm": NaN, "learning_rate": 3.767412740647989e-05, "loss": 0.0, "step": 31500 }, { "epoch": 2.465068962144185, "eval_loss": NaN, "eval_runtime": 297.611, "eval_samples_per_second": 404.269, "eval_steps_per_second": 6.317, "step": 31500 }, { "epoch": 2.4670253350288567, "grad_norm": NaN, "learning_rate": 3.76643449679136e-05, "loss": 0.0, "step": 31525 }, { "epoch": 2.468981707913528, "grad_norm": NaN, "learning_rate": 3.765456252934732e-05, "loss": 0.0, "step": 31550 }, { "epoch": 2.4709380807982, "grad_norm": NaN, "learning_rate": 3.7644780090781035e-05, "loss": 0.0, "step": 31575 }, { "epoch": 2.472894453682872, "grad_norm": NaN, "learning_rate": 3.7634997652214746e-05, "loss": 0.0, "step": 31600 }, { "epoch": 2.472894453682872, "eval_loss": NaN, "eval_runtime": 295.587, "eval_samples_per_second": 407.037, "eval_steps_per_second": 6.36, "step": 31600 }, { "epoch": 2.4748508265675437, "grad_norm": NaN, "learning_rate": 3.762521521364846e-05, "loss": 0.0, "step": 31625 }, { "epoch": 2.4768071994522156, "grad_norm": NaN, "learning_rate": 3.7615432775082174e-05, "loss": 0.0, "step": 31650 }, { "epoch": 2.4787635723368875, "grad_norm": NaN, "learning_rate": 3.760565033651589e-05, "loss": 0.0, "step": 31675 }, { "epoch": 2.4807199452215594, "grad_norm": NaN, "learning_rate": 3.75958678979496e-05, "loss": 0.0, "step": 31700 }, { "epoch": 2.4807199452215594, "eval_loss": NaN, "eval_runtime": 297.9969, "eval_samples_per_second": 403.746, "eval_steps_per_second": 6.309, "step": 31700 }, { "epoch": 2.4826763181062312, "grad_norm": NaN, "learning_rate": 3.758608545938332e-05, "loss": 0.0, "step": 31725 }, { "epoch": 2.484632690990903, "grad_norm": NaN, "learning_rate": 3.757630302081703e-05, "loss": 0.0, "step": 31750 }, { "epoch": 2.4865890638755745, "grad_norm": NaN, "learning_rate": 3.756652058225075e-05, "loss": 0.0, "step": 31775 }, { "epoch": 2.4885454367602464, "grad_norm": NaN, "learning_rate": 3.755673814368446e-05, "loss": 0.0, "step": 31800 }, { "epoch": 2.4885454367602464, "eval_loss": NaN, "eval_runtime": 296.009, "eval_samples_per_second": 406.457, "eval_steps_per_second": 6.351, "step": 31800 }, { "epoch": 2.4905018096449183, "grad_norm": NaN, "learning_rate": 3.7546955705118176e-05, "loss": 0.0, "step": 31825 }, { "epoch": 2.49245818252959, "grad_norm": NaN, "learning_rate": 3.7537173266551887e-05, "loss": 0.0, "step": 31850 }, { "epoch": 2.494414555414262, "grad_norm": NaN, "learning_rate": 3.7527390827985604e-05, "loss": 0.0, "step": 31875 }, { "epoch": 2.496370928298934, "grad_norm": NaN, "learning_rate": 3.7517608389419315e-05, "loss": 0.0, "step": 31900 }, { "epoch": 2.496370928298934, "eval_loss": NaN, "eval_runtime": 297.0989, "eval_samples_per_second": 404.966, "eval_steps_per_second": 6.328, "step": 31900 }, { "epoch": 2.498327301183606, "grad_norm": NaN, "learning_rate": 3.750782595085303e-05, "loss": 0.0, "step": 31925 }, { "epoch": 2.5002836740682772, "grad_norm": NaN, "learning_rate": 3.749804351228674e-05, "loss": 0.0, "step": 31950 }, { "epoch": 2.502240046952949, "grad_norm": NaN, "learning_rate": 3.748826107372046e-05, "loss": 0.0, "step": 31975 }, { "epoch": 2.504196419837621, "grad_norm": NaN, "learning_rate": 3.747847863515417e-05, "loss": 0.0, "step": 32000 }, { "epoch": 2.504196419837621, "eval_loss": NaN, "eval_runtime": 294.6654, "eval_samples_per_second": 408.311, "eval_steps_per_second": 6.38, "step": 32000 }, { "epoch": 2.506152792722293, "grad_norm": NaN, "learning_rate": 3.746869619658789e-05, "loss": 0.0, "step": 32025 }, { "epoch": 2.5081091656069647, "grad_norm": NaN, "learning_rate": 3.74589137580216e-05, "loss": 0.0, "step": 32050 }, { "epoch": 2.5100655384916366, "grad_norm": NaN, "learning_rate": 3.744913131945532e-05, "loss": 0.0, "step": 32075 }, { "epoch": 2.5120219113763085, "grad_norm": NaN, "learning_rate": 3.743934888088903e-05, "loss": 0.0, "step": 32100 }, { "epoch": 2.5120219113763085, "eval_loss": NaN, "eval_runtime": 295.3586, "eval_samples_per_second": 407.352, "eval_steps_per_second": 6.365, "step": 32100 }, { "epoch": 2.51397828426098, "grad_norm": NaN, "learning_rate": 3.7429566442322745e-05, "loss": 0.0, "step": 32125 }, { "epoch": 2.5159346571456522, "grad_norm": NaN, "learning_rate": 3.7419784003756456e-05, "loss": 0.0, "step": 32150 }, { "epoch": 2.5178910300303237, "grad_norm": NaN, "learning_rate": 3.741000156519017e-05, "loss": 0.0, "step": 32175 }, { "epoch": 2.5198474029149955, "grad_norm": NaN, "learning_rate": 3.7400219126623884e-05, "loss": 0.0, "step": 32200 }, { "epoch": 2.5198474029149955, "eval_loss": NaN, "eval_runtime": 295.0158, "eval_samples_per_second": 407.826, "eval_steps_per_second": 6.373, "step": 32200 }, { "epoch": 2.5218037757996674, "grad_norm": NaN, "learning_rate": 3.73904366880576e-05, "loss": 0.0, "step": 32225 }, { "epoch": 2.5237601486843393, "grad_norm": NaN, "learning_rate": 3.738065424949131e-05, "loss": 0.0, "step": 32250 }, { "epoch": 2.525716521569011, "grad_norm": NaN, "learning_rate": 3.737087181092503e-05, "loss": 0.0, "step": 32275 }, { "epoch": 2.5276728944536826, "grad_norm": NaN, "learning_rate": 3.736108937235874e-05, "loss": 0.0, "step": 32300 }, { "epoch": 2.5276728944536826, "eval_loss": NaN, "eval_runtime": 296.8143, "eval_samples_per_second": 405.354, "eval_steps_per_second": 6.334, "step": 32300 }, { "epoch": 2.529629267338355, "grad_norm": NaN, "learning_rate": 3.735130693379246e-05, "loss": 0.0, "step": 32325 }, { "epoch": 2.5315856402230263, "grad_norm": NaN, "learning_rate": 3.734152449522617e-05, "loss": 0.0, "step": 32350 }, { "epoch": 2.533542013107698, "grad_norm": NaN, "learning_rate": 3.7331742056659886e-05, "loss": 0.0, "step": 32375 }, { "epoch": 2.53549838599237, "grad_norm": NaN, "learning_rate": 3.7321959618093596e-05, "loss": 0.0, "step": 32400 }, { "epoch": 2.53549838599237, "eval_loss": NaN, "eval_runtime": 296.5938, "eval_samples_per_second": 405.656, "eval_steps_per_second": 6.339, "step": 32400 }, { "epoch": 2.537454758877042, "grad_norm": NaN, "learning_rate": 3.7312177179527314e-05, "loss": 0.0, "step": 32425 }, { "epoch": 2.539411131761714, "grad_norm": NaN, "learning_rate": 3.7302394740961025e-05, "loss": 0.0, "step": 32450 }, { "epoch": 2.5413675046463857, "grad_norm": NaN, "learning_rate": 3.729261230239474e-05, "loss": 0.0, "step": 32475 }, { "epoch": 2.5433238775310576, "grad_norm": NaN, "learning_rate": 3.728282986382845e-05, "loss": 0.0, "step": 32500 }, { "epoch": 2.5433238775310576, "eval_loss": NaN, "eval_runtime": 296.3782, "eval_samples_per_second": 405.951, "eval_steps_per_second": 6.343, "step": 32500 }, { "epoch": 2.545280250415729, "grad_norm": NaN, "learning_rate": 3.727304742526217e-05, "loss": 0.0, "step": 32525 }, { "epoch": 2.547236623300401, "grad_norm": NaN, "learning_rate": 3.726326498669588e-05, "loss": 0.0, "step": 32550 }, { "epoch": 2.549192996185073, "grad_norm": NaN, "learning_rate": 3.72534825481296e-05, "loss": 0.0, "step": 32575 }, { "epoch": 2.5511493690697447, "grad_norm": NaN, "learning_rate": 3.724370010956331e-05, "loss": 0.0, "step": 32600 }, { "epoch": 2.5511493690697447, "eval_loss": NaN, "eval_runtime": 296.5827, "eval_samples_per_second": 405.671, "eval_steps_per_second": 6.339, "step": 32600 }, { "epoch": 2.5531057419544165, "grad_norm": NaN, "learning_rate": 3.7233917670997027e-05, "loss": 0.0, "step": 32625 }, { "epoch": 2.5550621148390884, "grad_norm": NaN, "learning_rate": 3.722413523243074e-05, "loss": 0.0, "step": 32650 }, { "epoch": 2.5570184877237603, "grad_norm": NaN, "learning_rate": 3.7214352793864455e-05, "loss": 0.0, "step": 32675 }, { "epoch": 2.5589748606084317, "grad_norm": NaN, "learning_rate": 3.7204570355298165e-05, "loss": 0.0, "step": 32700 }, { "epoch": 2.5589748606084317, "eval_loss": NaN, "eval_runtime": 296.2889, "eval_samples_per_second": 406.073, "eval_steps_per_second": 6.345, "step": 32700 }, { "epoch": 2.560931233493104, "grad_norm": NaN, "learning_rate": 3.719478791673188e-05, "loss": 0.0, "step": 32725 }, { "epoch": 2.5628876063777755, "grad_norm": NaN, "learning_rate": 3.7185005478165594e-05, "loss": 0.0, "step": 32750 }, { "epoch": 2.5648439792624473, "grad_norm": NaN, "learning_rate": 3.717522303959931e-05, "loss": 0.0, "step": 32775 }, { "epoch": 2.566800352147119, "grad_norm": NaN, "learning_rate": 3.716544060103302e-05, "loss": 0.0, "step": 32800 }, { "epoch": 2.566800352147119, "eval_loss": NaN, "eval_runtime": 297.6756, "eval_samples_per_second": 404.182, "eval_steps_per_second": 6.316, "step": 32800 }, { "epoch": 2.568756725031791, "grad_norm": NaN, "learning_rate": 3.715565816246674e-05, "loss": 0.0, "step": 32825 }, { "epoch": 2.570713097916463, "grad_norm": NaN, "learning_rate": 3.714587572390046e-05, "loss": 0.0, "step": 32850 }, { "epoch": 2.572669470801135, "grad_norm": NaN, "learning_rate": 3.7136093285334174e-05, "loss": 0.0, "step": 32875 }, { "epoch": 2.5746258436858067, "grad_norm": NaN, "learning_rate": 3.7126310846767885e-05, "loss": 0.0, "step": 32900 }, { "epoch": 2.5746258436858067, "eval_loss": NaN, "eval_runtime": 295.858, "eval_samples_per_second": 406.665, "eval_steps_per_second": 6.354, "step": 32900 }, { "epoch": 2.576582216570478, "grad_norm": NaN, "learning_rate": 3.71165284082016e-05, "loss": 0.0, "step": 32925 }, { "epoch": 2.57853858945515, "grad_norm": NaN, "learning_rate": 3.710674596963531e-05, "loss": 0.0, "step": 32950 }, { "epoch": 2.580494962339822, "grad_norm": NaN, "learning_rate": 3.709696353106903e-05, "loss": 0.0, "step": 32975 }, { "epoch": 2.5824513352244938, "grad_norm": NaN, "learning_rate": 3.708718109250274e-05, "loss": 0.0, "step": 33000 }, { "epoch": 2.5824513352244938, "eval_loss": NaN, "eval_runtime": 296.7587, "eval_samples_per_second": 405.43, "eval_steps_per_second": 6.335, "step": 33000 }, { "epoch": 2.5844077081091656, "grad_norm": NaN, "learning_rate": 3.707739865393646e-05, "loss": 0.0, "step": 33025 }, { "epoch": 2.5863640809938375, "grad_norm": NaN, "learning_rate": 3.706761621537017e-05, "loss": 0.0, "step": 33050 }, { "epoch": 2.5883204538785094, "grad_norm": NaN, "learning_rate": 3.705783377680389e-05, "loss": 0.0, "step": 33075 }, { "epoch": 2.590276826763181, "grad_norm": NaN, "learning_rate": 3.70480513382376e-05, "loss": 0.0, "step": 33100 }, { "epoch": 2.590276826763181, "eval_loss": NaN, "eval_runtime": 297.6418, "eval_samples_per_second": 404.228, "eval_steps_per_second": 6.316, "step": 33100 }, { "epoch": 2.592233199647853, "grad_norm": NaN, "learning_rate": 3.7038268899671315e-05, "loss": 0.0, "step": 33125 }, { "epoch": 2.5941895725325246, "grad_norm": NaN, "learning_rate": 3.7028486461105026e-05, "loss": 0.0, "step": 33150 }, { "epoch": 2.5961459454171965, "grad_norm": NaN, "learning_rate": 3.701870402253874e-05, "loss": 0.0, "step": 33175 }, { "epoch": 2.5981023183018683, "grad_norm": NaN, "learning_rate": 3.7008921583972454e-05, "loss": 0.0, "step": 33200 }, { "epoch": 2.5981023183018683, "eval_loss": NaN, "eval_runtime": 296.7386, "eval_samples_per_second": 405.458, "eval_steps_per_second": 6.336, "step": 33200 }, { "epoch": 2.60005869118654, "grad_norm": NaN, "learning_rate": 3.699913914540617e-05, "loss": 0.0, "step": 33225 }, { "epoch": 2.602015064071212, "grad_norm": NaN, "learning_rate": 3.698935670683988e-05, "loss": 0.0, "step": 33250 }, { "epoch": 2.603971436955884, "grad_norm": NaN, "learning_rate": 3.69795742682736e-05, "loss": 0.0, "step": 33275 }, { "epoch": 2.605927809840556, "grad_norm": NaN, "learning_rate": 3.696979182970731e-05, "loss": 0.0, "step": 33300 }, { "epoch": 2.605927809840556, "eval_loss": NaN, "eval_runtime": 297.7145, "eval_samples_per_second": 404.129, "eval_steps_per_second": 6.315, "step": 33300 }, { "epoch": 2.6078841827252273, "grad_norm": NaN, "learning_rate": 3.696000939114103e-05, "loss": 0.0, "step": 33325 }, { "epoch": 2.609840555609899, "grad_norm": NaN, "learning_rate": 3.695022695257474e-05, "loss": 0.0, "step": 33350 }, { "epoch": 2.611796928494571, "grad_norm": NaN, "learning_rate": 3.6940444514008456e-05, "loss": 0.0, "step": 33375 }, { "epoch": 2.613753301379243, "grad_norm": NaN, "learning_rate": 3.6930662075442167e-05, "loss": 0.0, "step": 33400 }, { "epoch": 2.613753301379243, "eval_loss": NaN, "eval_runtime": 297.4573, "eval_samples_per_second": 404.478, "eval_steps_per_second": 6.32, "step": 33400 }, { "epoch": 2.6157096742639148, "grad_norm": NaN, "learning_rate": 3.6920879636875884e-05, "loss": 0.0, "step": 33425 }, { "epoch": 2.6176660471485866, "grad_norm": NaN, "learning_rate": 3.6911097198309595e-05, "loss": 0.0, "step": 33450 }, { "epoch": 2.6196224200332585, "grad_norm": NaN, "learning_rate": 3.690131475974331e-05, "loss": 0.0, "step": 33475 }, { "epoch": 2.62157879291793, "grad_norm": NaN, "learning_rate": 3.689153232117702e-05, "loss": 0.0, "step": 33500 }, { "epoch": 2.62157879291793, "eval_loss": NaN, "eval_runtime": 295.4355, "eval_samples_per_second": 407.246, "eval_steps_per_second": 6.363, "step": 33500 }, { "epoch": 2.623535165802602, "grad_norm": NaN, "learning_rate": 3.688174988261074e-05, "loss": 0.0, "step": 33525 }, { "epoch": 2.6254915386872737, "grad_norm": NaN, "learning_rate": 3.687196744404445e-05, "loss": 0.0, "step": 33550 }, { "epoch": 2.6274479115719456, "grad_norm": NaN, "learning_rate": 3.686218500547817e-05, "loss": 0.0, "step": 33575 }, { "epoch": 2.6294042844566174, "grad_norm": NaN, "learning_rate": 3.685240256691188e-05, "loss": 0.0, "step": 33600 }, { "epoch": 2.6294042844566174, "eval_loss": NaN, "eval_runtime": 296.4171, "eval_samples_per_second": 405.898, "eval_steps_per_second": 6.342, "step": 33600 }, { "epoch": 2.6313606573412893, "grad_norm": NaN, "learning_rate": 3.68426201283456e-05, "loss": 0.0, "step": 33625 }, { "epoch": 2.633317030225961, "grad_norm": NaN, "learning_rate": 3.683283768977931e-05, "loss": 0.0, "step": 33650 }, { "epoch": 2.6352734031106326, "grad_norm": NaN, "learning_rate": 3.6823055251213025e-05, "loss": 0.0, "step": 33675 }, { "epoch": 2.637229775995305, "grad_norm": NaN, "learning_rate": 3.6813272812646736e-05, "loss": 0.0, "step": 33700 }, { "epoch": 2.637229775995305, "eval_loss": NaN, "eval_runtime": 296.4199, "eval_samples_per_second": 405.894, "eval_steps_per_second": 6.342, "step": 33700 }, { "epoch": 2.6391861488799764, "grad_norm": NaN, "learning_rate": 3.680349037408045e-05, "loss": 0.0, "step": 33725 }, { "epoch": 2.6411425217646483, "grad_norm": NaN, "learning_rate": 3.6793707935514164e-05, "loss": 0.0, "step": 33750 }, { "epoch": 2.64309889464932, "grad_norm": NaN, "learning_rate": 3.678392549694788e-05, "loss": 0.0, "step": 33775 }, { "epoch": 2.645055267533992, "grad_norm": NaN, "learning_rate": 3.677414305838159e-05, "loss": 0.0, "step": 33800 }, { "epoch": 2.645055267533992, "eval_loss": NaN, "eval_runtime": 296.1025, "eval_samples_per_second": 406.329, "eval_steps_per_second": 6.349, "step": 33800 }, { "epoch": 2.647011640418664, "grad_norm": NaN, "learning_rate": 3.676436061981531e-05, "loss": 0.0, "step": 33825 }, { "epoch": 2.6489680133033358, "grad_norm": NaN, "learning_rate": 3.675457818124902e-05, "loss": 0.0, "step": 33850 }, { "epoch": 2.6509243861880076, "grad_norm": NaN, "learning_rate": 3.674479574268274e-05, "loss": 0.0, "step": 33875 }, { "epoch": 2.652880759072679, "grad_norm": NaN, "learning_rate": 3.673501330411645e-05, "loss": 0.0, "step": 33900 }, { "epoch": 2.652880759072679, "eval_loss": NaN, "eval_runtime": 298.269, "eval_samples_per_second": 403.378, "eval_steps_per_second": 6.303, "step": 33900 }, { "epoch": 2.654837131957351, "grad_norm": NaN, "learning_rate": 3.6725230865550166e-05, "loss": 0.0, "step": 33925 }, { "epoch": 2.656793504842023, "grad_norm": NaN, "learning_rate": 3.6715448426983876e-05, "loss": 0.0, "step": 33950 }, { "epoch": 2.6587498777266947, "grad_norm": NaN, "learning_rate": 3.6705665988417594e-05, "loss": 0.0, "step": 33975 }, { "epoch": 2.6607062506113666, "grad_norm": NaN, "learning_rate": 3.6695883549851305e-05, "loss": 0.0, "step": 34000 }, { "epoch": 2.6607062506113666, "eval_loss": NaN, "eval_runtime": 295.4105, "eval_samples_per_second": 407.281, "eval_steps_per_second": 6.364, "step": 34000 }, { "epoch": 2.6626626234960384, "grad_norm": NaN, "learning_rate": 3.668610111128502e-05, "loss": 0.0, "step": 34025 }, { "epoch": 2.6646189963807103, "grad_norm": NaN, "learning_rate": 3.667631867271873e-05, "loss": 0.0, "step": 34050 }, { "epoch": 2.6665753692653817, "grad_norm": NaN, "learning_rate": 3.666653623415245e-05, "loss": 0.0, "step": 34075 }, { "epoch": 2.668531742150054, "grad_norm": NaN, "learning_rate": 3.665675379558616e-05, "loss": 0.0, "step": 34100 }, { "epoch": 2.668531742150054, "eval_loss": NaN, "eval_runtime": 295.8003, "eval_samples_per_second": 406.744, "eval_steps_per_second": 6.356, "step": 34100 }, { "epoch": 2.6704881150347255, "grad_norm": NaN, "learning_rate": 3.664697135701988e-05, "loss": 0.0, "step": 34125 }, { "epoch": 2.6724444879193974, "grad_norm": NaN, "learning_rate": 3.6637188918453596e-05, "loss": 0.0, "step": 34150 }, { "epoch": 2.6744008608040692, "grad_norm": NaN, "learning_rate": 3.662740647988731e-05, "loss": 0.0, "step": 34175 }, { "epoch": 2.676357233688741, "grad_norm": NaN, "learning_rate": 3.6617624041321024e-05, "loss": 0.0, "step": 34200 }, { "epoch": 2.676357233688741, "eval_loss": NaN, "eval_runtime": 296.6534, "eval_samples_per_second": 405.574, "eval_steps_per_second": 6.337, "step": 34200 }, { "epoch": 2.678313606573413, "grad_norm": NaN, "learning_rate": 3.660784160275474e-05, "loss": 0.0, "step": 34225 }, { "epoch": 2.680269979458085, "grad_norm": NaN, "learning_rate": 3.659805916418845e-05, "loss": 0.0, "step": 34250 }, { "epoch": 2.6822263523427567, "grad_norm": NaN, "learning_rate": 3.658827672562217e-05, "loss": 0.0, "step": 34275 }, { "epoch": 2.684182725227428, "grad_norm": NaN, "learning_rate": 3.657849428705588e-05, "loss": 0.0, "step": 34300 }, { "epoch": 2.684182725227428, "eval_loss": NaN, "eval_runtime": 296.8048, "eval_samples_per_second": 405.367, "eval_steps_per_second": 6.334, "step": 34300 }, { "epoch": 2.6861390981121, "grad_norm": NaN, "learning_rate": 3.65687118484896e-05, "loss": 0.0, "step": 34325 }, { "epoch": 2.688095470996772, "grad_norm": NaN, "learning_rate": 3.655892940992331e-05, "loss": 0.0, "step": 34350 }, { "epoch": 2.690051843881444, "grad_norm": NaN, "learning_rate": 3.6549146971357026e-05, "loss": 0.0, "step": 34375 }, { "epoch": 2.6920082167661157, "grad_norm": NaN, "learning_rate": 3.653936453279074e-05, "loss": 0.0, "step": 34400 }, { "epoch": 2.6920082167661157, "eval_loss": NaN, "eval_runtime": 296.2122, "eval_samples_per_second": 406.178, "eval_steps_per_second": 6.347, "step": 34400 }, { "epoch": 2.6939645896507876, "grad_norm": NaN, "learning_rate": 3.6529582094224454e-05, "loss": 0.0, "step": 34425 }, { "epoch": 2.6959209625354594, "grad_norm": NaN, "learning_rate": 3.6519799655658165e-05, "loss": 0.0, "step": 34450 }, { "epoch": 2.697877335420131, "grad_norm": NaN, "learning_rate": 3.651001721709188e-05, "loss": 0.0, "step": 34475 }, { "epoch": 2.6998337083048027, "grad_norm": NaN, "learning_rate": 3.650023477852559e-05, "loss": 0.0, "step": 34500 }, { "epoch": 2.6998337083048027, "eval_loss": NaN, "eval_runtime": 296.2068, "eval_samples_per_second": 406.186, "eval_steps_per_second": 6.347, "step": 34500 }, { "epoch": 2.7017900811894746, "grad_norm": NaN, "learning_rate": 3.649045233995931e-05, "loss": 0.0, "step": 34525 }, { "epoch": 2.7037464540741465, "grad_norm": NaN, "learning_rate": 3.648066990139302e-05, "loss": 0.0, "step": 34550 }, { "epoch": 2.7057028269588184, "grad_norm": NaN, "learning_rate": 3.647088746282674e-05, "loss": 0.0, "step": 34575 }, { "epoch": 2.7076591998434902, "grad_norm": NaN, "learning_rate": 3.646110502426045e-05, "loss": 0.0, "step": 34600 }, { "epoch": 2.7076591998434902, "eval_loss": NaN, "eval_runtime": 297.4105, "eval_samples_per_second": 404.542, "eval_steps_per_second": 6.321, "step": 34600 }, { "epoch": 2.709615572728162, "grad_norm": NaN, "learning_rate": 3.645132258569417e-05, "loss": 0.0, "step": 34625 }, { "epoch": 2.7115719456128335, "grad_norm": NaN, "learning_rate": 3.644154014712788e-05, "loss": 0.0, "step": 34650 }, { "epoch": 2.713528318497506, "grad_norm": NaN, "learning_rate": 3.6431757708561595e-05, "loss": 0.0, "step": 34675 }, { "epoch": 2.7154846913821773, "grad_norm": NaN, "learning_rate": 3.6421975269995306e-05, "loss": 0.0, "step": 34700 }, { "epoch": 2.7154846913821773, "eval_loss": NaN, "eval_runtime": 295.9058, "eval_samples_per_second": 406.599, "eval_steps_per_second": 6.353, "step": 34700 }, { "epoch": 2.717441064266849, "grad_norm": NaN, "learning_rate": 3.641219283142902e-05, "loss": 0.0, "step": 34725 }, { "epoch": 2.719397437151521, "grad_norm": NaN, "learning_rate": 3.6402410392862734e-05, "loss": 0.0, "step": 34750 }, { "epoch": 2.721353810036193, "grad_norm": NaN, "learning_rate": 3.639262795429645e-05, "loss": 0.0, "step": 34775 }, { "epoch": 2.723310182920865, "grad_norm": NaN, "learning_rate": 3.638284551573016e-05, "loss": 0.0, "step": 34800 }, { "epoch": 2.723310182920865, "eval_loss": NaN, "eval_runtime": 297.3032, "eval_samples_per_second": 404.688, "eval_steps_per_second": 6.324, "step": 34800 }, { "epoch": 2.7252665558055367, "grad_norm": NaN, "learning_rate": 3.637306307716388e-05, "loss": 0.0, "step": 34825 }, { "epoch": 2.7272229286902085, "grad_norm": NaN, "learning_rate": 3.636328063859759e-05, "loss": 0.0, "step": 34850 }, { "epoch": 2.72917930157488, "grad_norm": NaN, "learning_rate": 3.635349820003131e-05, "loss": 0.0, "step": 34875 }, { "epoch": 2.731135674459552, "grad_norm": NaN, "learning_rate": 3.634371576146502e-05, "loss": 0.0, "step": 34900 }, { "epoch": 2.731135674459552, "eval_loss": NaN, "eval_runtime": 303.9441, "eval_samples_per_second": 395.846, "eval_steps_per_second": 6.185, "step": 34900 }, { "epoch": 2.7330920473442237, "grad_norm": NaN, "learning_rate": 3.6333933322898736e-05, "loss": 0.0, "step": 34925 }, { "epoch": 2.7350484202288956, "grad_norm": NaN, "learning_rate": 3.6324150884332446e-05, "loss": 0.0, "step": 34950 }, { "epoch": 2.7370047931135675, "grad_norm": NaN, "learning_rate": 3.6314368445766164e-05, "loss": 0.0, "step": 34975 }, { "epoch": 2.7389611659982394, "grad_norm": NaN, "learning_rate": 3.6304586007199875e-05, "loss": 0.0, "step": 35000 }, { "epoch": 2.7389611659982394, "eval_loss": NaN, "eval_runtime": 297.2476, "eval_samples_per_second": 404.764, "eval_steps_per_second": 6.325, "step": 35000 }, { "epoch": 2.7409175388829112, "grad_norm": NaN, "learning_rate": 3.629480356863359e-05, "loss": 0.0, "step": 35025 }, { "epoch": 2.7428739117675827, "grad_norm": NaN, "learning_rate": 3.62850211300673e-05, "loss": 0.0, "step": 35050 }, { "epoch": 2.744830284652255, "grad_norm": NaN, "learning_rate": 3.627523869150102e-05, "loss": 0.0, "step": 35075 }, { "epoch": 2.7467866575369264, "grad_norm": NaN, "learning_rate": 3.626545625293473e-05, "loss": 0.0, "step": 35100 }, { "epoch": 2.7467866575369264, "eval_loss": NaN, "eval_runtime": 297.6205, "eval_samples_per_second": 404.256, "eval_steps_per_second": 6.317, "step": 35100 }, { "epoch": 2.7487430304215983, "grad_norm": NaN, "learning_rate": 3.625567381436845e-05, "loss": 0.0, "step": 35125 }, { "epoch": 2.75069940330627, "grad_norm": NaN, "learning_rate": 3.624589137580216e-05, "loss": 0.0, "step": 35150 }, { "epoch": 2.752655776190942, "grad_norm": NaN, "learning_rate": 3.6236108937235877e-05, "loss": 0.0, "step": 35175 }, { "epoch": 2.754612149075614, "grad_norm": NaN, "learning_rate": 3.622632649866959e-05, "loss": 0.0, "step": 35200 }, { "epoch": 2.754612149075614, "eval_loss": NaN, "eval_runtime": 296.9984, "eval_samples_per_second": 405.103, "eval_steps_per_second": 6.33, "step": 35200 }, { "epoch": 2.756568521960286, "grad_norm": NaN, "learning_rate": 3.6216544060103305e-05, "loss": 0.0, "step": 35225 }, { "epoch": 2.7585248948449577, "grad_norm": NaN, "learning_rate": 3.6206761621537015e-05, "loss": 0.0, "step": 35250 }, { "epoch": 2.760481267729629, "grad_norm": NaN, "learning_rate": 3.619697918297073e-05, "loss": 0.0, "step": 35275 }, { "epoch": 2.762437640614301, "grad_norm": NaN, "learning_rate": 3.6187196744404444e-05, "loss": 0.0, "step": 35300 }, { "epoch": 2.762437640614301, "eval_loss": NaN, "eval_runtime": 296.7451, "eval_samples_per_second": 405.449, "eval_steps_per_second": 6.335, "step": 35300 } ], "logging_steps": 25, "max_steps": 127780, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.208614841106825e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }