{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999939379610938, "eval_steps": 300, "global_step": 20620, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00024248155624662798, "grad_norm": 0.53515625, "learning_rate": 2.4248302618816684e-07, "loss": 2.8874, "num_input_tokens_seen": 2621440, "step": 5 }, { "epoch": 0.00048496311249325596, "grad_norm": 0.5390625, "learning_rate": 4.849660523763337e-07, "loss": 2.8873, "num_input_tokens_seen": 5242880, "step": 10 }, { "epoch": 0.0007274446687398839, "grad_norm": 0.515625, "learning_rate": 7.274490785645005e-07, "loss": 2.8857, "num_input_tokens_seen": 7864320, "step": 15 }, { "epoch": 0.0009699262249865119, "grad_norm": 0.51953125, "learning_rate": 9.699321047526673e-07, "loss": 2.8831, "num_input_tokens_seen": 10485760, "step": 20 }, { "epoch": 0.00121240778123314, "grad_norm": 0.51171875, "learning_rate": 1.212415130940834e-06, "loss": 2.8732, "num_input_tokens_seen": 13107200, "step": 25 }, { "epoch": 0.0014548893374797679, "grad_norm": 0.51171875, "learning_rate": 1.454898157129001e-06, "loss": 2.8937, "num_input_tokens_seen": 15728640, "step": 30 }, { "epoch": 0.001697370893726396, "grad_norm": 0.51171875, "learning_rate": 1.697381183317168e-06, "loss": 2.879, "num_input_tokens_seen": 18350080, "step": 35 }, { "epoch": 0.0019398524499730238, "grad_norm": 0.4921875, "learning_rate": 1.9398642095053347e-06, "loss": 2.879, "num_input_tokens_seen": 20971520, "step": 40 }, { "epoch": 0.0021823340062196517, "grad_norm": 0.482421875, "learning_rate": 2.1823472356935016e-06, "loss": 2.8919, "num_input_tokens_seen": 23592960, "step": 45 }, { "epoch": 0.00242481556246628, "grad_norm": 0.474609375, "learning_rate": 2.424830261881668e-06, "loss": 2.8731, "num_input_tokens_seen": 26214400, "step": 50 }, { "epoch": 0.002667297118712908, "grad_norm": 0.46484375, "learning_rate": 2.667313288069835e-06, "loss": 2.8727, "num_input_tokens_seen": 28835840, "step": 55 }, { "epoch": 0.0029097786749595358, "grad_norm": 0.44921875, "learning_rate": 2.909796314258002e-06, "loss": 2.8773, "num_input_tokens_seen": 31457280, "step": 60 }, { "epoch": 0.003152260231206164, "grad_norm": 0.435546875, "learning_rate": 3.152279340446169e-06, "loss": 2.876, "num_input_tokens_seen": 34078720, "step": 65 }, { "epoch": 0.003394741787452792, "grad_norm": 0.4375, "learning_rate": 3.394762366634336e-06, "loss": 2.8747, "num_input_tokens_seen": 36700160, "step": 70 }, { "epoch": 0.00363722334369942, "grad_norm": 0.42578125, "learning_rate": 3.6372453928225025e-06, "loss": 2.8939, "num_input_tokens_seen": 39321600, "step": 75 }, { "epoch": 0.0038797048999460477, "grad_norm": 0.3984375, "learning_rate": 3.879728419010669e-06, "loss": 2.8853, "num_input_tokens_seen": 41943040, "step": 80 }, { "epoch": 0.0041221864561926756, "grad_norm": 0.408203125, "learning_rate": 4.122211445198836e-06, "loss": 2.8875, "num_input_tokens_seen": 44564480, "step": 85 }, { "epoch": 0.004364668012439303, "grad_norm": 0.396484375, "learning_rate": 4.364694471387003e-06, "loss": 2.8808, "num_input_tokens_seen": 47185920, "step": 90 }, { "epoch": 0.004607149568685932, "grad_norm": 0.380859375, "learning_rate": 4.60717749757517e-06, "loss": 2.871, "num_input_tokens_seen": 49807360, "step": 95 }, { "epoch": 0.00484963112493256, "grad_norm": 0.369140625, "learning_rate": 4.849660523763336e-06, "loss": 2.881, "num_input_tokens_seen": 52428800, "step": 100 }, { "epoch": 0.005092112681179188, "grad_norm": 0.353515625, "learning_rate": 5.092143549951504e-06, "loss": 2.8749, "num_input_tokens_seen": 55050240, "step": 105 }, { "epoch": 0.005334594237425816, "grad_norm": 0.33984375, "learning_rate": 5.33462657613967e-06, "loss": 2.8786, "num_input_tokens_seen": 57671680, "step": 110 }, { "epoch": 0.005577075793672444, "grad_norm": 0.333984375, "learning_rate": 5.5771096023278376e-06, "loss": 2.8811, "num_input_tokens_seen": 60293120, "step": 115 }, { "epoch": 0.0058195573499190715, "grad_norm": 0.3125, "learning_rate": 5.819592628516004e-06, "loss": 2.8714, "num_input_tokens_seen": 62914560, "step": 120 }, { "epoch": 0.006062038906165699, "grad_norm": 0.306640625, "learning_rate": 6.0620756547041715e-06, "loss": 2.8902, "num_input_tokens_seen": 65536000, "step": 125 }, { "epoch": 0.006304520462412328, "grad_norm": 0.2890625, "learning_rate": 6.304558680892338e-06, "loss": 2.8573, "num_input_tokens_seen": 68157440, "step": 130 }, { "epoch": 0.006547002018658956, "grad_norm": 0.29296875, "learning_rate": 6.5470417070805045e-06, "loss": 2.8806, "num_input_tokens_seen": 70778880, "step": 135 }, { "epoch": 0.006789483574905584, "grad_norm": 0.287109375, "learning_rate": 6.789524733268672e-06, "loss": 2.8807, "num_input_tokens_seen": 73400320, "step": 140 }, { "epoch": 0.007031965131152212, "grad_norm": 0.271484375, "learning_rate": 7.0320077594568375e-06, "loss": 2.8691, "num_input_tokens_seen": 76021760, "step": 145 }, { "epoch": 0.00727444668739884, "grad_norm": 0.267578125, "learning_rate": 7.274490785645005e-06, "loss": 2.8789, "num_input_tokens_seen": 78643200, "step": 150 }, { "epoch": 0.0075169282436454675, "grad_norm": 0.265625, "learning_rate": 7.516973811833172e-06, "loss": 2.8767, "num_input_tokens_seen": 81264640, "step": 155 }, { "epoch": 0.007759409799892095, "grad_norm": 0.2734375, "learning_rate": 7.759456838021339e-06, "loss": 2.8645, "num_input_tokens_seen": 83886080, "step": 160 }, { "epoch": 0.008001891356138723, "grad_norm": 0.263671875, "learning_rate": 8.001939864209505e-06, "loss": 2.8634, "num_input_tokens_seen": 86507520, "step": 165 }, { "epoch": 0.008244372912385351, "grad_norm": 0.255859375, "learning_rate": 8.244422890397672e-06, "loss": 2.8609, "num_input_tokens_seen": 89128960, "step": 170 }, { "epoch": 0.008486854468631979, "grad_norm": 0.259765625, "learning_rate": 8.48690591658584e-06, "loss": 2.8685, "num_input_tokens_seen": 91750400, "step": 175 }, { "epoch": 0.008729336024878607, "grad_norm": 0.25, "learning_rate": 8.729388942774007e-06, "loss": 2.8798, "num_input_tokens_seen": 94371840, "step": 180 }, { "epoch": 0.008971817581125236, "grad_norm": 0.255859375, "learning_rate": 8.971871968962173e-06, "loss": 2.8717, "num_input_tokens_seen": 96993280, "step": 185 }, { "epoch": 0.009214299137371864, "grad_norm": 0.25, "learning_rate": 9.21435499515034e-06, "loss": 2.8696, "num_input_tokens_seen": 99614720, "step": 190 }, { "epoch": 0.009456780693618492, "grad_norm": 0.2490234375, "learning_rate": 9.456838021338508e-06, "loss": 2.8578, "num_input_tokens_seen": 102236160, "step": 195 }, { "epoch": 0.00969926224986512, "grad_norm": 0.2490234375, "learning_rate": 9.699321047526673e-06, "loss": 2.8771, "num_input_tokens_seen": 104857600, "step": 200 }, { "epoch": 0.009941743806111748, "grad_norm": 0.2421875, "learning_rate": 9.941804073714841e-06, "loss": 2.8708, "num_input_tokens_seen": 107479040, "step": 205 }, { "epoch": 0.010184225362358376, "grad_norm": 0.248046875, "learning_rate": 1.0184287099903007e-05, "loss": 2.8761, "num_input_tokens_seen": 110100480, "step": 210 }, { "epoch": 0.010426706918605004, "grad_norm": 0.2392578125, "learning_rate": 1.0426770126091174e-05, "loss": 2.863, "num_input_tokens_seen": 112721920, "step": 215 }, { "epoch": 0.010669188474851632, "grad_norm": 0.2470703125, "learning_rate": 1.066925315227934e-05, "loss": 2.8727, "num_input_tokens_seen": 115343360, "step": 220 }, { "epoch": 0.01091167003109826, "grad_norm": 0.2412109375, "learning_rate": 1.0911736178467507e-05, "loss": 2.845, "num_input_tokens_seen": 117964800, "step": 225 }, { "epoch": 0.011154151587344887, "grad_norm": 0.248046875, "learning_rate": 1.1154219204655675e-05, "loss": 2.8734, "num_input_tokens_seen": 120586240, "step": 230 }, { "epoch": 0.011396633143591515, "grad_norm": 0.24609375, "learning_rate": 1.1396702230843842e-05, "loss": 2.8739, "num_input_tokens_seen": 123207680, "step": 235 }, { "epoch": 0.011639114699838143, "grad_norm": 0.2451171875, "learning_rate": 1.1639185257032008e-05, "loss": 2.8528, "num_input_tokens_seen": 125829120, "step": 240 }, { "epoch": 0.011881596256084771, "grad_norm": 0.2470703125, "learning_rate": 1.1881668283220175e-05, "loss": 2.8602, "num_input_tokens_seen": 128450560, "step": 245 }, { "epoch": 0.012124077812331399, "grad_norm": 0.2421875, "learning_rate": 1.2124151309408343e-05, "loss": 2.8657, "num_input_tokens_seen": 131072000, "step": 250 }, { "epoch": 0.012366559368578028, "grad_norm": 0.2431640625, "learning_rate": 1.236663433559651e-05, "loss": 2.8519, "num_input_tokens_seen": 133693440, "step": 255 }, { "epoch": 0.012609040924824656, "grad_norm": 0.2412109375, "learning_rate": 1.2609117361784676e-05, "loss": 2.8603, "num_input_tokens_seen": 136314880, "step": 260 }, { "epoch": 0.012851522481071284, "grad_norm": 0.2392578125, "learning_rate": 1.2851600387972842e-05, "loss": 2.8633, "num_input_tokens_seen": 138936320, "step": 265 }, { "epoch": 0.013094004037317912, "grad_norm": 0.2451171875, "learning_rate": 1.3094083414161009e-05, "loss": 2.8777, "num_input_tokens_seen": 141557760, "step": 270 }, { "epoch": 0.01333648559356454, "grad_norm": 0.2353515625, "learning_rate": 1.3336566440349177e-05, "loss": 2.8497, "num_input_tokens_seen": 144179200, "step": 275 }, { "epoch": 0.013578967149811168, "grad_norm": 0.2373046875, "learning_rate": 1.3579049466537344e-05, "loss": 2.8623, "num_input_tokens_seen": 146800640, "step": 280 }, { "epoch": 0.013821448706057796, "grad_norm": 0.2392578125, "learning_rate": 1.3821532492725509e-05, "loss": 2.8706, "num_input_tokens_seen": 149422080, "step": 285 }, { "epoch": 0.014063930262304424, "grad_norm": 0.2412109375, "learning_rate": 1.4064015518913675e-05, "loss": 2.8755, "num_input_tokens_seen": 152043520, "step": 290 }, { "epoch": 0.014306411818551051, "grad_norm": 0.236328125, "learning_rate": 1.4306498545101843e-05, "loss": 2.8671, "num_input_tokens_seen": 154664960, "step": 295 }, { "epoch": 0.01454889337479768, "grad_norm": 0.2431640625, "learning_rate": 1.454898157129001e-05, "loss": 2.8567, "num_input_tokens_seen": 157286400, "step": 300 }, { "epoch": 0.01454889337479768, "eval_accuracy": 0.4449780166096727, "eval_loss": 2.829090118408203, "eval_runtime": 5.8559, "eval_samples_per_second": 51.231, "eval_steps_per_second": 6.489, "num_input_tokens_seen": 157286400, "step": 300 }, { "epoch": 0.014791374931044307, "grad_norm": 0.2392578125, "learning_rate": 1.4791464597478178e-05, "loss": 2.8608, "num_input_tokens_seen": 159907840, "step": 305 }, { "epoch": 0.015033856487290935, "grad_norm": 0.240234375, "learning_rate": 1.5033947623666345e-05, "loss": 2.8402, "num_input_tokens_seen": 162529280, "step": 310 }, { "epoch": 0.015276338043537563, "grad_norm": 0.244140625, "learning_rate": 1.5276430649854513e-05, "loss": 2.8556, "num_input_tokens_seen": 165150720, "step": 315 }, { "epoch": 0.01551881959978419, "grad_norm": 0.2412109375, "learning_rate": 1.5518913676042678e-05, "loss": 2.8615, "num_input_tokens_seen": 167772160, "step": 320 }, { "epoch": 0.01576130115603082, "grad_norm": 0.2412109375, "learning_rate": 1.5761396702230842e-05, "loss": 2.8763, "num_input_tokens_seen": 170393600, "step": 325 }, { "epoch": 0.016003782712277446, "grad_norm": 0.2412109375, "learning_rate": 1.600387972841901e-05, "loss": 2.8641, "num_input_tokens_seen": 173015040, "step": 330 }, { "epoch": 0.016246264268524074, "grad_norm": 0.2392578125, "learning_rate": 1.624636275460718e-05, "loss": 2.8674, "num_input_tokens_seen": 175636480, "step": 335 }, { "epoch": 0.016488745824770702, "grad_norm": 0.2392578125, "learning_rate": 1.6488845780795344e-05, "loss": 2.8575, "num_input_tokens_seen": 178257920, "step": 340 }, { "epoch": 0.01673122738101733, "grad_norm": 0.240234375, "learning_rate": 1.6731328806983512e-05, "loss": 2.8661, "num_input_tokens_seen": 180879360, "step": 345 }, { "epoch": 0.016973708937263958, "grad_norm": 0.23828125, "learning_rate": 1.697381183317168e-05, "loss": 2.861, "num_input_tokens_seen": 183500800, "step": 350 }, { "epoch": 0.017216190493510586, "grad_norm": 0.23828125, "learning_rate": 1.7216294859359848e-05, "loss": 2.8666, "num_input_tokens_seen": 186122240, "step": 355 }, { "epoch": 0.017458672049757214, "grad_norm": 0.2353515625, "learning_rate": 1.7458777885548013e-05, "loss": 2.8575, "num_input_tokens_seen": 188743680, "step": 360 }, { "epoch": 0.017701153606003845, "grad_norm": 0.2490234375, "learning_rate": 1.7701260911736178e-05, "loss": 2.8656, "num_input_tokens_seen": 191365120, "step": 365 }, { "epoch": 0.017943635162250473, "grad_norm": 0.236328125, "learning_rate": 1.7943743937924346e-05, "loss": 2.8487, "num_input_tokens_seen": 193986560, "step": 370 }, { "epoch": 0.0181861167184971, "grad_norm": 0.244140625, "learning_rate": 1.8186226964112514e-05, "loss": 2.8808, "num_input_tokens_seen": 196608000, "step": 375 }, { "epoch": 0.01842859827474373, "grad_norm": 0.244140625, "learning_rate": 1.842870999030068e-05, "loss": 2.8551, "num_input_tokens_seen": 199229440, "step": 380 }, { "epoch": 0.018671079830990357, "grad_norm": 0.2353515625, "learning_rate": 1.8671193016488847e-05, "loss": 2.8548, "num_input_tokens_seen": 201850880, "step": 385 }, { "epoch": 0.018913561387236984, "grad_norm": 0.2431640625, "learning_rate": 1.8913676042677016e-05, "loss": 2.8544, "num_input_tokens_seen": 204472320, "step": 390 }, { "epoch": 0.019156042943483612, "grad_norm": 0.2392578125, "learning_rate": 1.915615906886518e-05, "loss": 2.8602, "num_input_tokens_seen": 207093760, "step": 395 }, { "epoch": 0.01939852449973024, "grad_norm": 0.2392578125, "learning_rate": 1.9398642095053345e-05, "loss": 2.8397, "num_input_tokens_seen": 209715200, "step": 400 }, { "epoch": 0.019641006055976868, "grad_norm": 0.24609375, "learning_rate": 1.9641125121241513e-05, "loss": 2.8642, "num_input_tokens_seen": 212336640, "step": 405 }, { "epoch": 0.019883487612223496, "grad_norm": 0.236328125, "learning_rate": 1.9883608147429682e-05, "loss": 2.8498, "num_input_tokens_seen": 214958080, "step": 410 }, { "epoch": 0.020125969168470124, "grad_norm": 0.25, "learning_rate": 2.0126091173617847e-05, "loss": 2.8676, "num_input_tokens_seen": 217579520, "step": 415 }, { "epoch": 0.02036845072471675, "grad_norm": 0.2314453125, "learning_rate": 2.0368574199806015e-05, "loss": 2.8698, "num_input_tokens_seen": 220200960, "step": 420 }, { "epoch": 0.02061093228096338, "grad_norm": 0.236328125, "learning_rate": 2.0611057225994183e-05, "loss": 2.8542, "num_input_tokens_seen": 222822400, "step": 425 }, { "epoch": 0.020853413837210007, "grad_norm": 0.236328125, "learning_rate": 2.0853540252182348e-05, "loss": 2.8501, "num_input_tokens_seen": 225443840, "step": 430 }, { "epoch": 0.021095895393456635, "grad_norm": 0.240234375, "learning_rate": 2.1096023278370516e-05, "loss": 2.8506, "num_input_tokens_seen": 228065280, "step": 435 }, { "epoch": 0.021338376949703263, "grad_norm": 0.2392578125, "learning_rate": 2.133850630455868e-05, "loss": 2.8552, "num_input_tokens_seen": 230686720, "step": 440 }, { "epoch": 0.02158085850594989, "grad_norm": 0.2431640625, "learning_rate": 2.158098933074685e-05, "loss": 2.8736, "num_input_tokens_seen": 233308160, "step": 445 }, { "epoch": 0.02182334006219652, "grad_norm": 0.248046875, "learning_rate": 2.1823472356935014e-05, "loss": 2.8596, "num_input_tokens_seen": 235929600, "step": 450 }, { "epoch": 0.022065821618443147, "grad_norm": 0.2392578125, "learning_rate": 2.2065955383123182e-05, "loss": 2.8558, "num_input_tokens_seen": 238551040, "step": 455 }, { "epoch": 0.022308303174689775, "grad_norm": 0.2392578125, "learning_rate": 2.230843840931135e-05, "loss": 2.8413, "num_input_tokens_seen": 241172480, "step": 460 }, { "epoch": 0.022550784730936402, "grad_norm": 0.2421875, "learning_rate": 2.255092143549952e-05, "loss": 2.8529, "num_input_tokens_seen": 243793920, "step": 465 }, { "epoch": 0.02279326628718303, "grad_norm": 0.2353515625, "learning_rate": 2.2793404461687683e-05, "loss": 2.8531, "num_input_tokens_seen": 246415360, "step": 470 }, { "epoch": 0.023035747843429658, "grad_norm": 0.248046875, "learning_rate": 2.3035887487875848e-05, "loss": 2.8582, "num_input_tokens_seen": 249036800, "step": 475 }, { "epoch": 0.023278229399676286, "grad_norm": 0.248046875, "learning_rate": 2.3278370514064016e-05, "loss": 2.8544, "num_input_tokens_seen": 251658240, "step": 480 }, { "epoch": 0.023520710955922914, "grad_norm": 0.2392578125, "learning_rate": 2.3520853540252185e-05, "loss": 2.8485, "num_input_tokens_seen": 254279680, "step": 485 }, { "epoch": 0.023763192512169542, "grad_norm": 0.2392578125, "learning_rate": 2.376333656644035e-05, "loss": 2.8555, "num_input_tokens_seen": 256901120, "step": 490 }, { "epoch": 0.02400567406841617, "grad_norm": 0.2470703125, "learning_rate": 2.4005819592628518e-05, "loss": 2.851, "num_input_tokens_seen": 259522560, "step": 495 }, { "epoch": 0.024248155624662798, "grad_norm": 0.2421875, "learning_rate": 2.4248302618816686e-05, "loss": 2.859, "num_input_tokens_seen": 262144000, "step": 500 }, { "epoch": 0.024490637180909425, "grad_norm": 0.2353515625, "learning_rate": 2.449078564500485e-05, "loss": 2.8565, "num_input_tokens_seen": 264765440, "step": 505 }, { "epoch": 0.024733118737156057, "grad_norm": 0.24609375, "learning_rate": 2.473326867119302e-05, "loss": 2.8593, "num_input_tokens_seen": 267386880, "step": 510 }, { "epoch": 0.024975600293402685, "grad_norm": 0.240234375, "learning_rate": 2.4975751697381184e-05, "loss": 2.8371, "num_input_tokens_seen": 270008320, "step": 515 }, { "epoch": 0.025218081849649313, "grad_norm": 0.236328125, "learning_rate": 2.5218234723569352e-05, "loss": 2.8615, "num_input_tokens_seen": 272629760, "step": 520 }, { "epoch": 0.02546056340589594, "grad_norm": 0.240234375, "learning_rate": 2.5460717749757517e-05, "loss": 2.8606, "num_input_tokens_seen": 275251200, "step": 525 }, { "epoch": 0.02570304496214257, "grad_norm": 0.2412109375, "learning_rate": 2.5703200775945685e-05, "loss": 2.8607, "num_input_tokens_seen": 277872640, "step": 530 }, { "epoch": 0.025945526518389196, "grad_norm": 0.2470703125, "learning_rate": 2.5945683802133853e-05, "loss": 2.8498, "num_input_tokens_seen": 280494080, "step": 535 }, { "epoch": 0.026188008074635824, "grad_norm": 0.232421875, "learning_rate": 2.6188166828322018e-05, "loss": 2.8578, "num_input_tokens_seen": 283115520, "step": 540 }, { "epoch": 0.026430489630882452, "grad_norm": 0.236328125, "learning_rate": 2.6430649854510186e-05, "loss": 2.858, "num_input_tokens_seen": 285736960, "step": 545 }, { "epoch": 0.02667297118712908, "grad_norm": 0.2421875, "learning_rate": 2.6673132880698354e-05, "loss": 2.8645, "num_input_tokens_seen": 288358400, "step": 550 }, { "epoch": 0.026915452743375708, "grad_norm": 0.2373046875, "learning_rate": 2.6915615906886523e-05, "loss": 2.8586, "num_input_tokens_seen": 290979840, "step": 555 }, { "epoch": 0.027157934299622336, "grad_norm": 0.2392578125, "learning_rate": 2.7158098933074687e-05, "loss": 2.8533, "num_input_tokens_seen": 293601280, "step": 560 }, { "epoch": 0.027400415855868963, "grad_norm": 0.2373046875, "learning_rate": 2.7400581959262856e-05, "loss": 2.8527, "num_input_tokens_seen": 296222720, "step": 565 }, { "epoch": 0.02764289741211559, "grad_norm": 0.2490234375, "learning_rate": 2.7643064985451017e-05, "loss": 2.8535, "num_input_tokens_seen": 298844160, "step": 570 }, { "epoch": 0.02788537896836222, "grad_norm": 0.2451171875, "learning_rate": 2.7885548011639185e-05, "loss": 2.8571, "num_input_tokens_seen": 301465600, "step": 575 }, { "epoch": 0.028127860524608847, "grad_norm": 0.236328125, "learning_rate": 2.812803103782735e-05, "loss": 2.8551, "num_input_tokens_seen": 304087040, "step": 580 }, { "epoch": 0.028370342080855475, "grad_norm": 0.236328125, "learning_rate": 2.837051406401552e-05, "loss": 2.8241, "num_input_tokens_seen": 306708480, "step": 585 }, { "epoch": 0.028612823637102103, "grad_norm": 0.2421875, "learning_rate": 2.8612997090203687e-05, "loss": 2.8533, "num_input_tokens_seen": 309329920, "step": 590 }, { "epoch": 0.02885530519334873, "grad_norm": 0.2421875, "learning_rate": 2.8855480116391855e-05, "loss": 2.8432, "num_input_tokens_seen": 311951360, "step": 595 }, { "epoch": 0.02909778674959536, "grad_norm": 0.2373046875, "learning_rate": 2.909796314258002e-05, "loss": 2.8517, "num_input_tokens_seen": 314572800, "step": 600 }, { "epoch": 0.02909778674959536, "eval_accuracy": 0.44654942191825436, "eval_loss": 2.81530499458313, "eval_runtime": 5.7832, "eval_samples_per_second": 51.874, "eval_steps_per_second": 6.571, "num_input_tokens_seen": 314572800, "step": 600 }, { "epoch": 0.029340268305841986, "grad_norm": 0.248046875, "learning_rate": 2.9340446168768188e-05, "loss": 2.8588, "num_input_tokens_seen": 317194240, "step": 605 }, { "epoch": 0.029582749862088614, "grad_norm": 0.2431640625, "learning_rate": 2.9582929194956356e-05, "loss": 2.8523, "num_input_tokens_seen": 319815680, "step": 610 }, { "epoch": 0.029825231418335242, "grad_norm": 0.23828125, "learning_rate": 2.982541222114452e-05, "loss": 2.8448, "num_input_tokens_seen": 322437120, "step": 615 }, { "epoch": 0.03006771297458187, "grad_norm": 0.23828125, "learning_rate": 3.006789524733269e-05, "loss": 2.843, "num_input_tokens_seen": 325058560, "step": 620 }, { "epoch": 0.030310194530828498, "grad_norm": 0.236328125, "learning_rate": 3.0310378273520857e-05, "loss": 2.8493, "num_input_tokens_seen": 327680000, "step": 625 }, { "epoch": 0.030552676087075126, "grad_norm": 0.2431640625, "learning_rate": 3.0552861299709026e-05, "loss": 2.8579, "num_input_tokens_seen": 330301440, "step": 630 }, { "epoch": 0.030795157643321754, "grad_norm": 0.2421875, "learning_rate": 3.0795344325897194e-05, "loss": 2.8366, "num_input_tokens_seen": 332922880, "step": 635 }, { "epoch": 0.03103763919956838, "grad_norm": 0.24609375, "learning_rate": 3.1037827352085355e-05, "loss": 2.8389, "num_input_tokens_seen": 335544320, "step": 640 }, { "epoch": 0.03128012075581501, "grad_norm": 0.2431640625, "learning_rate": 3.1280310378273517e-05, "loss": 2.8449, "num_input_tokens_seen": 338165760, "step": 645 }, { "epoch": 0.03152260231206164, "grad_norm": 0.2373046875, "learning_rate": 3.1522793404461685e-05, "loss": 2.8578, "num_input_tokens_seen": 340787200, "step": 650 }, { "epoch": 0.031765083868308265, "grad_norm": 0.2470703125, "learning_rate": 3.176527643064985e-05, "loss": 2.8494, "num_input_tokens_seen": 343408640, "step": 655 }, { "epoch": 0.03200756542455489, "grad_norm": 0.2431640625, "learning_rate": 3.200775945683802e-05, "loss": 2.8448, "num_input_tokens_seen": 346030080, "step": 660 }, { "epoch": 0.03225004698080152, "grad_norm": 0.248046875, "learning_rate": 3.225024248302619e-05, "loss": 2.8583, "num_input_tokens_seen": 348651520, "step": 665 }, { "epoch": 0.03249252853704815, "grad_norm": 0.234375, "learning_rate": 3.249272550921436e-05, "loss": 2.8622, "num_input_tokens_seen": 351272960, "step": 670 }, { "epoch": 0.03273501009329478, "grad_norm": 0.25390625, "learning_rate": 3.2735208535402526e-05, "loss": 2.8409, "num_input_tokens_seen": 353894400, "step": 675 }, { "epoch": 0.032977491649541404, "grad_norm": 0.251953125, "learning_rate": 3.297769156159069e-05, "loss": 2.8468, "num_input_tokens_seen": 356515840, "step": 680 }, { "epoch": 0.03321997320578803, "grad_norm": 0.248046875, "learning_rate": 3.3220174587778856e-05, "loss": 2.8385, "num_input_tokens_seen": 359137280, "step": 685 }, { "epoch": 0.03346245476203466, "grad_norm": 0.2578125, "learning_rate": 3.3462657613967024e-05, "loss": 2.8417, "num_input_tokens_seen": 361758720, "step": 690 }, { "epoch": 0.03370493631828129, "grad_norm": 0.251953125, "learning_rate": 3.370514064015519e-05, "loss": 2.8397, "num_input_tokens_seen": 364380160, "step": 695 }, { "epoch": 0.033947417874527916, "grad_norm": 0.234375, "learning_rate": 3.394762366634336e-05, "loss": 2.844, "num_input_tokens_seen": 367001600, "step": 700 }, { "epoch": 0.034189899430774544, "grad_norm": 0.2451171875, "learning_rate": 3.419010669253153e-05, "loss": 2.8459, "num_input_tokens_seen": 369623040, "step": 705 }, { "epoch": 0.03443238098702117, "grad_norm": 0.24609375, "learning_rate": 3.4432589718719697e-05, "loss": 2.8481, "num_input_tokens_seen": 372244480, "step": 710 }, { "epoch": 0.0346748625432678, "grad_norm": 0.2470703125, "learning_rate": 3.467507274490786e-05, "loss": 2.8362, "num_input_tokens_seen": 374865920, "step": 715 }, { "epoch": 0.03491734409951443, "grad_norm": 0.2412109375, "learning_rate": 3.4917555771096026e-05, "loss": 2.8333, "num_input_tokens_seen": 377487360, "step": 720 }, { "epoch": 0.035159825655761055, "grad_norm": 0.2412109375, "learning_rate": 3.516003879728419e-05, "loss": 2.844, "num_input_tokens_seen": 380108800, "step": 725 }, { "epoch": 0.03540230721200769, "grad_norm": 0.2392578125, "learning_rate": 3.5402521823472356e-05, "loss": 2.8341, "num_input_tokens_seen": 382730240, "step": 730 }, { "epoch": 0.03564478876825432, "grad_norm": 0.236328125, "learning_rate": 3.5645004849660524e-05, "loss": 2.8418, "num_input_tokens_seen": 385351680, "step": 735 }, { "epoch": 0.035887270324500946, "grad_norm": 0.2421875, "learning_rate": 3.588748787584869e-05, "loss": 2.8405, "num_input_tokens_seen": 387973120, "step": 740 }, { "epoch": 0.036129751880747574, "grad_norm": 0.2412109375, "learning_rate": 3.612997090203686e-05, "loss": 2.8324, "num_input_tokens_seen": 390594560, "step": 745 }, { "epoch": 0.0363722334369942, "grad_norm": 0.2412109375, "learning_rate": 3.637245392822503e-05, "loss": 2.8495, "num_input_tokens_seen": 393216000, "step": 750 }, { "epoch": 0.03661471499324083, "grad_norm": 0.251953125, "learning_rate": 3.661493695441319e-05, "loss": 2.8399, "num_input_tokens_seen": 395837440, "step": 755 }, { "epoch": 0.03685719654948746, "grad_norm": 0.2470703125, "learning_rate": 3.685741998060136e-05, "loss": 2.8427, "num_input_tokens_seen": 398458880, "step": 760 }, { "epoch": 0.037099678105734085, "grad_norm": 0.25390625, "learning_rate": 3.709990300678953e-05, "loss": 2.8604, "num_input_tokens_seen": 401080320, "step": 765 }, { "epoch": 0.03734215966198071, "grad_norm": 0.2451171875, "learning_rate": 3.7342386032977695e-05, "loss": 2.8341, "num_input_tokens_seen": 403701760, "step": 770 }, { "epoch": 0.03758464121822734, "grad_norm": 0.2333984375, "learning_rate": 3.758486905916586e-05, "loss": 2.8477, "num_input_tokens_seen": 406323200, "step": 775 }, { "epoch": 0.03782712277447397, "grad_norm": 0.2373046875, "learning_rate": 3.782735208535403e-05, "loss": 2.8507, "num_input_tokens_seen": 408944640, "step": 780 }, { "epoch": 0.0380696043307206, "grad_norm": 0.251953125, "learning_rate": 3.806983511154219e-05, "loss": 2.8484, "num_input_tokens_seen": 411566080, "step": 785 }, { "epoch": 0.038312085886967225, "grad_norm": 0.25, "learning_rate": 3.831231813773036e-05, "loss": 2.833, "num_input_tokens_seen": 414187520, "step": 790 }, { "epoch": 0.03855456744321385, "grad_norm": 0.2392578125, "learning_rate": 3.855480116391853e-05, "loss": 2.8401, "num_input_tokens_seen": 416808960, "step": 795 }, { "epoch": 0.03879704899946048, "grad_norm": 0.240234375, "learning_rate": 3.879728419010669e-05, "loss": 2.8459, "num_input_tokens_seen": 419430400, "step": 800 }, { "epoch": 0.03903953055570711, "grad_norm": 0.25390625, "learning_rate": 3.903976721629486e-05, "loss": 2.8508, "num_input_tokens_seen": 422051840, "step": 805 }, { "epoch": 0.039282012111953736, "grad_norm": 0.240234375, "learning_rate": 3.928225024248303e-05, "loss": 2.8379, "num_input_tokens_seen": 424673280, "step": 810 }, { "epoch": 0.039524493668200364, "grad_norm": 0.248046875, "learning_rate": 3.9524733268671195e-05, "loss": 2.826, "num_input_tokens_seen": 427294720, "step": 815 }, { "epoch": 0.03976697522444699, "grad_norm": 0.244140625, "learning_rate": 3.9767216294859363e-05, "loss": 2.837, "num_input_tokens_seen": 429916160, "step": 820 }, { "epoch": 0.04000945678069362, "grad_norm": 0.2470703125, "learning_rate": 4.0009699321047525e-05, "loss": 2.8427, "num_input_tokens_seen": 432537600, "step": 825 }, { "epoch": 0.04025193833694025, "grad_norm": 0.240234375, "learning_rate": 4.025218234723569e-05, "loss": 2.8387, "num_input_tokens_seen": 435159040, "step": 830 }, { "epoch": 0.040494419893186875, "grad_norm": 0.248046875, "learning_rate": 4.049466537342386e-05, "loss": 2.8321, "num_input_tokens_seen": 437780480, "step": 835 }, { "epoch": 0.0407369014494335, "grad_norm": 0.25, "learning_rate": 4.073714839961203e-05, "loss": 2.8436, "num_input_tokens_seen": 440401920, "step": 840 }, { "epoch": 0.04097938300568013, "grad_norm": 0.248046875, "learning_rate": 4.09796314258002e-05, "loss": 2.8422, "num_input_tokens_seen": 443023360, "step": 845 }, { "epoch": 0.04122186456192676, "grad_norm": 0.2490234375, "learning_rate": 4.1222114451988366e-05, "loss": 2.8361, "num_input_tokens_seen": 445644800, "step": 850 }, { "epoch": 0.04146434611817339, "grad_norm": 0.236328125, "learning_rate": 4.1464597478176534e-05, "loss": 2.8283, "num_input_tokens_seen": 448266240, "step": 855 }, { "epoch": 0.041706827674420015, "grad_norm": 0.2578125, "learning_rate": 4.1707080504364696e-05, "loss": 2.8344, "num_input_tokens_seen": 450887680, "step": 860 }, { "epoch": 0.04194930923066664, "grad_norm": 0.2421875, "learning_rate": 4.1949563530552864e-05, "loss": 2.8247, "num_input_tokens_seen": 453509120, "step": 865 }, { "epoch": 0.04219179078691327, "grad_norm": 0.244140625, "learning_rate": 4.219204655674103e-05, "loss": 2.8487, "num_input_tokens_seen": 456130560, "step": 870 }, { "epoch": 0.0424342723431599, "grad_norm": 0.23828125, "learning_rate": 4.2434529582929193e-05, "loss": 2.8383, "num_input_tokens_seen": 458752000, "step": 875 }, { "epoch": 0.042676753899406526, "grad_norm": 0.251953125, "learning_rate": 4.267701260911736e-05, "loss": 2.839, "num_input_tokens_seen": 461373440, "step": 880 }, { "epoch": 0.042919235455653154, "grad_norm": 0.24609375, "learning_rate": 4.291949563530553e-05, "loss": 2.8285, "num_input_tokens_seen": 463994880, "step": 885 }, { "epoch": 0.04316171701189978, "grad_norm": 0.244140625, "learning_rate": 4.31619786614937e-05, "loss": 2.8289, "num_input_tokens_seen": 466616320, "step": 890 }, { "epoch": 0.04340419856814641, "grad_norm": 0.2490234375, "learning_rate": 4.3404461687681866e-05, "loss": 2.847, "num_input_tokens_seen": 469237760, "step": 895 }, { "epoch": 0.04364668012439304, "grad_norm": 0.2451171875, "learning_rate": 4.364694471387003e-05, "loss": 2.8224, "num_input_tokens_seen": 471859200, "step": 900 }, { "epoch": 0.04364668012439304, "eval_accuracy": 0.44812734082397004, "eval_loss": 2.802464246749878, "eval_runtime": 5.8687, "eval_samples_per_second": 51.118, "eval_steps_per_second": 6.475, "num_input_tokens_seen": 471859200, "step": 900 }, { "epoch": 0.043889161680639666, "grad_norm": 0.248046875, "learning_rate": 4.3889427740058196e-05, "loss": 2.8469, "num_input_tokens_seen": 474480640, "step": 905 }, { "epoch": 0.044131643236886293, "grad_norm": 0.2421875, "learning_rate": 4.4131910766246364e-05, "loss": 2.8375, "num_input_tokens_seen": 477102080, "step": 910 }, { "epoch": 0.04437412479313292, "grad_norm": 0.25390625, "learning_rate": 4.437439379243453e-05, "loss": 2.8275, "num_input_tokens_seen": 479723520, "step": 915 }, { "epoch": 0.04461660634937955, "grad_norm": 0.244140625, "learning_rate": 4.46168768186227e-05, "loss": 2.835, "num_input_tokens_seen": 482344960, "step": 920 }, { "epoch": 0.04485908790562618, "grad_norm": 0.2578125, "learning_rate": 4.485935984481087e-05, "loss": 2.8451, "num_input_tokens_seen": 484966400, "step": 925 }, { "epoch": 0.045101569461872805, "grad_norm": 0.25390625, "learning_rate": 4.510184287099904e-05, "loss": 2.8424, "num_input_tokens_seen": 487587840, "step": 930 }, { "epoch": 0.04534405101811943, "grad_norm": 0.2421875, "learning_rate": 4.53443258971872e-05, "loss": 2.8322, "num_input_tokens_seen": 490209280, "step": 935 }, { "epoch": 0.04558653257436606, "grad_norm": 0.2470703125, "learning_rate": 4.558680892337537e-05, "loss": 2.8285, "num_input_tokens_seen": 492830720, "step": 940 }, { "epoch": 0.04582901413061269, "grad_norm": 0.244140625, "learning_rate": 4.5829291949563535e-05, "loss": 2.8391, "num_input_tokens_seen": 495452160, "step": 945 }, { "epoch": 0.046071495686859316, "grad_norm": 0.251953125, "learning_rate": 4.6071774975751696e-05, "loss": 2.8359, "num_input_tokens_seen": 498073600, "step": 950 }, { "epoch": 0.046313977243105944, "grad_norm": 0.25, "learning_rate": 4.6314258001939865e-05, "loss": 2.8355, "num_input_tokens_seen": 500695040, "step": 955 }, { "epoch": 0.04655645879935257, "grad_norm": 0.248046875, "learning_rate": 4.655674102812803e-05, "loss": 2.831, "num_input_tokens_seen": 503316480, "step": 960 }, { "epoch": 0.0467989403555992, "grad_norm": 0.248046875, "learning_rate": 4.67992240543162e-05, "loss": 2.835, "num_input_tokens_seen": 505937920, "step": 965 }, { "epoch": 0.04704142191184583, "grad_norm": 0.2421875, "learning_rate": 4.704170708050437e-05, "loss": 2.8501, "num_input_tokens_seen": 508559360, "step": 970 }, { "epoch": 0.047283903468092456, "grad_norm": 0.25390625, "learning_rate": 4.728419010669253e-05, "loss": 2.8216, "num_input_tokens_seen": 511180800, "step": 975 }, { "epoch": 0.047526385024339084, "grad_norm": 0.24609375, "learning_rate": 4.75266731328807e-05, "loss": 2.827, "num_input_tokens_seen": 513802240, "step": 980 }, { "epoch": 0.04776886658058571, "grad_norm": 0.248046875, "learning_rate": 4.776915615906887e-05, "loss": 2.8333, "num_input_tokens_seen": 516423680, "step": 985 }, { "epoch": 0.04801134813683234, "grad_norm": 0.244140625, "learning_rate": 4.8011639185257035e-05, "loss": 2.8285, "num_input_tokens_seen": 519045120, "step": 990 }, { "epoch": 0.04825382969307897, "grad_norm": 0.251953125, "learning_rate": 4.8254122211445203e-05, "loss": 2.8277, "num_input_tokens_seen": 521666560, "step": 995 }, { "epoch": 0.048496311249325595, "grad_norm": 0.25, "learning_rate": 4.849660523763337e-05, "loss": 2.8289, "num_input_tokens_seen": 524288000, "step": 1000 }, { "epoch": 0.04873879280557222, "grad_norm": 0.248046875, "learning_rate": 4.873908826382153e-05, "loss": 2.8286, "num_input_tokens_seen": 526909440, "step": 1005 }, { "epoch": 0.04898127436181885, "grad_norm": 0.251953125, "learning_rate": 4.89815712900097e-05, "loss": 2.832, "num_input_tokens_seen": 529530880, "step": 1010 }, { "epoch": 0.04922375591806548, "grad_norm": 0.2578125, "learning_rate": 4.922405431619787e-05, "loss": 2.827, "num_input_tokens_seen": 532152320, "step": 1015 }, { "epoch": 0.049466237474312114, "grad_norm": 0.248046875, "learning_rate": 4.946653734238604e-05, "loss": 2.8312, "num_input_tokens_seen": 534773760, "step": 1020 }, { "epoch": 0.04970871903055874, "grad_norm": 0.2451171875, "learning_rate": 4.97090203685742e-05, "loss": 2.841, "num_input_tokens_seen": 537395200, "step": 1025 }, { "epoch": 0.04995120058680537, "grad_norm": 0.2490234375, "learning_rate": 4.995150339476237e-05, "loss": 2.812, "num_input_tokens_seen": 540016640, "step": 1030 }, { "epoch": 0.050193682143052, "grad_norm": 0.255859375, "learning_rate": 4.999999485594985e-05, "loss": 2.8351, "num_input_tokens_seen": 542638080, "step": 1035 }, { "epoch": 0.050436163699298625, "grad_norm": 0.25390625, "learning_rate": 4.999997395824976e-05, "loss": 2.827, "num_input_tokens_seen": 545259520, "step": 1040 }, { "epoch": 0.05067864525554525, "grad_norm": 0.24609375, "learning_rate": 4.999993698541001e-05, "loss": 2.8299, "num_input_tokens_seen": 547880960, "step": 1045 }, { "epoch": 0.05092112681179188, "grad_norm": 0.26171875, "learning_rate": 4.999988393745438e-05, "loss": 2.825, "num_input_tokens_seen": 550502400, "step": 1050 }, { "epoch": 0.05116360836803851, "grad_norm": 0.255859375, "learning_rate": 4.999981481441698e-05, "loss": 2.8304, "num_input_tokens_seen": 553123840, "step": 1055 }, { "epoch": 0.05140608992428514, "grad_norm": 0.248046875, "learning_rate": 4.999972961634226e-05, "loss": 2.8367, "num_input_tokens_seen": 555745280, "step": 1060 }, { "epoch": 0.051648571480531764, "grad_norm": 0.2392578125, "learning_rate": 4.999962834328499e-05, "loss": 2.8163, "num_input_tokens_seen": 558366720, "step": 1065 }, { "epoch": 0.05189105303677839, "grad_norm": 0.25390625, "learning_rate": 4.99995109953103e-05, "loss": 2.8154, "num_input_tokens_seen": 560988160, "step": 1070 }, { "epoch": 0.05213353459302502, "grad_norm": 0.2490234375, "learning_rate": 4.999937757249364e-05, "loss": 2.8226, "num_input_tokens_seen": 563609600, "step": 1075 }, { "epoch": 0.05237601614927165, "grad_norm": 0.2578125, "learning_rate": 4.9999228074920814e-05, "loss": 2.8235, "num_input_tokens_seen": 566231040, "step": 1080 }, { "epoch": 0.052618497705518276, "grad_norm": 0.25390625, "learning_rate": 4.9999062502687935e-05, "loss": 2.8257, "num_input_tokens_seen": 568852480, "step": 1085 }, { "epoch": 0.052860979261764904, "grad_norm": 0.271484375, "learning_rate": 4.9998880855901476e-05, "loss": 2.825, "num_input_tokens_seen": 571473920, "step": 1090 }, { "epoch": 0.05310346081801153, "grad_norm": 0.263671875, "learning_rate": 4.999868313467824e-05, "loss": 2.8405, "num_input_tokens_seen": 574095360, "step": 1095 }, { "epoch": 0.05334594237425816, "grad_norm": 0.259765625, "learning_rate": 4.9998469339145346e-05, "loss": 2.8366, "num_input_tokens_seen": 576716800, "step": 1100 }, { "epoch": 0.05358842393050479, "grad_norm": 0.251953125, "learning_rate": 4.999823946944028e-05, "loss": 2.8472, "num_input_tokens_seen": 579338240, "step": 1105 }, { "epoch": 0.053830905486751415, "grad_norm": 0.259765625, "learning_rate": 4.999799352571085e-05, "loss": 2.82, "num_input_tokens_seen": 581959680, "step": 1110 }, { "epoch": 0.05407338704299804, "grad_norm": 0.2734375, "learning_rate": 4.999773150811519e-05, "loss": 2.8273, "num_input_tokens_seen": 584581120, "step": 1115 }, { "epoch": 0.05431586859924467, "grad_norm": 0.263671875, "learning_rate": 4.999745341682179e-05, "loss": 2.8349, "num_input_tokens_seen": 587202560, "step": 1120 }, { "epoch": 0.0545583501554913, "grad_norm": 0.25390625, "learning_rate": 4.999715925200946e-05, "loss": 2.8207, "num_input_tokens_seen": 589824000, "step": 1125 }, { "epoch": 0.05480083171173793, "grad_norm": 0.2490234375, "learning_rate": 4.999684901386734e-05, "loss": 2.8225, "num_input_tokens_seen": 592445440, "step": 1130 }, { "epoch": 0.055043313267984555, "grad_norm": 0.2451171875, "learning_rate": 4.999652270259493e-05, "loss": 2.8355, "num_input_tokens_seen": 595066880, "step": 1135 }, { "epoch": 0.05528579482423118, "grad_norm": 0.263671875, "learning_rate": 4.9996180318402056e-05, "loss": 2.8275, "num_input_tokens_seen": 597688320, "step": 1140 }, { "epoch": 0.05552827638047781, "grad_norm": 0.259765625, "learning_rate": 4.999582186150884e-05, "loss": 2.8332, "num_input_tokens_seen": 600309760, "step": 1145 }, { "epoch": 0.05577075793672444, "grad_norm": 0.2412109375, "learning_rate": 4.9995447332145804e-05, "loss": 2.828, "num_input_tokens_seen": 602931200, "step": 1150 }, { "epoch": 0.056013239492971066, "grad_norm": 0.251953125, "learning_rate": 4.999505673055377e-05, "loss": 2.836, "num_input_tokens_seen": 605552640, "step": 1155 }, { "epoch": 0.056255721049217694, "grad_norm": 0.255859375, "learning_rate": 4.999465005698388e-05, "loss": 2.8321, "num_input_tokens_seen": 608174080, "step": 1160 }, { "epoch": 0.05649820260546432, "grad_norm": 0.2451171875, "learning_rate": 4.999422731169764e-05, "loss": 2.8375, "num_input_tokens_seen": 610795520, "step": 1165 }, { "epoch": 0.05674068416171095, "grad_norm": 0.265625, "learning_rate": 4.999378849496687e-05, "loss": 2.8394, "num_input_tokens_seen": 613416960, "step": 1170 }, { "epoch": 0.05698316571795758, "grad_norm": 0.251953125, "learning_rate": 4.999333360707374e-05, "loss": 2.8123, "num_input_tokens_seen": 616038400, "step": 1175 }, { "epoch": 0.057225647274204205, "grad_norm": 0.267578125, "learning_rate": 4.999286264831075e-05, "loss": 2.8249, "num_input_tokens_seen": 618659840, "step": 1180 }, { "epoch": 0.05746812883045083, "grad_norm": 0.255859375, "learning_rate": 4.9992375618980715e-05, "loss": 2.8277, "num_input_tokens_seen": 621281280, "step": 1185 }, { "epoch": 0.05771061038669746, "grad_norm": 0.2578125, "learning_rate": 4.99918725193968e-05, "loss": 2.8284, "num_input_tokens_seen": 623902720, "step": 1190 }, { "epoch": 0.05795309194294409, "grad_norm": 0.2490234375, "learning_rate": 4.999135334988251e-05, "loss": 2.829, "num_input_tokens_seen": 626524160, "step": 1195 }, { "epoch": 0.05819557349919072, "grad_norm": 0.2451171875, "learning_rate": 4.9990818110771674e-05, "loss": 2.8178, "num_input_tokens_seen": 629145600, "step": 1200 }, { "epoch": 0.05819557349919072, "eval_accuracy": 0.4494723986321446, "eval_loss": 2.7911999225616455, "eval_runtime": 5.7873, "eval_samples_per_second": 51.837, "eval_steps_per_second": 6.566, "num_input_tokens_seen": 629145600, "step": 1200 }, { "epoch": 0.058438055055437345, "grad_norm": 0.2470703125, "learning_rate": 4.999026680240845e-05, "loss": 2.8176, "num_input_tokens_seen": 631767040, "step": 1205 }, { "epoch": 0.05868053661168397, "grad_norm": 0.251953125, "learning_rate": 4.998969942514733e-05, "loss": 2.8341, "num_input_tokens_seen": 634388480, "step": 1210 }, { "epoch": 0.0589230181679306, "grad_norm": 0.25390625, "learning_rate": 4.9989115979353154e-05, "loss": 2.8302, "num_input_tokens_seen": 637009920, "step": 1215 }, { "epoch": 0.05916549972417723, "grad_norm": 0.2412109375, "learning_rate": 4.998851646540106e-05, "loss": 2.8287, "num_input_tokens_seen": 639631360, "step": 1220 }, { "epoch": 0.059407981280423856, "grad_norm": 0.2490234375, "learning_rate": 4.998790088367655e-05, "loss": 2.8195, "num_input_tokens_seen": 642252800, "step": 1225 }, { "epoch": 0.059650462836670484, "grad_norm": 0.25, "learning_rate": 4.998726923457546e-05, "loss": 2.8209, "num_input_tokens_seen": 644874240, "step": 1230 }, { "epoch": 0.05989294439291711, "grad_norm": 0.26171875, "learning_rate": 4.9986621518503925e-05, "loss": 2.8037, "num_input_tokens_seen": 647495680, "step": 1235 }, { "epoch": 0.06013542594916374, "grad_norm": 0.2578125, "learning_rate": 4.9985957735878434e-05, "loss": 2.8244, "num_input_tokens_seen": 650117120, "step": 1240 }, { "epoch": 0.06037790750541037, "grad_norm": 0.25, "learning_rate": 4.9985277887125816e-05, "loss": 2.8115, "num_input_tokens_seen": 652738560, "step": 1245 }, { "epoch": 0.060620389061656996, "grad_norm": 0.248046875, "learning_rate": 4.99845819726832e-05, "loss": 2.8152, "num_input_tokens_seen": 655360000, "step": 1250 }, { "epoch": 0.060862870617903624, "grad_norm": 0.244140625, "learning_rate": 4.998386999299808e-05, "loss": 2.811, "num_input_tokens_seen": 657981440, "step": 1255 }, { "epoch": 0.06110535217415025, "grad_norm": 0.2412109375, "learning_rate": 4.998314194852825e-05, "loss": 2.8287, "num_input_tokens_seen": 660602880, "step": 1260 }, { "epoch": 0.06134783373039688, "grad_norm": 0.2578125, "learning_rate": 4.998239783974185e-05, "loss": 2.8147, "num_input_tokens_seen": 663224320, "step": 1265 }, { "epoch": 0.06159031528664351, "grad_norm": 0.25, "learning_rate": 4.998163766711735e-05, "loss": 2.8149, "num_input_tokens_seen": 665845760, "step": 1270 }, { "epoch": 0.061832796842890135, "grad_norm": 0.236328125, "learning_rate": 4.998086143114355e-05, "loss": 2.817, "num_input_tokens_seen": 668467200, "step": 1275 }, { "epoch": 0.06207527839913676, "grad_norm": 0.2578125, "learning_rate": 4.998006913231957e-05, "loss": 2.8326, "num_input_tokens_seen": 671088640, "step": 1280 }, { "epoch": 0.06231775995538339, "grad_norm": 0.2490234375, "learning_rate": 4.997926077115487e-05, "loss": 2.816, "num_input_tokens_seen": 673710080, "step": 1285 }, { "epoch": 0.06256024151163002, "grad_norm": 0.25390625, "learning_rate": 4.997843634816921e-05, "loss": 2.8116, "num_input_tokens_seen": 676331520, "step": 1290 }, { "epoch": 0.06280272306787665, "grad_norm": 0.26171875, "learning_rate": 4.9977595863892725e-05, "loss": 2.8138, "num_input_tokens_seen": 678952960, "step": 1295 }, { "epoch": 0.06304520462412327, "grad_norm": 0.25, "learning_rate": 4.9976739318865836e-05, "loss": 2.8286, "num_input_tokens_seen": 681574400, "step": 1300 }, { "epoch": 0.06328768618036991, "grad_norm": 0.25, "learning_rate": 4.997586671363931e-05, "loss": 2.8235, "num_input_tokens_seen": 684195840, "step": 1305 }, { "epoch": 0.06353016773661653, "grad_norm": 0.25, "learning_rate": 4.997497804877423e-05, "loss": 2.8223, "num_input_tokens_seen": 686817280, "step": 1310 }, { "epoch": 0.06377264929286316, "grad_norm": 0.248046875, "learning_rate": 4.9974073324842034e-05, "loss": 2.8338, "num_input_tokens_seen": 689438720, "step": 1315 }, { "epoch": 0.06401513084910979, "grad_norm": 0.263671875, "learning_rate": 4.997315254242445e-05, "loss": 2.8224, "num_input_tokens_seen": 692060160, "step": 1320 }, { "epoch": 0.06425761240535642, "grad_norm": 0.2421875, "learning_rate": 4.997221570211355e-05, "loss": 2.8266, "num_input_tokens_seen": 694681600, "step": 1325 }, { "epoch": 0.06450009396160304, "grad_norm": 0.25, "learning_rate": 4.997126280451173e-05, "loss": 2.8305, "num_input_tokens_seen": 697303040, "step": 1330 }, { "epoch": 0.06474257551784968, "grad_norm": 0.2451171875, "learning_rate": 4.9970293850231695e-05, "loss": 2.8248, "num_input_tokens_seen": 699924480, "step": 1335 }, { "epoch": 0.0649850570740963, "grad_norm": 0.2470703125, "learning_rate": 4.996930883989651e-05, "loss": 2.8145, "num_input_tokens_seen": 702545920, "step": 1340 }, { "epoch": 0.06522753863034293, "grad_norm": 0.2470703125, "learning_rate": 4.9968307774139535e-05, "loss": 2.8303, "num_input_tokens_seen": 705167360, "step": 1345 }, { "epoch": 0.06547002018658955, "grad_norm": 0.2470703125, "learning_rate": 4.9967290653604454e-05, "loss": 2.8118, "num_input_tokens_seen": 707788800, "step": 1350 }, { "epoch": 0.06571250174283619, "grad_norm": 0.26953125, "learning_rate": 4.996625747894529e-05, "loss": 2.8133, "num_input_tokens_seen": 710410240, "step": 1355 }, { "epoch": 0.06595498329908281, "grad_norm": 0.2578125, "learning_rate": 4.996520825082638e-05, "loss": 2.8189, "num_input_tokens_seen": 713031680, "step": 1360 }, { "epoch": 0.06619746485532944, "grad_norm": 0.2431640625, "learning_rate": 4.996414296992238e-05, "loss": 2.8219, "num_input_tokens_seen": 715653120, "step": 1365 }, { "epoch": 0.06643994641157606, "grad_norm": 0.2578125, "learning_rate": 4.9963061636918276e-05, "loss": 2.819, "num_input_tokens_seen": 718274560, "step": 1370 }, { "epoch": 0.0666824279678227, "grad_norm": 0.25, "learning_rate": 4.9961964252509367e-05, "loss": 2.8041, "num_input_tokens_seen": 720896000, "step": 1375 }, { "epoch": 0.06692490952406932, "grad_norm": 0.248046875, "learning_rate": 4.996085081740128e-05, "loss": 2.8279, "num_input_tokens_seen": 723517440, "step": 1380 }, { "epoch": 0.06716739108031596, "grad_norm": 0.248046875, "learning_rate": 4.995972133230997e-05, "loss": 2.8206, "num_input_tokens_seen": 726138880, "step": 1385 }, { "epoch": 0.06740987263656258, "grad_norm": 0.25, "learning_rate": 4.99585757979617e-05, "loss": 2.816, "num_input_tokens_seen": 728760320, "step": 1390 }, { "epoch": 0.06765235419280921, "grad_norm": 0.24609375, "learning_rate": 4.995741421509305e-05, "loss": 2.8114, "num_input_tokens_seen": 731381760, "step": 1395 }, { "epoch": 0.06789483574905583, "grad_norm": 0.2470703125, "learning_rate": 4.995623658445092e-05, "loss": 2.8284, "num_input_tokens_seen": 734003200, "step": 1400 }, { "epoch": 0.06813731730530247, "grad_norm": 0.248046875, "learning_rate": 4.995504290679254e-05, "loss": 2.8262, "num_input_tokens_seen": 736624640, "step": 1405 }, { "epoch": 0.06837979886154909, "grad_norm": 0.2490234375, "learning_rate": 4.995383318288546e-05, "loss": 2.8213, "num_input_tokens_seen": 739246080, "step": 1410 }, { "epoch": 0.06862228041779572, "grad_norm": 0.2431640625, "learning_rate": 4.9952607413507525e-05, "loss": 2.8199, "num_input_tokens_seen": 741867520, "step": 1415 }, { "epoch": 0.06886476197404234, "grad_norm": 0.24609375, "learning_rate": 4.995136559944692e-05, "loss": 2.8199, "num_input_tokens_seen": 744488960, "step": 1420 }, { "epoch": 0.06910724353028898, "grad_norm": 0.259765625, "learning_rate": 4.9950107741502136e-05, "loss": 2.8219, "num_input_tokens_seen": 747110400, "step": 1425 }, { "epoch": 0.0693497250865356, "grad_norm": 0.26171875, "learning_rate": 4.9948833840482e-05, "loss": 2.8168, "num_input_tokens_seen": 749731840, "step": 1430 }, { "epoch": 0.06959220664278223, "grad_norm": 0.263671875, "learning_rate": 4.994754389720561e-05, "loss": 2.83, "num_input_tokens_seen": 752353280, "step": 1435 }, { "epoch": 0.06983468819902885, "grad_norm": 0.25, "learning_rate": 4.9946237912502435e-05, "loss": 2.8221, "num_input_tokens_seen": 754974720, "step": 1440 }, { "epoch": 0.07007716975527549, "grad_norm": 0.2578125, "learning_rate": 4.994491588721221e-05, "loss": 2.8179, "num_input_tokens_seen": 757596160, "step": 1445 }, { "epoch": 0.07031965131152211, "grad_norm": 0.26171875, "learning_rate": 4.994357782218502e-05, "loss": 2.8064, "num_input_tokens_seen": 760217600, "step": 1450 }, { "epoch": 0.07056213286776875, "grad_norm": 0.251953125, "learning_rate": 4.994222371828124e-05, "loss": 2.8171, "num_input_tokens_seen": 762839040, "step": 1455 }, { "epoch": 0.07080461442401538, "grad_norm": 0.24609375, "learning_rate": 4.994085357637157e-05, "loss": 2.8152, "num_input_tokens_seen": 765460480, "step": 1460 }, { "epoch": 0.071047095980262, "grad_norm": 0.244140625, "learning_rate": 4.9939467397337025e-05, "loss": 2.8093, "num_input_tokens_seen": 768081920, "step": 1465 }, { "epoch": 0.07128957753650864, "grad_norm": 0.2451171875, "learning_rate": 4.9938065182068925e-05, "loss": 2.8236, "num_input_tokens_seen": 770703360, "step": 1470 }, { "epoch": 0.07153205909275526, "grad_norm": 0.2451171875, "learning_rate": 4.9936646931468896e-05, "loss": 2.8191, "num_input_tokens_seen": 773324800, "step": 1475 }, { "epoch": 0.07177454064900189, "grad_norm": 0.255859375, "learning_rate": 4.9935212646448886e-05, "loss": 2.8199, "num_input_tokens_seen": 775946240, "step": 1480 }, { "epoch": 0.07201702220524851, "grad_norm": 0.2470703125, "learning_rate": 4.993376232793115e-05, "loss": 2.8172, "num_input_tokens_seen": 778567680, "step": 1485 }, { "epoch": 0.07225950376149515, "grad_norm": 0.25390625, "learning_rate": 4.993229597684825e-05, "loss": 2.8199, "num_input_tokens_seen": 781189120, "step": 1490 }, { "epoch": 0.07250198531774177, "grad_norm": 0.2578125, "learning_rate": 4.9930813594143064e-05, "loss": 2.8152, "num_input_tokens_seen": 783810560, "step": 1495 }, { "epoch": 0.0727444668739884, "grad_norm": 0.251953125, "learning_rate": 4.9929315180768775e-05, "loss": 2.8001, "num_input_tokens_seen": 786432000, "step": 1500 }, { "epoch": 0.0727444668739884, "eval_accuracy": 0.45046246539651524, "eval_loss": 2.7831716537475586, "eval_runtime": 5.8093, "eval_samples_per_second": 51.641, "eval_steps_per_second": 6.541, "num_input_tokens_seen": 786432000, "step": 1500 }, { "epoch": 0.07298694843023502, "grad_norm": 0.26171875, "learning_rate": 4.992780073768886e-05, "loss": 2.814, "num_input_tokens_seen": 789053440, "step": 1505 }, { "epoch": 0.07322942998648166, "grad_norm": 0.255859375, "learning_rate": 4.992627026587713e-05, "loss": 2.806, "num_input_tokens_seen": 791674880, "step": 1510 }, { "epoch": 0.07347191154272828, "grad_norm": 0.267578125, "learning_rate": 4.992472376631767e-05, "loss": 2.8164, "num_input_tokens_seen": 794296320, "step": 1515 }, { "epoch": 0.07371439309897491, "grad_norm": 0.259765625, "learning_rate": 4.992316124000491e-05, "loss": 2.8305, "num_input_tokens_seen": 796917760, "step": 1520 }, { "epoch": 0.07395687465522154, "grad_norm": 0.25, "learning_rate": 4.992158268794355e-05, "loss": 2.8207, "num_input_tokens_seen": 799539200, "step": 1525 }, { "epoch": 0.07419935621146817, "grad_norm": 0.25, "learning_rate": 4.991998811114861e-05, "loss": 2.8167, "num_input_tokens_seen": 802160640, "step": 1530 }, { "epoch": 0.07444183776771479, "grad_norm": 0.2490234375, "learning_rate": 4.991837751064542e-05, "loss": 2.7985, "num_input_tokens_seen": 804782080, "step": 1535 }, { "epoch": 0.07468431932396143, "grad_norm": 0.25390625, "learning_rate": 4.99167508874696e-05, "loss": 2.8054, "num_input_tokens_seen": 807403520, "step": 1540 }, { "epoch": 0.07492680088020805, "grad_norm": 0.2578125, "learning_rate": 4.991510824266707e-05, "loss": 2.8267, "num_input_tokens_seen": 810024960, "step": 1545 }, { "epoch": 0.07516928243645468, "grad_norm": 0.251953125, "learning_rate": 4.991344957729409e-05, "loss": 2.824, "num_input_tokens_seen": 812646400, "step": 1550 }, { "epoch": 0.0754117639927013, "grad_norm": 0.26171875, "learning_rate": 4.991177489241716e-05, "loss": 2.8155, "num_input_tokens_seen": 815267840, "step": 1555 }, { "epoch": 0.07565424554894794, "grad_norm": 0.2470703125, "learning_rate": 4.991008418911313e-05, "loss": 2.8131, "num_input_tokens_seen": 817889280, "step": 1560 }, { "epoch": 0.07589672710519456, "grad_norm": 0.2470703125, "learning_rate": 4.9908377468469124e-05, "loss": 2.8172, "num_input_tokens_seen": 820510720, "step": 1565 }, { "epoch": 0.0761392086614412, "grad_norm": 0.2578125, "learning_rate": 4.990665473158259e-05, "loss": 2.829, "num_input_tokens_seen": 823132160, "step": 1570 }, { "epoch": 0.07638169021768781, "grad_norm": 0.2578125, "learning_rate": 4.990491597956124e-05, "loss": 2.7951, "num_input_tokens_seen": 825753600, "step": 1575 }, { "epoch": 0.07662417177393445, "grad_norm": 0.26171875, "learning_rate": 4.990316121352312e-05, "loss": 2.8291, "num_input_tokens_seen": 828375040, "step": 1580 }, { "epoch": 0.07686665333018107, "grad_norm": 0.2470703125, "learning_rate": 4.990139043459654e-05, "loss": 2.8227, "num_input_tokens_seen": 830996480, "step": 1585 }, { "epoch": 0.0771091348864277, "grad_norm": 0.2470703125, "learning_rate": 4.9899603643920126e-05, "loss": 2.8165, "num_input_tokens_seen": 833617920, "step": 1590 }, { "epoch": 0.07735161644267433, "grad_norm": 0.259765625, "learning_rate": 4.98978008426428e-05, "loss": 2.8119, "num_input_tokens_seen": 836239360, "step": 1595 }, { "epoch": 0.07759409799892096, "grad_norm": 0.255859375, "learning_rate": 4.9895982031923766e-05, "loss": 2.811, "num_input_tokens_seen": 838860800, "step": 1600 }, { "epoch": 0.07783657955516758, "grad_norm": 0.255859375, "learning_rate": 4.989414721293254e-05, "loss": 2.8195, "num_input_tokens_seen": 841482240, "step": 1605 }, { "epoch": 0.07807906111141422, "grad_norm": 0.2451171875, "learning_rate": 4.989229638684892e-05, "loss": 2.8394, "num_input_tokens_seen": 844103680, "step": 1610 }, { "epoch": 0.07832154266766084, "grad_norm": 0.25, "learning_rate": 4.989042955486299e-05, "loss": 2.8155, "num_input_tokens_seen": 846725120, "step": 1615 }, { "epoch": 0.07856402422390747, "grad_norm": 0.251953125, "learning_rate": 4.9888546718175154e-05, "loss": 2.8113, "num_input_tokens_seen": 849346560, "step": 1620 }, { "epoch": 0.07880650578015409, "grad_norm": 0.26171875, "learning_rate": 4.9886647877996074e-05, "loss": 2.8253, "num_input_tokens_seen": 851968000, "step": 1625 }, { "epoch": 0.07904898733640073, "grad_norm": 0.263671875, "learning_rate": 4.988473303554672e-05, "loss": 2.8264, "num_input_tokens_seen": 854589440, "step": 1630 }, { "epoch": 0.07929146889264735, "grad_norm": 0.25390625, "learning_rate": 4.988280219205833e-05, "loss": 2.8174, "num_input_tokens_seen": 857210880, "step": 1635 }, { "epoch": 0.07953395044889398, "grad_norm": 0.2578125, "learning_rate": 4.988085534877248e-05, "loss": 2.8017, "num_input_tokens_seen": 859832320, "step": 1640 }, { "epoch": 0.0797764320051406, "grad_norm": 0.2470703125, "learning_rate": 4.987889250694098e-05, "loss": 2.8149, "num_input_tokens_seen": 862453760, "step": 1645 }, { "epoch": 0.08001891356138724, "grad_norm": 0.2490234375, "learning_rate": 4.9876913667825955e-05, "loss": 2.8148, "num_input_tokens_seen": 865075200, "step": 1650 }, { "epoch": 0.08026139511763386, "grad_norm": 0.255859375, "learning_rate": 4.987491883269981e-05, "loss": 2.8222, "num_input_tokens_seen": 867696640, "step": 1655 }, { "epoch": 0.0805038766738805, "grad_norm": 0.251953125, "learning_rate": 4.987290800284524e-05, "loss": 2.8074, "num_input_tokens_seen": 870318080, "step": 1660 }, { "epoch": 0.08074635823012712, "grad_norm": 0.25390625, "learning_rate": 4.987088117955523e-05, "loss": 2.8234, "num_input_tokens_seen": 872939520, "step": 1665 }, { "epoch": 0.08098883978637375, "grad_norm": 0.25, "learning_rate": 4.9868838364133016e-05, "loss": 2.8321, "num_input_tokens_seen": 875560960, "step": 1670 }, { "epoch": 0.08123132134262037, "grad_norm": 0.2451171875, "learning_rate": 4.986677955789216e-05, "loss": 2.8036, "num_input_tokens_seen": 878182400, "step": 1675 }, { "epoch": 0.081473802898867, "grad_norm": 0.2451171875, "learning_rate": 4.9864704762156487e-05, "loss": 2.8171, "num_input_tokens_seen": 880803840, "step": 1680 }, { "epoch": 0.08171628445511363, "grad_norm": 0.244140625, "learning_rate": 4.986261397826009e-05, "loss": 2.8167, "num_input_tokens_seen": 883425280, "step": 1685 }, { "epoch": 0.08195876601136026, "grad_norm": 0.2490234375, "learning_rate": 4.9860507207547366e-05, "loss": 2.8094, "num_input_tokens_seen": 886046720, "step": 1690 }, { "epoch": 0.08220124756760688, "grad_norm": 0.25, "learning_rate": 4.985838445137299e-05, "loss": 2.7931, "num_input_tokens_seen": 888668160, "step": 1695 }, { "epoch": 0.08244372912385352, "grad_norm": 0.251953125, "learning_rate": 4.985624571110189e-05, "loss": 2.8207, "num_input_tokens_seen": 891289600, "step": 1700 }, { "epoch": 0.08268621068010014, "grad_norm": 0.25390625, "learning_rate": 4.9854090988109294e-05, "loss": 2.7987, "num_input_tokens_seen": 893911040, "step": 1705 }, { "epoch": 0.08292869223634677, "grad_norm": 0.25390625, "learning_rate": 4.9851920283780714e-05, "loss": 2.8237, "num_input_tokens_seen": 896532480, "step": 1710 }, { "epoch": 0.0831711737925934, "grad_norm": 0.2578125, "learning_rate": 4.984973359951192e-05, "loss": 2.8, "num_input_tokens_seen": 899153920, "step": 1715 }, { "epoch": 0.08341365534884003, "grad_norm": 0.2451171875, "learning_rate": 4.984753093670895e-05, "loss": 2.812, "num_input_tokens_seen": 901775360, "step": 1720 }, { "epoch": 0.08365613690508665, "grad_norm": 0.25390625, "learning_rate": 4.984531229678815e-05, "loss": 2.8196, "num_input_tokens_seen": 904396800, "step": 1725 }, { "epoch": 0.08389861846133329, "grad_norm": 0.25390625, "learning_rate": 4.984307768117611e-05, "loss": 2.827, "num_input_tokens_seen": 907018240, "step": 1730 }, { "epoch": 0.0841411000175799, "grad_norm": 0.2421875, "learning_rate": 4.98408270913097e-05, "loss": 2.8098, "num_input_tokens_seen": 909639680, "step": 1735 }, { "epoch": 0.08438358157382654, "grad_norm": 0.2373046875, "learning_rate": 4.9838560528636066e-05, "loss": 2.8144, "num_input_tokens_seen": 912261120, "step": 1740 }, { "epoch": 0.08462606313007316, "grad_norm": 0.251953125, "learning_rate": 4.983627799461263e-05, "loss": 2.8083, "num_input_tokens_seen": 914882560, "step": 1745 }, { "epoch": 0.0848685446863198, "grad_norm": 0.2578125, "learning_rate": 4.9833979490707064e-05, "loss": 2.8096, "num_input_tokens_seen": 917504000, "step": 1750 }, { "epoch": 0.08511102624256643, "grad_norm": 0.259765625, "learning_rate": 4.983166501839732e-05, "loss": 2.8097, "num_input_tokens_seen": 920125440, "step": 1755 }, { "epoch": 0.08535350779881305, "grad_norm": 0.25390625, "learning_rate": 4.9829334579171626e-05, "loss": 2.8097, "num_input_tokens_seen": 922746880, "step": 1760 }, { "epoch": 0.08559598935505969, "grad_norm": 0.24609375, "learning_rate": 4.9826988174528465e-05, "loss": 2.8175, "num_input_tokens_seen": 925368320, "step": 1765 }, { "epoch": 0.08583847091130631, "grad_norm": 0.2451171875, "learning_rate": 4.98246258059766e-05, "loss": 2.8214, "num_input_tokens_seen": 927989760, "step": 1770 }, { "epoch": 0.08608095246755294, "grad_norm": 0.2353515625, "learning_rate": 4.982224747503503e-05, "loss": 2.8127, "num_input_tokens_seen": 930611200, "step": 1775 }, { "epoch": 0.08632343402379956, "grad_norm": 0.2578125, "learning_rate": 4.9819853183233046e-05, "loss": 2.8065, "num_input_tokens_seen": 933232640, "step": 1780 }, { "epoch": 0.0865659155800462, "grad_norm": 0.248046875, "learning_rate": 4.9817442932110193e-05, "loss": 2.8097, "num_input_tokens_seen": 935854080, "step": 1785 }, { "epoch": 0.08680839713629282, "grad_norm": 0.25390625, "learning_rate": 4.9815016723216273e-05, "loss": 2.8254, "num_input_tokens_seen": 938475520, "step": 1790 }, { "epoch": 0.08705087869253945, "grad_norm": 0.251953125, "learning_rate": 4.9812574558111365e-05, "loss": 2.8086, "num_input_tokens_seen": 941096960, "step": 1795 }, { "epoch": 0.08729336024878608, "grad_norm": 0.2490234375, "learning_rate": 4.9810116438365784e-05, "loss": 2.8045, "num_input_tokens_seen": 943718400, "step": 1800 }, { "epoch": 0.08729336024878608, "eval_accuracy": 0.4512392118547468, "eval_loss": 2.7771894931793213, "eval_runtime": 5.8369, "eval_samples_per_second": 51.397, "eval_steps_per_second": 6.51, "num_input_tokens_seen": 943718400, "step": 1800 }, { "epoch": 0.08753584180503271, "grad_norm": 0.2412109375, "learning_rate": 4.9807642365560123e-05, "loss": 2.8076, "num_input_tokens_seen": 946339840, "step": 1805 }, { "epoch": 0.08777832336127933, "grad_norm": 0.244140625, "learning_rate": 4.980515234128522e-05, "loss": 2.8131, "num_input_tokens_seen": 948961280, "step": 1810 }, { "epoch": 0.08802080491752597, "grad_norm": 0.2578125, "learning_rate": 4.980264636714219e-05, "loss": 2.8187, "num_input_tokens_seen": 951582720, "step": 1815 }, { "epoch": 0.08826328647377259, "grad_norm": 0.2490234375, "learning_rate": 4.980012444474238e-05, "loss": 2.8076, "num_input_tokens_seen": 954204160, "step": 1820 }, { "epoch": 0.08850576803001922, "grad_norm": 0.251953125, "learning_rate": 4.97975865757074e-05, "loss": 2.8258, "num_input_tokens_seen": 956825600, "step": 1825 }, { "epoch": 0.08874824958626584, "grad_norm": 0.255859375, "learning_rate": 4.979503276166912e-05, "loss": 2.81, "num_input_tokens_seen": 959447040, "step": 1830 }, { "epoch": 0.08899073114251248, "grad_norm": 0.263671875, "learning_rate": 4.979246300426965e-05, "loss": 2.8154, "num_input_tokens_seen": 962068480, "step": 1835 }, { "epoch": 0.0892332126987591, "grad_norm": 0.267578125, "learning_rate": 4.978987730516137e-05, "loss": 2.8244, "num_input_tokens_seen": 964689920, "step": 1840 }, { "epoch": 0.08947569425500573, "grad_norm": 0.2470703125, "learning_rate": 4.9787275666006904e-05, "loss": 2.8089, "num_input_tokens_seen": 967311360, "step": 1845 }, { "epoch": 0.08971817581125235, "grad_norm": 0.24609375, "learning_rate": 4.9784658088479106e-05, "loss": 2.8155, "num_input_tokens_seen": 969932800, "step": 1850 }, { "epoch": 0.08996065736749899, "grad_norm": 0.251953125, "learning_rate": 4.978202457426111e-05, "loss": 2.8164, "num_input_tokens_seen": 972554240, "step": 1855 }, { "epoch": 0.09020313892374561, "grad_norm": 0.2490234375, "learning_rate": 4.977937512504628e-05, "loss": 2.7966, "num_input_tokens_seen": 975175680, "step": 1860 }, { "epoch": 0.09044562047999224, "grad_norm": 0.24609375, "learning_rate": 4.977670974253822e-05, "loss": 2.8174, "num_input_tokens_seen": 977797120, "step": 1865 }, { "epoch": 0.09068810203623887, "grad_norm": 0.263671875, "learning_rate": 4.97740284284508e-05, "loss": 2.8133, "num_input_tokens_seen": 980418560, "step": 1870 }, { "epoch": 0.0909305835924855, "grad_norm": 0.2470703125, "learning_rate": 4.977133118450811e-05, "loss": 2.8081, "num_input_tokens_seen": 983040000, "step": 1875 }, { "epoch": 0.09117306514873212, "grad_norm": 0.25390625, "learning_rate": 4.976861801244449e-05, "loss": 2.8187, "num_input_tokens_seen": 985661440, "step": 1880 }, { "epoch": 0.09141554670497876, "grad_norm": 0.255859375, "learning_rate": 4.976588891400455e-05, "loss": 2.8081, "num_input_tokens_seen": 988282880, "step": 1885 }, { "epoch": 0.09165802826122538, "grad_norm": 0.2490234375, "learning_rate": 4.97631438909431e-05, "loss": 2.7958, "num_input_tokens_seen": 990904320, "step": 1890 }, { "epoch": 0.09190050981747201, "grad_norm": 0.24609375, "learning_rate": 4.97603829450252e-05, "loss": 2.8202, "num_input_tokens_seen": 993525760, "step": 1895 }, { "epoch": 0.09214299137371863, "grad_norm": 0.279296875, "learning_rate": 4.975760607802618e-05, "loss": 2.7975, "num_input_tokens_seen": 996147200, "step": 1900 }, { "epoch": 0.09238547292996527, "grad_norm": 0.2431640625, "learning_rate": 4.975481329173156e-05, "loss": 2.8094, "num_input_tokens_seen": 998768640, "step": 1905 }, { "epoch": 0.09262795448621189, "grad_norm": 0.248046875, "learning_rate": 4.975200458793713e-05, "loss": 2.8066, "num_input_tokens_seen": 1001390080, "step": 1910 }, { "epoch": 0.09287043604245852, "grad_norm": 0.2451171875, "learning_rate": 4.97491799684489e-05, "loss": 2.8147, "num_input_tokens_seen": 1004011520, "step": 1915 }, { "epoch": 0.09311291759870514, "grad_norm": 0.248046875, "learning_rate": 4.9746339435083124e-05, "loss": 2.7911, "num_input_tokens_seen": 1006632960, "step": 1920 }, { "epoch": 0.09335539915495178, "grad_norm": 0.2421875, "learning_rate": 4.9743482989666275e-05, "loss": 2.822, "num_input_tokens_seen": 1009254400, "step": 1925 }, { "epoch": 0.0935978807111984, "grad_norm": 0.25390625, "learning_rate": 4.9740610634035064e-05, "loss": 2.8057, "num_input_tokens_seen": 1011875840, "step": 1930 }, { "epoch": 0.09384036226744503, "grad_norm": 0.248046875, "learning_rate": 4.973772237003644e-05, "loss": 2.8066, "num_input_tokens_seen": 1014497280, "step": 1935 }, { "epoch": 0.09408284382369166, "grad_norm": 0.25390625, "learning_rate": 4.973481819952758e-05, "loss": 2.8002, "num_input_tokens_seen": 1017118720, "step": 1940 }, { "epoch": 0.09432532537993829, "grad_norm": 0.25, "learning_rate": 4.973189812437588e-05, "loss": 2.8146, "num_input_tokens_seen": 1019740160, "step": 1945 }, { "epoch": 0.09456780693618491, "grad_norm": 0.2578125, "learning_rate": 4.9728962146458956e-05, "loss": 2.8133, "num_input_tokens_seen": 1022361600, "step": 1950 }, { "epoch": 0.09481028849243155, "grad_norm": 0.255859375, "learning_rate": 4.9726010267664666e-05, "loss": 2.7953, "num_input_tokens_seen": 1024983040, "step": 1955 }, { "epoch": 0.09505277004867817, "grad_norm": 0.2451171875, "learning_rate": 4.972304248989109e-05, "loss": 2.7998, "num_input_tokens_seen": 1027604480, "step": 1960 }, { "epoch": 0.0952952516049248, "grad_norm": 0.25, "learning_rate": 4.9720058815046534e-05, "loss": 2.817, "num_input_tokens_seen": 1030225920, "step": 1965 }, { "epoch": 0.09553773316117142, "grad_norm": 0.25390625, "learning_rate": 4.9717059245049505e-05, "loss": 2.8026, "num_input_tokens_seen": 1032847360, "step": 1970 }, { "epoch": 0.09578021471741806, "grad_norm": 0.25, "learning_rate": 4.9714043781828754e-05, "loss": 2.8179, "num_input_tokens_seen": 1035468800, "step": 1975 }, { "epoch": 0.09602269627366468, "grad_norm": 0.255859375, "learning_rate": 4.9711012427323235e-05, "loss": 2.8098, "num_input_tokens_seen": 1038090240, "step": 1980 }, { "epoch": 0.09626517782991131, "grad_norm": 0.25390625, "learning_rate": 4.970796518348214e-05, "loss": 2.8045, "num_input_tokens_seen": 1040711680, "step": 1985 }, { "epoch": 0.09650765938615793, "grad_norm": 0.263671875, "learning_rate": 4.970490205226486e-05, "loss": 2.8143, "num_input_tokens_seen": 1043333120, "step": 1990 }, { "epoch": 0.09675014094240457, "grad_norm": 0.2470703125, "learning_rate": 4.9701823035640994e-05, "loss": 2.8037, "num_input_tokens_seen": 1045954560, "step": 1995 }, { "epoch": 0.09699262249865119, "grad_norm": 0.23828125, "learning_rate": 4.9698728135590394e-05, "loss": 2.8136, "num_input_tokens_seen": 1048576000, "step": 2000 }, { "epoch": 0.09723510405489783, "grad_norm": 0.255859375, "learning_rate": 4.9695617354103085e-05, "loss": 2.7976, "num_input_tokens_seen": 1051197440, "step": 2005 }, { "epoch": 0.09747758561114445, "grad_norm": 0.24609375, "learning_rate": 4.9692490693179324e-05, "loss": 2.8091, "num_input_tokens_seen": 1053818880, "step": 2010 }, { "epoch": 0.09772006716739108, "grad_norm": 0.25390625, "learning_rate": 4.968934815482956e-05, "loss": 2.8106, "num_input_tokens_seen": 1056440320, "step": 2015 }, { "epoch": 0.0979625487236377, "grad_norm": 0.251953125, "learning_rate": 4.9686189741074494e-05, "loss": 2.8134, "num_input_tokens_seen": 1059061760, "step": 2020 }, { "epoch": 0.09820503027988434, "grad_norm": 0.248046875, "learning_rate": 4.968301545394498e-05, "loss": 2.8143, "num_input_tokens_seen": 1061683200, "step": 2025 }, { "epoch": 0.09844751183613096, "grad_norm": 0.251953125, "learning_rate": 4.967982529548211e-05, "loss": 2.806, "num_input_tokens_seen": 1064304640, "step": 2030 }, { "epoch": 0.09868999339237759, "grad_norm": 0.240234375, "learning_rate": 4.967661926773718e-05, "loss": 2.8234, "num_input_tokens_seen": 1066926080, "step": 2035 }, { "epoch": 0.09893247494862423, "grad_norm": 0.26171875, "learning_rate": 4.967339737277169e-05, "loss": 2.7927, "num_input_tokens_seen": 1069547520, "step": 2040 }, { "epoch": 0.09917495650487085, "grad_norm": 0.244140625, "learning_rate": 4.967015961265732e-05, "loss": 2.805, "num_input_tokens_seen": 1072168960, "step": 2045 }, { "epoch": 0.09941743806111748, "grad_norm": 0.248046875, "learning_rate": 4.9666905989475995e-05, "loss": 2.802, "num_input_tokens_seen": 1074790400, "step": 2050 }, { "epoch": 0.0996599196173641, "grad_norm": 0.2470703125, "learning_rate": 4.96636365053198e-05, "loss": 2.8156, "num_input_tokens_seen": 1077411840, "step": 2055 }, { "epoch": 0.09990240117361074, "grad_norm": 0.251953125, "learning_rate": 4.966035116229103e-05, "loss": 2.8105, "num_input_tokens_seen": 1080033280, "step": 2060 }, { "epoch": 0.10014488272985736, "grad_norm": 0.2490234375, "learning_rate": 4.9657049962502196e-05, "loss": 2.8111, "num_input_tokens_seen": 1082654720, "step": 2065 }, { "epoch": 0.100387364286104, "grad_norm": 0.255859375, "learning_rate": 4.965373290807598e-05, "loss": 2.8031, "num_input_tokens_seen": 1085276160, "step": 2070 }, { "epoch": 0.10062984584235062, "grad_norm": 0.251953125, "learning_rate": 4.9650400001145265e-05, "loss": 2.8049, "num_input_tokens_seen": 1087897600, "step": 2075 }, { "epoch": 0.10087232739859725, "grad_norm": 0.25, "learning_rate": 4.9647051243853135e-05, "loss": 2.8112, "num_input_tokens_seen": 1090519040, "step": 2080 }, { "epoch": 0.10111480895484387, "grad_norm": 0.2431640625, "learning_rate": 4.964368663835288e-05, "loss": 2.8095, "num_input_tokens_seen": 1093140480, "step": 2085 }, { "epoch": 0.1013572905110905, "grad_norm": 0.244140625, "learning_rate": 4.964030618680793e-05, "loss": 2.8092, "num_input_tokens_seen": 1095761920, "step": 2090 }, { "epoch": 0.10159977206733713, "grad_norm": 0.2392578125, "learning_rate": 4.963690989139196e-05, "loss": 2.8094, "num_input_tokens_seen": 1098383360, "step": 2095 }, { "epoch": 0.10184225362358376, "grad_norm": 0.255859375, "learning_rate": 4.96334977542888e-05, "loss": 2.8019, "num_input_tokens_seen": 1101004800, "step": 2100 }, { "epoch": 0.10184225362358376, "eval_accuracy": 0.4515551213157466, "eval_loss": 2.772890329360962, "eval_runtime": 5.8572, "eval_samples_per_second": 51.219, "eval_steps_per_second": 6.488, "num_input_tokens_seen": 1101004800, "step": 2100 }, { "epoch": 0.10208473517983038, "grad_norm": 0.2451171875, "learning_rate": 4.963006977769248e-05, "loss": 2.7996, "num_input_tokens_seen": 1103626240, "step": 2105 }, { "epoch": 0.10232721673607702, "grad_norm": 0.2451171875, "learning_rate": 4.9626625963807205e-05, "loss": 2.8103, "num_input_tokens_seen": 1106247680, "step": 2110 }, { "epoch": 0.10256969829232364, "grad_norm": 0.25390625, "learning_rate": 4.962316631484737e-05, "loss": 2.7967, "num_input_tokens_seen": 1108869120, "step": 2115 }, { "epoch": 0.10281217984857027, "grad_norm": 0.255859375, "learning_rate": 4.9619690833037545e-05, "loss": 2.8031, "num_input_tokens_seen": 1111490560, "step": 2120 }, { "epoch": 0.1030546614048169, "grad_norm": 0.251953125, "learning_rate": 4.96161995206125e-05, "loss": 2.8048, "num_input_tokens_seen": 1114112000, "step": 2125 }, { "epoch": 0.10329714296106353, "grad_norm": 0.25390625, "learning_rate": 4.9612692379817175e-05, "loss": 2.8085, "num_input_tokens_seen": 1116733440, "step": 2130 }, { "epoch": 0.10353962451731015, "grad_norm": 0.263671875, "learning_rate": 4.960916941290666e-05, "loss": 2.8057, "num_input_tokens_seen": 1119354880, "step": 2135 }, { "epoch": 0.10378210607355678, "grad_norm": 0.2578125, "learning_rate": 4.960563062214627e-05, "loss": 2.7938, "num_input_tokens_seen": 1121976320, "step": 2140 }, { "epoch": 0.1040245876298034, "grad_norm": 0.267578125, "learning_rate": 4.960207600981145e-05, "loss": 2.8095, "num_input_tokens_seen": 1124597760, "step": 2145 }, { "epoch": 0.10426706918605004, "grad_norm": 0.2490234375, "learning_rate": 4.9598505578187844e-05, "loss": 2.8159, "num_input_tokens_seen": 1127219200, "step": 2150 }, { "epoch": 0.10450955074229666, "grad_norm": 0.259765625, "learning_rate": 4.9594919329571264e-05, "loss": 2.7916, "num_input_tokens_seen": 1129840640, "step": 2155 }, { "epoch": 0.1047520322985433, "grad_norm": 0.25390625, "learning_rate": 4.959131726626769e-05, "loss": 2.804, "num_input_tokens_seen": 1132462080, "step": 2160 }, { "epoch": 0.10499451385478992, "grad_norm": 0.248046875, "learning_rate": 4.9587699390593276e-05, "loss": 2.805, "num_input_tokens_seen": 1135083520, "step": 2165 }, { "epoch": 0.10523699541103655, "grad_norm": 0.259765625, "learning_rate": 4.9584065704874326e-05, "loss": 2.8, "num_input_tokens_seen": 1137704960, "step": 2170 }, { "epoch": 0.10547947696728317, "grad_norm": 0.26953125, "learning_rate": 4.9580416211447336e-05, "loss": 2.7925, "num_input_tokens_seen": 1140326400, "step": 2175 }, { "epoch": 0.10572195852352981, "grad_norm": 0.2578125, "learning_rate": 4.9576750912658945e-05, "loss": 2.7941, "num_input_tokens_seen": 1142947840, "step": 2180 }, { "epoch": 0.10596444007977643, "grad_norm": 0.251953125, "learning_rate": 4.957306981086596e-05, "loss": 2.8094, "num_input_tokens_seen": 1145569280, "step": 2185 }, { "epoch": 0.10620692163602306, "grad_norm": 0.2451171875, "learning_rate": 4.9569372908435365e-05, "loss": 2.7984, "num_input_tokens_seen": 1148190720, "step": 2190 }, { "epoch": 0.10644940319226968, "grad_norm": 0.244140625, "learning_rate": 4.956566020774428e-05, "loss": 2.8152, "num_input_tokens_seen": 1150812160, "step": 2195 }, { "epoch": 0.10669188474851632, "grad_norm": 0.26171875, "learning_rate": 4.956193171118e-05, "loss": 2.8166, "num_input_tokens_seen": 1153433600, "step": 2200 }, { "epoch": 0.10693436630476294, "grad_norm": 0.2451171875, "learning_rate": 4.955818742113997e-05, "loss": 2.7986, "num_input_tokens_seen": 1156055040, "step": 2205 }, { "epoch": 0.10717684786100957, "grad_norm": 0.251953125, "learning_rate": 4.95544273400318e-05, "loss": 2.8067, "num_input_tokens_seen": 1158676480, "step": 2210 }, { "epoch": 0.1074193294172562, "grad_norm": 0.240234375, "learning_rate": 4.955065147027323e-05, "loss": 2.8053, "num_input_tokens_seen": 1161297920, "step": 2215 }, { "epoch": 0.10766181097350283, "grad_norm": 0.259765625, "learning_rate": 4.954685981429218e-05, "loss": 2.7979, "num_input_tokens_seen": 1163919360, "step": 2220 }, { "epoch": 0.10790429252974945, "grad_norm": 0.248046875, "learning_rate": 4.95430523745267e-05, "loss": 2.7966, "num_input_tokens_seen": 1166540800, "step": 2225 }, { "epoch": 0.10814677408599609, "grad_norm": 0.25390625, "learning_rate": 4.9539229153425e-05, "loss": 2.8017, "num_input_tokens_seen": 1169162240, "step": 2230 }, { "epoch": 0.10838925564224271, "grad_norm": 0.255859375, "learning_rate": 4.953539015344545e-05, "loss": 2.801, "num_input_tokens_seen": 1171783680, "step": 2235 }, { "epoch": 0.10863173719848934, "grad_norm": 0.2578125, "learning_rate": 4.953153537705653e-05, "loss": 2.802, "num_input_tokens_seen": 1174405120, "step": 2240 }, { "epoch": 0.10887421875473596, "grad_norm": 0.244140625, "learning_rate": 4.952766482673689e-05, "loss": 2.8066, "num_input_tokens_seen": 1177026560, "step": 2245 }, { "epoch": 0.1091167003109826, "grad_norm": 0.25, "learning_rate": 4.952377850497533e-05, "loss": 2.8077, "num_input_tokens_seen": 1179648000, "step": 2250 }, { "epoch": 0.10935918186722922, "grad_norm": 0.25390625, "learning_rate": 4.951987641427076e-05, "loss": 2.7906, "num_input_tokens_seen": 1182269440, "step": 2255 }, { "epoch": 0.10960166342347585, "grad_norm": 0.259765625, "learning_rate": 4.951595855713227e-05, "loss": 2.8113, "num_input_tokens_seen": 1184890880, "step": 2260 }, { "epoch": 0.10984414497972247, "grad_norm": 0.267578125, "learning_rate": 4.951202493607905e-05, "loss": 2.8124, "num_input_tokens_seen": 1187512320, "step": 2265 }, { "epoch": 0.11008662653596911, "grad_norm": 0.255859375, "learning_rate": 4.950807555364045e-05, "loss": 2.7866, "num_input_tokens_seen": 1190133760, "step": 2270 }, { "epoch": 0.11032910809221573, "grad_norm": 0.25, "learning_rate": 4.9504110412355954e-05, "loss": 2.8151, "num_input_tokens_seen": 1192755200, "step": 2275 }, { "epoch": 0.11057158964846237, "grad_norm": 0.263671875, "learning_rate": 4.950012951477516e-05, "loss": 2.7983, "num_input_tokens_seen": 1195376640, "step": 2280 }, { "epoch": 0.11081407120470899, "grad_norm": 0.263671875, "learning_rate": 4.9496132863457813e-05, "loss": 2.8271, "num_input_tokens_seen": 1197998080, "step": 2285 }, { "epoch": 0.11105655276095562, "grad_norm": 0.248046875, "learning_rate": 4.949212046097379e-05, "loss": 2.799, "num_input_tokens_seen": 1200619520, "step": 2290 }, { "epoch": 0.11129903431720224, "grad_norm": 0.25390625, "learning_rate": 4.948809230990309e-05, "loss": 2.8016, "num_input_tokens_seen": 1203240960, "step": 2295 }, { "epoch": 0.11154151587344888, "grad_norm": 0.2412109375, "learning_rate": 4.9484048412835836e-05, "loss": 2.8, "num_input_tokens_seen": 1205862400, "step": 2300 }, { "epoch": 0.1117839974296955, "grad_norm": 0.255859375, "learning_rate": 4.947998877237228e-05, "loss": 2.8049, "num_input_tokens_seen": 1208483840, "step": 2305 }, { "epoch": 0.11202647898594213, "grad_norm": 0.25, "learning_rate": 4.94759133911228e-05, "loss": 2.8028, "num_input_tokens_seen": 1211105280, "step": 2310 }, { "epoch": 0.11226896054218875, "grad_norm": 0.251953125, "learning_rate": 4.947182227170788e-05, "loss": 2.8036, "num_input_tokens_seen": 1213726720, "step": 2315 }, { "epoch": 0.11251144209843539, "grad_norm": 0.24609375, "learning_rate": 4.9467715416758155e-05, "loss": 2.8069, "num_input_tokens_seen": 1216348160, "step": 2320 }, { "epoch": 0.11275392365468201, "grad_norm": 0.259765625, "learning_rate": 4.946359282891434e-05, "loss": 2.7988, "num_input_tokens_seen": 1218969600, "step": 2325 }, { "epoch": 0.11299640521092864, "grad_norm": 0.25, "learning_rate": 4.945945451082729e-05, "loss": 2.8013, "num_input_tokens_seen": 1221591040, "step": 2330 }, { "epoch": 0.11323888676717528, "grad_norm": 0.248046875, "learning_rate": 4.9455300465157976e-05, "loss": 2.8114, "num_input_tokens_seen": 1224212480, "step": 2335 }, { "epoch": 0.1134813683234219, "grad_norm": 0.2451171875, "learning_rate": 4.945113069457747e-05, "loss": 2.8018, "num_input_tokens_seen": 1226833920, "step": 2340 }, { "epoch": 0.11372384987966853, "grad_norm": 0.248046875, "learning_rate": 4.944694520176697e-05, "loss": 2.8051, "num_input_tokens_seen": 1229455360, "step": 2345 }, { "epoch": 0.11396633143591516, "grad_norm": 0.26171875, "learning_rate": 4.944274398941775e-05, "loss": 2.7898, "num_input_tokens_seen": 1232076800, "step": 2350 }, { "epoch": 0.11420881299216179, "grad_norm": 0.251953125, "learning_rate": 4.9438527060231244e-05, "loss": 2.7976, "num_input_tokens_seen": 1234698240, "step": 2355 }, { "epoch": 0.11445129454840841, "grad_norm": 0.26171875, "learning_rate": 4.943429441691894e-05, "loss": 2.8107, "num_input_tokens_seen": 1237319680, "step": 2360 }, { "epoch": 0.11469377610465505, "grad_norm": 0.255859375, "learning_rate": 4.943004606220247e-05, "loss": 2.7924, "num_input_tokens_seen": 1239941120, "step": 2365 }, { "epoch": 0.11493625766090167, "grad_norm": 0.26171875, "learning_rate": 4.942578199881355e-05, "loss": 2.7947, "num_input_tokens_seen": 1242562560, "step": 2370 }, { "epoch": 0.1151787392171483, "grad_norm": 0.2490234375, "learning_rate": 4.9421502229494e-05, "loss": 2.7961, "num_input_tokens_seen": 1245184000, "step": 2375 }, { "epoch": 0.11542122077339492, "grad_norm": 0.2421875, "learning_rate": 4.941720675699573e-05, "loss": 2.7909, "num_input_tokens_seen": 1247805440, "step": 2380 }, { "epoch": 0.11566370232964156, "grad_norm": 0.251953125, "learning_rate": 4.9412895584080766e-05, "loss": 2.8032, "num_input_tokens_seen": 1250426880, "step": 2385 }, { "epoch": 0.11590618388588818, "grad_norm": 0.2490234375, "learning_rate": 4.940856871352121e-05, "loss": 2.8052, "num_input_tokens_seen": 1253048320, "step": 2390 }, { "epoch": 0.11614866544213481, "grad_norm": 0.2451171875, "learning_rate": 4.9404226148099274e-05, "loss": 2.7942, "num_input_tokens_seen": 1255669760, "step": 2395 }, { "epoch": 0.11639114699838143, "grad_norm": 0.25, "learning_rate": 4.9399867890607254e-05, "loss": 2.7995, "num_input_tokens_seen": 1258291200, "step": 2400 }, { "epoch": 0.11639114699838143, "eval_accuracy": 0.4522162514248494, "eval_loss": 2.7690587043762207, "eval_runtime": 6.2539, "eval_samples_per_second": 47.97, "eval_steps_per_second": 6.076, "num_input_tokens_seen": 1258291200, "step": 2400 }, { "epoch": 0.11663362855462807, "grad_norm": 0.25390625, "learning_rate": 4.939549394384754e-05, "loss": 2.7948, "num_input_tokens_seen": 1260912640, "step": 2405 }, { "epoch": 0.11687611011087469, "grad_norm": 0.255859375, "learning_rate": 4.939110431063258e-05, "loss": 2.8082, "num_input_tokens_seen": 1263534080, "step": 2410 }, { "epoch": 0.11711859166712132, "grad_norm": 0.25, "learning_rate": 4.9386698993784984e-05, "loss": 2.7991, "num_input_tokens_seen": 1266155520, "step": 2415 }, { "epoch": 0.11736107322336795, "grad_norm": 0.2470703125, "learning_rate": 4.938227799613736e-05, "loss": 2.8032, "num_input_tokens_seen": 1268776960, "step": 2420 }, { "epoch": 0.11760355477961458, "grad_norm": 0.259765625, "learning_rate": 4.937784132053245e-05, "loss": 2.8013, "num_input_tokens_seen": 1271398400, "step": 2425 }, { "epoch": 0.1178460363358612, "grad_norm": 0.2578125, "learning_rate": 4.937338896982306e-05, "loss": 2.8085, "num_input_tokens_seen": 1274019840, "step": 2430 }, { "epoch": 0.11808851789210784, "grad_norm": 0.248046875, "learning_rate": 4.936892094687209e-05, "loss": 2.8037, "num_input_tokens_seen": 1276641280, "step": 2435 }, { "epoch": 0.11833099944835446, "grad_norm": 0.251953125, "learning_rate": 4.9364437254552495e-05, "loss": 2.8014, "num_input_tokens_seen": 1279262720, "step": 2440 }, { "epoch": 0.11857348100460109, "grad_norm": 0.2451171875, "learning_rate": 4.935993789574733e-05, "loss": 2.7895, "num_input_tokens_seen": 1281884160, "step": 2445 }, { "epoch": 0.11881596256084771, "grad_norm": 0.251953125, "learning_rate": 4.93554228733497e-05, "loss": 2.7933, "num_input_tokens_seen": 1284505600, "step": 2450 }, { "epoch": 0.11905844411709435, "grad_norm": 0.2578125, "learning_rate": 4.935089219026279e-05, "loss": 2.8195, "num_input_tokens_seen": 1287127040, "step": 2455 }, { "epoch": 0.11930092567334097, "grad_norm": 0.2578125, "learning_rate": 4.9346345849399864e-05, "loss": 2.8013, "num_input_tokens_seen": 1289748480, "step": 2460 }, { "epoch": 0.1195434072295876, "grad_norm": 0.248046875, "learning_rate": 4.9341783853684246e-05, "loss": 2.8148, "num_input_tokens_seen": 1292369920, "step": 2465 }, { "epoch": 0.11978588878583422, "grad_norm": 0.255859375, "learning_rate": 4.9337206206049325e-05, "loss": 2.8012, "num_input_tokens_seen": 1294991360, "step": 2470 }, { "epoch": 0.12002837034208086, "grad_norm": 0.25390625, "learning_rate": 4.933261290943856e-05, "loss": 2.7881, "num_input_tokens_seen": 1297612800, "step": 2475 }, { "epoch": 0.12027085189832748, "grad_norm": 0.251953125, "learning_rate": 4.932800396680548e-05, "loss": 2.7861, "num_input_tokens_seen": 1300234240, "step": 2480 }, { "epoch": 0.12051333345457411, "grad_norm": 0.2451171875, "learning_rate": 4.9323379381113644e-05, "loss": 2.7839, "num_input_tokens_seen": 1302855680, "step": 2485 }, { "epoch": 0.12075581501082074, "grad_norm": 0.25390625, "learning_rate": 4.93187391553367e-05, "loss": 2.793, "num_input_tokens_seen": 1305477120, "step": 2490 }, { "epoch": 0.12099829656706737, "grad_norm": 0.255859375, "learning_rate": 4.931408329245835e-05, "loss": 2.8094, "num_input_tokens_seen": 1308098560, "step": 2495 }, { "epoch": 0.12124077812331399, "grad_norm": 0.251953125, "learning_rate": 4.9309411795472327e-05, "loss": 2.7796, "num_input_tokens_seen": 1310720000, "step": 2500 }, { "epoch": 0.12148325967956063, "grad_norm": 0.255859375, "learning_rate": 4.930472466738244e-05, "loss": 2.7933, "num_input_tokens_seen": 1313341440, "step": 2505 }, { "epoch": 0.12172574123580725, "grad_norm": 0.251953125, "learning_rate": 4.930002191120254e-05, "loss": 2.7996, "num_input_tokens_seen": 1315962880, "step": 2510 }, { "epoch": 0.12196822279205388, "grad_norm": 0.255859375, "learning_rate": 4.9295303529956535e-05, "loss": 2.7956, "num_input_tokens_seen": 1318584320, "step": 2515 }, { "epoch": 0.1222107043483005, "grad_norm": 0.255859375, "learning_rate": 4.929056952667838e-05, "loss": 2.801, "num_input_tokens_seen": 1321205760, "step": 2520 }, { "epoch": 0.12245318590454714, "grad_norm": 0.248046875, "learning_rate": 4.928581990441204e-05, "loss": 2.7956, "num_input_tokens_seen": 1323827200, "step": 2525 }, { "epoch": 0.12269566746079376, "grad_norm": 0.244140625, "learning_rate": 4.928105466621157e-05, "loss": 2.803, "num_input_tokens_seen": 1326448640, "step": 2530 }, { "epoch": 0.1229381490170404, "grad_norm": 0.25, "learning_rate": 4.927627381514106e-05, "loss": 2.8075, "num_input_tokens_seen": 1329070080, "step": 2535 }, { "epoch": 0.12318063057328701, "grad_norm": 0.2578125, "learning_rate": 4.927147735427461e-05, "loss": 2.8163, "num_input_tokens_seen": 1331691520, "step": 2540 }, { "epoch": 0.12342311212953365, "grad_norm": 0.25390625, "learning_rate": 4.926666528669637e-05, "loss": 2.8067, "num_input_tokens_seen": 1334312960, "step": 2545 }, { "epoch": 0.12366559368578027, "grad_norm": 0.255859375, "learning_rate": 4.926183761550055e-05, "loss": 2.7921, "num_input_tokens_seen": 1336934400, "step": 2550 }, { "epoch": 0.1239080752420269, "grad_norm": 0.259765625, "learning_rate": 4.925699434379136e-05, "loss": 2.8004, "num_input_tokens_seen": 1339555840, "step": 2555 }, { "epoch": 0.12415055679827353, "grad_norm": 0.263671875, "learning_rate": 4.925213547468305e-05, "loss": 2.8096, "num_input_tokens_seen": 1342177280, "step": 2560 }, { "epoch": 0.12439303835452016, "grad_norm": 0.2490234375, "learning_rate": 4.924726101129991e-05, "loss": 2.8016, "num_input_tokens_seen": 1344798720, "step": 2565 }, { "epoch": 0.12463551991076678, "grad_norm": 0.255859375, "learning_rate": 4.924237095677625e-05, "loss": 2.7967, "num_input_tokens_seen": 1347420160, "step": 2570 }, { "epoch": 0.12487800146701342, "grad_norm": 0.255859375, "learning_rate": 4.923746531425641e-05, "loss": 2.811, "num_input_tokens_seen": 1350041600, "step": 2575 }, { "epoch": 0.12512048302326004, "grad_norm": 0.2578125, "learning_rate": 4.923254408689474e-05, "loss": 2.8002, "num_input_tokens_seen": 1352663040, "step": 2580 }, { "epoch": 0.12536296457950666, "grad_norm": 0.25, "learning_rate": 4.922760727785563e-05, "loss": 2.8053, "num_input_tokens_seen": 1355284480, "step": 2585 }, { "epoch": 0.1256054461357533, "grad_norm": 0.2490234375, "learning_rate": 4.922265489031346e-05, "loss": 2.8038, "num_input_tokens_seen": 1357905920, "step": 2590 }, { "epoch": 0.12584792769199993, "grad_norm": 0.2470703125, "learning_rate": 4.9217686927452664e-05, "loss": 2.7968, "num_input_tokens_seen": 1360527360, "step": 2595 }, { "epoch": 0.12609040924824655, "grad_norm": 0.24609375, "learning_rate": 4.9212703392467667e-05, "loss": 2.7843, "num_input_tokens_seen": 1363148800, "step": 2600 }, { "epoch": 0.12633289080449317, "grad_norm": 0.25390625, "learning_rate": 4.920770428856292e-05, "loss": 2.8016, "num_input_tokens_seen": 1365770240, "step": 2605 }, { "epoch": 0.12657537236073982, "grad_norm": 0.26171875, "learning_rate": 4.9202689618952866e-05, "loss": 2.7902, "num_input_tokens_seen": 1368391680, "step": 2610 }, { "epoch": 0.12681785391698644, "grad_norm": 0.2578125, "learning_rate": 4.9197659386861976e-05, "loss": 2.8136, "num_input_tokens_seen": 1371013120, "step": 2615 }, { "epoch": 0.12706033547323306, "grad_norm": 0.2470703125, "learning_rate": 4.9192613595524724e-05, "loss": 2.8071, "num_input_tokens_seen": 1373634560, "step": 2620 }, { "epoch": 0.1273028170294797, "grad_norm": 0.2578125, "learning_rate": 4.918755224818558e-05, "loss": 2.8067, "num_input_tokens_seen": 1376256000, "step": 2625 }, { "epoch": 0.12754529858572633, "grad_norm": 0.244140625, "learning_rate": 4.918247534809902e-05, "loss": 2.7912, "num_input_tokens_seen": 1378877440, "step": 2630 }, { "epoch": 0.12778778014197295, "grad_norm": 0.2421875, "learning_rate": 4.9177382898529534e-05, "loss": 2.7971, "num_input_tokens_seen": 1381498880, "step": 2635 }, { "epoch": 0.12803026169821957, "grad_norm": 0.25390625, "learning_rate": 4.917227490275158e-05, "loss": 2.7934, "num_input_tokens_seen": 1384120320, "step": 2640 }, { "epoch": 0.12827274325446622, "grad_norm": 0.263671875, "learning_rate": 4.916715136404964e-05, "loss": 2.7972, "num_input_tokens_seen": 1386741760, "step": 2645 }, { "epoch": 0.12851522481071284, "grad_norm": 0.2578125, "learning_rate": 4.91620122857182e-05, "loss": 2.8053, "num_input_tokens_seen": 1389363200, "step": 2650 }, { "epoch": 0.12875770636695946, "grad_norm": 0.2578125, "learning_rate": 4.9156857671061696e-05, "loss": 2.7921, "num_input_tokens_seen": 1391984640, "step": 2655 }, { "epoch": 0.12900018792320608, "grad_norm": 0.255859375, "learning_rate": 4.9151687523394584e-05, "loss": 2.8009, "num_input_tokens_seen": 1394606080, "step": 2660 }, { "epoch": 0.12924266947945273, "grad_norm": 0.255859375, "learning_rate": 4.91465018460413e-05, "loss": 2.8027, "num_input_tokens_seen": 1397227520, "step": 2665 }, { "epoch": 0.12948515103569935, "grad_norm": 0.259765625, "learning_rate": 4.914130064233627e-05, "loss": 2.8085, "num_input_tokens_seen": 1399848960, "step": 2670 }, { "epoch": 0.12972763259194597, "grad_norm": 0.251953125, "learning_rate": 4.91360839156239e-05, "loss": 2.7943, "num_input_tokens_seen": 1402470400, "step": 2675 }, { "epoch": 0.1299701141481926, "grad_norm": 0.248046875, "learning_rate": 4.9130851669258574e-05, "loss": 2.8179, "num_input_tokens_seen": 1405091840, "step": 2680 }, { "epoch": 0.13021259570443924, "grad_norm": 0.25390625, "learning_rate": 4.9125603906604664e-05, "loss": 2.8101, "num_input_tokens_seen": 1407713280, "step": 2685 }, { "epoch": 0.13045507726068586, "grad_norm": 0.251953125, "learning_rate": 4.912034063103651e-05, "loss": 2.7838, "num_input_tokens_seen": 1410334720, "step": 2690 }, { "epoch": 0.13069755881693249, "grad_norm": 0.251953125, "learning_rate": 4.911506184593844e-05, "loss": 2.8124, "num_input_tokens_seen": 1412956160, "step": 2695 }, { "epoch": 0.1309400403731791, "grad_norm": 0.271484375, "learning_rate": 4.910976755470473e-05, "loss": 2.8006, "num_input_tokens_seen": 1415577600, "step": 2700 }, { "epoch": 0.1309400403731791, "eval_accuracy": 0.4526038104543234, "eval_loss": 2.7656688690185547, "eval_runtime": 5.786, "eval_samples_per_second": 51.849, "eval_steps_per_second": 6.568, "num_input_tokens_seen": 1415577600, "step": 2700 }, { "epoch": 0.13118252192942575, "grad_norm": 0.2578125, "learning_rate": 4.910445776073966e-05, "loss": 2.796, "num_input_tokens_seen": 1418199040, "step": 2705 }, { "epoch": 0.13142500348567238, "grad_norm": 0.26171875, "learning_rate": 4.909913246745745e-05, "loss": 2.805, "num_input_tokens_seen": 1420820480, "step": 2710 }, { "epoch": 0.131667485041919, "grad_norm": 0.251953125, "learning_rate": 4.909379167828231e-05, "loss": 2.7952, "num_input_tokens_seen": 1423441920, "step": 2715 }, { "epoch": 0.13190996659816562, "grad_norm": 0.25, "learning_rate": 4.9088435396648383e-05, "loss": 2.8165, "num_input_tokens_seen": 1426063360, "step": 2720 }, { "epoch": 0.13215244815441227, "grad_norm": 0.271484375, "learning_rate": 4.908306362599979e-05, "loss": 2.7988, "num_input_tokens_seen": 1428684800, "step": 2725 }, { "epoch": 0.1323949297106589, "grad_norm": 0.25, "learning_rate": 4.907767636979063e-05, "loss": 2.7978, "num_input_tokens_seen": 1431306240, "step": 2730 }, { "epoch": 0.1326374112669055, "grad_norm": 0.255859375, "learning_rate": 4.907227363148493e-05, "loss": 2.8041, "num_input_tokens_seen": 1433927680, "step": 2735 }, { "epoch": 0.13287989282315213, "grad_norm": 0.2578125, "learning_rate": 4.90668554145567e-05, "loss": 2.8011, "num_input_tokens_seen": 1436549120, "step": 2740 }, { "epoch": 0.13312237437939878, "grad_norm": 0.25390625, "learning_rate": 4.9061421722489866e-05, "loss": 2.8083, "num_input_tokens_seen": 1439170560, "step": 2745 }, { "epoch": 0.1333648559356454, "grad_norm": 0.2578125, "learning_rate": 4.905597255877834e-05, "loss": 2.7869, "num_input_tokens_seen": 1441792000, "step": 2750 }, { "epoch": 0.13360733749189202, "grad_norm": 0.2490234375, "learning_rate": 4.905050792692596e-05, "loss": 2.8159, "num_input_tokens_seen": 1444413440, "step": 2755 }, { "epoch": 0.13384981904813864, "grad_norm": 0.255859375, "learning_rate": 4.9045027830446534e-05, "loss": 2.7949, "num_input_tokens_seen": 1447034880, "step": 2760 }, { "epoch": 0.1340923006043853, "grad_norm": 0.26171875, "learning_rate": 4.903953227286378e-05, "loss": 2.8046, "num_input_tokens_seen": 1449656320, "step": 2765 }, { "epoch": 0.1343347821606319, "grad_norm": 0.25390625, "learning_rate": 4.903402125771139e-05, "loss": 2.8053, "num_input_tokens_seen": 1452277760, "step": 2770 }, { "epoch": 0.13457726371687853, "grad_norm": 0.255859375, "learning_rate": 4.9028494788532966e-05, "loss": 2.8006, "num_input_tokens_seen": 1454899200, "step": 2775 }, { "epoch": 0.13481974527312515, "grad_norm": 0.26171875, "learning_rate": 4.902295286888208e-05, "loss": 2.7883, "num_input_tokens_seen": 1457520640, "step": 2780 }, { "epoch": 0.1350622268293718, "grad_norm": 0.263671875, "learning_rate": 4.9017395502322206e-05, "loss": 2.7899, "num_input_tokens_seen": 1460142080, "step": 2785 }, { "epoch": 0.13530470838561842, "grad_norm": 0.2578125, "learning_rate": 4.9011822692426765e-05, "loss": 2.8109, "num_input_tokens_seen": 1462763520, "step": 2790 }, { "epoch": 0.13554718994186504, "grad_norm": 0.255859375, "learning_rate": 4.900623444277913e-05, "loss": 2.7942, "num_input_tokens_seen": 1465384960, "step": 2795 }, { "epoch": 0.13578967149811166, "grad_norm": 0.267578125, "learning_rate": 4.900063075697256e-05, "loss": 2.8041, "num_input_tokens_seen": 1468006400, "step": 2800 }, { "epoch": 0.1360321530543583, "grad_norm": 0.263671875, "learning_rate": 4.899501163861026e-05, "loss": 2.801, "num_input_tokens_seen": 1470627840, "step": 2805 }, { "epoch": 0.13627463461060493, "grad_norm": 0.255859375, "learning_rate": 4.898937709130537e-05, "loss": 2.7967, "num_input_tokens_seen": 1473249280, "step": 2810 }, { "epoch": 0.13651711616685155, "grad_norm": 0.25, "learning_rate": 4.8983727118680934e-05, "loss": 2.7884, "num_input_tokens_seen": 1475870720, "step": 2815 }, { "epoch": 0.13675959772309818, "grad_norm": 0.25, "learning_rate": 4.897806172436991e-05, "loss": 2.7912, "num_input_tokens_seen": 1478492160, "step": 2820 }, { "epoch": 0.13700207927934482, "grad_norm": 0.26171875, "learning_rate": 4.89723809120152e-05, "loss": 2.7917, "num_input_tokens_seen": 1481113600, "step": 2825 }, { "epoch": 0.13724456083559144, "grad_norm": 0.255859375, "learning_rate": 4.8966684685269586e-05, "loss": 2.7988, "num_input_tokens_seen": 1483735040, "step": 2830 }, { "epoch": 0.13748704239183807, "grad_norm": 0.271484375, "learning_rate": 4.8960973047795786e-05, "loss": 2.794, "num_input_tokens_seen": 1486356480, "step": 2835 }, { "epoch": 0.1377295239480847, "grad_norm": 0.26171875, "learning_rate": 4.895524600326642e-05, "loss": 2.7972, "num_input_tokens_seen": 1488977920, "step": 2840 }, { "epoch": 0.13797200550433134, "grad_norm": 0.255859375, "learning_rate": 4.894950355536401e-05, "loss": 2.7936, "num_input_tokens_seen": 1491599360, "step": 2845 }, { "epoch": 0.13821448706057796, "grad_norm": 0.25, "learning_rate": 4.894374570778099e-05, "loss": 2.7919, "num_input_tokens_seen": 1494220800, "step": 2850 }, { "epoch": 0.13845696861682458, "grad_norm": 0.255859375, "learning_rate": 4.893797246421968e-05, "loss": 2.7955, "num_input_tokens_seen": 1496842240, "step": 2855 }, { "epoch": 0.1386994501730712, "grad_norm": 0.251953125, "learning_rate": 4.893218382839232e-05, "loss": 2.8069, "num_input_tokens_seen": 1499463680, "step": 2860 }, { "epoch": 0.13894193172931785, "grad_norm": 0.2451171875, "learning_rate": 4.8926379804021037e-05, "loss": 2.7934, "num_input_tokens_seen": 1502085120, "step": 2865 }, { "epoch": 0.13918441328556447, "grad_norm": 0.255859375, "learning_rate": 4.892056039483787e-05, "loss": 2.7899, "num_input_tokens_seen": 1504706560, "step": 2870 }, { "epoch": 0.1394268948418111, "grad_norm": 0.267578125, "learning_rate": 4.891472560458471e-05, "loss": 2.7911, "num_input_tokens_seen": 1507328000, "step": 2875 }, { "epoch": 0.1396693763980577, "grad_norm": 0.2470703125, "learning_rate": 4.890887543701338e-05, "loss": 2.7826, "num_input_tokens_seen": 1509949440, "step": 2880 }, { "epoch": 0.13991185795430436, "grad_norm": 0.255859375, "learning_rate": 4.890300989588557e-05, "loss": 2.7796, "num_input_tokens_seen": 1512570880, "step": 2885 }, { "epoch": 0.14015433951055098, "grad_norm": 0.25, "learning_rate": 4.889712898497286e-05, "loss": 2.7972, "num_input_tokens_seen": 1515192320, "step": 2890 }, { "epoch": 0.1403968210667976, "grad_norm": 0.25, "learning_rate": 4.889123270805671e-05, "loss": 2.7947, "num_input_tokens_seen": 1517813760, "step": 2895 }, { "epoch": 0.14063930262304422, "grad_norm": 0.267578125, "learning_rate": 4.888532106892847e-05, "loss": 2.8126, "num_input_tokens_seen": 1520435200, "step": 2900 }, { "epoch": 0.14088178417929087, "grad_norm": 0.2578125, "learning_rate": 4.8879394071389355e-05, "loss": 2.8139, "num_input_tokens_seen": 1523056640, "step": 2905 }, { "epoch": 0.1411242657355375, "grad_norm": 0.26171875, "learning_rate": 4.887345171925046e-05, "loss": 2.8028, "num_input_tokens_seen": 1525678080, "step": 2910 }, { "epoch": 0.1413667472917841, "grad_norm": 0.2578125, "learning_rate": 4.886749401633276e-05, "loss": 2.7921, "num_input_tokens_seen": 1528299520, "step": 2915 }, { "epoch": 0.14160922884803076, "grad_norm": 0.259765625, "learning_rate": 4.88615209664671e-05, "loss": 2.7959, "num_input_tokens_seen": 1530920960, "step": 2920 }, { "epoch": 0.14185171040427738, "grad_norm": 0.26953125, "learning_rate": 4.8855532573494175e-05, "loss": 2.7985, "num_input_tokens_seen": 1533542400, "step": 2925 }, { "epoch": 0.142094191960524, "grad_norm": 0.2578125, "learning_rate": 4.8849528841264555e-05, "loss": 2.7886, "num_input_tokens_seen": 1536163840, "step": 2930 }, { "epoch": 0.14233667351677062, "grad_norm": 0.259765625, "learning_rate": 4.884350977363871e-05, "loss": 2.8094, "num_input_tokens_seen": 1538785280, "step": 2935 }, { "epoch": 0.14257915507301727, "grad_norm": 0.259765625, "learning_rate": 4.88374753744869e-05, "loss": 2.7781, "num_input_tokens_seen": 1541406720, "step": 2940 }, { "epoch": 0.1428216366292639, "grad_norm": 0.2470703125, "learning_rate": 4.88314256476893e-05, "loss": 2.7846, "num_input_tokens_seen": 1544028160, "step": 2945 }, { "epoch": 0.1430641181855105, "grad_norm": 0.251953125, "learning_rate": 4.882536059713592e-05, "loss": 2.7976, "num_input_tokens_seen": 1546649600, "step": 2950 }, { "epoch": 0.14330659974175713, "grad_norm": 0.248046875, "learning_rate": 4.8819280226726624e-05, "loss": 2.7954, "num_input_tokens_seen": 1549271040, "step": 2955 }, { "epoch": 0.14354908129800378, "grad_norm": 0.25, "learning_rate": 4.8813184540371125e-05, "loss": 2.7906, "num_input_tokens_seen": 1551892480, "step": 2960 }, { "epoch": 0.1437915628542504, "grad_norm": 0.25390625, "learning_rate": 4.8807073541989e-05, "loss": 2.7971, "num_input_tokens_seen": 1554513920, "step": 2965 }, { "epoch": 0.14403404441049703, "grad_norm": 0.26953125, "learning_rate": 4.880094723550965e-05, "loss": 2.8003, "num_input_tokens_seen": 1557135360, "step": 2970 }, { "epoch": 0.14427652596674365, "grad_norm": 0.28515625, "learning_rate": 4.879480562487232e-05, "loss": 2.7965, "num_input_tokens_seen": 1559756800, "step": 2975 }, { "epoch": 0.1445190075229903, "grad_norm": 0.26171875, "learning_rate": 4.878864871402612e-05, "loss": 2.8034, "num_input_tokens_seen": 1562378240, "step": 2980 }, { "epoch": 0.14476148907923692, "grad_norm": 0.25, "learning_rate": 4.878247650692998e-05, "loss": 2.7928, "num_input_tokens_seen": 1564999680, "step": 2985 }, { "epoch": 0.14500397063548354, "grad_norm": 0.255859375, "learning_rate": 4.877628900755265e-05, "loss": 2.7956, "num_input_tokens_seen": 1567621120, "step": 2990 }, { "epoch": 0.14524645219173016, "grad_norm": 0.2470703125, "learning_rate": 4.8770086219872756e-05, "loss": 2.7934, "num_input_tokens_seen": 1570242560, "step": 2995 }, { "epoch": 0.1454889337479768, "grad_norm": 0.2490234375, "learning_rate": 4.876386814787871e-05, "loss": 2.7886, "num_input_tokens_seen": 1572864000, "step": 3000 }, { "epoch": 0.1454889337479768, "eval_accuracy": 0.45276827878195736, "eval_loss": 2.763141393661499, "eval_runtime": 5.8531, "eval_samples_per_second": 51.255, "eval_steps_per_second": 6.492, "num_input_tokens_seen": 1572864000, "step": 3000 }, { "epoch": 0.14573141530422343, "grad_norm": 0.255859375, "learning_rate": 4.875763479556879e-05, "loss": 2.7761, "num_input_tokens_seen": 1575485440, "step": 3005 }, { "epoch": 0.14597389686047005, "grad_norm": 0.263671875, "learning_rate": 4.8751386166951065e-05, "loss": 2.7871, "num_input_tokens_seen": 1578106880, "step": 3010 }, { "epoch": 0.14621637841671667, "grad_norm": 0.251953125, "learning_rate": 4.874512226604344e-05, "loss": 2.7915, "num_input_tokens_seen": 1580728320, "step": 3015 }, { "epoch": 0.14645885997296332, "grad_norm": 0.259765625, "learning_rate": 4.8738843096873646e-05, "loss": 2.7887, "num_input_tokens_seen": 1583349760, "step": 3020 }, { "epoch": 0.14670134152920994, "grad_norm": 0.265625, "learning_rate": 4.873254866347924e-05, "loss": 2.7887, "num_input_tokens_seen": 1585971200, "step": 3025 }, { "epoch": 0.14694382308545656, "grad_norm": 0.25390625, "learning_rate": 4.872623896990757e-05, "loss": 2.8081, "num_input_tokens_seen": 1588592640, "step": 3030 }, { "epoch": 0.14718630464170318, "grad_norm": 0.2490234375, "learning_rate": 4.871991402021581e-05, "loss": 2.7981, "num_input_tokens_seen": 1591214080, "step": 3035 }, { "epoch": 0.14742878619794983, "grad_norm": 0.251953125, "learning_rate": 4.871357381847094e-05, "loss": 2.7952, "num_input_tokens_seen": 1593835520, "step": 3040 }, { "epoch": 0.14767126775419645, "grad_norm": 0.255859375, "learning_rate": 4.870721836874976e-05, "loss": 2.7912, "num_input_tokens_seen": 1596456960, "step": 3045 }, { "epoch": 0.14791374931044307, "grad_norm": 0.25, "learning_rate": 4.870084767513885e-05, "loss": 2.791, "num_input_tokens_seen": 1599078400, "step": 3050 }, { "epoch": 0.1481562308666897, "grad_norm": 0.2470703125, "learning_rate": 4.869446174173462e-05, "loss": 2.7779, "num_input_tokens_seen": 1601699840, "step": 3055 }, { "epoch": 0.14839871242293634, "grad_norm": 0.251953125, "learning_rate": 4.8688060572643254e-05, "loss": 2.7998, "num_input_tokens_seen": 1604321280, "step": 3060 }, { "epoch": 0.14864119397918296, "grad_norm": 0.2470703125, "learning_rate": 4.868164417198074e-05, "loss": 2.7951, "num_input_tokens_seen": 1606942720, "step": 3065 }, { "epoch": 0.14888367553542958, "grad_norm": 0.26171875, "learning_rate": 4.867521254387289e-05, "loss": 2.7777, "num_input_tokens_seen": 1609564160, "step": 3070 }, { "epoch": 0.1491261570916762, "grad_norm": 0.25, "learning_rate": 4.866876569245524e-05, "loss": 2.8008, "num_input_tokens_seen": 1612185600, "step": 3075 }, { "epoch": 0.14936863864792285, "grad_norm": 0.251953125, "learning_rate": 4.86623036218732e-05, "loss": 2.7931, "num_input_tokens_seen": 1614807040, "step": 3080 }, { "epoch": 0.14961112020416947, "grad_norm": 0.244140625, "learning_rate": 4.8655826336281886e-05, "loss": 2.7892, "num_input_tokens_seen": 1617428480, "step": 3085 }, { "epoch": 0.1498536017604161, "grad_norm": 0.2578125, "learning_rate": 4.864933383984625e-05, "loss": 2.8002, "num_input_tokens_seen": 1620049920, "step": 3090 }, { "epoch": 0.15009608331666272, "grad_norm": 0.25390625, "learning_rate": 4.864282613674101e-05, "loss": 2.7988, "num_input_tokens_seen": 1622671360, "step": 3095 }, { "epoch": 0.15033856487290936, "grad_norm": 0.2470703125, "learning_rate": 4.863630323115065e-05, "loss": 2.782, "num_input_tokens_seen": 1625292800, "step": 3100 }, { "epoch": 0.15058104642915598, "grad_norm": 0.255859375, "learning_rate": 4.862976512726944e-05, "loss": 2.7963, "num_input_tokens_seen": 1627914240, "step": 3105 }, { "epoch": 0.1508235279854026, "grad_norm": 0.271484375, "learning_rate": 4.862321182930143e-05, "loss": 2.7979, "num_input_tokens_seen": 1630535680, "step": 3110 }, { "epoch": 0.15106600954164923, "grad_norm": 0.25, "learning_rate": 4.861664334146043e-05, "loss": 2.8031, "num_input_tokens_seen": 1633157120, "step": 3115 }, { "epoch": 0.15130849109789588, "grad_norm": 0.259765625, "learning_rate": 4.861005966797002e-05, "loss": 2.7898, "num_input_tokens_seen": 1635778560, "step": 3120 }, { "epoch": 0.1515509726541425, "grad_norm": 0.25390625, "learning_rate": 4.860346081306353e-05, "loss": 2.786, "num_input_tokens_seen": 1638400000, "step": 3125 }, { "epoch": 0.15179345421038912, "grad_norm": 0.2470703125, "learning_rate": 4.859684678098407e-05, "loss": 2.7989, "num_input_tokens_seen": 1641021440, "step": 3130 }, { "epoch": 0.15203593576663574, "grad_norm": 0.259765625, "learning_rate": 4.859021757598452e-05, "loss": 2.8032, "num_input_tokens_seen": 1643642880, "step": 3135 }, { "epoch": 0.1522784173228824, "grad_norm": 0.25390625, "learning_rate": 4.858357320232749e-05, "loss": 2.7905, "num_input_tokens_seen": 1646264320, "step": 3140 }, { "epoch": 0.152520898879129, "grad_norm": 0.259765625, "learning_rate": 4.8576913664285346e-05, "loss": 2.788, "num_input_tokens_seen": 1648885760, "step": 3145 }, { "epoch": 0.15276338043537563, "grad_norm": 0.27734375, "learning_rate": 4.8570238966140215e-05, "loss": 2.8114, "num_input_tokens_seen": 1651507200, "step": 3150 }, { "epoch": 0.15300586199162225, "grad_norm": 0.26171875, "learning_rate": 4.856354911218398e-05, "loss": 2.8087, "num_input_tokens_seen": 1654128640, "step": 3155 }, { "epoch": 0.1532483435478689, "grad_norm": 0.267578125, "learning_rate": 4.855684410671825e-05, "loss": 2.7873, "num_input_tokens_seen": 1656750080, "step": 3160 }, { "epoch": 0.15349082510411552, "grad_norm": 0.24609375, "learning_rate": 4.855012395405439e-05, "loss": 2.7929, "num_input_tokens_seen": 1659371520, "step": 3165 }, { "epoch": 0.15373330666036214, "grad_norm": 0.26171875, "learning_rate": 4.85433886585135e-05, "loss": 2.8022, "num_input_tokens_seen": 1661992960, "step": 3170 }, { "epoch": 0.15397578821660876, "grad_norm": 0.2470703125, "learning_rate": 4.853663822442641e-05, "loss": 2.7892, "num_input_tokens_seen": 1664614400, "step": 3175 }, { "epoch": 0.1542182697728554, "grad_norm": 0.2490234375, "learning_rate": 4.8529872656133704e-05, "loss": 2.8025, "num_input_tokens_seen": 1667235840, "step": 3180 }, { "epoch": 0.15446075132910203, "grad_norm": 0.25390625, "learning_rate": 4.852309195798567e-05, "loss": 2.7959, "num_input_tokens_seen": 1669857280, "step": 3185 }, { "epoch": 0.15470323288534865, "grad_norm": 0.267578125, "learning_rate": 4.851629613434236e-05, "loss": 2.8077, "num_input_tokens_seen": 1672478720, "step": 3190 }, { "epoch": 0.15494571444159527, "grad_norm": 0.2734375, "learning_rate": 4.85094851895735e-05, "loss": 2.7869, "num_input_tokens_seen": 1675100160, "step": 3195 }, { "epoch": 0.15518819599784192, "grad_norm": 0.2490234375, "learning_rate": 4.8502659128058595e-05, "loss": 2.8001, "num_input_tokens_seen": 1677721600, "step": 3200 }, { "epoch": 0.15543067755408854, "grad_norm": 0.263671875, "learning_rate": 4.849581795418684e-05, "loss": 2.7906, "num_input_tokens_seen": 1680343040, "step": 3205 }, { "epoch": 0.15567315911033516, "grad_norm": 0.265625, "learning_rate": 4.8488961672357145e-05, "loss": 2.7865, "num_input_tokens_seen": 1682964480, "step": 3210 }, { "epoch": 0.1559156406665818, "grad_norm": 0.2578125, "learning_rate": 4.848209028697816e-05, "loss": 2.8061, "num_input_tokens_seen": 1685585920, "step": 3215 }, { "epoch": 0.15615812222282843, "grad_norm": 0.2470703125, "learning_rate": 4.847520380246821e-05, "loss": 2.7891, "num_input_tokens_seen": 1688207360, "step": 3220 }, { "epoch": 0.15640060377907505, "grad_norm": 0.248046875, "learning_rate": 4.846830222325536e-05, "loss": 2.8028, "num_input_tokens_seen": 1690828800, "step": 3225 }, { "epoch": 0.15664308533532167, "grad_norm": 0.259765625, "learning_rate": 4.846138555377735e-05, "loss": 2.7781, "num_input_tokens_seen": 1693450240, "step": 3230 }, { "epoch": 0.15688556689156832, "grad_norm": 0.2451171875, "learning_rate": 4.845445379848166e-05, "loss": 2.8013, "num_input_tokens_seen": 1696071680, "step": 3235 }, { "epoch": 0.15712804844781494, "grad_norm": 0.255859375, "learning_rate": 4.844750696182545e-05, "loss": 2.8074, "num_input_tokens_seen": 1698693120, "step": 3240 }, { "epoch": 0.15737053000406157, "grad_norm": 0.255859375, "learning_rate": 4.8440545048275573e-05, "loss": 2.8084, "num_input_tokens_seen": 1701314560, "step": 3245 }, { "epoch": 0.15761301156030819, "grad_norm": 0.2470703125, "learning_rate": 4.84335680623086e-05, "loss": 2.7899, "num_input_tokens_seen": 1703936000, "step": 3250 }, { "epoch": 0.15785549311655483, "grad_norm": 0.255859375, "learning_rate": 4.842657600841075e-05, "loss": 2.798, "num_input_tokens_seen": 1706557440, "step": 3255 }, { "epoch": 0.15809797467280146, "grad_norm": 0.251953125, "learning_rate": 4.841956889107797e-05, "loss": 2.7949, "num_input_tokens_seen": 1709178880, "step": 3260 }, { "epoch": 0.15834045622904808, "grad_norm": 0.2490234375, "learning_rate": 4.8412546714815885e-05, "loss": 2.7997, "num_input_tokens_seen": 1711800320, "step": 3265 }, { "epoch": 0.1585829377852947, "grad_norm": 0.2578125, "learning_rate": 4.840550948413979e-05, "loss": 2.7884, "num_input_tokens_seen": 1714421760, "step": 3270 }, { "epoch": 0.15882541934154135, "grad_norm": 0.259765625, "learning_rate": 4.839845720357467e-05, "loss": 2.8002, "num_input_tokens_seen": 1717043200, "step": 3275 }, { "epoch": 0.15906790089778797, "grad_norm": 0.2470703125, "learning_rate": 4.839138987765519e-05, "loss": 2.8018, "num_input_tokens_seen": 1719664640, "step": 3280 }, { "epoch": 0.1593103824540346, "grad_norm": 0.26171875, "learning_rate": 4.838430751092569e-05, "loss": 2.7909, "num_input_tokens_seen": 1722286080, "step": 3285 }, { "epoch": 0.1595528640102812, "grad_norm": 0.2490234375, "learning_rate": 4.837721010794016e-05, "loss": 2.7843, "num_input_tokens_seen": 1724907520, "step": 3290 }, { "epoch": 0.15979534556652786, "grad_norm": 0.265625, "learning_rate": 4.837009767326228e-05, "loss": 2.8041, "num_input_tokens_seen": 1727528960, "step": 3295 }, { "epoch": 0.16003782712277448, "grad_norm": 0.251953125, "learning_rate": 4.83629702114654e-05, "loss": 2.7907, "num_input_tokens_seen": 1730150400, "step": 3300 }, { "epoch": 0.16003782712277448, "eval_accuracy": 0.45322423058133854, "eval_loss": 2.760647773742676, "eval_runtime": 5.7789, "eval_samples_per_second": 51.913, "eval_steps_per_second": 6.576, "num_input_tokens_seen": 1730150400, "step": 3300 }, { "epoch": 0.1602803086790211, "grad_norm": 0.2578125, "learning_rate": 4.8355827727132516e-05, "loss": 2.7834, "num_input_tokens_seen": 1732771840, "step": 3305 }, { "epoch": 0.16052279023526772, "grad_norm": 0.25390625, "learning_rate": 4.8348670224856285e-05, "loss": 2.7893, "num_input_tokens_seen": 1735393280, "step": 3310 }, { "epoch": 0.16076527179151437, "grad_norm": 0.259765625, "learning_rate": 4.834149770923903e-05, "loss": 2.7888, "num_input_tokens_seen": 1738014720, "step": 3315 }, { "epoch": 0.161007753347761, "grad_norm": 0.248046875, "learning_rate": 4.833431018489273e-05, "loss": 2.7999, "num_input_tokens_seen": 1740636160, "step": 3320 }, { "epoch": 0.1612502349040076, "grad_norm": 0.251953125, "learning_rate": 4.8327107656439e-05, "loss": 2.7849, "num_input_tokens_seen": 1743257600, "step": 3325 }, { "epoch": 0.16149271646025423, "grad_norm": 0.244140625, "learning_rate": 4.831989012850912e-05, "loss": 2.7892, "num_input_tokens_seen": 1745879040, "step": 3330 }, { "epoch": 0.16173519801650088, "grad_norm": 0.26171875, "learning_rate": 4.831265760574398e-05, "loss": 2.7839, "num_input_tokens_seen": 1748500480, "step": 3335 }, { "epoch": 0.1619776795727475, "grad_norm": 0.255859375, "learning_rate": 4.830541009279417e-05, "loss": 2.8026, "num_input_tokens_seen": 1751121920, "step": 3340 }, { "epoch": 0.16222016112899412, "grad_norm": 0.259765625, "learning_rate": 4.829814759431987e-05, "loss": 2.7879, "num_input_tokens_seen": 1753743360, "step": 3345 }, { "epoch": 0.16246264268524074, "grad_norm": 0.26171875, "learning_rate": 4.829087011499091e-05, "loss": 2.7794, "num_input_tokens_seen": 1756364800, "step": 3350 }, { "epoch": 0.1627051242414874, "grad_norm": 0.25, "learning_rate": 4.8283577659486764e-05, "loss": 2.7939, "num_input_tokens_seen": 1758986240, "step": 3355 }, { "epoch": 0.162947605797734, "grad_norm": 0.255859375, "learning_rate": 4.827627023249652e-05, "loss": 2.7916, "num_input_tokens_seen": 1761607680, "step": 3360 }, { "epoch": 0.16319008735398063, "grad_norm": 0.251953125, "learning_rate": 4.8268947838718895e-05, "loss": 2.7954, "num_input_tokens_seen": 1764229120, "step": 3365 }, { "epoch": 0.16343256891022725, "grad_norm": 0.2734375, "learning_rate": 4.826161048286224e-05, "loss": 2.7845, "num_input_tokens_seen": 1766850560, "step": 3370 }, { "epoch": 0.1636750504664739, "grad_norm": 0.251953125, "learning_rate": 4.8254258169644526e-05, "loss": 2.8021, "num_input_tokens_seen": 1769472000, "step": 3375 }, { "epoch": 0.16391753202272052, "grad_norm": 0.267578125, "learning_rate": 4.824689090379333e-05, "loss": 2.7891, "num_input_tokens_seen": 1772093440, "step": 3380 }, { "epoch": 0.16416001357896715, "grad_norm": 0.255859375, "learning_rate": 4.8239508690045846e-05, "loss": 2.8082, "num_input_tokens_seen": 1774714880, "step": 3385 }, { "epoch": 0.16440249513521377, "grad_norm": 0.2578125, "learning_rate": 4.8232111533148895e-05, "loss": 2.7915, "num_input_tokens_seen": 1777336320, "step": 3390 }, { "epoch": 0.16464497669146042, "grad_norm": 0.255859375, "learning_rate": 4.822469943785888e-05, "loss": 2.804, "num_input_tokens_seen": 1779957760, "step": 3395 }, { "epoch": 0.16488745824770704, "grad_norm": 0.25, "learning_rate": 4.8217272408941835e-05, "loss": 2.7979, "num_input_tokens_seen": 1782579200, "step": 3400 }, { "epoch": 0.16512993980395366, "grad_norm": 0.25390625, "learning_rate": 4.820983045117339e-05, "loss": 2.7809, "num_input_tokens_seen": 1785200640, "step": 3405 }, { "epoch": 0.16537242136020028, "grad_norm": 0.2578125, "learning_rate": 4.820237356933876e-05, "loss": 2.7977, "num_input_tokens_seen": 1787822080, "step": 3410 }, { "epoch": 0.16561490291644693, "grad_norm": 0.26171875, "learning_rate": 4.819490176823277e-05, "loss": 2.7775, "num_input_tokens_seen": 1790443520, "step": 3415 }, { "epoch": 0.16585738447269355, "grad_norm": 0.259765625, "learning_rate": 4.8187415052659835e-05, "loss": 2.7965, "num_input_tokens_seen": 1793064960, "step": 3420 }, { "epoch": 0.16609986602894017, "grad_norm": 0.259765625, "learning_rate": 4.8179913427433965e-05, "loss": 2.7936, "num_input_tokens_seen": 1795686400, "step": 3425 }, { "epoch": 0.1663423475851868, "grad_norm": 0.26171875, "learning_rate": 4.8172396897378745e-05, "loss": 2.789, "num_input_tokens_seen": 1798307840, "step": 3430 }, { "epoch": 0.16658482914143344, "grad_norm": 0.25, "learning_rate": 4.816486546732736e-05, "loss": 2.7786, "num_input_tokens_seen": 1800929280, "step": 3435 }, { "epoch": 0.16682731069768006, "grad_norm": 0.251953125, "learning_rate": 4.815731914212256e-05, "loss": 2.8067, "num_input_tokens_seen": 1803550720, "step": 3440 }, { "epoch": 0.16706979225392668, "grad_norm": 0.25390625, "learning_rate": 4.8149757926616666e-05, "loss": 2.7945, "num_input_tokens_seen": 1806172160, "step": 3445 }, { "epoch": 0.1673122738101733, "grad_norm": 0.2421875, "learning_rate": 4.814218182567162e-05, "loss": 2.8073, "num_input_tokens_seen": 1808793600, "step": 3450 }, { "epoch": 0.16755475536641995, "grad_norm": 0.25, "learning_rate": 4.813459084415887e-05, "loss": 2.7857, "num_input_tokens_seen": 1811415040, "step": 3455 }, { "epoch": 0.16779723692266657, "grad_norm": 0.25390625, "learning_rate": 4.812698498695948e-05, "loss": 2.79, "num_input_tokens_seen": 1814036480, "step": 3460 }, { "epoch": 0.1680397184789132, "grad_norm": 0.25, "learning_rate": 4.811936425896406e-05, "loss": 2.8142, "num_input_tokens_seen": 1816657920, "step": 3465 }, { "epoch": 0.1682822000351598, "grad_norm": 0.25390625, "learning_rate": 4.811172866507279e-05, "loss": 2.7948, "num_input_tokens_seen": 1819279360, "step": 3470 }, { "epoch": 0.16852468159140646, "grad_norm": 0.2578125, "learning_rate": 4.8104078210195406e-05, "loss": 2.8081, "num_input_tokens_seen": 1821900800, "step": 3475 }, { "epoch": 0.16876716314765308, "grad_norm": 0.26953125, "learning_rate": 4.809641289925119e-05, "loss": 2.8015, "num_input_tokens_seen": 1824522240, "step": 3480 }, { "epoch": 0.1690096447038997, "grad_norm": 0.263671875, "learning_rate": 4.8088732737168986e-05, "loss": 2.783, "num_input_tokens_seen": 1827143680, "step": 3485 }, { "epoch": 0.16925212626014632, "grad_norm": 0.265625, "learning_rate": 4.808103772888719e-05, "loss": 2.7795, "num_input_tokens_seen": 1829765120, "step": 3490 }, { "epoch": 0.16949460781639297, "grad_norm": 0.25390625, "learning_rate": 4.807332787935374e-05, "loss": 2.7744, "num_input_tokens_seen": 1832386560, "step": 3495 }, { "epoch": 0.1697370893726396, "grad_norm": 0.267578125, "learning_rate": 4.8065603193526114e-05, "loss": 2.7926, "num_input_tokens_seen": 1835008000, "step": 3500 }, { "epoch": 0.16997957092888621, "grad_norm": 0.26171875, "learning_rate": 4.805786367637134e-05, "loss": 2.7882, "num_input_tokens_seen": 1837629440, "step": 3505 }, { "epoch": 0.17022205248513286, "grad_norm": 0.25390625, "learning_rate": 4.805010933286598e-05, "loss": 2.795, "num_input_tokens_seen": 1840250880, "step": 3510 }, { "epoch": 0.17046453404137948, "grad_norm": 0.2490234375, "learning_rate": 4.8042340167996105e-05, "loss": 2.7866, "num_input_tokens_seen": 1842872320, "step": 3515 }, { "epoch": 0.1707070155976261, "grad_norm": 0.251953125, "learning_rate": 4.803455618675736e-05, "loss": 2.7983, "num_input_tokens_seen": 1845493760, "step": 3520 }, { "epoch": 0.17094949715387273, "grad_norm": 0.24609375, "learning_rate": 4.802675739415488e-05, "loss": 2.7836, "num_input_tokens_seen": 1848115200, "step": 3525 }, { "epoch": 0.17119197871011937, "grad_norm": 0.24609375, "learning_rate": 4.801894379520333e-05, "loss": 2.7891, "num_input_tokens_seen": 1850736640, "step": 3530 }, { "epoch": 0.171434460266366, "grad_norm": 0.2490234375, "learning_rate": 4.801111539492692e-05, "loss": 2.7963, "num_input_tokens_seen": 1853358080, "step": 3535 }, { "epoch": 0.17167694182261262, "grad_norm": 0.259765625, "learning_rate": 4.800327219835936e-05, "loss": 2.7872, "num_input_tokens_seen": 1855979520, "step": 3540 }, { "epoch": 0.17191942337885924, "grad_norm": 0.275390625, "learning_rate": 4.7995414210543866e-05, "loss": 2.7856, "num_input_tokens_seen": 1858600960, "step": 3545 }, { "epoch": 0.17216190493510589, "grad_norm": 0.251953125, "learning_rate": 4.798754143653317e-05, "loss": 2.7839, "num_input_tokens_seen": 1861222400, "step": 3550 }, { "epoch": 0.1724043864913525, "grad_norm": 0.2490234375, "learning_rate": 4.797965388138953e-05, "loss": 2.7956, "num_input_tokens_seen": 1863843840, "step": 3555 }, { "epoch": 0.17264686804759913, "grad_norm": 0.25, "learning_rate": 4.7971751550184674e-05, "loss": 2.786, "num_input_tokens_seen": 1866465280, "step": 3560 }, { "epoch": 0.17288934960384575, "grad_norm": 0.251953125, "learning_rate": 4.796383444799987e-05, "loss": 2.7978, "num_input_tokens_seen": 1869086720, "step": 3565 }, { "epoch": 0.1731318311600924, "grad_norm": 0.259765625, "learning_rate": 4.795590257992584e-05, "loss": 2.8049, "num_input_tokens_seen": 1871708160, "step": 3570 }, { "epoch": 0.17337431271633902, "grad_norm": 0.2490234375, "learning_rate": 4.794795595106285e-05, "loss": 2.7942, "num_input_tokens_seen": 1874329600, "step": 3575 }, { "epoch": 0.17361679427258564, "grad_norm": 0.251953125, "learning_rate": 4.793999456652062e-05, "loss": 2.7885, "num_input_tokens_seen": 1876951040, "step": 3580 }, { "epoch": 0.17385927582883226, "grad_norm": 0.255859375, "learning_rate": 4.7932018431418366e-05, "loss": 2.8022, "num_input_tokens_seen": 1879572480, "step": 3585 }, { "epoch": 0.1741017573850789, "grad_norm": 0.26171875, "learning_rate": 4.792402755088481e-05, "loss": 2.7914, "num_input_tokens_seen": 1882193920, "step": 3590 }, { "epoch": 0.17434423894132553, "grad_norm": 0.275390625, "learning_rate": 4.791602193005812e-05, "loss": 2.7815, "num_input_tokens_seen": 1884815360, "step": 3595 }, { "epoch": 0.17458672049757215, "grad_norm": 0.26171875, "learning_rate": 4.7908001574085964e-05, "loss": 2.7907, "num_input_tokens_seen": 1887436800, "step": 3600 }, { "epoch": 0.17458672049757215, "eval_accuracy": 0.4535759648265755, "eval_loss": 2.7588062286376953, "eval_runtime": 5.9537, "eval_samples_per_second": 50.389, "eval_steps_per_second": 6.383, "num_input_tokens_seen": 1887436800, "step": 3600 }, { "epoch": 0.17482920205381877, "grad_norm": 0.255859375, "learning_rate": 4.789996648812548e-05, "loss": 2.792, "num_input_tokens_seen": 1890058240, "step": 3605 }, { "epoch": 0.17507168361006542, "grad_norm": 0.251953125, "learning_rate": 4.78919166773433e-05, "loss": 2.775, "num_input_tokens_seen": 1892679680, "step": 3610 }, { "epoch": 0.17531416516631204, "grad_norm": 0.25390625, "learning_rate": 4.7883852146915465e-05, "loss": 2.8045, "num_input_tokens_seen": 1895301120, "step": 3615 }, { "epoch": 0.17555664672255866, "grad_norm": 0.2578125, "learning_rate": 4.787577290202755e-05, "loss": 2.7993, "num_input_tokens_seen": 1897922560, "step": 3620 }, { "epoch": 0.17579912827880528, "grad_norm": 0.25390625, "learning_rate": 4.7867678947874546e-05, "loss": 2.7879, "num_input_tokens_seen": 1900544000, "step": 3625 }, { "epoch": 0.17604160983505193, "grad_norm": 0.251953125, "learning_rate": 4.785957028966092e-05, "loss": 2.7804, "num_input_tokens_seen": 1903165440, "step": 3630 }, { "epoch": 0.17628409139129855, "grad_norm": 0.263671875, "learning_rate": 4.785144693260059e-05, "loss": 2.8043, "num_input_tokens_seen": 1905786880, "step": 3635 }, { "epoch": 0.17652657294754517, "grad_norm": 0.2490234375, "learning_rate": 4.784330888191691e-05, "loss": 2.791, "num_input_tokens_seen": 1908408320, "step": 3640 }, { "epoch": 0.1767690545037918, "grad_norm": 0.25390625, "learning_rate": 4.783515614284273e-05, "loss": 2.7967, "num_input_tokens_seen": 1911029760, "step": 3645 }, { "epoch": 0.17701153606003844, "grad_norm": 0.25390625, "learning_rate": 4.782698872062028e-05, "loss": 2.7859, "num_input_tokens_seen": 1913651200, "step": 3650 }, { "epoch": 0.17725401761628506, "grad_norm": 0.259765625, "learning_rate": 4.7818806620501284e-05, "loss": 2.7977, "num_input_tokens_seen": 1916272640, "step": 3655 }, { "epoch": 0.17749649917253169, "grad_norm": 0.27734375, "learning_rate": 4.781060984774687e-05, "loss": 2.802, "num_input_tokens_seen": 1918894080, "step": 3660 }, { "epoch": 0.1777389807287783, "grad_norm": 0.259765625, "learning_rate": 4.780239840762763e-05, "loss": 2.7888, "num_input_tokens_seen": 1921515520, "step": 3665 }, { "epoch": 0.17798146228502496, "grad_norm": 0.2490234375, "learning_rate": 4.7794172305423554e-05, "loss": 2.7871, "num_input_tokens_seen": 1924136960, "step": 3670 }, { "epoch": 0.17822394384127158, "grad_norm": 0.2470703125, "learning_rate": 4.77859315464241e-05, "loss": 2.7989, "num_input_tokens_seen": 1926758400, "step": 3675 }, { "epoch": 0.1784664253975182, "grad_norm": 0.265625, "learning_rate": 4.7777676135928096e-05, "loss": 2.7838, "num_input_tokens_seen": 1929379840, "step": 3680 }, { "epoch": 0.17870890695376482, "grad_norm": 0.25390625, "learning_rate": 4.776940607924385e-05, "loss": 2.7912, "num_input_tokens_seen": 1932001280, "step": 3685 }, { "epoch": 0.17895138851001147, "grad_norm": 0.248046875, "learning_rate": 4.776112138168904e-05, "loss": 2.8105, "num_input_tokens_seen": 1934622720, "step": 3690 }, { "epoch": 0.1791938700662581, "grad_norm": 0.271484375, "learning_rate": 4.7752822048590805e-05, "loss": 2.7914, "num_input_tokens_seen": 1937244160, "step": 3695 }, { "epoch": 0.1794363516225047, "grad_norm": 0.255859375, "learning_rate": 4.7744508085285645e-05, "loss": 2.7828, "num_input_tokens_seen": 1939865600, "step": 3700 }, { "epoch": 0.17967883317875133, "grad_norm": 0.26171875, "learning_rate": 4.773617949711949e-05, "loss": 2.801, "num_input_tokens_seen": 1942487040, "step": 3705 }, { "epoch": 0.17992131473499798, "grad_norm": 0.25, "learning_rate": 4.7727836289447685e-05, "loss": 2.8001, "num_input_tokens_seen": 1945108480, "step": 3710 }, { "epoch": 0.1801637962912446, "grad_norm": 0.2490234375, "learning_rate": 4.771947846763496e-05, "loss": 2.7896, "num_input_tokens_seen": 1947729920, "step": 3715 }, { "epoch": 0.18040627784749122, "grad_norm": 0.255859375, "learning_rate": 4.7711106037055456e-05, "loss": 2.7967, "num_input_tokens_seen": 1950351360, "step": 3720 }, { "epoch": 0.18064875940373784, "grad_norm": 0.255859375, "learning_rate": 4.770271900309268e-05, "loss": 2.7843, "num_input_tokens_seen": 1952972800, "step": 3725 }, { "epoch": 0.1808912409599845, "grad_norm": 0.271484375, "learning_rate": 4.7694317371139556e-05, "loss": 2.7914, "num_input_tokens_seen": 1955594240, "step": 3730 }, { "epoch": 0.1811337225162311, "grad_norm": 0.25390625, "learning_rate": 4.768590114659839e-05, "loss": 2.7873, "num_input_tokens_seen": 1958215680, "step": 3735 }, { "epoch": 0.18137620407247773, "grad_norm": 0.251953125, "learning_rate": 4.767747033488087e-05, "loss": 2.8016, "num_input_tokens_seen": 1960837120, "step": 3740 }, { "epoch": 0.18161868562872435, "grad_norm": 0.255859375, "learning_rate": 4.766902494140805e-05, "loss": 2.7838, "num_input_tokens_seen": 1963458560, "step": 3745 }, { "epoch": 0.181861167184971, "grad_norm": 0.26171875, "learning_rate": 4.766056497161037e-05, "loss": 2.8001, "num_input_tokens_seen": 1966080000, "step": 3750 }, { "epoch": 0.18210364874121762, "grad_norm": 0.255859375, "learning_rate": 4.7652090430927656e-05, "loss": 2.7851, "num_input_tokens_seen": 1968701440, "step": 3755 }, { "epoch": 0.18234613029746424, "grad_norm": 0.259765625, "learning_rate": 4.7643601324809077e-05, "loss": 2.7882, "num_input_tokens_seen": 1971322880, "step": 3760 }, { "epoch": 0.18258861185371086, "grad_norm": 0.255859375, "learning_rate": 4.7635097658713195e-05, "loss": 2.8002, "num_input_tokens_seen": 1973944320, "step": 3765 }, { "epoch": 0.1828310934099575, "grad_norm": 0.248046875, "learning_rate": 4.762657943810791e-05, "loss": 2.7898, "num_input_tokens_seen": 1976565760, "step": 3770 }, { "epoch": 0.18307357496620413, "grad_norm": 0.2578125, "learning_rate": 4.76180466684705e-05, "loss": 2.8035, "num_input_tokens_seen": 1979187200, "step": 3775 }, { "epoch": 0.18331605652245075, "grad_norm": 0.25390625, "learning_rate": 4.760949935528758e-05, "loss": 2.785, "num_input_tokens_seen": 1981808640, "step": 3780 }, { "epoch": 0.18355853807869738, "grad_norm": 0.2470703125, "learning_rate": 4.7600937504055126e-05, "loss": 2.7879, "num_input_tokens_seen": 1984430080, "step": 3785 }, { "epoch": 0.18380101963494402, "grad_norm": 0.2451171875, "learning_rate": 4.759236112027847e-05, "loss": 2.7969, "num_input_tokens_seen": 1987051520, "step": 3790 }, { "epoch": 0.18404350119119064, "grad_norm": 0.2470703125, "learning_rate": 4.758377020947228e-05, "loss": 2.801, "num_input_tokens_seen": 1989672960, "step": 3795 }, { "epoch": 0.18428598274743727, "grad_norm": 0.265625, "learning_rate": 4.7575164777160555e-05, "loss": 2.7953, "num_input_tokens_seen": 1992294400, "step": 3800 }, { "epoch": 0.18452846430368391, "grad_norm": 0.251953125, "learning_rate": 4.756654482887665e-05, "loss": 2.7936, "num_input_tokens_seen": 1994915840, "step": 3805 }, { "epoch": 0.18477094585993054, "grad_norm": 0.267578125, "learning_rate": 4.7557910370163245e-05, "loss": 2.7776, "num_input_tokens_seen": 1997537280, "step": 3810 }, { "epoch": 0.18501342741617716, "grad_norm": 0.25390625, "learning_rate": 4.754926140657235e-05, "loss": 2.7977, "num_input_tokens_seen": 2000158720, "step": 3815 }, { "epoch": 0.18525590897242378, "grad_norm": 0.2890625, "learning_rate": 4.75405979436653e-05, "loss": 2.7957, "num_input_tokens_seen": 2002780160, "step": 3820 }, { "epoch": 0.18549839052867043, "grad_norm": 0.265625, "learning_rate": 4.753191998701276e-05, "loss": 2.7826, "num_input_tokens_seen": 2005401600, "step": 3825 }, { "epoch": 0.18574087208491705, "grad_norm": 0.265625, "learning_rate": 4.7523227542194714e-05, "loss": 2.8021, "num_input_tokens_seen": 2008023040, "step": 3830 }, { "epoch": 0.18598335364116367, "grad_norm": 0.259765625, "learning_rate": 4.751452061480045e-05, "loss": 2.7876, "num_input_tokens_seen": 2010644480, "step": 3835 }, { "epoch": 0.1862258351974103, "grad_norm": 0.251953125, "learning_rate": 4.750579921042858e-05, "loss": 2.7959, "num_input_tokens_seen": 2013265920, "step": 3840 }, { "epoch": 0.18646831675365694, "grad_norm": 0.263671875, "learning_rate": 4.749706333468702e-05, "loss": 2.7865, "num_input_tokens_seen": 2015887360, "step": 3845 }, { "epoch": 0.18671079830990356, "grad_norm": 0.255859375, "learning_rate": 4.7488312993193e-05, "loss": 2.7897, "num_input_tokens_seen": 2018508800, "step": 3850 }, { "epoch": 0.18695327986615018, "grad_norm": 0.259765625, "learning_rate": 4.747954819157303e-05, "loss": 2.7692, "num_input_tokens_seen": 2021130240, "step": 3855 }, { "epoch": 0.1871957614223968, "grad_norm": 0.267578125, "learning_rate": 4.747076893546294e-05, "loss": 2.7851, "num_input_tokens_seen": 2023751680, "step": 3860 }, { "epoch": 0.18743824297864345, "grad_norm": 0.251953125, "learning_rate": 4.746197523050785e-05, "loss": 2.7784, "num_input_tokens_seen": 2026373120, "step": 3865 }, { "epoch": 0.18768072453489007, "grad_norm": 0.2578125, "learning_rate": 4.745316708236217e-05, "loss": 2.78, "num_input_tokens_seen": 2028994560, "step": 3870 }, { "epoch": 0.1879232060911367, "grad_norm": 0.267578125, "learning_rate": 4.744434449668959e-05, "loss": 2.7942, "num_input_tokens_seen": 2031616000, "step": 3875 }, { "epoch": 0.1881656876473833, "grad_norm": 0.2490234375, "learning_rate": 4.7435507479163085e-05, "loss": 2.7872, "num_input_tokens_seen": 2034237440, "step": 3880 }, { "epoch": 0.18840816920362996, "grad_norm": 0.25, "learning_rate": 4.7426656035464915e-05, "loss": 2.7738, "num_input_tokens_seen": 2036858880, "step": 3885 }, { "epoch": 0.18865065075987658, "grad_norm": 0.24609375, "learning_rate": 4.7417790171286614e-05, "loss": 2.8005, "num_input_tokens_seen": 2039480320, "step": 3890 }, { "epoch": 0.1888931323161232, "grad_norm": 0.255859375, "learning_rate": 4.740890989232899e-05, "loss": 2.793, "num_input_tokens_seen": 2042101760, "step": 3895 }, { "epoch": 0.18913561387236982, "grad_norm": 0.2490234375, "learning_rate": 4.7400015204302105e-05, "loss": 2.7788, "num_input_tokens_seen": 2044723200, "step": 3900 }, { "epoch": 0.18913561387236982, "eval_accuracy": 0.4537322911577919, "eval_loss": 2.7569406032562256, "eval_runtime": 5.7814, "eval_samples_per_second": 51.891, "eval_steps_per_second": 6.573, "num_input_tokens_seen": 2044723200, "step": 3900 }, { "epoch": 0.18937809542861647, "grad_norm": 0.25390625, "learning_rate": 4.739110611292532e-05, "loss": 2.7841, "num_input_tokens_seen": 2047344640, "step": 3905 }, { "epoch": 0.1896205769848631, "grad_norm": 0.25390625, "learning_rate": 4.738218262392722e-05, "loss": 2.8006, "num_input_tokens_seen": 2049966080, "step": 3910 }, { "epoch": 0.1898630585411097, "grad_norm": 0.259765625, "learning_rate": 4.7373244743045676e-05, "loss": 2.7932, "num_input_tokens_seen": 2052587520, "step": 3915 }, { "epoch": 0.19010554009735633, "grad_norm": 0.26953125, "learning_rate": 4.736429247602778e-05, "loss": 2.7886, "num_input_tokens_seen": 2055208960, "step": 3920 }, { "epoch": 0.19034802165360298, "grad_norm": 0.25, "learning_rate": 4.735532582862993e-05, "loss": 2.7903, "num_input_tokens_seen": 2057830400, "step": 3925 }, { "epoch": 0.1905905032098496, "grad_norm": 0.2578125, "learning_rate": 4.734634480661771e-05, "loss": 2.7968, "num_input_tokens_seen": 2060451840, "step": 3930 }, { "epoch": 0.19083298476609623, "grad_norm": 0.25390625, "learning_rate": 4.733734941576598e-05, "loss": 2.7749, "num_input_tokens_seen": 2063073280, "step": 3935 }, { "epoch": 0.19107546632234285, "grad_norm": 0.251953125, "learning_rate": 4.732833966185883e-05, "loss": 2.7859, "num_input_tokens_seen": 2065694720, "step": 3940 }, { "epoch": 0.1913179478785895, "grad_norm": 0.251953125, "learning_rate": 4.73193155506896e-05, "loss": 2.7785, "num_input_tokens_seen": 2068316160, "step": 3945 }, { "epoch": 0.19156042943483612, "grad_norm": 0.265625, "learning_rate": 4.731027708806084e-05, "loss": 2.7971, "num_input_tokens_seen": 2070937600, "step": 3950 }, { "epoch": 0.19180291099108274, "grad_norm": 0.25, "learning_rate": 4.730122427978434e-05, "loss": 2.7928, "num_input_tokens_seen": 2073559040, "step": 3955 }, { "epoch": 0.19204539254732936, "grad_norm": 0.248046875, "learning_rate": 4.72921571316811e-05, "loss": 2.7878, "num_input_tokens_seen": 2076180480, "step": 3960 }, { "epoch": 0.192287874103576, "grad_norm": 0.25390625, "learning_rate": 4.7283075649581374e-05, "loss": 2.789, "num_input_tokens_seen": 2078801920, "step": 3965 }, { "epoch": 0.19253035565982263, "grad_norm": 0.25390625, "learning_rate": 4.727397983932461e-05, "loss": 2.7807, "num_input_tokens_seen": 2081423360, "step": 3970 }, { "epoch": 0.19277283721606925, "grad_norm": 0.25390625, "learning_rate": 4.726486970675945e-05, "loss": 2.7927, "num_input_tokens_seen": 2084044800, "step": 3975 }, { "epoch": 0.19301531877231587, "grad_norm": 0.25390625, "learning_rate": 4.725574525774379e-05, "loss": 2.7903, "num_input_tokens_seen": 2086666240, "step": 3980 }, { "epoch": 0.19325780032856252, "grad_norm": 0.2490234375, "learning_rate": 4.72466064981447e-05, "loss": 2.8101, "num_input_tokens_seen": 2089287680, "step": 3985 }, { "epoch": 0.19350028188480914, "grad_norm": 0.263671875, "learning_rate": 4.7237453433838445e-05, "loss": 2.7965, "num_input_tokens_seen": 2091909120, "step": 3990 }, { "epoch": 0.19374276344105576, "grad_norm": 0.26953125, "learning_rate": 4.7228286070710525e-05, "loss": 2.7921, "num_input_tokens_seen": 2094530560, "step": 3995 }, { "epoch": 0.19398524499730238, "grad_norm": 0.26171875, "learning_rate": 4.7219104414655595e-05, "loss": 2.7743, "num_input_tokens_seen": 2097152000, "step": 4000 }, { "epoch": 0.19422772655354903, "grad_norm": 0.2490234375, "learning_rate": 4.720990847157752e-05, "loss": 2.7748, "num_input_tokens_seen": 2099773440, "step": 4005 }, { "epoch": 0.19447020810979565, "grad_norm": 0.255859375, "learning_rate": 4.720069824738936e-05, "loss": 2.7936, "num_input_tokens_seen": 2102394880, "step": 4010 }, { "epoch": 0.19471268966604227, "grad_norm": 0.265625, "learning_rate": 4.719147374801335e-05, "loss": 2.7772, "num_input_tokens_seen": 2105016320, "step": 4015 }, { "epoch": 0.1949551712222889, "grad_norm": 0.255859375, "learning_rate": 4.718223497938088e-05, "loss": 2.7926, "num_input_tokens_seen": 2107637760, "step": 4020 }, { "epoch": 0.19519765277853554, "grad_norm": 0.267578125, "learning_rate": 4.717298194743254e-05, "loss": 2.7731, "num_input_tokens_seen": 2110259200, "step": 4025 }, { "epoch": 0.19544013433478216, "grad_norm": 0.255859375, "learning_rate": 4.71637146581181e-05, "loss": 2.7816, "num_input_tokens_seen": 2112880640, "step": 4030 }, { "epoch": 0.19568261589102878, "grad_norm": 0.2578125, "learning_rate": 4.715443311739648e-05, "loss": 2.7954, "num_input_tokens_seen": 2115502080, "step": 4035 }, { "epoch": 0.1959250974472754, "grad_norm": 0.255859375, "learning_rate": 4.714513733123577e-05, "loss": 2.7833, "num_input_tokens_seen": 2118123520, "step": 4040 }, { "epoch": 0.19616757900352205, "grad_norm": 0.25390625, "learning_rate": 4.713582730561321e-05, "loss": 2.7868, "num_input_tokens_seen": 2120744960, "step": 4045 }, { "epoch": 0.19641006055976867, "grad_norm": 0.263671875, "learning_rate": 4.712650304651521e-05, "loss": 2.7841, "num_input_tokens_seen": 2123366400, "step": 4050 }, { "epoch": 0.1966525421160153, "grad_norm": 0.275390625, "learning_rate": 4.7117164559937335e-05, "loss": 2.7979, "num_input_tokens_seen": 2125987840, "step": 4055 }, { "epoch": 0.19689502367226192, "grad_norm": 0.263671875, "learning_rate": 4.7107811851884284e-05, "loss": 2.777, "num_input_tokens_seen": 2128609280, "step": 4060 }, { "epoch": 0.19713750522850856, "grad_norm": 0.24609375, "learning_rate": 4.70984449283699e-05, "loss": 2.8029, "num_input_tokens_seen": 2131230720, "step": 4065 }, { "epoch": 0.19737998678475518, "grad_norm": 0.255859375, "learning_rate": 4.708906379541719e-05, "loss": 2.7869, "num_input_tokens_seen": 2133852160, "step": 4070 }, { "epoch": 0.1976224683410018, "grad_norm": 0.26171875, "learning_rate": 4.7079668459058256e-05, "loss": 2.7896, "num_input_tokens_seen": 2136473600, "step": 4075 }, { "epoch": 0.19786494989724845, "grad_norm": 0.259765625, "learning_rate": 4.7070258925334374e-05, "loss": 2.786, "num_input_tokens_seen": 2139095040, "step": 4080 }, { "epoch": 0.19810743145349508, "grad_norm": 0.267578125, "learning_rate": 4.706083520029594e-05, "loss": 2.7735, "num_input_tokens_seen": 2141716480, "step": 4085 }, { "epoch": 0.1983499130097417, "grad_norm": 0.263671875, "learning_rate": 4.705139729000246e-05, "loss": 2.7909, "num_input_tokens_seen": 2144337920, "step": 4090 }, { "epoch": 0.19859239456598832, "grad_norm": 0.2470703125, "learning_rate": 4.7041945200522566e-05, "loss": 2.7978, "num_input_tokens_seen": 2146959360, "step": 4095 }, { "epoch": 0.19883487612223497, "grad_norm": 0.251953125, "learning_rate": 4.703247893793401e-05, "loss": 2.7886, "num_input_tokens_seen": 2149580800, "step": 4100 }, { "epoch": 0.1990773576784816, "grad_norm": 0.255859375, "learning_rate": 4.702299850832367e-05, "loss": 2.7938, "num_input_tokens_seen": 2152202240, "step": 4105 }, { "epoch": 0.1993198392347282, "grad_norm": 0.25390625, "learning_rate": 4.701350391778751e-05, "loss": 2.799, "num_input_tokens_seen": 2154823680, "step": 4110 }, { "epoch": 0.19956232079097483, "grad_norm": 0.251953125, "learning_rate": 4.700399517243062e-05, "loss": 2.7861, "num_input_tokens_seen": 2157445120, "step": 4115 }, { "epoch": 0.19980480234722148, "grad_norm": 0.26953125, "learning_rate": 4.699447227836716e-05, "loss": 2.8033, "num_input_tokens_seen": 2160066560, "step": 4120 }, { "epoch": 0.2000472839034681, "grad_norm": 0.25390625, "learning_rate": 4.698493524172045e-05, "loss": 2.7837, "num_input_tokens_seen": 2162688000, "step": 4125 }, { "epoch": 0.20028976545971472, "grad_norm": 0.26171875, "learning_rate": 4.697538406862283e-05, "loss": 2.7977, "num_input_tokens_seen": 2165309440, "step": 4130 }, { "epoch": 0.20053224701596134, "grad_norm": 0.287109375, "learning_rate": 4.696581876521578e-05, "loss": 2.7901, "num_input_tokens_seen": 2167930880, "step": 4135 }, { "epoch": 0.200774728572208, "grad_norm": 0.271484375, "learning_rate": 4.6956239337649846e-05, "loss": 2.7915, "num_input_tokens_seen": 2170552320, "step": 4140 }, { "epoch": 0.2010172101284546, "grad_norm": 0.259765625, "learning_rate": 4.694664579208465e-05, "loss": 2.7782, "num_input_tokens_seen": 2173173760, "step": 4145 }, { "epoch": 0.20125969168470123, "grad_norm": 0.26171875, "learning_rate": 4.6937038134688923e-05, "loss": 2.7809, "num_input_tokens_seen": 2175795200, "step": 4150 }, { "epoch": 0.20150217324094785, "grad_norm": 0.251953125, "learning_rate": 4.692741637164043e-05, "loss": 2.8012, "num_input_tokens_seen": 2178416640, "step": 4155 }, { "epoch": 0.2017446547971945, "grad_norm": 0.251953125, "learning_rate": 4.6917780509126045e-05, "loss": 2.7837, "num_input_tokens_seen": 2181038080, "step": 4160 }, { "epoch": 0.20198713635344112, "grad_norm": 0.255859375, "learning_rate": 4.690813055334167e-05, "loss": 2.7862, "num_input_tokens_seen": 2183659520, "step": 4165 }, { "epoch": 0.20222961790968774, "grad_norm": 0.265625, "learning_rate": 4.689846651049228e-05, "loss": 2.7854, "num_input_tokens_seen": 2186280960, "step": 4170 }, { "epoch": 0.20247209946593436, "grad_norm": 0.2734375, "learning_rate": 4.6888788386791935e-05, "loss": 2.8001, "num_input_tokens_seen": 2188902400, "step": 4175 }, { "epoch": 0.202714581022181, "grad_norm": 0.255859375, "learning_rate": 4.6879096188463725e-05, "loss": 2.791, "num_input_tokens_seen": 2191523840, "step": 4180 }, { "epoch": 0.20295706257842763, "grad_norm": 0.265625, "learning_rate": 4.6869389921739795e-05, "loss": 2.7943, "num_input_tokens_seen": 2194145280, "step": 4185 }, { "epoch": 0.20319954413467425, "grad_norm": 0.2578125, "learning_rate": 4.685966959286132e-05, "loss": 2.7996, "num_input_tokens_seen": 2196766720, "step": 4190 }, { "epoch": 0.20344202569092087, "grad_norm": 0.255859375, "learning_rate": 4.684993520807855e-05, "loss": 2.7908, "num_input_tokens_seen": 2199388160, "step": 4195 }, { "epoch": 0.20368450724716752, "grad_norm": 0.259765625, "learning_rate": 4.6840186773650743e-05, "loss": 2.7942, "num_input_tokens_seen": 2202009600, "step": 4200 }, { "epoch": 0.20368450724716752, "eval_accuracy": 0.4539895782445856, "eval_loss": 2.7551848888397217, "eval_runtime": 6.2629, "eval_samples_per_second": 47.901, "eval_steps_per_second": 6.067, "num_input_tokens_seen": 2202009600, "step": 4200 }, { "epoch": 0.20392698880341414, "grad_norm": 0.25390625, "learning_rate": 4.683042429584621e-05, "loss": 2.8141, "num_input_tokens_seen": 2204631040, "step": 4205 }, { "epoch": 0.20416947035966077, "grad_norm": 0.2578125, "learning_rate": 4.6820647780942286e-05, "loss": 2.7836, "num_input_tokens_seen": 2207252480, "step": 4210 }, { "epoch": 0.20441195191590739, "grad_norm": 0.265625, "learning_rate": 4.681085723522533e-05, "loss": 2.813, "num_input_tokens_seen": 2209873920, "step": 4215 }, { "epoch": 0.20465443347215403, "grad_norm": 0.265625, "learning_rate": 4.680105266499072e-05, "loss": 2.7833, "num_input_tokens_seen": 2212495360, "step": 4220 }, { "epoch": 0.20489691502840066, "grad_norm": 0.265625, "learning_rate": 4.6791234076542864e-05, "loss": 2.7967, "num_input_tokens_seen": 2215116800, "step": 4225 }, { "epoch": 0.20513939658464728, "grad_norm": 0.251953125, "learning_rate": 4.678140147619516e-05, "loss": 2.7865, "num_input_tokens_seen": 2217738240, "step": 4230 }, { "epoch": 0.2053818781408939, "grad_norm": 0.24609375, "learning_rate": 4.6771554870270055e-05, "loss": 2.7994, "num_input_tokens_seen": 2220359680, "step": 4235 }, { "epoch": 0.20562435969714055, "grad_norm": 0.251953125, "learning_rate": 4.6761694265098965e-05, "loss": 2.7748, "num_input_tokens_seen": 2222981120, "step": 4240 }, { "epoch": 0.20586684125338717, "grad_norm": 0.24609375, "learning_rate": 4.675181966702232e-05, "loss": 2.785, "num_input_tokens_seen": 2225602560, "step": 4245 }, { "epoch": 0.2061093228096338, "grad_norm": 0.248046875, "learning_rate": 4.6741931082389545e-05, "loss": 2.7839, "num_input_tokens_seen": 2228224000, "step": 4250 }, { "epoch": 0.2063518043658804, "grad_norm": 0.255859375, "learning_rate": 4.673202851755907e-05, "loss": 2.7781, "num_input_tokens_seen": 2230845440, "step": 4255 }, { "epoch": 0.20659428592212706, "grad_norm": 0.259765625, "learning_rate": 4.6722111978898306e-05, "loss": 2.7961, "num_input_tokens_seen": 2233466880, "step": 4260 }, { "epoch": 0.20683676747837368, "grad_norm": 0.267578125, "learning_rate": 4.671218147278364e-05, "loss": 2.8002, "num_input_tokens_seen": 2236088320, "step": 4265 }, { "epoch": 0.2070792490346203, "grad_norm": 0.2578125, "learning_rate": 4.6702237005600456e-05, "loss": 2.7901, "num_input_tokens_seen": 2238709760, "step": 4270 }, { "epoch": 0.20732173059086692, "grad_norm": 0.259765625, "learning_rate": 4.6692278583743116e-05, "loss": 2.8018, "num_input_tokens_seen": 2241331200, "step": 4275 }, { "epoch": 0.20756421214711357, "grad_norm": 0.251953125, "learning_rate": 4.6682306213614935e-05, "loss": 2.8013, "num_input_tokens_seen": 2243952640, "step": 4280 }, { "epoch": 0.2078066937033602, "grad_norm": 0.265625, "learning_rate": 4.6672319901628214e-05, "loss": 2.778, "num_input_tokens_seen": 2246574080, "step": 4285 }, { "epoch": 0.2080491752596068, "grad_norm": 0.263671875, "learning_rate": 4.666231965420421e-05, "loss": 2.7975, "num_input_tokens_seen": 2249195520, "step": 4290 }, { "epoch": 0.20829165681585343, "grad_norm": 0.26171875, "learning_rate": 4.665230547777316e-05, "loss": 2.7774, "num_input_tokens_seen": 2251816960, "step": 4295 }, { "epoch": 0.20853413837210008, "grad_norm": 0.25, "learning_rate": 4.6642277378774224e-05, "loss": 2.7957, "num_input_tokens_seen": 2254438400, "step": 4300 }, { "epoch": 0.2087766199283467, "grad_norm": 0.25390625, "learning_rate": 4.6632235363655544e-05, "loss": 2.7776, "num_input_tokens_seen": 2257059840, "step": 4305 }, { "epoch": 0.20901910148459332, "grad_norm": 0.259765625, "learning_rate": 4.662217943887419e-05, "loss": 2.7951, "num_input_tokens_seen": 2259681280, "step": 4310 }, { "epoch": 0.20926158304083994, "grad_norm": 0.2470703125, "learning_rate": 4.661210961089619e-05, "loss": 2.7819, "num_input_tokens_seen": 2262302720, "step": 4315 }, { "epoch": 0.2095040645970866, "grad_norm": 0.251953125, "learning_rate": 4.660202588619651e-05, "loss": 2.792, "num_input_tokens_seen": 2264924160, "step": 4320 }, { "epoch": 0.2097465461533332, "grad_norm": 0.255859375, "learning_rate": 4.659192827125904e-05, "loss": 2.7868, "num_input_tokens_seen": 2267545600, "step": 4325 }, { "epoch": 0.20998902770957983, "grad_norm": 0.251953125, "learning_rate": 4.6581816772576616e-05, "loss": 2.7768, "num_input_tokens_seen": 2270167040, "step": 4330 }, { "epoch": 0.21023150926582646, "grad_norm": 0.255859375, "learning_rate": 4.657169139665098e-05, "loss": 2.7881, "num_input_tokens_seen": 2272788480, "step": 4335 }, { "epoch": 0.2104739908220731, "grad_norm": 0.25390625, "learning_rate": 4.656155214999283e-05, "loss": 2.7805, "num_input_tokens_seen": 2275409920, "step": 4340 }, { "epoch": 0.21071647237831972, "grad_norm": 0.259765625, "learning_rate": 4.655139903912176e-05, "loss": 2.7865, "num_input_tokens_seen": 2278031360, "step": 4345 }, { "epoch": 0.21095895393456635, "grad_norm": 0.251953125, "learning_rate": 4.654123207056629e-05, "loss": 2.7831, "num_input_tokens_seen": 2280652800, "step": 4350 }, { "epoch": 0.21120143549081297, "grad_norm": 0.25, "learning_rate": 4.653105125086382e-05, "loss": 2.7884, "num_input_tokens_seen": 2283274240, "step": 4355 }, { "epoch": 0.21144391704705962, "grad_norm": 0.25390625, "learning_rate": 4.652085658656071e-05, "loss": 2.7898, "num_input_tokens_seen": 2285895680, "step": 4360 }, { "epoch": 0.21168639860330624, "grad_norm": 0.2490234375, "learning_rate": 4.6510648084212185e-05, "loss": 2.7872, "num_input_tokens_seen": 2288517120, "step": 4365 }, { "epoch": 0.21192888015955286, "grad_norm": 0.25, "learning_rate": 4.650042575038236e-05, "loss": 2.7695, "num_input_tokens_seen": 2291138560, "step": 4370 }, { "epoch": 0.2121713617157995, "grad_norm": 0.26171875, "learning_rate": 4.6490189591644274e-05, "loss": 2.7835, "num_input_tokens_seen": 2293760000, "step": 4375 }, { "epoch": 0.21241384327204613, "grad_norm": 0.251953125, "learning_rate": 4.647993961457984e-05, "loss": 2.7704, "num_input_tokens_seen": 2296381440, "step": 4380 }, { "epoch": 0.21265632482829275, "grad_norm": 0.26171875, "learning_rate": 4.646967582577986e-05, "loss": 2.7883, "num_input_tokens_seen": 2299002880, "step": 4385 }, { "epoch": 0.21289880638453937, "grad_norm": 0.251953125, "learning_rate": 4.6459398231843996e-05, "loss": 2.782, "num_input_tokens_seen": 2301624320, "step": 4390 }, { "epoch": 0.21314128794078602, "grad_norm": 0.251953125, "learning_rate": 4.644910683938084e-05, "loss": 2.7908, "num_input_tokens_seen": 2304245760, "step": 4395 }, { "epoch": 0.21338376949703264, "grad_norm": 0.271484375, "learning_rate": 4.643880165500778e-05, "loss": 2.7901, "num_input_tokens_seen": 2306867200, "step": 4400 }, { "epoch": 0.21362625105327926, "grad_norm": 0.25390625, "learning_rate": 4.642848268535115e-05, "loss": 2.7943, "num_input_tokens_seen": 2309488640, "step": 4405 }, { "epoch": 0.21386873260952588, "grad_norm": 0.2490234375, "learning_rate": 4.641814993704609e-05, "loss": 2.7824, "num_input_tokens_seen": 2312110080, "step": 4410 }, { "epoch": 0.21411121416577253, "grad_norm": 0.2490234375, "learning_rate": 4.640780341673663e-05, "loss": 2.7923, "num_input_tokens_seen": 2314731520, "step": 4415 }, { "epoch": 0.21435369572201915, "grad_norm": 0.255859375, "learning_rate": 4.6397443131075647e-05, "loss": 2.79, "num_input_tokens_seen": 2317352960, "step": 4420 }, { "epoch": 0.21459617727826577, "grad_norm": 0.25390625, "learning_rate": 4.638706908672487e-05, "loss": 2.7971, "num_input_tokens_seen": 2319974400, "step": 4425 }, { "epoch": 0.2148386588345124, "grad_norm": 0.251953125, "learning_rate": 4.637668129035487e-05, "loss": 2.7933, "num_input_tokens_seen": 2322595840, "step": 4430 }, { "epoch": 0.21508114039075904, "grad_norm": 0.255859375, "learning_rate": 4.636627974864507e-05, "loss": 2.7892, "num_input_tokens_seen": 2325217280, "step": 4435 }, { "epoch": 0.21532362194700566, "grad_norm": 0.26953125, "learning_rate": 4.6355864468283726e-05, "loss": 2.792, "num_input_tokens_seen": 2327838720, "step": 4440 }, { "epoch": 0.21556610350325228, "grad_norm": 0.265625, "learning_rate": 4.634543545596792e-05, "loss": 2.7811, "num_input_tokens_seen": 2330460160, "step": 4445 }, { "epoch": 0.2158085850594989, "grad_norm": 0.248046875, "learning_rate": 4.633499271840359e-05, "loss": 2.7816, "num_input_tokens_seen": 2333081600, "step": 4450 }, { "epoch": 0.21605106661574555, "grad_norm": 0.251953125, "learning_rate": 4.632453626230546e-05, "loss": 2.8012, "num_input_tokens_seen": 2335703040, "step": 4455 }, { "epoch": 0.21629354817199217, "grad_norm": 0.259765625, "learning_rate": 4.631406609439711e-05, "loss": 2.7855, "num_input_tokens_seen": 2338324480, "step": 4460 }, { "epoch": 0.2165360297282388, "grad_norm": 0.25, "learning_rate": 4.630358222141092e-05, "loss": 2.766, "num_input_tokens_seen": 2340945920, "step": 4465 }, { "epoch": 0.21677851128448541, "grad_norm": 0.25390625, "learning_rate": 4.629308465008809e-05, "loss": 2.7734, "num_input_tokens_seen": 2343567360, "step": 4470 }, { "epoch": 0.21702099284073206, "grad_norm": 0.2578125, "learning_rate": 4.628257338717862e-05, "loss": 2.7903, "num_input_tokens_seen": 2346188800, "step": 4475 }, { "epoch": 0.21726347439697868, "grad_norm": 0.255859375, "learning_rate": 4.6272048439441315e-05, "loss": 2.7796, "num_input_tokens_seen": 2348810240, "step": 4480 }, { "epoch": 0.2175059559532253, "grad_norm": 0.251953125, "learning_rate": 4.62615098136438e-05, "loss": 2.7905, "num_input_tokens_seen": 2351431680, "step": 4485 }, { "epoch": 0.21774843750947193, "grad_norm": 0.251953125, "learning_rate": 4.625095751656245e-05, "loss": 2.7908, "num_input_tokens_seen": 2354053120, "step": 4490 }, { "epoch": 0.21799091906571857, "grad_norm": 0.255859375, "learning_rate": 4.624039155498247e-05, "loss": 2.7879, "num_input_tokens_seen": 2356674560, "step": 4495 }, { "epoch": 0.2182334006219652, "grad_norm": 0.251953125, "learning_rate": 4.622981193569784e-05, "loss": 2.793, "num_input_tokens_seen": 2359296000, "step": 4500 }, { "epoch": 0.2182334006219652, "eval_accuracy": 0.4542712913206318, "eval_loss": 2.7538249492645264, "eval_runtime": 5.8817, "eval_samples_per_second": 51.006, "eval_steps_per_second": 6.461, "num_input_tokens_seen": 2359296000, "step": 4500 }, { "epoch": 0.21847588217821182, "grad_norm": 0.255859375, "learning_rate": 4.621921866551133e-05, "loss": 2.7789, "num_input_tokens_seen": 2361917440, "step": 4505 }, { "epoch": 0.21871836373445844, "grad_norm": 0.255859375, "learning_rate": 4.620861175123446e-05, "loss": 2.7708, "num_input_tokens_seen": 2364538880, "step": 4510 }, { "epoch": 0.2189608452907051, "grad_norm": 0.2578125, "learning_rate": 4.6197991199687566e-05, "loss": 2.8001, "num_input_tokens_seen": 2367160320, "step": 4515 }, { "epoch": 0.2192033268469517, "grad_norm": 0.2451171875, "learning_rate": 4.6187357017699716e-05, "loss": 2.7862, "num_input_tokens_seen": 2369781760, "step": 4520 }, { "epoch": 0.21944580840319833, "grad_norm": 0.25390625, "learning_rate": 4.617670921210875e-05, "loss": 2.7765, "num_input_tokens_seen": 2372403200, "step": 4525 }, { "epoch": 0.21968828995944495, "grad_norm": 0.259765625, "learning_rate": 4.616604778976128e-05, "loss": 2.7928, "num_input_tokens_seen": 2375024640, "step": 4530 }, { "epoch": 0.2199307715156916, "grad_norm": 0.2734375, "learning_rate": 4.615537275751266e-05, "loss": 2.7938, "num_input_tokens_seen": 2377646080, "step": 4535 }, { "epoch": 0.22017325307193822, "grad_norm": 0.271484375, "learning_rate": 4.614468412222702e-05, "loss": 2.7835, "num_input_tokens_seen": 2380267520, "step": 4540 }, { "epoch": 0.22041573462818484, "grad_norm": 0.265625, "learning_rate": 4.61339818907772e-05, "loss": 2.7925, "num_input_tokens_seen": 2382888960, "step": 4545 }, { "epoch": 0.22065821618443146, "grad_norm": 0.27734375, "learning_rate": 4.612326607004481e-05, "loss": 2.797, "num_input_tokens_seen": 2385510400, "step": 4550 }, { "epoch": 0.2209006977406781, "grad_norm": 0.26953125, "learning_rate": 4.61125366669202e-05, "loss": 2.791, "num_input_tokens_seen": 2388131840, "step": 4555 }, { "epoch": 0.22114317929692473, "grad_norm": 0.283203125, "learning_rate": 4.610179368830243e-05, "loss": 2.7996, "num_input_tokens_seen": 2390753280, "step": 4560 }, { "epoch": 0.22138566085317135, "grad_norm": 0.259765625, "learning_rate": 4.60910371410993e-05, "loss": 2.7943, "num_input_tokens_seen": 2393374720, "step": 4565 }, { "epoch": 0.22162814240941797, "grad_norm": 0.259765625, "learning_rate": 4.608026703222735e-05, "loss": 2.79, "num_input_tokens_seen": 2395996160, "step": 4570 }, { "epoch": 0.22187062396566462, "grad_norm": 0.25, "learning_rate": 4.6069483368611815e-05, "loss": 2.7917, "num_input_tokens_seen": 2398617600, "step": 4575 }, { "epoch": 0.22211310552191124, "grad_norm": 0.2470703125, "learning_rate": 4.605868615718667e-05, "loss": 2.7813, "num_input_tokens_seen": 2401239040, "step": 4580 }, { "epoch": 0.22235558707815786, "grad_norm": 0.25390625, "learning_rate": 4.604787540489458e-05, "loss": 2.7822, "num_input_tokens_seen": 2403860480, "step": 4585 }, { "epoch": 0.22259806863440448, "grad_norm": 0.271484375, "learning_rate": 4.603705111868693e-05, "loss": 2.7883, "num_input_tokens_seen": 2406481920, "step": 4590 }, { "epoch": 0.22284055019065113, "grad_norm": 0.255859375, "learning_rate": 4.6026213305523794e-05, "loss": 2.7924, "num_input_tokens_seen": 2409103360, "step": 4595 }, { "epoch": 0.22308303174689775, "grad_norm": 0.2578125, "learning_rate": 4.601536197237397e-05, "loss": 2.7906, "num_input_tokens_seen": 2411724800, "step": 4600 }, { "epoch": 0.22332551330314437, "grad_norm": 0.265625, "learning_rate": 4.600449712621493e-05, "loss": 2.7683, "num_input_tokens_seen": 2414346240, "step": 4605 }, { "epoch": 0.223567994859391, "grad_norm": 0.263671875, "learning_rate": 4.5993618774032824e-05, "loss": 2.7971, "num_input_tokens_seen": 2416967680, "step": 4610 }, { "epoch": 0.22381047641563764, "grad_norm": 0.25, "learning_rate": 4.5982726922822515e-05, "loss": 2.8059, "num_input_tokens_seen": 2419589120, "step": 4615 }, { "epoch": 0.22405295797188426, "grad_norm": 0.251953125, "learning_rate": 4.5971821579587536e-05, "loss": 2.7974, "num_input_tokens_seen": 2422210560, "step": 4620 }, { "epoch": 0.22429543952813089, "grad_norm": 0.2578125, "learning_rate": 4.596090275134007e-05, "loss": 2.7842, "num_input_tokens_seen": 2424832000, "step": 4625 }, { "epoch": 0.2245379210843775, "grad_norm": 0.251953125, "learning_rate": 4.594997044510101e-05, "loss": 2.7906, "num_input_tokens_seen": 2427453440, "step": 4630 }, { "epoch": 0.22478040264062416, "grad_norm": 0.251953125, "learning_rate": 4.5939024667899886e-05, "loss": 2.7935, "num_input_tokens_seen": 2430074880, "step": 4635 }, { "epoch": 0.22502288419687078, "grad_norm": 0.244140625, "learning_rate": 4.592806542677491e-05, "loss": 2.7775, "num_input_tokens_seen": 2432696320, "step": 4640 }, { "epoch": 0.2252653657531174, "grad_norm": 0.26171875, "learning_rate": 4.5917092728772944e-05, "loss": 2.7844, "num_input_tokens_seen": 2435317760, "step": 4645 }, { "epoch": 0.22550784730936402, "grad_norm": 0.26953125, "learning_rate": 4.590610658094949e-05, "loss": 2.7916, "num_input_tokens_seen": 2437939200, "step": 4650 }, { "epoch": 0.22575032886561067, "grad_norm": 0.251953125, "learning_rate": 4.589510699036872e-05, "loss": 2.7813, "num_input_tokens_seen": 2440560640, "step": 4655 }, { "epoch": 0.2259928104218573, "grad_norm": 0.244140625, "learning_rate": 4.588409396410343e-05, "loss": 2.7931, "num_input_tokens_seen": 2443182080, "step": 4660 }, { "epoch": 0.2262352919781039, "grad_norm": 0.25390625, "learning_rate": 4.5873067509235065e-05, "loss": 2.7913, "num_input_tokens_seen": 2445803520, "step": 4665 }, { "epoch": 0.22647777353435056, "grad_norm": 0.251953125, "learning_rate": 4.5862027632853724e-05, "loss": 2.7831, "num_input_tokens_seen": 2448424960, "step": 4670 }, { "epoch": 0.22672025509059718, "grad_norm": 0.255859375, "learning_rate": 4.5850974342058095e-05, "loss": 2.7832, "num_input_tokens_seen": 2451046400, "step": 4675 }, { "epoch": 0.2269627366468438, "grad_norm": 0.251953125, "learning_rate": 4.5839907643955525e-05, "loss": 2.7763, "num_input_tokens_seen": 2453667840, "step": 4680 }, { "epoch": 0.22720521820309042, "grad_norm": 0.25390625, "learning_rate": 4.582882754566196e-05, "loss": 2.7882, "num_input_tokens_seen": 2456289280, "step": 4685 }, { "epoch": 0.22744769975933707, "grad_norm": 0.255859375, "learning_rate": 4.581773405430199e-05, "loss": 2.7681, "num_input_tokens_seen": 2458910720, "step": 4690 }, { "epoch": 0.2276901813155837, "grad_norm": 0.2578125, "learning_rate": 4.5806627177008775e-05, "loss": 2.7858, "num_input_tokens_seen": 2461532160, "step": 4695 }, { "epoch": 0.2279326628718303, "grad_norm": 0.2578125, "learning_rate": 4.579550692092412e-05, "loss": 2.7882, "num_input_tokens_seen": 2464153600, "step": 4700 }, { "epoch": 0.22817514442807693, "grad_norm": 0.2490234375, "learning_rate": 4.578437329319842e-05, "loss": 2.7782, "num_input_tokens_seen": 2466775040, "step": 4705 }, { "epoch": 0.22841762598432358, "grad_norm": 0.25390625, "learning_rate": 4.5773226300990666e-05, "loss": 2.7819, "num_input_tokens_seen": 2469396480, "step": 4710 }, { "epoch": 0.2286601075405702, "grad_norm": 0.2578125, "learning_rate": 4.576206595146845e-05, "loss": 2.7845, "num_input_tokens_seen": 2472017920, "step": 4715 }, { "epoch": 0.22890258909681682, "grad_norm": 0.251953125, "learning_rate": 4.5750892251807934e-05, "loss": 2.7834, "num_input_tokens_seen": 2474639360, "step": 4720 }, { "epoch": 0.22914507065306344, "grad_norm": 0.259765625, "learning_rate": 4.573970520919388e-05, "loss": 2.7876, "num_input_tokens_seen": 2477260800, "step": 4725 }, { "epoch": 0.2293875522093101, "grad_norm": 0.255859375, "learning_rate": 4.572850483081964e-05, "loss": 2.7961, "num_input_tokens_seen": 2479882240, "step": 4730 }, { "epoch": 0.2296300337655567, "grad_norm": 0.24609375, "learning_rate": 4.5717291123887106e-05, "loss": 2.7874, "num_input_tokens_seen": 2482503680, "step": 4735 }, { "epoch": 0.22987251532180333, "grad_norm": 0.255859375, "learning_rate": 4.570606409560677e-05, "loss": 2.7892, "num_input_tokens_seen": 2485125120, "step": 4740 }, { "epoch": 0.23011499687804995, "grad_norm": 0.259765625, "learning_rate": 4.569482375319769e-05, "loss": 2.7874, "num_input_tokens_seen": 2487746560, "step": 4745 }, { "epoch": 0.2303574784342966, "grad_norm": 0.2578125, "learning_rate": 4.5683570103887475e-05, "loss": 2.7831, "num_input_tokens_seen": 2490368000, "step": 4750 }, { "epoch": 0.23059995999054322, "grad_norm": 0.25390625, "learning_rate": 4.567230315491228e-05, "loss": 2.786, "num_input_tokens_seen": 2492989440, "step": 4755 }, { "epoch": 0.23084244154678984, "grad_norm": 0.25390625, "learning_rate": 4.566102291351683e-05, "loss": 2.7816, "num_input_tokens_seen": 2495610880, "step": 4760 }, { "epoch": 0.23108492310303647, "grad_norm": 0.2734375, "learning_rate": 4.5649729386954395e-05, "loss": 2.8002, "num_input_tokens_seen": 2498232320, "step": 4765 }, { "epoch": 0.23132740465928311, "grad_norm": 0.2470703125, "learning_rate": 4.563842258248677e-05, "loss": 2.7963, "num_input_tokens_seen": 2500853760, "step": 4770 }, { "epoch": 0.23156988621552974, "grad_norm": 0.25, "learning_rate": 4.562710250738433e-05, "loss": 2.786, "num_input_tokens_seen": 2503475200, "step": 4775 }, { "epoch": 0.23181236777177636, "grad_norm": 0.271484375, "learning_rate": 4.561576916892592e-05, "loss": 2.7741, "num_input_tokens_seen": 2506096640, "step": 4780 }, { "epoch": 0.23205484932802298, "grad_norm": 0.259765625, "learning_rate": 4.560442257439896e-05, "loss": 2.7907, "num_input_tokens_seen": 2508718080, "step": 4785 }, { "epoch": 0.23229733088426963, "grad_norm": 0.25390625, "learning_rate": 4.55930627310994e-05, "loss": 2.7808, "num_input_tokens_seen": 2511339520, "step": 4790 }, { "epoch": 0.23253981244051625, "grad_norm": 0.259765625, "learning_rate": 4.558168964633166e-05, "loss": 2.7782, "num_input_tokens_seen": 2513960960, "step": 4795 }, { "epoch": 0.23278229399676287, "grad_norm": 0.259765625, "learning_rate": 4.557030332740873e-05, "loss": 2.7958, "num_input_tokens_seen": 2516582400, "step": 4800 }, { "epoch": 0.23278229399676287, "eval_accuracy": 0.4543738804754926, "eval_loss": 2.7526228427886963, "eval_runtime": 5.8809, "eval_samples_per_second": 51.013, "eval_steps_per_second": 6.462, "num_input_tokens_seen": 2516582400, "step": 4800 }, { "epoch": 0.2330247755530095, "grad_norm": 0.2490234375, "learning_rate": 4.555890378165206e-05, "loss": 2.7984, "num_input_tokens_seen": 2519203840, "step": 4805 }, { "epoch": 0.23326725710925614, "grad_norm": 0.25390625, "learning_rate": 4.5547491016391645e-05, "loss": 2.7877, "num_input_tokens_seen": 2521825280, "step": 4810 }, { "epoch": 0.23350973866550276, "grad_norm": 0.2451171875, "learning_rate": 4.553606503896597e-05, "loss": 2.7959, "num_input_tokens_seen": 2524446720, "step": 4815 }, { "epoch": 0.23375222022174938, "grad_norm": 0.255859375, "learning_rate": 4.552462585672199e-05, "loss": 2.7823, "num_input_tokens_seen": 2527068160, "step": 4820 }, { "epoch": 0.233994701777996, "grad_norm": 0.259765625, "learning_rate": 4.551317347701519e-05, "loss": 2.7733, "num_input_tokens_seen": 2529689600, "step": 4825 }, { "epoch": 0.23423718333424265, "grad_norm": 0.255859375, "learning_rate": 4.550170790720951e-05, "loss": 2.7665, "num_input_tokens_seen": 2532311040, "step": 4830 }, { "epoch": 0.23447966489048927, "grad_norm": 0.25390625, "learning_rate": 4.549022915467739e-05, "loss": 2.7968, "num_input_tokens_seen": 2534932480, "step": 4835 }, { "epoch": 0.2347221464467359, "grad_norm": 0.25390625, "learning_rate": 4.5478737226799736e-05, "loss": 2.7886, "num_input_tokens_seen": 2537553920, "step": 4840 }, { "epoch": 0.2349646280029825, "grad_norm": 0.2578125, "learning_rate": 4.5467232130965935e-05, "loss": 2.7939, "num_input_tokens_seen": 2540175360, "step": 4845 }, { "epoch": 0.23520710955922916, "grad_norm": 0.259765625, "learning_rate": 4.5455713874573825e-05, "loss": 2.7814, "num_input_tokens_seen": 2542796800, "step": 4850 }, { "epoch": 0.23544959111547578, "grad_norm": 0.251953125, "learning_rate": 4.5444182465029726e-05, "loss": 2.7878, "num_input_tokens_seen": 2545418240, "step": 4855 }, { "epoch": 0.2356920726717224, "grad_norm": 0.26171875, "learning_rate": 4.54326379097484e-05, "loss": 2.7799, "num_input_tokens_seen": 2548039680, "step": 4860 }, { "epoch": 0.23593455422796902, "grad_norm": 0.26953125, "learning_rate": 4.542108021615308e-05, "loss": 2.8001, "num_input_tokens_seen": 2550661120, "step": 4865 }, { "epoch": 0.23617703578421567, "grad_norm": 0.26953125, "learning_rate": 4.540950939167542e-05, "loss": 2.7806, "num_input_tokens_seen": 2553282560, "step": 4870 }, { "epoch": 0.2364195173404623, "grad_norm": 0.2578125, "learning_rate": 4.539792544375554e-05, "loss": 2.7804, "num_input_tokens_seen": 2555904000, "step": 4875 }, { "epoch": 0.2366619988967089, "grad_norm": 0.263671875, "learning_rate": 4.538632837984199e-05, "loss": 2.7779, "num_input_tokens_seen": 2558525440, "step": 4880 }, { "epoch": 0.23690448045295553, "grad_norm": 0.251953125, "learning_rate": 4.537471820739176e-05, "loss": 2.7868, "num_input_tokens_seen": 2561146880, "step": 4885 }, { "epoch": 0.23714696200920218, "grad_norm": 0.24609375, "learning_rate": 4.536309493387025e-05, "loss": 2.7886, "num_input_tokens_seen": 2563768320, "step": 4890 }, { "epoch": 0.2373894435654488, "grad_norm": 0.2734375, "learning_rate": 4.5351458566751317e-05, "loss": 2.7962, "num_input_tokens_seen": 2566389760, "step": 4895 }, { "epoch": 0.23763192512169543, "grad_norm": 0.2578125, "learning_rate": 4.53398091135172e-05, "loss": 2.7919, "num_input_tokens_seen": 2569011200, "step": 4900 }, { "epoch": 0.23787440667794205, "grad_norm": 0.251953125, "learning_rate": 4.532814658165858e-05, "loss": 2.7918, "num_input_tokens_seen": 2571632640, "step": 4905 }, { "epoch": 0.2381168882341887, "grad_norm": 0.2578125, "learning_rate": 4.5316470978674536e-05, "loss": 2.7812, "num_input_tokens_seen": 2574254080, "step": 4910 }, { "epoch": 0.23835936979043532, "grad_norm": 0.2578125, "learning_rate": 4.530478231207255e-05, "loss": 2.791, "num_input_tokens_seen": 2576875520, "step": 4915 }, { "epoch": 0.23860185134668194, "grad_norm": 0.2451171875, "learning_rate": 4.5293080589368513e-05, "loss": 2.7838, "num_input_tokens_seen": 2579496960, "step": 4920 }, { "epoch": 0.23884433290292856, "grad_norm": 0.248046875, "learning_rate": 4.52813658180867e-05, "loss": 2.7861, "num_input_tokens_seen": 2582118400, "step": 4925 }, { "epoch": 0.2390868144591752, "grad_norm": 0.25, "learning_rate": 4.52696380057598e-05, "loss": 2.7665, "num_input_tokens_seen": 2584739840, "step": 4930 }, { "epoch": 0.23932929601542183, "grad_norm": 0.2470703125, "learning_rate": 4.5257897159928844e-05, "loss": 2.7923, "num_input_tokens_seen": 2587361280, "step": 4935 }, { "epoch": 0.23957177757166845, "grad_norm": 0.25390625, "learning_rate": 4.524614328814327e-05, "loss": 2.7835, "num_input_tokens_seen": 2589982720, "step": 4940 }, { "epoch": 0.23981425912791507, "grad_norm": 0.28515625, "learning_rate": 4.523437639796092e-05, "loss": 2.787, "num_input_tokens_seen": 2592604160, "step": 4945 }, { "epoch": 0.24005674068416172, "grad_norm": 0.265625, "learning_rate": 4.5222596496947954e-05, "loss": 2.7922, "num_input_tokens_seen": 2595225600, "step": 4950 }, { "epoch": 0.24029922224040834, "grad_norm": 0.255859375, "learning_rate": 4.521080359267893e-05, "loss": 2.7975, "num_input_tokens_seen": 2597847040, "step": 4955 }, { "epoch": 0.24054170379665496, "grad_norm": 0.259765625, "learning_rate": 4.519899769273676e-05, "loss": 2.7844, "num_input_tokens_seen": 2600468480, "step": 4960 }, { "epoch": 0.2407841853529016, "grad_norm": 0.248046875, "learning_rate": 4.518717880471271e-05, "loss": 2.7835, "num_input_tokens_seen": 2603089920, "step": 4965 }, { "epoch": 0.24102666690914823, "grad_norm": 0.25390625, "learning_rate": 4.517534693620639e-05, "loss": 2.79, "num_input_tokens_seen": 2605711360, "step": 4970 }, { "epoch": 0.24126914846539485, "grad_norm": 0.2578125, "learning_rate": 4.516350209482577e-05, "loss": 2.7865, "num_input_tokens_seen": 2608332800, "step": 4975 }, { "epoch": 0.24151163002164147, "grad_norm": 0.259765625, "learning_rate": 4.515164428818717e-05, "loss": 2.7822, "num_input_tokens_seen": 2610954240, "step": 4980 }, { "epoch": 0.24175411157788812, "grad_norm": 0.25390625, "learning_rate": 4.513977352391522e-05, "loss": 2.7986, "num_input_tokens_seen": 2613575680, "step": 4985 }, { "epoch": 0.24199659313413474, "grad_norm": 0.26171875, "learning_rate": 4.5127889809642896e-05, "loss": 2.7906, "num_input_tokens_seen": 2616197120, "step": 4990 }, { "epoch": 0.24223907469038136, "grad_norm": 0.26171875, "learning_rate": 4.511599315301151e-05, "loss": 2.7838, "num_input_tokens_seen": 2618818560, "step": 4995 }, { "epoch": 0.24248155624662798, "grad_norm": 0.2578125, "learning_rate": 4.5104083561670686e-05, "loss": 2.7927, "num_input_tokens_seen": 2621440000, "step": 5000 }, { "epoch": 0.24272403780287463, "grad_norm": 0.263671875, "learning_rate": 4.5092161043278344e-05, "loss": 2.7774, "num_input_tokens_seen": 2624061440, "step": 5005 }, { "epoch": 0.24296651935912125, "grad_norm": 0.26171875, "learning_rate": 4.508022560550077e-05, "loss": 2.7792, "num_input_tokens_seen": 2626682880, "step": 5010 }, { "epoch": 0.24320900091536787, "grad_norm": 0.26171875, "learning_rate": 4.506827725601251e-05, "loss": 2.7848, "num_input_tokens_seen": 2629304320, "step": 5015 }, { "epoch": 0.2434514824716145, "grad_norm": 0.25, "learning_rate": 4.5056316002496424e-05, "loss": 2.7974, "num_input_tokens_seen": 2631925760, "step": 5020 }, { "epoch": 0.24369396402786114, "grad_norm": 0.271484375, "learning_rate": 4.504434185264368e-05, "loss": 2.8029, "num_input_tokens_seen": 2634547200, "step": 5025 }, { "epoch": 0.24393644558410776, "grad_norm": 0.25390625, "learning_rate": 4.5032354814153724e-05, "loss": 2.7913, "num_input_tokens_seen": 2637168640, "step": 5030 }, { "epoch": 0.24417892714035438, "grad_norm": 0.25390625, "learning_rate": 4.502035489473429e-05, "loss": 2.794, "num_input_tokens_seen": 2639790080, "step": 5035 }, { "epoch": 0.244421408696601, "grad_norm": 0.265625, "learning_rate": 4.500834210210143e-05, "loss": 2.7875, "num_input_tokens_seen": 2642411520, "step": 5040 }, { "epoch": 0.24466389025284765, "grad_norm": 0.2470703125, "learning_rate": 4.499631644397941e-05, "loss": 2.788, "num_input_tokens_seen": 2645032960, "step": 5045 }, { "epoch": 0.24490637180909428, "grad_norm": 0.259765625, "learning_rate": 4.498427792810084e-05, "loss": 2.7795, "num_input_tokens_seen": 2647654400, "step": 5050 }, { "epoch": 0.2451488533653409, "grad_norm": 0.2421875, "learning_rate": 4.497222656220652e-05, "loss": 2.7842, "num_input_tokens_seen": 2650275840, "step": 5055 }, { "epoch": 0.24539133492158752, "grad_norm": 0.251953125, "learning_rate": 4.496016235404559e-05, "loss": 2.8141, "num_input_tokens_seen": 2652897280, "step": 5060 }, { "epoch": 0.24563381647783417, "grad_norm": 0.263671875, "learning_rate": 4.4948085311375386e-05, "loss": 2.7749, "num_input_tokens_seen": 2655518720, "step": 5065 }, { "epoch": 0.2458762980340808, "grad_norm": 0.251953125, "learning_rate": 4.4935995441961535e-05, "loss": 2.7758, "num_input_tokens_seen": 2658140160, "step": 5070 }, { "epoch": 0.2461187795903274, "grad_norm": 0.2578125, "learning_rate": 4.4923892753577895e-05, "loss": 2.7871, "num_input_tokens_seen": 2660761600, "step": 5075 }, { "epoch": 0.24636126114657403, "grad_norm": 0.25390625, "learning_rate": 4.491177725400657e-05, "loss": 2.7752, "num_input_tokens_seen": 2663383040, "step": 5080 }, { "epoch": 0.24660374270282068, "grad_norm": 0.255859375, "learning_rate": 4.489964895103791e-05, "loss": 2.7848, "num_input_tokens_seen": 2666004480, "step": 5085 }, { "epoch": 0.2468462242590673, "grad_norm": 0.255859375, "learning_rate": 4.488750785247048e-05, "loss": 2.7906, "num_input_tokens_seen": 2668625920, "step": 5090 }, { "epoch": 0.24708870581531392, "grad_norm": 0.259765625, "learning_rate": 4.487535396611108e-05, "loss": 2.7912, "num_input_tokens_seen": 2671247360, "step": 5095 }, { "epoch": 0.24733118737156054, "grad_norm": 0.2578125, "learning_rate": 4.486318729977474e-05, "loss": 2.78, "num_input_tokens_seen": 2673868800, "step": 5100 }, { "epoch": 0.24733118737156054, "eval_accuracy": 0.45466047874938936, "eval_loss": 2.751500129699707, "eval_runtime": 5.8324, "eval_samples_per_second": 51.437, "eval_steps_per_second": 6.515, "num_input_tokens_seen": 2673868800, "step": 5100 }, { "epoch": 0.2475736689278072, "grad_norm": 0.25, "learning_rate": 4.48510078612847e-05, "loss": 2.7894, "num_input_tokens_seen": 2676490240, "step": 5105 }, { "epoch": 0.2478161504840538, "grad_norm": 0.24609375, "learning_rate": 4.4838815658472425e-05, "loss": 2.7768, "num_input_tokens_seen": 2679111680, "step": 5110 }, { "epoch": 0.24805863204030043, "grad_norm": 0.24609375, "learning_rate": 4.482661069917756e-05, "loss": 2.7889, "num_input_tokens_seen": 2681733120, "step": 5115 }, { "epoch": 0.24830111359654705, "grad_norm": 0.259765625, "learning_rate": 4.481439299124799e-05, "loss": 2.7859, "num_input_tokens_seen": 2684354560, "step": 5120 }, { "epoch": 0.2485435951527937, "grad_norm": 0.25390625, "learning_rate": 4.4802162542539774e-05, "loss": 2.7794, "num_input_tokens_seen": 2686976000, "step": 5125 }, { "epoch": 0.24878607670904032, "grad_norm": 0.2734375, "learning_rate": 4.478991936091714e-05, "loss": 2.7955, "num_input_tokens_seen": 2689597440, "step": 5130 }, { "epoch": 0.24902855826528694, "grad_norm": 0.26171875, "learning_rate": 4.477766345425257e-05, "loss": 2.7731, "num_input_tokens_seen": 2692218880, "step": 5135 }, { "epoch": 0.24927103982153356, "grad_norm": 0.251953125, "learning_rate": 4.476539483042666e-05, "loss": 2.7784, "num_input_tokens_seen": 2694840320, "step": 5140 }, { "epoch": 0.2495135213777802, "grad_norm": 0.2578125, "learning_rate": 4.475311349732823e-05, "loss": 2.7798, "num_input_tokens_seen": 2697461760, "step": 5145 }, { "epoch": 0.24975600293402683, "grad_norm": 0.275390625, "learning_rate": 4.4740819462854245e-05, "loss": 2.7754, "num_input_tokens_seen": 2700083200, "step": 5150 }, { "epoch": 0.24999848449027345, "grad_norm": 0.25390625, "learning_rate": 4.4728512734909844e-05, "loss": 2.7828, "num_input_tokens_seen": 2702704640, "step": 5155 }, { "epoch": 0.2502409660465201, "grad_norm": 0.255859375, "learning_rate": 4.471619332140833e-05, "loss": 2.7775, "num_input_tokens_seen": 2705326080, "step": 5160 }, { "epoch": 0.2504834476027667, "grad_norm": 0.265625, "learning_rate": 4.470386123027117e-05, "loss": 2.7935, "num_input_tokens_seen": 2707947520, "step": 5165 }, { "epoch": 0.2507259291590133, "grad_norm": 0.251953125, "learning_rate": 4.469151646942797e-05, "loss": 2.7943, "num_input_tokens_seen": 2710568960, "step": 5170 }, { "epoch": 0.25096841071526, "grad_norm": 0.255859375, "learning_rate": 4.467915904681649e-05, "loss": 2.7892, "num_input_tokens_seen": 2713190400, "step": 5175 }, { "epoch": 0.2512108922715066, "grad_norm": 0.25390625, "learning_rate": 4.466678897038263e-05, "loss": 2.7888, "num_input_tokens_seen": 2715811840, "step": 5180 }, { "epoch": 0.25145337382775323, "grad_norm": 0.26953125, "learning_rate": 4.465440624808043e-05, "loss": 2.7868, "num_input_tokens_seen": 2718433280, "step": 5185 }, { "epoch": 0.25169585538399986, "grad_norm": 0.2470703125, "learning_rate": 4.4642010887872056e-05, "loss": 2.7908, "num_input_tokens_seen": 2721054720, "step": 5190 }, { "epoch": 0.2519383369402465, "grad_norm": 0.255859375, "learning_rate": 4.4629602897727804e-05, "loss": 2.7858, "num_input_tokens_seen": 2723676160, "step": 5195 }, { "epoch": 0.2521808184964931, "grad_norm": 0.255859375, "learning_rate": 4.461718228562608e-05, "loss": 2.7764, "num_input_tokens_seen": 2726297600, "step": 5200 }, { "epoch": 0.2524233000527397, "grad_norm": 0.2578125, "learning_rate": 4.460474905955342e-05, "loss": 2.7833, "num_input_tokens_seen": 2728919040, "step": 5205 }, { "epoch": 0.25266578160898634, "grad_norm": 0.255859375, "learning_rate": 4.4592303227504476e-05, "loss": 2.7957, "num_input_tokens_seen": 2731540480, "step": 5210 }, { "epoch": 0.252908263165233, "grad_norm": 0.25390625, "learning_rate": 4.457984479748197e-05, "loss": 2.778, "num_input_tokens_seen": 2734161920, "step": 5215 }, { "epoch": 0.25315074472147964, "grad_norm": 0.263671875, "learning_rate": 4.456737377749678e-05, "loss": 2.8094, "num_input_tokens_seen": 2736783360, "step": 5220 }, { "epoch": 0.25339322627772626, "grad_norm": 0.244140625, "learning_rate": 4.455489017556784e-05, "loss": 2.7743, "num_input_tokens_seen": 2739404800, "step": 5225 }, { "epoch": 0.2536357078339729, "grad_norm": 0.251953125, "learning_rate": 4.454239399972218e-05, "loss": 2.7785, "num_input_tokens_seen": 2742026240, "step": 5230 }, { "epoch": 0.2538781893902195, "grad_norm": 0.255859375, "learning_rate": 4.452988525799492e-05, "loss": 2.779, "num_input_tokens_seen": 2744647680, "step": 5235 }, { "epoch": 0.2541206709464661, "grad_norm": 0.255859375, "learning_rate": 4.451736395842926e-05, "loss": 2.7809, "num_input_tokens_seen": 2747269120, "step": 5240 }, { "epoch": 0.25436315250271274, "grad_norm": 0.255859375, "learning_rate": 4.450483010907648e-05, "loss": 2.7811, "num_input_tokens_seen": 2749890560, "step": 5245 }, { "epoch": 0.2546056340589594, "grad_norm": 0.25, "learning_rate": 4.449228371799591e-05, "loss": 2.7724, "num_input_tokens_seen": 2752512000, "step": 5250 }, { "epoch": 0.25484811561520604, "grad_norm": 0.251953125, "learning_rate": 4.447972479325497e-05, "loss": 2.7836, "num_input_tokens_seen": 2755133440, "step": 5255 }, { "epoch": 0.25509059717145266, "grad_norm": 0.2578125, "learning_rate": 4.446715334292913e-05, "loss": 2.7972, "num_input_tokens_seen": 2757754880, "step": 5260 }, { "epoch": 0.2553330787276993, "grad_norm": 0.2578125, "learning_rate": 4.445456937510188e-05, "loss": 2.7827, "num_input_tokens_seen": 2760376320, "step": 5265 }, { "epoch": 0.2555755602839459, "grad_norm": 0.25, "learning_rate": 4.4441972897864833e-05, "loss": 2.7784, "num_input_tokens_seen": 2762997760, "step": 5270 }, { "epoch": 0.2558180418401925, "grad_norm": 0.2451171875, "learning_rate": 4.442936391931759e-05, "loss": 2.7896, "num_input_tokens_seen": 2765619200, "step": 5275 }, { "epoch": 0.25606052339643914, "grad_norm": 0.26171875, "learning_rate": 4.4416742447567784e-05, "loss": 2.7843, "num_input_tokens_seen": 2768240640, "step": 5280 }, { "epoch": 0.25630300495268576, "grad_norm": 0.255859375, "learning_rate": 4.440410849073112e-05, "loss": 2.785, "num_input_tokens_seen": 2770862080, "step": 5285 }, { "epoch": 0.25654548650893244, "grad_norm": 0.2578125, "learning_rate": 4.43914620569313e-05, "loss": 2.7713, "num_input_tokens_seen": 2773483520, "step": 5290 }, { "epoch": 0.25678796806517906, "grad_norm": 0.26953125, "learning_rate": 4.4378803154300066e-05, "loss": 2.7791, "num_input_tokens_seen": 2776104960, "step": 5295 }, { "epoch": 0.2570304496214257, "grad_norm": 0.26171875, "learning_rate": 4.4366131790977174e-05, "loss": 2.8016, "num_input_tokens_seen": 2778726400, "step": 5300 }, { "epoch": 0.2572729311776723, "grad_norm": 0.26171875, "learning_rate": 4.435344797511038e-05, "loss": 2.7764, "num_input_tokens_seen": 2781347840, "step": 5305 }, { "epoch": 0.2575154127339189, "grad_norm": 0.2470703125, "learning_rate": 4.4340751714855475e-05, "loss": 2.7858, "num_input_tokens_seen": 2783969280, "step": 5310 }, { "epoch": 0.25775789429016555, "grad_norm": 0.259765625, "learning_rate": 4.432804301837621e-05, "loss": 2.7711, "num_input_tokens_seen": 2786590720, "step": 5315 }, { "epoch": 0.25800037584641217, "grad_norm": 0.263671875, "learning_rate": 4.4315321893844375e-05, "loss": 2.7839, "num_input_tokens_seen": 2789212160, "step": 5320 }, { "epoch": 0.2582428574026588, "grad_norm": 0.25, "learning_rate": 4.430258834943972e-05, "loss": 2.7914, "num_input_tokens_seen": 2791833600, "step": 5325 }, { "epoch": 0.25848533895890546, "grad_norm": 0.25390625, "learning_rate": 4.4289842393350004e-05, "loss": 2.7876, "num_input_tokens_seen": 2794455040, "step": 5330 }, { "epoch": 0.2587278205151521, "grad_norm": 0.2490234375, "learning_rate": 4.4277084033770946e-05, "loss": 2.7891, "num_input_tokens_seen": 2797076480, "step": 5335 }, { "epoch": 0.2589703020713987, "grad_norm": 0.2490234375, "learning_rate": 4.426431327890626e-05, "loss": 2.7691, "num_input_tokens_seen": 2799697920, "step": 5340 }, { "epoch": 0.2592127836276453, "grad_norm": 0.263671875, "learning_rate": 4.425153013696762e-05, "loss": 2.7807, "num_input_tokens_seen": 2802319360, "step": 5345 }, { "epoch": 0.25945526518389195, "grad_norm": 0.248046875, "learning_rate": 4.423873461617467e-05, "loss": 2.7843, "num_input_tokens_seen": 2804940800, "step": 5350 }, { "epoch": 0.25969774674013857, "grad_norm": 0.255859375, "learning_rate": 4.4225926724755e-05, "loss": 2.7738, "num_input_tokens_seen": 2807562240, "step": 5355 }, { "epoch": 0.2599402282963852, "grad_norm": 0.255859375, "learning_rate": 4.421310647094417e-05, "loss": 2.7872, "num_input_tokens_seen": 2810183680, "step": 5360 }, { "epoch": 0.2601827098526318, "grad_norm": 0.255859375, "learning_rate": 4.420027386298568e-05, "loss": 2.7801, "num_input_tokens_seen": 2812805120, "step": 5365 }, { "epoch": 0.2604251914088785, "grad_norm": 0.25390625, "learning_rate": 4.418742890913097e-05, "loss": 2.7878, "num_input_tokens_seen": 2815426560, "step": 5370 }, { "epoch": 0.2606676729651251, "grad_norm": 0.255859375, "learning_rate": 4.417457161763945e-05, "loss": 2.7867, "num_input_tokens_seen": 2818048000, "step": 5375 }, { "epoch": 0.26091015452137173, "grad_norm": 0.255859375, "learning_rate": 4.4161701996778415e-05, "loss": 2.7785, "num_input_tokens_seen": 2820669440, "step": 5380 }, { "epoch": 0.26115263607761835, "grad_norm": 0.25390625, "learning_rate": 4.4148820054823125e-05, "loss": 2.7838, "num_input_tokens_seen": 2823290880, "step": 5385 }, { "epoch": 0.26139511763386497, "grad_norm": 0.2578125, "learning_rate": 4.4135925800056744e-05, "loss": 2.79, "num_input_tokens_seen": 2825912320, "step": 5390 }, { "epoch": 0.2616375991901116, "grad_norm": 0.267578125, "learning_rate": 4.412301924077036e-05, "loss": 2.7882, "num_input_tokens_seen": 2828533760, "step": 5395 }, { "epoch": 0.2618800807463582, "grad_norm": 0.265625, "learning_rate": 4.411010038526297e-05, "loss": 2.7937, "num_input_tokens_seen": 2831155200, "step": 5400 }, { "epoch": 0.2618800807463582, "eval_accuracy": 0.45476958150138413, "eval_loss": 2.750626564025879, "eval_runtime": 5.809, "eval_samples_per_second": 51.644, "eval_steps_per_second": 6.542, "num_input_tokens_seen": 2831155200, "step": 5400 }, { "epoch": 0.26212256230260483, "grad_norm": 0.259765625, "learning_rate": 4.409716924184148e-05, "loss": 2.7784, "num_input_tokens_seen": 2833776640, "step": 5405 }, { "epoch": 0.2623650438588515, "grad_norm": 0.25390625, "learning_rate": 4.4084225818820694e-05, "loss": 2.7894, "num_input_tokens_seen": 2836398080, "step": 5410 }, { "epoch": 0.26260752541509813, "grad_norm": 0.251953125, "learning_rate": 4.407127012452332e-05, "loss": 2.778, "num_input_tokens_seen": 2839019520, "step": 5415 }, { "epoch": 0.26285000697134475, "grad_norm": 0.2578125, "learning_rate": 4.4058302167279944e-05, "loss": 2.7857, "num_input_tokens_seen": 2841640960, "step": 5420 }, { "epoch": 0.2630924885275914, "grad_norm": 0.26171875, "learning_rate": 4.404532195542905e-05, "loss": 2.7766, "num_input_tokens_seen": 2844262400, "step": 5425 }, { "epoch": 0.263334970083838, "grad_norm": 0.2470703125, "learning_rate": 4.4032329497316985e-05, "loss": 2.7816, "num_input_tokens_seen": 2846883840, "step": 5430 }, { "epoch": 0.2635774516400846, "grad_norm": 0.248046875, "learning_rate": 4.401932480129799e-05, "loss": 2.8017, "num_input_tokens_seen": 2849505280, "step": 5435 }, { "epoch": 0.26381993319633124, "grad_norm": 0.248046875, "learning_rate": 4.400630787573416e-05, "loss": 2.7787, "num_input_tokens_seen": 2852126720, "step": 5440 }, { "epoch": 0.26406241475257786, "grad_norm": 0.251953125, "learning_rate": 4.399327872899547e-05, "loss": 2.7934, "num_input_tokens_seen": 2854748160, "step": 5445 }, { "epoch": 0.26430489630882453, "grad_norm": 0.26171875, "learning_rate": 4.398023736945973e-05, "loss": 2.7813, "num_input_tokens_seen": 2857369600, "step": 5450 }, { "epoch": 0.26454737786507115, "grad_norm": 0.255859375, "learning_rate": 4.396718380551263e-05, "loss": 2.7795, "num_input_tokens_seen": 2859991040, "step": 5455 }, { "epoch": 0.2647898594213178, "grad_norm": 0.259765625, "learning_rate": 4.3954118045547675e-05, "loss": 2.7947, "num_input_tokens_seen": 2862612480, "step": 5460 }, { "epoch": 0.2650323409775644, "grad_norm": 0.259765625, "learning_rate": 4.394104009796623e-05, "loss": 2.7809, "num_input_tokens_seen": 2865233920, "step": 5465 }, { "epoch": 0.265274822533811, "grad_norm": 0.251953125, "learning_rate": 4.392794997117753e-05, "loss": 2.7705, "num_input_tokens_seen": 2867855360, "step": 5470 }, { "epoch": 0.26551730409005764, "grad_norm": 0.259765625, "learning_rate": 4.391484767359858e-05, "loss": 2.788, "num_input_tokens_seen": 2870476800, "step": 5475 }, { "epoch": 0.26575978564630426, "grad_norm": 0.25, "learning_rate": 4.390173321365423e-05, "loss": 2.7913, "num_input_tokens_seen": 2873098240, "step": 5480 }, { "epoch": 0.2660022672025509, "grad_norm": 0.26171875, "learning_rate": 4.388860659977719e-05, "loss": 2.7851, "num_input_tokens_seen": 2875719680, "step": 5485 }, { "epoch": 0.26624474875879756, "grad_norm": 0.259765625, "learning_rate": 4.387546784040794e-05, "loss": 2.7839, "num_input_tokens_seen": 2878341120, "step": 5490 }, { "epoch": 0.2664872303150442, "grad_norm": 0.2470703125, "learning_rate": 4.3862316943994766e-05, "loss": 2.7604, "num_input_tokens_seen": 2880962560, "step": 5495 }, { "epoch": 0.2667297118712908, "grad_norm": 0.2578125, "learning_rate": 4.3849153918993815e-05, "loss": 2.7719, "num_input_tokens_seen": 2883584000, "step": 5500 }, { "epoch": 0.2669721934275374, "grad_norm": 0.25390625, "learning_rate": 4.383597877386896e-05, "loss": 2.7873, "num_input_tokens_seen": 2886205440, "step": 5505 }, { "epoch": 0.26721467498378404, "grad_norm": 0.25, "learning_rate": 4.382279151709192e-05, "loss": 2.7806, "num_input_tokens_seen": 2888826880, "step": 5510 }, { "epoch": 0.26745715654003066, "grad_norm": 0.263671875, "learning_rate": 4.380959215714218e-05, "loss": 2.7814, "num_input_tokens_seen": 2891448320, "step": 5515 }, { "epoch": 0.2676996380962773, "grad_norm": 0.26953125, "learning_rate": 4.3796380702507014e-05, "loss": 2.7763, "num_input_tokens_seen": 2894069760, "step": 5520 }, { "epoch": 0.2679421196525239, "grad_norm": 0.267578125, "learning_rate": 4.3783157161681466e-05, "loss": 2.7891, "num_input_tokens_seen": 2896691200, "step": 5525 }, { "epoch": 0.2681846012087706, "grad_norm": 0.26171875, "learning_rate": 4.376992154316835e-05, "loss": 2.7868, "num_input_tokens_seen": 2899312640, "step": 5530 }, { "epoch": 0.2684270827650172, "grad_norm": 0.26171875, "learning_rate": 4.375667385547826e-05, "loss": 2.791, "num_input_tokens_seen": 2901934080, "step": 5535 }, { "epoch": 0.2686695643212638, "grad_norm": 0.24609375, "learning_rate": 4.3743414107129546e-05, "loss": 2.7801, "num_input_tokens_seen": 2904555520, "step": 5540 }, { "epoch": 0.26891204587751044, "grad_norm": 0.251953125, "learning_rate": 4.3730142306648294e-05, "loss": 2.7961, "num_input_tokens_seen": 2907176960, "step": 5545 }, { "epoch": 0.26915452743375706, "grad_norm": 0.2578125, "learning_rate": 4.3716858462568365e-05, "loss": 2.7786, "num_input_tokens_seen": 2909798400, "step": 5550 }, { "epoch": 0.2693970089900037, "grad_norm": 0.255859375, "learning_rate": 4.370356258343135e-05, "loss": 2.7981, "num_input_tokens_seen": 2912419840, "step": 5555 }, { "epoch": 0.2696394905462503, "grad_norm": 0.255859375, "learning_rate": 4.369025467778659e-05, "loss": 2.7813, "num_input_tokens_seen": 2915041280, "step": 5560 }, { "epoch": 0.269881972102497, "grad_norm": 0.251953125, "learning_rate": 4.3676934754191145e-05, "loss": 2.7738, "num_input_tokens_seen": 2917662720, "step": 5565 }, { "epoch": 0.2701244536587436, "grad_norm": 0.251953125, "learning_rate": 4.3663602821209805e-05, "loss": 2.7794, "num_input_tokens_seen": 2920284160, "step": 5570 }, { "epoch": 0.2703669352149902, "grad_norm": 0.259765625, "learning_rate": 4.36502588874151e-05, "loss": 2.7904, "num_input_tokens_seen": 2922905600, "step": 5575 }, { "epoch": 0.27060941677123684, "grad_norm": 0.2578125, "learning_rate": 4.363690296138725e-05, "loss": 2.7908, "num_input_tokens_seen": 2925527040, "step": 5580 }, { "epoch": 0.27085189832748346, "grad_norm": 0.26953125, "learning_rate": 4.36235350517142e-05, "loss": 2.7871, "num_input_tokens_seen": 2928148480, "step": 5585 }, { "epoch": 0.2710943798837301, "grad_norm": 0.26171875, "learning_rate": 4.3610155166991605e-05, "loss": 2.778, "num_input_tokens_seen": 2930769920, "step": 5590 }, { "epoch": 0.2713368614399767, "grad_norm": 0.2578125, "learning_rate": 4.359676331582282e-05, "loss": 2.7791, "num_input_tokens_seen": 2933391360, "step": 5595 }, { "epoch": 0.2715793429962233, "grad_norm": 0.2470703125, "learning_rate": 4.358335950681888e-05, "loss": 2.7795, "num_input_tokens_seen": 2936012800, "step": 5600 }, { "epoch": 0.27182182455247, "grad_norm": 0.259765625, "learning_rate": 4.356994374859852e-05, "loss": 2.7851, "num_input_tokens_seen": 2938634240, "step": 5605 }, { "epoch": 0.2720643061087166, "grad_norm": 0.255859375, "learning_rate": 4.355651604978815e-05, "loss": 2.7762, "num_input_tokens_seen": 2941255680, "step": 5610 }, { "epoch": 0.27230678766496325, "grad_norm": 0.25390625, "learning_rate": 4.3543076419021874e-05, "loss": 2.776, "num_input_tokens_seen": 2943877120, "step": 5615 }, { "epoch": 0.27254926922120987, "grad_norm": 0.25390625, "learning_rate": 4.3529624864941456e-05, "loss": 2.7986, "num_input_tokens_seen": 2946498560, "step": 5620 }, { "epoch": 0.2727917507774565, "grad_norm": 0.2490234375, "learning_rate": 4.351616139619632e-05, "loss": 2.7924, "num_input_tokens_seen": 2949120000, "step": 5625 }, { "epoch": 0.2730342323337031, "grad_norm": 0.25390625, "learning_rate": 4.350268602144358e-05, "loss": 2.7873, "num_input_tokens_seen": 2951741440, "step": 5630 }, { "epoch": 0.27327671388994973, "grad_norm": 0.26171875, "learning_rate": 4.3489198749347976e-05, "loss": 2.7869, "num_input_tokens_seen": 2954362880, "step": 5635 }, { "epoch": 0.27351919544619635, "grad_norm": 0.255859375, "learning_rate": 4.347569958858191e-05, "loss": 2.786, "num_input_tokens_seen": 2956984320, "step": 5640 }, { "epoch": 0.273761677002443, "grad_norm": 0.26171875, "learning_rate": 4.3462188547825415e-05, "loss": 2.779, "num_input_tokens_seen": 2959605760, "step": 5645 }, { "epoch": 0.27400415855868965, "grad_norm": 0.25, "learning_rate": 4.3448665635766187e-05, "loss": 2.7765, "num_input_tokens_seen": 2962227200, "step": 5650 }, { "epoch": 0.27424664011493627, "grad_norm": 0.251953125, "learning_rate": 4.343513086109955e-05, "loss": 2.7761, "num_input_tokens_seen": 2964848640, "step": 5655 }, { "epoch": 0.2744891216711829, "grad_norm": 0.267578125, "learning_rate": 4.342158423252843e-05, "loss": 2.79, "num_input_tokens_seen": 2967470080, "step": 5660 }, { "epoch": 0.2747316032274295, "grad_norm": 0.2578125, "learning_rate": 4.3408025758763403e-05, "loss": 2.7829, "num_input_tokens_seen": 2970091520, "step": 5665 }, { "epoch": 0.27497408478367613, "grad_norm": 0.259765625, "learning_rate": 4.339445544852265e-05, "loss": 2.7945, "num_input_tokens_seen": 2972712960, "step": 5670 }, { "epoch": 0.27521656633992275, "grad_norm": 0.25, "learning_rate": 4.338087331053198e-05, "loss": 2.7835, "num_input_tokens_seen": 2975334400, "step": 5675 }, { "epoch": 0.2754590478961694, "grad_norm": 0.240234375, "learning_rate": 4.336727935352477e-05, "loss": 2.7977, "num_input_tokens_seen": 2977955840, "step": 5680 }, { "epoch": 0.27570152945241605, "grad_norm": 0.251953125, "learning_rate": 4.335367358624204e-05, "loss": 2.7855, "num_input_tokens_seen": 2980577280, "step": 5685 }, { "epoch": 0.27594401100866267, "grad_norm": 0.259765625, "learning_rate": 4.334005601743236e-05, "loss": 2.7781, "num_input_tokens_seen": 2983198720, "step": 5690 }, { "epoch": 0.2761864925649093, "grad_norm": 0.259765625, "learning_rate": 4.3326426655851936e-05, "loss": 2.7889, "num_input_tokens_seen": 2985820160, "step": 5695 }, { "epoch": 0.2764289741211559, "grad_norm": 0.24609375, "learning_rate": 4.331278551026453e-05, "loss": 2.7717, "num_input_tokens_seen": 2988441600, "step": 5700 }, { "epoch": 0.2764289741211559, "eval_accuracy": 0.4547614395049666, "eval_loss": 2.749791145324707, "eval_runtime": 5.8606, "eval_samples_per_second": 51.189, "eval_steps_per_second": 6.484, "num_input_tokens_seen": 2988441600, "step": 5700 }, { "epoch": 0.27667145567740253, "grad_norm": 0.267578125, "learning_rate": 4.329913258944146e-05, "loss": 2.7882, "num_input_tokens_seen": 2991063040, "step": 5705 }, { "epoch": 0.27691393723364915, "grad_norm": 0.251953125, "learning_rate": 4.328546790216167e-05, "loss": 2.7819, "num_input_tokens_seen": 2993684480, "step": 5710 }, { "epoch": 0.2771564187898958, "grad_norm": 0.2578125, "learning_rate": 4.327179145721161e-05, "loss": 2.7812, "num_input_tokens_seen": 2996305920, "step": 5715 }, { "epoch": 0.2773989003461424, "grad_norm": 0.255859375, "learning_rate": 4.325810326338535e-05, "loss": 2.8015, "num_input_tokens_seen": 2998927360, "step": 5720 }, { "epoch": 0.2776413819023891, "grad_norm": 0.255859375, "learning_rate": 4.3244403329484456e-05, "loss": 2.79, "num_input_tokens_seen": 3001548800, "step": 5725 }, { "epoch": 0.2778838634586357, "grad_norm": 0.263671875, "learning_rate": 4.323069166431809e-05, "loss": 2.7898, "num_input_tokens_seen": 3004170240, "step": 5730 }, { "epoch": 0.2781263450148823, "grad_norm": 0.255859375, "learning_rate": 4.321696827670293e-05, "loss": 2.7953, "num_input_tokens_seen": 3006791680, "step": 5735 }, { "epoch": 0.27836882657112894, "grad_norm": 0.248046875, "learning_rate": 4.320323317546321e-05, "loss": 2.7848, "num_input_tokens_seen": 3009413120, "step": 5740 }, { "epoch": 0.27861130812737556, "grad_norm": 0.259765625, "learning_rate": 4.3189486369430674e-05, "loss": 2.7783, "num_input_tokens_seen": 3012034560, "step": 5745 }, { "epoch": 0.2788537896836222, "grad_norm": 0.255859375, "learning_rate": 4.317572786744461e-05, "loss": 2.7932, "num_input_tokens_seen": 3014656000, "step": 5750 }, { "epoch": 0.2790962712398688, "grad_norm": 0.259765625, "learning_rate": 4.3161957678351825e-05, "loss": 2.7695, "num_input_tokens_seen": 3017277440, "step": 5755 }, { "epoch": 0.2793387527961154, "grad_norm": 0.26171875, "learning_rate": 4.314817581100662e-05, "loss": 2.7823, "num_input_tokens_seen": 3019898880, "step": 5760 }, { "epoch": 0.2795812343523621, "grad_norm": 0.251953125, "learning_rate": 4.313438227427084e-05, "loss": 2.7727, "num_input_tokens_seen": 3022520320, "step": 5765 }, { "epoch": 0.2798237159086087, "grad_norm": 0.248046875, "learning_rate": 4.312057707701382e-05, "loss": 2.7871, "num_input_tokens_seen": 3025141760, "step": 5770 }, { "epoch": 0.28006619746485534, "grad_norm": 0.25390625, "learning_rate": 4.3106760228112365e-05, "loss": 2.7925, "num_input_tokens_seen": 3027763200, "step": 5775 }, { "epoch": 0.28030867902110196, "grad_norm": 0.2578125, "learning_rate": 4.309293173645082e-05, "loss": 2.787, "num_input_tokens_seen": 3030384640, "step": 5780 }, { "epoch": 0.2805511605773486, "grad_norm": 0.259765625, "learning_rate": 4.307909161092096e-05, "loss": 2.7925, "num_input_tokens_seen": 3033006080, "step": 5785 }, { "epoch": 0.2807936421335952, "grad_norm": 0.263671875, "learning_rate": 4.30652398604221e-05, "loss": 2.7864, "num_input_tokens_seen": 3035627520, "step": 5790 }, { "epoch": 0.2810361236898418, "grad_norm": 0.2578125, "learning_rate": 4.3051376493860995e-05, "loss": 2.7921, "num_input_tokens_seen": 3038248960, "step": 5795 }, { "epoch": 0.28127860524608844, "grad_norm": 0.255859375, "learning_rate": 4.303750152015188e-05, "loss": 2.7858, "num_input_tokens_seen": 3040870400, "step": 5800 }, { "epoch": 0.2815210868023351, "grad_norm": 0.265625, "learning_rate": 4.302361494821644e-05, "loss": 2.7797, "num_input_tokens_seen": 3043491840, "step": 5805 }, { "epoch": 0.28176356835858174, "grad_norm": 0.25, "learning_rate": 4.3009716786983834e-05, "loss": 2.775, "num_input_tokens_seen": 3046113280, "step": 5810 }, { "epoch": 0.28200604991482836, "grad_norm": 0.2490234375, "learning_rate": 4.299580704539067e-05, "loss": 2.7769, "num_input_tokens_seen": 3048734720, "step": 5815 }, { "epoch": 0.282248531471075, "grad_norm": 0.26171875, "learning_rate": 4.298188573238098e-05, "loss": 2.781, "num_input_tokens_seen": 3051356160, "step": 5820 }, { "epoch": 0.2824910130273216, "grad_norm": 0.265625, "learning_rate": 4.2967952856906276e-05, "loss": 2.7854, "num_input_tokens_seen": 3053977600, "step": 5825 }, { "epoch": 0.2827334945835682, "grad_norm": 0.2470703125, "learning_rate": 4.295400842792549e-05, "loss": 2.7853, "num_input_tokens_seen": 3056599040, "step": 5830 }, { "epoch": 0.28297597613981484, "grad_norm": 0.255859375, "learning_rate": 4.2940052454404954e-05, "loss": 2.7991, "num_input_tokens_seen": 3059220480, "step": 5835 }, { "epoch": 0.2832184576960615, "grad_norm": 0.255859375, "learning_rate": 4.2926084945318454e-05, "loss": 2.7957, "num_input_tokens_seen": 3061841920, "step": 5840 }, { "epoch": 0.28346093925230814, "grad_norm": 0.255859375, "learning_rate": 4.2912105909647194e-05, "loss": 2.7762, "num_input_tokens_seen": 3064463360, "step": 5845 }, { "epoch": 0.28370342080855476, "grad_norm": 0.26171875, "learning_rate": 4.289811535637978e-05, "loss": 2.7674, "num_input_tokens_seen": 3067084800, "step": 5850 }, { "epoch": 0.2839459023648014, "grad_norm": 0.244140625, "learning_rate": 4.288411329451222e-05, "loss": 2.7743, "num_input_tokens_seen": 3069706240, "step": 5855 }, { "epoch": 0.284188383921048, "grad_norm": 0.251953125, "learning_rate": 4.287009973304792e-05, "loss": 2.7811, "num_input_tokens_seen": 3072327680, "step": 5860 }, { "epoch": 0.2844308654772946, "grad_norm": 0.255859375, "learning_rate": 4.2856074680997705e-05, "loss": 2.7976, "num_input_tokens_seen": 3074949120, "step": 5865 }, { "epoch": 0.28467334703354125, "grad_norm": 0.25390625, "learning_rate": 4.284203814737976e-05, "loss": 2.7823, "num_input_tokens_seen": 3077570560, "step": 5870 }, { "epoch": 0.28491582858978787, "grad_norm": 0.251953125, "learning_rate": 4.282799014121967e-05, "loss": 2.7877, "num_input_tokens_seen": 3080192000, "step": 5875 }, { "epoch": 0.28515831014603454, "grad_norm": 0.25390625, "learning_rate": 4.281393067155038e-05, "loss": 2.7856, "num_input_tokens_seen": 3082813440, "step": 5880 }, { "epoch": 0.28540079170228116, "grad_norm": 0.251953125, "learning_rate": 4.279985974741223e-05, "loss": 2.7772, "num_input_tokens_seen": 3085434880, "step": 5885 }, { "epoch": 0.2856432732585278, "grad_norm": 0.25390625, "learning_rate": 4.2785777377852904e-05, "loss": 2.7833, "num_input_tokens_seen": 3088056320, "step": 5890 }, { "epoch": 0.2858857548147744, "grad_norm": 0.265625, "learning_rate": 4.277168357192746e-05, "loss": 2.7821, "num_input_tokens_seen": 3090677760, "step": 5895 }, { "epoch": 0.286128236371021, "grad_norm": 0.26171875, "learning_rate": 4.27575783386983e-05, "loss": 2.7749, "num_input_tokens_seen": 3093299200, "step": 5900 }, { "epoch": 0.28637071792726765, "grad_norm": 0.2578125, "learning_rate": 4.2743461687235176e-05, "loss": 2.7761, "num_input_tokens_seen": 3095920640, "step": 5905 }, { "epoch": 0.28661319948351427, "grad_norm": 0.275390625, "learning_rate": 4.272933362661518e-05, "loss": 2.7836, "num_input_tokens_seen": 3098542080, "step": 5910 }, { "epoch": 0.2868556810397609, "grad_norm": 0.265625, "learning_rate": 4.2715194165922754e-05, "loss": 2.795, "num_input_tokens_seen": 3101163520, "step": 5915 }, { "epoch": 0.28709816259600757, "grad_norm": 0.25390625, "learning_rate": 4.2701043314249644e-05, "loss": 2.7857, "num_input_tokens_seen": 3103784960, "step": 5920 }, { "epoch": 0.2873406441522542, "grad_norm": 0.25390625, "learning_rate": 4.268688108069496e-05, "loss": 2.7809, "num_input_tokens_seen": 3106406400, "step": 5925 }, { "epoch": 0.2875831257085008, "grad_norm": 0.25390625, "learning_rate": 4.267270747436508e-05, "loss": 2.7832, "num_input_tokens_seen": 3109027840, "step": 5930 }, { "epoch": 0.28782560726474743, "grad_norm": 0.26171875, "learning_rate": 4.2658522504373736e-05, "loss": 2.7991, "num_input_tokens_seen": 3111649280, "step": 5935 }, { "epoch": 0.28806808882099405, "grad_norm": 0.251953125, "learning_rate": 4.264432617984195e-05, "loss": 2.7908, "num_input_tokens_seen": 3114270720, "step": 5940 }, { "epoch": 0.28831057037724067, "grad_norm": 0.25, "learning_rate": 4.263011850989805e-05, "loss": 2.7745, "num_input_tokens_seen": 3116892160, "step": 5945 }, { "epoch": 0.2885530519334873, "grad_norm": 0.25, "learning_rate": 4.2615899503677656e-05, "loss": 2.7903, "num_input_tokens_seen": 3119513600, "step": 5950 }, { "epoch": 0.2887955334897339, "grad_norm": 0.25, "learning_rate": 4.260166917032368e-05, "loss": 2.7734, "num_input_tokens_seen": 3122135040, "step": 5955 }, { "epoch": 0.2890380150459806, "grad_norm": 0.265625, "learning_rate": 4.25874275189863e-05, "loss": 2.7876, "num_input_tokens_seen": 3124756480, "step": 5960 }, { "epoch": 0.2892804966022272, "grad_norm": 0.267578125, "learning_rate": 4.2573174558823004e-05, "loss": 2.7744, "num_input_tokens_seen": 3127377920, "step": 5965 }, { "epoch": 0.28952297815847383, "grad_norm": 0.265625, "learning_rate": 4.2558910298998535e-05, "loss": 2.7804, "num_input_tokens_seen": 3129999360, "step": 5970 }, { "epoch": 0.28976545971472045, "grad_norm": 0.25, "learning_rate": 4.2544634748684886e-05, "loss": 2.7791, "num_input_tokens_seen": 3132620800, "step": 5975 }, { "epoch": 0.2900079412709671, "grad_norm": 0.26171875, "learning_rate": 4.253034791706134e-05, "loss": 2.7848, "num_input_tokens_seen": 3135242240, "step": 5980 }, { "epoch": 0.2902504228272137, "grad_norm": 0.2578125, "learning_rate": 4.251604981331441e-05, "loss": 2.7896, "num_input_tokens_seen": 3137863680, "step": 5985 }, { "epoch": 0.2904929043834603, "grad_norm": 0.2470703125, "learning_rate": 4.2501740446637874e-05, "loss": 2.7807, "num_input_tokens_seen": 3140485120, "step": 5990 }, { "epoch": 0.29073538593970694, "grad_norm": 0.25390625, "learning_rate": 4.248741982623274e-05, "loss": 2.7796, "num_input_tokens_seen": 3143106560, "step": 5995 }, { "epoch": 0.2909778674959536, "grad_norm": 0.25390625, "learning_rate": 4.2473087961307256e-05, "loss": 2.7832, "num_input_tokens_seen": 3145728000, "step": 6000 }, { "epoch": 0.2909778674959536, "eval_accuracy": 0.4548298322748738, "eval_loss": 2.7490339279174805, "eval_runtime": 5.8334, "eval_samples_per_second": 51.428, "eval_steps_per_second": 6.514, "num_input_tokens_seen": 3145728000, "step": 6000 }, { "epoch": 0.29122034905220023, "grad_norm": 0.2470703125, "learning_rate": 4.24587448610769e-05, "loss": 2.7829, "num_input_tokens_seen": 3148349440, "step": 6005 }, { "epoch": 0.29146283060844685, "grad_norm": 0.259765625, "learning_rate": 4.244439053476438e-05, "loss": 2.7767, "num_input_tokens_seen": 3150970880, "step": 6010 }, { "epoch": 0.2917053121646935, "grad_norm": 0.259765625, "learning_rate": 4.24300249915996e-05, "loss": 2.8001, "num_input_tokens_seen": 3153592320, "step": 6015 }, { "epoch": 0.2919477937209401, "grad_norm": 0.26171875, "learning_rate": 4.2415648240819726e-05, "loss": 2.7843, "num_input_tokens_seen": 3156213760, "step": 6020 }, { "epoch": 0.2921902752771867, "grad_norm": 0.251953125, "learning_rate": 4.2401260291669074e-05, "loss": 2.7845, "num_input_tokens_seen": 3158835200, "step": 6025 }, { "epoch": 0.29243275683343334, "grad_norm": 0.255859375, "learning_rate": 4.238686115339919e-05, "loss": 2.7737, "num_input_tokens_seen": 3161456640, "step": 6030 }, { "epoch": 0.29267523838967996, "grad_norm": 0.2578125, "learning_rate": 4.2372450835268816e-05, "loss": 2.7867, "num_input_tokens_seen": 3164078080, "step": 6035 }, { "epoch": 0.29291771994592664, "grad_norm": 0.265625, "learning_rate": 4.235802934654388e-05, "loss": 2.796, "num_input_tokens_seen": 3166699520, "step": 6040 }, { "epoch": 0.29316020150217326, "grad_norm": 0.25390625, "learning_rate": 4.234359669649747e-05, "loss": 2.781, "num_input_tokens_seen": 3169320960, "step": 6045 }, { "epoch": 0.2934026830584199, "grad_norm": 0.25, "learning_rate": 4.2329152894409895e-05, "loss": 2.7849, "num_input_tokens_seen": 3171942400, "step": 6050 }, { "epoch": 0.2936451646146665, "grad_norm": 0.251953125, "learning_rate": 4.231469794956859e-05, "loss": 2.7789, "num_input_tokens_seen": 3174563840, "step": 6055 }, { "epoch": 0.2938876461709131, "grad_norm": 0.263671875, "learning_rate": 4.230023187126818e-05, "loss": 2.7813, "num_input_tokens_seen": 3177185280, "step": 6060 }, { "epoch": 0.29413012772715974, "grad_norm": 0.2578125, "learning_rate": 4.2285754668810454e-05, "loss": 2.7695, "num_input_tokens_seen": 3179806720, "step": 6065 }, { "epoch": 0.29437260928340636, "grad_norm": 0.25390625, "learning_rate": 4.227126635150434e-05, "loss": 2.7834, "num_input_tokens_seen": 3182428160, "step": 6070 }, { "epoch": 0.294615090839653, "grad_norm": 0.2578125, "learning_rate": 4.2256766928665895e-05, "loss": 2.7758, "num_input_tokens_seen": 3185049600, "step": 6075 }, { "epoch": 0.29485757239589966, "grad_norm": 0.265625, "learning_rate": 4.224225640961837e-05, "loss": 2.7878, "num_input_tokens_seen": 3187671040, "step": 6080 }, { "epoch": 0.2951000539521463, "grad_norm": 0.251953125, "learning_rate": 4.222773480369211e-05, "loss": 2.7845, "num_input_tokens_seen": 3190292480, "step": 6085 }, { "epoch": 0.2953425355083929, "grad_norm": 0.24609375, "learning_rate": 4.221320212022458e-05, "loss": 2.7655, "num_input_tokens_seen": 3192913920, "step": 6090 }, { "epoch": 0.2955850170646395, "grad_norm": 0.255859375, "learning_rate": 4.219865836856042e-05, "loss": 2.7847, "num_input_tokens_seen": 3195535360, "step": 6095 }, { "epoch": 0.29582749862088614, "grad_norm": 0.259765625, "learning_rate": 4.218410355805132e-05, "loss": 2.7824, "num_input_tokens_seen": 3198156800, "step": 6100 }, { "epoch": 0.29606998017713276, "grad_norm": 0.263671875, "learning_rate": 4.216953769805613e-05, "loss": 2.757, "num_input_tokens_seen": 3200778240, "step": 6105 }, { "epoch": 0.2963124617333794, "grad_norm": 0.255859375, "learning_rate": 4.21549607979408e-05, "loss": 2.7787, "num_input_tokens_seen": 3203399680, "step": 6110 }, { "epoch": 0.296554943289626, "grad_norm": 0.25390625, "learning_rate": 4.2140372867078345e-05, "loss": 2.798, "num_input_tokens_seen": 3206021120, "step": 6115 }, { "epoch": 0.2967974248458727, "grad_norm": 0.24609375, "learning_rate": 4.212577391484891e-05, "loss": 2.7794, "num_input_tokens_seen": 3208642560, "step": 6120 }, { "epoch": 0.2970399064021193, "grad_norm": 0.25390625, "learning_rate": 4.21111639506397e-05, "loss": 2.7806, "num_input_tokens_seen": 3211264000, "step": 6125 }, { "epoch": 0.2972823879583659, "grad_norm": 0.259765625, "learning_rate": 4.209654298384503e-05, "loss": 2.777, "num_input_tokens_seen": 3213885440, "step": 6130 }, { "epoch": 0.29752486951461254, "grad_norm": 0.26171875, "learning_rate": 4.208191102386627e-05, "loss": 2.7611, "num_input_tokens_seen": 3216506880, "step": 6135 }, { "epoch": 0.29776735107085917, "grad_norm": 0.26953125, "learning_rate": 4.2067268080111856e-05, "loss": 2.7974, "num_input_tokens_seen": 3219128320, "step": 6140 }, { "epoch": 0.2980098326271058, "grad_norm": 0.255859375, "learning_rate": 4.205261416199729e-05, "loss": 2.7778, "num_input_tokens_seen": 3221749760, "step": 6145 }, { "epoch": 0.2982523141833524, "grad_norm": 0.25, "learning_rate": 4.203794927894514e-05, "loss": 2.7706, "num_input_tokens_seen": 3224371200, "step": 6150 }, { "epoch": 0.2984947957395991, "grad_norm": 0.2578125, "learning_rate": 4.2023273440385014e-05, "loss": 2.7762, "num_input_tokens_seen": 3226992640, "step": 6155 }, { "epoch": 0.2987372772958457, "grad_norm": 0.255859375, "learning_rate": 4.2008586655753566e-05, "loss": 2.777, "num_input_tokens_seen": 3229614080, "step": 6160 }, { "epoch": 0.2989797588520923, "grad_norm": 0.2578125, "learning_rate": 4.199388893449449e-05, "loss": 2.786, "num_input_tokens_seen": 3232235520, "step": 6165 }, { "epoch": 0.29922224040833895, "grad_norm": 0.26953125, "learning_rate": 4.1979180286058515e-05, "loss": 2.7808, "num_input_tokens_seen": 3234856960, "step": 6170 }, { "epoch": 0.29946472196458557, "grad_norm": 0.25390625, "learning_rate": 4.196446071990341e-05, "loss": 2.7693, "num_input_tokens_seen": 3237478400, "step": 6175 }, { "epoch": 0.2997072035208322, "grad_norm": 0.26171875, "learning_rate": 4.194973024549392e-05, "loss": 2.7775, "num_input_tokens_seen": 3240099840, "step": 6180 }, { "epoch": 0.2999496850770788, "grad_norm": 0.255859375, "learning_rate": 4.193498887230184e-05, "loss": 2.7776, "num_input_tokens_seen": 3242721280, "step": 6185 }, { "epoch": 0.30019216663332543, "grad_norm": 0.259765625, "learning_rate": 4.1920236609805986e-05, "loss": 2.7848, "num_input_tokens_seen": 3245342720, "step": 6190 }, { "epoch": 0.3004346481895721, "grad_norm": 0.263671875, "learning_rate": 4.190547346749213e-05, "loss": 2.7975, "num_input_tokens_seen": 3247964160, "step": 6195 }, { "epoch": 0.3006771297458187, "grad_norm": 0.265625, "learning_rate": 4.1890699454853067e-05, "loss": 2.7854, "num_input_tokens_seen": 3250585600, "step": 6200 }, { "epoch": 0.30091961130206535, "grad_norm": 0.251953125, "learning_rate": 4.18759145813886e-05, "loss": 2.7638, "num_input_tokens_seen": 3253207040, "step": 6205 }, { "epoch": 0.30116209285831197, "grad_norm": 0.2734375, "learning_rate": 4.186111885660547e-05, "loss": 2.7825, "num_input_tokens_seen": 3255828480, "step": 6210 }, { "epoch": 0.3014045744145586, "grad_norm": 0.26171875, "learning_rate": 4.184631229001744e-05, "loss": 2.7761, "num_input_tokens_seen": 3258449920, "step": 6215 }, { "epoch": 0.3016470559708052, "grad_norm": 0.251953125, "learning_rate": 4.1831494891145215e-05, "loss": 2.7822, "num_input_tokens_seen": 3261071360, "step": 6220 }, { "epoch": 0.30188953752705183, "grad_norm": 0.251953125, "learning_rate": 4.1816666669516474e-05, "loss": 2.7881, "num_input_tokens_seen": 3263692800, "step": 6225 }, { "epoch": 0.30213201908329845, "grad_norm": 0.25, "learning_rate": 4.180182763466586e-05, "loss": 2.7894, "num_input_tokens_seen": 3266314240, "step": 6230 }, { "epoch": 0.30237450063954513, "grad_norm": 0.24609375, "learning_rate": 4.178697779613497e-05, "loss": 2.776, "num_input_tokens_seen": 3268935680, "step": 6235 }, { "epoch": 0.30261698219579175, "grad_norm": 0.255859375, "learning_rate": 4.177211716347234e-05, "loss": 2.7779, "num_input_tokens_seen": 3271557120, "step": 6240 }, { "epoch": 0.30285946375203837, "grad_norm": 0.25, "learning_rate": 4.1757245746233435e-05, "loss": 2.7768, "num_input_tokens_seen": 3274178560, "step": 6245 }, { "epoch": 0.303101945308285, "grad_norm": 0.2490234375, "learning_rate": 4.174236355398069e-05, "loss": 2.7701, "num_input_tokens_seen": 3276800000, "step": 6250 }, { "epoch": 0.3033444268645316, "grad_norm": 0.251953125, "learning_rate": 4.172747059628345e-05, "loss": 2.7897, "num_input_tokens_seen": 3279421440, "step": 6255 }, { "epoch": 0.30358690842077823, "grad_norm": 0.251953125, "learning_rate": 4.171256688271795e-05, "loss": 2.7801, "num_input_tokens_seen": 3282042880, "step": 6260 }, { "epoch": 0.30382938997702486, "grad_norm": 0.2578125, "learning_rate": 4.1697652422867403e-05, "loss": 2.7879, "num_input_tokens_seen": 3284664320, "step": 6265 }, { "epoch": 0.3040718715332715, "grad_norm": 0.259765625, "learning_rate": 4.1682727226321885e-05, "loss": 2.7765, "num_input_tokens_seen": 3287285760, "step": 6270 }, { "epoch": 0.30431435308951815, "grad_norm": 0.2490234375, "learning_rate": 4.166779130267839e-05, "loss": 2.7829, "num_input_tokens_seen": 3289907200, "step": 6275 }, { "epoch": 0.3045568346457648, "grad_norm": 0.2490234375, "learning_rate": 4.1652844661540825e-05, "loss": 2.7875, "num_input_tokens_seen": 3292528640, "step": 6280 }, { "epoch": 0.3047993162020114, "grad_norm": 0.263671875, "learning_rate": 4.163788731251995e-05, "loss": 2.7917, "num_input_tokens_seen": 3295150080, "step": 6285 }, { "epoch": 0.305041797758258, "grad_norm": 0.251953125, "learning_rate": 4.1622919265233456e-05, "loss": 2.7794, "num_input_tokens_seen": 3297771520, "step": 6290 }, { "epoch": 0.30528427931450464, "grad_norm": 0.255859375, "learning_rate": 4.1607940529305876e-05, "loss": 2.7816, "num_input_tokens_seen": 3300392960, "step": 6295 }, { "epoch": 0.30552676087075126, "grad_norm": 0.259765625, "learning_rate": 4.159295111436864e-05, "loss": 2.768, "num_input_tokens_seen": 3303014400, "step": 6300 }, { "epoch": 0.30552676087075126, "eval_accuracy": 0.45503989578244586, "eval_loss": 2.7481751441955566, "eval_runtime": 5.8537, "eval_samples_per_second": 51.25, "eval_steps_per_second": 6.492, "num_input_tokens_seen": 3303014400, "step": 6300 }, { "epoch": 0.3057692424269979, "grad_norm": 0.248046875, "learning_rate": 4.1577951030060034e-05, "loss": 2.7833, "num_input_tokens_seen": 3305635840, "step": 6305 }, { "epoch": 0.3060117239832445, "grad_norm": 0.251953125, "learning_rate": 4.1562940286025195e-05, "loss": 2.7898, "num_input_tokens_seen": 3308257280, "step": 6310 }, { "epoch": 0.3062542055394912, "grad_norm": 0.263671875, "learning_rate": 4.1547918891916144e-05, "loss": 2.7797, "num_input_tokens_seen": 3310878720, "step": 6315 }, { "epoch": 0.3064966870957378, "grad_norm": 0.255859375, "learning_rate": 4.153288685739172e-05, "loss": 2.78, "num_input_tokens_seen": 3313500160, "step": 6320 }, { "epoch": 0.3067391686519844, "grad_norm": 0.251953125, "learning_rate": 4.1517844192117614e-05, "loss": 2.7905, "num_input_tokens_seen": 3316121600, "step": 6325 }, { "epoch": 0.30698165020823104, "grad_norm": 0.255859375, "learning_rate": 4.150279090576636e-05, "loss": 2.7758, "num_input_tokens_seen": 3318743040, "step": 6330 }, { "epoch": 0.30722413176447766, "grad_norm": 0.255859375, "learning_rate": 4.148772700801731e-05, "loss": 2.7731, "num_input_tokens_seen": 3321364480, "step": 6335 }, { "epoch": 0.3074666133207243, "grad_norm": 0.265625, "learning_rate": 4.1472652508556646e-05, "loss": 2.7888, "num_input_tokens_seen": 3323985920, "step": 6340 }, { "epoch": 0.3077090948769709, "grad_norm": 0.251953125, "learning_rate": 4.145756741707737e-05, "loss": 2.7849, "num_input_tokens_seen": 3326607360, "step": 6345 }, { "epoch": 0.3079515764332175, "grad_norm": 0.259765625, "learning_rate": 4.144247174327929e-05, "loss": 2.7932, "num_input_tokens_seen": 3329228800, "step": 6350 }, { "epoch": 0.3081940579894642, "grad_norm": 0.2421875, "learning_rate": 4.1427365496869005e-05, "loss": 2.7765, "num_input_tokens_seen": 3331850240, "step": 6355 }, { "epoch": 0.3084365395457108, "grad_norm": 0.25390625, "learning_rate": 4.141224868755994e-05, "loss": 2.7782, "num_input_tokens_seen": 3334471680, "step": 6360 }, { "epoch": 0.30867902110195744, "grad_norm": 0.255859375, "learning_rate": 4.13971213250723e-05, "loss": 2.7946, "num_input_tokens_seen": 3337093120, "step": 6365 }, { "epoch": 0.30892150265820406, "grad_norm": 0.259765625, "learning_rate": 4.1381983419133056e-05, "loss": 2.7732, "num_input_tokens_seen": 3339714560, "step": 6370 }, { "epoch": 0.3091639842144507, "grad_norm": 0.259765625, "learning_rate": 4.1366834979476e-05, "loss": 2.7807, "num_input_tokens_seen": 3342336000, "step": 6375 }, { "epoch": 0.3094064657706973, "grad_norm": 0.25390625, "learning_rate": 4.135167601584166e-05, "loss": 2.7756, "num_input_tokens_seen": 3344957440, "step": 6380 }, { "epoch": 0.3096489473269439, "grad_norm": 0.259765625, "learning_rate": 4.133650653797734e-05, "loss": 2.7711, "num_input_tokens_seen": 3347578880, "step": 6385 }, { "epoch": 0.30989142888319055, "grad_norm": 0.248046875, "learning_rate": 4.132132655563711e-05, "loss": 2.7809, "num_input_tokens_seen": 3350200320, "step": 6390 }, { "epoch": 0.3101339104394372, "grad_norm": 0.255859375, "learning_rate": 4.1306136078581814e-05, "loss": 2.7921, "num_input_tokens_seen": 3352821760, "step": 6395 }, { "epoch": 0.31037639199568384, "grad_norm": 0.251953125, "learning_rate": 4.129093511657899e-05, "loss": 2.7853, "num_input_tokens_seen": 3355443200, "step": 6400 }, { "epoch": 0.31061887355193046, "grad_norm": 0.251953125, "learning_rate": 4.1275723679402984e-05, "loss": 2.7895, "num_input_tokens_seen": 3358064640, "step": 6405 }, { "epoch": 0.3108613551081771, "grad_norm": 0.267578125, "learning_rate": 4.126050177683483e-05, "loss": 2.7789, "num_input_tokens_seen": 3360686080, "step": 6410 }, { "epoch": 0.3111038366644237, "grad_norm": 0.25390625, "learning_rate": 4.12452694186623e-05, "loss": 2.774, "num_input_tokens_seen": 3363307520, "step": 6415 }, { "epoch": 0.3113463182206703, "grad_norm": 0.25390625, "learning_rate": 4.123002661467992e-05, "loss": 2.7821, "num_input_tokens_seen": 3365928960, "step": 6420 }, { "epoch": 0.31158879977691695, "grad_norm": 0.25390625, "learning_rate": 4.1214773374688877e-05, "loss": 2.777, "num_input_tokens_seen": 3368550400, "step": 6425 }, { "epoch": 0.3118312813331636, "grad_norm": 0.25, "learning_rate": 4.119950970849712e-05, "loss": 2.7865, "num_input_tokens_seen": 3371171840, "step": 6430 }, { "epoch": 0.31207376288941024, "grad_norm": 0.255859375, "learning_rate": 4.118423562591928e-05, "loss": 2.7788, "num_input_tokens_seen": 3373793280, "step": 6435 }, { "epoch": 0.31231624444565687, "grad_norm": 0.259765625, "learning_rate": 4.1168951136776676e-05, "loss": 2.7915, "num_input_tokens_seen": 3376414720, "step": 6440 }, { "epoch": 0.3125587260019035, "grad_norm": 0.263671875, "learning_rate": 4.1153656250897344e-05, "loss": 2.7871, "num_input_tokens_seen": 3379036160, "step": 6445 }, { "epoch": 0.3128012075581501, "grad_norm": 0.251953125, "learning_rate": 4.113835097811598e-05, "loss": 2.7868, "num_input_tokens_seen": 3381657600, "step": 6450 }, { "epoch": 0.31304368911439673, "grad_norm": 0.2470703125, "learning_rate": 4.112303532827398e-05, "loss": 2.7706, "num_input_tokens_seen": 3384279040, "step": 6455 }, { "epoch": 0.31328617067064335, "grad_norm": 0.251953125, "learning_rate": 4.11077093112194e-05, "loss": 2.7861, "num_input_tokens_seen": 3386900480, "step": 6460 }, { "epoch": 0.31352865222688997, "grad_norm": 0.25, "learning_rate": 4.1092372936806964e-05, "loss": 2.7775, "num_input_tokens_seen": 3389521920, "step": 6465 }, { "epoch": 0.31377113378313665, "grad_norm": 0.2578125, "learning_rate": 4.107702621489805e-05, "loss": 2.7656, "num_input_tokens_seen": 3392143360, "step": 6470 }, { "epoch": 0.31401361533938327, "grad_norm": 0.25, "learning_rate": 4.106166915536071e-05, "loss": 2.7714, "num_input_tokens_seen": 3394764800, "step": 6475 }, { "epoch": 0.3142560968956299, "grad_norm": 0.244140625, "learning_rate": 4.104630176806962e-05, "loss": 2.7862, "num_input_tokens_seen": 3397386240, "step": 6480 }, { "epoch": 0.3144985784518765, "grad_norm": 0.2578125, "learning_rate": 4.103092406290611e-05, "loss": 2.7868, "num_input_tokens_seen": 3400007680, "step": 6485 }, { "epoch": 0.31474106000812313, "grad_norm": 0.267578125, "learning_rate": 4.101553604975813e-05, "loss": 2.7785, "num_input_tokens_seen": 3402629120, "step": 6490 }, { "epoch": 0.31498354156436975, "grad_norm": 0.248046875, "learning_rate": 4.100013773852027e-05, "loss": 2.7844, "num_input_tokens_seen": 3405250560, "step": 6495 }, { "epoch": 0.31522602312061637, "grad_norm": 0.25, "learning_rate": 4.098472913909376e-05, "loss": 2.7916, "num_input_tokens_seen": 3407872000, "step": 6500 }, { "epoch": 0.315468504676863, "grad_norm": 0.267578125, "learning_rate": 4.096931026138642e-05, "loss": 2.7721, "num_input_tokens_seen": 3410493440, "step": 6505 }, { "epoch": 0.31571098623310967, "grad_norm": 0.25, "learning_rate": 4.095388111531266e-05, "loss": 2.7925, "num_input_tokens_seen": 3413114880, "step": 6510 }, { "epoch": 0.3159534677893563, "grad_norm": 0.2578125, "learning_rate": 4.093844171079355e-05, "loss": 2.7947, "num_input_tokens_seen": 3415736320, "step": 6515 }, { "epoch": 0.3161959493456029, "grad_norm": 0.2578125, "learning_rate": 4.09229920577567e-05, "loss": 2.7875, "num_input_tokens_seen": 3418357760, "step": 6520 }, { "epoch": 0.31643843090184953, "grad_norm": 0.255859375, "learning_rate": 4.090753216613635e-05, "loss": 2.793, "num_input_tokens_seen": 3420979200, "step": 6525 }, { "epoch": 0.31668091245809615, "grad_norm": 0.2578125, "learning_rate": 4.0892062045873296e-05, "loss": 2.788, "num_input_tokens_seen": 3423600640, "step": 6530 }, { "epoch": 0.3169233940143428, "grad_norm": 0.251953125, "learning_rate": 4.087658170691493e-05, "loss": 2.7857, "num_input_tokens_seen": 3426222080, "step": 6535 }, { "epoch": 0.3171658755705894, "grad_norm": 0.2421875, "learning_rate": 4.0861091159215194e-05, "loss": 2.7709, "num_input_tokens_seen": 3428843520, "step": 6540 }, { "epoch": 0.317408357126836, "grad_norm": 0.2490234375, "learning_rate": 4.0845590412734625e-05, "loss": 2.7727, "num_input_tokens_seen": 3431464960, "step": 6545 }, { "epoch": 0.3176508386830827, "grad_norm": 0.248046875, "learning_rate": 4.083007947744029e-05, "loss": 2.7918, "num_input_tokens_seen": 3434086400, "step": 6550 }, { "epoch": 0.3178933202393293, "grad_norm": 0.263671875, "learning_rate": 4.081455836330581e-05, "loss": 2.7887, "num_input_tokens_seen": 3436707840, "step": 6555 }, { "epoch": 0.31813580179557593, "grad_norm": 0.251953125, "learning_rate": 4.079902708031137e-05, "loss": 2.7735, "num_input_tokens_seen": 3439329280, "step": 6560 }, { "epoch": 0.31837828335182256, "grad_norm": 0.25390625, "learning_rate": 4.078348563844368e-05, "loss": 2.7832, "num_input_tokens_seen": 3441950720, "step": 6565 }, { "epoch": 0.3186207649080692, "grad_norm": 0.24609375, "learning_rate": 4.076793404769599e-05, "loss": 2.7718, "num_input_tokens_seen": 3444572160, "step": 6570 }, { "epoch": 0.3188632464643158, "grad_norm": 0.251953125, "learning_rate": 4.075237231806806e-05, "loss": 2.7927, "num_input_tokens_seen": 3447193600, "step": 6575 }, { "epoch": 0.3191057280205624, "grad_norm": 0.259765625, "learning_rate": 4.0736800459566175e-05, "loss": 2.7647, "num_input_tokens_seen": 3449815040, "step": 6580 }, { "epoch": 0.31934820957680904, "grad_norm": 0.26953125, "learning_rate": 4.0721218482203146e-05, "loss": 2.7822, "num_input_tokens_seen": 3452436480, "step": 6585 }, { "epoch": 0.3195906911330557, "grad_norm": 0.255859375, "learning_rate": 4.0705626395998294e-05, "loss": 2.7805, "num_input_tokens_seen": 3455057920, "step": 6590 }, { "epoch": 0.31983317268930234, "grad_norm": 0.255859375, "learning_rate": 4.0690024210977405e-05, "loss": 2.7821, "num_input_tokens_seen": 3457679360, "step": 6595 }, { "epoch": 0.32007565424554896, "grad_norm": 0.25, "learning_rate": 4.06744119371728e-05, "loss": 2.7653, "num_input_tokens_seen": 3460300800, "step": 6600 }, { "epoch": 0.32007565424554896, "eval_accuracy": 0.4551441133365901, "eval_loss": 2.7476091384887695, "eval_runtime": 5.9773, "eval_samples_per_second": 50.19, "eval_steps_per_second": 6.357, "num_input_tokens_seen": 3460300800, "step": 6600 }, { "epoch": 0.3203181358017956, "grad_norm": 0.271484375, "learning_rate": 4.0658789584623246e-05, "loss": 2.7825, "num_input_tokens_seen": 3462922240, "step": 6605 }, { "epoch": 0.3205606173580422, "grad_norm": 0.251953125, "learning_rate": 4.064315716337404e-05, "loss": 2.7973, "num_input_tokens_seen": 3465543680, "step": 6610 }, { "epoch": 0.3208030989142888, "grad_norm": 0.255859375, "learning_rate": 4.062751468347691e-05, "loss": 2.7879, "num_input_tokens_seen": 3468165120, "step": 6615 }, { "epoch": 0.32104558047053544, "grad_norm": 0.265625, "learning_rate": 4.0611862154990074e-05, "loss": 2.7872, "num_input_tokens_seen": 3470786560, "step": 6620 }, { "epoch": 0.32128806202678206, "grad_norm": 0.25, "learning_rate": 4.059619958797821e-05, "loss": 2.7656, "num_input_tokens_seen": 3473408000, "step": 6625 }, { "epoch": 0.32153054358302874, "grad_norm": 0.26171875, "learning_rate": 4.0580526992512435e-05, "loss": 2.7809, "num_input_tokens_seen": 3476029440, "step": 6630 }, { "epoch": 0.32177302513927536, "grad_norm": 0.2578125, "learning_rate": 4.056484437867033e-05, "loss": 2.7756, "num_input_tokens_seen": 3478650880, "step": 6635 }, { "epoch": 0.322015506695522, "grad_norm": 0.2578125, "learning_rate": 4.054915175653592e-05, "loss": 2.786, "num_input_tokens_seen": 3481272320, "step": 6640 }, { "epoch": 0.3222579882517686, "grad_norm": 0.265625, "learning_rate": 4.053344913619965e-05, "loss": 2.7802, "num_input_tokens_seen": 3483893760, "step": 6645 }, { "epoch": 0.3225004698080152, "grad_norm": 0.263671875, "learning_rate": 4.051773652775842e-05, "loss": 2.794, "num_input_tokens_seen": 3486515200, "step": 6650 }, { "epoch": 0.32274295136426184, "grad_norm": 0.25390625, "learning_rate": 4.0502013941315516e-05, "loss": 2.7916, "num_input_tokens_seen": 3489136640, "step": 6655 }, { "epoch": 0.32298543292050846, "grad_norm": 0.2451171875, "learning_rate": 4.048628138698067e-05, "loss": 2.78, "num_input_tokens_seen": 3491758080, "step": 6660 }, { "epoch": 0.3232279144767551, "grad_norm": 0.2578125, "learning_rate": 4.047053887487e-05, "loss": 2.7756, "num_input_tokens_seen": 3494379520, "step": 6665 }, { "epoch": 0.32347039603300176, "grad_norm": 0.248046875, "learning_rate": 4.045478641510606e-05, "loss": 2.7777, "num_input_tokens_seen": 3497000960, "step": 6670 }, { "epoch": 0.3237128775892484, "grad_norm": 0.2578125, "learning_rate": 4.0439024017817774e-05, "loss": 2.7721, "num_input_tokens_seen": 3499622400, "step": 6675 }, { "epoch": 0.323955359145495, "grad_norm": 0.251953125, "learning_rate": 4.042325169314045e-05, "loss": 2.7897, "num_input_tokens_seen": 3502243840, "step": 6680 }, { "epoch": 0.3241978407017416, "grad_norm": 0.2578125, "learning_rate": 4.0407469451215804e-05, "loss": 2.7719, "num_input_tokens_seen": 3504865280, "step": 6685 }, { "epoch": 0.32444032225798825, "grad_norm": 0.263671875, "learning_rate": 4.039167730219191e-05, "loss": 2.7843, "num_input_tokens_seen": 3507486720, "step": 6690 }, { "epoch": 0.32468280381423487, "grad_norm": 0.265625, "learning_rate": 4.037587525622322e-05, "loss": 2.7853, "num_input_tokens_seen": 3510108160, "step": 6695 }, { "epoch": 0.3249252853704815, "grad_norm": 0.263671875, "learning_rate": 4.036006332347055e-05, "loss": 2.7768, "num_input_tokens_seen": 3512729600, "step": 6700 }, { "epoch": 0.32516776692672816, "grad_norm": 0.2578125, "learning_rate": 4.0344241514101075e-05, "loss": 2.7949, "num_input_tokens_seen": 3515351040, "step": 6705 }, { "epoch": 0.3254102484829748, "grad_norm": 0.26171875, "learning_rate": 4.0328409838288304e-05, "loss": 2.7683, "num_input_tokens_seen": 3517972480, "step": 6710 }, { "epoch": 0.3256527300392214, "grad_norm": 0.2578125, "learning_rate": 4.031256830621212e-05, "loss": 2.7793, "num_input_tokens_seen": 3520593920, "step": 6715 }, { "epoch": 0.325895211595468, "grad_norm": 0.251953125, "learning_rate": 4.029671692805872e-05, "loss": 2.7752, "num_input_tokens_seen": 3523215360, "step": 6720 }, { "epoch": 0.32613769315171465, "grad_norm": 0.259765625, "learning_rate": 4.0280855714020625e-05, "loss": 2.7741, "num_input_tokens_seen": 3525836800, "step": 6725 }, { "epoch": 0.32638017470796127, "grad_norm": 0.251953125, "learning_rate": 4.026498467429672e-05, "loss": 2.7804, "num_input_tokens_seen": 3528458240, "step": 6730 }, { "epoch": 0.3266226562642079, "grad_norm": 0.25, "learning_rate": 4.024910381909218e-05, "loss": 2.7863, "num_input_tokens_seen": 3531079680, "step": 6735 }, { "epoch": 0.3268651378204545, "grad_norm": 0.2490234375, "learning_rate": 4.0233213158618475e-05, "loss": 2.7718, "num_input_tokens_seen": 3533701120, "step": 6740 }, { "epoch": 0.3271076193767012, "grad_norm": 0.2578125, "learning_rate": 4.021731270309341e-05, "loss": 2.7902, "num_input_tokens_seen": 3536322560, "step": 6745 }, { "epoch": 0.3273501009329478, "grad_norm": 0.26171875, "learning_rate": 4.020140246274109e-05, "loss": 2.7941, "num_input_tokens_seen": 3538944000, "step": 6750 }, { "epoch": 0.32759258248919443, "grad_norm": 0.265625, "learning_rate": 4.018548244779187e-05, "loss": 2.7729, "num_input_tokens_seen": 3541565440, "step": 6755 }, { "epoch": 0.32783506404544105, "grad_norm": 0.25390625, "learning_rate": 4.0169552668482445e-05, "loss": 2.7828, "num_input_tokens_seen": 3544186880, "step": 6760 }, { "epoch": 0.32807754560168767, "grad_norm": 0.2578125, "learning_rate": 4.0153613135055755e-05, "loss": 2.7757, "num_input_tokens_seen": 3546808320, "step": 6765 }, { "epoch": 0.3283200271579343, "grad_norm": 0.255859375, "learning_rate": 4.013766385776102e-05, "loss": 2.7726, "num_input_tokens_seen": 3549429760, "step": 6770 }, { "epoch": 0.3285625087141809, "grad_norm": 0.2578125, "learning_rate": 4.012170484685371e-05, "loss": 2.7886, "num_input_tokens_seen": 3552051200, "step": 6775 }, { "epoch": 0.32880499027042753, "grad_norm": 0.2578125, "learning_rate": 4.01057361125956e-05, "loss": 2.7867, "num_input_tokens_seen": 3554672640, "step": 6780 }, { "epoch": 0.3290474718266742, "grad_norm": 0.25390625, "learning_rate": 4.0089757665254655e-05, "loss": 2.7773, "num_input_tokens_seen": 3557294080, "step": 6785 }, { "epoch": 0.32928995338292083, "grad_norm": 0.25390625, "learning_rate": 4.0073769515105134e-05, "loss": 2.7719, "num_input_tokens_seen": 3559915520, "step": 6790 }, { "epoch": 0.32953243493916745, "grad_norm": 0.25, "learning_rate": 4.0057771672427515e-05, "loss": 2.799, "num_input_tokens_seen": 3562536960, "step": 6795 }, { "epoch": 0.32977491649541407, "grad_norm": 0.244140625, "learning_rate": 4.004176414750851e-05, "loss": 2.7746, "num_input_tokens_seen": 3565158400, "step": 6800 }, { "epoch": 0.3300173980516607, "grad_norm": 0.25, "learning_rate": 4.002574695064106e-05, "loss": 2.7851, "num_input_tokens_seen": 3567779840, "step": 6805 }, { "epoch": 0.3302598796079073, "grad_norm": 0.25390625, "learning_rate": 4.000972009212431e-05, "loss": 2.7826, "num_input_tokens_seen": 3570401280, "step": 6810 }, { "epoch": 0.33050236116415393, "grad_norm": 0.263671875, "learning_rate": 3.999368358226365e-05, "loss": 2.7749, "num_input_tokens_seen": 3573022720, "step": 6815 }, { "epoch": 0.33074484272040056, "grad_norm": 0.255859375, "learning_rate": 3.997763743137064e-05, "loss": 2.7771, "num_input_tokens_seen": 3575644160, "step": 6820 }, { "epoch": 0.33098732427664723, "grad_norm": 0.26171875, "learning_rate": 3.996158164976307e-05, "loss": 2.7887, "num_input_tokens_seen": 3578265600, "step": 6825 }, { "epoch": 0.33122980583289385, "grad_norm": 0.2578125, "learning_rate": 3.994551624776489e-05, "loss": 2.7924, "num_input_tokens_seen": 3580887040, "step": 6830 }, { "epoch": 0.3314722873891405, "grad_norm": 0.2490234375, "learning_rate": 3.992944123570627e-05, "loss": 2.7865, "num_input_tokens_seen": 3583508480, "step": 6835 }, { "epoch": 0.3317147689453871, "grad_norm": 0.255859375, "learning_rate": 3.991335662392353e-05, "loss": 2.7756, "num_input_tokens_seen": 3586129920, "step": 6840 }, { "epoch": 0.3319572505016337, "grad_norm": 0.2490234375, "learning_rate": 3.989726242275918e-05, "loss": 2.7716, "num_input_tokens_seen": 3588751360, "step": 6845 }, { "epoch": 0.33219973205788034, "grad_norm": 0.255859375, "learning_rate": 3.988115864256191e-05, "loss": 2.7869, "num_input_tokens_seen": 3591372800, "step": 6850 }, { "epoch": 0.33244221361412696, "grad_norm": 0.259765625, "learning_rate": 3.986504529368653e-05, "loss": 2.8014, "num_input_tokens_seen": 3593994240, "step": 6855 }, { "epoch": 0.3326846951703736, "grad_norm": 0.251953125, "learning_rate": 3.984892238649403e-05, "loss": 2.782, "num_input_tokens_seen": 3596615680, "step": 6860 }, { "epoch": 0.33292717672662026, "grad_norm": 0.2451171875, "learning_rate": 3.983278993135154e-05, "loss": 2.7856, "num_input_tokens_seen": 3599237120, "step": 6865 }, { "epoch": 0.3331696582828669, "grad_norm": 0.251953125, "learning_rate": 3.9816647938632326e-05, "loss": 2.7754, "num_input_tokens_seen": 3601858560, "step": 6870 }, { "epoch": 0.3334121398391135, "grad_norm": 0.2470703125, "learning_rate": 3.9800496418715805e-05, "loss": 2.79, "num_input_tokens_seen": 3604480000, "step": 6875 }, { "epoch": 0.3336546213953601, "grad_norm": 0.25, "learning_rate": 3.9784335381987485e-05, "loss": 2.7723, "num_input_tokens_seen": 3607101440, "step": 6880 }, { "epoch": 0.33389710295160674, "grad_norm": 0.26171875, "learning_rate": 3.9768164838839026e-05, "loss": 2.7887, "num_input_tokens_seen": 3609722880, "step": 6885 }, { "epoch": 0.33413958450785336, "grad_norm": 0.248046875, "learning_rate": 3.97519847996682e-05, "loss": 2.7714, "num_input_tokens_seen": 3612344320, "step": 6890 }, { "epoch": 0.3343820660641, "grad_norm": 0.2490234375, "learning_rate": 3.973579527487884e-05, "loss": 2.7813, "num_input_tokens_seen": 3614965760, "step": 6895 }, { "epoch": 0.3346245476203466, "grad_norm": 0.251953125, "learning_rate": 3.971959627488094e-05, "loss": 2.7843, "num_input_tokens_seen": 3617587200, "step": 6900 }, { "epoch": 0.3346245476203466, "eval_accuracy": 0.45507409216739947, "eval_loss": 2.7469868659973145, "eval_runtime": 5.8611, "eval_samples_per_second": 51.185, "eval_steps_per_second": 6.483, "num_input_tokens_seen": 3617587200, "step": 6900 }, { "epoch": 0.3348670291765933, "grad_norm": 0.25390625, "learning_rate": 3.9703387810090555e-05, "loss": 2.7783, "num_input_tokens_seen": 3620208640, "step": 6905 }, { "epoch": 0.3351095107328399, "grad_norm": 0.251953125, "learning_rate": 3.968716989092982e-05, "loss": 2.7871, "num_input_tokens_seen": 3622830080, "step": 6910 }, { "epoch": 0.3353519922890865, "grad_norm": 0.25390625, "learning_rate": 3.9670942527826956e-05, "loss": 2.7889, "num_input_tokens_seen": 3625451520, "step": 6915 }, { "epoch": 0.33559447384533314, "grad_norm": 0.2392578125, "learning_rate": 3.965470573121627e-05, "loss": 2.7702, "num_input_tokens_seen": 3628072960, "step": 6920 }, { "epoch": 0.33583695540157976, "grad_norm": 0.244140625, "learning_rate": 3.9638459511538116e-05, "loss": 2.7807, "num_input_tokens_seen": 3630694400, "step": 6925 }, { "epoch": 0.3360794369578264, "grad_norm": 0.26171875, "learning_rate": 3.9622203879238925e-05, "loss": 2.7756, "num_input_tokens_seen": 3633315840, "step": 6930 }, { "epoch": 0.336321918514073, "grad_norm": 0.25390625, "learning_rate": 3.960593884477116e-05, "loss": 2.7712, "num_input_tokens_seen": 3635937280, "step": 6935 }, { "epoch": 0.3365644000703196, "grad_norm": 0.248046875, "learning_rate": 3.958966441859334e-05, "loss": 2.7854, "num_input_tokens_seen": 3638558720, "step": 6940 }, { "epoch": 0.3368068816265663, "grad_norm": 0.25, "learning_rate": 3.957338061117003e-05, "loss": 2.7908, "num_input_tokens_seen": 3641180160, "step": 6945 }, { "epoch": 0.3370493631828129, "grad_norm": 0.2470703125, "learning_rate": 3.955708743297182e-05, "loss": 2.764, "num_input_tokens_seen": 3643801600, "step": 6950 }, { "epoch": 0.33729184473905954, "grad_norm": 0.25, "learning_rate": 3.954078489447531e-05, "loss": 2.7863, "num_input_tokens_seen": 3646423040, "step": 6955 }, { "epoch": 0.33753432629530616, "grad_norm": 0.26171875, "learning_rate": 3.952447300616315e-05, "loss": 2.7685, "num_input_tokens_seen": 3649044480, "step": 6960 }, { "epoch": 0.3377768078515528, "grad_norm": 0.26171875, "learning_rate": 3.9508151778523996e-05, "loss": 2.7669, "num_input_tokens_seen": 3651665920, "step": 6965 }, { "epoch": 0.3380192894077994, "grad_norm": 0.259765625, "learning_rate": 3.949182122205247e-05, "loss": 2.7774, "num_input_tokens_seen": 3654287360, "step": 6970 }, { "epoch": 0.338261770964046, "grad_norm": 0.259765625, "learning_rate": 3.947548134724924e-05, "loss": 2.7872, "num_input_tokens_seen": 3656908800, "step": 6975 }, { "epoch": 0.33850425252029265, "grad_norm": 0.2578125, "learning_rate": 3.945913216462095e-05, "loss": 2.7829, "num_input_tokens_seen": 3659530240, "step": 6980 }, { "epoch": 0.3387467340765393, "grad_norm": 0.251953125, "learning_rate": 3.944277368468022e-05, "loss": 2.791, "num_input_tokens_seen": 3662151680, "step": 6985 }, { "epoch": 0.33898921563278595, "grad_norm": 0.2490234375, "learning_rate": 3.942640591794565e-05, "loss": 2.7832, "num_input_tokens_seen": 3664773120, "step": 6990 }, { "epoch": 0.33923169718903257, "grad_norm": 0.25390625, "learning_rate": 3.9410028874941836e-05, "loss": 2.7644, "num_input_tokens_seen": 3667394560, "step": 6995 }, { "epoch": 0.3394741787452792, "grad_norm": 0.25, "learning_rate": 3.93936425661993e-05, "loss": 2.7861, "num_input_tokens_seen": 3670016000, "step": 7000 }, { "epoch": 0.3397166603015258, "grad_norm": 0.251953125, "learning_rate": 3.9377247002254546e-05, "loss": 2.7767, "num_input_tokens_seen": 3672637440, "step": 7005 }, { "epoch": 0.33995914185777243, "grad_norm": 0.2490234375, "learning_rate": 3.936084219365003e-05, "loss": 2.7971, "num_input_tokens_seen": 3675258880, "step": 7010 }, { "epoch": 0.34020162341401905, "grad_norm": 0.2470703125, "learning_rate": 3.9344428150934135e-05, "loss": 2.7858, "num_input_tokens_seen": 3677880320, "step": 7015 }, { "epoch": 0.3404441049702657, "grad_norm": 0.251953125, "learning_rate": 3.9328004884661205e-05, "loss": 2.7868, "num_input_tokens_seen": 3680501760, "step": 7020 }, { "epoch": 0.34068658652651235, "grad_norm": 0.25, "learning_rate": 3.9311572405391495e-05, "loss": 2.7893, "num_input_tokens_seen": 3683123200, "step": 7025 }, { "epoch": 0.34092906808275897, "grad_norm": 0.2451171875, "learning_rate": 3.9295130723691206e-05, "loss": 2.7755, "num_input_tokens_seen": 3685744640, "step": 7030 }, { "epoch": 0.3411715496390056, "grad_norm": 0.26171875, "learning_rate": 3.927867985013241e-05, "loss": 2.7824, "num_input_tokens_seen": 3688366080, "step": 7035 }, { "epoch": 0.3414140311952522, "grad_norm": 0.25390625, "learning_rate": 3.926221979529316e-05, "loss": 2.7815, "num_input_tokens_seen": 3690987520, "step": 7040 }, { "epoch": 0.34165651275149883, "grad_norm": 0.2490234375, "learning_rate": 3.924575056975737e-05, "loss": 2.7821, "num_input_tokens_seen": 3693608960, "step": 7045 }, { "epoch": 0.34189899430774545, "grad_norm": 0.25, "learning_rate": 3.922927218411482e-05, "loss": 2.7823, "num_input_tokens_seen": 3696230400, "step": 7050 }, { "epoch": 0.3421414758639921, "grad_norm": 0.25, "learning_rate": 3.921278464896124e-05, "loss": 2.7939, "num_input_tokens_seen": 3698851840, "step": 7055 }, { "epoch": 0.34238395742023875, "grad_norm": 0.2431640625, "learning_rate": 3.919628797489823e-05, "loss": 2.787, "num_input_tokens_seen": 3701473280, "step": 7060 }, { "epoch": 0.34262643897648537, "grad_norm": 0.255859375, "learning_rate": 3.9179782172533216e-05, "loss": 2.7935, "num_input_tokens_seen": 3704094720, "step": 7065 }, { "epoch": 0.342868920532732, "grad_norm": 0.255859375, "learning_rate": 3.916326725247957e-05, "loss": 2.7681, "num_input_tokens_seen": 3706716160, "step": 7070 }, { "epoch": 0.3431114020889786, "grad_norm": 0.251953125, "learning_rate": 3.9146743225356483e-05, "loss": 2.7899, "num_input_tokens_seen": 3709337600, "step": 7075 }, { "epoch": 0.34335388364522523, "grad_norm": 0.2470703125, "learning_rate": 3.9130210101788994e-05, "loss": 2.797, "num_input_tokens_seen": 3711959040, "step": 7080 }, { "epoch": 0.34359636520147185, "grad_norm": 0.265625, "learning_rate": 3.9113667892408015e-05, "loss": 2.7703, "num_input_tokens_seen": 3714580480, "step": 7085 }, { "epoch": 0.3438388467577185, "grad_norm": 0.255859375, "learning_rate": 3.909711660785028e-05, "loss": 2.7729, "num_input_tokens_seen": 3717201920, "step": 7090 }, { "epoch": 0.3440813283139651, "grad_norm": 0.25, "learning_rate": 3.908055625875838e-05, "loss": 2.7915, "num_input_tokens_seen": 3719823360, "step": 7095 }, { "epoch": 0.34432380987021177, "grad_norm": 0.25, "learning_rate": 3.906398685578071e-05, "loss": 2.7717, "num_input_tokens_seen": 3722444800, "step": 7100 }, { "epoch": 0.3445662914264584, "grad_norm": 0.2431640625, "learning_rate": 3.9047408409571516e-05, "loss": 2.7824, "num_input_tokens_seen": 3725066240, "step": 7105 }, { "epoch": 0.344808772982705, "grad_norm": 0.259765625, "learning_rate": 3.903082093079083e-05, "loss": 2.774, "num_input_tokens_seen": 3727687680, "step": 7110 }, { "epoch": 0.34505125453895164, "grad_norm": 0.25390625, "learning_rate": 3.901422443010451e-05, "loss": 2.7723, "num_input_tokens_seen": 3730309120, "step": 7115 }, { "epoch": 0.34529373609519826, "grad_norm": 0.2470703125, "learning_rate": 3.8997618918184206e-05, "loss": 2.7748, "num_input_tokens_seen": 3732930560, "step": 7120 }, { "epoch": 0.3455362176514449, "grad_norm": 0.267578125, "learning_rate": 3.898100440570737e-05, "loss": 2.7756, "num_input_tokens_seen": 3735552000, "step": 7125 }, { "epoch": 0.3457786992076915, "grad_norm": 0.259765625, "learning_rate": 3.896438090335724e-05, "loss": 2.7742, "num_input_tokens_seen": 3738173440, "step": 7130 }, { "epoch": 0.3460211807639381, "grad_norm": 0.255859375, "learning_rate": 3.8947748421822826e-05, "loss": 2.7715, "num_input_tokens_seen": 3740794880, "step": 7135 }, { "epoch": 0.3462636623201848, "grad_norm": 0.25, "learning_rate": 3.893110697179892e-05, "loss": 2.7701, "num_input_tokens_seen": 3743416320, "step": 7140 }, { "epoch": 0.3465061438764314, "grad_norm": 0.255859375, "learning_rate": 3.891445656398608e-05, "loss": 2.7833, "num_input_tokens_seen": 3746037760, "step": 7145 }, { "epoch": 0.34674862543267804, "grad_norm": 0.255859375, "learning_rate": 3.8897797209090616e-05, "loss": 2.777, "num_input_tokens_seen": 3748659200, "step": 7150 }, { "epoch": 0.34699110698892466, "grad_norm": 0.25390625, "learning_rate": 3.8881128917824606e-05, "loss": 2.7722, "num_input_tokens_seen": 3751280640, "step": 7155 }, { "epoch": 0.3472335885451713, "grad_norm": 0.2451171875, "learning_rate": 3.886445170090586e-05, "loss": 2.7933, "num_input_tokens_seen": 3753902080, "step": 7160 }, { "epoch": 0.3474760701014179, "grad_norm": 0.255859375, "learning_rate": 3.884776556905793e-05, "loss": 2.7814, "num_input_tokens_seen": 3756523520, "step": 7165 }, { "epoch": 0.3477185516576645, "grad_norm": 0.248046875, "learning_rate": 3.883107053301012e-05, "loss": 2.7849, "num_input_tokens_seen": 3759144960, "step": 7170 }, { "epoch": 0.34796103321391114, "grad_norm": 0.255859375, "learning_rate": 3.8814366603497415e-05, "loss": 2.7713, "num_input_tokens_seen": 3761766400, "step": 7175 }, { "epoch": 0.3482035147701578, "grad_norm": 0.26171875, "learning_rate": 3.8797653791260565e-05, "loss": 2.7698, "num_input_tokens_seen": 3764387840, "step": 7180 }, { "epoch": 0.34844599632640444, "grad_norm": 0.251953125, "learning_rate": 3.878093210704602e-05, "loss": 2.7729, "num_input_tokens_seen": 3767009280, "step": 7185 }, { "epoch": 0.34868847788265106, "grad_norm": 0.255859375, "learning_rate": 3.8764201561605904e-05, "loss": 2.7877, "num_input_tokens_seen": 3769630720, "step": 7190 }, { "epoch": 0.3489309594388977, "grad_norm": 0.265625, "learning_rate": 3.874746216569808e-05, "loss": 2.7723, "num_input_tokens_seen": 3772252160, "step": 7195 }, { "epoch": 0.3491734409951443, "grad_norm": 0.25390625, "learning_rate": 3.873071393008608e-05, "loss": 2.7765, "num_input_tokens_seen": 3774873600, "step": 7200 }, { "epoch": 0.3491734409951443, "eval_accuracy": 0.4550219833903273, "eval_loss": 2.7463502883911133, "eval_runtime": 5.8692, "eval_samples_per_second": 51.114, "eval_steps_per_second": 6.474, "num_input_tokens_seen": 3774873600, "step": 7200 }, { "epoch": 0.3494159225513909, "grad_norm": 0.2578125, "learning_rate": 3.871395686553912e-05, "loss": 2.7959, "num_input_tokens_seen": 3777495040, "step": 7205 }, { "epoch": 0.34965840410763754, "grad_norm": 0.255859375, "learning_rate": 3.869719098283211e-05, "loss": 2.7815, "num_input_tokens_seen": 3780116480, "step": 7210 }, { "epoch": 0.34990088566388416, "grad_norm": 0.25, "learning_rate": 3.868041629274561e-05, "loss": 2.774, "num_input_tokens_seen": 3782737920, "step": 7215 }, { "epoch": 0.35014336722013084, "grad_norm": 0.25, "learning_rate": 3.866363280606584e-05, "loss": 2.7742, "num_input_tokens_seen": 3785359360, "step": 7220 }, { "epoch": 0.35038584877637746, "grad_norm": 0.251953125, "learning_rate": 3.86468405335847e-05, "loss": 2.7824, "num_input_tokens_seen": 3787980800, "step": 7225 }, { "epoch": 0.3506283303326241, "grad_norm": 0.26171875, "learning_rate": 3.863003948609972e-05, "loss": 2.7731, "num_input_tokens_seen": 3790602240, "step": 7230 }, { "epoch": 0.3508708118888707, "grad_norm": 0.26171875, "learning_rate": 3.861322967441409e-05, "loss": 2.7944, "num_input_tokens_seen": 3793223680, "step": 7235 }, { "epoch": 0.3511132934451173, "grad_norm": 0.2490234375, "learning_rate": 3.8596411109336604e-05, "loss": 2.7845, "num_input_tokens_seen": 3795845120, "step": 7240 }, { "epoch": 0.35135577500136395, "grad_norm": 0.2490234375, "learning_rate": 3.8579583801681725e-05, "loss": 2.7809, "num_input_tokens_seen": 3798466560, "step": 7245 }, { "epoch": 0.35159825655761057, "grad_norm": 0.2578125, "learning_rate": 3.8562747762269504e-05, "loss": 2.7752, "num_input_tokens_seen": 3801088000, "step": 7250 }, { "epoch": 0.3518407381138572, "grad_norm": 0.26171875, "learning_rate": 3.854590300192562e-05, "loss": 2.7826, "num_input_tokens_seen": 3803709440, "step": 7255 }, { "epoch": 0.35208321967010386, "grad_norm": 0.26171875, "learning_rate": 3.8529049531481364e-05, "loss": 2.7801, "num_input_tokens_seen": 3806330880, "step": 7260 }, { "epoch": 0.3523257012263505, "grad_norm": 0.2490234375, "learning_rate": 3.8512187361773625e-05, "loss": 2.7696, "num_input_tokens_seen": 3808952320, "step": 7265 }, { "epoch": 0.3525681827825971, "grad_norm": 0.24609375, "learning_rate": 3.849531650364488e-05, "loss": 2.7825, "num_input_tokens_seen": 3811573760, "step": 7270 }, { "epoch": 0.3528106643388437, "grad_norm": 0.255859375, "learning_rate": 3.847843696794319e-05, "loss": 2.7795, "num_input_tokens_seen": 3814195200, "step": 7275 }, { "epoch": 0.35305314589509035, "grad_norm": 0.2470703125, "learning_rate": 3.846154876552222e-05, "loss": 2.7835, "num_input_tokens_seen": 3816816640, "step": 7280 }, { "epoch": 0.35329562745133697, "grad_norm": 0.265625, "learning_rate": 3.844465190724116e-05, "loss": 2.7722, "num_input_tokens_seen": 3819438080, "step": 7285 }, { "epoch": 0.3535381090075836, "grad_norm": 0.2470703125, "learning_rate": 3.8427746403964836e-05, "loss": 2.7649, "num_input_tokens_seen": 3822059520, "step": 7290 }, { "epoch": 0.35378059056383027, "grad_norm": 0.255859375, "learning_rate": 3.8410832266563555e-05, "loss": 2.7647, "num_input_tokens_seen": 3824680960, "step": 7295 }, { "epoch": 0.3540230721200769, "grad_norm": 0.265625, "learning_rate": 3.839390950591324e-05, "loss": 2.7675, "num_input_tokens_seen": 3827302400, "step": 7300 }, { "epoch": 0.3542655536763235, "grad_norm": 0.26171875, "learning_rate": 3.837697813289531e-05, "loss": 2.7912, "num_input_tokens_seen": 3829923840, "step": 7305 }, { "epoch": 0.35450803523257013, "grad_norm": 0.251953125, "learning_rate": 3.836003815839676e-05, "loss": 2.7815, "num_input_tokens_seen": 3832545280, "step": 7310 }, { "epoch": 0.35475051678881675, "grad_norm": 0.267578125, "learning_rate": 3.834308959331009e-05, "loss": 2.7858, "num_input_tokens_seen": 3835166720, "step": 7315 }, { "epoch": 0.35499299834506337, "grad_norm": 0.26953125, "learning_rate": 3.832613244853335e-05, "loss": 2.7794, "num_input_tokens_seen": 3837788160, "step": 7320 }, { "epoch": 0.35523547990131, "grad_norm": 0.279296875, "learning_rate": 3.8309166734970064e-05, "loss": 2.7687, "num_input_tokens_seen": 3840409600, "step": 7325 }, { "epoch": 0.3554779614575566, "grad_norm": 0.26171875, "learning_rate": 3.829219246352931e-05, "loss": 2.7792, "num_input_tokens_seen": 3843031040, "step": 7330 }, { "epoch": 0.3557204430138033, "grad_norm": 0.28125, "learning_rate": 3.827520964512564e-05, "loss": 2.7775, "num_input_tokens_seen": 3845652480, "step": 7335 }, { "epoch": 0.3559629245700499, "grad_norm": 0.2578125, "learning_rate": 3.8258218290679124e-05, "loss": 2.7859, "num_input_tokens_seen": 3848273920, "step": 7340 }, { "epoch": 0.35620540612629653, "grad_norm": 0.263671875, "learning_rate": 3.8241218411115306e-05, "loss": 2.7754, "num_input_tokens_seen": 3850895360, "step": 7345 }, { "epoch": 0.35644788768254315, "grad_norm": 0.25390625, "learning_rate": 3.8224210017365205e-05, "loss": 2.7796, "num_input_tokens_seen": 3853516800, "step": 7350 }, { "epoch": 0.3566903692387898, "grad_norm": 0.2578125, "learning_rate": 3.820719312036535e-05, "loss": 2.7865, "num_input_tokens_seen": 3856138240, "step": 7355 }, { "epoch": 0.3569328507950364, "grad_norm": 0.251953125, "learning_rate": 3.819016773105768e-05, "loss": 2.7742, "num_input_tokens_seen": 3858759680, "step": 7360 }, { "epoch": 0.357175332351283, "grad_norm": 0.248046875, "learning_rate": 3.817313386038964e-05, "loss": 2.7804, "num_input_tokens_seen": 3861381120, "step": 7365 }, { "epoch": 0.35741781390752964, "grad_norm": 0.2451171875, "learning_rate": 3.815609151931412e-05, "loss": 2.7833, "num_input_tokens_seen": 3864002560, "step": 7370 }, { "epoch": 0.3576602954637763, "grad_norm": 0.251953125, "learning_rate": 3.813904071878945e-05, "loss": 2.7833, "num_input_tokens_seen": 3866624000, "step": 7375 }, { "epoch": 0.35790277702002293, "grad_norm": 0.248046875, "learning_rate": 3.81219814697794e-05, "loss": 2.7804, "num_input_tokens_seen": 3869245440, "step": 7380 }, { "epoch": 0.35814525857626955, "grad_norm": 0.255859375, "learning_rate": 3.810491378325318e-05, "loss": 2.7805, "num_input_tokens_seen": 3871866880, "step": 7385 }, { "epoch": 0.3583877401325162, "grad_norm": 0.265625, "learning_rate": 3.80878376701854e-05, "loss": 2.787, "num_input_tokens_seen": 3874488320, "step": 7390 }, { "epoch": 0.3586302216887628, "grad_norm": 0.267578125, "learning_rate": 3.807075314155613e-05, "loss": 2.7921, "num_input_tokens_seen": 3877109760, "step": 7395 }, { "epoch": 0.3588727032450094, "grad_norm": 0.259765625, "learning_rate": 3.8053660208350815e-05, "loss": 2.7695, "num_input_tokens_seen": 3879731200, "step": 7400 }, { "epoch": 0.35911518480125604, "grad_norm": 0.259765625, "learning_rate": 3.803655888156033e-05, "loss": 2.7807, "num_input_tokens_seen": 3882352640, "step": 7405 }, { "epoch": 0.35935766635750266, "grad_norm": 0.255859375, "learning_rate": 3.801944917218092e-05, "loss": 2.7863, "num_input_tokens_seen": 3884974080, "step": 7410 }, { "epoch": 0.35960014791374934, "grad_norm": 0.255859375, "learning_rate": 3.800233109121425e-05, "loss": 2.7784, "num_input_tokens_seen": 3887595520, "step": 7415 }, { "epoch": 0.35984262946999596, "grad_norm": 0.2490234375, "learning_rate": 3.798520464966734e-05, "loss": 2.7841, "num_input_tokens_seen": 3890216960, "step": 7420 }, { "epoch": 0.3600851110262426, "grad_norm": 0.2451171875, "learning_rate": 3.7968069858552604e-05, "loss": 2.7715, "num_input_tokens_seen": 3892838400, "step": 7425 }, { "epoch": 0.3603275925824892, "grad_norm": 0.255859375, "learning_rate": 3.795092672888782e-05, "loss": 2.7832, "num_input_tokens_seen": 3895459840, "step": 7430 }, { "epoch": 0.3605700741387358, "grad_norm": 0.26171875, "learning_rate": 3.7933775271696136e-05, "loss": 2.7782, "num_input_tokens_seen": 3898081280, "step": 7435 }, { "epoch": 0.36081255569498244, "grad_norm": 0.263671875, "learning_rate": 3.791661549800604e-05, "loss": 2.7799, "num_input_tokens_seen": 3900702720, "step": 7440 }, { "epoch": 0.36105503725122906, "grad_norm": 0.2470703125, "learning_rate": 3.789944741885136e-05, "loss": 2.7891, "num_input_tokens_seen": 3903324160, "step": 7445 }, { "epoch": 0.3612975188074757, "grad_norm": 0.26171875, "learning_rate": 3.78822710452713e-05, "loss": 2.784, "num_input_tokens_seen": 3905945600, "step": 7450 }, { "epoch": 0.36154000036372236, "grad_norm": 0.25390625, "learning_rate": 3.786508638831036e-05, "loss": 2.776, "num_input_tokens_seen": 3908567040, "step": 7455 }, { "epoch": 0.361782481919969, "grad_norm": 0.248046875, "learning_rate": 3.78478934590184e-05, "loss": 2.7755, "num_input_tokens_seen": 3911188480, "step": 7460 }, { "epoch": 0.3620249634762156, "grad_norm": 0.244140625, "learning_rate": 3.783069226845056e-05, "loss": 2.7844, "num_input_tokens_seen": 3913809920, "step": 7465 }, { "epoch": 0.3622674450324622, "grad_norm": 0.25390625, "learning_rate": 3.7813482827667325e-05, "loss": 2.7889, "num_input_tokens_seen": 3916431360, "step": 7470 }, { "epoch": 0.36250992658870884, "grad_norm": 0.251953125, "learning_rate": 3.779626514773448e-05, "loss": 2.7728, "num_input_tokens_seen": 3919052800, "step": 7475 }, { "epoch": 0.36275240814495546, "grad_norm": 0.26953125, "learning_rate": 3.777903923972307e-05, "loss": 2.7841, "num_input_tokens_seen": 3921674240, "step": 7480 }, { "epoch": 0.3629948897012021, "grad_norm": 0.259765625, "learning_rate": 3.77618051147095e-05, "loss": 2.7764, "num_input_tokens_seen": 3924295680, "step": 7485 }, { "epoch": 0.3632373712574487, "grad_norm": 0.26171875, "learning_rate": 3.77445627837754e-05, "loss": 2.7899, "num_input_tokens_seen": 3926917120, "step": 7490 }, { "epoch": 0.3634798528136954, "grad_norm": 0.2578125, "learning_rate": 3.77273122580077e-05, "loss": 2.7736, "num_input_tokens_seen": 3929538560, "step": 7495 }, { "epoch": 0.363722334369942, "grad_norm": 0.25, "learning_rate": 3.771005354849859e-05, "loss": 2.7778, "num_input_tokens_seen": 3932160000, "step": 7500 }, { "epoch": 0.363722334369942, "eval_accuracy": 0.45522390490148185, "eval_loss": 2.7460319995880127, "eval_runtime": 5.7761, "eval_samples_per_second": 51.938, "eval_steps_per_second": 6.579, "num_input_tokens_seen": 3932160000, "step": 7500 }, { "epoch": 0.3639648159261886, "grad_norm": 0.2451171875, "learning_rate": 3.769278666634555e-05, "loss": 2.786, "num_input_tokens_seen": 3934781440, "step": 7505 }, { "epoch": 0.36420729748243524, "grad_norm": 0.2578125, "learning_rate": 3.767551162265126e-05, "loss": 2.7914, "num_input_tokens_seen": 3937402880, "step": 7510 }, { "epoch": 0.36444977903868186, "grad_norm": 0.255859375, "learning_rate": 3.7658228428523714e-05, "loss": 2.767, "num_input_tokens_seen": 3940024320, "step": 7515 }, { "epoch": 0.3646922605949285, "grad_norm": 0.263671875, "learning_rate": 3.764093709507609e-05, "loss": 2.7903, "num_input_tokens_seen": 3942645760, "step": 7520 }, { "epoch": 0.3649347421511751, "grad_norm": 0.263671875, "learning_rate": 3.7623637633426835e-05, "loss": 2.7748, "num_input_tokens_seen": 3945267200, "step": 7525 }, { "epoch": 0.3651772237074217, "grad_norm": 0.26171875, "learning_rate": 3.760633005469961e-05, "loss": 2.7837, "num_input_tokens_seen": 3947888640, "step": 7530 }, { "epoch": 0.3654197052636684, "grad_norm": 0.259765625, "learning_rate": 3.758901437002329e-05, "loss": 2.7716, "num_input_tokens_seen": 3950510080, "step": 7535 }, { "epoch": 0.365662186819915, "grad_norm": 0.263671875, "learning_rate": 3.7571690590531975e-05, "loss": 2.7612, "num_input_tokens_seen": 3953131520, "step": 7540 }, { "epoch": 0.36590466837616165, "grad_norm": 0.25, "learning_rate": 3.755435872736496e-05, "loss": 2.7766, "num_input_tokens_seen": 3955752960, "step": 7545 }, { "epoch": 0.36614714993240827, "grad_norm": 0.259765625, "learning_rate": 3.7537018791666746e-05, "loss": 2.7842, "num_input_tokens_seen": 3958374400, "step": 7550 }, { "epoch": 0.3663896314886549, "grad_norm": 0.25390625, "learning_rate": 3.751967079458702e-05, "loss": 2.7718, "num_input_tokens_seen": 3960995840, "step": 7555 }, { "epoch": 0.3666321130449015, "grad_norm": 0.25390625, "learning_rate": 3.750231474728065e-05, "loss": 2.7661, "num_input_tokens_seen": 3963617280, "step": 7560 }, { "epoch": 0.36687459460114813, "grad_norm": 0.25, "learning_rate": 3.7484950660907683e-05, "loss": 2.7754, "num_input_tokens_seen": 3966238720, "step": 7565 }, { "epoch": 0.36711707615739475, "grad_norm": 0.2490234375, "learning_rate": 3.746757854663333e-05, "loss": 2.7748, "num_input_tokens_seen": 3968860160, "step": 7570 }, { "epoch": 0.3673595577136414, "grad_norm": 0.25390625, "learning_rate": 3.745019841562798e-05, "loss": 2.7734, "num_input_tokens_seen": 3971481600, "step": 7575 }, { "epoch": 0.36760203926988805, "grad_norm": 0.25390625, "learning_rate": 3.7432810279067153e-05, "loss": 2.7905, "num_input_tokens_seen": 3974103040, "step": 7580 }, { "epoch": 0.36784452082613467, "grad_norm": 0.263671875, "learning_rate": 3.741541414813155e-05, "loss": 2.7909, "num_input_tokens_seen": 3976724480, "step": 7585 }, { "epoch": 0.3680870023823813, "grad_norm": 0.25390625, "learning_rate": 3.739801003400697e-05, "loss": 2.7889, "num_input_tokens_seen": 3979345920, "step": 7590 }, { "epoch": 0.3683294839386279, "grad_norm": 0.2578125, "learning_rate": 3.7380597947884374e-05, "loss": 2.8, "num_input_tokens_seen": 3981967360, "step": 7595 }, { "epoch": 0.36857196549487453, "grad_norm": 0.25, "learning_rate": 3.736317790095985e-05, "loss": 2.7804, "num_input_tokens_seen": 3984588800, "step": 7600 }, { "epoch": 0.36881444705112115, "grad_norm": 0.25390625, "learning_rate": 3.734574990443459e-05, "loss": 2.7764, "num_input_tokens_seen": 3987210240, "step": 7605 }, { "epoch": 0.36905692860736783, "grad_norm": 0.265625, "learning_rate": 3.73283139695149e-05, "loss": 2.7686, "num_input_tokens_seen": 3989831680, "step": 7610 }, { "epoch": 0.36929941016361445, "grad_norm": 0.2578125, "learning_rate": 3.731087010741222e-05, "loss": 2.7881, "num_input_tokens_seen": 3992453120, "step": 7615 }, { "epoch": 0.36954189171986107, "grad_norm": 0.255859375, "learning_rate": 3.7293418329343026e-05, "loss": 2.7763, "num_input_tokens_seen": 3995074560, "step": 7620 }, { "epoch": 0.3697843732761077, "grad_norm": 0.26953125, "learning_rate": 3.7275958646528944e-05, "loss": 2.7661, "num_input_tokens_seen": 3997696000, "step": 7625 }, { "epoch": 0.3700268548323543, "grad_norm": 0.25390625, "learning_rate": 3.725849107019666e-05, "loss": 2.7846, "num_input_tokens_seen": 4000317440, "step": 7630 }, { "epoch": 0.37026933638860093, "grad_norm": 0.25390625, "learning_rate": 3.7241015611577926e-05, "loss": 2.7895, "num_input_tokens_seen": 4002938880, "step": 7635 }, { "epoch": 0.37051181794484755, "grad_norm": 0.255859375, "learning_rate": 3.7223532281909574e-05, "loss": 2.783, "num_input_tokens_seen": 4005560320, "step": 7640 }, { "epoch": 0.3707542995010942, "grad_norm": 0.26171875, "learning_rate": 3.7206041092433495e-05, "loss": 2.7809, "num_input_tokens_seen": 4008181760, "step": 7645 }, { "epoch": 0.37099678105734085, "grad_norm": 0.2578125, "learning_rate": 3.7188542054396625e-05, "loss": 2.765, "num_input_tokens_seen": 4010803200, "step": 7650 }, { "epoch": 0.3712392626135875, "grad_norm": 0.25, "learning_rate": 3.7171035179050964e-05, "loss": 2.7904, "num_input_tokens_seen": 4013424640, "step": 7655 }, { "epoch": 0.3714817441698341, "grad_norm": 0.25390625, "learning_rate": 3.7153520477653545e-05, "loss": 2.7808, "num_input_tokens_seen": 4016046080, "step": 7660 }, { "epoch": 0.3717242257260807, "grad_norm": 0.25390625, "learning_rate": 3.713599796146644e-05, "loss": 2.785, "num_input_tokens_seen": 4018667520, "step": 7665 }, { "epoch": 0.37196670728232734, "grad_norm": 0.259765625, "learning_rate": 3.7118467641756705e-05, "loss": 2.7786, "num_input_tokens_seen": 4021288960, "step": 7670 }, { "epoch": 0.37220918883857396, "grad_norm": 0.25390625, "learning_rate": 3.710092952979647e-05, "loss": 2.7713, "num_input_tokens_seen": 4023910400, "step": 7675 }, { "epoch": 0.3724516703948206, "grad_norm": 0.2392578125, "learning_rate": 3.708338363686285e-05, "loss": 2.778, "num_input_tokens_seen": 4026531840, "step": 7680 }, { "epoch": 0.3726941519510672, "grad_norm": 0.25390625, "learning_rate": 3.706582997423794e-05, "loss": 2.7676, "num_input_tokens_seen": 4029153280, "step": 7685 }, { "epoch": 0.3729366335073139, "grad_norm": 0.2451171875, "learning_rate": 3.704826855320889e-05, "loss": 2.7797, "num_input_tokens_seen": 4031774720, "step": 7690 }, { "epoch": 0.3731791150635605, "grad_norm": 0.259765625, "learning_rate": 3.703069938506778e-05, "loss": 2.7757, "num_input_tokens_seen": 4034396160, "step": 7695 }, { "epoch": 0.3734215966198071, "grad_norm": 0.2578125, "learning_rate": 3.70131224811117e-05, "loss": 2.7846, "num_input_tokens_seen": 4037017600, "step": 7700 }, { "epoch": 0.37366407817605374, "grad_norm": 0.25390625, "learning_rate": 3.6995537852642714e-05, "loss": 2.7727, "num_input_tokens_seen": 4039639040, "step": 7705 }, { "epoch": 0.37390655973230036, "grad_norm": 0.251953125, "learning_rate": 3.697794551096784e-05, "loss": 2.777, "num_input_tokens_seen": 4042260480, "step": 7710 }, { "epoch": 0.374149041288547, "grad_norm": 0.25390625, "learning_rate": 3.696034546739907e-05, "loss": 2.774, "num_input_tokens_seen": 4044881920, "step": 7715 }, { "epoch": 0.3743915228447936, "grad_norm": 0.26171875, "learning_rate": 3.6942737733253345e-05, "loss": 2.7813, "num_input_tokens_seen": 4047503360, "step": 7720 }, { "epoch": 0.3746340044010402, "grad_norm": 0.25390625, "learning_rate": 3.6925122319852546e-05, "loss": 2.7747, "num_input_tokens_seen": 4050124800, "step": 7725 }, { "epoch": 0.3748764859572869, "grad_norm": 0.259765625, "learning_rate": 3.690749923852349e-05, "loss": 2.7741, "num_input_tokens_seen": 4052746240, "step": 7730 }, { "epoch": 0.3751189675135335, "grad_norm": 0.25, "learning_rate": 3.688986850059792e-05, "loss": 2.7742, "num_input_tokens_seen": 4055367680, "step": 7735 }, { "epoch": 0.37536144906978014, "grad_norm": 0.2470703125, "learning_rate": 3.6872230117412534e-05, "loss": 2.7745, "num_input_tokens_seen": 4057989120, "step": 7740 }, { "epoch": 0.37560393062602676, "grad_norm": 0.26953125, "learning_rate": 3.68545841003089e-05, "loss": 2.7821, "num_input_tokens_seen": 4060610560, "step": 7745 }, { "epoch": 0.3758464121822734, "grad_norm": 0.25, "learning_rate": 3.6836930460633534e-05, "loss": 2.7731, "num_input_tokens_seen": 4063232000, "step": 7750 }, { "epoch": 0.37608889373852, "grad_norm": 0.25, "learning_rate": 3.6819269209737836e-05, "loss": 2.7888, "num_input_tokens_seen": 4065853440, "step": 7755 }, { "epoch": 0.3763313752947666, "grad_norm": 0.267578125, "learning_rate": 3.680160035897809e-05, "loss": 2.7733, "num_input_tokens_seen": 4068474880, "step": 7760 }, { "epoch": 0.37657385685101324, "grad_norm": 0.2578125, "learning_rate": 3.678392391971548e-05, "loss": 2.7787, "num_input_tokens_seen": 4071096320, "step": 7765 }, { "epoch": 0.3768163384072599, "grad_norm": 0.2578125, "learning_rate": 3.676623990331607e-05, "loss": 2.7751, "num_input_tokens_seen": 4073717760, "step": 7770 }, { "epoch": 0.37705881996350654, "grad_norm": 0.251953125, "learning_rate": 3.674854832115079e-05, "loss": 2.7868, "num_input_tokens_seen": 4076339200, "step": 7775 }, { "epoch": 0.37730130151975316, "grad_norm": 0.248046875, "learning_rate": 3.673084918459544e-05, "loss": 2.7887, "num_input_tokens_seen": 4078960640, "step": 7780 }, { "epoch": 0.3775437830759998, "grad_norm": 0.265625, "learning_rate": 3.671314250503068e-05, "loss": 2.7777, "num_input_tokens_seen": 4081582080, "step": 7785 }, { "epoch": 0.3777862646322464, "grad_norm": 0.251953125, "learning_rate": 3.669542829384201e-05, "loss": 2.7748, "num_input_tokens_seen": 4084203520, "step": 7790 }, { "epoch": 0.378028746188493, "grad_norm": 0.2578125, "learning_rate": 3.6677706562419786e-05, "loss": 2.7666, "num_input_tokens_seen": 4086824960, "step": 7795 }, { "epoch": 0.37827122774473965, "grad_norm": 0.2578125, "learning_rate": 3.6659977322159185e-05, "loss": 2.7655, "num_input_tokens_seen": 4089446400, "step": 7800 }, { "epoch": 0.37827122774473965, "eval_accuracy": 0.4553134668620746, "eval_loss": 2.7454605102539062, "eval_runtime": 6.4652, "eval_samples_per_second": 46.402, "eval_steps_per_second": 5.878, "num_input_tokens_seen": 4089446400, "step": 7800 }, { "epoch": 0.37851370930098627, "grad_norm": 0.25, "learning_rate": 3.664224058446022e-05, "loss": 2.7846, "num_input_tokens_seen": 4092067840, "step": 7805 }, { "epoch": 0.37875619085723294, "grad_norm": 0.26953125, "learning_rate": 3.662449636072772e-05, "loss": 2.7803, "num_input_tokens_seen": 4094689280, "step": 7810 }, { "epoch": 0.37899867241347956, "grad_norm": 0.25390625, "learning_rate": 3.660674466237134e-05, "loss": 2.7708, "num_input_tokens_seen": 4097310720, "step": 7815 }, { "epoch": 0.3792411539697262, "grad_norm": 0.251953125, "learning_rate": 3.658898550080554e-05, "loss": 2.7793, "num_input_tokens_seen": 4099932160, "step": 7820 }, { "epoch": 0.3794836355259728, "grad_norm": 0.2578125, "learning_rate": 3.657121888744955e-05, "loss": 2.7767, "num_input_tokens_seen": 4102553600, "step": 7825 }, { "epoch": 0.3797261170822194, "grad_norm": 0.265625, "learning_rate": 3.655344483372743e-05, "loss": 2.7808, "num_input_tokens_seen": 4105175040, "step": 7830 }, { "epoch": 0.37996859863846605, "grad_norm": 0.25, "learning_rate": 3.6535663351068006e-05, "loss": 2.7628, "num_input_tokens_seen": 4107796480, "step": 7835 }, { "epoch": 0.38021108019471267, "grad_norm": 0.251953125, "learning_rate": 3.6517874450904885e-05, "loss": 2.7667, "num_input_tokens_seen": 4110417920, "step": 7840 }, { "epoch": 0.3804535617509593, "grad_norm": 0.248046875, "learning_rate": 3.6500078144676425e-05, "loss": 2.7799, "num_input_tokens_seen": 4113039360, "step": 7845 }, { "epoch": 0.38069604330720597, "grad_norm": 0.251953125, "learning_rate": 3.648227444382578e-05, "loss": 2.7923, "num_input_tokens_seen": 4115660800, "step": 7850 }, { "epoch": 0.3809385248634526, "grad_norm": 0.25, "learning_rate": 3.6464463359800834e-05, "loss": 2.7731, "num_input_tokens_seen": 4118282240, "step": 7855 }, { "epoch": 0.3811810064196992, "grad_norm": 0.248046875, "learning_rate": 3.644664490405422e-05, "loss": 2.787, "num_input_tokens_seen": 4120903680, "step": 7860 }, { "epoch": 0.38142348797594583, "grad_norm": 0.251953125, "learning_rate": 3.642881908804334e-05, "loss": 2.7851, "num_input_tokens_seen": 4123525120, "step": 7865 }, { "epoch": 0.38166596953219245, "grad_norm": 0.251953125, "learning_rate": 3.641098592323027e-05, "loss": 2.7845, "num_input_tokens_seen": 4126146560, "step": 7870 }, { "epoch": 0.38190845108843907, "grad_norm": 0.251953125, "learning_rate": 3.639314542108187e-05, "loss": 2.7905, "num_input_tokens_seen": 4128768000, "step": 7875 }, { "epoch": 0.3821509326446857, "grad_norm": 0.248046875, "learning_rate": 3.637529759306969e-05, "loss": 2.7899, "num_input_tokens_seen": 4131389440, "step": 7880 }, { "epoch": 0.38239341420093237, "grad_norm": 0.25390625, "learning_rate": 3.635744245066999e-05, "loss": 2.7767, "num_input_tokens_seen": 4134010880, "step": 7885 }, { "epoch": 0.382635895757179, "grad_norm": 0.259765625, "learning_rate": 3.633958000536375e-05, "loss": 2.7913, "num_input_tokens_seen": 4136632320, "step": 7890 }, { "epoch": 0.3828783773134256, "grad_norm": 0.25, "learning_rate": 3.6321710268636623e-05, "loss": 2.7836, "num_input_tokens_seen": 4139253760, "step": 7895 }, { "epoch": 0.38312085886967223, "grad_norm": 0.2470703125, "learning_rate": 3.6303833251978966e-05, "loss": 2.7747, "num_input_tokens_seen": 4141875200, "step": 7900 }, { "epoch": 0.38336334042591885, "grad_norm": 0.255859375, "learning_rate": 3.62859489668858e-05, "loss": 2.7795, "num_input_tokens_seen": 4144496640, "step": 7905 }, { "epoch": 0.3836058219821655, "grad_norm": 0.26953125, "learning_rate": 3.626805742485686e-05, "loss": 2.7651, "num_input_tokens_seen": 4147118080, "step": 7910 }, { "epoch": 0.3838483035384121, "grad_norm": 0.259765625, "learning_rate": 3.625015863739649e-05, "loss": 2.7785, "num_input_tokens_seen": 4149739520, "step": 7915 }, { "epoch": 0.3840907850946587, "grad_norm": 0.255859375, "learning_rate": 3.623225261601375e-05, "loss": 2.7778, "num_input_tokens_seen": 4152360960, "step": 7920 }, { "epoch": 0.3843332666509054, "grad_norm": 0.251953125, "learning_rate": 3.62143393722223e-05, "loss": 2.8004, "num_input_tokens_seen": 4154982400, "step": 7925 }, { "epoch": 0.384575748207152, "grad_norm": 0.265625, "learning_rate": 3.619641891754048e-05, "loss": 2.7604, "num_input_tokens_seen": 4157603840, "step": 7930 }, { "epoch": 0.38481822976339863, "grad_norm": 0.263671875, "learning_rate": 3.6178491263491236e-05, "loss": 2.7872, "num_input_tokens_seen": 4160225280, "step": 7935 }, { "epoch": 0.38506071131964525, "grad_norm": 0.2412109375, "learning_rate": 3.61605564216022e-05, "loss": 2.7679, "num_input_tokens_seen": 4162846720, "step": 7940 }, { "epoch": 0.3853031928758919, "grad_norm": 0.259765625, "learning_rate": 3.6142614403405553e-05, "loss": 2.7745, "num_input_tokens_seen": 4165468160, "step": 7945 }, { "epoch": 0.3855456744321385, "grad_norm": 0.251953125, "learning_rate": 3.612466522043813e-05, "loss": 2.7776, "num_input_tokens_seen": 4168089600, "step": 7950 }, { "epoch": 0.3857881559883851, "grad_norm": 0.25, "learning_rate": 3.610670888424139e-05, "loss": 2.7669, "num_input_tokens_seen": 4170711040, "step": 7955 }, { "epoch": 0.38603063754463174, "grad_norm": 0.25, "learning_rate": 3.608874540636134e-05, "loss": 2.7912, "num_input_tokens_seen": 4173332480, "step": 7960 }, { "epoch": 0.3862731191008784, "grad_norm": 0.2490234375, "learning_rate": 3.607077479834863e-05, "loss": 2.7734, "num_input_tokens_seen": 4175953920, "step": 7965 }, { "epoch": 0.38651560065712504, "grad_norm": 0.26171875, "learning_rate": 3.605279707175846e-05, "loss": 2.7793, "num_input_tokens_seen": 4178575360, "step": 7970 }, { "epoch": 0.38675808221337166, "grad_norm": 0.24609375, "learning_rate": 3.603481223815064e-05, "loss": 2.7734, "num_input_tokens_seen": 4181196800, "step": 7975 }, { "epoch": 0.3870005637696183, "grad_norm": 0.26171875, "learning_rate": 3.6016820309089504e-05, "loss": 2.7832, "num_input_tokens_seen": 4183818240, "step": 7980 }, { "epoch": 0.3872430453258649, "grad_norm": 0.248046875, "learning_rate": 3.599882129614399e-05, "loss": 2.7774, "num_input_tokens_seen": 4186439680, "step": 7985 }, { "epoch": 0.3874855268821115, "grad_norm": 0.25, "learning_rate": 3.598081521088758e-05, "loss": 2.7679, "num_input_tokens_seen": 4189061120, "step": 7990 }, { "epoch": 0.38772800843835814, "grad_norm": 0.2431640625, "learning_rate": 3.5962802064898295e-05, "loss": 2.7803, "num_input_tokens_seen": 4191682560, "step": 7995 }, { "epoch": 0.38797048999460476, "grad_norm": 0.25, "learning_rate": 3.5944781869758695e-05, "loss": 2.7687, "num_input_tokens_seen": 4194304000, "step": 8000 }, { "epoch": 0.38821297155085144, "grad_norm": 0.251953125, "learning_rate": 3.592675463705588e-05, "loss": 2.7724, "num_input_tokens_seen": 4196925440, "step": 8005 }, { "epoch": 0.38845545310709806, "grad_norm": 0.265625, "learning_rate": 3.5908720378381475e-05, "loss": 2.7742, "num_input_tokens_seen": 4199546880, "step": 8010 }, { "epoch": 0.3886979346633447, "grad_norm": 0.25390625, "learning_rate": 3.5890679105331624e-05, "loss": 2.7902, "num_input_tokens_seen": 4202168320, "step": 8015 }, { "epoch": 0.3889404162195913, "grad_norm": 0.2578125, "learning_rate": 3.587263082950698e-05, "loss": 2.7769, "num_input_tokens_seen": 4204789760, "step": 8020 }, { "epoch": 0.3891828977758379, "grad_norm": 0.255859375, "learning_rate": 3.5854575562512695e-05, "loss": 2.7873, "num_input_tokens_seen": 4207411200, "step": 8025 }, { "epoch": 0.38942537933208454, "grad_norm": 0.25390625, "learning_rate": 3.583651331595841e-05, "loss": 2.776, "num_input_tokens_seen": 4210032640, "step": 8030 }, { "epoch": 0.38966786088833116, "grad_norm": 0.255859375, "learning_rate": 3.581844410145827e-05, "loss": 2.7833, "num_input_tokens_seen": 4212654080, "step": 8035 }, { "epoch": 0.3899103424445778, "grad_norm": 0.2431640625, "learning_rate": 3.5800367930630905e-05, "loss": 2.7792, "num_input_tokens_seen": 4215275520, "step": 8040 }, { "epoch": 0.39015282400082446, "grad_norm": 0.251953125, "learning_rate": 3.578228481509938e-05, "loss": 2.7588, "num_input_tokens_seen": 4217896960, "step": 8045 }, { "epoch": 0.3903953055570711, "grad_norm": 0.2431640625, "learning_rate": 3.5764194766491263e-05, "loss": 2.7772, "num_input_tokens_seen": 4220518400, "step": 8050 }, { "epoch": 0.3906377871133177, "grad_norm": 0.263671875, "learning_rate": 3.574609779643858e-05, "loss": 2.7874, "num_input_tokens_seen": 4223139840, "step": 8055 }, { "epoch": 0.3908802686695643, "grad_norm": 0.2490234375, "learning_rate": 3.572799391657778e-05, "loss": 2.7831, "num_input_tokens_seen": 4225761280, "step": 8060 }, { "epoch": 0.39112275022581094, "grad_norm": 0.2578125, "learning_rate": 3.570988313854979e-05, "loss": 2.7774, "num_input_tokens_seen": 4228382720, "step": 8065 }, { "epoch": 0.39136523178205757, "grad_norm": 0.2578125, "learning_rate": 3.569176547399993e-05, "loss": 2.7759, "num_input_tokens_seen": 4231004160, "step": 8070 }, { "epoch": 0.3916077133383042, "grad_norm": 0.25, "learning_rate": 3.5673640934577976e-05, "loss": 2.7794, "num_input_tokens_seen": 4233625600, "step": 8075 }, { "epoch": 0.3918501948945508, "grad_norm": 0.25390625, "learning_rate": 3.565550953193814e-05, "loss": 2.7808, "num_input_tokens_seen": 4236247040, "step": 8080 }, { "epoch": 0.3920926764507975, "grad_norm": 0.2490234375, "learning_rate": 3.5637371277739006e-05, "loss": 2.7834, "num_input_tokens_seen": 4238868480, "step": 8085 }, { "epoch": 0.3923351580070441, "grad_norm": 0.251953125, "learning_rate": 3.56192261836436e-05, "loss": 2.7626, "num_input_tokens_seen": 4241489920, "step": 8090 }, { "epoch": 0.3925776395632907, "grad_norm": 0.2451171875, "learning_rate": 3.560107426131932e-05, "loss": 2.776, "num_input_tokens_seen": 4244111360, "step": 8095 }, { "epoch": 0.39282012111953735, "grad_norm": 0.251953125, "learning_rate": 3.558291552243798e-05, "loss": 2.7943, "num_input_tokens_seen": 4246732800, "step": 8100 }, { "epoch": 0.39282012111953735, "eval_accuracy": 0.45537208923628075, "eval_loss": 2.7449140548706055, "eval_runtime": 5.8144, "eval_samples_per_second": 51.596, "eval_steps_per_second": 6.536, "num_input_tokens_seen": 4246732800, "step": 8100 }, { "epoch": 0.39306260267578397, "grad_norm": 0.2421875, "learning_rate": 3.5564749978675734e-05, "loss": 2.762, "num_input_tokens_seen": 4249354240, "step": 8105 }, { "epoch": 0.3933050842320306, "grad_norm": 0.2578125, "learning_rate": 3.554657764171317e-05, "loss": 2.7846, "num_input_tokens_seen": 4251975680, "step": 8110 }, { "epoch": 0.3935475657882772, "grad_norm": 0.251953125, "learning_rate": 3.5528398523235194e-05, "loss": 2.7749, "num_input_tokens_seen": 4254597120, "step": 8115 }, { "epoch": 0.39379004734452383, "grad_norm": 0.255859375, "learning_rate": 3.551021263493111e-05, "loss": 2.7709, "num_input_tokens_seen": 4257218560, "step": 8120 }, { "epoch": 0.3940325289007705, "grad_norm": 0.25390625, "learning_rate": 3.5492019988494555e-05, "loss": 2.7745, "num_input_tokens_seen": 4259840000, "step": 8125 }, { "epoch": 0.3942750104570171, "grad_norm": 0.251953125, "learning_rate": 3.54738205956235e-05, "loss": 2.7728, "num_input_tokens_seen": 4262461440, "step": 8130 }, { "epoch": 0.39451749201326375, "grad_norm": 0.259765625, "learning_rate": 3.545561446802028e-05, "loss": 2.7631, "num_input_tokens_seen": 4265082880, "step": 8135 }, { "epoch": 0.39475997356951037, "grad_norm": 0.25, "learning_rate": 3.5437401617391544e-05, "loss": 2.7748, "num_input_tokens_seen": 4267704320, "step": 8140 }, { "epoch": 0.395002455125757, "grad_norm": 0.26171875, "learning_rate": 3.5419182055448287e-05, "loss": 2.7731, "num_input_tokens_seen": 4270325760, "step": 8145 }, { "epoch": 0.3952449366820036, "grad_norm": 0.25, "learning_rate": 3.540095579390577e-05, "loss": 2.7674, "num_input_tokens_seen": 4272947200, "step": 8150 }, { "epoch": 0.39548741823825023, "grad_norm": 0.2451171875, "learning_rate": 3.538272284448362e-05, "loss": 2.7688, "num_input_tokens_seen": 4275568640, "step": 8155 }, { "epoch": 0.3957298997944969, "grad_norm": 0.2470703125, "learning_rate": 3.5364483218905714e-05, "loss": 2.7853, "num_input_tokens_seen": 4278190080, "step": 8160 }, { "epoch": 0.39597238135074353, "grad_norm": 0.2578125, "learning_rate": 3.534623692890027e-05, "loss": 2.7762, "num_input_tokens_seen": 4280811520, "step": 8165 }, { "epoch": 0.39621486290699015, "grad_norm": 0.251953125, "learning_rate": 3.532798398619975e-05, "loss": 2.7617, "num_input_tokens_seen": 4283432960, "step": 8170 }, { "epoch": 0.39645734446323677, "grad_norm": 0.25, "learning_rate": 3.530972440254092e-05, "loss": 2.7788, "num_input_tokens_seen": 4286054400, "step": 8175 }, { "epoch": 0.3966998260194834, "grad_norm": 0.26171875, "learning_rate": 3.5291458189664796e-05, "loss": 2.7804, "num_input_tokens_seen": 4288675840, "step": 8180 }, { "epoch": 0.39694230757573, "grad_norm": 0.26171875, "learning_rate": 3.527318535931667e-05, "loss": 2.8013, "num_input_tokens_seen": 4291297280, "step": 8185 }, { "epoch": 0.39718478913197663, "grad_norm": 0.248046875, "learning_rate": 3.525490592324609e-05, "loss": 2.7848, "num_input_tokens_seen": 4293918720, "step": 8190 }, { "epoch": 0.39742727068822326, "grad_norm": 0.2470703125, "learning_rate": 3.5236619893206854e-05, "loss": 2.8064, "num_input_tokens_seen": 4296540160, "step": 8195 }, { "epoch": 0.39766975224446993, "grad_norm": 0.2412109375, "learning_rate": 3.5218327280956975e-05, "loss": 2.7625, "num_input_tokens_seen": 4299161600, "step": 8200 }, { "epoch": 0.39791223380071655, "grad_norm": 0.255859375, "learning_rate": 3.520002809825874e-05, "loss": 2.7813, "num_input_tokens_seen": 4301783040, "step": 8205 }, { "epoch": 0.3981547153569632, "grad_norm": 0.2578125, "learning_rate": 3.518172235687862e-05, "loss": 2.7884, "num_input_tokens_seen": 4304404480, "step": 8210 }, { "epoch": 0.3983971969132098, "grad_norm": 0.259765625, "learning_rate": 3.516341006858733e-05, "loss": 2.7766, "num_input_tokens_seen": 4307025920, "step": 8215 }, { "epoch": 0.3986396784694564, "grad_norm": 0.248046875, "learning_rate": 3.514509124515979e-05, "loss": 2.7809, "num_input_tokens_seen": 4309647360, "step": 8220 }, { "epoch": 0.39888216002570304, "grad_norm": 0.248046875, "learning_rate": 3.5126765898375105e-05, "loss": 2.7709, "num_input_tokens_seen": 4312268800, "step": 8225 }, { "epoch": 0.39912464158194966, "grad_norm": 0.251953125, "learning_rate": 3.510843404001659e-05, "loss": 2.7879, "num_input_tokens_seen": 4314890240, "step": 8230 }, { "epoch": 0.3993671231381963, "grad_norm": 0.259765625, "learning_rate": 3.509009568187176e-05, "loss": 2.7797, "num_input_tokens_seen": 4317511680, "step": 8235 }, { "epoch": 0.39960960469444295, "grad_norm": 0.251953125, "learning_rate": 3.5071750835732276e-05, "loss": 2.7759, "num_input_tokens_seen": 4320133120, "step": 8240 }, { "epoch": 0.3998520862506896, "grad_norm": 0.2578125, "learning_rate": 3.505339951339399e-05, "loss": 2.7877, "num_input_tokens_seen": 4322754560, "step": 8245 }, { "epoch": 0.4000945678069362, "grad_norm": 0.26171875, "learning_rate": 3.503504172665694e-05, "loss": 2.7775, "num_input_tokens_seen": 4325376000, "step": 8250 }, { "epoch": 0.4003370493631828, "grad_norm": 0.24609375, "learning_rate": 3.5016677487325265e-05, "loss": 2.7786, "num_input_tokens_seen": 4327997440, "step": 8255 }, { "epoch": 0.40057953091942944, "grad_norm": 0.255859375, "learning_rate": 3.499830680720731e-05, "loss": 2.7727, "num_input_tokens_seen": 4330618880, "step": 8260 }, { "epoch": 0.40082201247567606, "grad_norm": 0.251953125, "learning_rate": 3.497992969811553e-05, "loss": 2.7847, "num_input_tokens_seen": 4333240320, "step": 8265 }, { "epoch": 0.4010644940319227, "grad_norm": 0.25390625, "learning_rate": 3.496154617186651e-05, "loss": 2.7776, "num_input_tokens_seen": 4335861760, "step": 8270 }, { "epoch": 0.4013069755881693, "grad_norm": 0.26171875, "learning_rate": 3.494315624028098e-05, "loss": 2.761, "num_input_tokens_seen": 4338483200, "step": 8275 }, { "epoch": 0.401549457144416, "grad_norm": 0.25390625, "learning_rate": 3.49247599151838e-05, "loss": 2.7779, "num_input_tokens_seen": 4341104640, "step": 8280 }, { "epoch": 0.4017919387006626, "grad_norm": 0.259765625, "learning_rate": 3.4906357208403896e-05, "loss": 2.7821, "num_input_tokens_seen": 4343726080, "step": 8285 }, { "epoch": 0.4020344202569092, "grad_norm": 0.248046875, "learning_rate": 3.488794813177433e-05, "loss": 2.7739, "num_input_tokens_seen": 4346347520, "step": 8290 }, { "epoch": 0.40227690181315584, "grad_norm": 0.2490234375, "learning_rate": 3.486953269713226e-05, "loss": 2.78, "num_input_tokens_seen": 4348968960, "step": 8295 }, { "epoch": 0.40251938336940246, "grad_norm": 0.2490234375, "learning_rate": 3.4851110916318924e-05, "loss": 2.7706, "num_input_tokens_seen": 4351590400, "step": 8300 }, { "epoch": 0.4027618649256491, "grad_norm": 0.248046875, "learning_rate": 3.483268280117964e-05, "loss": 2.7744, "num_input_tokens_seen": 4354211840, "step": 8305 }, { "epoch": 0.4030043464818957, "grad_norm": 0.255859375, "learning_rate": 3.4814248363563794e-05, "loss": 2.7724, "num_input_tokens_seen": 4356833280, "step": 8310 }, { "epoch": 0.4032468280381423, "grad_norm": 0.263671875, "learning_rate": 3.4795807615324864e-05, "loss": 2.7788, "num_input_tokens_seen": 4359454720, "step": 8315 }, { "epoch": 0.403489309594389, "grad_norm": 0.265625, "learning_rate": 3.477736056832035e-05, "loss": 2.7863, "num_input_tokens_seen": 4362076160, "step": 8320 }, { "epoch": 0.4037317911506356, "grad_norm": 0.263671875, "learning_rate": 3.4758907234411824e-05, "loss": 2.7689, "num_input_tokens_seen": 4364697600, "step": 8325 }, { "epoch": 0.40397427270688224, "grad_norm": 0.255859375, "learning_rate": 3.474044762546489e-05, "loss": 2.7864, "num_input_tokens_seen": 4367319040, "step": 8330 }, { "epoch": 0.40421675426312886, "grad_norm": 0.244140625, "learning_rate": 3.4721981753349205e-05, "loss": 2.7751, "num_input_tokens_seen": 4369940480, "step": 8335 }, { "epoch": 0.4044592358193755, "grad_norm": 0.248046875, "learning_rate": 3.4703509629938425e-05, "loss": 2.7779, "num_input_tokens_seen": 4372561920, "step": 8340 }, { "epoch": 0.4047017173756221, "grad_norm": 0.2451171875, "learning_rate": 3.468503126711025e-05, "loss": 2.7835, "num_input_tokens_seen": 4375183360, "step": 8345 }, { "epoch": 0.4049441989318687, "grad_norm": 0.25, "learning_rate": 3.466654667674638e-05, "loss": 2.7711, "num_input_tokens_seen": 4377804800, "step": 8350 }, { "epoch": 0.40518668048811535, "grad_norm": 0.263671875, "learning_rate": 3.4648055870732524e-05, "loss": 2.7625, "num_input_tokens_seen": 4380426240, "step": 8355 }, { "epoch": 0.405429162044362, "grad_norm": 0.25, "learning_rate": 3.462955886095839e-05, "loss": 2.7825, "num_input_tokens_seen": 4383047680, "step": 8360 }, { "epoch": 0.40567164360060864, "grad_norm": 0.26171875, "learning_rate": 3.461105565931766e-05, "loss": 2.7853, "num_input_tokens_seen": 4385669120, "step": 8365 }, { "epoch": 0.40591412515685527, "grad_norm": 0.25390625, "learning_rate": 3.4592546277708016e-05, "loss": 2.7774, "num_input_tokens_seen": 4388290560, "step": 8370 }, { "epoch": 0.4061566067131019, "grad_norm": 0.244140625, "learning_rate": 3.4574030728031107e-05, "loss": 2.7627, "num_input_tokens_seen": 4390912000, "step": 8375 }, { "epoch": 0.4063990882693485, "grad_norm": 0.255859375, "learning_rate": 3.455550902219254e-05, "loss": 2.7861, "num_input_tokens_seen": 4393533440, "step": 8380 }, { "epoch": 0.40664156982559513, "grad_norm": 0.26171875, "learning_rate": 3.4536981172101895e-05, "loss": 2.7708, "num_input_tokens_seen": 4396154880, "step": 8385 }, { "epoch": 0.40688405138184175, "grad_norm": 0.255859375, "learning_rate": 3.451844718967269e-05, "loss": 2.7833, "num_input_tokens_seen": 4398776320, "step": 8390 }, { "epoch": 0.40712653293808837, "grad_norm": 0.251953125, "learning_rate": 3.44999070868224e-05, "loss": 2.764, "num_input_tokens_seen": 4401397760, "step": 8395 }, { "epoch": 0.40736901449433505, "grad_norm": 0.2490234375, "learning_rate": 3.448136087547242e-05, "loss": 2.7715, "num_input_tokens_seen": 4404019200, "step": 8400 }, { "epoch": 0.40736901449433505, "eval_accuracy": 0.4552369320957499, "eval_loss": 2.744718313217163, "eval_runtime": 5.8034, "eval_samples_per_second": 51.694, "eval_steps_per_second": 6.548, "num_input_tokens_seen": 4404019200, "step": 8400 }, { "epoch": 0.40761149605058167, "grad_norm": 0.263671875, "learning_rate": 3.4462808567548084e-05, "loss": 2.7877, "num_input_tokens_seen": 4406640640, "step": 8405 }, { "epoch": 0.4078539776068283, "grad_norm": 0.2578125, "learning_rate": 3.444425017497864e-05, "loss": 2.7712, "num_input_tokens_seen": 4409262080, "step": 8410 }, { "epoch": 0.4080964591630749, "grad_norm": 0.2451171875, "learning_rate": 3.442568570969724e-05, "loss": 2.771, "num_input_tokens_seen": 4411883520, "step": 8415 }, { "epoch": 0.40833894071932153, "grad_norm": 0.25390625, "learning_rate": 3.440711518364097e-05, "loss": 2.767, "num_input_tokens_seen": 4414504960, "step": 8420 }, { "epoch": 0.40858142227556815, "grad_norm": 0.255859375, "learning_rate": 3.4388538608750784e-05, "loss": 2.7818, "num_input_tokens_seen": 4417126400, "step": 8425 }, { "epoch": 0.40882390383181477, "grad_norm": 0.25, "learning_rate": 3.4369955996971536e-05, "loss": 2.7995, "num_input_tokens_seen": 4419747840, "step": 8430 }, { "epoch": 0.4090663853880614, "grad_norm": 0.2578125, "learning_rate": 3.435136736025198e-05, "loss": 2.78, "num_input_tokens_seen": 4422369280, "step": 8435 }, { "epoch": 0.40930886694430807, "grad_norm": 0.2451171875, "learning_rate": 3.433277271054469e-05, "loss": 2.773, "num_input_tokens_seen": 4424990720, "step": 8440 }, { "epoch": 0.4095513485005547, "grad_norm": 0.2490234375, "learning_rate": 3.431417205980616e-05, "loss": 2.7688, "num_input_tokens_seen": 4427612160, "step": 8445 }, { "epoch": 0.4097938300568013, "grad_norm": 0.26171875, "learning_rate": 3.4295565419996735e-05, "loss": 2.7796, "num_input_tokens_seen": 4430233600, "step": 8450 }, { "epoch": 0.41003631161304793, "grad_norm": 0.2578125, "learning_rate": 3.42769528030806e-05, "loss": 2.7825, "num_input_tokens_seen": 4432855040, "step": 8455 }, { "epoch": 0.41027879316929455, "grad_norm": 0.251953125, "learning_rate": 3.425833422102576e-05, "loss": 2.7858, "num_input_tokens_seen": 4435476480, "step": 8460 }, { "epoch": 0.4105212747255412, "grad_norm": 0.255859375, "learning_rate": 3.42397096858041e-05, "loss": 2.7787, "num_input_tokens_seen": 4438097920, "step": 8465 }, { "epoch": 0.4107637562817878, "grad_norm": 0.25390625, "learning_rate": 3.4221079209391314e-05, "loss": 2.7618, "num_input_tokens_seen": 4440719360, "step": 8470 }, { "epoch": 0.41100623783803447, "grad_norm": 0.2578125, "learning_rate": 3.420244280376691e-05, "loss": 2.7701, "num_input_tokens_seen": 4443340800, "step": 8475 }, { "epoch": 0.4112487193942811, "grad_norm": 0.26171875, "learning_rate": 3.418380048091421e-05, "loss": 2.7797, "num_input_tokens_seen": 4445962240, "step": 8480 }, { "epoch": 0.4114912009505277, "grad_norm": 0.255859375, "learning_rate": 3.4165152252820346e-05, "loss": 2.7982, "num_input_tokens_seen": 4448583680, "step": 8485 }, { "epoch": 0.41173368250677433, "grad_norm": 0.255859375, "learning_rate": 3.414649813147625e-05, "loss": 2.786, "num_input_tokens_seen": 4451205120, "step": 8490 }, { "epoch": 0.41197616406302096, "grad_norm": 0.259765625, "learning_rate": 3.412783812887663e-05, "loss": 2.7949, "num_input_tokens_seen": 4453826560, "step": 8495 }, { "epoch": 0.4122186456192676, "grad_norm": 0.255859375, "learning_rate": 3.410917225701999e-05, "loss": 2.775, "num_input_tokens_seen": 4456448000, "step": 8500 }, { "epoch": 0.4124611271755142, "grad_norm": 0.248046875, "learning_rate": 3.4090500527908604e-05, "loss": 2.7739, "num_input_tokens_seen": 4459069440, "step": 8505 }, { "epoch": 0.4127036087317608, "grad_norm": 0.255859375, "learning_rate": 3.407182295354851e-05, "loss": 2.7748, "num_input_tokens_seen": 4461690880, "step": 8510 }, { "epoch": 0.4129460902880075, "grad_norm": 0.2470703125, "learning_rate": 3.4053139545949503e-05, "loss": 2.7677, "num_input_tokens_seen": 4464312320, "step": 8515 }, { "epoch": 0.4131885718442541, "grad_norm": 0.240234375, "learning_rate": 3.4034450317125135e-05, "loss": 2.7693, "num_input_tokens_seen": 4466933760, "step": 8520 }, { "epoch": 0.41343105340050074, "grad_norm": 0.26171875, "learning_rate": 3.4015755279092685e-05, "loss": 2.7851, "num_input_tokens_seen": 4469555200, "step": 8525 }, { "epoch": 0.41367353495674736, "grad_norm": 0.26171875, "learning_rate": 3.399705444387319e-05, "loss": 2.7801, "num_input_tokens_seen": 4472176640, "step": 8530 }, { "epoch": 0.413916016512994, "grad_norm": 0.2470703125, "learning_rate": 3.39783478234914e-05, "loss": 2.7755, "num_input_tokens_seen": 4474798080, "step": 8535 }, { "epoch": 0.4141584980692406, "grad_norm": 0.255859375, "learning_rate": 3.39596354299758e-05, "loss": 2.7839, "num_input_tokens_seen": 4477419520, "step": 8540 }, { "epoch": 0.4144009796254872, "grad_norm": 0.259765625, "learning_rate": 3.3940917275358565e-05, "loss": 2.7853, "num_input_tokens_seen": 4480040960, "step": 8545 }, { "epoch": 0.41464346118173384, "grad_norm": 0.251953125, "learning_rate": 3.392219337167559e-05, "loss": 2.7964, "num_input_tokens_seen": 4482662400, "step": 8550 }, { "epoch": 0.4148859427379805, "grad_norm": 0.259765625, "learning_rate": 3.390346373096645e-05, "loss": 2.7604, "num_input_tokens_seen": 4485283840, "step": 8555 }, { "epoch": 0.41512842429422714, "grad_norm": 0.26953125, "learning_rate": 3.3884728365274435e-05, "loss": 2.779, "num_input_tokens_seen": 4487905280, "step": 8560 }, { "epoch": 0.41537090585047376, "grad_norm": 0.26171875, "learning_rate": 3.38659872866465e-05, "loss": 2.7849, "num_input_tokens_seen": 4490526720, "step": 8565 }, { "epoch": 0.4156133874067204, "grad_norm": 0.255859375, "learning_rate": 3.384724050713327e-05, "loss": 2.7886, "num_input_tokens_seen": 4493148160, "step": 8570 }, { "epoch": 0.415855868962967, "grad_norm": 0.251953125, "learning_rate": 3.382848803878905e-05, "loss": 2.777, "num_input_tokens_seen": 4495769600, "step": 8575 }, { "epoch": 0.4160983505192136, "grad_norm": 0.251953125, "learning_rate": 3.3809729893671796e-05, "loss": 2.7784, "num_input_tokens_seen": 4498391040, "step": 8580 }, { "epoch": 0.41634083207546024, "grad_norm": 0.2578125, "learning_rate": 3.379096608384309e-05, "loss": 2.7808, "num_input_tokens_seen": 4501012480, "step": 8585 }, { "epoch": 0.41658331363170686, "grad_norm": 0.25390625, "learning_rate": 3.3772196621368216e-05, "loss": 2.7823, "num_input_tokens_seen": 4503633920, "step": 8590 }, { "epoch": 0.41682579518795354, "grad_norm": 0.25, "learning_rate": 3.375342151831603e-05, "loss": 2.7768, "num_input_tokens_seen": 4506255360, "step": 8595 }, { "epoch": 0.41706827674420016, "grad_norm": 0.25390625, "learning_rate": 3.3734640786759035e-05, "loss": 2.7818, "num_input_tokens_seen": 4508876800, "step": 8600 }, { "epoch": 0.4173107583004468, "grad_norm": 0.25390625, "learning_rate": 3.3715854438773374e-05, "loss": 2.782, "num_input_tokens_seen": 4511498240, "step": 8605 }, { "epoch": 0.4175532398566934, "grad_norm": 0.25390625, "learning_rate": 3.369706248643879e-05, "loss": 2.7812, "num_input_tokens_seen": 4514119680, "step": 8610 }, { "epoch": 0.41779572141294, "grad_norm": 0.2470703125, "learning_rate": 3.367826494183861e-05, "loss": 2.7964, "num_input_tokens_seen": 4516741120, "step": 8615 }, { "epoch": 0.41803820296918665, "grad_norm": 0.25, "learning_rate": 3.365946181705979e-05, "loss": 2.8021, "num_input_tokens_seen": 4519362560, "step": 8620 }, { "epoch": 0.41828068452543327, "grad_norm": 0.259765625, "learning_rate": 3.364065312419285e-05, "loss": 2.7659, "num_input_tokens_seen": 4521984000, "step": 8625 }, { "epoch": 0.4185231660816799, "grad_norm": 0.251953125, "learning_rate": 3.3621838875331886e-05, "loss": 2.769, "num_input_tokens_seen": 4524605440, "step": 8630 }, { "epoch": 0.41876564763792656, "grad_norm": 0.251953125, "learning_rate": 3.360301908257459e-05, "loss": 2.7614, "num_input_tokens_seen": 4527226880, "step": 8635 }, { "epoch": 0.4190081291941732, "grad_norm": 0.25, "learning_rate": 3.35841937580222e-05, "loss": 2.7745, "num_input_tokens_seen": 4529848320, "step": 8640 }, { "epoch": 0.4192506107504198, "grad_norm": 0.2470703125, "learning_rate": 3.356536291377953e-05, "loss": 2.7709, "num_input_tokens_seen": 4532469760, "step": 8645 }, { "epoch": 0.4194930923066664, "grad_norm": 0.251953125, "learning_rate": 3.3546526561954914e-05, "loss": 2.7621, "num_input_tokens_seen": 4535091200, "step": 8650 }, { "epoch": 0.41973557386291305, "grad_norm": 0.25390625, "learning_rate": 3.3527684714660255e-05, "loss": 2.7709, "num_input_tokens_seen": 4537712640, "step": 8655 }, { "epoch": 0.41997805541915967, "grad_norm": 0.251953125, "learning_rate": 3.350883738401098e-05, "loss": 2.7755, "num_input_tokens_seen": 4540334080, "step": 8660 }, { "epoch": 0.4202205369754063, "grad_norm": 0.244140625, "learning_rate": 3.348998458212603e-05, "loss": 2.7841, "num_input_tokens_seen": 4542955520, "step": 8665 }, { "epoch": 0.4204630185316529, "grad_norm": 0.25390625, "learning_rate": 3.347112632112788e-05, "loss": 2.7876, "num_input_tokens_seen": 4545576960, "step": 8670 }, { "epoch": 0.4207055000878996, "grad_norm": 0.263671875, "learning_rate": 3.345226261314251e-05, "loss": 2.7698, "num_input_tokens_seen": 4548198400, "step": 8675 }, { "epoch": 0.4209479816441462, "grad_norm": 0.2578125, "learning_rate": 3.34333934702994e-05, "loss": 2.7672, "num_input_tokens_seen": 4550819840, "step": 8680 }, { "epoch": 0.42119046320039283, "grad_norm": 0.271484375, "learning_rate": 3.3414518904731537e-05, "loss": 2.7668, "num_input_tokens_seen": 4553441280, "step": 8685 }, { "epoch": 0.42143294475663945, "grad_norm": 0.259765625, "learning_rate": 3.339563892857538e-05, "loss": 2.767, "num_input_tokens_seen": 4556062720, "step": 8690 }, { "epoch": 0.42167542631288607, "grad_norm": 0.2578125, "learning_rate": 3.3376753553970864e-05, "loss": 2.7825, "num_input_tokens_seen": 4558684160, "step": 8695 }, { "epoch": 0.4219179078691327, "grad_norm": 0.263671875, "learning_rate": 3.33578627930614e-05, "loss": 2.7828, "num_input_tokens_seen": 4561305600, "step": 8700 }, { "epoch": 0.4219179078691327, "eval_accuracy": 0.4554388536069044, "eval_loss": 2.744344472885132, "eval_runtime": 6.4252, "eval_samples_per_second": 46.691, "eval_steps_per_second": 5.914, "num_input_tokens_seen": 4561305600, "step": 8700 }, { "epoch": 0.4221603894253793, "grad_norm": 0.244140625, "learning_rate": 3.333896665799388e-05, "loss": 2.7885, "num_input_tokens_seen": 4563927040, "step": 8705 }, { "epoch": 0.42240287098162593, "grad_norm": 0.25390625, "learning_rate": 3.332006516091863e-05, "loss": 2.7741, "num_input_tokens_seen": 4566548480, "step": 8710 }, { "epoch": 0.4226453525378726, "grad_norm": 0.24609375, "learning_rate": 3.330115831398944e-05, "loss": 2.7702, "num_input_tokens_seen": 4569169920, "step": 8715 }, { "epoch": 0.42288783409411923, "grad_norm": 0.2490234375, "learning_rate": 3.328224612936351e-05, "loss": 2.764, "num_input_tokens_seen": 4571791360, "step": 8720 }, { "epoch": 0.42313031565036585, "grad_norm": 0.251953125, "learning_rate": 3.326332861920151e-05, "loss": 2.7803, "num_input_tokens_seen": 4574412800, "step": 8725 }, { "epoch": 0.4233727972066125, "grad_norm": 0.2470703125, "learning_rate": 3.324440579566751e-05, "loss": 2.7652, "num_input_tokens_seen": 4577034240, "step": 8730 }, { "epoch": 0.4236152787628591, "grad_norm": 0.259765625, "learning_rate": 3.3225477670929e-05, "loss": 2.7707, "num_input_tokens_seen": 4579655680, "step": 8735 }, { "epoch": 0.4238577603191057, "grad_norm": 0.251953125, "learning_rate": 3.32065442571569e-05, "loss": 2.7752, "num_input_tokens_seen": 4582277120, "step": 8740 }, { "epoch": 0.42410024187535234, "grad_norm": 0.251953125, "learning_rate": 3.318760556652551e-05, "loss": 2.7779, "num_input_tokens_seen": 4584898560, "step": 8745 }, { "epoch": 0.424342723431599, "grad_norm": 0.255859375, "learning_rate": 3.31686616112125e-05, "loss": 2.7869, "num_input_tokens_seen": 4587520000, "step": 8750 }, { "epoch": 0.42458520498784563, "grad_norm": 0.2470703125, "learning_rate": 3.314971240339898e-05, "loss": 2.7758, "num_input_tokens_seen": 4590141440, "step": 8755 }, { "epoch": 0.42482768654409225, "grad_norm": 0.248046875, "learning_rate": 3.31307579552694e-05, "loss": 2.7691, "num_input_tokens_seen": 4592762880, "step": 8760 }, { "epoch": 0.4250701681003389, "grad_norm": 0.25390625, "learning_rate": 3.3111798279011594e-05, "loss": 2.7729, "num_input_tokens_seen": 4595384320, "step": 8765 }, { "epoch": 0.4253126496565855, "grad_norm": 0.24609375, "learning_rate": 3.309283338681674e-05, "loss": 2.7736, "num_input_tokens_seen": 4598005760, "step": 8770 }, { "epoch": 0.4255551312128321, "grad_norm": 0.2578125, "learning_rate": 3.3073863290879395e-05, "loss": 2.7777, "num_input_tokens_seen": 4600627200, "step": 8775 }, { "epoch": 0.42579761276907874, "grad_norm": 0.255859375, "learning_rate": 3.305488800339744e-05, "loss": 2.7623, "num_input_tokens_seen": 4603248640, "step": 8780 }, { "epoch": 0.42604009432532536, "grad_norm": 0.255859375, "learning_rate": 3.303590753657211e-05, "loss": 2.7881, "num_input_tokens_seen": 4605870080, "step": 8785 }, { "epoch": 0.42628257588157203, "grad_norm": 0.263671875, "learning_rate": 3.3016921902607954e-05, "loss": 2.7774, "num_input_tokens_seen": 4608491520, "step": 8790 }, { "epoch": 0.42652505743781866, "grad_norm": 0.25390625, "learning_rate": 3.299793111371287e-05, "loss": 2.7848, "num_input_tokens_seen": 4611112960, "step": 8795 }, { "epoch": 0.4267675389940653, "grad_norm": 0.25390625, "learning_rate": 3.297893518209804e-05, "loss": 2.7778, "num_input_tokens_seen": 4613734400, "step": 8800 }, { "epoch": 0.4270100205503119, "grad_norm": 0.25, "learning_rate": 3.295993411997798e-05, "loss": 2.7885, "num_input_tokens_seen": 4616355840, "step": 8805 }, { "epoch": 0.4272525021065585, "grad_norm": 0.265625, "learning_rate": 3.294092793957047e-05, "loss": 2.7789, "num_input_tokens_seen": 4618977280, "step": 8810 }, { "epoch": 0.42749498366280514, "grad_norm": 0.2490234375, "learning_rate": 3.292191665309663e-05, "loss": 2.7705, "num_input_tokens_seen": 4621598720, "step": 8815 }, { "epoch": 0.42773746521905176, "grad_norm": 0.25390625, "learning_rate": 3.2902900272780814e-05, "loss": 2.7909, "num_input_tokens_seen": 4624220160, "step": 8820 }, { "epoch": 0.4279799467752984, "grad_norm": 0.251953125, "learning_rate": 3.2883878810850687e-05, "loss": 2.7787, "num_input_tokens_seen": 4626841600, "step": 8825 }, { "epoch": 0.42822242833154506, "grad_norm": 0.25390625, "learning_rate": 3.286485227953716e-05, "loss": 2.7682, "num_input_tokens_seen": 4629463040, "step": 8830 }, { "epoch": 0.4284649098877917, "grad_norm": 0.259765625, "learning_rate": 3.284582069107441e-05, "loss": 2.7724, "num_input_tokens_seen": 4632084480, "step": 8835 }, { "epoch": 0.4287073914440383, "grad_norm": 0.255859375, "learning_rate": 3.2826784057699876e-05, "loss": 2.7818, "num_input_tokens_seen": 4634705920, "step": 8840 }, { "epoch": 0.4289498730002849, "grad_norm": 0.251953125, "learning_rate": 3.2807742391654234e-05, "loss": 2.7841, "num_input_tokens_seen": 4637327360, "step": 8845 }, { "epoch": 0.42919235455653154, "grad_norm": 0.25390625, "learning_rate": 3.278869570518138e-05, "loss": 2.7849, "num_input_tokens_seen": 4639948800, "step": 8850 }, { "epoch": 0.42943483611277816, "grad_norm": 0.244140625, "learning_rate": 3.2769644010528476e-05, "loss": 2.7787, "num_input_tokens_seen": 4642570240, "step": 8855 }, { "epoch": 0.4296773176690248, "grad_norm": 0.24609375, "learning_rate": 3.275058731994586e-05, "loss": 2.7786, "num_input_tokens_seen": 4645191680, "step": 8860 }, { "epoch": 0.4299197992252714, "grad_norm": 0.25390625, "learning_rate": 3.273152564568711e-05, "loss": 2.7695, "num_input_tokens_seen": 4647813120, "step": 8865 }, { "epoch": 0.4301622807815181, "grad_norm": 0.259765625, "learning_rate": 3.2712459000008996e-05, "loss": 2.7708, "num_input_tokens_seen": 4650434560, "step": 8870 }, { "epoch": 0.4304047623377647, "grad_norm": 0.2578125, "learning_rate": 3.269338739517149e-05, "loss": 2.7691, "num_input_tokens_seen": 4653056000, "step": 8875 }, { "epoch": 0.4306472438940113, "grad_norm": 0.2451171875, "learning_rate": 3.2674310843437774e-05, "loss": 2.7763, "num_input_tokens_seen": 4655677440, "step": 8880 }, { "epoch": 0.43088972545025794, "grad_norm": 0.251953125, "learning_rate": 3.265522935707417e-05, "loss": 2.7744, "num_input_tokens_seen": 4658298880, "step": 8885 }, { "epoch": 0.43113220700650456, "grad_norm": 0.25390625, "learning_rate": 3.2636142948350196e-05, "loss": 2.782, "num_input_tokens_seen": 4660920320, "step": 8890 }, { "epoch": 0.4313746885627512, "grad_norm": 0.2578125, "learning_rate": 3.261705162953853e-05, "loss": 2.7692, "num_input_tokens_seen": 4663541760, "step": 8895 }, { "epoch": 0.4316171701189978, "grad_norm": 0.24609375, "learning_rate": 3.259795541291503e-05, "loss": 2.7821, "num_input_tokens_seen": 4666163200, "step": 8900 }, { "epoch": 0.4318596516752444, "grad_norm": 0.2578125, "learning_rate": 3.2578854310758656e-05, "loss": 2.7684, "num_input_tokens_seen": 4668784640, "step": 8905 }, { "epoch": 0.4321021332314911, "grad_norm": 0.25, "learning_rate": 3.255974833535154e-05, "loss": 2.783, "num_input_tokens_seen": 4671406080, "step": 8910 }, { "epoch": 0.4323446147877377, "grad_norm": 0.25, "learning_rate": 3.2540637498978963e-05, "loss": 2.7794, "num_input_tokens_seen": 4674027520, "step": 8915 }, { "epoch": 0.43258709634398435, "grad_norm": 0.255859375, "learning_rate": 3.25215218139293e-05, "loss": 2.7685, "num_input_tokens_seen": 4676648960, "step": 8920 }, { "epoch": 0.43282957790023097, "grad_norm": 0.2431640625, "learning_rate": 3.250240129249405e-05, "loss": 2.7674, "num_input_tokens_seen": 4679270400, "step": 8925 }, { "epoch": 0.4330720594564776, "grad_norm": 0.251953125, "learning_rate": 3.2483275946967825e-05, "loss": 2.7863, "num_input_tokens_seen": 4681891840, "step": 8930 }, { "epoch": 0.4333145410127242, "grad_norm": 0.2470703125, "learning_rate": 3.246414578964837e-05, "loss": 2.7796, "num_input_tokens_seen": 4684513280, "step": 8935 }, { "epoch": 0.43355702256897083, "grad_norm": 0.255859375, "learning_rate": 3.244501083283647e-05, "loss": 2.7703, "num_input_tokens_seen": 4687134720, "step": 8940 }, { "epoch": 0.43379950412521745, "grad_norm": 0.25, "learning_rate": 3.242587108883602e-05, "loss": 2.7785, "num_input_tokens_seen": 4689756160, "step": 8945 }, { "epoch": 0.4340419856814641, "grad_norm": 0.25, "learning_rate": 3.240672656995402e-05, "loss": 2.7784, "num_input_tokens_seen": 4692377600, "step": 8950 }, { "epoch": 0.43428446723771075, "grad_norm": 0.255859375, "learning_rate": 3.2387577288500484e-05, "loss": 2.7953, "num_input_tokens_seen": 4694999040, "step": 8955 }, { "epoch": 0.43452694879395737, "grad_norm": 0.24609375, "learning_rate": 3.236842325678854e-05, "loss": 2.7773, "num_input_tokens_seen": 4697620480, "step": 8960 }, { "epoch": 0.434769430350204, "grad_norm": 0.25, "learning_rate": 3.2349264487134354e-05, "loss": 2.7707, "num_input_tokens_seen": 4700241920, "step": 8965 }, { "epoch": 0.4350119119064506, "grad_norm": 0.267578125, "learning_rate": 3.233010099185711e-05, "loss": 2.7754, "num_input_tokens_seen": 4702863360, "step": 8970 }, { "epoch": 0.43525439346269723, "grad_norm": 0.26171875, "learning_rate": 3.231093278327908e-05, "loss": 2.7774, "num_input_tokens_seen": 4705484800, "step": 8975 }, { "epoch": 0.43549687501894385, "grad_norm": 0.25390625, "learning_rate": 3.229175987372553e-05, "loss": 2.7743, "num_input_tokens_seen": 4708106240, "step": 8980 }, { "epoch": 0.4357393565751905, "grad_norm": 0.26171875, "learning_rate": 3.2272582275524765e-05, "loss": 2.7851, "num_input_tokens_seen": 4710727680, "step": 8985 }, { "epoch": 0.43598183813143715, "grad_norm": 0.244140625, "learning_rate": 3.22534000010081e-05, "loss": 2.7697, "num_input_tokens_seen": 4713349120, "step": 8990 }, { "epoch": 0.43622431968768377, "grad_norm": 0.25390625, "learning_rate": 3.2234213062509865e-05, "loss": 2.7605, "num_input_tokens_seen": 4715970560, "step": 8995 }, { "epoch": 0.4364668012439304, "grad_norm": 0.251953125, "learning_rate": 3.221502147236737e-05, "loss": 2.7883, "num_input_tokens_seen": 4718592000, "step": 9000 }, { "epoch": 0.4364668012439304, "eval_accuracy": 0.4555984367366878, "eval_loss": 2.7440221309661865, "eval_runtime": 5.8804, "eval_samples_per_second": 51.017, "eval_steps_per_second": 6.462, "num_input_tokens_seen": 4718592000, "step": 9000 }, { "epoch": 0.436709282800177, "grad_norm": 0.255859375, "learning_rate": 3.219582524292093e-05, "loss": 2.7729, "num_input_tokens_seen": 4721213440, "step": 9005 }, { "epoch": 0.43695176435642363, "grad_norm": 0.251953125, "learning_rate": 3.217662438651383e-05, "loss": 2.7722, "num_input_tokens_seen": 4723834880, "step": 9010 }, { "epoch": 0.43719424591267025, "grad_norm": 0.259765625, "learning_rate": 3.2157418915492367e-05, "loss": 2.7743, "num_input_tokens_seen": 4726456320, "step": 9015 }, { "epoch": 0.4374367274689169, "grad_norm": 0.2578125, "learning_rate": 3.213820884220575e-05, "loss": 2.7798, "num_input_tokens_seen": 4729077760, "step": 9020 }, { "epoch": 0.4376792090251635, "grad_norm": 0.25390625, "learning_rate": 3.211899417900621e-05, "loss": 2.7814, "num_input_tokens_seen": 4731699200, "step": 9025 }, { "epoch": 0.4379216905814102, "grad_norm": 0.2490234375, "learning_rate": 3.2099774938248864e-05, "loss": 2.7762, "num_input_tokens_seen": 4734320640, "step": 9030 }, { "epoch": 0.4381641721376568, "grad_norm": 0.248046875, "learning_rate": 3.208055113229183e-05, "loss": 2.7765, "num_input_tokens_seen": 4736942080, "step": 9035 }, { "epoch": 0.4384066536939034, "grad_norm": 0.25390625, "learning_rate": 3.2061322773496106e-05, "loss": 2.7686, "num_input_tokens_seen": 4739563520, "step": 9040 }, { "epoch": 0.43864913525015004, "grad_norm": 0.255859375, "learning_rate": 3.2042089874225665e-05, "loss": 2.7919, "num_input_tokens_seen": 4742184960, "step": 9045 }, { "epoch": 0.43889161680639666, "grad_norm": 0.2470703125, "learning_rate": 3.202285244684738e-05, "loss": 2.7705, "num_input_tokens_seen": 4744806400, "step": 9050 }, { "epoch": 0.4391340983626433, "grad_norm": 0.24609375, "learning_rate": 3.200361050373105e-05, "loss": 2.7638, "num_input_tokens_seen": 4747427840, "step": 9055 }, { "epoch": 0.4393765799188899, "grad_norm": 0.251953125, "learning_rate": 3.198436405724934e-05, "loss": 2.7729, "num_input_tokens_seen": 4750049280, "step": 9060 }, { "epoch": 0.4396190614751366, "grad_norm": 0.2578125, "learning_rate": 3.1965113119777844e-05, "loss": 2.7669, "num_input_tokens_seen": 4752670720, "step": 9065 }, { "epoch": 0.4398615430313832, "grad_norm": 0.25390625, "learning_rate": 3.194585770369504e-05, "loss": 2.7682, "num_input_tokens_seen": 4755292160, "step": 9070 }, { "epoch": 0.4401040245876298, "grad_norm": 0.2578125, "learning_rate": 3.1926597821382295e-05, "loss": 2.7861, "num_input_tokens_seen": 4757913600, "step": 9075 }, { "epoch": 0.44034650614387644, "grad_norm": 0.255859375, "learning_rate": 3.19073334852238e-05, "loss": 2.7897, "num_input_tokens_seen": 4760535040, "step": 9080 }, { "epoch": 0.44058898770012306, "grad_norm": 0.248046875, "learning_rate": 3.188806470760667e-05, "loss": 2.7811, "num_input_tokens_seen": 4763156480, "step": 9085 }, { "epoch": 0.4408314692563697, "grad_norm": 0.255859375, "learning_rate": 3.1868791500920836e-05, "loss": 2.7823, "num_input_tokens_seen": 4765777920, "step": 9090 }, { "epoch": 0.4410739508126163, "grad_norm": 0.2412109375, "learning_rate": 3.18495138775591e-05, "loss": 2.7865, "num_input_tokens_seen": 4768399360, "step": 9095 }, { "epoch": 0.4413164323688629, "grad_norm": 0.2490234375, "learning_rate": 3.183023184991709e-05, "loss": 2.764, "num_input_tokens_seen": 4771020800, "step": 9100 }, { "epoch": 0.4415589139251096, "grad_norm": 0.255859375, "learning_rate": 3.181094543039328e-05, "loss": 2.7795, "num_input_tokens_seen": 4773642240, "step": 9105 }, { "epoch": 0.4418013954813562, "grad_norm": 0.2431640625, "learning_rate": 3.179165463138893e-05, "loss": 2.7685, "num_input_tokens_seen": 4776263680, "step": 9110 }, { "epoch": 0.44204387703760284, "grad_norm": 0.24609375, "learning_rate": 3.177235946530818e-05, "loss": 2.7632, "num_input_tokens_seen": 4778885120, "step": 9115 }, { "epoch": 0.44228635859384946, "grad_norm": 0.2578125, "learning_rate": 3.175305994455791e-05, "loss": 2.7808, "num_input_tokens_seen": 4781506560, "step": 9120 }, { "epoch": 0.4425288401500961, "grad_norm": 0.25, "learning_rate": 3.1733756081547864e-05, "loss": 2.7894, "num_input_tokens_seen": 4784128000, "step": 9125 }, { "epoch": 0.4427713217063427, "grad_norm": 0.251953125, "learning_rate": 3.171444788869052e-05, "loss": 2.7745, "num_input_tokens_seen": 4786749440, "step": 9130 }, { "epoch": 0.4430138032625893, "grad_norm": 0.25, "learning_rate": 3.1695135378401185e-05, "loss": 2.7749, "num_input_tokens_seen": 4789370880, "step": 9135 }, { "epoch": 0.44325628481883594, "grad_norm": 0.259765625, "learning_rate": 3.167581856309792e-05, "loss": 2.782, "num_input_tokens_seen": 4791992320, "step": 9140 }, { "epoch": 0.4434987663750826, "grad_norm": 0.251953125, "learning_rate": 3.1656497455201546e-05, "loss": 2.785, "num_input_tokens_seen": 4794613760, "step": 9145 }, { "epoch": 0.44374124793132924, "grad_norm": 0.25390625, "learning_rate": 3.163717206713567e-05, "loss": 2.7691, "num_input_tokens_seen": 4797235200, "step": 9150 }, { "epoch": 0.44398372948757586, "grad_norm": 0.240234375, "learning_rate": 3.161784241132663e-05, "loss": 2.7753, "num_input_tokens_seen": 4799856640, "step": 9155 }, { "epoch": 0.4442262110438225, "grad_norm": 0.244140625, "learning_rate": 3.159850850020352e-05, "loss": 2.78, "num_input_tokens_seen": 4802478080, "step": 9160 }, { "epoch": 0.4444686926000691, "grad_norm": 0.263671875, "learning_rate": 3.157917034619817e-05, "loss": 2.7739, "num_input_tokens_seen": 4805099520, "step": 9165 }, { "epoch": 0.4447111741563157, "grad_norm": 0.2470703125, "learning_rate": 3.155982796174512e-05, "loss": 2.7739, "num_input_tokens_seen": 4807720960, "step": 9170 }, { "epoch": 0.44495365571256235, "grad_norm": 0.25, "learning_rate": 3.154048135928165e-05, "loss": 2.7868, "num_input_tokens_seen": 4810342400, "step": 9175 }, { "epoch": 0.44519613726880897, "grad_norm": 0.2470703125, "learning_rate": 3.1521130551247755e-05, "loss": 2.7715, "num_input_tokens_seen": 4812963840, "step": 9180 }, { "epoch": 0.44543861882505564, "grad_norm": 0.248046875, "learning_rate": 3.150177555008612e-05, "loss": 2.7747, "num_input_tokens_seen": 4815585280, "step": 9185 }, { "epoch": 0.44568110038130226, "grad_norm": 0.25, "learning_rate": 3.148241636824213e-05, "loss": 2.7619, "num_input_tokens_seen": 4818206720, "step": 9190 }, { "epoch": 0.4459235819375489, "grad_norm": 0.25390625, "learning_rate": 3.146305301816386e-05, "loss": 2.7788, "num_input_tokens_seen": 4820828160, "step": 9195 }, { "epoch": 0.4461660634937955, "grad_norm": 0.26171875, "learning_rate": 3.1443685512302065e-05, "loss": 2.7807, "num_input_tokens_seen": 4823449600, "step": 9200 }, { "epoch": 0.4464085450500421, "grad_norm": 0.255859375, "learning_rate": 3.142431386311018e-05, "loss": 2.781, "num_input_tokens_seen": 4826071040, "step": 9205 }, { "epoch": 0.44665102660628875, "grad_norm": 0.2578125, "learning_rate": 3.140493808304429e-05, "loss": 2.7921, "num_input_tokens_seen": 4828692480, "step": 9210 }, { "epoch": 0.44689350816253537, "grad_norm": 0.251953125, "learning_rate": 3.138555818456314e-05, "loss": 2.7781, "num_input_tokens_seen": 4831313920, "step": 9215 }, { "epoch": 0.447135989718782, "grad_norm": 0.25, "learning_rate": 3.136617418012813e-05, "loss": 2.7862, "num_input_tokens_seen": 4833935360, "step": 9220 }, { "epoch": 0.44737847127502867, "grad_norm": 0.248046875, "learning_rate": 3.134678608220329e-05, "loss": 2.7759, "num_input_tokens_seen": 4836556800, "step": 9225 }, { "epoch": 0.4476209528312753, "grad_norm": 0.24609375, "learning_rate": 3.13273939032553e-05, "loss": 2.7752, "num_input_tokens_seen": 4839178240, "step": 9230 }, { "epoch": 0.4478634343875219, "grad_norm": 0.2470703125, "learning_rate": 3.130799765575344e-05, "loss": 2.7787, "num_input_tokens_seen": 4841799680, "step": 9235 }, { "epoch": 0.44810591594376853, "grad_norm": 0.251953125, "learning_rate": 3.128859735216963e-05, "loss": 2.7803, "num_input_tokens_seen": 4844421120, "step": 9240 }, { "epoch": 0.44834839750001515, "grad_norm": 0.271484375, "learning_rate": 3.126919300497839e-05, "loss": 2.7779, "num_input_tokens_seen": 4847042560, "step": 9245 }, { "epoch": 0.44859087905626177, "grad_norm": 0.255859375, "learning_rate": 3.124978462665681e-05, "loss": 2.7938, "num_input_tokens_seen": 4849664000, "step": 9250 }, { "epoch": 0.4488333606125084, "grad_norm": 0.25390625, "learning_rate": 3.123037222968463e-05, "loss": 2.7858, "num_input_tokens_seen": 4852285440, "step": 9255 }, { "epoch": 0.449075842168755, "grad_norm": 0.25390625, "learning_rate": 3.121095582654412e-05, "loss": 2.7809, "num_input_tokens_seen": 4854906880, "step": 9260 }, { "epoch": 0.4493183237250017, "grad_norm": 0.2451171875, "learning_rate": 3.119153542972017e-05, "loss": 2.7655, "num_input_tokens_seen": 4857528320, "step": 9265 }, { "epoch": 0.4495608052812483, "grad_norm": 0.251953125, "learning_rate": 3.117211105170019e-05, "loss": 2.7759, "num_input_tokens_seen": 4860149760, "step": 9270 }, { "epoch": 0.44980328683749493, "grad_norm": 0.263671875, "learning_rate": 3.11526827049742e-05, "loss": 2.7733, "num_input_tokens_seen": 4862771200, "step": 9275 }, { "epoch": 0.45004576839374155, "grad_norm": 0.244140625, "learning_rate": 3.113325040203474e-05, "loss": 2.7925, "num_input_tokens_seen": 4865392640, "step": 9280 }, { "epoch": 0.4502882499499882, "grad_norm": 0.25390625, "learning_rate": 3.1113814155376897e-05, "loss": 2.7686, "num_input_tokens_seen": 4868014080, "step": 9285 }, { "epoch": 0.4505307315062348, "grad_norm": 0.2451171875, "learning_rate": 3.1094373977498306e-05, "loss": 2.7807, "num_input_tokens_seen": 4870635520, "step": 9290 }, { "epoch": 0.4507732130624814, "grad_norm": 0.2421875, "learning_rate": 3.107492988089912e-05, "loss": 2.7717, "num_input_tokens_seen": 4873256960, "step": 9295 }, { "epoch": 0.45101569461872804, "grad_norm": 0.2470703125, "learning_rate": 3.105548187808202e-05, "loss": 2.7627, "num_input_tokens_seen": 4875878400, "step": 9300 }, { "epoch": 0.45101569461872804, "eval_accuracy": 0.45560657873310534, "eval_loss": 2.7437386512756348, "eval_runtime": 5.8688, "eval_samples_per_second": 51.118, "eval_steps_per_second": 6.475, "num_input_tokens_seen": 4875878400, "step": 9300 }, { "epoch": 0.4512581761749747, "grad_norm": 0.244140625, "learning_rate": 3.103602998155219e-05, "loss": 2.7802, "num_input_tokens_seen": 4878499840, "step": 9305 }, { "epoch": 0.45150065773122133, "grad_norm": 0.2578125, "learning_rate": 3.1016574203817316e-05, "loss": 2.7694, "num_input_tokens_seen": 4881121280, "step": 9310 }, { "epoch": 0.45174313928746795, "grad_norm": 0.25390625, "learning_rate": 3.099711455738759e-05, "loss": 2.7778, "num_input_tokens_seen": 4883742720, "step": 9315 }, { "epoch": 0.4519856208437146, "grad_norm": 0.2470703125, "learning_rate": 3.097765105477569e-05, "loss": 2.7555, "num_input_tokens_seen": 4886364160, "step": 9320 }, { "epoch": 0.4522281023999612, "grad_norm": 0.2470703125, "learning_rate": 3.0958183708496756e-05, "loss": 2.7702, "num_input_tokens_seen": 4888985600, "step": 9325 }, { "epoch": 0.4524705839562078, "grad_norm": 0.240234375, "learning_rate": 3.093871253106843e-05, "loss": 2.781, "num_input_tokens_seen": 4891607040, "step": 9330 }, { "epoch": 0.45271306551245444, "grad_norm": 0.251953125, "learning_rate": 3.0919237535010805e-05, "loss": 2.7824, "num_input_tokens_seen": 4894228480, "step": 9335 }, { "epoch": 0.4529555470687011, "grad_norm": 0.24609375, "learning_rate": 3.08997587328464e-05, "loss": 2.7741, "num_input_tokens_seen": 4896849920, "step": 9340 }, { "epoch": 0.45319802862494774, "grad_norm": 0.2421875, "learning_rate": 3.088027613710022e-05, "loss": 2.7757, "num_input_tokens_seen": 4899471360, "step": 9345 }, { "epoch": 0.45344051018119436, "grad_norm": 0.25390625, "learning_rate": 3.0860789760299705e-05, "loss": 2.7778, "num_input_tokens_seen": 4902092800, "step": 9350 }, { "epoch": 0.453682991737441, "grad_norm": 0.2451171875, "learning_rate": 3.08412996149747e-05, "loss": 2.7714, "num_input_tokens_seen": 4904714240, "step": 9355 }, { "epoch": 0.4539254732936876, "grad_norm": 0.2490234375, "learning_rate": 3.0821805713657504e-05, "loss": 2.7841, "num_input_tokens_seen": 4907335680, "step": 9360 }, { "epoch": 0.4541679548499342, "grad_norm": 0.25, "learning_rate": 3.0802308068882817e-05, "loss": 2.7705, "num_input_tokens_seen": 4909957120, "step": 9365 }, { "epoch": 0.45441043640618084, "grad_norm": 0.248046875, "learning_rate": 3.078280669318774e-05, "loss": 2.7756, "num_input_tokens_seen": 4912578560, "step": 9370 }, { "epoch": 0.45465291796242746, "grad_norm": 0.2490234375, "learning_rate": 3.076330159911178e-05, "loss": 2.7805, "num_input_tokens_seen": 4915200000, "step": 9375 }, { "epoch": 0.45489539951867414, "grad_norm": 0.2431640625, "learning_rate": 3.074379279919683e-05, "loss": 2.7801, "num_input_tokens_seen": 4917821440, "step": 9380 }, { "epoch": 0.45513788107492076, "grad_norm": 0.25, "learning_rate": 3.072428030598719e-05, "loss": 2.771, "num_input_tokens_seen": 4920442880, "step": 9385 }, { "epoch": 0.4553803626311674, "grad_norm": 0.259765625, "learning_rate": 3.07047641320295e-05, "loss": 2.7795, "num_input_tokens_seen": 4923064320, "step": 9390 }, { "epoch": 0.455622844187414, "grad_norm": 0.24609375, "learning_rate": 3.0685244289872777e-05, "loss": 2.7682, "num_input_tokens_seen": 4925685760, "step": 9395 }, { "epoch": 0.4558653257436606, "grad_norm": 0.251953125, "learning_rate": 3.066572079206841e-05, "loss": 2.7947, "num_input_tokens_seen": 4928307200, "step": 9400 }, { "epoch": 0.45610780729990724, "grad_norm": 0.263671875, "learning_rate": 3.064619365117013e-05, "loss": 2.7797, "num_input_tokens_seen": 4930928640, "step": 9405 }, { "epoch": 0.45635028885615386, "grad_norm": 0.2431640625, "learning_rate": 3.0626662879734015e-05, "loss": 2.7681, "num_input_tokens_seen": 4933550080, "step": 9410 }, { "epoch": 0.4565927704124005, "grad_norm": 0.25, "learning_rate": 3.060712849031846e-05, "loss": 2.7614, "num_input_tokens_seen": 4936171520, "step": 9415 }, { "epoch": 0.45683525196864716, "grad_norm": 0.25390625, "learning_rate": 3.058759049548422e-05, "loss": 2.7784, "num_input_tokens_seen": 4938792960, "step": 9420 }, { "epoch": 0.4570777335248938, "grad_norm": 0.255859375, "learning_rate": 3.056804890779433e-05, "loss": 2.7753, "num_input_tokens_seen": 4941414400, "step": 9425 }, { "epoch": 0.4573202150811404, "grad_norm": 0.2490234375, "learning_rate": 3.054850373981415e-05, "loss": 2.7847, "num_input_tokens_seen": 4944035840, "step": 9430 }, { "epoch": 0.457562696637387, "grad_norm": 0.25, "learning_rate": 3.052895500411136e-05, "loss": 2.7763, "num_input_tokens_seen": 4946657280, "step": 9435 }, { "epoch": 0.45780517819363364, "grad_norm": 0.24609375, "learning_rate": 3.0509402713255913e-05, "loss": 2.7803, "num_input_tokens_seen": 4949278720, "step": 9440 }, { "epoch": 0.45804765974988026, "grad_norm": 0.2490234375, "learning_rate": 3.048984687982006e-05, "loss": 2.7799, "num_input_tokens_seen": 4951900160, "step": 9445 }, { "epoch": 0.4582901413061269, "grad_norm": 0.255859375, "learning_rate": 3.0470287516378315e-05, "loss": 2.7765, "num_input_tokens_seen": 4954521600, "step": 9450 }, { "epoch": 0.4585326228623735, "grad_norm": 0.248046875, "learning_rate": 3.045072463550747e-05, "loss": 2.7775, "num_input_tokens_seen": 4957143040, "step": 9455 }, { "epoch": 0.4587751044186202, "grad_norm": 0.26171875, "learning_rate": 3.043115824978659e-05, "loss": 2.7816, "num_input_tokens_seen": 4959764480, "step": 9460 }, { "epoch": 0.4590175859748668, "grad_norm": 0.2578125, "learning_rate": 3.041158837179698e-05, "loss": 2.7748, "num_input_tokens_seen": 4962385920, "step": 9465 }, { "epoch": 0.4592600675311134, "grad_norm": 0.259765625, "learning_rate": 3.039201501412218e-05, "loss": 2.7679, "num_input_tokens_seen": 4965007360, "step": 9470 }, { "epoch": 0.45950254908736005, "grad_norm": 0.2578125, "learning_rate": 3.0372438189348013e-05, "loss": 2.7898, "num_input_tokens_seen": 4967628800, "step": 9475 }, { "epoch": 0.45974503064360667, "grad_norm": 0.2451171875, "learning_rate": 3.0352857910062466e-05, "loss": 2.7744, "num_input_tokens_seen": 4970250240, "step": 9480 }, { "epoch": 0.4599875121998533, "grad_norm": 0.2470703125, "learning_rate": 3.03332741888558e-05, "loss": 2.7674, "num_input_tokens_seen": 4972871680, "step": 9485 }, { "epoch": 0.4602299937560999, "grad_norm": 0.2490234375, "learning_rate": 3.0313687038320464e-05, "loss": 2.7796, "num_input_tokens_seen": 4975493120, "step": 9490 }, { "epoch": 0.46047247531234653, "grad_norm": 0.25, "learning_rate": 3.029409647105112e-05, "loss": 2.779, "num_input_tokens_seen": 4978114560, "step": 9495 }, { "epoch": 0.4607149568685932, "grad_norm": 0.248046875, "learning_rate": 3.0274502499644625e-05, "loss": 2.7835, "num_input_tokens_seen": 4980736000, "step": 9500 }, { "epoch": 0.4609574384248398, "grad_norm": 0.2470703125, "learning_rate": 3.0254905136700036e-05, "loss": 2.7697, "num_input_tokens_seen": 4983357440, "step": 9505 }, { "epoch": 0.46119991998108645, "grad_norm": 0.255859375, "learning_rate": 3.0235304394818553e-05, "loss": 2.7712, "num_input_tokens_seen": 4985978880, "step": 9510 }, { "epoch": 0.46144240153733307, "grad_norm": 0.255859375, "learning_rate": 3.0215700286603606e-05, "loss": 2.7681, "num_input_tokens_seen": 4988600320, "step": 9515 }, { "epoch": 0.4616848830935797, "grad_norm": 0.251953125, "learning_rate": 3.0196092824660732e-05, "loss": 2.7777, "num_input_tokens_seen": 4991221760, "step": 9520 }, { "epoch": 0.4619273646498263, "grad_norm": 0.251953125, "learning_rate": 3.0176482021597675e-05, "loss": 2.7898, "num_input_tokens_seen": 4993843200, "step": 9525 }, { "epoch": 0.46216984620607293, "grad_norm": 0.2470703125, "learning_rate": 3.0156867890024286e-05, "loss": 2.7796, "num_input_tokens_seen": 4996464640, "step": 9530 }, { "epoch": 0.46241232776231955, "grad_norm": 0.2451171875, "learning_rate": 3.0137250442552594e-05, "loss": 2.7688, "num_input_tokens_seen": 4999086080, "step": 9535 }, { "epoch": 0.46265480931856623, "grad_norm": 0.25390625, "learning_rate": 3.011762969179672e-05, "loss": 2.7564, "num_input_tokens_seen": 5001707520, "step": 9540 }, { "epoch": 0.46289729087481285, "grad_norm": 0.24609375, "learning_rate": 3.0098005650372933e-05, "loss": 2.7553, "num_input_tokens_seen": 5004328960, "step": 9545 }, { "epoch": 0.46313977243105947, "grad_norm": 0.248046875, "learning_rate": 3.007837833089963e-05, "loss": 2.7563, "num_input_tokens_seen": 5006950400, "step": 9550 }, { "epoch": 0.4633822539873061, "grad_norm": 0.24609375, "learning_rate": 3.005874774599729e-05, "loss": 2.7804, "num_input_tokens_seen": 5009571840, "step": 9555 }, { "epoch": 0.4636247355435527, "grad_norm": 0.251953125, "learning_rate": 3.00391139082885e-05, "loss": 2.7697, "num_input_tokens_seen": 5012193280, "step": 9560 }, { "epoch": 0.46386721709979933, "grad_norm": 0.25, "learning_rate": 3.0019476830397942e-05, "loss": 2.7715, "num_input_tokens_seen": 5014814720, "step": 9565 }, { "epoch": 0.46410969865604595, "grad_norm": 0.25, "learning_rate": 2.9999836524952385e-05, "loss": 2.7585, "num_input_tokens_seen": 5017436160, "step": 9570 }, { "epoch": 0.4643521802122926, "grad_norm": 0.25, "learning_rate": 2.9980193004580648e-05, "loss": 2.7711, "num_input_tokens_seen": 5020057600, "step": 9575 }, { "epoch": 0.46459466176853925, "grad_norm": 0.25390625, "learning_rate": 2.9960546281913664e-05, "loss": 2.7938, "num_input_tokens_seen": 5022679040, "step": 9580 }, { "epoch": 0.4648371433247859, "grad_norm": 0.25, "learning_rate": 2.9940896369584394e-05, "loss": 2.775, "num_input_tokens_seen": 5025300480, "step": 9585 }, { "epoch": 0.4650796248810325, "grad_norm": 0.25390625, "learning_rate": 2.992124328022784e-05, "loss": 2.775, "num_input_tokens_seen": 5027921920, "step": 9590 }, { "epoch": 0.4653221064372791, "grad_norm": 0.25390625, "learning_rate": 2.9901587026481072e-05, "loss": 2.7668, "num_input_tokens_seen": 5030543360, "step": 9595 }, { "epoch": 0.46556458799352574, "grad_norm": 0.2578125, "learning_rate": 2.9881927620983174e-05, "loss": 2.7841, "num_input_tokens_seen": 5033164800, "step": 9600 }, { "epoch": 0.46556458799352574, "eval_accuracy": 0.45567334310372903, "eval_loss": 2.743530511856079, "eval_runtime": 6.337, "eval_samples_per_second": 47.341, "eval_steps_per_second": 5.997, "num_input_tokens_seen": 5033164800, "step": 9600 }, { "epoch": 0.46580706954977236, "grad_norm": 0.25, "learning_rate": 2.9862265076375285e-05, "loss": 2.7953, "num_input_tokens_seen": 5035786240, "step": 9605 }, { "epoch": 0.466049551106019, "grad_norm": 0.2470703125, "learning_rate": 2.9842599405300524e-05, "loss": 2.7845, "num_input_tokens_seen": 5038407680, "step": 9610 }, { "epoch": 0.46629203266226565, "grad_norm": 0.2490234375, "learning_rate": 2.9822930620404065e-05, "loss": 2.7753, "num_input_tokens_seen": 5041029120, "step": 9615 }, { "epoch": 0.4665345142185123, "grad_norm": 0.24609375, "learning_rate": 2.9803258734333033e-05, "loss": 2.7817, "num_input_tokens_seen": 5043650560, "step": 9620 }, { "epoch": 0.4667769957747589, "grad_norm": 0.25, "learning_rate": 2.9783583759736587e-05, "loss": 2.7734, "num_input_tokens_seen": 5046272000, "step": 9625 }, { "epoch": 0.4670194773310055, "grad_norm": 0.25, "learning_rate": 2.976390570926586e-05, "loss": 2.7708, "num_input_tokens_seen": 5048893440, "step": 9630 }, { "epoch": 0.46726195888725214, "grad_norm": 0.2451171875, "learning_rate": 2.9744224595573956e-05, "loss": 2.7804, "num_input_tokens_seen": 5051514880, "step": 9635 }, { "epoch": 0.46750444044349876, "grad_norm": 0.248046875, "learning_rate": 2.9724540431315962e-05, "loss": 2.774, "num_input_tokens_seen": 5054136320, "step": 9640 }, { "epoch": 0.4677469219997454, "grad_norm": 0.25390625, "learning_rate": 2.970485322914891e-05, "loss": 2.7814, "num_input_tokens_seen": 5056757760, "step": 9645 }, { "epoch": 0.467989403555992, "grad_norm": 0.251953125, "learning_rate": 2.9685163001731803e-05, "loss": 2.7618, "num_input_tokens_seen": 5059379200, "step": 9650 }, { "epoch": 0.4682318851122387, "grad_norm": 0.24609375, "learning_rate": 2.9665469761725567e-05, "loss": 2.7925, "num_input_tokens_seen": 5062000640, "step": 9655 }, { "epoch": 0.4684743666684853, "grad_norm": 0.259765625, "learning_rate": 2.964577352179309e-05, "loss": 2.7791, "num_input_tokens_seen": 5064622080, "step": 9660 }, { "epoch": 0.4687168482247319, "grad_norm": 0.251953125, "learning_rate": 2.9626074294599177e-05, "loss": 2.7738, "num_input_tokens_seen": 5067243520, "step": 9665 }, { "epoch": 0.46895932978097854, "grad_norm": 0.25, "learning_rate": 2.9606372092810554e-05, "loss": 2.7793, "num_input_tokens_seen": 5069864960, "step": 9670 }, { "epoch": 0.46920181133722516, "grad_norm": 0.255859375, "learning_rate": 2.9586666929095857e-05, "loss": 2.7774, "num_input_tokens_seen": 5072486400, "step": 9675 }, { "epoch": 0.4694442928934718, "grad_norm": 0.248046875, "learning_rate": 2.9566958816125628e-05, "loss": 2.7787, "num_input_tokens_seen": 5075107840, "step": 9680 }, { "epoch": 0.4696867744497184, "grad_norm": 0.251953125, "learning_rate": 2.954724776657231e-05, "loss": 2.7714, "num_input_tokens_seen": 5077729280, "step": 9685 }, { "epoch": 0.469929256005965, "grad_norm": 0.251953125, "learning_rate": 2.952753379311023e-05, "loss": 2.7759, "num_input_tokens_seen": 5080350720, "step": 9690 }, { "epoch": 0.4701717375622117, "grad_norm": 0.263671875, "learning_rate": 2.9507816908415598e-05, "loss": 2.784, "num_input_tokens_seen": 5082972160, "step": 9695 }, { "epoch": 0.4704142191184583, "grad_norm": 0.244140625, "learning_rate": 2.948809712516649e-05, "loss": 2.7722, "num_input_tokens_seen": 5085593600, "step": 9700 }, { "epoch": 0.47065670067470494, "grad_norm": 0.259765625, "learning_rate": 2.9468374456042857e-05, "loss": 2.7904, "num_input_tokens_seen": 5088215040, "step": 9705 }, { "epoch": 0.47089918223095156, "grad_norm": 0.2578125, "learning_rate": 2.9448648913726495e-05, "loss": 2.771, "num_input_tokens_seen": 5090836480, "step": 9710 }, { "epoch": 0.4711416637871982, "grad_norm": 0.251953125, "learning_rate": 2.942892051090104e-05, "loss": 2.784, "num_input_tokens_seen": 5093457920, "step": 9715 }, { "epoch": 0.4713841453434448, "grad_norm": 0.25, "learning_rate": 2.9409189260252e-05, "loss": 2.7794, "num_input_tokens_seen": 5096079360, "step": 9720 }, { "epoch": 0.4716266268996914, "grad_norm": 0.251953125, "learning_rate": 2.9389455174466684e-05, "loss": 2.7578, "num_input_tokens_seen": 5098700800, "step": 9725 }, { "epoch": 0.47186910845593805, "grad_norm": 0.251953125, "learning_rate": 2.936971826623423e-05, "loss": 2.7874, "num_input_tokens_seen": 5101322240, "step": 9730 }, { "epoch": 0.4721115900121847, "grad_norm": 0.2578125, "learning_rate": 2.9349978548245587e-05, "loss": 2.7734, "num_input_tokens_seen": 5103943680, "step": 9735 }, { "epoch": 0.47235407156843134, "grad_norm": 0.25390625, "learning_rate": 2.9330236033193538e-05, "loss": 2.7948, "num_input_tokens_seen": 5106565120, "step": 9740 }, { "epoch": 0.47259655312467796, "grad_norm": 0.27734375, "learning_rate": 2.931049073377261e-05, "loss": 2.7769, "num_input_tokens_seen": 5109186560, "step": 9745 }, { "epoch": 0.4728390346809246, "grad_norm": 0.2392578125, "learning_rate": 2.9290742662679183e-05, "loss": 2.7823, "num_input_tokens_seen": 5111808000, "step": 9750 }, { "epoch": 0.4730815162371712, "grad_norm": 0.25, "learning_rate": 2.927099183261138e-05, "loss": 2.7735, "num_input_tokens_seen": 5114429440, "step": 9755 }, { "epoch": 0.4733239977934178, "grad_norm": 0.2490234375, "learning_rate": 2.9251238256269116e-05, "loss": 2.7771, "num_input_tokens_seen": 5117050880, "step": 9760 }, { "epoch": 0.47356647934966445, "grad_norm": 0.251953125, "learning_rate": 2.923148194635405e-05, "loss": 2.7808, "num_input_tokens_seen": 5119672320, "step": 9765 }, { "epoch": 0.47380896090591107, "grad_norm": 0.248046875, "learning_rate": 2.9211722915569622e-05, "loss": 2.7758, "num_input_tokens_seen": 5122293760, "step": 9770 }, { "epoch": 0.47405144246215775, "grad_norm": 0.255859375, "learning_rate": 2.9191961176621007e-05, "loss": 2.7765, "num_input_tokens_seen": 5124915200, "step": 9775 }, { "epoch": 0.47429392401840437, "grad_norm": 0.251953125, "learning_rate": 2.9172196742215135e-05, "loss": 2.7782, "num_input_tokens_seen": 5127536640, "step": 9780 }, { "epoch": 0.474536405574651, "grad_norm": 0.2451171875, "learning_rate": 2.9152429625060664e-05, "loss": 2.7861, "num_input_tokens_seen": 5130158080, "step": 9785 }, { "epoch": 0.4747788871308976, "grad_norm": 0.2490234375, "learning_rate": 2.913265983786796e-05, "loss": 2.7683, "num_input_tokens_seen": 5132779520, "step": 9790 }, { "epoch": 0.47502136868714423, "grad_norm": 0.26953125, "learning_rate": 2.911288739334912e-05, "loss": 2.7809, "num_input_tokens_seen": 5135400960, "step": 9795 }, { "epoch": 0.47526385024339085, "grad_norm": 0.255859375, "learning_rate": 2.9093112304217962e-05, "loss": 2.776, "num_input_tokens_seen": 5138022400, "step": 9800 }, { "epoch": 0.47550633179963747, "grad_norm": 0.26171875, "learning_rate": 2.907333458318998e-05, "loss": 2.7738, "num_input_tokens_seen": 5140643840, "step": 9805 }, { "epoch": 0.4757488133558841, "grad_norm": 0.2470703125, "learning_rate": 2.905355424298239e-05, "loss": 2.7948, "num_input_tokens_seen": 5143265280, "step": 9810 }, { "epoch": 0.47599129491213077, "grad_norm": 0.2431640625, "learning_rate": 2.903377129631406e-05, "loss": 2.7737, "num_input_tokens_seen": 5145886720, "step": 9815 }, { "epoch": 0.4762337764683774, "grad_norm": 0.2431640625, "learning_rate": 2.9013985755905544e-05, "loss": 2.7716, "num_input_tokens_seen": 5148508160, "step": 9820 }, { "epoch": 0.476476258024624, "grad_norm": 0.24609375, "learning_rate": 2.899419763447908e-05, "loss": 2.7773, "num_input_tokens_seen": 5151129600, "step": 9825 }, { "epoch": 0.47671873958087063, "grad_norm": 0.251953125, "learning_rate": 2.8974406944758548e-05, "loss": 2.779, "num_input_tokens_seen": 5153751040, "step": 9830 }, { "epoch": 0.47696122113711725, "grad_norm": 0.251953125, "learning_rate": 2.8954613699469485e-05, "loss": 2.767, "num_input_tokens_seen": 5156372480, "step": 9835 }, { "epoch": 0.4772037026933639, "grad_norm": 0.25, "learning_rate": 2.8934817911339075e-05, "loss": 2.79, "num_input_tokens_seen": 5158993920, "step": 9840 }, { "epoch": 0.4774461842496105, "grad_norm": 0.25, "learning_rate": 2.8915019593096138e-05, "loss": 2.7798, "num_input_tokens_seen": 5161615360, "step": 9845 }, { "epoch": 0.4776886658058571, "grad_norm": 0.255859375, "learning_rate": 2.8895218757471105e-05, "loss": 2.7674, "num_input_tokens_seen": 5164236800, "step": 9850 }, { "epoch": 0.4779311473621038, "grad_norm": 0.2431640625, "learning_rate": 2.8875415417196038e-05, "loss": 2.7766, "num_input_tokens_seen": 5166858240, "step": 9855 }, { "epoch": 0.4781736289183504, "grad_norm": 0.2470703125, "learning_rate": 2.8855609585004613e-05, "loss": 2.7881, "num_input_tokens_seen": 5169479680, "step": 9860 }, { "epoch": 0.47841611047459703, "grad_norm": 0.236328125, "learning_rate": 2.8835801273632112e-05, "loss": 2.7514, "num_input_tokens_seen": 5172101120, "step": 9865 }, { "epoch": 0.47865859203084365, "grad_norm": 0.2470703125, "learning_rate": 2.8815990495815394e-05, "loss": 2.7634, "num_input_tokens_seen": 5174722560, "step": 9870 }, { "epoch": 0.4789010735870903, "grad_norm": 0.2578125, "learning_rate": 2.8796177264292905e-05, "loss": 2.7633, "num_input_tokens_seen": 5177344000, "step": 9875 }, { "epoch": 0.4791435551433369, "grad_norm": 0.2578125, "learning_rate": 2.8776361591804703e-05, "loss": 2.7783, "num_input_tokens_seen": 5179965440, "step": 9880 }, { "epoch": 0.4793860366995835, "grad_norm": 0.244140625, "learning_rate": 2.875654349109235e-05, "loss": 2.7606, "num_input_tokens_seen": 5182586880, "step": 9885 }, { "epoch": 0.47962851825583014, "grad_norm": 0.25390625, "learning_rate": 2.873672297489905e-05, "loss": 2.7646, "num_input_tokens_seen": 5185208320, "step": 9890 }, { "epoch": 0.4798709998120768, "grad_norm": 0.25, "learning_rate": 2.8716900055969497e-05, "loss": 2.7728, "num_input_tokens_seen": 5187829760, "step": 9895 }, { "epoch": 0.48011348136832344, "grad_norm": 0.25390625, "learning_rate": 2.869707474704995e-05, "loss": 2.7734, "num_input_tokens_seen": 5190451200, "step": 9900 }, { "epoch": 0.48011348136832344, "eval_accuracy": 0.4556896270965641, "eval_loss": 2.743288516998291, "eval_runtime": 6.1957, "eval_samples_per_second": 48.421, "eval_steps_per_second": 6.133, "num_input_tokens_seen": 5190451200, "step": 9900 }, { "epoch": 0.48035596292457006, "grad_norm": 0.2470703125, "learning_rate": 2.8677247060888217e-05, "loss": 2.7768, "num_input_tokens_seen": 5193072640, "step": 9905 }, { "epoch": 0.4805984444808167, "grad_norm": 0.244140625, "learning_rate": 2.8657417010233616e-05, "loss": 2.775, "num_input_tokens_seen": 5195694080, "step": 9910 }, { "epoch": 0.4808409260370633, "grad_norm": 0.23828125, "learning_rate": 2.8637584607836997e-05, "loss": 2.7721, "num_input_tokens_seen": 5198315520, "step": 9915 }, { "epoch": 0.4810834075933099, "grad_norm": 0.26171875, "learning_rate": 2.8617749866450716e-05, "loss": 2.775, "num_input_tokens_seen": 5200936960, "step": 9920 }, { "epoch": 0.48132588914955654, "grad_norm": 0.248046875, "learning_rate": 2.8597912798828647e-05, "loss": 2.7946, "num_input_tokens_seen": 5203558400, "step": 9925 }, { "epoch": 0.4815683707058032, "grad_norm": 0.2431640625, "learning_rate": 2.8578073417726132e-05, "loss": 2.7977, "num_input_tokens_seen": 5206179840, "step": 9930 }, { "epoch": 0.48181085226204984, "grad_norm": 0.2451171875, "learning_rate": 2.8558231735900028e-05, "loss": 2.7676, "num_input_tokens_seen": 5208801280, "step": 9935 }, { "epoch": 0.48205333381829646, "grad_norm": 0.255859375, "learning_rate": 2.8538387766108655e-05, "loss": 2.7659, "num_input_tokens_seen": 5211422720, "step": 9940 }, { "epoch": 0.4822958153745431, "grad_norm": 0.251953125, "learning_rate": 2.8518541521111813e-05, "loss": 2.7696, "num_input_tokens_seen": 5214044160, "step": 9945 }, { "epoch": 0.4825382969307897, "grad_norm": 0.251953125, "learning_rate": 2.849869301367076e-05, "loss": 2.7786, "num_input_tokens_seen": 5216665600, "step": 9950 }, { "epoch": 0.4827807784870363, "grad_norm": 0.255859375, "learning_rate": 2.8478842256548215e-05, "loss": 2.7738, "num_input_tokens_seen": 5219287040, "step": 9955 }, { "epoch": 0.48302326004328294, "grad_norm": 0.255859375, "learning_rate": 2.8458989262508334e-05, "loss": 2.7723, "num_input_tokens_seen": 5221908480, "step": 9960 }, { "epoch": 0.48326574159952956, "grad_norm": 0.2470703125, "learning_rate": 2.8439134044316716e-05, "loss": 2.7677, "num_input_tokens_seen": 5224529920, "step": 9965 }, { "epoch": 0.48350822315577624, "grad_norm": 0.248046875, "learning_rate": 2.8419276614740397e-05, "loss": 2.7737, "num_input_tokens_seen": 5227151360, "step": 9970 }, { "epoch": 0.48375070471202286, "grad_norm": 0.24609375, "learning_rate": 2.8399416986547817e-05, "loss": 2.7726, "num_input_tokens_seen": 5229772800, "step": 9975 }, { "epoch": 0.4839931862682695, "grad_norm": 0.24609375, "learning_rate": 2.8379555172508853e-05, "loss": 2.777, "num_input_tokens_seen": 5232394240, "step": 9980 }, { "epoch": 0.4842356678245161, "grad_norm": 0.25, "learning_rate": 2.835969118539477e-05, "loss": 2.7792, "num_input_tokens_seen": 5235015680, "step": 9985 }, { "epoch": 0.4844781493807627, "grad_norm": 0.248046875, "learning_rate": 2.8339825037978234e-05, "loss": 2.7743, "num_input_tokens_seen": 5237637120, "step": 9990 }, { "epoch": 0.48472063093700934, "grad_norm": 0.251953125, "learning_rate": 2.831995674303331e-05, "loss": 2.7795, "num_input_tokens_seen": 5240258560, "step": 9995 }, { "epoch": 0.48496311249325597, "grad_norm": 0.259765625, "learning_rate": 2.830008631333543e-05, "loss": 2.779, "num_input_tokens_seen": 5242880000, "step": 10000 }, { "epoch": 0.4852055940495026, "grad_norm": 0.2578125, "learning_rate": 2.8280213761661394e-05, "loss": 2.793, "num_input_tokens_seen": 5245501440, "step": 10005 }, { "epoch": 0.48544807560574926, "grad_norm": 0.2470703125, "learning_rate": 2.8260339100789397e-05, "loss": 2.7825, "num_input_tokens_seen": 5248122880, "step": 10010 }, { "epoch": 0.4856905571619959, "grad_norm": 0.25, "learning_rate": 2.8240462343498963e-05, "loss": 2.7842, "num_input_tokens_seen": 5250744320, "step": 10015 }, { "epoch": 0.4859330387182425, "grad_norm": 0.255859375, "learning_rate": 2.822058350257097e-05, "loss": 2.7721, "num_input_tokens_seen": 5253365760, "step": 10020 }, { "epoch": 0.4861755202744891, "grad_norm": 0.2412109375, "learning_rate": 2.8200702590787637e-05, "loss": 2.7691, "num_input_tokens_seen": 5255987200, "step": 10025 }, { "epoch": 0.48641800183073575, "grad_norm": 0.25, "learning_rate": 2.8180819620932503e-05, "loss": 2.7571, "num_input_tokens_seen": 5258608640, "step": 10030 }, { "epoch": 0.48666048338698237, "grad_norm": 0.255859375, "learning_rate": 2.816093460579047e-05, "loss": 2.7751, "num_input_tokens_seen": 5261230080, "step": 10035 }, { "epoch": 0.486902964943229, "grad_norm": 0.251953125, "learning_rate": 2.8141047558147704e-05, "loss": 2.771, "num_input_tokens_seen": 5263851520, "step": 10040 }, { "epoch": 0.4871454464994756, "grad_norm": 0.2451171875, "learning_rate": 2.8121158490791706e-05, "loss": 2.788, "num_input_tokens_seen": 5266472960, "step": 10045 }, { "epoch": 0.4873879280557223, "grad_norm": 0.259765625, "learning_rate": 2.810126741651128e-05, "loss": 2.7768, "num_input_tokens_seen": 5269094400, "step": 10050 }, { "epoch": 0.4876304096119689, "grad_norm": 0.25, "learning_rate": 2.808137434809649e-05, "loss": 2.7901, "num_input_tokens_seen": 5271715840, "step": 10055 }, { "epoch": 0.48787289116821553, "grad_norm": 0.25390625, "learning_rate": 2.806147929833872e-05, "loss": 2.7703, "num_input_tokens_seen": 5274337280, "step": 10060 }, { "epoch": 0.48811537272446215, "grad_norm": 0.2490234375, "learning_rate": 2.804158228003062e-05, "loss": 2.7837, "num_input_tokens_seen": 5276958720, "step": 10065 }, { "epoch": 0.48835785428070877, "grad_norm": 0.259765625, "learning_rate": 2.8021683305966078e-05, "loss": 2.7862, "num_input_tokens_seen": 5279580160, "step": 10070 }, { "epoch": 0.4886003358369554, "grad_norm": 0.2451171875, "learning_rate": 2.8001782388940268e-05, "loss": 2.7691, "num_input_tokens_seen": 5282201600, "step": 10075 }, { "epoch": 0.488842817393202, "grad_norm": 0.2451171875, "learning_rate": 2.79818795417496e-05, "loss": 2.7753, "num_input_tokens_seen": 5284823040, "step": 10080 }, { "epoch": 0.48908529894944863, "grad_norm": 0.25, "learning_rate": 2.7961974777191736e-05, "loss": 2.7761, "num_input_tokens_seen": 5287444480, "step": 10085 }, { "epoch": 0.4893277805056953, "grad_norm": 0.25390625, "learning_rate": 2.7942068108065555e-05, "loss": 2.7633, "num_input_tokens_seen": 5290065920, "step": 10090 }, { "epoch": 0.48957026206194193, "grad_norm": 0.251953125, "learning_rate": 2.7922159547171184e-05, "loss": 2.7823, "num_input_tokens_seen": 5292687360, "step": 10095 }, { "epoch": 0.48981274361818855, "grad_norm": 0.251953125, "learning_rate": 2.7902249107309943e-05, "loss": 2.7701, "num_input_tokens_seen": 5295308800, "step": 10100 }, { "epoch": 0.49005522517443517, "grad_norm": 0.2470703125, "learning_rate": 2.788233680128436e-05, "loss": 2.7816, "num_input_tokens_seen": 5297930240, "step": 10105 }, { "epoch": 0.4902977067306818, "grad_norm": 0.2412109375, "learning_rate": 2.7862422641898182e-05, "loss": 2.7677, "num_input_tokens_seen": 5300551680, "step": 10110 }, { "epoch": 0.4905401882869284, "grad_norm": 0.265625, "learning_rate": 2.7842506641956346e-05, "loss": 2.7749, "num_input_tokens_seen": 5303173120, "step": 10115 }, { "epoch": 0.49078266984317503, "grad_norm": 0.25, "learning_rate": 2.782258881426495e-05, "loss": 2.7748, "num_input_tokens_seen": 5305794560, "step": 10120 }, { "epoch": 0.49102515139942166, "grad_norm": 0.248046875, "learning_rate": 2.7802669171631297e-05, "loss": 2.7566, "num_input_tokens_seen": 5308416000, "step": 10125 }, { "epoch": 0.49126763295566833, "grad_norm": 0.25390625, "learning_rate": 2.7782747726863827e-05, "loss": 2.7735, "num_input_tokens_seen": 5311037440, "step": 10130 }, { "epoch": 0.49151011451191495, "grad_norm": 0.25390625, "learning_rate": 2.776282449277216e-05, "loss": 2.7642, "num_input_tokens_seen": 5313658880, "step": 10135 }, { "epoch": 0.4917525960681616, "grad_norm": 0.2431640625, "learning_rate": 2.7742899482167063e-05, "loss": 2.7836, "num_input_tokens_seen": 5316280320, "step": 10140 }, { "epoch": 0.4919950776244082, "grad_norm": 0.2412109375, "learning_rate": 2.7722972707860435e-05, "loss": 2.7686, "num_input_tokens_seen": 5318901760, "step": 10145 }, { "epoch": 0.4922375591806548, "grad_norm": 0.248046875, "learning_rate": 2.770304418266532e-05, "loss": 2.7712, "num_input_tokens_seen": 5321523200, "step": 10150 }, { "epoch": 0.49248004073690144, "grad_norm": 0.2421875, "learning_rate": 2.768311391939589e-05, "loss": 2.7749, "num_input_tokens_seen": 5324144640, "step": 10155 }, { "epoch": 0.49272252229314806, "grad_norm": 0.24609375, "learning_rate": 2.7663181930867428e-05, "loss": 2.7686, "num_input_tokens_seen": 5326766080, "step": 10160 }, { "epoch": 0.4929650038493947, "grad_norm": 0.251953125, "learning_rate": 2.7643248229896314e-05, "loss": 2.7831, "num_input_tokens_seen": 5329387520, "step": 10165 }, { "epoch": 0.49320748540564135, "grad_norm": 0.255859375, "learning_rate": 2.762331282930005e-05, "loss": 2.7871, "num_input_tokens_seen": 5332008960, "step": 10170 }, { "epoch": 0.493449966961888, "grad_norm": 0.25390625, "learning_rate": 2.7603375741897235e-05, "loss": 2.7793, "num_input_tokens_seen": 5334630400, "step": 10175 }, { "epoch": 0.4936924485181346, "grad_norm": 0.251953125, "learning_rate": 2.7583436980507528e-05, "loss": 2.776, "num_input_tokens_seen": 5337251840, "step": 10180 }, { "epoch": 0.4939349300743812, "grad_norm": 0.259765625, "learning_rate": 2.756349655795168e-05, "loss": 2.7764, "num_input_tokens_seen": 5339873280, "step": 10185 }, { "epoch": 0.49417741163062784, "grad_norm": 0.24609375, "learning_rate": 2.754355448705151e-05, "loss": 2.7685, "num_input_tokens_seen": 5342494720, "step": 10190 }, { "epoch": 0.49441989318687446, "grad_norm": 0.24609375, "learning_rate": 2.7523610780629893e-05, "loss": 2.7762, "num_input_tokens_seen": 5345116160, "step": 10195 }, { "epoch": 0.4946623747431211, "grad_norm": 0.2412109375, "learning_rate": 2.7503665451510746e-05, "loss": 2.7829, "num_input_tokens_seen": 5347737600, "step": 10200 }, { "epoch": 0.4946623747431211, "eval_accuracy": 0.45571893828366716, "eval_loss": 2.742992639541626, "eval_runtime": 5.8537, "eval_samples_per_second": 51.25, "eval_steps_per_second": 6.492, "num_input_tokens_seen": 5347737600, "step": 10200 }, { "epoch": 0.49490485629936776, "grad_norm": 0.25, "learning_rate": 2.7483718512519062e-05, "loss": 2.7699, "num_input_tokens_seen": 5350359040, "step": 10205 }, { "epoch": 0.4951473378556144, "grad_norm": 0.25390625, "learning_rate": 2.7463769976480837e-05, "loss": 2.7728, "num_input_tokens_seen": 5352980480, "step": 10210 }, { "epoch": 0.495389819411861, "grad_norm": 0.2470703125, "learning_rate": 2.7443819856223097e-05, "loss": 2.778, "num_input_tokens_seen": 5355601920, "step": 10215 }, { "epoch": 0.4956323009681076, "grad_norm": 0.2431640625, "learning_rate": 2.7423868164573907e-05, "loss": 2.7786, "num_input_tokens_seen": 5358223360, "step": 10220 }, { "epoch": 0.49587478252435424, "grad_norm": 0.255859375, "learning_rate": 2.7403914914362317e-05, "loss": 2.7663, "num_input_tokens_seen": 5360844800, "step": 10225 }, { "epoch": 0.49611726408060086, "grad_norm": 0.2373046875, "learning_rate": 2.73839601184184e-05, "loss": 2.7782, "num_input_tokens_seen": 5363466240, "step": 10230 }, { "epoch": 0.4963597456368475, "grad_norm": 0.25390625, "learning_rate": 2.7364003789573216e-05, "loss": 2.7852, "num_input_tokens_seen": 5366087680, "step": 10235 }, { "epoch": 0.4966022271930941, "grad_norm": 0.25390625, "learning_rate": 2.7344045940658807e-05, "loss": 2.7884, "num_input_tokens_seen": 5368709120, "step": 10240 }, { "epoch": 0.4968447087493408, "grad_norm": 0.251953125, "learning_rate": 2.732408658450819e-05, "loss": 2.7718, "num_input_tokens_seen": 5371330560, "step": 10245 }, { "epoch": 0.4970871903055874, "grad_norm": 0.2470703125, "learning_rate": 2.730412573395536e-05, "loss": 2.7532, "num_input_tokens_seen": 5373952000, "step": 10250 }, { "epoch": 0.497329671861834, "grad_norm": 0.2431640625, "learning_rate": 2.7284163401835274e-05, "loss": 2.7714, "num_input_tokens_seen": 5376573440, "step": 10255 }, { "epoch": 0.49757215341808064, "grad_norm": 0.259765625, "learning_rate": 2.726419960098382e-05, "loss": 2.7828, "num_input_tokens_seen": 5379194880, "step": 10260 }, { "epoch": 0.49781463497432726, "grad_norm": 0.255859375, "learning_rate": 2.7244234344237868e-05, "loss": 2.773, "num_input_tokens_seen": 5381816320, "step": 10265 }, { "epoch": 0.4980571165305739, "grad_norm": 0.251953125, "learning_rate": 2.722426764443519e-05, "loss": 2.7669, "num_input_tokens_seen": 5384437760, "step": 10270 }, { "epoch": 0.4982995980868205, "grad_norm": 0.248046875, "learning_rate": 2.72042995144145e-05, "loss": 2.765, "num_input_tokens_seen": 5387059200, "step": 10275 }, { "epoch": 0.4985420796430671, "grad_norm": 0.24609375, "learning_rate": 2.718432996701543e-05, "loss": 2.7815, "num_input_tokens_seen": 5389680640, "step": 10280 }, { "epoch": 0.4987845611993138, "grad_norm": 0.2470703125, "learning_rate": 2.7164359015078533e-05, "loss": 2.7781, "num_input_tokens_seen": 5392302080, "step": 10285 }, { "epoch": 0.4990270427555604, "grad_norm": 0.255859375, "learning_rate": 2.7144386671445242e-05, "loss": 2.7668, "num_input_tokens_seen": 5394923520, "step": 10290 }, { "epoch": 0.49926952431180704, "grad_norm": 0.244140625, "learning_rate": 2.7124412948957913e-05, "loss": 2.7715, "num_input_tokens_seen": 5397544960, "step": 10295 }, { "epoch": 0.49951200586805367, "grad_norm": 0.25390625, "learning_rate": 2.7104437860459763e-05, "loss": 2.7773, "num_input_tokens_seen": 5400166400, "step": 10300 }, { "epoch": 0.4997544874243003, "grad_norm": 0.248046875, "learning_rate": 2.7084461418794903e-05, "loss": 2.7761, "num_input_tokens_seen": 5402787840, "step": 10305 }, { "epoch": 0.4999969689805469, "grad_norm": 0.251953125, "learning_rate": 2.7064483636808313e-05, "loss": 2.7861, "num_input_tokens_seen": 5405409280, "step": 10310 }, { "epoch": 0.5002394505367935, "grad_norm": 0.25, "learning_rate": 2.7044504527345826e-05, "loss": 2.7889, "num_input_tokens_seen": 5408030720, "step": 10315 }, { "epoch": 0.5004819320930401, "grad_norm": 0.244140625, "learning_rate": 2.702452410325414e-05, "loss": 2.7849, "num_input_tokens_seen": 5410652160, "step": 10320 }, { "epoch": 0.5007244136492868, "grad_norm": 0.251953125, "learning_rate": 2.7004542377380788e-05, "loss": 2.7642, "num_input_tokens_seen": 5413273600, "step": 10325 }, { "epoch": 0.5009668952055334, "grad_norm": 0.2490234375, "learning_rate": 2.698455936257415e-05, "loss": 2.763, "num_input_tokens_seen": 5415895040, "step": 10330 }, { "epoch": 0.50120937676178, "grad_norm": 0.2470703125, "learning_rate": 2.6964575071683423e-05, "loss": 2.7908, "num_input_tokens_seen": 5418516480, "step": 10335 }, { "epoch": 0.5014518583180266, "grad_norm": 0.2451171875, "learning_rate": 2.694458951755863e-05, "loss": 2.7781, "num_input_tokens_seen": 5421137920, "step": 10340 }, { "epoch": 0.5016943398742734, "grad_norm": 0.2421875, "learning_rate": 2.6924602713050623e-05, "loss": 2.7945, "num_input_tokens_seen": 5423759360, "step": 10345 }, { "epoch": 0.50193682143052, "grad_norm": 0.248046875, "learning_rate": 2.6904614671011025e-05, "loss": 2.7726, "num_input_tokens_seen": 5426380800, "step": 10350 }, { "epoch": 0.5021793029867666, "grad_norm": 0.251953125, "learning_rate": 2.688462540429228e-05, "loss": 2.7678, "num_input_tokens_seen": 5429002240, "step": 10355 }, { "epoch": 0.5024217845430132, "grad_norm": 0.255859375, "learning_rate": 2.686463492574761e-05, "loss": 2.7737, "num_input_tokens_seen": 5431623680, "step": 10360 }, { "epoch": 0.5026642660992598, "grad_norm": 0.248046875, "learning_rate": 2.6844643248231016e-05, "loss": 2.782, "num_input_tokens_seen": 5434245120, "step": 10365 }, { "epoch": 0.5029067476555065, "grad_norm": 0.2421875, "learning_rate": 2.6824650384597272e-05, "loss": 2.771, "num_input_tokens_seen": 5436866560, "step": 10370 }, { "epoch": 0.5031492292117531, "grad_norm": 0.2470703125, "learning_rate": 2.6804656347701923e-05, "loss": 2.7808, "num_input_tokens_seen": 5439488000, "step": 10375 }, { "epoch": 0.5033917107679997, "grad_norm": 0.25, "learning_rate": 2.6784661150401248e-05, "loss": 2.7828, "num_input_tokens_seen": 5442109440, "step": 10380 }, { "epoch": 0.5036341923242463, "grad_norm": 0.251953125, "learning_rate": 2.6764664805552287e-05, "loss": 2.7549, "num_input_tokens_seen": 5444730880, "step": 10385 }, { "epoch": 0.503876673880493, "grad_norm": 0.251953125, "learning_rate": 2.6744667326012822e-05, "loss": 2.7733, "num_input_tokens_seen": 5447352320, "step": 10390 }, { "epoch": 0.5041191554367396, "grad_norm": 0.2421875, "learning_rate": 2.672466872464134e-05, "loss": 2.7889, "num_input_tokens_seen": 5449973760, "step": 10395 }, { "epoch": 0.5043616369929862, "grad_norm": 0.25390625, "learning_rate": 2.6704669014297083e-05, "loss": 2.7714, "num_input_tokens_seen": 5452595200, "step": 10400 }, { "epoch": 0.5046041185492328, "grad_norm": 0.2578125, "learning_rate": 2.6684668207839996e-05, "loss": 2.7692, "num_input_tokens_seen": 5455216640, "step": 10405 }, { "epoch": 0.5048466001054794, "grad_norm": 0.2490234375, "learning_rate": 2.6664666318130704e-05, "loss": 2.7891, "num_input_tokens_seen": 5457838080, "step": 10410 }, { "epoch": 0.5050890816617261, "grad_norm": 0.2578125, "learning_rate": 2.6644663358030552e-05, "loss": 2.7877, "num_input_tokens_seen": 5460459520, "step": 10415 }, { "epoch": 0.5053315632179727, "grad_norm": 0.255859375, "learning_rate": 2.6624659340401576e-05, "loss": 2.7846, "num_input_tokens_seen": 5463080960, "step": 10420 }, { "epoch": 0.5055740447742194, "grad_norm": 0.2578125, "learning_rate": 2.6604654278106477e-05, "loss": 2.7814, "num_input_tokens_seen": 5465702400, "step": 10425 }, { "epoch": 0.505816526330466, "grad_norm": 0.25, "learning_rate": 2.6584648184008642e-05, "loss": 2.7728, "num_input_tokens_seen": 5468323840, "step": 10430 }, { "epoch": 0.5060590078867127, "grad_norm": 0.24609375, "learning_rate": 2.6564641070972117e-05, "loss": 2.7744, "num_input_tokens_seen": 5470945280, "step": 10435 }, { "epoch": 0.5063014894429593, "grad_norm": 0.259765625, "learning_rate": 2.6544632951861586e-05, "loss": 2.7633, "num_input_tokens_seen": 5473566720, "step": 10440 }, { "epoch": 0.5065439709992059, "grad_norm": 0.2421875, "learning_rate": 2.6524623839542408e-05, "loss": 2.7841, "num_input_tokens_seen": 5476188160, "step": 10445 }, { "epoch": 0.5067864525554525, "grad_norm": 0.244140625, "learning_rate": 2.6504613746880557e-05, "loss": 2.7543, "num_input_tokens_seen": 5478809600, "step": 10450 }, { "epoch": 0.5070289341116991, "grad_norm": 0.25390625, "learning_rate": 2.648460268674266e-05, "loss": 2.7903, "num_input_tokens_seen": 5481431040, "step": 10455 }, { "epoch": 0.5072714156679458, "grad_norm": 0.251953125, "learning_rate": 2.6464590671995943e-05, "loss": 2.7832, "num_input_tokens_seen": 5484052480, "step": 10460 }, { "epoch": 0.5075138972241924, "grad_norm": 0.255859375, "learning_rate": 2.6444577715508268e-05, "loss": 2.7857, "num_input_tokens_seen": 5486673920, "step": 10465 }, { "epoch": 0.507756378780439, "grad_norm": 0.2451171875, "learning_rate": 2.642456383014808e-05, "loss": 2.7707, "num_input_tokens_seen": 5489295360, "step": 10470 }, { "epoch": 0.5079988603366856, "grad_norm": 0.25390625, "learning_rate": 2.6404549028784438e-05, "loss": 2.7745, "num_input_tokens_seen": 5491916800, "step": 10475 }, { "epoch": 0.5082413418929322, "grad_norm": 0.2421875, "learning_rate": 2.6384533324286982e-05, "loss": 2.7767, "num_input_tokens_seen": 5494538240, "step": 10480 }, { "epoch": 0.5084838234491789, "grad_norm": 0.2412109375, "learning_rate": 2.636451672952594e-05, "loss": 2.7725, "num_input_tokens_seen": 5497159680, "step": 10485 }, { "epoch": 0.5087263050054255, "grad_norm": 0.251953125, "learning_rate": 2.63444992573721e-05, "loss": 2.7672, "num_input_tokens_seen": 5499781120, "step": 10490 }, { "epoch": 0.5089687865616721, "grad_norm": 0.259765625, "learning_rate": 2.632448092069683e-05, "loss": 2.7843, "num_input_tokens_seen": 5502402560, "step": 10495 }, { "epoch": 0.5092112681179188, "grad_norm": 0.2490234375, "learning_rate": 2.6304461732372047e-05, "loss": 2.781, "num_input_tokens_seen": 5505024000, "step": 10500 }, { "epoch": 0.5092112681179188, "eval_accuracy": 0.45571568148510017, "eval_loss": 2.7428910732269287, "eval_runtime": 6.7158, "eval_samples_per_second": 44.671, "eval_steps_per_second": 5.658, "num_input_tokens_seen": 5505024000, "step": 10500 }, { "epoch": 0.5094537496741655, "grad_norm": 0.251953125, "learning_rate": 2.62844417052702e-05, "loss": 2.786, "num_input_tokens_seen": 5507645440, "step": 10505 }, { "epoch": 0.5096962312304121, "grad_norm": 0.25, "learning_rate": 2.6264420852264297e-05, "loss": 2.7865, "num_input_tokens_seen": 5510266880, "step": 10510 }, { "epoch": 0.5099387127866587, "grad_norm": 0.2470703125, "learning_rate": 2.624439918622789e-05, "loss": 2.7763, "num_input_tokens_seen": 5512888320, "step": 10515 }, { "epoch": 0.5101811943429053, "grad_norm": 0.244140625, "learning_rate": 2.6224376720035014e-05, "loss": 2.7807, "num_input_tokens_seen": 5515509760, "step": 10520 }, { "epoch": 0.5104236758991519, "grad_norm": 0.2578125, "learning_rate": 2.620435346656025e-05, "loss": 2.7708, "num_input_tokens_seen": 5518131200, "step": 10525 }, { "epoch": 0.5106661574553986, "grad_norm": 0.265625, "learning_rate": 2.6184329438678677e-05, "loss": 2.7785, "num_input_tokens_seen": 5520752640, "step": 10530 }, { "epoch": 0.5109086390116452, "grad_norm": 0.251953125, "learning_rate": 2.6164304649265852e-05, "loss": 2.789, "num_input_tokens_seen": 5523374080, "step": 10535 }, { "epoch": 0.5111511205678918, "grad_norm": 0.2451171875, "learning_rate": 2.614427911119786e-05, "loss": 2.7725, "num_input_tokens_seen": 5525995520, "step": 10540 }, { "epoch": 0.5113936021241384, "grad_norm": 0.24609375, "learning_rate": 2.6124252837351247e-05, "loss": 2.7703, "num_input_tokens_seen": 5528616960, "step": 10545 }, { "epoch": 0.511636083680385, "grad_norm": 0.2412109375, "learning_rate": 2.6104225840603024e-05, "loss": 2.7767, "num_input_tokens_seen": 5531238400, "step": 10550 }, { "epoch": 0.5118785652366317, "grad_norm": 0.24609375, "learning_rate": 2.608419813383067e-05, "loss": 2.7687, "num_input_tokens_seen": 5533859840, "step": 10555 }, { "epoch": 0.5121210467928783, "grad_norm": 0.251953125, "learning_rate": 2.6064169729912145e-05, "loss": 2.777, "num_input_tokens_seen": 5536481280, "step": 10560 }, { "epoch": 0.5123635283491249, "grad_norm": 0.2353515625, "learning_rate": 2.604414064172581e-05, "loss": 2.7811, "num_input_tokens_seen": 5539102720, "step": 10565 }, { "epoch": 0.5126060099053715, "grad_norm": 0.25, "learning_rate": 2.602411088215052e-05, "loss": 2.7831, "num_input_tokens_seen": 5541724160, "step": 10570 }, { "epoch": 0.5128484914616182, "grad_norm": 0.251953125, "learning_rate": 2.6004080464065532e-05, "loss": 2.7783, "num_input_tokens_seen": 5544345600, "step": 10575 }, { "epoch": 0.5130909730178649, "grad_norm": 0.26171875, "learning_rate": 2.598404940035052e-05, "loss": 2.7667, "num_input_tokens_seen": 5546967040, "step": 10580 }, { "epoch": 0.5133334545741115, "grad_norm": 0.251953125, "learning_rate": 2.5964017703885586e-05, "loss": 2.7711, "num_input_tokens_seen": 5549588480, "step": 10585 }, { "epoch": 0.5135759361303581, "grad_norm": 0.2431640625, "learning_rate": 2.5943985387551234e-05, "loss": 2.7631, "num_input_tokens_seen": 5552209920, "step": 10590 }, { "epoch": 0.5138184176866047, "grad_norm": 0.2470703125, "learning_rate": 2.5923952464228373e-05, "loss": 2.7706, "num_input_tokens_seen": 5554831360, "step": 10595 }, { "epoch": 0.5140608992428514, "grad_norm": 0.248046875, "learning_rate": 2.59039189467983e-05, "loss": 2.7733, "num_input_tokens_seen": 5557452800, "step": 10600 }, { "epoch": 0.514303380799098, "grad_norm": 0.2578125, "learning_rate": 2.5883884848142693e-05, "loss": 2.7746, "num_input_tokens_seen": 5560074240, "step": 10605 }, { "epoch": 0.5145458623553446, "grad_norm": 0.255859375, "learning_rate": 2.586385018114359e-05, "loss": 2.7857, "num_input_tokens_seen": 5562695680, "step": 10610 }, { "epoch": 0.5147883439115912, "grad_norm": 0.248046875, "learning_rate": 2.5843814958683423e-05, "loss": 2.7802, "num_input_tokens_seen": 5565317120, "step": 10615 }, { "epoch": 0.5150308254678378, "grad_norm": 0.24609375, "learning_rate": 2.5823779193644953e-05, "loss": 2.7669, "num_input_tokens_seen": 5567938560, "step": 10620 }, { "epoch": 0.5152733070240845, "grad_norm": 0.26171875, "learning_rate": 2.5803742898911316e-05, "loss": 2.782, "num_input_tokens_seen": 5570560000, "step": 10625 }, { "epoch": 0.5155157885803311, "grad_norm": 0.2470703125, "learning_rate": 2.578370608736596e-05, "loss": 2.7886, "num_input_tokens_seen": 5573181440, "step": 10630 }, { "epoch": 0.5157582701365777, "grad_norm": 0.2470703125, "learning_rate": 2.576366877189269e-05, "loss": 2.7732, "num_input_tokens_seen": 5575802880, "step": 10635 }, { "epoch": 0.5160007516928243, "grad_norm": 0.2470703125, "learning_rate": 2.5743630965375624e-05, "loss": 2.7792, "num_input_tokens_seen": 5578424320, "step": 10640 }, { "epoch": 0.516243233249071, "grad_norm": 0.2451171875, "learning_rate": 2.5723592680699194e-05, "loss": 2.7736, "num_input_tokens_seen": 5581045760, "step": 10645 }, { "epoch": 0.5164857148053176, "grad_norm": 0.24609375, "learning_rate": 2.5703553930748138e-05, "loss": 2.7993, "num_input_tokens_seen": 5583667200, "step": 10650 }, { "epoch": 0.5167281963615642, "grad_norm": 0.248046875, "learning_rate": 2.5683514728407505e-05, "loss": 2.7852, "num_input_tokens_seen": 5586288640, "step": 10655 }, { "epoch": 0.5169706779178109, "grad_norm": 0.24609375, "learning_rate": 2.5663475086562628e-05, "loss": 2.7788, "num_input_tokens_seen": 5588910080, "step": 10660 }, { "epoch": 0.5172131594740575, "grad_norm": 0.248046875, "learning_rate": 2.564343501809912e-05, "loss": 2.775, "num_input_tokens_seen": 5591531520, "step": 10665 }, { "epoch": 0.5174556410303042, "grad_norm": 0.251953125, "learning_rate": 2.562339453590287e-05, "loss": 2.7701, "num_input_tokens_seen": 5594152960, "step": 10670 }, { "epoch": 0.5176981225865508, "grad_norm": 0.2451171875, "learning_rate": 2.5603353652860034e-05, "loss": 2.7722, "num_input_tokens_seen": 5596774400, "step": 10675 }, { "epoch": 0.5179406041427974, "grad_norm": 0.2431640625, "learning_rate": 2.5583312381857017e-05, "loss": 2.7631, "num_input_tokens_seen": 5599395840, "step": 10680 }, { "epoch": 0.518183085699044, "grad_norm": 0.2412109375, "learning_rate": 2.5563270735780504e-05, "loss": 2.7686, "num_input_tokens_seen": 5602017280, "step": 10685 }, { "epoch": 0.5184255672552907, "grad_norm": 0.259765625, "learning_rate": 2.554322872751737e-05, "loss": 2.7939, "num_input_tokens_seen": 5604638720, "step": 10690 }, { "epoch": 0.5186680488115373, "grad_norm": 0.265625, "learning_rate": 2.5523186369954777e-05, "loss": 2.7759, "num_input_tokens_seen": 5607260160, "step": 10695 }, { "epoch": 0.5189105303677839, "grad_norm": 0.2490234375, "learning_rate": 2.5503143675980072e-05, "loss": 2.7725, "num_input_tokens_seen": 5609881600, "step": 10700 }, { "epoch": 0.5191530119240305, "grad_norm": 0.25, "learning_rate": 2.5483100658480828e-05, "loss": 2.7973, "num_input_tokens_seen": 5612503040, "step": 10705 }, { "epoch": 0.5193954934802771, "grad_norm": 0.25, "learning_rate": 2.546305733034484e-05, "loss": 2.7867, "num_input_tokens_seen": 5615124480, "step": 10710 }, { "epoch": 0.5196379750365238, "grad_norm": 0.2578125, "learning_rate": 2.54430137044601e-05, "loss": 2.767, "num_input_tokens_seen": 5617745920, "step": 10715 }, { "epoch": 0.5198804565927704, "grad_norm": 0.25390625, "learning_rate": 2.5422969793714773e-05, "loss": 2.7687, "num_input_tokens_seen": 5620367360, "step": 10720 }, { "epoch": 0.520122938149017, "grad_norm": 0.2451171875, "learning_rate": 2.540292561099722e-05, "loss": 2.7716, "num_input_tokens_seen": 5622988800, "step": 10725 }, { "epoch": 0.5203654197052636, "grad_norm": 0.25390625, "learning_rate": 2.5382881169195982e-05, "loss": 2.7808, "num_input_tokens_seen": 5625610240, "step": 10730 }, { "epoch": 0.5206079012615102, "grad_norm": 0.2470703125, "learning_rate": 2.5362836481199752e-05, "loss": 2.7769, "num_input_tokens_seen": 5628231680, "step": 10735 }, { "epoch": 0.520850382817757, "grad_norm": 0.251953125, "learning_rate": 2.53427915598974e-05, "loss": 2.7741, "num_input_tokens_seen": 5630853120, "step": 10740 }, { "epoch": 0.5210928643740036, "grad_norm": 0.251953125, "learning_rate": 2.532274641817793e-05, "loss": 2.777, "num_input_tokens_seen": 5633474560, "step": 10745 }, { "epoch": 0.5213353459302502, "grad_norm": 0.2490234375, "learning_rate": 2.530270106893049e-05, "loss": 2.7766, "num_input_tokens_seen": 5636096000, "step": 10750 }, { "epoch": 0.5215778274864968, "grad_norm": 0.24609375, "learning_rate": 2.5282655525044375e-05, "loss": 2.771, "num_input_tokens_seen": 5638717440, "step": 10755 }, { "epoch": 0.5218203090427435, "grad_norm": 0.255859375, "learning_rate": 2.5262609799408983e-05, "loss": 2.7734, "num_input_tokens_seen": 5641338880, "step": 10760 }, { "epoch": 0.5220627905989901, "grad_norm": 0.2431640625, "learning_rate": 2.5242563904913846e-05, "loss": 2.7722, "num_input_tokens_seen": 5643960320, "step": 10765 }, { "epoch": 0.5223052721552367, "grad_norm": 0.25, "learning_rate": 2.5222517854448603e-05, "loss": 2.7821, "num_input_tokens_seen": 5646581760, "step": 10770 }, { "epoch": 0.5225477537114833, "grad_norm": 0.255859375, "learning_rate": 2.5202471660902992e-05, "loss": 2.7856, "num_input_tokens_seen": 5649203200, "step": 10775 }, { "epoch": 0.5227902352677299, "grad_norm": 0.244140625, "learning_rate": 2.518242533716683e-05, "loss": 2.7788, "num_input_tokens_seen": 5651824640, "step": 10780 }, { "epoch": 0.5230327168239766, "grad_norm": 0.2578125, "learning_rate": 2.516237889613004e-05, "loss": 2.7902, "num_input_tokens_seen": 5654446080, "step": 10785 }, { "epoch": 0.5232751983802232, "grad_norm": 0.25390625, "learning_rate": 2.51423323506826e-05, "loss": 2.7657, "num_input_tokens_seen": 5657067520, "step": 10790 }, { "epoch": 0.5235176799364698, "grad_norm": 0.2451171875, "learning_rate": 2.5122285713714573e-05, "loss": 2.7771, "num_input_tokens_seen": 5659688960, "step": 10795 }, { "epoch": 0.5237601614927164, "grad_norm": 0.2451171875, "learning_rate": 2.510223899811606e-05, "loss": 2.7757, "num_input_tokens_seen": 5662310400, "step": 10800 }, { "epoch": 0.5237601614927164, "eval_accuracy": 0.4557319654779352, "eval_loss": 2.74280047416687, "eval_runtime": 6.1505, "eval_samples_per_second": 48.777, "eval_steps_per_second": 6.178, "num_input_tokens_seen": 5662310400, "step": 10800 }, { "epoch": 0.524002643048963, "grad_norm": 0.2412109375, "learning_rate": 2.5082192216777232e-05, "loss": 2.8038, "num_input_tokens_seen": 5664931840, "step": 10805 }, { "epoch": 0.5242451246052097, "grad_norm": 0.251953125, "learning_rate": 2.5062145382588304e-05, "loss": 2.7682, "num_input_tokens_seen": 5667553280, "step": 10810 }, { "epoch": 0.5244876061614564, "grad_norm": 0.255859375, "learning_rate": 2.50420985084395e-05, "loss": 2.781, "num_input_tokens_seen": 5670174720, "step": 10815 }, { "epoch": 0.524730087717703, "grad_norm": 0.2451171875, "learning_rate": 2.5022051607221086e-05, "loss": 2.7693, "num_input_tokens_seen": 5672796160, "step": 10820 }, { "epoch": 0.5249725692739496, "grad_norm": 0.255859375, "learning_rate": 2.500200469182336e-05, "loss": 2.778, "num_input_tokens_seen": 5675417600, "step": 10825 }, { "epoch": 0.5252150508301963, "grad_norm": 0.263671875, "learning_rate": 2.4981957775136602e-05, "loss": 2.7754, "num_input_tokens_seen": 5678039040, "step": 10830 }, { "epoch": 0.5254575323864429, "grad_norm": 0.2470703125, "learning_rate": 2.4961910870051105e-05, "loss": 2.7791, "num_input_tokens_seen": 5680660480, "step": 10835 }, { "epoch": 0.5257000139426895, "grad_norm": 0.2578125, "learning_rate": 2.4941863989457158e-05, "loss": 2.7608, "num_input_tokens_seen": 5683281920, "step": 10840 }, { "epoch": 0.5259424954989361, "grad_norm": 0.2431640625, "learning_rate": 2.4921817146245035e-05, "loss": 2.7732, "num_input_tokens_seen": 5685903360, "step": 10845 }, { "epoch": 0.5261849770551827, "grad_norm": 0.248046875, "learning_rate": 2.490177035330497e-05, "loss": 2.7663, "num_input_tokens_seen": 5688524800, "step": 10850 }, { "epoch": 0.5264274586114294, "grad_norm": 0.25, "learning_rate": 2.4881723623527182e-05, "loss": 2.7741, "num_input_tokens_seen": 5691146240, "step": 10855 }, { "epoch": 0.526669940167676, "grad_norm": 0.25390625, "learning_rate": 2.4861676969801846e-05, "loss": 2.7632, "num_input_tokens_seen": 5693767680, "step": 10860 }, { "epoch": 0.5269124217239226, "grad_norm": 0.2470703125, "learning_rate": 2.484163040501908e-05, "loss": 2.7633, "num_input_tokens_seen": 5696389120, "step": 10865 }, { "epoch": 0.5271549032801692, "grad_norm": 0.2451171875, "learning_rate": 2.4821583942068956e-05, "loss": 2.7789, "num_input_tokens_seen": 5699010560, "step": 10870 }, { "epoch": 0.5273973848364159, "grad_norm": 0.2451171875, "learning_rate": 2.4801537593841475e-05, "loss": 2.7693, "num_input_tokens_seen": 5701632000, "step": 10875 }, { "epoch": 0.5276398663926625, "grad_norm": 0.25, "learning_rate": 2.4781491373226568e-05, "loss": 2.7647, "num_input_tokens_seen": 5704253440, "step": 10880 }, { "epoch": 0.5278823479489091, "grad_norm": 0.255859375, "learning_rate": 2.476144529311407e-05, "loss": 2.7807, "num_input_tokens_seen": 5706874880, "step": 10885 }, { "epoch": 0.5281248295051557, "grad_norm": 0.24609375, "learning_rate": 2.4741399366393738e-05, "loss": 2.7813, "num_input_tokens_seen": 5709496320, "step": 10890 }, { "epoch": 0.5283673110614024, "grad_norm": 0.251953125, "learning_rate": 2.4721353605955232e-05, "loss": 2.767, "num_input_tokens_seen": 5712117760, "step": 10895 }, { "epoch": 0.5286097926176491, "grad_norm": 0.251953125, "learning_rate": 2.4701308024688102e-05, "loss": 2.7709, "num_input_tokens_seen": 5714739200, "step": 10900 }, { "epoch": 0.5288522741738957, "grad_norm": 0.25390625, "learning_rate": 2.468126263548178e-05, "loss": 2.7725, "num_input_tokens_seen": 5717360640, "step": 10905 }, { "epoch": 0.5290947557301423, "grad_norm": 0.2451171875, "learning_rate": 2.4661217451225585e-05, "loss": 2.7726, "num_input_tokens_seen": 5719982080, "step": 10910 }, { "epoch": 0.5293372372863889, "grad_norm": 0.2412109375, "learning_rate": 2.464117248480868e-05, "loss": 2.7946, "num_input_tokens_seen": 5722603520, "step": 10915 }, { "epoch": 0.5295797188426355, "grad_norm": 0.2470703125, "learning_rate": 2.4621127749120114e-05, "loss": 2.781, "num_input_tokens_seen": 5725224960, "step": 10920 }, { "epoch": 0.5298222003988822, "grad_norm": 0.25, "learning_rate": 2.4601083257048774e-05, "loss": 2.7655, "num_input_tokens_seen": 5727846400, "step": 10925 }, { "epoch": 0.5300646819551288, "grad_norm": 0.2412109375, "learning_rate": 2.4581039021483396e-05, "loss": 2.7875, "num_input_tokens_seen": 5730467840, "step": 10930 }, { "epoch": 0.5303071635113754, "grad_norm": 0.240234375, "learning_rate": 2.4560995055312546e-05, "loss": 2.7598, "num_input_tokens_seen": 5733089280, "step": 10935 }, { "epoch": 0.530549645067622, "grad_norm": 0.251953125, "learning_rate": 2.4540951371424632e-05, "loss": 2.7835, "num_input_tokens_seen": 5735710720, "step": 10940 }, { "epoch": 0.5307921266238687, "grad_norm": 0.2490234375, "learning_rate": 2.452090798270785e-05, "loss": 2.7637, "num_input_tokens_seen": 5738332160, "step": 10945 }, { "epoch": 0.5310346081801153, "grad_norm": 0.251953125, "learning_rate": 2.450086490205023e-05, "loss": 2.7786, "num_input_tokens_seen": 5740953600, "step": 10950 }, { "epoch": 0.5312770897363619, "grad_norm": 0.251953125, "learning_rate": 2.4480822142339606e-05, "loss": 2.7929, "num_input_tokens_seen": 5743575040, "step": 10955 }, { "epoch": 0.5315195712926085, "grad_norm": 0.2421875, "learning_rate": 2.4460779716463585e-05, "loss": 2.7764, "num_input_tokens_seen": 5746196480, "step": 10960 }, { "epoch": 0.5317620528488551, "grad_norm": 0.2451171875, "learning_rate": 2.444073763730958e-05, "loss": 2.771, "num_input_tokens_seen": 5748817920, "step": 10965 }, { "epoch": 0.5320045344051018, "grad_norm": 0.244140625, "learning_rate": 2.4420695917764787e-05, "loss": 2.7694, "num_input_tokens_seen": 5751439360, "step": 10970 }, { "epoch": 0.5322470159613485, "grad_norm": 0.244140625, "learning_rate": 2.4400654570716132e-05, "loss": 2.79, "num_input_tokens_seen": 5754060800, "step": 10975 }, { "epoch": 0.5324894975175951, "grad_norm": 0.25, "learning_rate": 2.438061360905034e-05, "loss": 2.7789, "num_input_tokens_seen": 5756682240, "step": 10980 }, { "epoch": 0.5327319790738417, "grad_norm": 0.2431640625, "learning_rate": 2.436057304565387e-05, "loss": 2.7824, "num_input_tokens_seen": 5759303680, "step": 10985 }, { "epoch": 0.5329744606300884, "grad_norm": 0.248046875, "learning_rate": 2.4340532893412927e-05, "loss": 2.767, "num_input_tokens_seen": 5761925120, "step": 10990 }, { "epoch": 0.533216942186335, "grad_norm": 0.2490234375, "learning_rate": 2.4320493165213464e-05, "loss": 2.7737, "num_input_tokens_seen": 5764546560, "step": 10995 }, { "epoch": 0.5334594237425816, "grad_norm": 0.2451171875, "learning_rate": 2.4300453873941158e-05, "loss": 2.7691, "num_input_tokens_seen": 5767168000, "step": 11000 }, { "epoch": 0.5337019052988282, "grad_norm": 0.2451171875, "learning_rate": 2.428041503248138e-05, "loss": 2.7648, "num_input_tokens_seen": 5769789440, "step": 11005 }, { "epoch": 0.5339443868550748, "grad_norm": 0.255859375, "learning_rate": 2.4260376653719232e-05, "loss": 2.7915, "num_input_tokens_seen": 5772410880, "step": 11010 }, { "epoch": 0.5341868684113215, "grad_norm": 0.2431640625, "learning_rate": 2.4240338750539526e-05, "loss": 2.7746, "num_input_tokens_seen": 5775032320, "step": 11015 }, { "epoch": 0.5344293499675681, "grad_norm": 0.2470703125, "learning_rate": 2.422030133582675e-05, "loss": 2.7697, "num_input_tokens_seen": 5777653760, "step": 11020 }, { "epoch": 0.5346718315238147, "grad_norm": 0.2431640625, "learning_rate": 2.4200264422465097e-05, "loss": 2.7823, "num_input_tokens_seen": 5780275200, "step": 11025 }, { "epoch": 0.5349143130800613, "grad_norm": 0.2490234375, "learning_rate": 2.4180228023338423e-05, "loss": 2.77, "num_input_tokens_seen": 5782896640, "step": 11030 }, { "epoch": 0.5351567946363079, "grad_norm": 0.251953125, "learning_rate": 2.4160192151330274e-05, "loss": 2.7643, "num_input_tokens_seen": 5785518080, "step": 11035 }, { "epoch": 0.5353992761925546, "grad_norm": 0.240234375, "learning_rate": 2.4140156819323812e-05, "loss": 2.7711, "num_input_tokens_seen": 5788139520, "step": 11040 }, { "epoch": 0.5356417577488012, "grad_norm": 0.2470703125, "learning_rate": 2.4120122040201888e-05, "loss": 2.7834, "num_input_tokens_seen": 5790760960, "step": 11045 }, { "epoch": 0.5358842393050478, "grad_norm": 0.2412109375, "learning_rate": 2.4100087826847e-05, "loss": 2.7871, "num_input_tokens_seen": 5793382400, "step": 11050 }, { "epoch": 0.5361267208612945, "grad_norm": 0.26171875, "learning_rate": 2.4080054192141273e-05, "loss": 2.7802, "num_input_tokens_seen": 5796003840, "step": 11055 }, { "epoch": 0.5363692024175412, "grad_norm": 0.2431640625, "learning_rate": 2.4060021148966455e-05, "loss": 2.7675, "num_input_tokens_seen": 5798625280, "step": 11060 }, { "epoch": 0.5366116839737878, "grad_norm": 0.2421875, "learning_rate": 2.4039988710203927e-05, "loss": 2.7878, "num_input_tokens_seen": 5801246720, "step": 11065 }, { "epoch": 0.5368541655300344, "grad_norm": 0.244140625, "learning_rate": 2.4019956888734653e-05, "loss": 2.7701, "num_input_tokens_seen": 5803868160, "step": 11070 }, { "epoch": 0.537096647086281, "grad_norm": 0.2412109375, "learning_rate": 2.399992569743923e-05, "loss": 2.7742, "num_input_tokens_seen": 5806489600, "step": 11075 }, { "epoch": 0.5373391286425276, "grad_norm": 0.244140625, "learning_rate": 2.3979895149197844e-05, "loss": 2.7734, "num_input_tokens_seen": 5809111040, "step": 11080 }, { "epoch": 0.5375816101987743, "grad_norm": 0.2490234375, "learning_rate": 2.3959865256890258e-05, "loss": 2.7798, "num_input_tokens_seen": 5811732480, "step": 11085 }, { "epoch": 0.5378240917550209, "grad_norm": 0.244140625, "learning_rate": 2.3939836033395818e-05, "loss": 2.7678, "num_input_tokens_seen": 5814353920, "step": 11090 }, { "epoch": 0.5380665733112675, "grad_norm": 0.25390625, "learning_rate": 2.391980749159345e-05, "loss": 2.7885, "num_input_tokens_seen": 5816975360, "step": 11095 }, { "epoch": 0.5383090548675141, "grad_norm": 0.251953125, "learning_rate": 2.3899779644361612e-05, "loss": 2.779, "num_input_tokens_seen": 5819596800, "step": 11100 }, { "epoch": 0.5383090548675141, "eval_accuracy": 0.4559078326005537, "eval_loss": 2.7425777912139893, "eval_runtime": 6.3102, "eval_samples_per_second": 47.542, "eval_steps_per_second": 6.022, "num_input_tokens_seen": 5819596800, "step": 11100 }, { "epoch": 0.5385515364237607, "grad_norm": 0.2412109375, "learning_rate": 2.3879752504578347e-05, "loss": 2.7573, "num_input_tokens_seen": 5822218240, "step": 11105 }, { "epoch": 0.5387940179800074, "grad_norm": 0.2578125, "learning_rate": 2.385972608512123e-05, "loss": 2.781, "num_input_tokens_seen": 5824839680, "step": 11110 }, { "epoch": 0.539036499536254, "grad_norm": 0.244140625, "learning_rate": 2.3839700398867372e-05, "loss": 2.7713, "num_input_tokens_seen": 5827461120, "step": 11115 }, { "epoch": 0.5392789810925006, "grad_norm": 0.248046875, "learning_rate": 2.3819675458693423e-05, "loss": 2.7705, "num_input_tokens_seen": 5830082560, "step": 11120 }, { "epoch": 0.5395214626487472, "grad_norm": 0.26171875, "learning_rate": 2.3799651277475537e-05, "loss": 2.7725, "num_input_tokens_seen": 5832704000, "step": 11125 }, { "epoch": 0.539763944204994, "grad_norm": 0.2470703125, "learning_rate": 2.3779627868089386e-05, "loss": 2.7977, "num_input_tokens_seen": 5835325440, "step": 11130 }, { "epoch": 0.5400064257612406, "grad_norm": 0.25390625, "learning_rate": 2.375960524341015e-05, "loss": 2.7826, "num_input_tokens_seen": 5837946880, "step": 11135 }, { "epoch": 0.5402489073174872, "grad_norm": 0.248046875, "learning_rate": 2.37395834163125e-05, "loss": 2.7648, "num_input_tokens_seen": 5840568320, "step": 11140 }, { "epoch": 0.5404913888737338, "grad_norm": 0.2392578125, "learning_rate": 2.3719562399670604e-05, "loss": 2.7645, "num_input_tokens_seen": 5843189760, "step": 11145 }, { "epoch": 0.5407338704299804, "grad_norm": 0.25, "learning_rate": 2.369954220635809e-05, "loss": 2.7695, "num_input_tokens_seen": 5845811200, "step": 11150 }, { "epoch": 0.5409763519862271, "grad_norm": 0.2490234375, "learning_rate": 2.367952284924808e-05, "loss": 2.7828, "num_input_tokens_seen": 5848432640, "step": 11155 }, { "epoch": 0.5412188335424737, "grad_norm": 0.24609375, "learning_rate": 2.365950434121314e-05, "loss": 2.7753, "num_input_tokens_seen": 5851054080, "step": 11160 }, { "epoch": 0.5414613150987203, "grad_norm": 0.2451171875, "learning_rate": 2.3639486695125284e-05, "loss": 2.7763, "num_input_tokens_seen": 5853675520, "step": 11165 }, { "epoch": 0.5417037966549669, "grad_norm": 0.25, "learning_rate": 2.3619469923856e-05, "loss": 2.7781, "num_input_tokens_seen": 5856296960, "step": 11170 }, { "epoch": 0.5419462782112136, "grad_norm": 0.2412109375, "learning_rate": 2.359945404027619e-05, "loss": 2.7736, "num_input_tokens_seen": 5858918400, "step": 11175 }, { "epoch": 0.5421887597674602, "grad_norm": 0.23828125, "learning_rate": 2.3579439057256198e-05, "loss": 2.784, "num_input_tokens_seen": 5861539840, "step": 11180 }, { "epoch": 0.5424312413237068, "grad_norm": 0.2470703125, "learning_rate": 2.355942498766578e-05, "loss": 2.7725, "num_input_tokens_seen": 5864161280, "step": 11185 }, { "epoch": 0.5426737228799534, "grad_norm": 0.240234375, "learning_rate": 2.3539411844374104e-05, "loss": 2.7762, "num_input_tokens_seen": 5866782720, "step": 11190 }, { "epoch": 0.5429162044362, "grad_norm": 0.2490234375, "learning_rate": 2.351939964024975e-05, "loss": 2.7915, "num_input_tokens_seen": 5869404160, "step": 11195 }, { "epoch": 0.5431586859924467, "grad_norm": 0.240234375, "learning_rate": 2.3499388388160686e-05, "loss": 2.7741, "num_input_tokens_seen": 5872025600, "step": 11200 }, { "epoch": 0.5434011675486933, "grad_norm": 0.255859375, "learning_rate": 2.3479378100974277e-05, "loss": 2.7715, "num_input_tokens_seen": 5874647040, "step": 11205 }, { "epoch": 0.54364364910494, "grad_norm": 0.255859375, "learning_rate": 2.3459368791557267e-05, "loss": 2.7739, "num_input_tokens_seen": 5877268480, "step": 11210 }, { "epoch": 0.5438861306611866, "grad_norm": 0.255859375, "learning_rate": 2.3439360472775758e-05, "loss": 2.7784, "num_input_tokens_seen": 5879889920, "step": 11215 }, { "epoch": 0.5441286122174332, "grad_norm": 0.2412109375, "learning_rate": 2.3419353157495236e-05, "loss": 2.7808, "num_input_tokens_seen": 5882511360, "step": 11220 }, { "epoch": 0.5443710937736799, "grad_norm": 0.2470703125, "learning_rate": 2.3399346858580514e-05, "loss": 2.762, "num_input_tokens_seen": 5885132800, "step": 11225 }, { "epoch": 0.5446135753299265, "grad_norm": 0.251953125, "learning_rate": 2.3379341588895778e-05, "loss": 2.7911, "num_input_tokens_seen": 5887754240, "step": 11230 }, { "epoch": 0.5448560568861731, "grad_norm": 0.251953125, "learning_rate": 2.3359337361304536e-05, "loss": 2.7616, "num_input_tokens_seen": 5890375680, "step": 11235 }, { "epoch": 0.5450985384424197, "grad_norm": 0.25, "learning_rate": 2.3339334188669638e-05, "loss": 2.7732, "num_input_tokens_seen": 5892997120, "step": 11240 }, { "epoch": 0.5453410199986664, "grad_norm": 0.25390625, "learning_rate": 2.3319332083853246e-05, "loss": 2.7783, "num_input_tokens_seen": 5895618560, "step": 11245 }, { "epoch": 0.545583501554913, "grad_norm": 0.248046875, "learning_rate": 2.3299331059716843e-05, "loss": 2.7717, "num_input_tokens_seen": 5898240000, "step": 11250 }, { "epoch": 0.5458259831111596, "grad_norm": 0.25390625, "learning_rate": 2.3279331129121202e-05, "loss": 2.7611, "num_input_tokens_seen": 5900861440, "step": 11255 }, { "epoch": 0.5460684646674062, "grad_norm": 0.2431640625, "learning_rate": 2.325933230492641e-05, "loss": 2.7695, "num_input_tokens_seen": 5903482880, "step": 11260 }, { "epoch": 0.5463109462236528, "grad_norm": 0.2470703125, "learning_rate": 2.3239334599991833e-05, "loss": 2.7831, "num_input_tokens_seen": 5906104320, "step": 11265 }, { "epoch": 0.5465534277798995, "grad_norm": 0.251953125, "learning_rate": 2.3219338027176124e-05, "loss": 2.7846, "num_input_tokens_seen": 5908725760, "step": 11270 }, { "epoch": 0.5467959093361461, "grad_norm": 0.24609375, "learning_rate": 2.31993425993372e-05, "loss": 2.7985, "num_input_tokens_seen": 5911347200, "step": 11275 }, { "epoch": 0.5470383908923927, "grad_norm": 0.25, "learning_rate": 2.3179348329332266e-05, "loss": 2.7801, "num_input_tokens_seen": 5913968640, "step": 11280 }, { "epoch": 0.5472808724486393, "grad_norm": 0.2470703125, "learning_rate": 2.315935523001773e-05, "loss": 2.7719, "num_input_tokens_seen": 5916590080, "step": 11285 }, { "epoch": 0.547523354004886, "grad_norm": 0.248046875, "learning_rate": 2.3139363314249304e-05, "loss": 2.7801, "num_input_tokens_seen": 5919211520, "step": 11290 }, { "epoch": 0.5477658355611327, "grad_norm": 0.2451171875, "learning_rate": 2.3119372594881902e-05, "loss": 2.7853, "num_input_tokens_seen": 5921832960, "step": 11295 }, { "epoch": 0.5480083171173793, "grad_norm": 0.2451171875, "learning_rate": 2.3099383084769686e-05, "loss": 2.7669, "num_input_tokens_seen": 5924454400, "step": 11300 }, { "epoch": 0.5482507986736259, "grad_norm": 0.25, "learning_rate": 2.3079394796766036e-05, "loss": 2.786, "num_input_tokens_seen": 5927075840, "step": 11305 }, { "epoch": 0.5484932802298725, "grad_norm": 0.2490234375, "learning_rate": 2.305940774372356e-05, "loss": 2.7816, "num_input_tokens_seen": 5929697280, "step": 11310 }, { "epoch": 0.5487357617861192, "grad_norm": 0.24609375, "learning_rate": 2.3039421938494036e-05, "loss": 2.7675, "num_input_tokens_seen": 5932318720, "step": 11315 }, { "epoch": 0.5489782433423658, "grad_norm": 0.2412109375, "learning_rate": 2.3019437393928467e-05, "loss": 2.7605, "num_input_tokens_seen": 5934940160, "step": 11320 }, { "epoch": 0.5492207248986124, "grad_norm": 0.2490234375, "learning_rate": 2.2999454122877047e-05, "loss": 2.7724, "num_input_tokens_seen": 5937561600, "step": 11325 }, { "epoch": 0.549463206454859, "grad_norm": 0.25, "learning_rate": 2.297947213818914e-05, "loss": 2.7615, "num_input_tokens_seen": 5940183040, "step": 11330 }, { "epoch": 0.5497056880111056, "grad_norm": 0.2470703125, "learning_rate": 2.2959491452713287e-05, "loss": 2.7897, "num_input_tokens_seen": 5942804480, "step": 11335 }, { "epoch": 0.5499481695673523, "grad_norm": 0.25390625, "learning_rate": 2.2939512079297208e-05, "loss": 2.7792, "num_input_tokens_seen": 5945425920, "step": 11340 }, { "epoch": 0.5501906511235989, "grad_norm": 0.240234375, "learning_rate": 2.2919534030787743e-05, "loss": 2.7748, "num_input_tokens_seen": 5948047360, "step": 11345 }, { "epoch": 0.5504331326798455, "grad_norm": 0.2451171875, "learning_rate": 2.289955732003091e-05, "loss": 2.765, "num_input_tokens_seen": 5950668800, "step": 11350 }, { "epoch": 0.5506756142360921, "grad_norm": 0.25390625, "learning_rate": 2.2879581959871856e-05, "loss": 2.7712, "num_input_tokens_seen": 5953290240, "step": 11355 }, { "epoch": 0.5509180957923387, "grad_norm": 0.2470703125, "learning_rate": 2.2859607963154872e-05, "loss": 2.769, "num_input_tokens_seen": 5955911680, "step": 11360 }, { "epoch": 0.5511605773485855, "grad_norm": 0.2412109375, "learning_rate": 2.283963534272336e-05, "loss": 2.7719, "num_input_tokens_seen": 5958533120, "step": 11365 }, { "epoch": 0.5514030589048321, "grad_norm": 0.2392578125, "learning_rate": 2.2819664111419835e-05, "loss": 2.7636, "num_input_tokens_seen": 5961154560, "step": 11370 }, { "epoch": 0.5516455404610787, "grad_norm": 0.23828125, "learning_rate": 2.2799694282085937e-05, "loss": 2.7702, "num_input_tokens_seen": 5963776000, "step": 11375 }, { "epoch": 0.5518880220173253, "grad_norm": 0.2431640625, "learning_rate": 2.2779725867562373e-05, "loss": 2.7808, "num_input_tokens_seen": 5966397440, "step": 11380 }, { "epoch": 0.552130503573572, "grad_norm": 0.244140625, "learning_rate": 2.2759758880688958e-05, "loss": 2.765, "num_input_tokens_seen": 5969018880, "step": 11385 }, { "epoch": 0.5523729851298186, "grad_norm": 0.2421875, "learning_rate": 2.2739793334304605e-05, "loss": 2.7826, "num_input_tokens_seen": 5971640320, "step": 11390 }, { "epoch": 0.5526154666860652, "grad_norm": 0.2470703125, "learning_rate": 2.2719829241247277e-05, "loss": 2.7833, "num_input_tokens_seen": 5974261760, "step": 11395 }, { "epoch": 0.5528579482423118, "grad_norm": 0.2490234375, "learning_rate": 2.2699866614354013e-05, "loss": 2.7771, "num_input_tokens_seen": 5976883200, "step": 11400 }, { "epoch": 0.5528579482423118, "eval_accuracy": 0.45590946099983715, "eval_loss": 2.742483377456665, "eval_runtime": 5.8696, "eval_samples_per_second": 51.111, "eval_steps_per_second": 6.474, "num_input_tokens_seen": 5976883200, "step": 11400 }, { "epoch": 0.5531004297985584, "grad_norm": 0.24609375, "learning_rate": 2.2679905466460915e-05, "loss": 2.7636, "num_input_tokens_seen": 5979504640, "step": 11405 }, { "epoch": 0.5533429113548051, "grad_norm": 0.25, "learning_rate": 2.26599458104031e-05, "loss": 2.7702, "num_input_tokens_seen": 5982126080, "step": 11410 }, { "epoch": 0.5535853929110517, "grad_norm": 0.2451171875, "learning_rate": 2.2639987659014775e-05, "loss": 2.7812, "num_input_tokens_seen": 5984747520, "step": 11415 }, { "epoch": 0.5538278744672983, "grad_norm": 0.251953125, "learning_rate": 2.2620031025129145e-05, "loss": 2.7808, "num_input_tokens_seen": 5987368960, "step": 11420 }, { "epoch": 0.5540703560235449, "grad_norm": 0.24609375, "learning_rate": 2.2600075921578463e-05, "loss": 2.7771, "num_input_tokens_seen": 5989990400, "step": 11425 }, { "epoch": 0.5543128375797916, "grad_norm": 0.2490234375, "learning_rate": 2.258012236119397e-05, "loss": 2.7849, "num_input_tokens_seen": 5992611840, "step": 11430 }, { "epoch": 0.5545553191360382, "grad_norm": 0.244140625, "learning_rate": 2.2560170356805947e-05, "loss": 2.7683, "num_input_tokens_seen": 5995233280, "step": 11435 }, { "epoch": 0.5547978006922848, "grad_norm": 0.2392578125, "learning_rate": 2.2540219921243642e-05, "loss": 2.7702, "num_input_tokens_seen": 5997854720, "step": 11440 }, { "epoch": 0.5550402822485315, "grad_norm": 0.2470703125, "learning_rate": 2.2520271067335314e-05, "loss": 2.7836, "num_input_tokens_seen": 6000476160, "step": 11445 }, { "epoch": 0.5552827638047781, "grad_norm": 0.25390625, "learning_rate": 2.2500323807908206e-05, "loss": 2.7707, "num_input_tokens_seen": 6003097600, "step": 11450 }, { "epoch": 0.5555252453610248, "grad_norm": 0.25390625, "learning_rate": 2.2480378155788525e-05, "loss": 2.7719, "num_input_tokens_seen": 6005719040, "step": 11455 }, { "epoch": 0.5557677269172714, "grad_norm": 0.244140625, "learning_rate": 2.2460434123801454e-05, "loss": 2.7837, "num_input_tokens_seen": 6008340480, "step": 11460 }, { "epoch": 0.556010208473518, "grad_norm": 0.25, "learning_rate": 2.2440491724771133e-05, "loss": 2.7644, "num_input_tokens_seen": 6010961920, "step": 11465 }, { "epoch": 0.5562526900297646, "grad_norm": 0.24609375, "learning_rate": 2.242055097152064e-05, "loss": 2.7792, "num_input_tokens_seen": 6013583360, "step": 11470 }, { "epoch": 0.5564951715860113, "grad_norm": 0.251953125, "learning_rate": 2.2400611876872007e-05, "loss": 2.7749, "num_input_tokens_seen": 6016204800, "step": 11475 }, { "epoch": 0.5567376531422579, "grad_norm": 0.2470703125, "learning_rate": 2.2380674453646204e-05, "loss": 2.774, "num_input_tokens_seen": 6018826240, "step": 11480 }, { "epoch": 0.5569801346985045, "grad_norm": 0.255859375, "learning_rate": 2.236073871466311e-05, "loss": 2.7635, "num_input_tokens_seen": 6021447680, "step": 11485 }, { "epoch": 0.5572226162547511, "grad_norm": 0.25, "learning_rate": 2.2340804672741535e-05, "loss": 2.7605, "num_input_tokens_seen": 6024069120, "step": 11490 }, { "epoch": 0.5574650978109977, "grad_norm": 0.25, "learning_rate": 2.2320872340699198e-05, "loss": 2.7857, "num_input_tokens_seen": 6026690560, "step": 11495 }, { "epoch": 0.5577075793672444, "grad_norm": 0.24609375, "learning_rate": 2.23009417313527e-05, "loss": 2.768, "num_input_tokens_seen": 6029312000, "step": 11500 }, { "epoch": 0.557950060923491, "grad_norm": 0.2490234375, "learning_rate": 2.2281012857517553e-05, "loss": 2.7794, "num_input_tokens_seen": 6031933440, "step": 11505 }, { "epoch": 0.5581925424797376, "grad_norm": 0.2490234375, "learning_rate": 2.2261085732008148e-05, "loss": 2.7714, "num_input_tokens_seen": 6034554880, "step": 11510 }, { "epoch": 0.5584350240359842, "grad_norm": 0.2490234375, "learning_rate": 2.2241160367637754e-05, "loss": 2.7871, "num_input_tokens_seen": 6037176320, "step": 11515 }, { "epoch": 0.5586775055922308, "grad_norm": 0.24609375, "learning_rate": 2.22212367772185e-05, "loss": 2.7635, "num_input_tokens_seen": 6039797760, "step": 11520 }, { "epoch": 0.5589199871484776, "grad_norm": 0.251953125, "learning_rate": 2.220131497356139e-05, "loss": 2.7709, "num_input_tokens_seen": 6042419200, "step": 11525 }, { "epoch": 0.5591624687047242, "grad_norm": 0.2412109375, "learning_rate": 2.2181394969476257e-05, "loss": 2.7635, "num_input_tokens_seen": 6045040640, "step": 11530 }, { "epoch": 0.5594049502609708, "grad_norm": 0.251953125, "learning_rate": 2.216147677777179e-05, "loss": 2.7799, "num_input_tokens_seen": 6047662080, "step": 11535 }, { "epoch": 0.5596474318172174, "grad_norm": 0.24609375, "learning_rate": 2.2141560411255515e-05, "loss": 2.7749, "num_input_tokens_seen": 6050283520, "step": 11540 }, { "epoch": 0.559889913373464, "grad_norm": 0.25, "learning_rate": 2.212164588273377e-05, "loss": 2.7767, "num_input_tokens_seen": 6052904960, "step": 11545 }, { "epoch": 0.5601323949297107, "grad_norm": 0.2451171875, "learning_rate": 2.2101733205011737e-05, "loss": 2.793, "num_input_tokens_seen": 6055526400, "step": 11550 }, { "epoch": 0.5603748764859573, "grad_norm": 0.2421875, "learning_rate": 2.2081822390893382e-05, "loss": 2.7892, "num_input_tokens_seen": 6058147840, "step": 11555 }, { "epoch": 0.5606173580422039, "grad_norm": 0.2431640625, "learning_rate": 2.2061913453181494e-05, "loss": 2.7631, "num_input_tokens_seen": 6060769280, "step": 11560 }, { "epoch": 0.5608598395984505, "grad_norm": 0.2431640625, "learning_rate": 2.2042006404677627e-05, "loss": 2.7831, "num_input_tokens_seen": 6063390720, "step": 11565 }, { "epoch": 0.5611023211546972, "grad_norm": 0.251953125, "learning_rate": 2.2022101258182147e-05, "loss": 2.7708, "num_input_tokens_seen": 6066012160, "step": 11570 }, { "epoch": 0.5613448027109438, "grad_norm": 0.25390625, "learning_rate": 2.200219802649419e-05, "loss": 2.7789, "num_input_tokens_seen": 6068633600, "step": 11575 }, { "epoch": 0.5615872842671904, "grad_norm": 0.2490234375, "learning_rate": 2.1982296722411657e-05, "loss": 2.7754, "num_input_tokens_seen": 6071255040, "step": 11580 }, { "epoch": 0.561829765823437, "grad_norm": 0.251953125, "learning_rate": 2.1962397358731206e-05, "loss": 2.7727, "num_input_tokens_seen": 6073876480, "step": 11585 }, { "epoch": 0.5620722473796836, "grad_norm": 0.244140625, "learning_rate": 2.1942499948248264e-05, "loss": 2.7907, "num_input_tokens_seen": 6076497920, "step": 11590 }, { "epoch": 0.5623147289359303, "grad_norm": 0.2431640625, "learning_rate": 2.192260450375698e-05, "loss": 2.7701, "num_input_tokens_seen": 6079119360, "step": 11595 }, { "epoch": 0.5625572104921769, "grad_norm": 0.2431640625, "learning_rate": 2.1902711038050248e-05, "loss": 2.7711, "num_input_tokens_seen": 6081740800, "step": 11600 }, { "epoch": 0.5627996920484236, "grad_norm": 0.2451171875, "learning_rate": 2.1882819563919695e-05, "loss": 2.7881, "num_input_tokens_seen": 6084362240, "step": 11605 }, { "epoch": 0.5630421736046702, "grad_norm": 0.236328125, "learning_rate": 2.1862930094155666e-05, "loss": 2.783, "num_input_tokens_seen": 6086983680, "step": 11610 }, { "epoch": 0.5632846551609169, "grad_norm": 0.2431640625, "learning_rate": 2.1843042641547205e-05, "loss": 2.7647, "num_input_tokens_seen": 6089605120, "step": 11615 }, { "epoch": 0.5635271367171635, "grad_norm": 0.23828125, "learning_rate": 2.1823157218882096e-05, "loss": 2.7783, "num_input_tokens_seen": 6092226560, "step": 11620 }, { "epoch": 0.5637696182734101, "grad_norm": 0.2431640625, "learning_rate": 2.1803273838946755e-05, "loss": 2.7705, "num_input_tokens_seen": 6094848000, "step": 11625 }, { "epoch": 0.5640120998296567, "grad_norm": 0.2490234375, "learning_rate": 2.1783392514526336e-05, "loss": 2.7722, "num_input_tokens_seen": 6097469440, "step": 11630 }, { "epoch": 0.5642545813859033, "grad_norm": 0.2470703125, "learning_rate": 2.176351325840465e-05, "loss": 2.7664, "num_input_tokens_seen": 6100090880, "step": 11635 }, { "epoch": 0.56449706294215, "grad_norm": 0.2431640625, "learning_rate": 2.174363608336418e-05, "loss": 2.7781, "num_input_tokens_seen": 6102712320, "step": 11640 }, { "epoch": 0.5647395444983966, "grad_norm": 0.2431640625, "learning_rate": 2.172376100218609e-05, "loss": 2.7783, "num_input_tokens_seen": 6105333760, "step": 11645 }, { "epoch": 0.5649820260546432, "grad_norm": 0.2431640625, "learning_rate": 2.1703888027650182e-05, "loss": 2.7825, "num_input_tokens_seen": 6107955200, "step": 11650 }, { "epoch": 0.5652245076108898, "grad_norm": 0.244140625, "learning_rate": 2.1684017172534883e-05, "loss": 2.772, "num_input_tokens_seen": 6110576640, "step": 11655 }, { "epoch": 0.5654669891671364, "grad_norm": 0.25, "learning_rate": 2.1664148449617282e-05, "loss": 2.7866, "num_input_tokens_seen": 6113198080, "step": 11660 }, { "epoch": 0.5657094707233831, "grad_norm": 0.240234375, "learning_rate": 2.16442818716731e-05, "loss": 2.766, "num_input_tokens_seen": 6115819520, "step": 11665 }, { "epoch": 0.5659519522796297, "grad_norm": 0.2421875, "learning_rate": 2.162441745147666e-05, "loss": 2.775, "num_input_tokens_seen": 6118440960, "step": 11670 }, { "epoch": 0.5661944338358763, "grad_norm": 0.2412109375, "learning_rate": 2.1604555201800924e-05, "loss": 2.7797, "num_input_tokens_seen": 6121062400, "step": 11675 }, { "epoch": 0.566436915392123, "grad_norm": 0.2490234375, "learning_rate": 2.1584695135417434e-05, "loss": 2.7739, "num_input_tokens_seen": 6123683840, "step": 11680 }, { "epoch": 0.5666793969483697, "grad_norm": 0.24609375, "learning_rate": 2.156483726509635e-05, "loss": 2.7712, "num_input_tokens_seen": 6126305280, "step": 11685 }, { "epoch": 0.5669218785046163, "grad_norm": 0.244140625, "learning_rate": 2.1544981603606384e-05, "loss": 2.7839, "num_input_tokens_seen": 6128926720, "step": 11690 }, { "epoch": 0.5671643600608629, "grad_norm": 0.240234375, "learning_rate": 2.1525128163714855e-05, "loss": 2.7843, "num_input_tokens_seen": 6131548160, "step": 11695 }, { "epoch": 0.5674068416171095, "grad_norm": 0.2470703125, "learning_rate": 2.150527695818766e-05, "loss": 2.7828, "num_input_tokens_seen": 6134169600, "step": 11700 }, { "epoch": 0.5674068416171095, "eval_accuracy": 0.4560267057482495, "eval_loss": 2.7423644065856934, "eval_runtime": 5.8594, "eval_samples_per_second": 51.199, "eval_steps_per_second": 6.485, "num_input_tokens_seen": 6134169600, "step": 11700 }, { "epoch": 0.5676493231733561, "grad_norm": 0.24609375, "learning_rate": 2.1485427999789247e-05, "loss": 2.7716, "num_input_tokens_seen": 6136791040, "step": 11705 }, { "epoch": 0.5678918047296028, "grad_norm": 0.2470703125, "learning_rate": 2.1465581301282617e-05, "loss": 2.7704, "num_input_tokens_seen": 6139412480, "step": 11710 }, { "epoch": 0.5681342862858494, "grad_norm": 0.25, "learning_rate": 2.144573687542933e-05, "loss": 2.7742, "num_input_tokens_seen": 6142033920, "step": 11715 }, { "epoch": 0.568376767842096, "grad_norm": 0.248046875, "learning_rate": 2.1425894734989453e-05, "loss": 2.7573, "num_input_tokens_seen": 6144655360, "step": 11720 }, { "epoch": 0.5686192493983426, "grad_norm": 0.2431640625, "learning_rate": 2.1406054892721626e-05, "loss": 2.7731, "num_input_tokens_seen": 6147276800, "step": 11725 }, { "epoch": 0.5688617309545893, "grad_norm": 0.2470703125, "learning_rate": 2.1386217361382983e-05, "loss": 2.7861, "num_input_tokens_seen": 6149898240, "step": 11730 }, { "epoch": 0.5691042125108359, "grad_norm": 0.26171875, "learning_rate": 2.136638215372919e-05, "loss": 2.7702, "num_input_tokens_seen": 6152519680, "step": 11735 }, { "epoch": 0.5693466940670825, "grad_norm": 0.2451171875, "learning_rate": 2.13465492825144e-05, "loss": 2.7707, "num_input_tokens_seen": 6155141120, "step": 11740 }, { "epoch": 0.5695891756233291, "grad_norm": 0.2490234375, "learning_rate": 2.132671876049129e-05, "loss": 2.7842, "num_input_tokens_seen": 6157762560, "step": 11745 }, { "epoch": 0.5698316571795757, "grad_norm": 0.2431640625, "learning_rate": 2.130689060041098e-05, "loss": 2.7857, "num_input_tokens_seen": 6160384000, "step": 11750 }, { "epoch": 0.5700741387358224, "grad_norm": 0.2470703125, "learning_rate": 2.1287064815023125e-05, "loss": 2.7658, "num_input_tokens_seen": 6163005440, "step": 11755 }, { "epoch": 0.5703166202920691, "grad_norm": 0.2373046875, "learning_rate": 2.126724141707582e-05, "loss": 2.7904, "num_input_tokens_seen": 6165626880, "step": 11760 }, { "epoch": 0.5705591018483157, "grad_norm": 0.2470703125, "learning_rate": 2.1247420419315638e-05, "loss": 2.7774, "num_input_tokens_seen": 6168248320, "step": 11765 }, { "epoch": 0.5708015834045623, "grad_norm": 0.251953125, "learning_rate": 2.1227601834487602e-05, "loss": 2.765, "num_input_tokens_seen": 6170869760, "step": 11770 }, { "epoch": 0.571044064960809, "grad_norm": 0.2421875, "learning_rate": 2.120778567533519e-05, "loss": 2.7586, "num_input_tokens_seen": 6173491200, "step": 11775 }, { "epoch": 0.5712865465170556, "grad_norm": 0.25390625, "learning_rate": 2.118797195460031e-05, "loss": 2.7777, "num_input_tokens_seen": 6176112640, "step": 11780 }, { "epoch": 0.5715290280733022, "grad_norm": 0.2412109375, "learning_rate": 2.116816068502331e-05, "loss": 2.7851, "num_input_tokens_seen": 6178734080, "step": 11785 }, { "epoch": 0.5717715096295488, "grad_norm": 0.25, "learning_rate": 2.114835187934296e-05, "loss": 2.7728, "num_input_tokens_seen": 6181355520, "step": 11790 }, { "epoch": 0.5720139911857954, "grad_norm": 0.251953125, "learning_rate": 2.1128545550296448e-05, "loss": 2.7659, "num_input_tokens_seen": 6183976960, "step": 11795 }, { "epoch": 0.572256472742042, "grad_norm": 0.255859375, "learning_rate": 2.1108741710619367e-05, "loss": 2.7775, "num_input_tokens_seen": 6186598400, "step": 11800 }, { "epoch": 0.5724989542982887, "grad_norm": 0.2412109375, "learning_rate": 2.1088940373045717e-05, "loss": 2.7777, "num_input_tokens_seen": 6189219840, "step": 11805 }, { "epoch": 0.5727414358545353, "grad_norm": 0.2490234375, "learning_rate": 2.106914155030787e-05, "loss": 2.7961, "num_input_tokens_seen": 6191841280, "step": 11810 }, { "epoch": 0.5729839174107819, "grad_norm": 0.2431640625, "learning_rate": 2.1049345255136595e-05, "loss": 2.7603, "num_input_tokens_seen": 6194462720, "step": 11815 }, { "epoch": 0.5732263989670285, "grad_norm": 0.26171875, "learning_rate": 2.1029551500261035e-05, "loss": 2.7735, "num_input_tokens_seen": 6197084160, "step": 11820 }, { "epoch": 0.5734688805232752, "grad_norm": 0.248046875, "learning_rate": 2.10097602984087e-05, "loss": 2.7866, "num_input_tokens_seen": 6199705600, "step": 11825 }, { "epoch": 0.5737113620795218, "grad_norm": 0.24609375, "learning_rate": 2.0989971662305458e-05, "loss": 2.7766, "num_input_tokens_seen": 6202327040, "step": 11830 }, { "epoch": 0.5739538436357684, "grad_norm": 0.244140625, "learning_rate": 2.0970185604675523e-05, "loss": 2.7827, "num_input_tokens_seen": 6204948480, "step": 11835 }, { "epoch": 0.5741963251920151, "grad_norm": 0.248046875, "learning_rate": 2.095040213824146e-05, "loss": 2.7836, "num_input_tokens_seen": 6207569920, "step": 11840 }, { "epoch": 0.5744388067482618, "grad_norm": 0.2421875, "learning_rate": 2.093062127572415e-05, "loss": 2.7746, "num_input_tokens_seen": 6210191360, "step": 11845 }, { "epoch": 0.5746812883045084, "grad_norm": 0.25, "learning_rate": 2.0910843029842818e-05, "loss": 2.7715, "num_input_tokens_seen": 6212812800, "step": 11850 }, { "epoch": 0.574923769860755, "grad_norm": 0.2451171875, "learning_rate": 2.0891067413315002e-05, "loss": 2.7736, "num_input_tokens_seen": 6215434240, "step": 11855 }, { "epoch": 0.5751662514170016, "grad_norm": 0.244140625, "learning_rate": 2.0871294438856543e-05, "loss": 2.7608, "num_input_tokens_seen": 6218055680, "step": 11860 }, { "epoch": 0.5754087329732482, "grad_norm": 0.244140625, "learning_rate": 2.0851524119181585e-05, "loss": 2.7733, "num_input_tokens_seen": 6220677120, "step": 11865 }, { "epoch": 0.5756512145294949, "grad_norm": 0.2451171875, "learning_rate": 2.083175646700258e-05, "loss": 2.782, "num_input_tokens_seen": 6223298560, "step": 11870 }, { "epoch": 0.5758936960857415, "grad_norm": 0.2578125, "learning_rate": 2.081199149503024e-05, "loss": 2.7788, "num_input_tokens_seen": 6225920000, "step": 11875 }, { "epoch": 0.5761361776419881, "grad_norm": 0.251953125, "learning_rate": 2.0792229215973567e-05, "loss": 2.7713, "num_input_tokens_seen": 6228541440, "step": 11880 }, { "epoch": 0.5763786591982347, "grad_norm": 0.2421875, "learning_rate": 2.0772469642539834e-05, "loss": 2.7621, "num_input_tokens_seen": 6231162880, "step": 11885 }, { "epoch": 0.5766211407544813, "grad_norm": 0.2470703125, "learning_rate": 2.0752712787434565e-05, "loss": 2.7741, "num_input_tokens_seen": 6233784320, "step": 11890 }, { "epoch": 0.576863622310728, "grad_norm": 0.255859375, "learning_rate": 2.0732958663361545e-05, "loss": 2.7787, "num_input_tokens_seen": 6236405760, "step": 11895 }, { "epoch": 0.5771061038669746, "grad_norm": 0.2490234375, "learning_rate": 2.0713207283022808e-05, "loss": 2.7852, "num_input_tokens_seen": 6239027200, "step": 11900 }, { "epoch": 0.5773485854232212, "grad_norm": 0.244140625, "learning_rate": 2.0693458659118596e-05, "loss": 2.7799, "num_input_tokens_seen": 6241648640, "step": 11905 }, { "epoch": 0.5775910669794678, "grad_norm": 0.2470703125, "learning_rate": 2.067371280434741e-05, "loss": 2.7902, "num_input_tokens_seen": 6244270080, "step": 11910 }, { "epoch": 0.5778335485357144, "grad_norm": 0.25, "learning_rate": 2.0653969731405954e-05, "loss": 2.7689, "num_input_tokens_seen": 6246891520, "step": 11915 }, { "epoch": 0.5780760300919612, "grad_norm": 0.2470703125, "learning_rate": 2.063422945298915e-05, "loss": 2.7857, "num_input_tokens_seen": 6249512960, "step": 11920 }, { "epoch": 0.5783185116482078, "grad_norm": 0.25390625, "learning_rate": 2.0614491981790125e-05, "loss": 2.7829, "num_input_tokens_seen": 6252134400, "step": 11925 }, { "epoch": 0.5785609932044544, "grad_norm": 0.2490234375, "learning_rate": 2.059475733050019e-05, "loss": 2.7723, "num_input_tokens_seen": 6254755840, "step": 11930 }, { "epoch": 0.578803474760701, "grad_norm": 0.24609375, "learning_rate": 2.0575025511808847e-05, "loss": 2.7719, "num_input_tokens_seen": 6257377280, "step": 11935 }, { "epoch": 0.5790459563169477, "grad_norm": 0.251953125, "learning_rate": 2.0555296538403786e-05, "loss": 2.7708, "num_input_tokens_seen": 6259998720, "step": 11940 }, { "epoch": 0.5792884378731943, "grad_norm": 0.2451171875, "learning_rate": 2.0535570422970854e-05, "loss": 2.7837, "num_input_tokens_seen": 6262620160, "step": 11945 }, { "epoch": 0.5795309194294409, "grad_norm": 0.248046875, "learning_rate": 2.051584717819407e-05, "loss": 2.775, "num_input_tokens_seen": 6265241600, "step": 11950 }, { "epoch": 0.5797734009856875, "grad_norm": 0.255859375, "learning_rate": 2.0496126816755598e-05, "loss": 2.7863, "num_input_tokens_seen": 6267863040, "step": 11955 }, { "epoch": 0.5800158825419341, "grad_norm": 0.25, "learning_rate": 2.0476409351335772e-05, "loss": 2.7821, "num_input_tokens_seen": 6270484480, "step": 11960 }, { "epoch": 0.5802583640981808, "grad_norm": 0.255859375, "learning_rate": 2.0456694794613026e-05, "loss": 2.791, "num_input_tokens_seen": 6273105920, "step": 11965 }, { "epoch": 0.5805008456544274, "grad_norm": 0.25, "learning_rate": 2.043698315926395e-05, "loss": 2.7786, "num_input_tokens_seen": 6275727360, "step": 11970 }, { "epoch": 0.580743327210674, "grad_norm": 0.2392578125, "learning_rate": 2.0417274457963248e-05, "loss": 2.7756, "num_input_tokens_seen": 6278348800, "step": 11975 }, { "epoch": 0.5809858087669206, "grad_norm": 0.24609375, "learning_rate": 2.0397568703383734e-05, "loss": 2.793, "num_input_tokens_seen": 6280970240, "step": 11980 }, { "epoch": 0.5812282903231673, "grad_norm": 0.248046875, "learning_rate": 2.037786590819634e-05, "loss": 2.7853, "num_input_tokens_seen": 6283591680, "step": 11985 }, { "epoch": 0.5814707718794139, "grad_norm": 0.251953125, "learning_rate": 2.035816608507008e-05, "loss": 2.7757, "num_input_tokens_seen": 6286213120, "step": 11990 }, { "epoch": 0.5817132534356606, "grad_norm": 0.24609375, "learning_rate": 2.0338469246672085e-05, "loss": 2.7765, "num_input_tokens_seen": 6288834560, "step": 11995 }, { "epoch": 0.5819557349919072, "grad_norm": 0.2451171875, "learning_rate": 2.0318775405667512e-05, "loss": 2.7814, "num_input_tokens_seen": 6291456000, "step": 12000 }, { "epoch": 0.5819557349919072, "eval_accuracy": 0.45578733105357433, "eval_loss": 2.742259979248047, "eval_runtime": 5.8929, "eval_samples_per_second": 50.909, "eval_steps_per_second": 6.448, "num_input_tokens_seen": 6291456000, "step": 12000 }, { "epoch": 0.5821982165481538, "grad_norm": 0.255859375, "learning_rate": 2.0299084574719634e-05, "loss": 2.7799, "num_input_tokens_seen": 6294077440, "step": 12005 }, { "epoch": 0.5824406981044005, "grad_norm": 0.2451171875, "learning_rate": 2.0279396766489787e-05, "loss": 2.7878, "num_input_tokens_seen": 6296698880, "step": 12010 }, { "epoch": 0.5826831796606471, "grad_norm": 0.24609375, "learning_rate": 2.0259711993637354e-05, "loss": 2.7795, "num_input_tokens_seen": 6299320320, "step": 12015 }, { "epoch": 0.5829256612168937, "grad_norm": 0.251953125, "learning_rate": 2.024003026881976e-05, "loss": 2.7704, "num_input_tokens_seen": 6301941760, "step": 12020 }, { "epoch": 0.5831681427731403, "grad_norm": 0.2392578125, "learning_rate": 2.0220351604692497e-05, "loss": 2.7574, "num_input_tokens_seen": 6304563200, "step": 12025 }, { "epoch": 0.583410624329387, "grad_norm": 0.2421875, "learning_rate": 2.0200676013909042e-05, "loss": 2.7772, "num_input_tokens_seen": 6307184640, "step": 12030 }, { "epoch": 0.5836531058856336, "grad_norm": 0.248046875, "learning_rate": 2.0181003509120927e-05, "loss": 2.8018, "num_input_tokens_seen": 6309806080, "step": 12035 }, { "epoch": 0.5838955874418802, "grad_norm": 0.2392578125, "learning_rate": 2.0161334102977708e-05, "loss": 2.7769, "num_input_tokens_seen": 6312427520, "step": 12040 }, { "epoch": 0.5841380689981268, "grad_norm": 0.24609375, "learning_rate": 2.0141667808126935e-05, "loss": 2.7753, "num_input_tokens_seen": 6315048960, "step": 12045 }, { "epoch": 0.5843805505543734, "grad_norm": 0.2412109375, "learning_rate": 2.0122004637214154e-05, "loss": 2.7767, "num_input_tokens_seen": 6317670400, "step": 12050 }, { "epoch": 0.5846230321106201, "grad_norm": 0.2451171875, "learning_rate": 2.0102344602882916e-05, "loss": 2.7824, "num_input_tokens_seen": 6320291840, "step": 12055 }, { "epoch": 0.5848655136668667, "grad_norm": 0.248046875, "learning_rate": 2.0082687717774725e-05, "loss": 2.7849, "num_input_tokens_seen": 6322913280, "step": 12060 }, { "epoch": 0.5851079952231133, "grad_norm": 0.2392578125, "learning_rate": 2.0063033994529096e-05, "loss": 2.7667, "num_input_tokens_seen": 6325534720, "step": 12065 }, { "epoch": 0.5853504767793599, "grad_norm": 0.251953125, "learning_rate": 2.0043383445783498e-05, "loss": 2.7749, "num_input_tokens_seen": 6328156160, "step": 12070 }, { "epoch": 0.5855929583356067, "grad_norm": 0.244140625, "learning_rate": 2.002373608417335e-05, "loss": 2.7736, "num_input_tokens_seen": 6330777600, "step": 12075 }, { "epoch": 0.5858354398918533, "grad_norm": 0.25390625, "learning_rate": 2.0004091922332034e-05, "loss": 2.7702, "num_input_tokens_seen": 6333399040, "step": 12080 }, { "epoch": 0.5860779214480999, "grad_norm": 0.2431640625, "learning_rate": 1.998445097289087e-05, "loss": 2.7792, "num_input_tokens_seen": 6336020480, "step": 12085 }, { "epoch": 0.5863204030043465, "grad_norm": 0.244140625, "learning_rate": 1.9964813248479102e-05, "loss": 2.7845, "num_input_tokens_seen": 6338641920, "step": 12090 }, { "epoch": 0.5865628845605931, "grad_norm": 0.25, "learning_rate": 1.9945178761723915e-05, "loss": 2.7684, "num_input_tokens_seen": 6341263360, "step": 12095 }, { "epoch": 0.5868053661168398, "grad_norm": 0.2470703125, "learning_rate": 1.992554752525041e-05, "loss": 2.7881, "num_input_tokens_seen": 6343884800, "step": 12100 }, { "epoch": 0.5870478476730864, "grad_norm": 0.251953125, "learning_rate": 1.990591955168159e-05, "loss": 2.7734, "num_input_tokens_seen": 6346506240, "step": 12105 }, { "epoch": 0.587290329229333, "grad_norm": 0.25390625, "learning_rate": 1.9886294853638364e-05, "loss": 2.7566, "num_input_tokens_seen": 6349127680, "step": 12110 }, { "epoch": 0.5875328107855796, "grad_norm": 0.25390625, "learning_rate": 1.9866673443739548e-05, "loss": 2.7629, "num_input_tokens_seen": 6351749120, "step": 12115 }, { "epoch": 0.5877752923418262, "grad_norm": 0.25, "learning_rate": 1.9847055334601814e-05, "loss": 2.784, "num_input_tokens_seen": 6354370560, "step": 12120 }, { "epoch": 0.5880177738980729, "grad_norm": 0.2470703125, "learning_rate": 1.9827440538839737e-05, "loss": 2.7889, "num_input_tokens_seen": 6356992000, "step": 12125 }, { "epoch": 0.5882602554543195, "grad_norm": 0.2421875, "learning_rate": 1.980782906906575e-05, "loss": 2.7823, "num_input_tokens_seen": 6359613440, "step": 12130 }, { "epoch": 0.5885027370105661, "grad_norm": 0.24609375, "learning_rate": 1.978822093789016e-05, "loss": 2.7825, "num_input_tokens_seen": 6362234880, "step": 12135 }, { "epoch": 0.5887452185668127, "grad_norm": 0.2490234375, "learning_rate": 1.9768616157921107e-05, "loss": 2.7652, "num_input_tokens_seen": 6364856320, "step": 12140 }, { "epoch": 0.5889877001230593, "grad_norm": 0.2421875, "learning_rate": 1.9749014741764596e-05, "loss": 2.7771, "num_input_tokens_seen": 6367477760, "step": 12145 }, { "epoch": 0.589230181679306, "grad_norm": 0.2490234375, "learning_rate": 1.972941670202446e-05, "loss": 2.767, "num_input_tokens_seen": 6370099200, "step": 12150 }, { "epoch": 0.5894726632355527, "grad_norm": 0.251953125, "learning_rate": 1.970982205130235e-05, "loss": 2.7573, "num_input_tokens_seen": 6372720640, "step": 12155 }, { "epoch": 0.5897151447917993, "grad_norm": 0.251953125, "learning_rate": 1.9690230802197757e-05, "loss": 2.7676, "num_input_tokens_seen": 6375342080, "step": 12160 }, { "epoch": 0.5899576263480459, "grad_norm": 0.25390625, "learning_rate": 1.9670642967307976e-05, "loss": 2.78, "num_input_tokens_seen": 6377963520, "step": 12165 }, { "epoch": 0.5902001079042926, "grad_norm": 0.2431640625, "learning_rate": 1.9651058559228107e-05, "loss": 2.7715, "num_input_tokens_seen": 6380584960, "step": 12170 }, { "epoch": 0.5904425894605392, "grad_norm": 0.24609375, "learning_rate": 1.963147759055105e-05, "loss": 2.764, "num_input_tokens_seen": 6383206400, "step": 12175 }, { "epoch": 0.5906850710167858, "grad_norm": 0.2431640625, "learning_rate": 1.961190007386749e-05, "loss": 2.7718, "num_input_tokens_seen": 6385827840, "step": 12180 }, { "epoch": 0.5909275525730324, "grad_norm": 0.25390625, "learning_rate": 1.9592326021765887e-05, "loss": 2.7784, "num_input_tokens_seen": 6388449280, "step": 12185 }, { "epoch": 0.591170034129279, "grad_norm": 0.2392578125, "learning_rate": 1.957275544683248e-05, "loss": 2.777, "num_input_tokens_seen": 6391070720, "step": 12190 }, { "epoch": 0.5914125156855257, "grad_norm": 0.25, "learning_rate": 1.9553188361651276e-05, "loss": 2.7786, "num_input_tokens_seen": 6393692160, "step": 12195 }, { "epoch": 0.5916549972417723, "grad_norm": 0.2451171875, "learning_rate": 1.953362477880403e-05, "loss": 2.7628, "num_input_tokens_seen": 6396313600, "step": 12200 }, { "epoch": 0.5918974787980189, "grad_norm": 0.2431640625, "learning_rate": 1.9514064710870248e-05, "loss": 2.7649, "num_input_tokens_seen": 6398935040, "step": 12205 }, { "epoch": 0.5921399603542655, "grad_norm": 0.248046875, "learning_rate": 1.9494508170427183e-05, "loss": 2.7757, "num_input_tokens_seen": 6401556480, "step": 12210 }, { "epoch": 0.5923824419105121, "grad_norm": 0.24609375, "learning_rate": 1.9474955170049802e-05, "loss": 2.7814, "num_input_tokens_seen": 6404177920, "step": 12215 }, { "epoch": 0.5926249234667588, "grad_norm": 0.248046875, "learning_rate": 1.945540572231081e-05, "loss": 2.7674, "num_input_tokens_seen": 6406799360, "step": 12220 }, { "epoch": 0.5928674050230054, "grad_norm": 0.25390625, "learning_rate": 1.9435859839780623e-05, "loss": 2.7723, "num_input_tokens_seen": 6409420800, "step": 12225 }, { "epoch": 0.593109886579252, "grad_norm": 0.248046875, "learning_rate": 1.9416317535027374e-05, "loss": 2.7838, "num_input_tokens_seen": 6412042240, "step": 12230 }, { "epoch": 0.5933523681354987, "grad_norm": 0.24609375, "learning_rate": 1.9396778820616876e-05, "loss": 2.769, "num_input_tokens_seen": 6414663680, "step": 12235 }, { "epoch": 0.5935948496917454, "grad_norm": 0.2470703125, "learning_rate": 1.937724370911266e-05, "loss": 2.7728, "num_input_tokens_seen": 6417285120, "step": 12240 }, { "epoch": 0.593837331247992, "grad_norm": 0.2421875, "learning_rate": 1.9357712213075907e-05, "loss": 2.7887, "num_input_tokens_seen": 6419906560, "step": 12245 }, { "epoch": 0.5940798128042386, "grad_norm": 0.251953125, "learning_rate": 1.9338184345065495e-05, "loss": 2.7656, "num_input_tokens_seen": 6422528000, "step": 12250 }, { "epoch": 0.5943222943604852, "grad_norm": 0.244140625, "learning_rate": 1.9318660117637978e-05, "loss": 2.7674, "num_input_tokens_seen": 6425149440, "step": 12255 }, { "epoch": 0.5945647759167318, "grad_norm": 0.2490234375, "learning_rate": 1.9299139543347542e-05, "loss": 2.7721, "num_input_tokens_seen": 6427770880, "step": 12260 }, { "epoch": 0.5948072574729785, "grad_norm": 0.244140625, "learning_rate": 1.927962263474604e-05, "loss": 2.7753, "num_input_tokens_seen": 6430392320, "step": 12265 }, { "epoch": 0.5950497390292251, "grad_norm": 0.24609375, "learning_rate": 1.9260109404382985e-05, "loss": 2.7697, "num_input_tokens_seen": 6433013760, "step": 12270 }, { "epoch": 0.5952922205854717, "grad_norm": 0.24609375, "learning_rate": 1.9240599864805485e-05, "loss": 2.7753, "num_input_tokens_seen": 6435635200, "step": 12275 }, { "epoch": 0.5955347021417183, "grad_norm": 0.2392578125, "learning_rate": 1.92210940285583e-05, "loss": 2.7812, "num_input_tokens_seen": 6438256640, "step": 12280 }, { "epoch": 0.595777183697965, "grad_norm": 0.240234375, "learning_rate": 1.9201591908183808e-05, "loss": 2.7734, "num_input_tokens_seen": 6440878080, "step": 12285 }, { "epoch": 0.5960196652542116, "grad_norm": 0.2470703125, "learning_rate": 1.9182093516221995e-05, "loss": 2.7799, "num_input_tokens_seen": 6443499520, "step": 12290 }, { "epoch": 0.5962621468104582, "grad_norm": 0.2451171875, "learning_rate": 1.916259886521044e-05, "loss": 2.7748, "num_input_tokens_seen": 6446120960, "step": 12295 }, { "epoch": 0.5965046283667048, "grad_norm": 0.240234375, "learning_rate": 1.914310796768434e-05, "loss": 2.7735, "num_input_tokens_seen": 6448742400, "step": 12300 }, { "epoch": 0.5965046283667048, "eval_accuracy": 0.45588014981273406, "eval_loss": 2.742222309112549, "eval_runtime": 5.8637, "eval_samples_per_second": 51.162, "eval_steps_per_second": 6.481, "num_input_tokens_seen": 6448742400, "step": 12300 }, { "epoch": 0.5967471099229514, "grad_norm": 0.2431640625, "learning_rate": 1.9123620836176467e-05, "loss": 2.7739, "num_input_tokens_seen": 6451363840, "step": 12305 }, { "epoch": 0.5969895914791982, "grad_norm": 0.2353515625, "learning_rate": 1.9104137483217148e-05, "loss": 2.7703, "num_input_tokens_seen": 6453985280, "step": 12310 }, { "epoch": 0.5972320730354448, "grad_norm": 0.2431640625, "learning_rate": 1.9084657921334314e-05, "loss": 2.7687, "num_input_tokens_seen": 6456606720, "step": 12315 }, { "epoch": 0.5974745545916914, "grad_norm": 0.2451171875, "learning_rate": 1.9065182163053435e-05, "loss": 2.7785, "num_input_tokens_seen": 6459228160, "step": 12320 }, { "epoch": 0.597717036147938, "grad_norm": 0.2490234375, "learning_rate": 1.904571022089756e-05, "loss": 2.7631, "num_input_tokens_seen": 6461849600, "step": 12325 }, { "epoch": 0.5979595177041847, "grad_norm": 0.2451171875, "learning_rate": 1.9026242107387266e-05, "loss": 2.7736, "num_input_tokens_seen": 6464471040, "step": 12330 }, { "epoch": 0.5982019992604313, "grad_norm": 0.2431640625, "learning_rate": 1.9006777835040675e-05, "loss": 2.7736, "num_input_tokens_seen": 6467092480, "step": 12335 }, { "epoch": 0.5984444808166779, "grad_norm": 0.248046875, "learning_rate": 1.8987317416373418e-05, "loss": 2.7852, "num_input_tokens_seen": 6469713920, "step": 12340 }, { "epoch": 0.5986869623729245, "grad_norm": 0.2412109375, "learning_rate": 1.8967860863898677e-05, "loss": 2.7914, "num_input_tokens_seen": 6472335360, "step": 12345 }, { "epoch": 0.5989294439291711, "grad_norm": 0.251953125, "learning_rate": 1.894840819012714e-05, "loss": 2.7865, "num_input_tokens_seen": 6474956800, "step": 12350 }, { "epoch": 0.5991719254854178, "grad_norm": 0.255859375, "learning_rate": 1.892895940756699e-05, "loss": 2.7708, "num_input_tokens_seen": 6477578240, "step": 12355 }, { "epoch": 0.5994144070416644, "grad_norm": 0.2373046875, "learning_rate": 1.8909514528723933e-05, "loss": 2.7799, "num_input_tokens_seen": 6480199680, "step": 12360 }, { "epoch": 0.599656888597911, "grad_norm": 0.2451171875, "learning_rate": 1.8890073566101138e-05, "loss": 2.7649, "num_input_tokens_seen": 6482821120, "step": 12365 }, { "epoch": 0.5998993701541576, "grad_norm": 0.2373046875, "learning_rate": 1.887063653219925e-05, "loss": 2.7881, "num_input_tokens_seen": 6485442560, "step": 12370 }, { "epoch": 0.6001418517104042, "grad_norm": 0.251953125, "learning_rate": 1.8851203439516423e-05, "loss": 2.7742, "num_input_tokens_seen": 6488064000, "step": 12375 }, { "epoch": 0.6003843332666509, "grad_norm": 0.2421875, "learning_rate": 1.8831774300548252e-05, "loss": 2.7796, "num_input_tokens_seen": 6490685440, "step": 12380 }, { "epoch": 0.6006268148228975, "grad_norm": 0.24609375, "learning_rate": 1.8812349127787792e-05, "loss": 2.7747, "num_input_tokens_seen": 6493306880, "step": 12385 }, { "epoch": 0.6008692963791442, "grad_norm": 0.24609375, "learning_rate": 1.8792927933725555e-05, "loss": 2.7859, "num_input_tokens_seen": 6495928320, "step": 12390 }, { "epoch": 0.6011117779353908, "grad_norm": 0.2451171875, "learning_rate": 1.8773510730849497e-05, "loss": 2.7795, "num_input_tokens_seen": 6498549760, "step": 12395 }, { "epoch": 0.6013542594916375, "grad_norm": 0.24609375, "learning_rate": 1.8754097531644975e-05, "loss": 2.7734, "num_input_tokens_seen": 6501171200, "step": 12400 }, { "epoch": 0.6015967410478841, "grad_norm": 0.2421875, "learning_rate": 1.873468834859482e-05, "loss": 2.7718, "num_input_tokens_seen": 6503792640, "step": 12405 }, { "epoch": 0.6018392226041307, "grad_norm": 0.24609375, "learning_rate": 1.8715283194179254e-05, "loss": 2.7756, "num_input_tokens_seen": 6506414080, "step": 12410 }, { "epoch": 0.6020817041603773, "grad_norm": 0.25, "learning_rate": 1.869588208087591e-05, "loss": 2.7581, "num_input_tokens_seen": 6509035520, "step": 12415 }, { "epoch": 0.6023241857166239, "grad_norm": 0.244140625, "learning_rate": 1.8676485021159827e-05, "loss": 2.769, "num_input_tokens_seen": 6511656960, "step": 12420 }, { "epoch": 0.6025666672728706, "grad_norm": 0.2421875, "learning_rate": 1.8657092027503444e-05, "loss": 2.7913, "num_input_tokens_seen": 6514278400, "step": 12425 }, { "epoch": 0.6028091488291172, "grad_norm": 0.248046875, "learning_rate": 1.863770311237656e-05, "loss": 2.783, "num_input_tokens_seen": 6516899840, "step": 12430 }, { "epoch": 0.6030516303853638, "grad_norm": 0.248046875, "learning_rate": 1.8618318288246373e-05, "loss": 2.7735, "num_input_tokens_seen": 6519521280, "step": 12435 }, { "epoch": 0.6032941119416104, "grad_norm": 0.23828125, "learning_rate": 1.8598937567577456e-05, "loss": 2.7734, "num_input_tokens_seen": 6522142720, "step": 12440 }, { "epoch": 0.603536593497857, "grad_norm": 0.244140625, "learning_rate": 1.8579560962831727e-05, "loss": 2.7794, "num_input_tokens_seen": 6524764160, "step": 12445 }, { "epoch": 0.6037790750541037, "grad_norm": 0.2470703125, "learning_rate": 1.8560188486468463e-05, "loss": 2.7717, "num_input_tokens_seen": 6527385600, "step": 12450 }, { "epoch": 0.6040215566103503, "grad_norm": 0.2490234375, "learning_rate": 1.8540820150944292e-05, "loss": 2.7831, "num_input_tokens_seen": 6530007040, "step": 12455 }, { "epoch": 0.6042640381665969, "grad_norm": 0.251953125, "learning_rate": 1.8521455968713176e-05, "loss": 2.7735, "num_input_tokens_seen": 6532628480, "step": 12460 }, { "epoch": 0.6045065197228435, "grad_norm": 0.248046875, "learning_rate": 1.85020959522264e-05, "loss": 2.7771, "num_input_tokens_seen": 6535249920, "step": 12465 }, { "epoch": 0.6047490012790903, "grad_norm": 0.2431640625, "learning_rate": 1.8482740113932573e-05, "loss": 2.7757, "num_input_tokens_seen": 6537871360, "step": 12470 }, { "epoch": 0.6049914828353369, "grad_norm": 0.251953125, "learning_rate": 1.8463388466277625e-05, "loss": 2.7683, "num_input_tokens_seen": 6540492800, "step": 12475 }, { "epoch": 0.6052339643915835, "grad_norm": 0.2490234375, "learning_rate": 1.844404102170479e-05, "loss": 2.7574, "num_input_tokens_seen": 6543114240, "step": 12480 }, { "epoch": 0.6054764459478301, "grad_norm": 0.248046875, "learning_rate": 1.842469779265459e-05, "loss": 2.7632, "num_input_tokens_seen": 6545735680, "step": 12485 }, { "epoch": 0.6057189275040767, "grad_norm": 0.251953125, "learning_rate": 1.8405358791564846e-05, "loss": 2.7693, "num_input_tokens_seen": 6548357120, "step": 12490 }, { "epoch": 0.6059614090603234, "grad_norm": 0.2431640625, "learning_rate": 1.8386024030870653e-05, "loss": 2.7743, "num_input_tokens_seen": 6550978560, "step": 12495 }, { "epoch": 0.60620389061657, "grad_norm": 0.23828125, "learning_rate": 1.8366693523004385e-05, "loss": 2.764, "num_input_tokens_seen": 6553600000, "step": 12500 }, { "epoch": 0.6064463721728166, "grad_norm": 0.251953125, "learning_rate": 1.834736728039568e-05, "loss": 2.7782, "num_input_tokens_seen": 6556221440, "step": 12505 }, { "epoch": 0.6066888537290632, "grad_norm": 0.2431640625, "learning_rate": 1.8328045315471432e-05, "loss": 2.7852, "num_input_tokens_seen": 6558842880, "step": 12510 }, { "epoch": 0.6069313352853098, "grad_norm": 0.2431640625, "learning_rate": 1.8308727640655786e-05, "loss": 2.7706, "num_input_tokens_seen": 6561464320, "step": 12515 }, { "epoch": 0.6071738168415565, "grad_norm": 0.240234375, "learning_rate": 1.828941426837013e-05, "loss": 2.7746, "num_input_tokens_seen": 6564085760, "step": 12520 }, { "epoch": 0.6074162983978031, "grad_norm": 0.24609375, "learning_rate": 1.8270105211033082e-05, "loss": 2.7705, "num_input_tokens_seen": 6566707200, "step": 12525 }, { "epoch": 0.6076587799540497, "grad_norm": 0.2431640625, "learning_rate": 1.8250800481060482e-05, "loss": 2.7724, "num_input_tokens_seen": 6569328640, "step": 12530 }, { "epoch": 0.6079012615102963, "grad_norm": 0.2412109375, "learning_rate": 1.8231500090865395e-05, "loss": 2.7723, "num_input_tokens_seen": 6571950080, "step": 12535 }, { "epoch": 0.608143743066543, "grad_norm": 0.25, "learning_rate": 1.821220405285809e-05, "loss": 2.7829, "num_input_tokens_seen": 6574571520, "step": 12540 }, { "epoch": 0.6083862246227897, "grad_norm": 0.2451171875, "learning_rate": 1.8192912379446047e-05, "loss": 2.7766, "num_input_tokens_seen": 6577192960, "step": 12545 }, { "epoch": 0.6086287061790363, "grad_norm": 0.25, "learning_rate": 1.8173625083033935e-05, "loss": 2.775, "num_input_tokens_seen": 6579814400, "step": 12550 }, { "epoch": 0.6088711877352829, "grad_norm": 0.2451171875, "learning_rate": 1.81543421760236e-05, "loss": 2.7615, "num_input_tokens_seen": 6582435840, "step": 12555 }, { "epoch": 0.6091136692915295, "grad_norm": 0.2373046875, "learning_rate": 1.8135063670814062e-05, "loss": 2.7693, "num_input_tokens_seen": 6585057280, "step": 12560 }, { "epoch": 0.6093561508477762, "grad_norm": 0.2470703125, "learning_rate": 1.8115789579801534e-05, "loss": 2.7738, "num_input_tokens_seen": 6587678720, "step": 12565 }, { "epoch": 0.6095986324040228, "grad_norm": 0.2421875, "learning_rate": 1.8096519915379376e-05, "loss": 2.7831, "num_input_tokens_seen": 6590300160, "step": 12570 }, { "epoch": 0.6098411139602694, "grad_norm": 0.25, "learning_rate": 1.80772546899381e-05, "loss": 2.7983, "num_input_tokens_seen": 6592921600, "step": 12575 }, { "epoch": 0.610083595516516, "grad_norm": 0.263671875, "learning_rate": 1.8057993915865372e-05, "loss": 2.7901, "num_input_tokens_seen": 6595543040, "step": 12580 }, { "epoch": 0.6103260770727627, "grad_norm": 0.2431640625, "learning_rate": 1.8038737605545977e-05, "loss": 2.7852, "num_input_tokens_seen": 6598164480, "step": 12585 }, { "epoch": 0.6105685586290093, "grad_norm": 0.236328125, "learning_rate": 1.8019485771361854e-05, "loss": 2.7747, "num_input_tokens_seen": 6600785920, "step": 12590 }, { "epoch": 0.6108110401852559, "grad_norm": 0.259765625, "learning_rate": 1.8000238425692052e-05, "loss": 2.7769, "num_input_tokens_seen": 6603407360, "step": 12595 }, { "epoch": 0.6110535217415025, "grad_norm": 0.25, "learning_rate": 1.7980995580912728e-05, "loss": 2.7848, "num_input_tokens_seen": 6606028800, "step": 12600 }, { "epoch": 0.6110535217415025, "eval_accuracy": 0.4559224881941052, "eval_loss": 2.7420172691345215, "eval_runtime": 5.8785, "eval_samples_per_second": 51.034, "eval_steps_per_second": 6.464, "num_input_tokens_seen": 6606028800, "step": 12600 }, { "epoch": 0.6112960032977491, "grad_norm": 0.2431640625, "learning_rate": 1.7961757249397153e-05, "loss": 2.7728, "num_input_tokens_seen": 6608650240, "step": 12605 }, { "epoch": 0.6115384848539958, "grad_norm": 0.2451171875, "learning_rate": 1.7942523443515703e-05, "loss": 2.7823, "num_input_tokens_seen": 6611271680, "step": 12610 }, { "epoch": 0.6117809664102424, "grad_norm": 0.2490234375, "learning_rate": 1.792329417563584e-05, "loss": 2.7777, "num_input_tokens_seen": 6613893120, "step": 12615 }, { "epoch": 0.612023447966489, "grad_norm": 0.240234375, "learning_rate": 1.7904069458122085e-05, "loss": 2.7676, "num_input_tokens_seen": 6616514560, "step": 12620 }, { "epoch": 0.6122659295227357, "grad_norm": 0.25, "learning_rate": 1.788484930333606e-05, "loss": 2.7701, "num_input_tokens_seen": 6619136000, "step": 12625 }, { "epoch": 0.6125084110789824, "grad_norm": 0.24609375, "learning_rate": 1.786563372363644e-05, "loss": 2.7708, "num_input_tokens_seen": 6621757440, "step": 12630 }, { "epoch": 0.612750892635229, "grad_norm": 0.25390625, "learning_rate": 1.7846422731378976e-05, "loss": 2.7842, "num_input_tokens_seen": 6624378880, "step": 12635 }, { "epoch": 0.6129933741914756, "grad_norm": 0.24609375, "learning_rate": 1.7827216338916444e-05, "loss": 2.783, "num_input_tokens_seen": 6627000320, "step": 12640 }, { "epoch": 0.6132358557477222, "grad_norm": 0.240234375, "learning_rate": 1.780801455859869e-05, "loss": 2.7665, "num_input_tokens_seen": 6629621760, "step": 12645 }, { "epoch": 0.6134783373039688, "grad_norm": 0.244140625, "learning_rate": 1.778881740277256e-05, "loss": 2.7907, "num_input_tokens_seen": 6632243200, "step": 12650 }, { "epoch": 0.6137208188602155, "grad_norm": 0.240234375, "learning_rate": 1.7769624883781952e-05, "loss": 2.7794, "num_input_tokens_seen": 6634864640, "step": 12655 }, { "epoch": 0.6139633004164621, "grad_norm": 0.248046875, "learning_rate": 1.7750437013967773e-05, "loss": 2.7774, "num_input_tokens_seen": 6637486080, "step": 12660 }, { "epoch": 0.6142057819727087, "grad_norm": 0.2431640625, "learning_rate": 1.7731253805667946e-05, "loss": 2.7857, "num_input_tokens_seen": 6640107520, "step": 12665 }, { "epoch": 0.6144482635289553, "grad_norm": 0.2373046875, "learning_rate": 1.7712075271217403e-05, "loss": 2.7687, "num_input_tokens_seen": 6642728960, "step": 12670 }, { "epoch": 0.6146907450852019, "grad_norm": 0.240234375, "learning_rate": 1.769290142294806e-05, "loss": 2.7591, "num_input_tokens_seen": 6645350400, "step": 12675 }, { "epoch": 0.6149332266414486, "grad_norm": 0.2412109375, "learning_rate": 1.7673732273188807e-05, "loss": 2.776, "num_input_tokens_seen": 6647971840, "step": 12680 }, { "epoch": 0.6151757081976952, "grad_norm": 0.244140625, "learning_rate": 1.765456783426553e-05, "loss": 2.7742, "num_input_tokens_seen": 6650593280, "step": 12685 }, { "epoch": 0.6154181897539418, "grad_norm": 0.251953125, "learning_rate": 1.763540811850109e-05, "loss": 2.7853, "num_input_tokens_seen": 6653214720, "step": 12690 }, { "epoch": 0.6156606713101884, "grad_norm": 0.2578125, "learning_rate": 1.7616253138215307e-05, "loss": 2.7748, "num_input_tokens_seen": 6655836160, "step": 12695 }, { "epoch": 0.615903152866435, "grad_norm": 0.2421875, "learning_rate": 1.7597102905724944e-05, "loss": 2.7738, "num_input_tokens_seen": 6658457600, "step": 12700 }, { "epoch": 0.6161456344226818, "grad_norm": 0.2470703125, "learning_rate": 1.757795743334374e-05, "loss": 2.784, "num_input_tokens_seen": 6661079040, "step": 12705 }, { "epoch": 0.6163881159789284, "grad_norm": 0.24609375, "learning_rate": 1.755881673338232e-05, "loss": 2.7678, "num_input_tokens_seen": 6663700480, "step": 12710 }, { "epoch": 0.616630597535175, "grad_norm": 0.2490234375, "learning_rate": 1.7539680818148303e-05, "loss": 2.7799, "num_input_tokens_seen": 6666321920, "step": 12715 }, { "epoch": 0.6168730790914216, "grad_norm": 0.25, "learning_rate": 1.7520549699946184e-05, "loss": 2.7806, "num_input_tokens_seen": 6668943360, "step": 12720 }, { "epoch": 0.6171155606476683, "grad_norm": 0.2470703125, "learning_rate": 1.7501423391077403e-05, "loss": 2.7836, "num_input_tokens_seen": 6671564800, "step": 12725 }, { "epoch": 0.6173580422039149, "grad_norm": 0.2392578125, "learning_rate": 1.7482301903840292e-05, "loss": 2.7726, "num_input_tokens_seen": 6674186240, "step": 12730 }, { "epoch": 0.6176005237601615, "grad_norm": 0.2451171875, "learning_rate": 1.74631852505301e-05, "loss": 2.7747, "num_input_tokens_seen": 6676807680, "step": 12735 }, { "epoch": 0.6178430053164081, "grad_norm": 0.251953125, "learning_rate": 1.7444073443438923e-05, "loss": 2.7729, "num_input_tokens_seen": 6679429120, "step": 12740 }, { "epoch": 0.6180854868726547, "grad_norm": 0.24609375, "learning_rate": 1.7424966494855798e-05, "loss": 2.7976, "num_input_tokens_seen": 6682050560, "step": 12745 }, { "epoch": 0.6183279684289014, "grad_norm": 0.2421875, "learning_rate": 1.740586441706661e-05, "loss": 2.7827, "num_input_tokens_seen": 6684672000, "step": 12750 }, { "epoch": 0.618570449985148, "grad_norm": 0.2353515625, "learning_rate": 1.7386767222354105e-05, "loss": 2.7784, "num_input_tokens_seen": 6687293440, "step": 12755 }, { "epoch": 0.6188129315413946, "grad_norm": 0.2412109375, "learning_rate": 1.7367674922997907e-05, "loss": 2.7745, "num_input_tokens_seen": 6689914880, "step": 12760 }, { "epoch": 0.6190554130976412, "grad_norm": 0.248046875, "learning_rate": 1.734858753127448e-05, "loss": 2.7726, "num_input_tokens_seen": 6692536320, "step": 12765 }, { "epoch": 0.6192978946538878, "grad_norm": 0.248046875, "learning_rate": 1.7329505059457143e-05, "loss": 2.7715, "num_input_tokens_seen": 6695157760, "step": 12770 }, { "epoch": 0.6195403762101345, "grad_norm": 0.24609375, "learning_rate": 1.7310427519816036e-05, "loss": 2.761, "num_input_tokens_seen": 6697779200, "step": 12775 }, { "epoch": 0.6197828577663811, "grad_norm": 0.25, "learning_rate": 1.7291354924618136e-05, "loss": 2.7712, "num_input_tokens_seen": 6700400640, "step": 12780 }, { "epoch": 0.6200253393226278, "grad_norm": 0.251953125, "learning_rate": 1.7272287286127247e-05, "loss": 2.7901, "num_input_tokens_seen": 6703022080, "step": 12785 }, { "epoch": 0.6202678208788744, "grad_norm": 0.2421875, "learning_rate": 1.725322461660398e-05, "loss": 2.8016, "num_input_tokens_seen": 6705643520, "step": 12790 }, { "epoch": 0.6205103024351211, "grad_norm": 0.25, "learning_rate": 1.7234166928305744e-05, "loss": 2.7778, "num_input_tokens_seen": 6708264960, "step": 12795 }, { "epoch": 0.6207527839913677, "grad_norm": 0.2412109375, "learning_rate": 1.7215114233486762e-05, "loss": 2.7917, "num_input_tokens_seen": 6710886400, "step": 12800 }, { "epoch": 0.6209952655476143, "grad_norm": 0.25, "learning_rate": 1.7196066544398026e-05, "loss": 2.7839, "num_input_tokens_seen": 6713507840, "step": 12805 }, { "epoch": 0.6212377471038609, "grad_norm": 0.24609375, "learning_rate": 1.7177023873287324e-05, "loss": 2.7706, "num_input_tokens_seen": 6716129280, "step": 12810 }, { "epoch": 0.6214802286601075, "grad_norm": 0.2470703125, "learning_rate": 1.715798623239921e-05, "loss": 2.766, "num_input_tokens_seen": 6718750720, "step": 12815 }, { "epoch": 0.6217227102163542, "grad_norm": 0.24609375, "learning_rate": 1.7138953633975007e-05, "loss": 2.7767, "num_input_tokens_seen": 6721372160, "step": 12820 }, { "epoch": 0.6219651917726008, "grad_norm": 0.2470703125, "learning_rate": 1.71199260902528e-05, "loss": 2.7666, "num_input_tokens_seen": 6723993600, "step": 12825 }, { "epoch": 0.6222076733288474, "grad_norm": 0.240234375, "learning_rate": 1.7100903613467416e-05, "loss": 2.7815, "num_input_tokens_seen": 6726615040, "step": 12830 }, { "epoch": 0.622450154885094, "grad_norm": 0.240234375, "learning_rate": 1.7081886215850424e-05, "loss": 2.7743, "num_input_tokens_seen": 6729236480, "step": 12835 }, { "epoch": 0.6226926364413407, "grad_norm": 0.24609375, "learning_rate": 1.7062873909630127e-05, "loss": 2.7768, "num_input_tokens_seen": 6731857920, "step": 12840 }, { "epoch": 0.6229351179975873, "grad_norm": 0.2451171875, "learning_rate": 1.7043866707031562e-05, "loss": 2.7609, "num_input_tokens_seen": 6734479360, "step": 12845 }, { "epoch": 0.6231775995538339, "grad_norm": 0.240234375, "learning_rate": 1.702486462027648e-05, "loss": 2.7722, "num_input_tokens_seen": 6737100800, "step": 12850 }, { "epoch": 0.6234200811100805, "grad_norm": 0.2421875, "learning_rate": 1.7005867661583336e-05, "loss": 2.7858, "num_input_tokens_seen": 6739722240, "step": 12855 }, { "epoch": 0.6236625626663272, "grad_norm": 0.244140625, "learning_rate": 1.6986875843167306e-05, "loss": 2.7844, "num_input_tokens_seen": 6742343680, "step": 12860 }, { "epoch": 0.6239050442225739, "grad_norm": 0.244140625, "learning_rate": 1.696788917724023e-05, "loss": 2.7749, "num_input_tokens_seen": 6744965120, "step": 12865 }, { "epoch": 0.6241475257788205, "grad_norm": 0.25, "learning_rate": 1.694890767601066e-05, "loss": 2.7744, "num_input_tokens_seen": 6747586560, "step": 12870 }, { "epoch": 0.6243900073350671, "grad_norm": 0.244140625, "learning_rate": 1.6929931351683824e-05, "loss": 2.7816, "num_input_tokens_seen": 6750208000, "step": 12875 }, { "epoch": 0.6246324888913137, "grad_norm": 0.2451171875, "learning_rate": 1.691096021646162e-05, "loss": 2.7778, "num_input_tokens_seen": 6752829440, "step": 12880 }, { "epoch": 0.6248749704475604, "grad_norm": 0.248046875, "learning_rate": 1.6891994282542595e-05, "loss": 2.78, "num_input_tokens_seen": 6755450880, "step": 12885 }, { "epoch": 0.625117452003807, "grad_norm": 0.2451171875, "learning_rate": 1.687303356212198e-05, "loss": 2.7668, "num_input_tokens_seen": 6758072320, "step": 12890 }, { "epoch": 0.6253599335600536, "grad_norm": 0.25, "learning_rate": 1.6854078067391617e-05, "loss": 2.7691, "num_input_tokens_seen": 6760693760, "step": 12895 }, { "epoch": 0.6256024151163002, "grad_norm": 0.2421875, "learning_rate": 1.6835127810540018e-05, "loss": 2.7748, "num_input_tokens_seen": 6763315200, "step": 12900 }, { "epoch": 0.6256024151163002, "eval_accuracy": 0.4559110893991207, "eval_loss": 2.742025375366211, "eval_runtime": 5.8351, "eval_samples_per_second": 51.413, "eval_steps_per_second": 6.512, "num_input_tokens_seen": 6763315200, "step": 12900 }, { "epoch": 0.6258448966725468, "grad_norm": 0.240234375, "learning_rate": 1.681618280375232e-05, "loss": 2.7705, "num_input_tokens_seen": 6765936640, "step": 12905 }, { "epoch": 0.6260873782287935, "grad_norm": 0.2412109375, "learning_rate": 1.6797243059210273e-05, "loss": 2.7711, "num_input_tokens_seen": 6768558080, "step": 12910 }, { "epoch": 0.6263298597850401, "grad_norm": 0.240234375, "learning_rate": 1.6778308589092255e-05, "loss": 2.7629, "num_input_tokens_seen": 6771179520, "step": 12915 }, { "epoch": 0.6265723413412867, "grad_norm": 0.2431640625, "learning_rate": 1.675937940557325e-05, "loss": 2.7645, "num_input_tokens_seen": 6773800960, "step": 12920 }, { "epoch": 0.6268148228975333, "grad_norm": 0.24609375, "learning_rate": 1.6740455520824852e-05, "loss": 2.7832, "num_input_tokens_seen": 6776422400, "step": 12925 }, { "epoch": 0.6270573044537799, "grad_norm": 0.255859375, "learning_rate": 1.6721536947015216e-05, "loss": 2.7708, "num_input_tokens_seen": 6779043840, "step": 12930 }, { "epoch": 0.6272997860100266, "grad_norm": 0.2373046875, "learning_rate": 1.670262369630911e-05, "loss": 2.7812, "num_input_tokens_seen": 6781665280, "step": 12935 }, { "epoch": 0.6275422675662733, "grad_norm": 0.236328125, "learning_rate": 1.6683715780867882e-05, "loss": 2.7591, "num_input_tokens_seen": 6784286720, "step": 12940 }, { "epoch": 0.6277847491225199, "grad_norm": 0.2470703125, "learning_rate": 1.6664813212849424e-05, "loss": 2.7775, "num_input_tokens_seen": 6786908160, "step": 12945 }, { "epoch": 0.6280272306787665, "grad_norm": 0.2470703125, "learning_rate": 1.664591600440822e-05, "loss": 2.7786, "num_input_tokens_seen": 6789529600, "step": 12950 }, { "epoch": 0.6282697122350132, "grad_norm": 0.248046875, "learning_rate": 1.6627024167695296e-05, "loss": 2.7621, "num_input_tokens_seen": 6792151040, "step": 12955 }, { "epoch": 0.6285121937912598, "grad_norm": 0.2421875, "learning_rate": 1.660813771485821e-05, "loss": 2.7683, "num_input_tokens_seen": 6794772480, "step": 12960 }, { "epoch": 0.6287546753475064, "grad_norm": 0.23828125, "learning_rate": 1.6589256658041062e-05, "loss": 2.7679, "num_input_tokens_seen": 6797393920, "step": 12965 }, { "epoch": 0.628997156903753, "grad_norm": 0.2421875, "learning_rate": 1.6570381009384506e-05, "loss": 2.7724, "num_input_tokens_seen": 6800015360, "step": 12970 }, { "epoch": 0.6292396384599996, "grad_norm": 0.2421875, "learning_rate": 1.65515107810257e-05, "loss": 2.7841, "num_input_tokens_seen": 6802636800, "step": 12975 }, { "epoch": 0.6294821200162463, "grad_norm": 0.2392578125, "learning_rate": 1.653264598509831e-05, "loss": 2.7713, "num_input_tokens_seen": 6805258240, "step": 12980 }, { "epoch": 0.6297246015724929, "grad_norm": 0.244140625, "learning_rate": 1.6513786633732537e-05, "loss": 2.7771, "num_input_tokens_seen": 6807879680, "step": 12985 }, { "epoch": 0.6299670831287395, "grad_norm": 0.2451171875, "learning_rate": 1.6494932739055035e-05, "loss": 2.7914, "num_input_tokens_seen": 6810501120, "step": 12990 }, { "epoch": 0.6302095646849861, "grad_norm": 0.2412109375, "learning_rate": 1.6476084313188988e-05, "loss": 2.7755, "num_input_tokens_seen": 6813122560, "step": 12995 }, { "epoch": 0.6304520462412327, "grad_norm": 0.248046875, "learning_rate": 1.6457241368254056e-05, "loss": 2.7811, "num_input_tokens_seen": 6815744000, "step": 13000 }, { "epoch": 0.6306945277974794, "grad_norm": 0.25, "learning_rate": 1.6438403916366368e-05, "loss": 2.7864, "num_input_tokens_seen": 6818365440, "step": 13005 }, { "epoch": 0.630937009353726, "grad_norm": 0.240234375, "learning_rate": 1.6419571969638525e-05, "loss": 2.7722, "num_input_tokens_seen": 6820986880, "step": 13010 }, { "epoch": 0.6311794909099726, "grad_norm": 0.2431640625, "learning_rate": 1.6400745540179592e-05, "loss": 2.7862, "num_input_tokens_seen": 6823608320, "step": 13015 }, { "epoch": 0.6314219724662193, "grad_norm": 0.2421875, "learning_rate": 1.6381924640095065e-05, "loss": 2.7735, "num_input_tokens_seen": 6826229760, "step": 13020 }, { "epoch": 0.631664454022466, "grad_norm": 0.2451171875, "learning_rate": 1.6363109281486904e-05, "loss": 2.7704, "num_input_tokens_seen": 6828851200, "step": 13025 }, { "epoch": 0.6319069355787126, "grad_norm": 0.2451171875, "learning_rate": 1.634429947645351e-05, "loss": 2.7741, "num_input_tokens_seen": 6831472640, "step": 13030 }, { "epoch": 0.6321494171349592, "grad_norm": 0.244140625, "learning_rate": 1.6325495237089704e-05, "loss": 2.7672, "num_input_tokens_seen": 6834094080, "step": 13035 }, { "epoch": 0.6323918986912058, "grad_norm": 0.2431640625, "learning_rate": 1.630669657548673e-05, "loss": 2.7789, "num_input_tokens_seen": 6836715520, "step": 13040 }, { "epoch": 0.6326343802474524, "grad_norm": 0.2421875, "learning_rate": 1.628790350373225e-05, "loss": 2.7799, "num_input_tokens_seen": 6839336960, "step": 13045 }, { "epoch": 0.6328768618036991, "grad_norm": 0.2421875, "learning_rate": 1.626911603391031e-05, "loss": 2.7767, "num_input_tokens_seen": 6841958400, "step": 13050 }, { "epoch": 0.6331193433599457, "grad_norm": 0.251953125, "learning_rate": 1.6250334178101378e-05, "loss": 2.7711, "num_input_tokens_seen": 6844579840, "step": 13055 }, { "epoch": 0.6333618249161923, "grad_norm": 0.2431640625, "learning_rate": 1.6231557948382314e-05, "loss": 2.7827, "num_input_tokens_seen": 6847201280, "step": 13060 }, { "epoch": 0.6336043064724389, "grad_norm": 0.248046875, "learning_rate": 1.6212787356826344e-05, "loss": 2.7837, "num_input_tokens_seen": 6849822720, "step": 13065 }, { "epoch": 0.6338467880286855, "grad_norm": 0.248046875, "learning_rate": 1.6194022415503072e-05, "loss": 2.7746, "num_input_tokens_seen": 6852444160, "step": 13070 }, { "epoch": 0.6340892695849322, "grad_norm": 0.2431640625, "learning_rate": 1.6175263136478478e-05, "loss": 2.786, "num_input_tokens_seen": 6855065600, "step": 13075 }, { "epoch": 0.6343317511411788, "grad_norm": 0.25, "learning_rate": 1.61565095318149e-05, "loss": 2.7771, "num_input_tokens_seen": 6857687040, "step": 13080 }, { "epoch": 0.6345742326974254, "grad_norm": 0.2490234375, "learning_rate": 1.6137761613571012e-05, "loss": 2.7756, "num_input_tokens_seen": 6860308480, "step": 13085 }, { "epoch": 0.634816714253672, "grad_norm": 0.259765625, "learning_rate": 1.611901939380185e-05, "loss": 2.793, "num_input_tokens_seen": 6862929920, "step": 13090 }, { "epoch": 0.6350591958099187, "grad_norm": 0.240234375, "learning_rate": 1.610028288455878e-05, "loss": 2.7822, "num_input_tokens_seen": 6865551360, "step": 13095 }, { "epoch": 0.6353016773661654, "grad_norm": 0.2421875, "learning_rate": 1.6081552097889484e-05, "loss": 2.7829, "num_input_tokens_seen": 6868172800, "step": 13100 }, { "epoch": 0.635544158922412, "grad_norm": 0.2373046875, "learning_rate": 1.6062827045837993e-05, "loss": 2.7773, "num_input_tokens_seen": 6870794240, "step": 13105 }, { "epoch": 0.6357866404786586, "grad_norm": 0.2431640625, "learning_rate": 1.604410774044462e-05, "loss": 2.7833, "num_input_tokens_seen": 6873415680, "step": 13110 }, { "epoch": 0.6360291220349052, "grad_norm": 0.24609375, "learning_rate": 1.6025394193745994e-05, "loss": 2.7855, "num_input_tokens_seen": 6876037120, "step": 13115 }, { "epoch": 0.6362716035911519, "grad_norm": 0.2490234375, "learning_rate": 1.6006686417775046e-05, "loss": 2.785, "num_input_tokens_seen": 6878658560, "step": 13120 }, { "epoch": 0.6365140851473985, "grad_norm": 0.2421875, "learning_rate": 1.5987984424560994e-05, "loss": 2.7609, "num_input_tokens_seen": 6881280000, "step": 13125 }, { "epoch": 0.6367565667036451, "grad_norm": 0.2421875, "learning_rate": 1.5969288226129337e-05, "loss": 2.7638, "num_input_tokens_seen": 6883901440, "step": 13130 }, { "epoch": 0.6369990482598917, "grad_norm": 0.263671875, "learning_rate": 1.5950597834501845e-05, "loss": 2.7693, "num_input_tokens_seen": 6886522880, "step": 13135 }, { "epoch": 0.6372415298161384, "grad_norm": 0.240234375, "learning_rate": 1.593191326169657e-05, "loss": 2.775, "num_input_tokens_seen": 6889144320, "step": 13140 }, { "epoch": 0.637484011372385, "grad_norm": 0.2421875, "learning_rate": 1.5913234519727783e-05, "loss": 2.7841, "num_input_tokens_seen": 6891765760, "step": 13145 }, { "epoch": 0.6377264929286316, "grad_norm": 0.23828125, "learning_rate": 1.5894561620606053e-05, "loss": 2.7787, "num_input_tokens_seen": 6894387200, "step": 13150 }, { "epoch": 0.6379689744848782, "grad_norm": 0.2412109375, "learning_rate": 1.587589457633816e-05, "loss": 2.785, "num_input_tokens_seen": 6897008640, "step": 13155 }, { "epoch": 0.6382114560411248, "grad_norm": 0.251953125, "learning_rate": 1.5857233398927136e-05, "loss": 2.7756, "num_input_tokens_seen": 6899630080, "step": 13160 }, { "epoch": 0.6384539375973715, "grad_norm": 0.2421875, "learning_rate": 1.5838578100372236e-05, "loss": 2.769, "num_input_tokens_seen": 6902251520, "step": 13165 }, { "epoch": 0.6386964191536181, "grad_norm": 0.2490234375, "learning_rate": 1.5819928692668935e-05, "loss": 2.7676, "num_input_tokens_seen": 6904872960, "step": 13170 }, { "epoch": 0.6389389007098648, "grad_norm": 0.26171875, "learning_rate": 1.5801285187808905e-05, "loss": 2.7651, "num_input_tokens_seen": 6907494400, "step": 13175 }, { "epoch": 0.6391813822661114, "grad_norm": 0.234375, "learning_rate": 1.5782647597780054e-05, "loss": 2.7674, "num_input_tokens_seen": 6910115840, "step": 13180 }, { "epoch": 0.639423863822358, "grad_norm": 0.2392578125, "learning_rate": 1.5764015934566455e-05, "loss": 2.7672, "num_input_tokens_seen": 6912737280, "step": 13185 }, { "epoch": 0.6396663453786047, "grad_norm": 0.2470703125, "learning_rate": 1.5745390210148396e-05, "loss": 2.7798, "num_input_tokens_seen": 6915358720, "step": 13190 }, { "epoch": 0.6399088269348513, "grad_norm": 0.240234375, "learning_rate": 1.5726770436502323e-05, "loss": 2.7751, "num_input_tokens_seen": 6917980160, "step": 13195 }, { "epoch": 0.6401513084910979, "grad_norm": 0.251953125, "learning_rate": 1.5708156625600885e-05, "loss": 2.7697, "num_input_tokens_seen": 6920601600, "step": 13200 }, { "epoch": 0.6401513084910979, "eval_accuracy": 0.4559745969711773, "eval_loss": 2.741948127746582, "eval_runtime": 5.8815, "eval_samples_per_second": 51.008, "eval_steps_per_second": 6.461, "num_input_tokens_seen": 6920601600, "step": 13200 }, { "epoch": 0.6403937900473445, "grad_norm": 0.251953125, "learning_rate": 1.5689548789412854e-05, "loss": 2.7679, "num_input_tokens_seen": 6923223040, "step": 13205 }, { "epoch": 0.6406362716035912, "grad_norm": 0.2431640625, "learning_rate": 1.56709469399032e-05, "loss": 2.7839, "num_input_tokens_seen": 6925844480, "step": 13210 }, { "epoch": 0.6408787531598378, "grad_norm": 0.240234375, "learning_rate": 1.5652351089033028e-05, "loss": 2.771, "num_input_tokens_seen": 6928465920, "step": 13215 }, { "epoch": 0.6411212347160844, "grad_norm": 0.24609375, "learning_rate": 1.5633761248759583e-05, "loss": 2.7905, "num_input_tokens_seen": 6931087360, "step": 13220 }, { "epoch": 0.641363716272331, "grad_norm": 0.2421875, "learning_rate": 1.561517743103625e-05, "loss": 2.7569, "num_input_tokens_seen": 6933708800, "step": 13225 }, { "epoch": 0.6416061978285776, "grad_norm": 0.2431640625, "learning_rate": 1.5596599647812543e-05, "loss": 2.7699, "num_input_tokens_seen": 6936330240, "step": 13230 }, { "epoch": 0.6418486793848243, "grad_norm": 0.24609375, "learning_rate": 1.557802791103409e-05, "loss": 2.776, "num_input_tokens_seen": 6938951680, "step": 13235 }, { "epoch": 0.6420911609410709, "grad_norm": 0.2412109375, "learning_rate": 1.555946223264263e-05, "loss": 2.7667, "num_input_tokens_seen": 6941573120, "step": 13240 }, { "epoch": 0.6423336424973175, "grad_norm": 0.248046875, "learning_rate": 1.5540902624576015e-05, "loss": 2.7752, "num_input_tokens_seen": 6944194560, "step": 13245 }, { "epoch": 0.6425761240535641, "grad_norm": 0.2431640625, "learning_rate": 1.5522349098768185e-05, "loss": 2.7761, "num_input_tokens_seen": 6946816000, "step": 13250 }, { "epoch": 0.6428186056098109, "grad_norm": 0.240234375, "learning_rate": 1.5503801667149175e-05, "loss": 2.7806, "num_input_tokens_seen": 6949437440, "step": 13255 }, { "epoch": 0.6430610871660575, "grad_norm": 0.25, "learning_rate": 1.5485260341645108e-05, "loss": 2.762, "num_input_tokens_seen": 6952058880, "step": 13260 }, { "epoch": 0.6433035687223041, "grad_norm": 0.244140625, "learning_rate": 1.546672513417817e-05, "loss": 2.7758, "num_input_tokens_seen": 6954680320, "step": 13265 }, { "epoch": 0.6435460502785507, "grad_norm": 0.24609375, "learning_rate": 1.5448196056666607e-05, "loss": 2.772, "num_input_tokens_seen": 6957301760, "step": 13270 }, { "epoch": 0.6437885318347973, "grad_norm": 0.2373046875, "learning_rate": 1.5429673121024733e-05, "loss": 2.7636, "num_input_tokens_seen": 6959923200, "step": 13275 }, { "epoch": 0.644031013391044, "grad_norm": 0.244140625, "learning_rate": 1.541115633916291e-05, "loss": 2.7764, "num_input_tokens_seen": 6962544640, "step": 13280 }, { "epoch": 0.6442734949472906, "grad_norm": 0.244140625, "learning_rate": 1.5392645722987553e-05, "loss": 2.7853, "num_input_tokens_seen": 6965166080, "step": 13285 }, { "epoch": 0.6445159765035372, "grad_norm": 0.25, "learning_rate": 1.53741412844011e-05, "loss": 2.7587, "num_input_tokens_seen": 6967787520, "step": 13290 }, { "epoch": 0.6447584580597838, "grad_norm": 0.2451171875, "learning_rate": 1.535564303530203e-05, "loss": 2.7621, "num_input_tokens_seen": 6970408960, "step": 13295 }, { "epoch": 0.6450009396160304, "grad_norm": 0.2490234375, "learning_rate": 1.533715098758481e-05, "loss": 2.7802, "num_input_tokens_seen": 6973030400, "step": 13300 }, { "epoch": 0.6452434211722771, "grad_norm": 0.2431640625, "learning_rate": 1.531866515313996e-05, "loss": 2.7959, "num_input_tokens_seen": 6975651840, "step": 13305 }, { "epoch": 0.6454859027285237, "grad_norm": 0.2470703125, "learning_rate": 1.5300185543853975e-05, "loss": 2.7798, "num_input_tokens_seen": 6978273280, "step": 13310 }, { "epoch": 0.6457283842847703, "grad_norm": 0.23828125, "learning_rate": 1.5281712171609376e-05, "loss": 2.7766, "num_input_tokens_seen": 6980894720, "step": 13315 }, { "epoch": 0.6459708658410169, "grad_norm": 0.251953125, "learning_rate": 1.5263245048284645e-05, "loss": 2.765, "num_input_tokens_seen": 6983516160, "step": 13320 }, { "epoch": 0.6462133473972635, "grad_norm": 0.23828125, "learning_rate": 1.524478418575427e-05, "loss": 2.7727, "num_input_tokens_seen": 6986137600, "step": 13325 }, { "epoch": 0.6464558289535102, "grad_norm": 0.2490234375, "learning_rate": 1.5226329595888683e-05, "loss": 2.7955, "num_input_tokens_seen": 6988759040, "step": 13330 }, { "epoch": 0.6466983105097569, "grad_norm": 0.244140625, "learning_rate": 1.5207881290554307e-05, "loss": 2.7674, "num_input_tokens_seen": 6991380480, "step": 13335 }, { "epoch": 0.6469407920660035, "grad_norm": 0.24609375, "learning_rate": 1.5189439281613524e-05, "loss": 2.7589, "num_input_tokens_seen": 6994001920, "step": 13340 }, { "epoch": 0.6471832736222501, "grad_norm": 0.2431640625, "learning_rate": 1.517100358092466e-05, "loss": 2.7761, "num_input_tokens_seen": 6996623360, "step": 13345 }, { "epoch": 0.6474257551784968, "grad_norm": 0.240234375, "learning_rate": 1.515257420034198e-05, "loss": 2.7635, "num_input_tokens_seen": 6999244800, "step": 13350 }, { "epoch": 0.6476682367347434, "grad_norm": 0.244140625, "learning_rate": 1.5134151151715702e-05, "loss": 2.7863, "num_input_tokens_seen": 7001866240, "step": 13355 }, { "epoch": 0.64791071829099, "grad_norm": 0.2470703125, "learning_rate": 1.5115734446891943e-05, "loss": 2.7819, "num_input_tokens_seen": 7004487680, "step": 13360 }, { "epoch": 0.6481531998472366, "grad_norm": 0.24609375, "learning_rate": 1.5097324097712778e-05, "loss": 2.7658, "num_input_tokens_seen": 7007109120, "step": 13365 }, { "epoch": 0.6483956814034832, "grad_norm": 0.248046875, "learning_rate": 1.5078920116016165e-05, "loss": 2.7801, "num_input_tokens_seen": 7009730560, "step": 13370 }, { "epoch": 0.6486381629597299, "grad_norm": 0.251953125, "learning_rate": 1.5060522513635986e-05, "loss": 2.7734, "num_input_tokens_seen": 7012352000, "step": 13375 }, { "epoch": 0.6488806445159765, "grad_norm": 0.2490234375, "learning_rate": 1.5042131302402013e-05, "loss": 2.7738, "num_input_tokens_seen": 7014973440, "step": 13380 }, { "epoch": 0.6491231260722231, "grad_norm": 0.248046875, "learning_rate": 1.5023746494139915e-05, "loss": 2.7794, "num_input_tokens_seen": 7017594880, "step": 13385 }, { "epoch": 0.6493656076284697, "grad_norm": 0.2431640625, "learning_rate": 1.5005368100671219e-05, "loss": 2.7745, "num_input_tokens_seen": 7020216320, "step": 13390 }, { "epoch": 0.6496080891847164, "grad_norm": 0.240234375, "learning_rate": 1.4986996133813367e-05, "loss": 2.776, "num_input_tokens_seen": 7022837760, "step": 13395 }, { "epoch": 0.649850570740963, "grad_norm": 0.2431640625, "learning_rate": 1.4968630605379641e-05, "loss": 2.7723, "num_input_tokens_seen": 7025459200, "step": 13400 }, { "epoch": 0.6500930522972096, "grad_norm": 0.2373046875, "learning_rate": 1.495027152717919e-05, "loss": 2.7722, "num_input_tokens_seen": 7028080640, "step": 13405 }, { "epoch": 0.6503355338534563, "grad_norm": 0.2431640625, "learning_rate": 1.4931918911017023e-05, "loss": 2.7961, "num_input_tokens_seen": 7030702080, "step": 13410 }, { "epoch": 0.650578015409703, "grad_norm": 0.25, "learning_rate": 1.491357276869398e-05, "loss": 2.7773, "num_input_tokens_seen": 7033323520, "step": 13415 }, { "epoch": 0.6508204969659496, "grad_norm": 0.2421875, "learning_rate": 1.4895233112006749e-05, "loss": 2.7686, "num_input_tokens_seen": 7035944960, "step": 13420 }, { "epoch": 0.6510629785221962, "grad_norm": 0.24609375, "learning_rate": 1.4876899952747838e-05, "loss": 2.7715, "num_input_tokens_seen": 7038566400, "step": 13425 }, { "epoch": 0.6513054600784428, "grad_norm": 0.244140625, "learning_rate": 1.4858573302705592e-05, "loss": 2.7805, "num_input_tokens_seen": 7041187840, "step": 13430 }, { "epoch": 0.6515479416346894, "grad_norm": 0.2421875, "learning_rate": 1.4840253173664154e-05, "loss": 2.7632, "num_input_tokens_seen": 7043809280, "step": 13435 }, { "epoch": 0.651790423190936, "grad_norm": 0.2412109375, "learning_rate": 1.4821939577403483e-05, "loss": 2.7731, "num_input_tokens_seen": 7046430720, "step": 13440 }, { "epoch": 0.6520329047471827, "grad_norm": 0.2451171875, "learning_rate": 1.4803632525699338e-05, "loss": 2.7714, "num_input_tokens_seen": 7049052160, "step": 13445 }, { "epoch": 0.6522753863034293, "grad_norm": 0.2392578125, "learning_rate": 1.4785332030323273e-05, "loss": 2.7744, "num_input_tokens_seen": 7051673600, "step": 13450 }, { "epoch": 0.6525178678596759, "grad_norm": 0.2421875, "learning_rate": 1.4767038103042613e-05, "loss": 2.7845, "num_input_tokens_seen": 7054295040, "step": 13455 }, { "epoch": 0.6527603494159225, "grad_norm": 0.2431640625, "learning_rate": 1.4748750755620466e-05, "loss": 2.7849, "num_input_tokens_seen": 7056916480, "step": 13460 }, { "epoch": 0.6530028309721692, "grad_norm": 0.251953125, "learning_rate": 1.4730469999815716e-05, "loss": 2.7664, "num_input_tokens_seen": 7059537920, "step": 13465 }, { "epoch": 0.6532453125284158, "grad_norm": 0.25, "learning_rate": 1.4712195847383003e-05, "loss": 2.7801, "num_input_tokens_seen": 7062159360, "step": 13470 }, { "epoch": 0.6534877940846624, "grad_norm": 0.240234375, "learning_rate": 1.4693928310072719e-05, "loss": 2.7584, "num_input_tokens_seen": 7064780800, "step": 13475 }, { "epoch": 0.653730275640909, "grad_norm": 0.244140625, "learning_rate": 1.4675667399631012e-05, "loss": 2.7703, "num_input_tokens_seen": 7067402240, "step": 13480 }, { "epoch": 0.6539727571971556, "grad_norm": 0.2431640625, "learning_rate": 1.4657413127799752e-05, "loss": 2.7785, "num_input_tokens_seen": 7070023680, "step": 13485 }, { "epoch": 0.6542152387534024, "grad_norm": 0.2431640625, "learning_rate": 1.4639165506316554e-05, "loss": 2.781, "num_input_tokens_seen": 7072645120, "step": 13490 }, { "epoch": 0.654457720309649, "grad_norm": 0.2431640625, "learning_rate": 1.4620924546914749e-05, "loss": 2.7606, "num_input_tokens_seen": 7075266560, "step": 13495 }, { "epoch": 0.6547002018658956, "grad_norm": 0.265625, "learning_rate": 1.4602690261323399e-05, "loss": 2.7689, "num_input_tokens_seen": 7077888000, "step": 13500 }, { "epoch": 0.6547002018658956, "eval_accuracy": 0.45599088096401236, "eval_loss": 2.7419145107269287, "eval_runtime": 5.8941, "eval_samples_per_second": 50.898, "eval_steps_per_second": 6.447, "num_input_tokens_seen": 7077888000, "step": 13500 }, { "epoch": 0.6549426834221422, "grad_norm": 0.25390625, "learning_rate": 1.4584462661267251e-05, "loss": 2.7809, "num_input_tokens_seen": 7080509440, "step": 13505 }, { "epoch": 0.6551851649783889, "grad_norm": 0.2431640625, "learning_rate": 1.456624175846678e-05, "loss": 2.7776, "num_input_tokens_seen": 7083130880, "step": 13510 }, { "epoch": 0.6554276465346355, "grad_norm": 0.2412109375, "learning_rate": 1.4548027564638125e-05, "loss": 2.7563, "num_input_tokens_seen": 7085752320, "step": 13515 }, { "epoch": 0.6556701280908821, "grad_norm": 0.2451171875, "learning_rate": 1.4529820091493123e-05, "loss": 2.7728, "num_input_tokens_seen": 7088373760, "step": 13520 }, { "epoch": 0.6559126096471287, "grad_norm": 0.2412109375, "learning_rate": 1.4511619350739313e-05, "loss": 2.7903, "num_input_tokens_seen": 7090995200, "step": 13525 }, { "epoch": 0.6561550912033753, "grad_norm": 0.2451171875, "learning_rate": 1.4493425354079876e-05, "loss": 2.7614, "num_input_tokens_seen": 7093616640, "step": 13530 }, { "epoch": 0.656397572759622, "grad_norm": 0.2431640625, "learning_rate": 1.4475238113213662e-05, "loss": 2.7707, "num_input_tokens_seen": 7096238080, "step": 13535 }, { "epoch": 0.6566400543158686, "grad_norm": 0.2421875, "learning_rate": 1.4457057639835197e-05, "loss": 2.7766, "num_input_tokens_seen": 7098859520, "step": 13540 }, { "epoch": 0.6568825358721152, "grad_norm": 0.2421875, "learning_rate": 1.4438883945634618e-05, "loss": 2.7718, "num_input_tokens_seen": 7101480960, "step": 13545 }, { "epoch": 0.6571250174283618, "grad_norm": 0.2421875, "learning_rate": 1.4420717042297727e-05, "loss": 2.7778, "num_input_tokens_seen": 7104102400, "step": 13550 }, { "epoch": 0.6573674989846084, "grad_norm": 0.2470703125, "learning_rate": 1.4402556941505969e-05, "loss": 2.7788, "num_input_tokens_seen": 7106723840, "step": 13555 }, { "epoch": 0.6576099805408551, "grad_norm": 0.2451171875, "learning_rate": 1.4384403654936387e-05, "loss": 2.779, "num_input_tokens_seen": 7109345280, "step": 13560 }, { "epoch": 0.6578524620971017, "grad_norm": 0.24609375, "learning_rate": 1.4366257194261671e-05, "loss": 2.7803, "num_input_tokens_seen": 7111966720, "step": 13565 }, { "epoch": 0.6580949436533484, "grad_norm": 0.2373046875, "learning_rate": 1.4348117571150102e-05, "loss": 2.7755, "num_input_tokens_seen": 7114588160, "step": 13570 }, { "epoch": 0.658337425209595, "grad_norm": 0.2451171875, "learning_rate": 1.4329984797265572e-05, "loss": 2.7782, "num_input_tokens_seen": 7117209600, "step": 13575 }, { "epoch": 0.6585799067658417, "grad_norm": 0.248046875, "learning_rate": 1.431185888426757e-05, "loss": 2.7722, "num_input_tokens_seen": 7119831040, "step": 13580 }, { "epoch": 0.6588223883220883, "grad_norm": 0.248046875, "learning_rate": 1.4293739843811171e-05, "loss": 2.7824, "num_input_tokens_seen": 7122452480, "step": 13585 }, { "epoch": 0.6590648698783349, "grad_norm": 0.24609375, "learning_rate": 1.4275627687547028e-05, "loss": 2.7731, "num_input_tokens_seen": 7125073920, "step": 13590 }, { "epoch": 0.6593073514345815, "grad_norm": 0.2431640625, "learning_rate": 1.4257522427121379e-05, "loss": 2.7752, "num_input_tokens_seen": 7127695360, "step": 13595 }, { "epoch": 0.6595498329908281, "grad_norm": 0.2431640625, "learning_rate": 1.4239424074176009e-05, "loss": 2.7782, "num_input_tokens_seen": 7130316800, "step": 13600 }, { "epoch": 0.6597923145470748, "grad_norm": 0.240234375, "learning_rate": 1.422133264034829e-05, "loss": 2.7719, "num_input_tokens_seen": 7132938240, "step": 13605 }, { "epoch": 0.6600347961033214, "grad_norm": 0.25, "learning_rate": 1.4203248137271102e-05, "loss": 2.7672, "num_input_tokens_seen": 7135559680, "step": 13610 }, { "epoch": 0.660277277659568, "grad_norm": 0.2412109375, "learning_rate": 1.4185170576572907e-05, "loss": 2.783, "num_input_tokens_seen": 7138181120, "step": 13615 }, { "epoch": 0.6605197592158146, "grad_norm": 0.244140625, "learning_rate": 1.416709996987769e-05, "loss": 2.7854, "num_input_tokens_seen": 7140802560, "step": 13620 }, { "epoch": 0.6607622407720612, "grad_norm": 0.248046875, "learning_rate": 1.414903632880496e-05, "loss": 2.7667, "num_input_tokens_seen": 7143424000, "step": 13625 }, { "epoch": 0.6610047223283079, "grad_norm": 0.251953125, "learning_rate": 1.4130979664969756e-05, "loss": 2.7828, "num_input_tokens_seen": 7146045440, "step": 13630 }, { "epoch": 0.6612472038845545, "grad_norm": 0.248046875, "learning_rate": 1.4112929989982623e-05, "loss": 2.7924, "num_input_tokens_seen": 7148666880, "step": 13635 }, { "epoch": 0.6614896854408011, "grad_norm": 0.251953125, "learning_rate": 1.4094887315449617e-05, "loss": 2.7629, "num_input_tokens_seen": 7151288320, "step": 13640 }, { "epoch": 0.6617321669970477, "grad_norm": 0.234375, "learning_rate": 1.407685165297229e-05, "loss": 2.7658, "num_input_tokens_seen": 7153909760, "step": 13645 }, { "epoch": 0.6619746485532945, "grad_norm": 0.2373046875, "learning_rate": 1.4058823014147683e-05, "loss": 2.7697, "num_input_tokens_seen": 7156531200, "step": 13650 }, { "epoch": 0.6622171301095411, "grad_norm": 0.2490234375, "learning_rate": 1.4040801410568327e-05, "loss": 2.7729, "num_input_tokens_seen": 7159152640, "step": 13655 }, { "epoch": 0.6624596116657877, "grad_norm": 0.2421875, "learning_rate": 1.4022786853822224e-05, "loss": 2.762, "num_input_tokens_seen": 7161774080, "step": 13660 }, { "epoch": 0.6627020932220343, "grad_norm": 0.2470703125, "learning_rate": 1.4004779355492858e-05, "loss": 2.7642, "num_input_tokens_seen": 7164395520, "step": 13665 }, { "epoch": 0.662944574778281, "grad_norm": 0.2451171875, "learning_rate": 1.3986778927159141e-05, "loss": 2.7636, "num_input_tokens_seen": 7167016960, "step": 13670 }, { "epoch": 0.6631870563345276, "grad_norm": 0.2451171875, "learning_rate": 1.3968785580395474e-05, "loss": 2.7879, "num_input_tokens_seen": 7169638400, "step": 13675 }, { "epoch": 0.6634295378907742, "grad_norm": 0.25, "learning_rate": 1.395079932677169e-05, "loss": 2.7893, "num_input_tokens_seen": 7172259840, "step": 13680 }, { "epoch": 0.6636720194470208, "grad_norm": 0.2470703125, "learning_rate": 1.3932820177853063e-05, "loss": 2.7727, "num_input_tokens_seen": 7174881280, "step": 13685 }, { "epoch": 0.6639145010032674, "grad_norm": 0.25, "learning_rate": 1.3914848145200293e-05, "loss": 2.7701, "num_input_tokens_seen": 7177502720, "step": 13690 }, { "epoch": 0.664156982559514, "grad_norm": 0.2431640625, "learning_rate": 1.3896883240369518e-05, "loss": 2.7756, "num_input_tokens_seen": 7180124160, "step": 13695 }, { "epoch": 0.6643994641157607, "grad_norm": 0.2490234375, "learning_rate": 1.3878925474912283e-05, "loss": 2.7799, "num_input_tokens_seen": 7182745600, "step": 13700 }, { "epoch": 0.6646419456720073, "grad_norm": 0.2421875, "learning_rate": 1.3860974860375536e-05, "loss": 2.7682, "num_input_tokens_seen": 7185367040, "step": 13705 }, { "epoch": 0.6648844272282539, "grad_norm": 0.2451171875, "learning_rate": 1.3843031408301644e-05, "loss": 2.7735, "num_input_tokens_seen": 7187988480, "step": 13710 }, { "epoch": 0.6651269087845005, "grad_norm": 0.240234375, "learning_rate": 1.382509513022835e-05, "loss": 2.7747, "num_input_tokens_seen": 7190609920, "step": 13715 }, { "epoch": 0.6653693903407472, "grad_norm": 0.2431640625, "learning_rate": 1.3807166037688801e-05, "loss": 2.7777, "num_input_tokens_seen": 7193231360, "step": 13720 }, { "epoch": 0.6656118718969939, "grad_norm": 0.2412109375, "learning_rate": 1.3789244142211511e-05, "loss": 2.772, "num_input_tokens_seen": 7195852800, "step": 13725 }, { "epoch": 0.6658543534532405, "grad_norm": 0.2431640625, "learning_rate": 1.3771329455320381e-05, "loss": 2.7602, "num_input_tokens_seen": 7198474240, "step": 13730 }, { "epoch": 0.6660968350094871, "grad_norm": 0.2451171875, "learning_rate": 1.3753421988534648e-05, "loss": 2.7839, "num_input_tokens_seen": 7201095680, "step": 13735 }, { "epoch": 0.6663393165657338, "grad_norm": 0.2451171875, "learning_rate": 1.3735521753368932e-05, "loss": 2.7655, "num_input_tokens_seen": 7203717120, "step": 13740 }, { "epoch": 0.6665817981219804, "grad_norm": 0.2490234375, "learning_rate": 1.3717628761333202e-05, "loss": 2.7761, "num_input_tokens_seen": 7206338560, "step": 13745 }, { "epoch": 0.666824279678227, "grad_norm": 0.23828125, "learning_rate": 1.3699743023932751e-05, "loss": 2.7763, "num_input_tokens_seen": 7208960000, "step": 13750 }, { "epoch": 0.6670667612344736, "grad_norm": 0.244140625, "learning_rate": 1.3681864552668239e-05, "loss": 2.7852, "num_input_tokens_seen": 7211581440, "step": 13755 }, { "epoch": 0.6673092427907202, "grad_norm": 0.2451171875, "learning_rate": 1.3663993359035637e-05, "loss": 2.7769, "num_input_tokens_seen": 7214202880, "step": 13760 }, { "epoch": 0.6675517243469669, "grad_norm": 0.244140625, "learning_rate": 1.3646129454526213e-05, "loss": 2.7822, "num_input_tokens_seen": 7216824320, "step": 13765 }, { "epoch": 0.6677942059032135, "grad_norm": 0.2421875, "learning_rate": 1.3628272850626577e-05, "loss": 2.7662, "num_input_tokens_seen": 7219445760, "step": 13770 }, { "epoch": 0.6680366874594601, "grad_norm": 0.23828125, "learning_rate": 1.361042355881864e-05, "loss": 2.781, "num_input_tokens_seen": 7222067200, "step": 13775 }, { "epoch": 0.6682791690157067, "grad_norm": 0.240234375, "learning_rate": 1.3592581590579608e-05, "loss": 2.776, "num_input_tokens_seen": 7224688640, "step": 13780 }, { "epoch": 0.6685216505719533, "grad_norm": 0.2412109375, "learning_rate": 1.3574746957381979e-05, "loss": 2.7814, "num_input_tokens_seen": 7227310080, "step": 13785 }, { "epoch": 0.6687641321282, "grad_norm": 0.2392578125, "learning_rate": 1.3556919670693541e-05, "loss": 2.7791, "num_input_tokens_seen": 7229931520, "step": 13790 }, { "epoch": 0.6690066136844466, "grad_norm": 0.248046875, "learning_rate": 1.3539099741977334e-05, "loss": 2.7707, "num_input_tokens_seen": 7232552960, "step": 13795 }, { "epoch": 0.6692490952406932, "grad_norm": 0.2412109375, "learning_rate": 1.3521287182691695e-05, "loss": 2.7747, "num_input_tokens_seen": 7235174400, "step": 13800 }, { "epoch": 0.6692490952406932, "eval_accuracy": 0.45594365738479076, "eval_loss": 2.741860866546631, "eval_runtime": 5.8127, "eval_samples_per_second": 51.611, "eval_steps_per_second": 6.537, "num_input_tokens_seen": 7235174400, "step": 13800 }, { "epoch": 0.6694915767969399, "grad_norm": 0.248046875, "learning_rate": 1.3503482004290194e-05, "loss": 2.7805, "num_input_tokens_seen": 7237795840, "step": 13805 }, { "epoch": 0.6697340583531866, "grad_norm": 0.2421875, "learning_rate": 1.3485684218221694e-05, "loss": 2.7735, "num_input_tokens_seen": 7240417280, "step": 13810 }, { "epoch": 0.6699765399094332, "grad_norm": 0.2431640625, "learning_rate": 1.3467893835930281e-05, "loss": 2.7828, "num_input_tokens_seen": 7243038720, "step": 13815 }, { "epoch": 0.6702190214656798, "grad_norm": 0.25390625, "learning_rate": 1.3450110868855283e-05, "loss": 2.7697, "num_input_tokens_seen": 7245660160, "step": 13820 }, { "epoch": 0.6704615030219264, "grad_norm": 0.2412109375, "learning_rate": 1.3432335328431244e-05, "loss": 2.7886, "num_input_tokens_seen": 7248281600, "step": 13825 }, { "epoch": 0.670703984578173, "grad_norm": 0.2431640625, "learning_rate": 1.3414567226087954e-05, "loss": 2.7661, "num_input_tokens_seen": 7250903040, "step": 13830 }, { "epoch": 0.6709464661344197, "grad_norm": 0.2470703125, "learning_rate": 1.3396806573250418e-05, "loss": 2.7697, "num_input_tokens_seen": 7253524480, "step": 13835 }, { "epoch": 0.6711889476906663, "grad_norm": 0.259765625, "learning_rate": 1.337905338133884e-05, "loss": 2.7628, "num_input_tokens_seen": 7256145920, "step": 13840 }, { "epoch": 0.6714314292469129, "grad_norm": 0.23828125, "learning_rate": 1.3361307661768647e-05, "loss": 2.7721, "num_input_tokens_seen": 7258767360, "step": 13845 }, { "epoch": 0.6716739108031595, "grad_norm": 0.2470703125, "learning_rate": 1.3343569425950442e-05, "loss": 2.7688, "num_input_tokens_seen": 7261388800, "step": 13850 }, { "epoch": 0.6719163923594061, "grad_norm": 0.248046875, "learning_rate": 1.3325838685289998e-05, "loss": 2.7698, "num_input_tokens_seen": 7264010240, "step": 13855 }, { "epoch": 0.6721588739156528, "grad_norm": 0.2392578125, "learning_rate": 1.3308115451188327e-05, "loss": 2.7731, "num_input_tokens_seen": 7266631680, "step": 13860 }, { "epoch": 0.6724013554718994, "grad_norm": 0.23828125, "learning_rate": 1.3290399735041564e-05, "loss": 2.7781, "num_input_tokens_seen": 7269253120, "step": 13865 }, { "epoch": 0.672643837028146, "grad_norm": 0.2412109375, "learning_rate": 1.3272691548241023e-05, "loss": 2.7817, "num_input_tokens_seen": 7271874560, "step": 13870 }, { "epoch": 0.6728863185843926, "grad_norm": 0.2431640625, "learning_rate": 1.3254990902173186e-05, "loss": 2.7869, "num_input_tokens_seen": 7274496000, "step": 13875 }, { "epoch": 0.6731288001406392, "grad_norm": 0.23828125, "learning_rate": 1.3237297808219676e-05, "loss": 2.7643, "num_input_tokens_seen": 7277117440, "step": 13880 }, { "epoch": 0.673371281696886, "grad_norm": 0.236328125, "learning_rate": 1.3219612277757271e-05, "loss": 2.7731, "num_input_tokens_seen": 7279738880, "step": 13885 }, { "epoch": 0.6736137632531326, "grad_norm": 0.2412109375, "learning_rate": 1.3201934322157861e-05, "loss": 2.7812, "num_input_tokens_seen": 7282360320, "step": 13890 }, { "epoch": 0.6738562448093792, "grad_norm": 0.2373046875, "learning_rate": 1.318426395278849e-05, "loss": 2.7731, "num_input_tokens_seen": 7284981760, "step": 13895 }, { "epoch": 0.6740987263656258, "grad_norm": 0.244140625, "learning_rate": 1.3166601181011312e-05, "loss": 2.7771, "num_input_tokens_seen": 7287603200, "step": 13900 }, { "epoch": 0.6743412079218725, "grad_norm": 0.2431640625, "learning_rate": 1.3148946018183612e-05, "loss": 2.7676, "num_input_tokens_seen": 7290224640, "step": 13905 }, { "epoch": 0.6745836894781191, "grad_norm": 0.244140625, "learning_rate": 1.3131298475657755e-05, "loss": 2.7656, "num_input_tokens_seen": 7292846080, "step": 13910 }, { "epoch": 0.6748261710343657, "grad_norm": 0.2421875, "learning_rate": 1.3113658564781233e-05, "loss": 2.77, "num_input_tokens_seen": 7295467520, "step": 13915 }, { "epoch": 0.6750686525906123, "grad_norm": 0.23828125, "learning_rate": 1.3096026296896612e-05, "loss": 2.7662, "num_input_tokens_seen": 7298088960, "step": 13920 }, { "epoch": 0.675311134146859, "grad_norm": 0.2490234375, "learning_rate": 1.3078401683341554e-05, "loss": 2.7743, "num_input_tokens_seen": 7300710400, "step": 13925 }, { "epoch": 0.6755536157031056, "grad_norm": 0.25, "learning_rate": 1.3060784735448794e-05, "loss": 2.7882, "num_input_tokens_seen": 7303331840, "step": 13930 }, { "epoch": 0.6757960972593522, "grad_norm": 0.2451171875, "learning_rate": 1.3043175464546142e-05, "loss": 2.7775, "num_input_tokens_seen": 7305953280, "step": 13935 }, { "epoch": 0.6760385788155988, "grad_norm": 0.2470703125, "learning_rate": 1.302557388195647e-05, "loss": 2.7761, "num_input_tokens_seen": 7308574720, "step": 13940 }, { "epoch": 0.6762810603718454, "grad_norm": 0.24609375, "learning_rate": 1.3007979998997711e-05, "loss": 2.7898, "num_input_tokens_seen": 7311196160, "step": 13945 }, { "epoch": 0.676523541928092, "grad_norm": 0.240234375, "learning_rate": 1.2990393826982828e-05, "loss": 2.769, "num_input_tokens_seen": 7313817600, "step": 13950 }, { "epoch": 0.6767660234843387, "grad_norm": 0.240234375, "learning_rate": 1.2972815377219843e-05, "loss": 2.771, "num_input_tokens_seen": 7316439040, "step": 13955 }, { "epoch": 0.6770085050405853, "grad_norm": 0.2412109375, "learning_rate": 1.2955244661011811e-05, "loss": 2.7778, "num_input_tokens_seen": 7319060480, "step": 13960 }, { "epoch": 0.677250986596832, "grad_norm": 0.2421875, "learning_rate": 1.2937681689656817e-05, "loss": 2.7733, "num_input_tokens_seen": 7321681920, "step": 13965 }, { "epoch": 0.6774934681530786, "grad_norm": 0.2412109375, "learning_rate": 1.2920126474447958e-05, "loss": 2.7849, "num_input_tokens_seen": 7324303360, "step": 13970 }, { "epoch": 0.6777359497093253, "grad_norm": 0.2431640625, "learning_rate": 1.2902579026673345e-05, "loss": 2.7883, "num_input_tokens_seen": 7326924800, "step": 13975 }, { "epoch": 0.6779784312655719, "grad_norm": 0.2431640625, "learning_rate": 1.2885039357616102e-05, "loss": 2.7775, "num_input_tokens_seen": 7329546240, "step": 13980 }, { "epoch": 0.6782209128218185, "grad_norm": 0.2412109375, "learning_rate": 1.2867507478554341e-05, "loss": 2.7754, "num_input_tokens_seen": 7332167680, "step": 13985 }, { "epoch": 0.6784633943780651, "grad_norm": 0.248046875, "learning_rate": 1.2849983400761173e-05, "loss": 2.7754, "num_input_tokens_seen": 7334789120, "step": 13990 }, { "epoch": 0.6787058759343118, "grad_norm": 0.2421875, "learning_rate": 1.283246713550469e-05, "loss": 2.7898, "num_input_tokens_seen": 7337410560, "step": 13995 }, { "epoch": 0.6789483574905584, "grad_norm": 0.244140625, "learning_rate": 1.2814958694047955e-05, "loss": 2.7864, "num_input_tokens_seen": 7340032000, "step": 14000 }, { "epoch": 0.679190839046805, "grad_norm": 0.2490234375, "learning_rate": 1.2797458087649022e-05, "loss": 2.7635, "num_input_tokens_seen": 7342653440, "step": 14005 }, { "epoch": 0.6794333206030516, "grad_norm": 0.2392578125, "learning_rate": 1.2779965327560867e-05, "loss": 2.7807, "num_input_tokens_seen": 7345274880, "step": 14010 }, { "epoch": 0.6796758021592982, "grad_norm": 0.2431640625, "learning_rate": 1.2762480425031454e-05, "loss": 2.7861, "num_input_tokens_seen": 7347896320, "step": 14015 }, { "epoch": 0.6799182837155449, "grad_norm": 0.244140625, "learning_rate": 1.2745003391303684e-05, "loss": 2.787, "num_input_tokens_seen": 7350517760, "step": 14020 }, { "epoch": 0.6801607652717915, "grad_norm": 0.244140625, "learning_rate": 1.2727534237615404e-05, "loss": 2.7763, "num_input_tokens_seen": 7353139200, "step": 14025 }, { "epoch": 0.6804032468280381, "grad_norm": 0.2490234375, "learning_rate": 1.2710072975199383e-05, "loss": 2.7791, "num_input_tokens_seen": 7355760640, "step": 14030 }, { "epoch": 0.6806457283842847, "grad_norm": 0.2412109375, "learning_rate": 1.2692619615283318e-05, "loss": 2.767, "num_input_tokens_seen": 7358382080, "step": 14035 }, { "epoch": 0.6808882099405315, "grad_norm": 0.2421875, "learning_rate": 1.2675174169089854e-05, "loss": 2.762, "num_input_tokens_seen": 7361003520, "step": 14040 }, { "epoch": 0.6811306914967781, "grad_norm": 0.2431640625, "learning_rate": 1.2657736647836491e-05, "loss": 2.7783, "num_input_tokens_seen": 7363624960, "step": 14045 }, { "epoch": 0.6813731730530247, "grad_norm": 0.240234375, "learning_rate": 1.2640307062735679e-05, "loss": 2.7651, "num_input_tokens_seen": 7366246400, "step": 14050 }, { "epoch": 0.6816156546092713, "grad_norm": 0.2421875, "learning_rate": 1.2622885424994746e-05, "loss": 2.7771, "num_input_tokens_seen": 7368867840, "step": 14055 }, { "epoch": 0.6818581361655179, "grad_norm": 0.2373046875, "learning_rate": 1.2605471745815917e-05, "loss": 2.7671, "num_input_tokens_seen": 7371489280, "step": 14060 }, { "epoch": 0.6821006177217646, "grad_norm": 0.2421875, "learning_rate": 1.2588066036396292e-05, "loss": 2.7893, "num_input_tokens_seen": 7374110720, "step": 14065 }, { "epoch": 0.6823430992780112, "grad_norm": 0.240234375, "learning_rate": 1.2570668307927868e-05, "loss": 2.7845, "num_input_tokens_seen": 7376732160, "step": 14070 }, { "epoch": 0.6825855808342578, "grad_norm": 0.2470703125, "learning_rate": 1.2553278571597467e-05, "loss": 2.772, "num_input_tokens_seen": 7379353600, "step": 14075 }, { "epoch": 0.6828280623905044, "grad_norm": 0.255859375, "learning_rate": 1.2535896838586813e-05, "loss": 2.7774, "num_input_tokens_seen": 7381975040, "step": 14080 }, { "epoch": 0.683070543946751, "grad_norm": 0.244140625, "learning_rate": 1.2518523120072467e-05, "loss": 2.7781, "num_input_tokens_seen": 7384596480, "step": 14085 }, { "epoch": 0.6833130255029977, "grad_norm": 0.2373046875, "learning_rate": 1.250115742722583e-05, "loss": 2.7711, "num_input_tokens_seen": 7387217920, "step": 14090 }, { "epoch": 0.6835555070592443, "grad_norm": 0.2421875, "learning_rate": 1.2483799771213168e-05, "loss": 2.7704, "num_input_tokens_seen": 7389839360, "step": 14095 }, { "epoch": 0.6837979886154909, "grad_norm": 0.236328125, "learning_rate": 1.2466450163195564e-05, "loss": 2.786, "num_input_tokens_seen": 7392460800, "step": 14100 }, { "epoch": 0.6837979886154909, "eval_accuracy": 0.4560804429246051, "eval_loss": 2.7417774200439453, "eval_runtime": 5.8578, "eval_samples_per_second": 51.213, "eval_steps_per_second": 6.487, "num_input_tokens_seen": 7392460800, "step": 14100 }, { "epoch": 0.6840404701717375, "grad_norm": 0.2451171875, "learning_rate": 1.2449108614328905e-05, "loss": 2.7715, "num_input_tokens_seen": 7395082240, "step": 14105 }, { "epoch": 0.6842829517279841, "grad_norm": 0.2431640625, "learning_rate": 1.2431775135763927e-05, "loss": 2.7642, "num_input_tokens_seen": 7397703680, "step": 14110 }, { "epoch": 0.6845254332842308, "grad_norm": 0.2392578125, "learning_rate": 1.241444973864616e-05, "loss": 2.7718, "num_input_tokens_seen": 7400325120, "step": 14115 }, { "epoch": 0.6847679148404775, "grad_norm": 0.2412109375, "learning_rate": 1.2397132434115952e-05, "loss": 2.7672, "num_input_tokens_seen": 7402946560, "step": 14120 }, { "epoch": 0.6850103963967241, "grad_norm": 0.24609375, "learning_rate": 1.2379823233308426e-05, "loss": 2.7775, "num_input_tokens_seen": 7405568000, "step": 14125 }, { "epoch": 0.6852528779529707, "grad_norm": 0.23828125, "learning_rate": 1.2362522147353525e-05, "loss": 2.7756, "num_input_tokens_seen": 7408189440, "step": 14130 }, { "epoch": 0.6854953595092174, "grad_norm": 0.2470703125, "learning_rate": 1.2345229187375934e-05, "loss": 2.7837, "num_input_tokens_seen": 7410810880, "step": 14135 }, { "epoch": 0.685737841065464, "grad_norm": 0.2421875, "learning_rate": 1.2327944364495133e-05, "loss": 2.7667, "num_input_tokens_seen": 7413432320, "step": 14140 }, { "epoch": 0.6859803226217106, "grad_norm": 0.2412109375, "learning_rate": 1.2310667689825393e-05, "loss": 2.7667, "num_input_tokens_seen": 7416053760, "step": 14145 }, { "epoch": 0.6862228041779572, "grad_norm": 0.2470703125, "learning_rate": 1.229339917447571e-05, "loss": 2.7726, "num_input_tokens_seen": 7418675200, "step": 14150 }, { "epoch": 0.6864652857342038, "grad_norm": 0.2392578125, "learning_rate": 1.2276138829549852e-05, "loss": 2.781, "num_input_tokens_seen": 7421296640, "step": 14155 }, { "epoch": 0.6867077672904505, "grad_norm": 0.244140625, "learning_rate": 1.2258886666146336e-05, "loss": 2.7706, "num_input_tokens_seen": 7423918080, "step": 14160 }, { "epoch": 0.6869502488466971, "grad_norm": 0.24609375, "learning_rate": 1.2241642695358391e-05, "loss": 2.7736, "num_input_tokens_seen": 7426539520, "step": 14165 }, { "epoch": 0.6871927304029437, "grad_norm": 0.248046875, "learning_rate": 1.2224406928274013e-05, "loss": 2.7664, "num_input_tokens_seen": 7429160960, "step": 14170 }, { "epoch": 0.6874352119591903, "grad_norm": 0.251953125, "learning_rate": 1.2207179375975899e-05, "loss": 2.7743, "num_input_tokens_seen": 7431782400, "step": 14175 }, { "epoch": 0.687677693515437, "grad_norm": 0.24609375, "learning_rate": 1.2189960049541482e-05, "loss": 2.7699, "num_input_tokens_seen": 7434403840, "step": 14180 }, { "epoch": 0.6879201750716836, "grad_norm": 0.2490234375, "learning_rate": 1.217274896004289e-05, "loss": 2.7784, "num_input_tokens_seen": 7437025280, "step": 14185 }, { "epoch": 0.6881626566279302, "grad_norm": 0.2451171875, "learning_rate": 1.2155546118546965e-05, "loss": 2.7813, "num_input_tokens_seen": 7439646720, "step": 14190 }, { "epoch": 0.6884051381841768, "grad_norm": 0.244140625, "learning_rate": 1.2138351536115238e-05, "loss": 2.7777, "num_input_tokens_seen": 7442268160, "step": 14195 }, { "epoch": 0.6886476197404235, "grad_norm": 0.2578125, "learning_rate": 1.212116522380394e-05, "loss": 2.7671, "num_input_tokens_seen": 7444889600, "step": 14200 }, { "epoch": 0.6888901012966702, "grad_norm": 0.2421875, "learning_rate": 1.210398719266397e-05, "loss": 2.7755, "num_input_tokens_seen": 7447511040, "step": 14205 }, { "epoch": 0.6891325828529168, "grad_norm": 0.248046875, "learning_rate": 1.2086817453740914e-05, "loss": 2.7646, "num_input_tokens_seen": 7450132480, "step": 14210 }, { "epoch": 0.6893750644091634, "grad_norm": 0.240234375, "learning_rate": 1.2069656018075018e-05, "loss": 2.7783, "num_input_tokens_seen": 7452753920, "step": 14215 }, { "epoch": 0.68961754596541, "grad_norm": 0.2470703125, "learning_rate": 1.2052502896701195e-05, "loss": 2.7845, "num_input_tokens_seen": 7455375360, "step": 14220 }, { "epoch": 0.6898600275216566, "grad_norm": 0.2470703125, "learning_rate": 1.2035358100649019e-05, "loss": 2.7704, "num_input_tokens_seen": 7457996800, "step": 14225 }, { "epoch": 0.6901025090779033, "grad_norm": 0.236328125, "learning_rate": 1.2018221640942681e-05, "loss": 2.7633, "num_input_tokens_seen": 7460618240, "step": 14230 }, { "epoch": 0.6903449906341499, "grad_norm": 0.24609375, "learning_rate": 1.2001093528601043e-05, "loss": 2.7743, "num_input_tokens_seen": 7463239680, "step": 14235 }, { "epoch": 0.6905874721903965, "grad_norm": 0.2412109375, "learning_rate": 1.1983973774637585e-05, "loss": 2.7795, "num_input_tokens_seen": 7465861120, "step": 14240 }, { "epoch": 0.6908299537466431, "grad_norm": 0.244140625, "learning_rate": 1.196686239006042e-05, "loss": 2.7712, "num_input_tokens_seen": 7468482560, "step": 14245 }, { "epoch": 0.6910724353028898, "grad_norm": 0.2421875, "learning_rate": 1.1949759385872273e-05, "loss": 2.7846, "num_input_tokens_seen": 7471104000, "step": 14250 }, { "epoch": 0.6913149168591364, "grad_norm": 0.24609375, "learning_rate": 1.1932664773070481e-05, "loss": 2.7697, "num_input_tokens_seen": 7473725440, "step": 14255 }, { "epoch": 0.691557398415383, "grad_norm": 0.248046875, "learning_rate": 1.1915578562646992e-05, "loss": 2.7537, "num_input_tokens_seen": 7476346880, "step": 14260 }, { "epoch": 0.6917998799716296, "grad_norm": 0.2431640625, "learning_rate": 1.1898500765588342e-05, "loss": 2.7691, "num_input_tokens_seen": 7478968320, "step": 14265 }, { "epoch": 0.6920423615278762, "grad_norm": 0.2431640625, "learning_rate": 1.188143139287566e-05, "loss": 2.762, "num_input_tokens_seen": 7481589760, "step": 14270 }, { "epoch": 0.6922848430841229, "grad_norm": 0.244140625, "learning_rate": 1.1864370455484663e-05, "loss": 2.7646, "num_input_tokens_seen": 7484211200, "step": 14275 }, { "epoch": 0.6925273246403696, "grad_norm": 0.244140625, "learning_rate": 1.1847317964385643e-05, "loss": 2.777, "num_input_tokens_seen": 7486832640, "step": 14280 }, { "epoch": 0.6927698061966162, "grad_norm": 0.2421875, "learning_rate": 1.1830273930543462e-05, "loss": 2.7653, "num_input_tokens_seen": 7489454080, "step": 14285 }, { "epoch": 0.6930122877528628, "grad_norm": 0.2421875, "learning_rate": 1.1813238364917523e-05, "loss": 2.7679, "num_input_tokens_seen": 7492075520, "step": 14290 }, { "epoch": 0.6932547693091095, "grad_norm": 0.2451171875, "learning_rate": 1.1796211278461811e-05, "loss": 2.7715, "num_input_tokens_seen": 7494696960, "step": 14295 }, { "epoch": 0.6934972508653561, "grad_norm": 0.2421875, "learning_rate": 1.177919268212485e-05, "loss": 2.7786, "num_input_tokens_seen": 7497318400, "step": 14300 }, { "epoch": 0.6937397324216027, "grad_norm": 0.240234375, "learning_rate": 1.1762182586849708e-05, "loss": 2.7742, "num_input_tokens_seen": 7499939840, "step": 14305 }, { "epoch": 0.6939822139778493, "grad_norm": 0.248046875, "learning_rate": 1.1745181003573971e-05, "loss": 2.7611, "num_input_tokens_seen": 7502561280, "step": 14310 }, { "epoch": 0.6942246955340959, "grad_norm": 0.25, "learning_rate": 1.1728187943229776e-05, "loss": 2.7864, "num_input_tokens_seen": 7505182720, "step": 14315 }, { "epoch": 0.6944671770903426, "grad_norm": 0.25390625, "learning_rate": 1.171120341674376e-05, "loss": 2.7645, "num_input_tokens_seen": 7507804160, "step": 14320 }, { "epoch": 0.6947096586465892, "grad_norm": 0.23828125, "learning_rate": 1.169422743503708e-05, "loss": 2.7846, "num_input_tokens_seen": 7510425600, "step": 14325 }, { "epoch": 0.6949521402028358, "grad_norm": 0.23828125, "learning_rate": 1.1677260009025403e-05, "loss": 2.7721, "num_input_tokens_seen": 7513047040, "step": 14330 }, { "epoch": 0.6951946217590824, "grad_norm": 0.2470703125, "learning_rate": 1.1660301149618885e-05, "loss": 2.7866, "num_input_tokens_seen": 7515668480, "step": 14335 }, { "epoch": 0.695437103315329, "grad_norm": 0.2392578125, "learning_rate": 1.1643350867722184e-05, "loss": 2.775, "num_input_tokens_seen": 7518289920, "step": 14340 }, { "epoch": 0.6956795848715757, "grad_norm": 0.2431640625, "learning_rate": 1.1626409174234432e-05, "loss": 2.7889, "num_input_tokens_seen": 7520911360, "step": 14345 }, { "epoch": 0.6959220664278223, "grad_norm": 0.2451171875, "learning_rate": 1.1609476080049252e-05, "loss": 2.7905, "num_input_tokens_seen": 7523532800, "step": 14350 }, { "epoch": 0.696164547984069, "grad_norm": 0.2421875, "learning_rate": 1.1592551596054717e-05, "loss": 2.7599, "num_input_tokens_seen": 7526154240, "step": 14355 }, { "epoch": 0.6964070295403156, "grad_norm": 0.2431640625, "learning_rate": 1.1575635733133383e-05, "loss": 2.7664, "num_input_tokens_seen": 7528775680, "step": 14360 }, { "epoch": 0.6966495110965623, "grad_norm": 0.25, "learning_rate": 1.1558728502162256e-05, "loss": 2.783, "num_input_tokens_seen": 7531397120, "step": 14365 }, { "epoch": 0.6968919926528089, "grad_norm": 0.2392578125, "learning_rate": 1.1541829914012789e-05, "loss": 2.7766, "num_input_tokens_seen": 7534018560, "step": 14370 }, { "epoch": 0.6971344742090555, "grad_norm": 0.244140625, "learning_rate": 1.1524939979550873e-05, "loss": 2.7805, "num_input_tokens_seen": 7536640000, "step": 14375 }, { "epoch": 0.6973769557653021, "grad_norm": 0.24609375, "learning_rate": 1.1508058709636869e-05, "loss": 2.7912, "num_input_tokens_seen": 7539261440, "step": 14380 }, { "epoch": 0.6976194373215487, "grad_norm": 0.2431640625, "learning_rate": 1.149118611512551e-05, "loss": 2.7833, "num_input_tokens_seen": 7541882880, "step": 14385 }, { "epoch": 0.6978619188777954, "grad_norm": 0.2431640625, "learning_rate": 1.147432220686599e-05, "loss": 2.7638, "num_input_tokens_seen": 7544504320, "step": 14390 }, { "epoch": 0.698104400434042, "grad_norm": 0.23828125, "learning_rate": 1.1457466995701907e-05, "loss": 2.7693, "num_input_tokens_seen": 7547125760, "step": 14395 }, { "epoch": 0.6983468819902886, "grad_norm": 0.2470703125, "learning_rate": 1.144062049247127e-05, "loss": 2.7801, "num_input_tokens_seen": 7549747200, "step": 14400 }, { "epoch": 0.6983468819902886, "eval_accuracy": 0.4559990229604299, "eval_loss": 2.7416889667510986, "eval_runtime": 5.8935, "eval_samples_per_second": 50.903, "eval_steps_per_second": 6.448, "num_input_tokens_seen": 7549747200, "step": 14400 }, { "epoch": 0.6985893635465352, "grad_norm": 0.2431640625, "learning_rate": 1.1423782708006478e-05, "loss": 2.7737, "num_input_tokens_seen": 7552368640, "step": 14405 }, { "epoch": 0.6988318451027818, "grad_norm": 0.2470703125, "learning_rate": 1.140695365313435e-05, "loss": 2.7775, "num_input_tokens_seen": 7554990080, "step": 14410 }, { "epoch": 0.6990743266590285, "grad_norm": 0.2412109375, "learning_rate": 1.1390133338676054e-05, "loss": 2.7722, "num_input_tokens_seen": 7557611520, "step": 14415 }, { "epoch": 0.6993168082152751, "grad_norm": 0.2412109375, "learning_rate": 1.137332177544716e-05, "loss": 2.7895, "num_input_tokens_seen": 7560232960, "step": 14420 }, { "epoch": 0.6995592897715217, "grad_norm": 0.24609375, "learning_rate": 1.1356518974257607e-05, "loss": 2.7665, "num_input_tokens_seen": 7562854400, "step": 14425 }, { "epoch": 0.6998017713277683, "grad_norm": 0.240234375, "learning_rate": 1.1339724945911714e-05, "loss": 2.7744, "num_input_tokens_seen": 7565475840, "step": 14430 }, { "epoch": 0.7000442528840151, "grad_norm": 0.2392578125, "learning_rate": 1.1322939701208141e-05, "loss": 2.7759, "num_input_tokens_seen": 7568097280, "step": 14435 }, { "epoch": 0.7002867344402617, "grad_norm": 0.2431640625, "learning_rate": 1.1306163250939913e-05, "loss": 2.771, "num_input_tokens_seen": 7570718720, "step": 14440 }, { "epoch": 0.7005292159965083, "grad_norm": 0.2431640625, "learning_rate": 1.1289395605894374e-05, "loss": 2.7742, "num_input_tokens_seen": 7573340160, "step": 14445 }, { "epoch": 0.7007716975527549, "grad_norm": 0.2490234375, "learning_rate": 1.1272636776853231e-05, "loss": 2.7796, "num_input_tokens_seen": 7575961600, "step": 14450 }, { "epoch": 0.7010141791090015, "grad_norm": 0.236328125, "learning_rate": 1.125588677459252e-05, "loss": 2.7699, "num_input_tokens_seen": 7578583040, "step": 14455 }, { "epoch": 0.7012566606652482, "grad_norm": 0.244140625, "learning_rate": 1.1239145609882596e-05, "loss": 2.7712, "num_input_tokens_seen": 7581204480, "step": 14460 }, { "epoch": 0.7014991422214948, "grad_norm": 0.2431640625, "learning_rate": 1.1222413293488134e-05, "loss": 2.7802, "num_input_tokens_seen": 7583825920, "step": 14465 }, { "epoch": 0.7017416237777414, "grad_norm": 0.2578125, "learning_rate": 1.1205689836168123e-05, "loss": 2.7592, "num_input_tokens_seen": 7586447360, "step": 14470 }, { "epoch": 0.701984105333988, "grad_norm": 0.25, "learning_rate": 1.1188975248675837e-05, "loss": 2.7796, "num_input_tokens_seen": 7589068800, "step": 14475 }, { "epoch": 0.7022265868902346, "grad_norm": 0.240234375, "learning_rate": 1.117226954175886e-05, "loss": 2.7748, "num_input_tokens_seen": 7591690240, "step": 14480 }, { "epoch": 0.7024690684464813, "grad_norm": 0.2373046875, "learning_rate": 1.115557272615908e-05, "loss": 2.7782, "num_input_tokens_seen": 7594311680, "step": 14485 }, { "epoch": 0.7027115500027279, "grad_norm": 0.244140625, "learning_rate": 1.113888481261265e-05, "loss": 2.776, "num_input_tokens_seen": 7596933120, "step": 14490 }, { "epoch": 0.7029540315589745, "grad_norm": 0.234375, "learning_rate": 1.1122205811850001e-05, "loss": 2.7818, "num_input_tokens_seen": 7599554560, "step": 14495 }, { "epoch": 0.7031965131152211, "grad_norm": 0.23828125, "learning_rate": 1.1105535734595832e-05, "loss": 2.7658, "num_input_tokens_seen": 7602176000, "step": 14500 }, { "epoch": 0.7034389946714678, "grad_norm": 0.251953125, "learning_rate": 1.1088874591569119e-05, "loss": 2.7687, "num_input_tokens_seen": 7604797440, "step": 14505 }, { "epoch": 0.7036814762277144, "grad_norm": 0.26171875, "learning_rate": 1.1072222393483061e-05, "loss": 2.7809, "num_input_tokens_seen": 7607418880, "step": 14510 }, { "epoch": 0.7039239577839611, "grad_norm": 0.2431640625, "learning_rate": 1.1055579151045137e-05, "loss": 2.7769, "num_input_tokens_seen": 7610040320, "step": 14515 }, { "epoch": 0.7041664393402077, "grad_norm": 0.248046875, "learning_rate": 1.1038944874957058e-05, "loss": 2.7791, "num_input_tokens_seen": 7612661760, "step": 14520 }, { "epoch": 0.7044089208964543, "grad_norm": 0.2451171875, "learning_rate": 1.102231957591476e-05, "loss": 2.7721, "num_input_tokens_seen": 7615283200, "step": 14525 }, { "epoch": 0.704651402452701, "grad_norm": 0.2431640625, "learning_rate": 1.1005703264608422e-05, "loss": 2.7704, "num_input_tokens_seen": 7617904640, "step": 14530 }, { "epoch": 0.7048938840089476, "grad_norm": 0.2353515625, "learning_rate": 1.098909595172243e-05, "loss": 2.7726, "num_input_tokens_seen": 7620526080, "step": 14535 }, { "epoch": 0.7051363655651942, "grad_norm": 0.2412109375, "learning_rate": 1.0972497647935395e-05, "loss": 2.7839, "num_input_tokens_seen": 7623147520, "step": 14540 }, { "epoch": 0.7053788471214408, "grad_norm": 0.2451171875, "learning_rate": 1.0955908363920128e-05, "loss": 2.7762, "num_input_tokens_seen": 7625768960, "step": 14545 }, { "epoch": 0.7056213286776875, "grad_norm": 0.25, "learning_rate": 1.0939328110343645e-05, "loss": 2.7703, "num_input_tokens_seen": 7628390400, "step": 14550 }, { "epoch": 0.7058638102339341, "grad_norm": 0.240234375, "learning_rate": 1.0922756897867148e-05, "loss": 2.7857, "num_input_tokens_seen": 7631011840, "step": 14555 }, { "epoch": 0.7061062917901807, "grad_norm": 0.2431640625, "learning_rate": 1.090619473714603e-05, "loss": 2.772, "num_input_tokens_seen": 7633633280, "step": 14560 }, { "epoch": 0.7063487733464273, "grad_norm": 0.2412109375, "learning_rate": 1.0889641638829881e-05, "loss": 2.7785, "num_input_tokens_seen": 7636254720, "step": 14565 }, { "epoch": 0.7065912549026739, "grad_norm": 0.2490234375, "learning_rate": 1.0873097613562421e-05, "loss": 2.7744, "num_input_tokens_seen": 7638876160, "step": 14570 }, { "epoch": 0.7068337364589206, "grad_norm": 0.24609375, "learning_rate": 1.0856562671981574e-05, "loss": 2.7844, "num_input_tokens_seen": 7641497600, "step": 14575 }, { "epoch": 0.7070762180151672, "grad_norm": 0.240234375, "learning_rate": 1.0840036824719407e-05, "loss": 2.7699, "num_input_tokens_seen": 7644119040, "step": 14580 }, { "epoch": 0.7073186995714138, "grad_norm": 0.2431640625, "learning_rate": 1.082352008240215e-05, "loss": 2.7642, "num_input_tokens_seen": 7646740480, "step": 14585 }, { "epoch": 0.7075611811276605, "grad_norm": 0.25, "learning_rate": 1.0807012455650164e-05, "loss": 2.779, "num_input_tokens_seen": 7649361920, "step": 14590 }, { "epoch": 0.7078036626839072, "grad_norm": 0.2431640625, "learning_rate": 1.0790513955077963e-05, "loss": 2.7698, "num_input_tokens_seen": 7651983360, "step": 14595 }, { "epoch": 0.7080461442401538, "grad_norm": 0.244140625, "learning_rate": 1.0774024591294184e-05, "loss": 2.7717, "num_input_tokens_seen": 7654604800, "step": 14600 }, { "epoch": 0.7082886257964004, "grad_norm": 0.251953125, "learning_rate": 1.075754437490159e-05, "loss": 2.7837, "num_input_tokens_seen": 7657226240, "step": 14605 }, { "epoch": 0.708531107352647, "grad_norm": 0.2421875, "learning_rate": 1.074107331649706e-05, "loss": 2.7815, "num_input_tokens_seen": 7659847680, "step": 14610 }, { "epoch": 0.7087735889088936, "grad_norm": 0.240234375, "learning_rate": 1.0724611426671596e-05, "loss": 2.7753, "num_input_tokens_seen": 7662469120, "step": 14615 }, { "epoch": 0.7090160704651403, "grad_norm": 0.2470703125, "learning_rate": 1.0708158716010288e-05, "loss": 2.7691, "num_input_tokens_seen": 7665090560, "step": 14620 }, { "epoch": 0.7092585520213869, "grad_norm": 0.24609375, "learning_rate": 1.0691715195092348e-05, "loss": 2.7713, "num_input_tokens_seen": 7667712000, "step": 14625 }, { "epoch": 0.7095010335776335, "grad_norm": 0.2412109375, "learning_rate": 1.0675280874491036e-05, "loss": 2.7836, "num_input_tokens_seen": 7670333440, "step": 14630 }, { "epoch": 0.7097435151338801, "grad_norm": 0.2412109375, "learning_rate": 1.065885576477374e-05, "loss": 2.7788, "num_input_tokens_seen": 7672954880, "step": 14635 }, { "epoch": 0.7099859966901267, "grad_norm": 0.2431640625, "learning_rate": 1.0642439876501903e-05, "loss": 2.7919, "num_input_tokens_seen": 7675576320, "step": 14640 }, { "epoch": 0.7102284782463734, "grad_norm": 0.2412109375, "learning_rate": 1.062603322023105e-05, "loss": 2.7742, "num_input_tokens_seen": 7678197760, "step": 14645 }, { "epoch": 0.71047095980262, "grad_norm": 0.248046875, "learning_rate": 1.0609635806510757e-05, "loss": 2.7755, "num_input_tokens_seen": 7680819200, "step": 14650 }, { "epoch": 0.7107134413588666, "grad_norm": 0.2431640625, "learning_rate": 1.0593247645884666e-05, "loss": 2.7858, "num_input_tokens_seen": 7683440640, "step": 14655 }, { "epoch": 0.7109559229151132, "grad_norm": 0.2451171875, "learning_rate": 1.0576868748890468e-05, "loss": 2.7576, "num_input_tokens_seen": 7686062080, "step": 14660 }, { "epoch": 0.7111984044713598, "grad_norm": 0.23828125, "learning_rate": 1.0560499126059894e-05, "loss": 2.7726, "num_input_tokens_seen": 7688683520, "step": 14665 }, { "epoch": 0.7114408860276066, "grad_norm": 0.2412109375, "learning_rate": 1.0544138787918716e-05, "loss": 2.781, "num_input_tokens_seen": 7691304960, "step": 14670 }, { "epoch": 0.7116833675838532, "grad_norm": 0.2333984375, "learning_rate": 1.0527787744986733e-05, "loss": 2.7759, "num_input_tokens_seen": 7693926400, "step": 14675 }, { "epoch": 0.7119258491400998, "grad_norm": 0.248046875, "learning_rate": 1.051144600777777e-05, "loss": 2.7842, "num_input_tokens_seen": 7696547840, "step": 14680 }, { "epoch": 0.7121683306963464, "grad_norm": 0.2421875, "learning_rate": 1.0495113586799663e-05, "loss": 2.7737, "num_input_tokens_seen": 7699169280, "step": 14685 }, { "epoch": 0.7124108122525931, "grad_norm": 0.2373046875, "learning_rate": 1.047879049255427e-05, "loss": 2.7812, "num_input_tokens_seen": 7701790720, "step": 14690 }, { "epoch": 0.7126532938088397, "grad_norm": 0.244140625, "learning_rate": 1.0462476735537427e-05, "loss": 2.7582, "num_input_tokens_seen": 7704412160, "step": 14695 }, { "epoch": 0.7128957753650863, "grad_norm": 0.240234375, "learning_rate": 1.0446172326238987e-05, "loss": 2.7658, "num_input_tokens_seen": 7707033600, "step": 14700 }, { "epoch": 0.7128957753650863, "eval_accuracy": 0.45605764533463605, "eval_loss": 2.7416930198669434, "eval_runtime": 5.8258, "eval_samples_per_second": 51.495, "eval_steps_per_second": 6.523, "num_input_tokens_seen": 7707033600, "step": 14700 }, { "epoch": 0.7131382569213329, "grad_norm": 0.2431640625, "learning_rate": 1.0429877275142793e-05, "loss": 2.7716, "num_input_tokens_seen": 7709655040, "step": 14705 }, { "epoch": 0.7133807384775795, "grad_norm": 0.25, "learning_rate": 1.041359159272666e-05, "loss": 2.7776, "num_input_tokens_seen": 7712276480, "step": 14710 }, { "epoch": 0.7136232200338262, "grad_norm": 0.2451171875, "learning_rate": 1.0397315289462379e-05, "loss": 2.7718, "num_input_tokens_seen": 7714897920, "step": 14715 }, { "epoch": 0.7138657015900728, "grad_norm": 0.2490234375, "learning_rate": 1.0381048375815736e-05, "loss": 2.7821, "num_input_tokens_seen": 7717519360, "step": 14720 }, { "epoch": 0.7141081831463194, "grad_norm": 0.240234375, "learning_rate": 1.0364790862246435e-05, "loss": 2.7742, "num_input_tokens_seen": 7720140800, "step": 14725 }, { "epoch": 0.714350664702566, "grad_norm": 0.2451171875, "learning_rate": 1.0348542759208166e-05, "loss": 2.7596, "num_input_tokens_seen": 7722762240, "step": 14730 }, { "epoch": 0.7145931462588127, "grad_norm": 0.2421875, "learning_rate": 1.0332304077148564e-05, "loss": 2.7709, "num_input_tokens_seen": 7725383680, "step": 14735 }, { "epoch": 0.7148356278150593, "grad_norm": 0.2421875, "learning_rate": 1.03160748265092e-05, "loss": 2.7716, "num_input_tokens_seen": 7728005120, "step": 14740 }, { "epoch": 0.7150781093713059, "grad_norm": 0.2431640625, "learning_rate": 1.0299855017725585e-05, "loss": 2.7775, "num_input_tokens_seen": 7730626560, "step": 14745 }, { "epoch": 0.7153205909275526, "grad_norm": 0.244140625, "learning_rate": 1.0283644661227168e-05, "loss": 2.7817, "num_input_tokens_seen": 7733248000, "step": 14750 }, { "epoch": 0.7155630724837992, "grad_norm": 0.244140625, "learning_rate": 1.026744376743729e-05, "loss": 2.7767, "num_input_tokens_seen": 7735869440, "step": 14755 }, { "epoch": 0.7158055540400459, "grad_norm": 0.2412109375, "learning_rate": 1.0251252346773235e-05, "loss": 2.7886, "num_input_tokens_seen": 7738490880, "step": 14760 }, { "epoch": 0.7160480355962925, "grad_norm": 0.2373046875, "learning_rate": 1.023507040964618e-05, "loss": 2.7864, "num_input_tokens_seen": 7741112320, "step": 14765 }, { "epoch": 0.7162905171525391, "grad_norm": 0.2490234375, "learning_rate": 1.021889796646123e-05, "loss": 2.7857, "num_input_tokens_seen": 7743733760, "step": 14770 }, { "epoch": 0.7165329987087857, "grad_norm": 0.2490234375, "learning_rate": 1.020273502761736e-05, "loss": 2.7647, "num_input_tokens_seen": 7746355200, "step": 14775 }, { "epoch": 0.7167754802650323, "grad_norm": 0.2451171875, "learning_rate": 1.0186581603507444e-05, "loss": 2.7694, "num_input_tokens_seen": 7748976640, "step": 14780 }, { "epoch": 0.717017961821279, "grad_norm": 0.236328125, "learning_rate": 1.0170437704518224e-05, "loss": 2.7795, "num_input_tokens_seen": 7751598080, "step": 14785 }, { "epoch": 0.7172604433775256, "grad_norm": 0.2451171875, "learning_rate": 1.0154303341030334e-05, "loss": 2.779, "num_input_tokens_seen": 7754219520, "step": 14790 }, { "epoch": 0.7175029249337722, "grad_norm": 0.2431640625, "learning_rate": 1.0138178523418266e-05, "loss": 2.775, "num_input_tokens_seen": 7756840960, "step": 14795 }, { "epoch": 0.7177454064900188, "grad_norm": 0.2451171875, "learning_rate": 1.0122063262050386e-05, "loss": 2.7736, "num_input_tokens_seen": 7759462400, "step": 14800 }, { "epoch": 0.7179878880462655, "grad_norm": 0.2470703125, "learning_rate": 1.0105957567288904e-05, "loss": 2.7799, "num_input_tokens_seen": 7762083840, "step": 14805 }, { "epoch": 0.7182303696025121, "grad_norm": 0.24609375, "learning_rate": 1.0089861449489881e-05, "loss": 2.7856, "num_input_tokens_seen": 7764705280, "step": 14810 }, { "epoch": 0.7184728511587587, "grad_norm": 0.244140625, "learning_rate": 1.0073774919003235e-05, "loss": 2.776, "num_input_tokens_seen": 7767326720, "step": 14815 }, { "epoch": 0.7187153327150053, "grad_norm": 0.24609375, "learning_rate": 1.0057697986172677e-05, "loss": 2.7797, "num_input_tokens_seen": 7769948160, "step": 14820 }, { "epoch": 0.7189578142712519, "grad_norm": 0.2431640625, "learning_rate": 1.00416306613358e-05, "loss": 2.7675, "num_input_tokens_seen": 7772569600, "step": 14825 }, { "epoch": 0.7192002958274987, "grad_norm": 0.244140625, "learning_rate": 1.002557295482399e-05, "loss": 2.7745, "num_input_tokens_seen": 7775191040, "step": 14830 }, { "epoch": 0.7194427773837453, "grad_norm": 0.2412109375, "learning_rate": 1.0009524876962445e-05, "loss": 2.7749, "num_input_tokens_seen": 7777812480, "step": 14835 }, { "epoch": 0.7196852589399919, "grad_norm": 0.2490234375, "learning_rate": 9.993486438070187e-06, "loss": 2.7682, "num_input_tokens_seen": 7780433920, "step": 14840 }, { "epoch": 0.7199277404962385, "grad_norm": 0.240234375, "learning_rate": 9.97745764846004e-06, "loss": 2.783, "num_input_tokens_seen": 7783055360, "step": 14845 }, { "epoch": 0.7201702220524852, "grad_norm": 0.2490234375, "learning_rate": 9.961438518438596e-06, "loss": 2.765, "num_input_tokens_seen": 7785676800, "step": 14850 }, { "epoch": 0.7204127036087318, "grad_norm": 0.2392578125, "learning_rate": 9.945429058306262e-06, "loss": 2.7655, "num_input_tokens_seen": 7788298240, "step": 14855 }, { "epoch": 0.7206551851649784, "grad_norm": 0.2412109375, "learning_rate": 9.92942927835723e-06, "loss": 2.7745, "num_input_tokens_seen": 7790919680, "step": 14860 }, { "epoch": 0.720897666721225, "grad_norm": 0.2373046875, "learning_rate": 9.91343918887945e-06, "loss": 2.7714, "num_input_tokens_seen": 7793541120, "step": 14865 }, { "epoch": 0.7211401482774716, "grad_norm": 0.25390625, "learning_rate": 9.897458800154654e-06, "loss": 2.7894, "num_input_tokens_seen": 7796162560, "step": 14870 }, { "epoch": 0.7213826298337183, "grad_norm": 0.23828125, "learning_rate": 9.881488122458332e-06, "loss": 2.7655, "num_input_tokens_seen": 7798784000, "step": 14875 }, { "epoch": 0.7216251113899649, "grad_norm": 0.2412109375, "learning_rate": 9.865527166059726e-06, "loss": 2.7617, "num_input_tokens_seen": 7801405440, "step": 14880 }, { "epoch": 0.7218675929462115, "grad_norm": 0.2431640625, "learning_rate": 9.849575941221837e-06, "loss": 2.7722, "num_input_tokens_seen": 7804026880, "step": 14885 }, { "epoch": 0.7221100745024581, "grad_norm": 0.2373046875, "learning_rate": 9.833634458201397e-06, "loss": 2.7608, "num_input_tokens_seen": 7806648320, "step": 14890 }, { "epoch": 0.7223525560587047, "grad_norm": 0.2421875, "learning_rate": 9.817702727248878e-06, "loss": 2.7713, "num_input_tokens_seen": 7809269760, "step": 14895 }, { "epoch": 0.7225950376149514, "grad_norm": 0.244140625, "learning_rate": 9.80178075860849e-06, "loss": 2.7885, "num_input_tokens_seen": 7811891200, "step": 14900 }, { "epoch": 0.7228375191711981, "grad_norm": 0.2421875, "learning_rate": 9.785868562518161e-06, "loss": 2.771, "num_input_tokens_seen": 7814512640, "step": 14905 }, { "epoch": 0.7230800007274447, "grad_norm": 0.2421875, "learning_rate": 9.769966149209517e-06, "loss": 2.7874, "num_input_tokens_seen": 7817134080, "step": 14910 }, { "epoch": 0.7233224822836913, "grad_norm": 0.2392578125, "learning_rate": 9.754073528907918e-06, "loss": 2.771, "num_input_tokens_seen": 7819755520, "step": 14915 }, { "epoch": 0.723564963839938, "grad_norm": 0.251953125, "learning_rate": 9.738190711832415e-06, "loss": 2.7903, "num_input_tokens_seen": 7822376960, "step": 14920 }, { "epoch": 0.7238074453961846, "grad_norm": 0.24609375, "learning_rate": 9.722317708195767e-06, "loss": 2.7652, "num_input_tokens_seen": 7824998400, "step": 14925 }, { "epoch": 0.7240499269524312, "grad_norm": 0.2431640625, "learning_rate": 9.706454528204406e-06, "loss": 2.7664, "num_input_tokens_seen": 7827619840, "step": 14930 }, { "epoch": 0.7242924085086778, "grad_norm": 0.240234375, "learning_rate": 9.690601182058462e-06, "loss": 2.7806, "num_input_tokens_seen": 7830241280, "step": 14935 }, { "epoch": 0.7245348900649244, "grad_norm": 0.2490234375, "learning_rate": 9.674757679951733e-06, "loss": 2.78, "num_input_tokens_seen": 7832862720, "step": 14940 }, { "epoch": 0.7247773716211711, "grad_norm": 0.2431640625, "learning_rate": 9.65892403207169e-06, "loss": 2.7743, "num_input_tokens_seen": 7835484160, "step": 14945 }, { "epoch": 0.7250198531774177, "grad_norm": 0.236328125, "learning_rate": 9.643100248599465e-06, "loss": 2.773, "num_input_tokens_seen": 7838105600, "step": 14950 }, { "epoch": 0.7252623347336643, "grad_norm": 0.2421875, "learning_rate": 9.627286339709857e-06, "loss": 2.792, "num_input_tokens_seen": 7840727040, "step": 14955 }, { "epoch": 0.7255048162899109, "grad_norm": 0.2392578125, "learning_rate": 9.611482315571301e-06, "loss": 2.7856, "num_input_tokens_seen": 7843348480, "step": 14960 }, { "epoch": 0.7257472978461575, "grad_norm": 0.248046875, "learning_rate": 9.595688186345889e-06, "loss": 2.787, "num_input_tokens_seen": 7845969920, "step": 14965 }, { "epoch": 0.7259897794024042, "grad_norm": 0.2373046875, "learning_rate": 9.57990396218935e-06, "loss": 2.7743, "num_input_tokens_seen": 7848591360, "step": 14970 }, { "epoch": 0.7262322609586508, "grad_norm": 0.24609375, "learning_rate": 9.564129653251023e-06, "loss": 2.7873, "num_input_tokens_seen": 7851212800, "step": 14975 }, { "epoch": 0.7264747425148974, "grad_norm": 0.25390625, "learning_rate": 9.548365269673892e-06, "loss": 2.7697, "num_input_tokens_seen": 7853834240, "step": 14980 }, { "epoch": 0.7267172240711441, "grad_norm": 0.2490234375, "learning_rate": 9.532610821594562e-06, "loss": 2.7779, "num_input_tokens_seen": 7856455680, "step": 14985 }, { "epoch": 0.7269597056273908, "grad_norm": 0.2490234375, "learning_rate": 9.516866319143236e-06, "loss": 2.779, "num_input_tokens_seen": 7859077120, "step": 14990 }, { "epoch": 0.7272021871836374, "grad_norm": 0.2412109375, "learning_rate": 9.50113177244373e-06, "loss": 2.7757, "num_input_tokens_seen": 7861698560, "step": 14995 }, { "epoch": 0.727444668739884, "grad_norm": 0.24609375, "learning_rate": 9.485407191613455e-06, "loss": 2.7717, "num_input_tokens_seen": 7864320000, "step": 15000 }, { "epoch": 0.727444668739884, "eval_accuracy": 0.45601693535254845, "eval_loss": 2.7416605949401855, "eval_runtime": 5.8717, "eval_samples_per_second": 51.092, "eval_steps_per_second": 6.472, "num_input_tokens_seen": 7864320000, "step": 15000 }, { "epoch": 0.7276871502961306, "grad_norm": 0.2421875, "learning_rate": 9.469692586763412e-06, "loss": 2.7943, "num_input_tokens_seen": 7866941440, "step": 15005 }, { "epoch": 0.7279296318523772, "grad_norm": 0.2421875, "learning_rate": 9.453987967998196e-06, "loss": 2.7836, "num_input_tokens_seen": 7869562880, "step": 15010 }, { "epoch": 0.7281721134086239, "grad_norm": 0.240234375, "learning_rate": 9.438293345415972e-06, "loss": 2.7705, "num_input_tokens_seen": 7872184320, "step": 15015 }, { "epoch": 0.7284145949648705, "grad_norm": 0.2470703125, "learning_rate": 9.422608729108482e-06, "loss": 2.7605, "num_input_tokens_seen": 7874805760, "step": 15020 }, { "epoch": 0.7286570765211171, "grad_norm": 0.2421875, "learning_rate": 9.406934129161026e-06, "loss": 2.7745, "num_input_tokens_seen": 7877427200, "step": 15025 }, { "epoch": 0.7288995580773637, "grad_norm": 0.240234375, "learning_rate": 9.39126955565249e-06, "loss": 2.7718, "num_input_tokens_seen": 7880048640, "step": 15030 }, { "epoch": 0.7291420396336104, "grad_norm": 0.2392578125, "learning_rate": 9.375615018655265e-06, "loss": 2.7636, "num_input_tokens_seen": 7882670080, "step": 15035 }, { "epoch": 0.729384521189857, "grad_norm": 0.2373046875, "learning_rate": 9.35997052823533e-06, "loss": 2.7707, "num_input_tokens_seen": 7885291520, "step": 15040 }, { "epoch": 0.7296270027461036, "grad_norm": 0.2470703125, "learning_rate": 9.344336094452188e-06, "loss": 2.7737, "num_input_tokens_seen": 7887912960, "step": 15045 }, { "epoch": 0.7298694843023502, "grad_norm": 0.24609375, "learning_rate": 9.328711727358872e-06, "loss": 2.7785, "num_input_tokens_seen": 7890534400, "step": 15050 }, { "epoch": 0.7301119658585968, "grad_norm": 0.244140625, "learning_rate": 9.313097437001964e-06, "loss": 2.7749, "num_input_tokens_seen": 7893155840, "step": 15055 }, { "epoch": 0.7303544474148435, "grad_norm": 0.2451171875, "learning_rate": 9.297493233421548e-06, "loss": 2.7579, "num_input_tokens_seen": 7895777280, "step": 15060 }, { "epoch": 0.7305969289710902, "grad_norm": 0.2412109375, "learning_rate": 9.281899126651208e-06, "loss": 2.7679, "num_input_tokens_seen": 7898398720, "step": 15065 }, { "epoch": 0.7308394105273368, "grad_norm": 0.2373046875, "learning_rate": 9.266315126718064e-06, "loss": 2.7735, "num_input_tokens_seen": 7901020160, "step": 15070 }, { "epoch": 0.7310818920835834, "grad_norm": 0.2392578125, "learning_rate": 9.250741243642724e-06, "loss": 2.7689, "num_input_tokens_seen": 7903641600, "step": 15075 }, { "epoch": 0.73132437363983, "grad_norm": 0.2451171875, "learning_rate": 9.235177487439289e-06, "loss": 2.7721, "num_input_tokens_seen": 7906263040, "step": 15080 }, { "epoch": 0.7315668551960767, "grad_norm": 0.251953125, "learning_rate": 9.219623868115354e-06, "loss": 2.768, "num_input_tokens_seen": 7908884480, "step": 15085 }, { "epoch": 0.7318093367523233, "grad_norm": 0.240234375, "learning_rate": 9.204080395672004e-06, "loss": 2.7745, "num_input_tokens_seen": 7911505920, "step": 15090 }, { "epoch": 0.7320518183085699, "grad_norm": 0.2421875, "learning_rate": 9.188547080103767e-06, "loss": 2.7722, "num_input_tokens_seen": 7914127360, "step": 15095 }, { "epoch": 0.7322942998648165, "grad_norm": 0.2412109375, "learning_rate": 9.173023931398677e-06, "loss": 2.7603, "num_input_tokens_seen": 7916748800, "step": 15100 }, { "epoch": 0.7325367814210632, "grad_norm": 0.25, "learning_rate": 9.157510959538204e-06, "loss": 2.7757, "num_input_tokens_seen": 7919370240, "step": 15105 }, { "epoch": 0.7327792629773098, "grad_norm": 0.2431640625, "learning_rate": 9.142008174497302e-06, "loss": 2.7881, "num_input_tokens_seen": 7921991680, "step": 15110 }, { "epoch": 0.7330217445335564, "grad_norm": 0.2421875, "learning_rate": 9.126515586244353e-06, "loss": 2.7824, "num_input_tokens_seen": 7924613120, "step": 15115 }, { "epoch": 0.733264226089803, "grad_norm": 0.2412109375, "learning_rate": 9.111033204741182e-06, "loss": 2.7632, "num_input_tokens_seen": 7927234560, "step": 15120 }, { "epoch": 0.7335067076460496, "grad_norm": 0.2470703125, "learning_rate": 9.095561039943073e-06, "loss": 2.7759, "num_input_tokens_seen": 7929856000, "step": 15125 }, { "epoch": 0.7337491892022963, "grad_norm": 0.240234375, "learning_rate": 9.080099101798703e-06, "loss": 2.7831, "num_input_tokens_seen": 7932477440, "step": 15130 }, { "epoch": 0.7339916707585429, "grad_norm": 0.25390625, "learning_rate": 9.06464740025021e-06, "loss": 2.7837, "num_input_tokens_seen": 7935098880, "step": 15135 }, { "epoch": 0.7342341523147895, "grad_norm": 0.2421875, "learning_rate": 9.049205945233124e-06, "loss": 2.7663, "num_input_tokens_seen": 7937720320, "step": 15140 }, { "epoch": 0.7344766338710362, "grad_norm": 0.2431640625, "learning_rate": 9.033774746676404e-06, "loss": 2.7727, "num_input_tokens_seen": 7940341760, "step": 15145 }, { "epoch": 0.7347191154272829, "grad_norm": 0.2392578125, "learning_rate": 9.01835381450241e-06, "loss": 2.778, "num_input_tokens_seen": 7942963200, "step": 15150 }, { "epoch": 0.7349615969835295, "grad_norm": 0.248046875, "learning_rate": 9.0029431586269e-06, "loss": 2.7732, "num_input_tokens_seen": 7945584640, "step": 15155 }, { "epoch": 0.7352040785397761, "grad_norm": 0.25390625, "learning_rate": 8.987542788959e-06, "loss": 2.7764, "num_input_tokens_seen": 7948206080, "step": 15160 }, { "epoch": 0.7354465600960227, "grad_norm": 0.234375, "learning_rate": 8.97215271540127e-06, "loss": 2.7871, "num_input_tokens_seen": 7950827520, "step": 15165 }, { "epoch": 0.7356890416522693, "grad_norm": 0.23828125, "learning_rate": 8.956772947849613e-06, "loss": 2.7671, "num_input_tokens_seen": 7953448960, "step": 15170 }, { "epoch": 0.735931523208516, "grad_norm": 0.240234375, "learning_rate": 8.941403496193315e-06, "loss": 2.7794, "num_input_tokens_seen": 7956070400, "step": 15175 }, { "epoch": 0.7361740047647626, "grad_norm": 0.2421875, "learning_rate": 8.926044370315035e-06, "loss": 2.7523, "num_input_tokens_seen": 7958691840, "step": 15180 }, { "epoch": 0.7364164863210092, "grad_norm": 0.255859375, "learning_rate": 8.910695580090789e-06, "loss": 2.7807, "num_input_tokens_seen": 7961313280, "step": 15185 }, { "epoch": 0.7366589678772558, "grad_norm": 0.24609375, "learning_rate": 8.895357135389931e-06, "loss": 2.783, "num_input_tokens_seen": 7963934720, "step": 15190 }, { "epoch": 0.7369014494335024, "grad_norm": 0.2421875, "learning_rate": 8.880029046075186e-06, "loss": 2.7765, "num_input_tokens_seen": 7966556160, "step": 15195 }, { "epoch": 0.7371439309897491, "grad_norm": 0.23828125, "learning_rate": 8.86471132200261e-06, "loss": 2.7609, "num_input_tokens_seen": 7969177600, "step": 15200 }, { "epoch": 0.7373864125459957, "grad_norm": 0.2431640625, "learning_rate": 8.8494039730216e-06, "loss": 2.7844, "num_input_tokens_seen": 7971799040, "step": 15205 }, { "epoch": 0.7376288941022423, "grad_norm": 0.240234375, "learning_rate": 8.834107008974871e-06, "loss": 2.7716, "num_input_tokens_seen": 7974420480, "step": 15210 }, { "epoch": 0.7378713756584889, "grad_norm": 0.23828125, "learning_rate": 8.818820439698466e-06, "loss": 2.7762, "num_input_tokens_seen": 7977041920, "step": 15215 }, { "epoch": 0.7381138572147357, "grad_norm": 0.24609375, "learning_rate": 8.803544275021749e-06, "loss": 2.7816, "num_input_tokens_seen": 7979663360, "step": 15220 }, { "epoch": 0.7383563387709823, "grad_norm": 0.244140625, "learning_rate": 8.78827852476739e-06, "loss": 2.7946, "num_input_tokens_seen": 7982284800, "step": 15225 }, { "epoch": 0.7385988203272289, "grad_norm": 0.2431640625, "learning_rate": 8.77302319875136e-06, "loss": 2.7671, "num_input_tokens_seen": 7984906240, "step": 15230 }, { "epoch": 0.7388413018834755, "grad_norm": 0.2392578125, "learning_rate": 8.757778306782927e-06, "loss": 2.7829, "num_input_tokens_seen": 7987527680, "step": 15235 }, { "epoch": 0.7390837834397221, "grad_norm": 0.2490234375, "learning_rate": 8.742543858664653e-06, "loss": 2.7741, "num_input_tokens_seen": 7990149120, "step": 15240 }, { "epoch": 0.7393262649959688, "grad_norm": 0.25, "learning_rate": 8.727319864192394e-06, "loss": 2.7885, "num_input_tokens_seen": 7992770560, "step": 15245 }, { "epoch": 0.7395687465522154, "grad_norm": 0.236328125, "learning_rate": 8.712106333155257e-06, "loss": 2.7774, "num_input_tokens_seen": 7995392000, "step": 15250 }, { "epoch": 0.739811228108462, "grad_norm": 0.2421875, "learning_rate": 8.696903275335642e-06, "loss": 2.7666, "num_input_tokens_seen": 7998013440, "step": 15255 }, { "epoch": 0.7400537096647086, "grad_norm": 0.2412109375, "learning_rate": 8.681710700509213e-06, "loss": 2.7855, "num_input_tokens_seen": 8000634880, "step": 15260 }, { "epoch": 0.7402961912209552, "grad_norm": 0.24609375, "learning_rate": 8.666528618444892e-06, "loss": 2.7845, "num_input_tokens_seen": 8003256320, "step": 15265 }, { "epoch": 0.7405386727772019, "grad_norm": 0.2431640625, "learning_rate": 8.65135703890485e-06, "loss": 2.7693, "num_input_tokens_seen": 8005877760, "step": 15270 }, { "epoch": 0.7407811543334485, "grad_norm": 0.24609375, "learning_rate": 8.636195971644511e-06, "loss": 2.7681, "num_input_tokens_seen": 8008499200, "step": 15275 }, { "epoch": 0.7410236358896951, "grad_norm": 0.25, "learning_rate": 8.621045426412533e-06, "loss": 2.7567, "num_input_tokens_seen": 8011120640, "step": 15280 }, { "epoch": 0.7412661174459417, "grad_norm": 0.24609375, "learning_rate": 8.605905412950812e-06, "loss": 2.7772, "num_input_tokens_seen": 8013742080, "step": 15285 }, { "epoch": 0.7415085990021884, "grad_norm": 0.2373046875, "learning_rate": 8.590775940994472e-06, "loss": 2.7667, "num_input_tokens_seen": 8016363520, "step": 15290 }, { "epoch": 0.741751080558435, "grad_norm": 0.2353515625, "learning_rate": 8.575657020271857e-06, "loss": 2.765, "num_input_tokens_seen": 8018984960, "step": 15295 }, { "epoch": 0.7419935621146817, "grad_norm": 0.240234375, "learning_rate": 8.56054866050453e-06, "loss": 2.7717, "num_input_tokens_seen": 8021606400, "step": 15300 }, { "epoch": 0.7419935621146817, "eval_accuracy": 0.456018563751832, "eval_loss": 2.7416841983795166, "eval_runtime": 5.8648, "eval_samples_per_second": 51.152, "eval_steps_per_second": 6.479, "num_input_tokens_seen": 8021606400, "step": 15300 }, { "epoch": 0.7422360436709283, "grad_norm": 0.2392578125, "learning_rate": 8.54545087140726e-06, "loss": 2.7912, "num_input_tokens_seen": 8024227840, "step": 15305 }, { "epoch": 0.742478525227175, "grad_norm": 0.25390625, "learning_rate": 8.530363662688023e-06, "loss": 2.776, "num_input_tokens_seen": 8026849280, "step": 15310 }, { "epoch": 0.7427210067834216, "grad_norm": 0.244140625, "learning_rate": 8.515287044047982e-06, "loss": 2.783, "num_input_tokens_seen": 8029470720, "step": 15315 }, { "epoch": 0.7429634883396682, "grad_norm": 0.2412109375, "learning_rate": 8.500221025181496e-06, "loss": 2.7864, "num_input_tokens_seen": 8032092160, "step": 15320 }, { "epoch": 0.7432059698959148, "grad_norm": 0.244140625, "learning_rate": 8.485165615776114e-06, "loss": 2.7776, "num_input_tokens_seen": 8034713600, "step": 15325 }, { "epoch": 0.7434484514521614, "grad_norm": 0.2470703125, "learning_rate": 8.47012082551256e-06, "loss": 2.7796, "num_input_tokens_seen": 8037335040, "step": 15330 }, { "epoch": 0.743690933008408, "grad_norm": 0.2431640625, "learning_rate": 8.455086664064713e-06, "loss": 2.7897, "num_input_tokens_seen": 8039956480, "step": 15335 }, { "epoch": 0.7439334145646547, "grad_norm": 0.2392578125, "learning_rate": 8.440063141099666e-06, "loss": 2.7773, "num_input_tokens_seen": 8042577920, "step": 15340 }, { "epoch": 0.7441758961209013, "grad_norm": 0.2451171875, "learning_rate": 8.425050266277609e-06, "loss": 2.7693, "num_input_tokens_seen": 8045199360, "step": 15345 }, { "epoch": 0.7444183776771479, "grad_norm": 0.2431640625, "learning_rate": 8.410048049251922e-06, "loss": 2.7884, "num_input_tokens_seen": 8047820800, "step": 15350 }, { "epoch": 0.7446608592333945, "grad_norm": 0.2431640625, "learning_rate": 8.395056499669127e-06, "loss": 2.7588, "num_input_tokens_seen": 8050442240, "step": 15355 }, { "epoch": 0.7449033407896412, "grad_norm": 0.2392578125, "learning_rate": 8.380075627168884e-06, "loss": 2.76, "num_input_tokens_seen": 8053063680, "step": 15360 }, { "epoch": 0.7451458223458878, "grad_norm": 0.244140625, "learning_rate": 8.365105441383986e-06, "loss": 2.7677, "num_input_tokens_seen": 8055685120, "step": 15365 }, { "epoch": 0.7453883039021344, "grad_norm": 0.23828125, "learning_rate": 8.350145951940364e-06, "loss": 2.7764, "num_input_tokens_seen": 8058306560, "step": 15370 }, { "epoch": 0.745630785458381, "grad_norm": 0.2412109375, "learning_rate": 8.335197168457048e-06, "loss": 2.7667, "num_input_tokens_seen": 8060928000, "step": 15375 }, { "epoch": 0.7458732670146277, "grad_norm": 0.248046875, "learning_rate": 8.32025910054621e-06, "loss": 2.7695, "num_input_tokens_seen": 8063549440, "step": 15380 }, { "epoch": 0.7461157485708744, "grad_norm": 0.2451171875, "learning_rate": 8.305331757813115e-06, "loss": 2.7786, "num_input_tokens_seen": 8066170880, "step": 15385 }, { "epoch": 0.746358230127121, "grad_norm": 0.2412109375, "learning_rate": 8.290415149856134e-06, "loss": 2.758, "num_input_tokens_seen": 8068792320, "step": 15390 }, { "epoch": 0.7466007116833676, "grad_norm": 0.240234375, "learning_rate": 8.275509286266755e-06, "loss": 2.7753, "num_input_tokens_seen": 8071413760, "step": 15395 }, { "epoch": 0.7468431932396142, "grad_norm": 0.24609375, "learning_rate": 8.260614176629544e-06, "loss": 2.7729, "num_input_tokens_seen": 8074035200, "step": 15400 }, { "epoch": 0.7470856747958609, "grad_norm": 0.248046875, "learning_rate": 8.245729830522128e-06, "loss": 2.7754, "num_input_tokens_seen": 8076656640, "step": 15405 }, { "epoch": 0.7473281563521075, "grad_norm": 0.2412109375, "learning_rate": 8.230856257515245e-06, "loss": 2.7642, "num_input_tokens_seen": 8079278080, "step": 15410 }, { "epoch": 0.7475706379083541, "grad_norm": 0.2431640625, "learning_rate": 8.215993467172697e-06, "loss": 2.7596, "num_input_tokens_seen": 8081899520, "step": 15415 }, { "epoch": 0.7478131194646007, "grad_norm": 0.23828125, "learning_rate": 8.201141469051351e-06, "loss": 2.7666, "num_input_tokens_seen": 8084520960, "step": 15420 }, { "epoch": 0.7480556010208473, "grad_norm": 0.25390625, "learning_rate": 8.186300272701134e-06, "loss": 2.7715, "num_input_tokens_seen": 8087142400, "step": 15425 }, { "epoch": 0.748298082577094, "grad_norm": 0.2470703125, "learning_rate": 8.171469887665035e-06, "loss": 2.7671, "num_input_tokens_seen": 8089763840, "step": 15430 }, { "epoch": 0.7485405641333406, "grad_norm": 0.2392578125, "learning_rate": 8.15665032347907e-06, "loss": 2.78, "num_input_tokens_seen": 8092385280, "step": 15435 }, { "epoch": 0.7487830456895872, "grad_norm": 0.2421875, "learning_rate": 8.141841589672316e-06, "loss": 2.7722, "num_input_tokens_seen": 8095006720, "step": 15440 }, { "epoch": 0.7490255272458338, "grad_norm": 0.2392578125, "learning_rate": 8.127043695766879e-06, "loss": 2.7718, "num_input_tokens_seen": 8097628160, "step": 15445 }, { "epoch": 0.7492680088020804, "grad_norm": 0.2392578125, "learning_rate": 8.11225665127791e-06, "loss": 2.7835, "num_input_tokens_seen": 8100249600, "step": 15450 }, { "epoch": 0.7495104903583272, "grad_norm": 0.2431640625, "learning_rate": 8.097480465713563e-06, "loss": 2.7689, "num_input_tokens_seen": 8102871040, "step": 15455 }, { "epoch": 0.7497529719145738, "grad_norm": 0.23828125, "learning_rate": 8.082715148575018e-06, "loss": 2.7796, "num_input_tokens_seen": 8105492480, "step": 15460 }, { "epoch": 0.7499954534708204, "grad_norm": 0.244140625, "learning_rate": 8.067960709356478e-06, "loss": 2.7587, "num_input_tokens_seen": 8108113920, "step": 15465 }, { "epoch": 0.750237935027067, "grad_norm": 0.244140625, "learning_rate": 8.053217157545117e-06, "loss": 2.7646, "num_input_tokens_seen": 8110735360, "step": 15470 }, { "epoch": 0.7504804165833137, "grad_norm": 0.2451171875, "learning_rate": 8.038484502621144e-06, "loss": 2.7692, "num_input_tokens_seen": 8113356800, "step": 15475 }, { "epoch": 0.7507228981395603, "grad_norm": 0.2412109375, "learning_rate": 8.023762754057748e-06, "loss": 2.7649, "num_input_tokens_seen": 8115978240, "step": 15480 }, { "epoch": 0.7509653796958069, "grad_norm": 0.244140625, "learning_rate": 8.009051921321101e-06, "loss": 2.7863, "num_input_tokens_seen": 8118599680, "step": 15485 }, { "epoch": 0.7512078612520535, "grad_norm": 0.240234375, "learning_rate": 7.994352013870366e-06, "loss": 2.7695, "num_input_tokens_seen": 8121221120, "step": 15490 }, { "epoch": 0.7514503428083001, "grad_norm": 0.2451171875, "learning_rate": 7.979663041157673e-06, "loss": 2.7841, "num_input_tokens_seen": 8123842560, "step": 15495 }, { "epoch": 0.7516928243645468, "grad_norm": 0.2412109375, "learning_rate": 7.964985012628123e-06, "loss": 2.7864, "num_input_tokens_seen": 8126464000, "step": 15500 }, { "epoch": 0.7519353059207934, "grad_norm": 0.2431640625, "learning_rate": 7.950317937719782e-06, "loss": 2.7813, "num_input_tokens_seen": 8129085440, "step": 15505 }, { "epoch": 0.75217778747704, "grad_norm": 0.244140625, "learning_rate": 7.935661825863669e-06, "loss": 2.7689, "num_input_tokens_seen": 8131706880, "step": 15510 }, { "epoch": 0.7524202690332866, "grad_norm": 0.2451171875, "learning_rate": 7.921016686483757e-06, "loss": 2.779, "num_input_tokens_seen": 8134328320, "step": 15515 }, { "epoch": 0.7526627505895332, "grad_norm": 0.2421875, "learning_rate": 7.906382528996958e-06, "loss": 2.7874, "num_input_tokens_seen": 8136949760, "step": 15520 }, { "epoch": 0.7529052321457799, "grad_norm": 0.2490234375, "learning_rate": 7.891759362813142e-06, "loss": 2.7839, "num_input_tokens_seen": 8139571200, "step": 15525 }, { "epoch": 0.7531477137020265, "grad_norm": 0.2421875, "learning_rate": 7.877147197335075e-06, "loss": 2.7597, "num_input_tokens_seen": 8142192640, "step": 15530 }, { "epoch": 0.7533901952582732, "grad_norm": 0.2431640625, "learning_rate": 7.862546041958482e-06, "loss": 2.7801, "num_input_tokens_seen": 8144814080, "step": 15535 }, { "epoch": 0.7536326768145198, "grad_norm": 0.24609375, "learning_rate": 7.847955906071994e-06, "loss": 2.7751, "num_input_tokens_seen": 8147435520, "step": 15540 }, { "epoch": 0.7538751583707665, "grad_norm": 0.2490234375, "learning_rate": 7.833376799057163e-06, "loss": 2.7744, "num_input_tokens_seen": 8150056960, "step": 15545 }, { "epoch": 0.7541176399270131, "grad_norm": 0.2353515625, "learning_rate": 7.81880873028844e-06, "loss": 2.7735, "num_input_tokens_seen": 8152678400, "step": 15550 }, { "epoch": 0.7543601214832597, "grad_norm": 0.2470703125, "learning_rate": 7.804251709133192e-06, "loss": 2.7793, "num_input_tokens_seen": 8155299840, "step": 15555 }, { "epoch": 0.7546026030395063, "grad_norm": 0.2373046875, "learning_rate": 7.789705744951672e-06, "loss": 2.7715, "num_input_tokens_seen": 8157921280, "step": 15560 }, { "epoch": 0.754845084595753, "grad_norm": 0.24609375, "learning_rate": 7.775170847097026e-06, "loss": 2.7799, "num_input_tokens_seen": 8160542720, "step": 15565 }, { "epoch": 0.7550875661519996, "grad_norm": 0.240234375, "learning_rate": 7.760647024915283e-06, "loss": 2.7828, "num_input_tokens_seen": 8163164160, "step": 15570 }, { "epoch": 0.7553300477082462, "grad_norm": 0.25390625, "learning_rate": 7.746134287745349e-06, "loss": 2.784, "num_input_tokens_seen": 8165785600, "step": 15575 }, { "epoch": 0.7555725292644928, "grad_norm": 0.2470703125, "learning_rate": 7.731632644919012e-06, "loss": 2.7859, "num_input_tokens_seen": 8168407040, "step": 15580 }, { "epoch": 0.7558150108207394, "grad_norm": 0.23828125, "learning_rate": 7.717142105760921e-06, "loss": 2.7775, "num_input_tokens_seen": 8171028480, "step": 15585 }, { "epoch": 0.756057492376986, "grad_norm": 0.2421875, "learning_rate": 7.702662679588572e-06, "loss": 2.7839, "num_input_tokens_seen": 8173649920, "step": 15590 }, { "epoch": 0.7562999739332327, "grad_norm": 0.2431640625, "learning_rate": 7.688194375712327e-06, "loss": 2.7691, "num_input_tokens_seen": 8176271360, "step": 15595 }, { "epoch": 0.7565424554894793, "grad_norm": 0.2421875, "learning_rate": 7.673737203435405e-06, "loss": 2.777, "num_input_tokens_seen": 8178892800, "step": 15600 }, { "epoch": 0.7565424554894793, "eval_accuracy": 0.45587363621560006, "eval_loss": 2.741650342941284, "eval_runtime": 5.8981, "eval_samples_per_second": 50.864, "eval_steps_per_second": 6.443, "num_input_tokens_seen": 8178892800, "step": 15600 }, { "epoch": 0.7567849370457259, "grad_norm": 0.2490234375, "learning_rate": 7.65929117205385e-06, "loss": 2.7818, "num_input_tokens_seen": 8181514240, "step": 15605 }, { "epoch": 0.7570274186019725, "grad_norm": 0.2451171875, "learning_rate": 7.644856290856559e-06, "loss": 2.777, "num_input_tokens_seen": 8184135680, "step": 15610 }, { "epoch": 0.7572699001582193, "grad_norm": 0.2451171875, "learning_rate": 7.630432569125245e-06, "loss": 2.7863, "num_input_tokens_seen": 8186757120, "step": 15615 }, { "epoch": 0.7575123817144659, "grad_norm": 0.23828125, "learning_rate": 7.616020016134451e-06, "loss": 2.7721, "num_input_tokens_seen": 8189378560, "step": 15620 }, { "epoch": 0.7577548632707125, "grad_norm": 0.2353515625, "learning_rate": 7.601618641151542e-06, "loss": 2.7765, "num_input_tokens_seen": 8192000000, "step": 15625 }, { "epoch": 0.7579973448269591, "grad_norm": 0.25, "learning_rate": 7.587228453436693e-06, "loss": 2.7773, "num_input_tokens_seen": 8194621440, "step": 15630 }, { "epoch": 0.7582398263832058, "grad_norm": 0.244140625, "learning_rate": 7.572849462242879e-06, "loss": 2.7723, "num_input_tokens_seen": 8197242880, "step": 15635 }, { "epoch": 0.7584823079394524, "grad_norm": 0.2412109375, "learning_rate": 7.558481676815887e-06, "loss": 2.7852, "num_input_tokens_seen": 8199864320, "step": 15640 }, { "epoch": 0.758724789495699, "grad_norm": 0.2412109375, "learning_rate": 7.544125106394289e-06, "loss": 2.7693, "num_input_tokens_seen": 8202485760, "step": 15645 }, { "epoch": 0.7589672710519456, "grad_norm": 0.2353515625, "learning_rate": 7.52977976020946e-06, "loss": 2.7761, "num_input_tokens_seen": 8205107200, "step": 15650 }, { "epoch": 0.7592097526081922, "grad_norm": 0.2421875, "learning_rate": 7.5154456474855305e-06, "loss": 2.7785, "num_input_tokens_seen": 8207728640, "step": 15655 }, { "epoch": 0.7594522341644389, "grad_norm": 0.244140625, "learning_rate": 7.501122777439435e-06, "loss": 2.7717, "num_input_tokens_seen": 8210350080, "step": 15660 }, { "epoch": 0.7596947157206855, "grad_norm": 0.2421875, "learning_rate": 7.486811159280863e-06, "loss": 2.7675, "num_input_tokens_seen": 8212971520, "step": 15665 }, { "epoch": 0.7599371972769321, "grad_norm": 0.251953125, "learning_rate": 7.472510802212279e-06, "loss": 2.7714, "num_input_tokens_seen": 8215592960, "step": 15670 }, { "epoch": 0.7601796788331787, "grad_norm": 0.234375, "learning_rate": 7.458221715428893e-06, "loss": 2.7733, "num_input_tokens_seen": 8218214400, "step": 15675 }, { "epoch": 0.7604221603894253, "grad_norm": 0.2412109375, "learning_rate": 7.443943908118703e-06, "loss": 2.777, "num_input_tokens_seen": 8220835840, "step": 15680 }, { "epoch": 0.760664641945672, "grad_norm": 0.2451171875, "learning_rate": 7.4296773894624e-06, "loss": 2.773, "num_input_tokens_seen": 8223457280, "step": 15685 }, { "epoch": 0.7609071235019186, "grad_norm": 0.24609375, "learning_rate": 7.415422168633457e-06, "loss": 2.7707, "num_input_tokens_seen": 8226078720, "step": 15690 }, { "epoch": 0.7611496050581653, "grad_norm": 0.23828125, "learning_rate": 7.4011782547980665e-06, "loss": 2.7761, "num_input_tokens_seen": 8228700160, "step": 15695 }, { "epoch": 0.7613920866144119, "grad_norm": 0.24609375, "learning_rate": 7.386945657115158e-06, "loss": 2.7625, "num_input_tokens_seen": 8231321600, "step": 15700 }, { "epoch": 0.7616345681706586, "grad_norm": 0.2412109375, "learning_rate": 7.372724384736377e-06, "loss": 2.7709, "num_input_tokens_seen": 8233943040, "step": 15705 }, { "epoch": 0.7618770497269052, "grad_norm": 0.24609375, "learning_rate": 7.358514446806103e-06, "loss": 2.7756, "num_input_tokens_seen": 8236564480, "step": 15710 }, { "epoch": 0.7621195312831518, "grad_norm": 0.2392578125, "learning_rate": 7.3443158524613946e-06, "loss": 2.7763, "num_input_tokens_seen": 8239185920, "step": 15715 }, { "epoch": 0.7623620128393984, "grad_norm": 0.2431640625, "learning_rate": 7.330128610832049e-06, "loss": 2.7757, "num_input_tokens_seen": 8241807360, "step": 15720 }, { "epoch": 0.762604494395645, "grad_norm": 0.240234375, "learning_rate": 7.3159527310405454e-06, "loss": 2.7879, "num_input_tokens_seen": 8244428800, "step": 15725 }, { "epoch": 0.7628469759518917, "grad_norm": 0.251953125, "learning_rate": 7.301788222202063e-06, "loss": 2.7797, "num_input_tokens_seen": 8247050240, "step": 15730 }, { "epoch": 0.7630894575081383, "grad_norm": 0.25, "learning_rate": 7.28763509342448e-06, "loss": 2.787, "num_input_tokens_seen": 8249671680, "step": 15735 }, { "epoch": 0.7633319390643849, "grad_norm": 0.2392578125, "learning_rate": 7.273493353808347e-06, "loss": 2.7699, "num_input_tokens_seen": 8252293120, "step": 15740 }, { "epoch": 0.7635744206206315, "grad_norm": 0.2392578125, "learning_rate": 7.259363012446876e-06, "loss": 2.7927, "num_input_tokens_seen": 8254914560, "step": 15745 }, { "epoch": 0.7638169021768781, "grad_norm": 0.2451171875, "learning_rate": 7.245244078425975e-06, "loss": 2.769, "num_input_tokens_seen": 8257536000, "step": 15750 }, { "epoch": 0.7640593837331248, "grad_norm": 0.240234375, "learning_rate": 7.231136560824206e-06, "loss": 2.7718, "num_input_tokens_seen": 8260157440, "step": 15755 }, { "epoch": 0.7643018652893714, "grad_norm": 0.25, "learning_rate": 7.217040468712788e-06, "loss": 2.7671, "num_input_tokens_seen": 8262778880, "step": 15760 }, { "epoch": 0.764544346845618, "grad_norm": 0.2353515625, "learning_rate": 7.202955811155601e-06, "loss": 2.7749, "num_input_tokens_seen": 8265400320, "step": 15765 }, { "epoch": 0.7647868284018647, "grad_norm": 0.2373046875, "learning_rate": 7.188882597209162e-06, "loss": 2.7727, "num_input_tokens_seen": 8268021760, "step": 15770 }, { "epoch": 0.7650293099581114, "grad_norm": 0.23828125, "learning_rate": 7.174820835922649e-06, "loss": 2.7711, "num_input_tokens_seen": 8270643200, "step": 15775 }, { "epoch": 0.765271791514358, "grad_norm": 0.240234375, "learning_rate": 7.160770536337838e-06, "loss": 2.7879, "num_input_tokens_seen": 8273264640, "step": 15780 }, { "epoch": 0.7655142730706046, "grad_norm": 0.24609375, "learning_rate": 7.146731707489179e-06, "loss": 2.7784, "num_input_tokens_seen": 8275886080, "step": 15785 }, { "epoch": 0.7657567546268512, "grad_norm": 0.244140625, "learning_rate": 7.132704358403724e-06, "loss": 2.7661, "num_input_tokens_seen": 8278507520, "step": 15790 }, { "epoch": 0.7659992361830978, "grad_norm": 0.23828125, "learning_rate": 7.118688498101145e-06, "loss": 2.7736, "num_input_tokens_seen": 8281128960, "step": 15795 }, { "epoch": 0.7662417177393445, "grad_norm": 0.248046875, "learning_rate": 7.104684135593726e-06, "loss": 2.7734, "num_input_tokens_seen": 8283750400, "step": 15800 }, { "epoch": 0.7664841992955911, "grad_norm": 0.2431640625, "learning_rate": 7.0906912798863666e-06, "loss": 2.7645, "num_input_tokens_seen": 8286371840, "step": 15805 }, { "epoch": 0.7667266808518377, "grad_norm": 0.240234375, "learning_rate": 7.076709939976548e-06, "loss": 2.7691, "num_input_tokens_seen": 8288993280, "step": 15810 }, { "epoch": 0.7669691624080843, "grad_norm": 0.240234375, "learning_rate": 7.062740124854367e-06, "loss": 2.7679, "num_input_tokens_seen": 8291614720, "step": 15815 }, { "epoch": 0.767211643964331, "grad_norm": 0.244140625, "learning_rate": 7.0487818435025e-06, "loss": 2.7658, "num_input_tokens_seen": 8294236160, "step": 15820 }, { "epoch": 0.7674541255205776, "grad_norm": 0.2421875, "learning_rate": 7.034835104896209e-06, "loss": 2.7741, "num_input_tokens_seen": 8296857600, "step": 15825 }, { "epoch": 0.7676966070768242, "grad_norm": 0.2431640625, "learning_rate": 7.020899918003337e-06, "loss": 2.7861, "num_input_tokens_seen": 8299479040, "step": 15830 }, { "epoch": 0.7679390886330708, "grad_norm": 0.2373046875, "learning_rate": 7.006976291784295e-06, "loss": 2.7906, "num_input_tokens_seen": 8302100480, "step": 15835 }, { "epoch": 0.7681815701893174, "grad_norm": 0.2470703125, "learning_rate": 6.99306423519206e-06, "loss": 2.763, "num_input_tokens_seen": 8304721920, "step": 15840 }, { "epoch": 0.768424051745564, "grad_norm": 0.2451171875, "learning_rate": 6.9791637571721744e-06, "loss": 2.7909, "num_input_tokens_seen": 8307343360, "step": 15845 }, { "epoch": 0.7686665333018108, "grad_norm": 0.2421875, "learning_rate": 6.96527486666273e-06, "loss": 2.7655, "num_input_tokens_seen": 8309964800, "step": 15850 }, { "epoch": 0.7689090148580574, "grad_norm": 0.25, "learning_rate": 6.951397572594373e-06, "loss": 2.7775, "num_input_tokens_seen": 8312586240, "step": 15855 }, { "epoch": 0.769151496414304, "grad_norm": 0.244140625, "learning_rate": 6.937531883890286e-06, "loss": 2.7707, "num_input_tokens_seen": 8315207680, "step": 15860 }, { "epoch": 0.7693939779705506, "grad_norm": 0.24609375, "learning_rate": 6.923677809466206e-06, "loss": 2.7756, "num_input_tokens_seen": 8317829120, "step": 15865 }, { "epoch": 0.7696364595267973, "grad_norm": 0.2451171875, "learning_rate": 6.909835358230371e-06, "loss": 2.7584, "num_input_tokens_seen": 8320450560, "step": 15870 }, { "epoch": 0.7698789410830439, "grad_norm": 0.2412109375, "learning_rate": 6.896004539083573e-06, "loss": 2.7819, "num_input_tokens_seen": 8323072000, "step": 15875 }, { "epoch": 0.7701214226392905, "grad_norm": 0.2451171875, "learning_rate": 6.8821853609191165e-06, "loss": 2.7654, "num_input_tokens_seen": 8325693440, "step": 15880 }, { "epoch": 0.7703639041955371, "grad_norm": 0.2421875, "learning_rate": 6.868377832622813e-06, "loss": 2.7798, "num_input_tokens_seen": 8328314880, "step": 15885 }, { "epoch": 0.7706063857517838, "grad_norm": 0.2412109375, "learning_rate": 6.854581963072998e-06, "loss": 2.7789, "num_input_tokens_seen": 8330936320, "step": 15890 }, { "epoch": 0.7708488673080304, "grad_norm": 0.2431640625, "learning_rate": 6.840797761140497e-06, "loss": 2.7896, "num_input_tokens_seen": 8333557760, "step": 15895 }, { "epoch": 0.771091348864277, "grad_norm": 0.2392578125, "learning_rate": 6.827025235688641e-06, "loss": 2.7793, "num_input_tokens_seen": 8336179200, "step": 15900 }, { "epoch": 0.771091348864277, "eval_accuracy": 0.45600227975899693, "eval_loss": 2.7416441440582275, "eval_runtime": 5.8798, "eval_samples_per_second": 51.022, "eval_steps_per_second": 6.463, "num_input_tokens_seen": 8336179200, "step": 15900 }, { "epoch": 0.7713338304205236, "grad_norm": 0.24609375, "learning_rate": 6.813264395573246e-06, "loss": 2.7759, "num_input_tokens_seen": 8338800640, "step": 15905 }, { "epoch": 0.7715763119767702, "grad_norm": 0.248046875, "learning_rate": 6.7995152496426215e-06, "loss": 2.7731, "num_input_tokens_seen": 8341422080, "step": 15910 }, { "epoch": 0.7718187935330169, "grad_norm": 0.2412109375, "learning_rate": 6.785777806737554e-06, "loss": 2.7777, "num_input_tokens_seen": 8344043520, "step": 15915 }, { "epoch": 0.7720612750892635, "grad_norm": 0.251953125, "learning_rate": 6.772052075691304e-06, "loss": 2.7744, "num_input_tokens_seen": 8346664960, "step": 15920 }, { "epoch": 0.7723037566455101, "grad_norm": 0.251953125, "learning_rate": 6.758338065329603e-06, "loss": 2.7757, "num_input_tokens_seen": 8349286400, "step": 15925 }, { "epoch": 0.7725462382017568, "grad_norm": 0.2451171875, "learning_rate": 6.744635784470654e-06, "loss": 2.7793, "num_input_tokens_seen": 8351907840, "step": 15930 }, { "epoch": 0.7727887197580035, "grad_norm": 0.251953125, "learning_rate": 6.730945241925093e-06, "loss": 2.7813, "num_input_tokens_seen": 8354529280, "step": 15935 }, { "epoch": 0.7730312013142501, "grad_norm": 0.2353515625, "learning_rate": 6.717266446496034e-06, "loss": 2.764, "num_input_tokens_seen": 8357150720, "step": 15940 }, { "epoch": 0.7732736828704967, "grad_norm": 0.2431640625, "learning_rate": 6.703599406979025e-06, "loss": 2.7767, "num_input_tokens_seen": 8359772160, "step": 15945 }, { "epoch": 0.7735161644267433, "grad_norm": 0.251953125, "learning_rate": 6.689944132162057e-06, "loss": 2.7694, "num_input_tokens_seen": 8362393600, "step": 15950 }, { "epoch": 0.7737586459829899, "grad_norm": 0.244140625, "learning_rate": 6.676300630825563e-06, "loss": 2.7759, "num_input_tokens_seen": 8365015040, "step": 15955 }, { "epoch": 0.7740011275392366, "grad_norm": 0.2412109375, "learning_rate": 6.662668911742395e-06, "loss": 2.778, "num_input_tokens_seen": 8367636480, "step": 15960 }, { "epoch": 0.7742436090954832, "grad_norm": 0.25, "learning_rate": 6.649048983677833e-06, "loss": 2.7658, "num_input_tokens_seen": 8370257920, "step": 15965 }, { "epoch": 0.7744860906517298, "grad_norm": 0.234375, "learning_rate": 6.63544085538958e-06, "loss": 2.765, "num_input_tokens_seen": 8372879360, "step": 15970 }, { "epoch": 0.7747285722079764, "grad_norm": 0.2421875, "learning_rate": 6.621844535627744e-06, "loss": 2.7716, "num_input_tokens_seen": 8375500800, "step": 15975 }, { "epoch": 0.774971053764223, "grad_norm": 0.2412109375, "learning_rate": 6.608260033134847e-06, "loss": 2.7742, "num_input_tokens_seen": 8378122240, "step": 15980 }, { "epoch": 0.7752135353204697, "grad_norm": 0.2392578125, "learning_rate": 6.594687356645807e-06, "loss": 2.765, "num_input_tokens_seen": 8380743680, "step": 15985 }, { "epoch": 0.7754560168767163, "grad_norm": 0.23828125, "learning_rate": 6.5811265148879444e-06, "loss": 2.7757, "num_input_tokens_seen": 8383365120, "step": 15990 }, { "epoch": 0.7756984984329629, "grad_norm": 0.2353515625, "learning_rate": 6.5675775165809585e-06, "loss": 2.7782, "num_input_tokens_seen": 8385986560, "step": 15995 }, { "epoch": 0.7759409799892095, "grad_norm": 0.2392578125, "learning_rate": 6.554040370436939e-06, "loss": 2.7818, "num_input_tokens_seen": 8388608000, "step": 16000 }, { "epoch": 0.7761834615454561, "grad_norm": 0.2373046875, "learning_rate": 6.540515085160359e-06, "loss": 2.7716, "num_input_tokens_seen": 8391229440, "step": 16005 }, { "epoch": 0.7764259431017029, "grad_norm": 0.240234375, "learning_rate": 6.527001669448063e-06, "loss": 2.7779, "num_input_tokens_seen": 8393850880, "step": 16010 }, { "epoch": 0.7766684246579495, "grad_norm": 0.2353515625, "learning_rate": 6.51350013198925e-06, "loss": 2.7789, "num_input_tokens_seen": 8396472320, "step": 16015 }, { "epoch": 0.7769109062141961, "grad_norm": 0.236328125, "learning_rate": 6.500010481465521e-06, "loss": 2.7627, "num_input_tokens_seen": 8399093760, "step": 16020 }, { "epoch": 0.7771533877704427, "grad_norm": 0.2451171875, "learning_rate": 6.4865327265507815e-06, "loss": 2.7756, "num_input_tokens_seen": 8401715200, "step": 16025 }, { "epoch": 0.7773958693266894, "grad_norm": 0.2412109375, "learning_rate": 6.473066875911315e-06, "loss": 2.7769, "num_input_tokens_seen": 8404336640, "step": 16030 }, { "epoch": 0.777638350882936, "grad_norm": 0.23828125, "learning_rate": 6.459612938205755e-06, "loss": 2.7742, "num_input_tokens_seen": 8406958080, "step": 16035 }, { "epoch": 0.7778808324391826, "grad_norm": 0.2451171875, "learning_rate": 6.446170922085063e-06, "loss": 2.7723, "num_input_tokens_seen": 8409579520, "step": 16040 }, { "epoch": 0.7781233139954292, "grad_norm": 0.2451171875, "learning_rate": 6.432740836192541e-06, "loss": 2.7519, "num_input_tokens_seen": 8412200960, "step": 16045 }, { "epoch": 0.7783657955516758, "grad_norm": 0.2431640625, "learning_rate": 6.419322689163826e-06, "loss": 2.7786, "num_input_tokens_seen": 8414822400, "step": 16050 }, { "epoch": 0.7786082771079225, "grad_norm": 0.2421875, "learning_rate": 6.4059164896268534e-06, "loss": 2.776, "num_input_tokens_seen": 8417443840, "step": 16055 }, { "epoch": 0.7788507586641691, "grad_norm": 0.25390625, "learning_rate": 6.392522246201901e-06, "loss": 2.7704, "num_input_tokens_seen": 8420065280, "step": 16060 }, { "epoch": 0.7790932402204157, "grad_norm": 0.2421875, "learning_rate": 6.379139967501555e-06, "loss": 2.7883, "num_input_tokens_seen": 8422686720, "step": 16065 }, { "epoch": 0.7793357217766623, "grad_norm": 0.2373046875, "learning_rate": 6.365769662130694e-06, "loss": 2.7877, "num_input_tokens_seen": 8425308160, "step": 16070 }, { "epoch": 0.779578203332909, "grad_norm": 0.2470703125, "learning_rate": 6.352411338686523e-06, "loss": 2.7648, "num_input_tokens_seen": 8427929600, "step": 16075 }, { "epoch": 0.7798206848891556, "grad_norm": 0.2470703125, "learning_rate": 6.339065005758521e-06, "loss": 2.7677, "num_input_tokens_seen": 8430551040, "step": 16080 }, { "epoch": 0.7800631664454023, "grad_norm": 0.2373046875, "learning_rate": 6.325730671928468e-06, "loss": 2.7849, "num_input_tokens_seen": 8433172480, "step": 16085 }, { "epoch": 0.7803056480016489, "grad_norm": 0.2431640625, "learning_rate": 6.312408345770413e-06, "loss": 2.7696, "num_input_tokens_seen": 8435793920, "step": 16090 }, { "epoch": 0.7805481295578955, "grad_norm": 0.2431640625, "learning_rate": 6.299098035850701e-06, "loss": 2.7739, "num_input_tokens_seen": 8438415360, "step": 16095 }, { "epoch": 0.7807906111141422, "grad_norm": 0.2431640625, "learning_rate": 6.2857997507279445e-06, "loss": 2.7591, "num_input_tokens_seen": 8441036800, "step": 16100 }, { "epoch": 0.7810330926703888, "grad_norm": 0.2431640625, "learning_rate": 6.272513498953022e-06, "loss": 2.7713, "num_input_tokens_seen": 8443658240, "step": 16105 }, { "epoch": 0.7812755742266354, "grad_norm": 0.2451171875, "learning_rate": 6.259239289069083e-06, "loss": 2.7859, "num_input_tokens_seen": 8446279680, "step": 16110 }, { "epoch": 0.781518055782882, "grad_norm": 0.2412109375, "learning_rate": 6.245977129611527e-06, "loss": 2.7817, "num_input_tokens_seen": 8448901120, "step": 16115 }, { "epoch": 0.7817605373391286, "grad_norm": 0.2431640625, "learning_rate": 6.2327270291079876e-06, "loss": 2.7595, "num_input_tokens_seen": 8451522560, "step": 16120 }, { "epoch": 0.7820030188953753, "grad_norm": 0.2431640625, "learning_rate": 6.219488996078385e-06, "loss": 2.7827, "num_input_tokens_seen": 8454144000, "step": 16125 }, { "epoch": 0.7822455004516219, "grad_norm": 0.2373046875, "learning_rate": 6.206263039034846e-06, "loss": 2.7706, "num_input_tokens_seen": 8456765440, "step": 16130 }, { "epoch": 0.7824879820078685, "grad_norm": 0.2392578125, "learning_rate": 6.193049166481749e-06, "loss": 2.7797, "num_input_tokens_seen": 8459386880, "step": 16135 }, { "epoch": 0.7827304635641151, "grad_norm": 0.2421875, "learning_rate": 6.179847386915691e-06, "loss": 2.7743, "num_input_tokens_seen": 8462008320, "step": 16140 }, { "epoch": 0.7829729451203618, "grad_norm": 0.248046875, "learning_rate": 6.16665770882551e-06, "loss": 2.7944, "num_input_tokens_seen": 8464629760, "step": 16145 }, { "epoch": 0.7832154266766084, "grad_norm": 0.2490234375, "learning_rate": 6.1534801406922385e-06, "loss": 2.7896, "num_input_tokens_seen": 8467251200, "step": 16150 }, { "epoch": 0.783457908232855, "grad_norm": 0.2421875, "learning_rate": 6.140314690989138e-06, "loss": 2.7812, "num_input_tokens_seen": 8469872640, "step": 16155 }, { "epoch": 0.7837003897891016, "grad_norm": 0.24609375, "learning_rate": 6.1271613681816776e-06, "loss": 2.7842, "num_input_tokens_seen": 8472494080, "step": 16160 }, { "epoch": 0.7839428713453483, "grad_norm": 0.2373046875, "learning_rate": 6.114020180727525e-06, "loss": 2.7922, "num_input_tokens_seen": 8475115520, "step": 16165 }, { "epoch": 0.784185352901595, "grad_norm": 0.2412109375, "learning_rate": 6.100891137076548e-06, "loss": 2.785, "num_input_tokens_seen": 8477736960, "step": 16170 }, { "epoch": 0.7844278344578416, "grad_norm": 0.2490234375, "learning_rate": 6.087774245670802e-06, "loss": 2.7698, "num_input_tokens_seen": 8480358400, "step": 16175 }, { "epoch": 0.7846703160140882, "grad_norm": 0.236328125, "learning_rate": 6.074669514944528e-06, "loss": 2.7688, "num_input_tokens_seen": 8482979840, "step": 16180 }, { "epoch": 0.7849127975703348, "grad_norm": 0.2431640625, "learning_rate": 6.061576953324155e-06, "loss": 2.7773, "num_input_tokens_seen": 8485601280, "step": 16185 }, { "epoch": 0.7851552791265815, "grad_norm": 0.25, "learning_rate": 6.048496569228279e-06, "loss": 2.7752, "num_input_tokens_seen": 8488222720, "step": 16190 }, { "epoch": 0.7853977606828281, "grad_norm": 0.2421875, "learning_rate": 6.03542837106767e-06, "loss": 2.7706, "num_input_tokens_seen": 8490844160, "step": 16195 }, { "epoch": 0.7856402422390747, "grad_norm": 0.2451171875, "learning_rate": 6.0223723672452605e-06, "loss": 2.7718, "num_input_tokens_seen": 8493465600, "step": 16200 }, { "epoch": 0.7856402422390747, "eval_accuracy": 0.4559338869890897, "eval_loss": 2.741624355316162, "eval_runtime": 5.8381, "eval_samples_per_second": 51.387, "eval_steps_per_second": 6.509, "num_input_tokens_seen": 8493465600, "step": 16200 }, { "epoch": 0.7858827237953213, "grad_norm": 0.240234375, "learning_rate": 6.0093285661561495e-06, "loss": 2.7792, "num_input_tokens_seen": 8496087040, "step": 16205 }, { "epoch": 0.7861252053515679, "grad_norm": 0.24609375, "learning_rate": 5.996296976187568e-06, "loss": 2.7785, "num_input_tokens_seen": 8498708480, "step": 16210 }, { "epoch": 0.7863676869078146, "grad_norm": 0.2451171875, "learning_rate": 5.983277605718921e-06, "loss": 2.7751, "num_input_tokens_seen": 8501329920, "step": 16215 }, { "epoch": 0.7866101684640612, "grad_norm": 0.251953125, "learning_rate": 5.9702704631217385e-06, "loss": 2.7609, "num_input_tokens_seen": 8503951360, "step": 16220 }, { "epoch": 0.7868526500203078, "grad_norm": 0.240234375, "learning_rate": 5.9572755567596975e-06, "loss": 2.7857, "num_input_tokens_seen": 8506572800, "step": 16225 }, { "epoch": 0.7870951315765544, "grad_norm": 0.25, "learning_rate": 5.944292894988607e-06, "loss": 2.7803, "num_input_tokens_seen": 8509194240, "step": 16230 }, { "epoch": 0.787337613132801, "grad_norm": 0.2412109375, "learning_rate": 5.931322486156396e-06, "loss": 2.7821, "num_input_tokens_seen": 8511815680, "step": 16235 }, { "epoch": 0.7875800946890477, "grad_norm": 0.23828125, "learning_rate": 5.918364338603119e-06, "loss": 2.7767, "num_input_tokens_seen": 8514437120, "step": 16240 }, { "epoch": 0.7878225762452944, "grad_norm": 0.2451171875, "learning_rate": 5.905418460660947e-06, "loss": 2.7744, "num_input_tokens_seen": 8517058560, "step": 16245 }, { "epoch": 0.788065057801541, "grad_norm": 0.2412109375, "learning_rate": 5.892484860654162e-06, "loss": 2.7776, "num_input_tokens_seen": 8519680000, "step": 16250 }, { "epoch": 0.7883075393577876, "grad_norm": 0.2470703125, "learning_rate": 5.879563546899148e-06, "loss": 2.7839, "num_input_tokens_seen": 8522301440, "step": 16255 }, { "epoch": 0.7885500209140343, "grad_norm": 0.2451171875, "learning_rate": 5.8666545277043875e-06, "loss": 2.7745, "num_input_tokens_seen": 8524922880, "step": 16260 }, { "epoch": 0.7887925024702809, "grad_norm": 0.2392578125, "learning_rate": 5.853757811370467e-06, "loss": 2.7805, "num_input_tokens_seen": 8527544320, "step": 16265 }, { "epoch": 0.7890349840265275, "grad_norm": 0.2451171875, "learning_rate": 5.840873406190056e-06, "loss": 2.7676, "num_input_tokens_seen": 8530165760, "step": 16270 }, { "epoch": 0.7892774655827741, "grad_norm": 0.2412109375, "learning_rate": 5.828001320447898e-06, "loss": 2.795, "num_input_tokens_seen": 8532787200, "step": 16275 }, { "epoch": 0.7895199471390207, "grad_norm": 0.248046875, "learning_rate": 5.815141562420834e-06, "loss": 2.7721, "num_input_tokens_seen": 8535408640, "step": 16280 }, { "epoch": 0.7897624286952674, "grad_norm": 0.240234375, "learning_rate": 5.802294140377762e-06, "loss": 2.7863, "num_input_tokens_seen": 8538030080, "step": 16285 }, { "epoch": 0.790004910251514, "grad_norm": 0.2412109375, "learning_rate": 5.78945906257966e-06, "loss": 2.7674, "num_input_tokens_seen": 8540651520, "step": 16290 }, { "epoch": 0.7902473918077606, "grad_norm": 0.24609375, "learning_rate": 5.776636337279561e-06, "loss": 2.7645, "num_input_tokens_seen": 8543272960, "step": 16295 }, { "epoch": 0.7904898733640072, "grad_norm": 0.2392578125, "learning_rate": 5.7638259727225585e-06, "loss": 2.7767, "num_input_tokens_seen": 8545894400, "step": 16300 }, { "epoch": 0.7907323549202538, "grad_norm": 0.2421875, "learning_rate": 5.751027977145795e-06, "loss": 2.7812, "num_input_tokens_seen": 8548515840, "step": 16305 }, { "epoch": 0.7909748364765005, "grad_norm": 0.2412109375, "learning_rate": 5.738242358778467e-06, "loss": 2.7753, "num_input_tokens_seen": 8551137280, "step": 16310 }, { "epoch": 0.7912173180327471, "grad_norm": 0.2412109375, "learning_rate": 5.725469125841801e-06, "loss": 2.7749, "num_input_tokens_seen": 8553758720, "step": 16315 }, { "epoch": 0.7914597995889938, "grad_norm": 0.2490234375, "learning_rate": 5.712708286549071e-06, "loss": 2.7923, "num_input_tokens_seen": 8556380160, "step": 16320 }, { "epoch": 0.7917022811452404, "grad_norm": 0.236328125, "learning_rate": 5.699959849105571e-06, "loss": 2.7709, "num_input_tokens_seen": 8559001600, "step": 16325 }, { "epoch": 0.7919447627014871, "grad_norm": 0.2470703125, "learning_rate": 5.687223821708637e-06, "loss": 2.7688, "num_input_tokens_seen": 8561623040, "step": 16330 }, { "epoch": 0.7921872442577337, "grad_norm": 0.2470703125, "learning_rate": 5.674500212547598e-06, "loss": 2.7836, "num_input_tokens_seen": 8564244480, "step": 16335 }, { "epoch": 0.7924297258139803, "grad_norm": 0.2431640625, "learning_rate": 5.661789029803824e-06, "loss": 2.7867, "num_input_tokens_seen": 8566865920, "step": 16340 }, { "epoch": 0.7926722073702269, "grad_norm": 0.2421875, "learning_rate": 5.649090281650682e-06, "loss": 2.7848, "num_input_tokens_seen": 8569487360, "step": 16345 }, { "epoch": 0.7929146889264735, "grad_norm": 0.240234375, "learning_rate": 5.636403976253548e-06, "loss": 2.7859, "num_input_tokens_seen": 8572108800, "step": 16350 }, { "epoch": 0.7931571704827202, "grad_norm": 0.2421875, "learning_rate": 5.623730121769788e-06, "loss": 2.774, "num_input_tokens_seen": 8574730240, "step": 16355 }, { "epoch": 0.7933996520389668, "grad_norm": 0.232421875, "learning_rate": 5.611068726348795e-06, "loss": 2.7726, "num_input_tokens_seen": 8577351680, "step": 16360 }, { "epoch": 0.7936421335952134, "grad_norm": 0.2451171875, "learning_rate": 5.598419798131896e-06, "loss": 2.777, "num_input_tokens_seen": 8579973120, "step": 16365 }, { "epoch": 0.79388461515146, "grad_norm": 0.240234375, "learning_rate": 5.585783345252446e-06, "loss": 2.7749, "num_input_tokens_seen": 8582594560, "step": 16370 }, { "epoch": 0.7941270967077066, "grad_norm": 0.2373046875, "learning_rate": 5.57315937583576e-06, "loss": 2.7805, "num_input_tokens_seen": 8585216000, "step": 16375 }, { "epoch": 0.7943695782639533, "grad_norm": 0.2451171875, "learning_rate": 5.560547897999127e-06, "loss": 2.7796, "num_input_tokens_seen": 8587837440, "step": 16380 }, { "epoch": 0.7946120598201999, "grad_norm": 0.2421875, "learning_rate": 5.547948919851811e-06, "loss": 2.7696, "num_input_tokens_seen": 8590458880, "step": 16385 }, { "epoch": 0.7948545413764465, "grad_norm": 0.2333984375, "learning_rate": 5.535362449495032e-06, "loss": 2.7814, "num_input_tokens_seen": 8593080320, "step": 16390 }, { "epoch": 0.7950970229326931, "grad_norm": 0.236328125, "learning_rate": 5.522788495021975e-06, "loss": 2.7723, "num_input_tokens_seen": 8595701760, "step": 16395 }, { "epoch": 0.7953395044889399, "grad_norm": 0.2373046875, "learning_rate": 5.510227064517756e-06, "loss": 2.7899, "num_input_tokens_seen": 8598323200, "step": 16400 }, { "epoch": 0.7955819860451865, "grad_norm": 0.2451171875, "learning_rate": 5.4976781660594555e-06, "loss": 2.7542, "num_input_tokens_seen": 8600944640, "step": 16405 }, { "epoch": 0.7958244676014331, "grad_norm": 0.2333984375, "learning_rate": 5.485141807716107e-06, "loss": 2.7663, "num_input_tokens_seen": 8603566080, "step": 16410 }, { "epoch": 0.7960669491576797, "grad_norm": 0.2392578125, "learning_rate": 5.472617997548662e-06, "loss": 2.7791, "num_input_tokens_seen": 8606187520, "step": 16415 }, { "epoch": 0.7963094307139263, "grad_norm": 0.236328125, "learning_rate": 5.460106743610008e-06, "loss": 2.7766, "num_input_tokens_seen": 8608808960, "step": 16420 }, { "epoch": 0.796551912270173, "grad_norm": 0.2431640625, "learning_rate": 5.4476080539449665e-06, "loss": 2.78, "num_input_tokens_seen": 8611430400, "step": 16425 }, { "epoch": 0.7967943938264196, "grad_norm": 0.23828125, "learning_rate": 5.435121936590256e-06, "loss": 2.7582, "num_input_tokens_seen": 8614051840, "step": 16430 }, { "epoch": 0.7970368753826662, "grad_norm": 0.2431640625, "learning_rate": 5.422648399574543e-06, "loss": 2.7646, "num_input_tokens_seen": 8616673280, "step": 16435 }, { "epoch": 0.7972793569389128, "grad_norm": 0.2412109375, "learning_rate": 5.4101874509183805e-06, "loss": 2.7804, "num_input_tokens_seen": 8619294720, "step": 16440 }, { "epoch": 0.7975218384951595, "grad_norm": 0.25, "learning_rate": 5.3977390986342415e-06, "loss": 2.7669, "num_input_tokens_seen": 8621916160, "step": 16445 }, { "epoch": 0.7977643200514061, "grad_norm": 0.2451171875, "learning_rate": 5.385303350726495e-06, "loss": 2.7883, "num_input_tokens_seen": 8624537600, "step": 16450 }, { "epoch": 0.7980068016076527, "grad_norm": 0.2392578125, "learning_rate": 5.372880215191409e-06, "loss": 2.7731, "num_input_tokens_seen": 8627159040, "step": 16455 }, { "epoch": 0.7982492831638993, "grad_norm": 0.2421875, "learning_rate": 5.360469700017118e-06, "loss": 2.7927, "num_input_tokens_seen": 8629780480, "step": 16460 }, { "epoch": 0.7984917647201459, "grad_norm": 0.2431640625, "learning_rate": 5.348071813183681e-06, "loss": 2.7798, "num_input_tokens_seen": 8632401920, "step": 16465 }, { "epoch": 0.7987342462763926, "grad_norm": 0.2490234375, "learning_rate": 5.335686562663011e-06, "loss": 2.7614, "num_input_tokens_seen": 8635023360, "step": 16470 }, { "epoch": 0.7989767278326392, "grad_norm": 0.2392578125, "learning_rate": 5.3233139564189016e-06, "loss": 2.7756, "num_input_tokens_seen": 8637644800, "step": 16475 }, { "epoch": 0.7992192093888859, "grad_norm": 0.236328125, "learning_rate": 5.310954002407012e-06, "loss": 2.7858, "num_input_tokens_seen": 8640266240, "step": 16480 }, { "epoch": 0.7994616909451325, "grad_norm": 0.2353515625, "learning_rate": 5.298606708574883e-06, "loss": 2.7759, "num_input_tokens_seen": 8642887680, "step": 16485 }, { "epoch": 0.7997041725013792, "grad_norm": 0.244140625, "learning_rate": 5.286272082861885e-06, "loss": 2.7638, "num_input_tokens_seen": 8645509120, "step": 16490 }, { "epoch": 0.7999466540576258, "grad_norm": 0.2451171875, "learning_rate": 5.2739501331992654e-06, "loss": 2.7705, "num_input_tokens_seen": 8648130560, "step": 16495 }, { "epoch": 0.8001891356138724, "grad_norm": 0.2431640625, "learning_rate": 5.261640867510118e-06, "loss": 2.7757, "num_input_tokens_seen": 8650752000, "step": 16500 }, { "epoch": 0.8001891356138724, "eval_accuracy": 0.45599088096401236, "eval_loss": 2.7416229248046875, "eval_runtime": 5.8629, "eval_samples_per_second": 51.169, "eval_steps_per_second": 6.481, "num_input_tokens_seen": 8650752000, "step": 16500 }, { "epoch": 0.800431617170119, "grad_norm": 0.248046875, "learning_rate": 5.249344293709374e-06, "loss": 2.7701, "num_input_tokens_seen": 8653373440, "step": 16505 }, { "epoch": 0.8006740987263656, "grad_norm": 0.2431640625, "learning_rate": 5.23706041970381e-06, "loss": 2.7628, "num_input_tokens_seen": 8655994880, "step": 16510 }, { "epoch": 0.8009165802826123, "grad_norm": 0.2392578125, "learning_rate": 5.224789253392032e-06, "loss": 2.7705, "num_input_tokens_seen": 8658616320, "step": 16515 }, { "epoch": 0.8011590618388589, "grad_norm": 0.24609375, "learning_rate": 5.212530802664478e-06, "loss": 2.77, "num_input_tokens_seen": 8661237760, "step": 16520 }, { "epoch": 0.8014015433951055, "grad_norm": 0.2470703125, "learning_rate": 5.200285075403408e-06, "loss": 2.766, "num_input_tokens_seen": 8663859200, "step": 16525 }, { "epoch": 0.8016440249513521, "grad_norm": 0.2431640625, "learning_rate": 5.188052079482899e-06, "loss": 2.7898, "num_input_tokens_seen": 8666480640, "step": 16530 }, { "epoch": 0.8018865065075987, "grad_norm": 0.2421875, "learning_rate": 5.175831822768848e-06, "loss": 2.7635, "num_input_tokens_seen": 8669102080, "step": 16535 }, { "epoch": 0.8021289880638454, "grad_norm": 0.244140625, "learning_rate": 5.163624313118956e-06, "loss": 2.7692, "num_input_tokens_seen": 8671723520, "step": 16540 }, { "epoch": 0.802371469620092, "grad_norm": 0.240234375, "learning_rate": 5.151429558382725e-06, "loss": 2.7643, "num_input_tokens_seen": 8674344960, "step": 16545 }, { "epoch": 0.8026139511763386, "grad_norm": 0.251953125, "learning_rate": 5.13924756640147e-06, "loss": 2.7837, "num_input_tokens_seen": 8676966400, "step": 16550 }, { "epoch": 0.8028564327325852, "grad_norm": 0.240234375, "learning_rate": 5.127078345008268e-06, "loss": 2.7662, "num_input_tokens_seen": 8679587840, "step": 16555 }, { "epoch": 0.803098914288832, "grad_norm": 0.2421875, "learning_rate": 5.1149219020280164e-06, "loss": 2.78, "num_input_tokens_seen": 8682209280, "step": 16560 }, { "epoch": 0.8033413958450786, "grad_norm": 0.244140625, "learning_rate": 5.1027782452773815e-06, "loss": 2.7752, "num_input_tokens_seen": 8684830720, "step": 16565 }, { "epoch": 0.8035838774013252, "grad_norm": 0.2412109375, "learning_rate": 5.0906473825648144e-06, "loss": 2.7597, "num_input_tokens_seen": 8687452160, "step": 16570 }, { "epoch": 0.8038263589575718, "grad_norm": 0.26171875, "learning_rate": 5.0785293216905314e-06, "loss": 2.7859, "num_input_tokens_seen": 8690073600, "step": 16575 }, { "epoch": 0.8040688405138184, "grad_norm": 0.2412109375, "learning_rate": 5.066424070446521e-06, "loss": 2.7611, "num_input_tokens_seen": 8692695040, "step": 16580 }, { "epoch": 0.8043113220700651, "grad_norm": 0.2421875, "learning_rate": 5.054331636616541e-06, "loss": 2.7694, "num_input_tokens_seen": 8695316480, "step": 16585 }, { "epoch": 0.8045538036263117, "grad_norm": 0.240234375, "learning_rate": 5.042252027976097e-06, "loss": 2.7625, "num_input_tokens_seen": 8697937920, "step": 16590 }, { "epoch": 0.8047962851825583, "grad_norm": 0.2451171875, "learning_rate": 5.030185252292452e-06, "loss": 2.7724, "num_input_tokens_seen": 8700559360, "step": 16595 }, { "epoch": 0.8050387667388049, "grad_norm": 0.2412109375, "learning_rate": 5.018131317324623e-06, "loss": 2.7761, "num_input_tokens_seen": 8703180800, "step": 16600 }, { "epoch": 0.8052812482950515, "grad_norm": 0.2431640625, "learning_rate": 5.006090230823366e-06, "loss": 2.7766, "num_input_tokens_seen": 8705802240, "step": 16605 }, { "epoch": 0.8055237298512982, "grad_norm": 0.240234375, "learning_rate": 4.994062000531175e-06, "loss": 2.78, "num_input_tokens_seen": 8708423680, "step": 16610 }, { "epoch": 0.8057662114075448, "grad_norm": 0.244140625, "learning_rate": 4.982046634182269e-06, "loss": 2.7699, "num_input_tokens_seen": 8711045120, "step": 16615 }, { "epoch": 0.8060086929637914, "grad_norm": 0.2490234375, "learning_rate": 4.970044139502608e-06, "loss": 2.7882, "num_input_tokens_seen": 8713666560, "step": 16620 }, { "epoch": 0.806251174520038, "grad_norm": 0.2314453125, "learning_rate": 4.958054524209873e-06, "loss": 2.7768, "num_input_tokens_seen": 8716288000, "step": 16625 }, { "epoch": 0.8064936560762846, "grad_norm": 0.2392578125, "learning_rate": 4.946077796013462e-06, "loss": 2.7735, "num_input_tokens_seen": 8718909440, "step": 16630 }, { "epoch": 0.8067361376325314, "grad_norm": 0.2412109375, "learning_rate": 4.934113962614484e-06, "loss": 2.7737, "num_input_tokens_seen": 8721530880, "step": 16635 }, { "epoch": 0.806978619188778, "grad_norm": 0.25, "learning_rate": 4.922163031705762e-06, "loss": 2.7673, "num_input_tokens_seen": 8724152320, "step": 16640 }, { "epoch": 0.8072211007450246, "grad_norm": 0.24609375, "learning_rate": 4.910225010971817e-06, "loss": 2.7684, "num_input_tokens_seen": 8726773760, "step": 16645 }, { "epoch": 0.8074635823012712, "grad_norm": 0.23828125, "learning_rate": 4.8982999080888684e-06, "loss": 2.7862, "num_input_tokens_seen": 8729395200, "step": 16650 }, { "epoch": 0.8077060638575179, "grad_norm": 0.24609375, "learning_rate": 4.886387730724837e-06, "loss": 2.7664, "num_input_tokens_seen": 8732016640, "step": 16655 }, { "epoch": 0.8079485454137645, "grad_norm": 0.2421875, "learning_rate": 4.874488486539325e-06, "loss": 2.7739, "num_input_tokens_seen": 8734638080, "step": 16660 }, { "epoch": 0.8081910269700111, "grad_norm": 0.255859375, "learning_rate": 4.862602183183623e-06, "loss": 2.777, "num_input_tokens_seen": 8737259520, "step": 16665 }, { "epoch": 0.8084335085262577, "grad_norm": 0.240234375, "learning_rate": 4.850728828300702e-06, "loss": 2.7794, "num_input_tokens_seen": 8739880960, "step": 16670 }, { "epoch": 0.8086759900825043, "grad_norm": 0.2470703125, "learning_rate": 4.838868429525189e-06, "loss": 2.7603, "num_input_tokens_seen": 8742502400, "step": 16675 }, { "epoch": 0.808918471638751, "grad_norm": 0.240234375, "learning_rate": 4.827020994483405e-06, "loss": 2.779, "num_input_tokens_seen": 8745123840, "step": 16680 }, { "epoch": 0.8091609531949976, "grad_norm": 0.2431640625, "learning_rate": 4.815186530793325e-06, "loss": 2.7771, "num_input_tokens_seen": 8747745280, "step": 16685 }, { "epoch": 0.8094034347512442, "grad_norm": 0.24609375, "learning_rate": 4.803365046064573e-06, "loss": 2.7663, "num_input_tokens_seen": 8750366720, "step": 16690 }, { "epoch": 0.8096459163074908, "grad_norm": 0.2431640625, "learning_rate": 4.791556547898454e-06, "loss": 2.7674, "num_input_tokens_seen": 8752988160, "step": 16695 }, { "epoch": 0.8098883978637375, "grad_norm": 0.2373046875, "learning_rate": 4.779761043887898e-06, "loss": 2.7811, "num_input_tokens_seen": 8755609600, "step": 16700 }, { "epoch": 0.8101308794199841, "grad_norm": 0.2451171875, "learning_rate": 4.767978541617493e-06, "loss": 2.7719, "num_input_tokens_seen": 8758231040, "step": 16705 }, { "epoch": 0.8103733609762307, "grad_norm": 0.24609375, "learning_rate": 4.756209048663454e-06, "loss": 2.765, "num_input_tokens_seen": 8760852480, "step": 16710 }, { "epoch": 0.8106158425324774, "grad_norm": 0.2451171875, "learning_rate": 4.744452572593638e-06, "loss": 2.7773, "num_input_tokens_seen": 8763473920, "step": 16715 }, { "epoch": 0.810858324088724, "grad_norm": 0.2451171875, "learning_rate": 4.732709120967541e-06, "loss": 2.7854, "num_input_tokens_seen": 8766095360, "step": 16720 }, { "epoch": 0.8111008056449707, "grad_norm": 0.2412109375, "learning_rate": 4.720978701336268e-06, "loss": 2.7703, "num_input_tokens_seen": 8768716800, "step": 16725 }, { "epoch": 0.8113432872012173, "grad_norm": 0.240234375, "learning_rate": 4.709261321242556e-06, "loss": 2.7705, "num_input_tokens_seen": 8771338240, "step": 16730 }, { "epoch": 0.8115857687574639, "grad_norm": 0.2431640625, "learning_rate": 4.697556988220758e-06, "loss": 2.7867, "num_input_tokens_seen": 8773959680, "step": 16735 }, { "epoch": 0.8118282503137105, "grad_norm": 0.2431640625, "learning_rate": 4.685865709796822e-06, "loss": 2.7696, "num_input_tokens_seen": 8776581120, "step": 16740 }, { "epoch": 0.8120707318699572, "grad_norm": 0.25, "learning_rate": 4.6741874934883165e-06, "loss": 2.7622, "num_input_tokens_seen": 8779202560, "step": 16745 }, { "epoch": 0.8123132134262038, "grad_norm": 0.2431640625, "learning_rate": 4.662522346804413e-06, "loss": 2.7774, "num_input_tokens_seen": 8781824000, "step": 16750 }, { "epoch": 0.8125556949824504, "grad_norm": 0.2431640625, "learning_rate": 4.650870277245872e-06, "loss": 2.7759, "num_input_tokens_seen": 8784445440, "step": 16755 }, { "epoch": 0.812798176538697, "grad_norm": 0.2451171875, "learning_rate": 4.639231292305049e-06, "loss": 2.7717, "num_input_tokens_seen": 8787066880, "step": 16760 }, { "epoch": 0.8130406580949436, "grad_norm": 0.2431640625, "learning_rate": 4.627605399465887e-06, "loss": 2.7829, "num_input_tokens_seen": 8789688320, "step": 16765 }, { "epoch": 0.8132831396511903, "grad_norm": 0.25, "learning_rate": 4.615992606203898e-06, "loss": 2.7757, "num_input_tokens_seen": 8792309760, "step": 16770 }, { "epoch": 0.8135256212074369, "grad_norm": 0.2470703125, "learning_rate": 4.604392919986183e-06, "loss": 2.7807, "num_input_tokens_seen": 8794931200, "step": 16775 }, { "epoch": 0.8137681027636835, "grad_norm": 0.2431640625, "learning_rate": 4.592806348271414e-06, "loss": 2.7764, "num_input_tokens_seen": 8797552640, "step": 16780 }, { "epoch": 0.8140105843199301, "grad_norm": 0.240234375, "learning_rate": 4.5812328985098325e-06, "loss": 2.7687, "num_input_tokens_seen": 8800174080, "step": 16785 }, { "epoch": 0.8142530658761767, "grad_norm": 0.2470703125, "learning_rate": 4.569672578143236e-06, "loss": 2.7812, "num_input_tokens_seen": 8802795520, "step": 16790 }, { "epoch": 0.8144955474324235, "grad_norm": 0.2373046875, "learning_rate": 4.55812539460499e-06, "loss": 2.7851, "num_input_tokens_seen": 8805416960, "step": 16795 }, { "epoch": 0.8147380289886701, "grad_norm": 0.2451171875, "learning_rate": 4.546591355319988e-06, "loss": 2.7763, "num_input_tokens_seen": 8808038400, "step": 16800 }, { "epoch": 0.8147380289886701, "eval_accuracy": 0.4559338869890897, "eval_loss": 2.741605520248413, "eval_runtime": 5.8207, "eval_samples_per_second": 51.54, "eval_steps_per_second": 6.528, "num_input_tokens_seen": 8808038400, "step": 16800 }, { "epoch": 0.8149805105449167, "grad_norm": 0.244140625, "learning_rate": 4.535070467704705e-06, "loss": 2.7509, "num_input_tokens_seen": 8810659840, "step": 16805 }, { "epoch": 0.8152229921011633, "grad_norm": 0.240234375, "learning_rate": 4.523562739167139e-06, "loss": 2.7738, "num_input_tokens_seen": 8813281280, "step": 16810 }, { "epoch": 0.81546547365741, "grad_norm": 0.240234375, "learning_rate": 4.512068177106834e-06, "loss": 2.7704, "num_input_tokens_seen": 8815902720, "step": 16815 }, { "epoch": 0.8157079552136566, "grad_norm": 0.23828125, "learning_rate": 4.500586788914862e-06, "loss": 2.7768, "num_input_tokens_seen": 8818524160, "step": 16820 }, { "epoch": 0.8159504367699032, "grad_norm": 0.2412109375, "learning_rate": 4.489118581973837e-06, "loss": 2.7685, "num_input_tokens_seen": 8821145600, "step": 16825 }, { "epoch": 0.8161929183261498, "grad_norm": 0.2431640625, "learning_rate": 4.477663563657871e-06, "loss": 2.7703, "num_input_tokens_seen": 8823767040, "step": 16830 }, { "epoch": 0.8164353998823964, "grad_norm": 0.2412109375, "learning_rate": 4.4662217413326244e-06, "loss": 2.7731, "num_input_tokens_seen": 8826388480, "step": 16835 }, { "epoch": 0.8166778814386431, "grad_norm": 0.23828125, "learning_rate": 4.454793122355253e-06, "loss": 2.7761, "num_input_tokens_seen": 8829009920, "step": 16840 }, { "epoch": 0.8169203629948897, "grad_norm": 0.2431640625, "learning_rate": 4.443377714074437e-06, "loss": 2.7776, "num_input_tokens_seen": 8831631360, "step": 16845 }, { "epoch": 0.8171628445511363, "grad_norm": 0.240234375, "learning_rate": 4.4319755238303535e-06, "loss": 2.7614, "num_input_tokens_seen": 8834252800, "step": 16850 }, { "epoch": 0.8174053261073829, "grad_norm": 0.2470703125, "learning_rate": 4.420586558954678e-06, "loss": 2.7568, "num_input_tokens_seen": 8836874240, "step": 16855 }, { "epoch": 0.8176478076636295, "grad_norm": 0.2421875, "learning_rate": 4.4092108267705935e-06, "loss": 2.7777, "num_input_tokens_seen": 8839495680, "step": 16860 }, { "epoch": 0.8178902892198762, "grad_norm": 0.248046875, "learning_rate": 4.39784833459276e-06, "loss": 2.7866, "num_input_tokens_seen": 8842117120, "step": 16865 }, { "epoch": 0.8181327707761228, "grad_norm": 0.25, "learning_rate": 4.386499089727336e-06, "loss": 2.7815, "num_input_tokens_seen": 8844738560, "step": 16870 }, { "epoch": 0.8183752523323695, "grad_norm": 0.2353515625, "learning_rate": 4.375163099471954e-06, "loss": 2.7827, "num_input_tokens_seen": 8847360000, "step": 16875 }, { "epoch": 0.8186177338886161, "grad_norm": 0.2451171875, "learning_rate": 4.36384037111573e-06, "loss": 2.7754, "num_input_tokens_seen": 8849981440, "step": 16880 }, { "epoch": 0.8188602154448628, "grad_norm": 0.2431640625, "learning_rate": 4.3525309119392454e-06, "loss": 2.7746, "num_input_tokens_seen": 8852602880, "step": 16885 }, { "epoch": 0.8191026970011094, "grad_norm": 0.240234375, "learning_rate": 4.34123472921456e-06, "loss": 2.7826, "num_input_tokens_seen": 8855224320, "step": 16890 }, { "epoch": 0.819345178557356, "grad_norm": 0.2421875, "learning_rate": 4.3299518302051785e-06, "loss": 2.7716, "num_input_tokens_seen": 8857845760, "step": 16895 }, { "epoch": 0.8195876601136026, "grad_norm": 0.244140625, "learning_rate": 4.318682222166082e-06, "loss": 2.7766, "num_input_tokens_seen": 8860467200, "step": 16900 }, { "epoch": 0.8198301416698492, "grad_norm": 0.2490234375, "learning_rate": 4.3074259123436985e-06, "loss": 2.7779, "num_input_tokens_seen": 8863088640, "step": 16905 }, { "epoch": 0.8200726232260959, "grad_norm": 0.2431640625, "learning_rate": 4.296182907975907e-06, "loss": 2.7793, "num_input_tokens_seen": 8865710080, "step": 16910 }, { "epoch": 0.8203151047823425, "grad_norm": 0.236328125, "learning_rate": 4.284953216292028e-06, "loss": 2.7676, "num_input_tokens_seen": 8868331520, "step": 16915 }, { "epoch": 0.8205575863385891, "grad_norm": 0.24609375, "learning_rate": 4.273736844512824e-06, "loss": 2.7754, "num_input_tokens_seen": 8870952960, "step": 16920 }, { "epoch": 0.8208000678948357, "grad_norm": 0.23828125, "learning_rate": 4.262533799850494e-06, "loss": 2.7845, "num_input_tokens_seen": 8873574400, "step": 16925 }, { "epoch": 0.8210425494510823, "grad_norm": 0.2392578125, "learning_rate": 4.251344089508661e-06, "loss": 2.77, "num_input_tokens_seen": 8876195840, "step": 16930 }, { "epoch": 0.821285031007329, "grad_norm": 0.2431640625, "learning_rate": 4.240167720682384e-06, "loss": 2.7704, "num_input_tokens_seen": 8878817280, "step": 16935 }, { "epoch": 0.8215275125635756, "grad_norm": 0.2451171875, "learning_rate": 4.229004700558134e-06, "loss": 2.7912, "num_input_tokens_seen": 8881438720, "step": 16940 }, { "epoch": 0.8217699941198222, "grad_norm": 0.236328125, "learning_rate": 4.217855036313806e-06, "loss": 2.7766, "num_input_tokens_seen": 8884060160, "step": 16945 }, { "epoch": 0.8220124756760689, "grad_norm": 0.248046875, "learning_rate": 4.206718735118706e-06, "loss": 2.782, "num_input_tokens_seen": 8886681600, "step": 16950 }, { "epoch": 0.8222549572323156, "grad_norm": 0.2451171875, "learning_rate": 4.1955958041335395e-06, "loss": 2.774, "num_input_tokens_seen": 8889303040, "step": 16955 }, { "epoch": 0.8224974387885622, "grad_norm": 0.2470703125, "learning_rate": 4.18448625051042e-06, "loss": 2.7903, "num_input_tokens_seen": 8891924480, "step": 16960 }, { "epoch": 0.8227399203448088, "grad_norm": 0.244140625, "learning_rate": 4.173390081392864e-06, "loss": 2.7809, "num_input_tokens_seen": 8894545920, "step": 16965 }, { "epoch": 0.8229824019010554, "grad_norm": 0.2412109375, "learning_rate": 4.162307303915777e-06, "loss": 2.7625, "num_input_tokens_seen": 8897167360, "step": 16970 }, { "epoch": 0.823224883457302, "grad_norm": 0.2431640625, "learning_rate": 4.151237925205448e-06, "loss": 2.7629, "num_input_tokens_seen": 8899788800, "step": 16975 }, { "epoch": 0.8234673650135487, "grad_norm": 0.2373046875, "learning_rate": 4.140181952379574e-06, "loss": 2.7776, "num_input_tokens_seen": 8902410240, "step": 16980 }, { "epoch": 0.8237098465697953, "grad_norm": 0.2470703125, "learning_rate": 4.129139392547199e-06, "loss": 2.7826, "num_input_tokens_seen": 8905031680, "step": 16985 }, { "epoch": 0.8239523281260419, "grad_norm": 0.2392578125, "learning_rate": 4.11811025280876e-06, "loss": 2.765, "num_input_tokens_seen": 8907653120, "step": 16990 }, { "epoch": 0.8241948096822885, "grad_norm": 0.2451171875, "learning_rate": 4.107094540256065e-06, "loss": 2.7687, "num_input_tokens_seen": 8910274560, "step": 16995 }, { "epoch": 0.8244372912385352, "grad_norm": 0.2490234375, "learning_rate": 4.09609226197229e-06, "loss": 2.7773, "num_input_tokens_seen": 8912896000, "step": 17000 }, { "epoch": 0.8246797727947818, "grad_norm": 0.2412109375, "learning_rate": 4.085103425031961e-06, "loss": 2.7758, "num_input_tokens_seen": 8915517440, "step": 17005 }, { "epoch": 0.8249222543510284, "grad_norm": 0.2451171875, "learning_rate": 4.074128036500977e-06, "loss": 2.7864, "num_input_tokens_seen": 8918138880, "step": 17010 }, { "epoch": 0.825164735907275, "grad_norm": 0.2392578125, "learning_rate": 4.06316610343658e-06, "loss": 2.7751, "num_input_tokens_seen": 8920760320, "step": 17015 }, { "epoch": 0.8254072174635216, "grad_norm": 0.2412109375, "learning_rate": 4.052217632887354e-06, "loss": 2.7696, "num_input_tokens_seen": 8923381760, "step": 17020 }, { "epoch": 0.8256496990197683, "grad_norm": 0.2431640625, "learning_rate": 4.041282631893239e-06, "loss": 2.7632, "num_input_tokens_seen": 8926003200, "step": 17025 }, { "epoch": 0.825892180576015, "grad_norm": 0.2431640625, "learning_rate": 4.030361107485503e-06, "loss": 2.7936, "num_input_tokens_seen": 8928624640, "step": 17030 }, { "epoch": 0.8261346621322616, "grad_norm": 0.251953125, "learning_rate": 4.019453066686768e-06, "loss": 2.7729, "num_input_tokens_seen": 8931246080, "step": 17035 }, { "epoch": 0.8263771436885082, "grad_norm": 0.244140625, "learning_rate": 4.008558516510966e-06, "loss": 2.7768, "num_input_tokens_seen": 8933867520, "step": 17040 }, { "epoch": 0.8266196252447549, "grad_norm": 0.240234375, "learning_rate": 3.997677463963364e-06, "loss": 2.7713, "num_input_tokens_seen": 8936488960, "step": 17045 }, { "epoch": 0.8268621068010015, "grad_norm": 0.240234375, "learning_rate": 3.986809916040538e-06, "loss": 2.767, "num_input_tokens_seen": 8939110400, "step": 17050 }, { "epoch": 0.8271045883572481, "grad_norm": 0.2412109375, "learning_rate": 3.975955879730392e-06, "loss": 2.7786, "num_input_tokens_seen": 8941731840, "step": 17055 }, { "epoch": 0.8273470699134947, "grad_norm": 0.2421875, "learning_rate": 3.965115362012145e-06, "loss": 2.7786, "num_input_tokens_seen": 8944353280, "step": 17060 }, { "epoch": 0.8275895514697413, "grad_norm": 0.2451171875, "learning_rate": 3.95428836985631e-06, "loss": 2.7854, "num_input_tokens_seen": 8946974720, "step": 17065 }, { "epoch": 0.827832033025988, "grad_norm": 0.2431640625, "learning_rate": 3.943474910224717e-06, "loss": 2.7802, "num_input_tokens_seen": 8949596160, "step": 17070 }, { "epoch": 0.8280745145822346, "grad_norm": 0.24609375, "learning_rate": 3.932674990070492e-06, "loss": 2.7667, "num_input_tokens_seen": 8952217600, "step": 17075 }, { "epoch": 0.8283169961384812, "grad_norm": 0.2412109375, "learning_rate": 3.9218886163380406e-06, "loss": 2.778, "num_input_tokens_seen": 8954839040, "step": 17080 }, { "epoch": 0.8285594776947278, "grad_norm": 0.2412109375, "learning_rate": 3.911115795963066e-06, "loss": 2.757, "num_input_tokens_seen": 8957460480, "step": 17085 }, { "epoch": 0.8288019592509744, "grad_norm": 0.2431640625, "learning_rate": 3.900356535872574e-06, "loss": 2.7719, "num_input_tokens_seen": 8960081920, "step": 17090 }, { "epoch": 0.8290444408072211, "grad_norm": 0.244140625, "learning_rate": 3.889610842984826e-06, "loss": 2.7834, "num_input_tokens_seen": 8962703360, "step": 17095 }, { "epoch": 0.8292869223634677, "grad_norm": 0.2421875, "learning_rate": 3.878878724209373e-06, "loss": 2.7581, "num_input_tokens_seen": 8965324800, "step": 17100 }, { "epoch": 0.8292869223634677, "eval_accuracy": 0.45594365738479076, "eval_loss": 2.7415835857391357, "eval_runtime": 5.7955, "eval_samples_per_second": 51.764, "eval_steps_per_second": 6.557, "num_input_tokens_seen": 8965324800, "step": 17100 }, { "epoch": 0.8295294039197143, "grad_norm": 0.236328125, "learning_rate": 3.868160186447039e-06, "loss": 2.7817, "num_input_tokens_seen": 8967946240, "step": 17105 }, { "epoch": 0.829771885475961, "grad_norm": 0.2421875, "learning_rate": 3.857455236589902e-06, "loss": 2.771, "num_input_tokens_seen": 8970567680, "step": 17110 }, { "epoch": 0.8300143670322077, "grad_norm": 0.25390625, "learning_rate": 3.846763881521315e-06, "loss": 2.7916, "num_input_tokens_seen": 8973189120, "step": 17115 }, { "epoch": 0.8302568485884543, "grad_norm": 0.2431640625, "learning_rate": 3.836086128115884e-06, "loss": 2.7745, "num_input_tokens_seen": 8975810560, "step": 17120 }, { "epoch": 0.8304993301447009, "grad_norm": 0.2431640625, "learning_rate": 3.825421983239477e-06, "loss": 2.7617, "num_input_tokens_seen": 8978432000, "step": 17125 }, { "epoch": 0.8307418117009475, "grad_norm": 0.2451171875, "learning_rate": 3.8147714537492e-06, "loss": 2.7784, "num_input_tokens_seen": 8981053440, "step": 17130 }, { "epoch": 0.8309842932571941, "grad_norm": 0.23828125, "learning_rate": 3.804134546493415e-06, "loss": 2.7833, "num_input_tokens_seen": 8983674880, "step": 17135 }, { "epoch": 0.8312267748134408, "grad_norm": 0.2431640625, "learning_rate": 3.793511268311717e-06, "loss": 2.7544, "num_input_tokens_seen": 8986296320, "step": 17140 }, { "epoch": 0.8314692563696874, "grad_norm": 0.2421875, "learning_rate": 3.7829016260349405e-06, "loss": 2.785, "num_input_tokens_seen": 8988917760, "step": 17145 }, { "epoch": 0.831711737925934, "grad_norm": 0.240234375, "learning_rate": 3.772305626485151e-06, "loss": 2.7674, "num_input_tokens_seen": 8991539200, "step": 17150 }, { "epoch": 0.8319542194821806, "grad_norm": 0.244140625, "learning_rate": 3.7617232764756455e-06, "loss": 2.7631, "num_input_tokens_seen": 8994160640, "step": 17155 }, { "epoch": 0.8321967010384272, "grad_norm": 0.2490234375, "learning_rate": 3.7511545828109396e-06, "loss": 2.7729, "num_input_tokens_seen": 8996782080, "step": 17160 }, { "epoch": 0.8324391825946739, "grad_norm": 0.23828125, "learning_rate": 3.7405995522867692e-06, "loss": 2.7832, "num_input_tokens_seen": 8999403520, "step": 17165 }, { "epoch": 0.8326816641509205, "grad_norm": 0.2412109375, "learning_rate": 3.730058191690089e-06, "loss": 2.7824, "num_input_tokens_seen": 9002024960, "step": 17170 }, { "epoch": 0.8329241457071671, "grad_norm": 0.2421875, "learning_rate": 3.7195305077990544e-06, "loss": 2.7762, "num_input_tokens_seen": 9004646400, "step": 17175 }, { "epoch": 0.8331666272634137, "grad_norm": 0.2392578125, "learning_rate": 3.7090165073830315e-06, "loss": 2.7828, "num_input_tokens_seen": 9007267840, "step": 17180 }, { "epoch": 0.8334091088196603, "grad_norm": 0.2451171875, "learning_rate": 3.6985161972025896e-06, "loss": 2.7801, "num_input_tokens_seen": 9009889280, "step": 17185 }, { "epoch": 0.8336515903759071, "grad_norm": 0.2451171875, "learning_rate": 3.6880295840094947e-06, "loss": 2.7699, "num_input_tokens_seen": 9012510720, "step": 17190 }, { "epoch": 0.8338940719321537, "grad_norm": 0.2353515625, "learning_rate": 3.677556674546706e-06, "loss": 2.7702, "num_input_tokens_seen": 9015132160, "step": 17195 }, { "epoch": 0.8341365534884003, "grad_norm": 0.23828125, "learning_rate": 3.6670974755483673e-06, "loss": 2.7698, "num_input_tokens_seen": 9017753600, "step": 17200 }, { "epoch": 0.8343790350446469, "grad_norm": 0.25390625, "learning_rate": 3.65665199373981e-06, "loss": 2.7663, "num_input_tokens_seen": 9020375040, "step": 17205 }, { "epoch": 0.8346215166008936, "grad_norm": 0.2421875, "learning_rate": 3.6462202358375468e-06, "loss": 2.77, "num_input_tokens_seen": 9022996480, "step": 17210 }, { "epoch": 0.8348639981571402, "grad_norm": 0.2421875, "learning_rate": 3.6358022085492576e-06, "loss": 2.781, "num_input_tokens_seen": 9025617920, "step": 17215 }, { "epoch": 0.8351064797133868, "grad_norm": 0.244140625, "learning_rate": 3.625397918573806e-06, "loss": 2.7623, "num_input_tokens_seen": 9028239360, "step": 17220 }, { "epoch": 0.8353489612696334, "grad_norm": 0.2431640625, "learning_rate": 3.615007372601209e-06, "loss": 2.7775, "num_input_tokens_seen": 9030860800, "step": 17225 }, { "epoch": 0.83559144282588, "grad_norm": 0.2470703125, "learning_rate": 3.604630577312662e-06, "loss": 2.7881, "num_input_tokens_seen": 9033482240, "step": 17230 }, { "epoch": 0.8358339243821267, "grad_norm": 0.2431640625, "learning_rate": 3.594267539380497e-06, "loss": 2.7761, "num_input_tokens_seen": 9036103680, "step": 17235 }, { "epoch": 0.8360764059383733, "grad_norm": 0.2421875, "learning_rate": 3.5839182654682197e-06, "loss": 2.7679, "num_input_tokens_seen": 9038725120, "step": 17240 }, { "epoch": 0.8363188874946199, "grad_norm": 0.2412109375, "learning_rate": 3.5735827622304763e-06, "loss": 2.7748, "num_input_tokens_seen": 9041346560, "step": 17245 }, { "epoch": 0.8365613690508665, "grad_norm": 0.23828125, "learning_rate": 3.563261036313059e-06, "loss": 2.7666, "num_input_tokens_seen": 9043968000, "step": 17250 }, { "epoch": 0.8368038506071132, "grad_norm": 0.244140625, "learning_rate": 3.552953094352904e-06, "loss": 2.7714, "num_input_tokens_seen": 9046589440, "step": 17255 }, { "epoch": 0.8370463321633598, "grad_norm": 0.2373046875, "learning_rate": 3.5426589429780803e-06, "loss": 2.7695, "num_input_tokens_seen": 9049210880, "step": 17260 }, { "epoch": 0.8372888137196065, "grad_norm": 0.244140625, "learning_rate": 3.5323785888077942e-06, "loss": 2.774, "num_input_tokens_seen": 9051832320, "step": 17265 }, { "epoch": 0.8375312952758531, "grad_norm": 0.2421875, "learning_rate": 3.522112038452377e-06, "loss": 2.7805, "num_input_tokens_seen": 9054453760, "step": 17270 }, { "epoch": 0.8377737768320997, "grad_norm": 0.25, "learning_rate": 3.5118592985132843e-06, "loss": 2.7826, "num_input_tokens_seen": 9057075200, "step": 17275 }, { "epoch": 0.8380162583883464, "grad_norm": 0.251953125, "learning_rate": 3.5016203755830924e-06, "loss": 2.7708, "num_input_tokens_seen": 9059696640, "step": 17280 }, { "epoch": 0.838258739944593, "grad_norm": 0.236328125, "learning_rate": 3.4913952762454904e-06, "loss": 2.7727, "num_input_tokens_seen": 9062318080, "step": 17285 }, { "epoch": 0.8385012215008396, "grad_norm": 0.23828125, "learning_rate": 3.4811840070752914e-06, "loss": 2.7693, "num_input_tokens_seen": 9064939520, "step": 17290 }, { "epoch": 0.8387437030570862, "grad_norm": 0.2373046875, "learning_rate": 3.4709865746383906e-06, "loss": 2.769, "num_input_tokens_seen": 9067560960, "step": 17295 }, { "epoch": 0.8389861846133329, "grad_norm": 0.2412109375, "learning_rate": 3.4608029854918095e-06, "loss": 2.7697, "num_input_tokens_seen": 9070182400, "step": 17300 }, { "epoch": 0.8392286661695795, "grad_norm": 0.244140625, "learning_rate": 3.4506332461836543e-06, "loss": 2.7832, "num_input_tokens_seen": 9072803840, "step": 17305 }, { "epoch": 0.8394711477258261, "grad_norm": 0.2412109375, "learning_rate": 3.4404773632531363e-06, "loss": 2.7537, "num_input_tokens_seen": 9075425280, "step": 17310 }, { "epoch": 0.8397136292820727, "grad_norm": 0.2431640625, "learning_rate": 3.430335343230545e-06, "loss": 2.7754, "num_input_tokens_seen": 9078046720, "step": 17315 }, { "epoch": 0.8399561108383193, "grad_norm": 0.2412109375, "learning_rate": 3.420207192637273e-06, "loss": 2.7824, "num_input_tokens_seen": 9080668160, "step": 17320 }, { "epoch": 0.840198592394566, "grad_norm": 0.2451171875, "learning_rate": 3.4100929179857827e-06, "loss": 2.7847, "num_input_tokens_seen": 9083289600, "step": 17325 }, { "epoch": 0.8404410739508126, "grad_norm": 0.2392578125, "learning_rate": 3.399992525779608e-06, "loss": 2.772, "num_input_tokens_seen": 9085911040, "step": 17330 }, { "epoch": 0.8406835555070592, "grad_norm": 0.2421875, "learning_rate": 3.389906022513367e-06, "loss": 2.7597, "num_input_tokens_seen": 9088532480, "step": 17335 }, { "epoch": 0.8409260370633058, "grad_norm": 0.2412109375, "learning_rate": 3.379833414672748e-06, "loss": 2.7796, "num_input_tokens_seen": 9091153920, "step": 17340 }, { "epoch": 0.8411685186195526, "grad_norm": 0.23828125, "learning_rate": 3.3697747087344996e-06, "loss": 2.7706, "num_input_tokens_seen": 9093775360, "step": 17345 }, { "epoch": 0.8414110001757992, "grad_norm": 0.2470703125, "learning_rate": 3.359729911166429e-06, "loss": 2.7746, "num_input_tokens_seen": 9096396800, "step": 17350 }, { "epoch": 0.8416534817320458, "grad_norm": 0.2421875, "learning_rate": 3.349699028427414e-06, "loss": 2.7739, "num_input_tokens_seen": 9099018240, "step": 17355 }, { "epoch": 0.8418959632882924, "grad_norm": 0.2470703125, "learning_rate": 3.339682066967362e-06, "loss": 2.7794, "num_input_tokens_seen": 9101639680, "step": 17360 }, { "epoch": 0.842138444844539, "grad_norm": 0.2421875, "learning_rate": 3.329679033227248e-06, "loss": 2.7765, "num_input_tokens_seen": 9104261120, "step": 17365 }, { "epoch": 0.8423809264007857, "grad_norm": 0.240234375, "learning_rate": 3.319689933639078e-06, "loss": 2.7758, "num_input_tokens_seen": 9106882560, "step": 17370 }, { "epoch": 0.8426234079570323, "grad_norm": 0.2392578125, "learning_rate": 3.3097147746259187e-06, "loss": 2.7804, "num_input_tokens_seen": 9109504000, "step": 17375 }, { "epoch": 0.8428658895132789, "grad_norm": 0.2431640625, "learning_rate": 3.299753562601854e-06, "loss": 2.7664, "num_input_tokens_seen": 9112125440, "step": 17380 }, { "epoch": 0.8431083710695255, "grad_norm": 0.234375, "learning_rate": 3.2898063039720096e-06, "loss": 2.7705, "num_input_tokens_seen": 9114746880, "step": 17385 }, { "epoch": 0.8433508526257721, "grad_norm": 0.240234375, "learning_rate": 3.279873005132525e-06, "loss": 2.7841, "num_input_tokens_seen": 9117368320, "step": 17390 }, { "epoch": 0.8435933341820188, "grad_norm": 0.24609375, "learning_rate": 3.26995367247058e-06, "loss": 2.7931, "num_input_tokens_seen": 9119989760, "step": 17395 }, { "epoch": 0.8438358157382654, "grad_norm": 0.2373046875, "learning_rate": 3.2600483123643665e-06, "loss": 2.7719, "num_input_tokens_seen": 9122611200, "step": 17400 }, { "epoch": 0.8438358157382654, "eval_accuracy": 0.45597296857189384, "eval_loss": 2.741610288619995, "eval_runtime": 5.8647, "eval_samples_per_second": 51.153, "eval_steps_per_second": 6.479, "num_input_tokens_seen": 9122611200, "step": 17400 }, { "epoch": 0.844078297294512, "grad_norm": 0.236328125, "learning_rate": 3.2501569311830904e-06, "loss": 2.7818, "num_input_tokens_seen": 9125232640, "step": 17405 }, { "epoch": 0.8443207788507586, "grad_norm": 0.244140625, "learning_rate": 3.2402795352869774e-06, "loss": 2.7849, "num_input_tokens_seen": 9127854080, "step": 17410 }, { "epoch": 0.8445632604070052, "grad_norm": 0.2451171875, "learning_rate": 3.2304161310272556e-06, "loss": 2.7755, "num_input_tokens_seen": 9130475520, "step": 17415 }, { "epoch": 0.8448057419632519, "grad_norm": 0.25, "learning_rate": 3.220566724746141e-06, "loss": 2.7745, "num_input_tokens_seen": 9133096960, "step": 17420 }, { "epoch": 0.8450482235194986, "grad_norm": 0.240234375, "learning_rate": 3.2107313227768803e-06, "loss": 2.7735, "num_input_tokens_seen": 9135718400, "step": 17425 }, { "epoch": 0.8452907050757452, "grad_norm": 0.2392578125, "learning_rate": 3.200909931443691e-06, "loss": 2.7659, "num_input_tokens_seen": 9138339840, "step": 17430 }, { "epoch": 0.8455331866319918, "grad_norm": 0.2373046875, "learning_rate": 3.1911025570617924e-06, "loss": 2.7857, "num_input_tokens_seen": 9140961280, "step": 17435 }, { "epoch": 0.8457756681882385, "grad_norm": 0.244140625, "learning_rate": 3.1813092059373857e-06, "loss": 2.7647, "num_input_tokens_seen": 9143582720, "step": 17440 }, { "epoch": 0.8460181497444851, "grad_norm": 0.2451171875, "learning_rate": 3.1715298843676606e-06, "loss": 2.7746, "num_input_tokens_seen": 9146204160, "step": 17445 }, { "epoch": 0.8462606313007317, "grad_norm": 0.2451171875, "learning_rate": 3.1617645986407733e-06, "loss": 2.7904, "num_input_tokens_seen": 9148825600, "step": 17450 }, { "epoch": 0.8465031128569783, "grad_norm": 0.2431640625, "learning_rate": 3.152013355035871e-06, "loss": 2.7764, "num_input_tokens_seen": 9151447040, "step": 17455 }, { "epoch": 0.846745594413225, "grad_norm": 0.2373046875, "learning_rate": 3.1422761598230598e-06, "loss": 2.7668, "num_input_tokens_seen": 9154068480, "step": 17460 }, { "epoch": 0.8469880759694716, "grad_norm": 0.244140625, "learning_rate": 3.1325530192634207e-06, "loss": 2.76, "num_input_tokens_seen": 9156689920, "step": 17465 }, { "epoch": 0.8472305575257182, "grad_norm": 0.2421875, "learning_rate": 3.1228439396089936e-06, "loss": 2.7694, "num_input_tokens_seen": 9159311360, "step": 17470 }, { "epoch": 0.8474730390819648, "grad_norm": 0.2353515625, "learning_rate": 3.1131489271027743e-06, "loss": 2.763, "num_input_tokens_seen": 9161932800, "step": 17475 }, { "epoch": 0.8477155206382114, "grad_norm": 0.248046875, "learning_rate": 3.10346798797872e-06, "loss": 2.7746, "num_input_tokens_seen": 9164554240, "step": 17480 }, { "epoch": 0.847958002194458, "grad_norm": 0.2412109375, "learning_rate": 3.093801128461735e-06, "loss": 2.7769, "num_input_tokens_seen": 9167175680, "step": 17485 }, { "epoch": 0.8482004837507047, "grad_norm": 0.240234375, "learning_rate": 3.0841483547676656e-06, "loss": 2.7659, "num_input_tokens_seen": 9169797120, "step": 17490 }, { "epoch": 0.8484429653069513, "grad_norm": 0.23828125, "learning_rate": 3.0745096731033124e-06, "loss": 2.7704, "num_input_tokens_seen": 9172418560, "step": 17495 }, { "epoch": 0.848685446863198, "grad_norm": 0.23828125, "learning_rate": 3.0648850896664054e-06, "loss": 2.7727, "num_input_tokens_seen": 9175040000, "step": 17500 }, { "epoch": 0.8489279284194446, "grad_norm": 0.244140625, "learning_rate": 3.0552746106456087e-06, "loss": 2.7707, "num_input_tokens_seen": 9177661440, "step": 17505 }, { "epoch": 0.8491704099756913, "grad_norm": 0.240234375, "learning_rate": 3.0456782422205313e-06, "loss": 2.7882, "num_input_tokens_seen": 9180282880, "step": 17510 }, { "epoch": 0.8494128915319379, "grad_norm": 0.24609375, "learning_rate": 3.0360959905616825e-06, "loss": 2.7695, "num_input_tokens_seen": 9182904320, "step": 17515 }, { "epoch": 0.8496553730881845, "grad_norm": 0.2470703125, "learning_rate": 3.026527861830519e-06, "loss": 2.7869, "num_input_tokens_seen": 9185525760, "step": 17520 }, { "epoch": 0.8498978546444311, "grad_norm": 0.255859375, "learning_rate": 3.016973862179406e-06, "loss": 2.7681, "num_input_tokens_seen": 9188147200, "step": 17525 }, { "epoch": 0.8501403362006777, "grad_norm": 0.2451171875, "learning_rate": 3.007433997751624e-06, "loss": 2.7779, "num_input_tokens_seen": 9190768640, "step": 17530 }, { "epoch": 0.8503828177569244, "grad_norm": 0.2373046875, "learning_rate": 2.9979082746813684e-06, "loss": 2.7594, "num_input_tokens_seen": 9193390080, "step": 17535 }, { "epoch": 0.850625299313171, "grad_norm": 0.2470703125, "learning_rate": 2.9883966990937374e-06, "loss": 2.7668, "num_input_tokens_seen": 9196011520, "step": 17540 }, { "epoch": 0.8508677808694176, "grad_norm": 0.2412109375, "learning_rate": 2.9788992771047324e-06, "loss": 2.7633, "num_input_tokens_seen": 9198632960, "step": 17545 }, { "epoch": 0.8511102624256642, "grad_norm": 0.240234375, "learning_rate": 2.9694160148212554e-06, "loss": 2.7752, "num_input_tokens_seen": 9201254400, "step": 17550 }, { "epoch": 0.8513527439819109, "grad_norm": 0.24609375, "learning_rate": 2.9599469183411027e-06, "loss": 2.7721, "num_input_tokens_seen": 9203875840, "step": 17555 }, { "epoch": 0.8515952255381575, "grad_norm": 0.2412109375, "learning_rate": 2.9504919937529657e-06, "loss": 2.7741, "num_input_tokens_seen": 9206497280, "step": 17560 }, { "epoch": 0.8518377070944041, "grad_norm": 0.2412109375, "learning_rate": 2.941051247136417e-06, "loss": 2.7825, "num_input_tokens_seen": 9209118720, "step": 17565 }, { "epoch": 0.8520801886506507, "grad_norm": 0.236328125, "learning_rate": 2.931624684561923e-06, "loss": 2.7743, "num_input_tokens_seen": 9211740160, "step": 17570 }, { "epoch": 0.8523226702068973, "grad_norm": 0.2392578125, "learning_rate": 2.9222123120908108e-06, "loss": 2.784, "num_input_tokens_seen": 9214361600, "step": 17575 }, { "epoch": 0.8525651517631441, "grad_norm": 0.23828125, "learning_rate": 2.912814135775299e-06, "loss": 2.7787, "num_input_tokens_seen": 9216983040, "step": 17580 }, { "epoch": 0.8528076333193907, "grad_norm": 0.2451171875, "learning_rate": 2.903430161658477e-06, "loss": 2.7779, "num_input_tokens_seen": 9219604480, "step": 17585 }, { "epoch": 0.8530501148756373, "grad_norm": 0.236328125, "learning_rate": 2.8940603957742952e-06, "loss": 2.7788, "num_input_tokens_seen": 9222225920, "step": 17590 }, { "epoch": 0.8532925964318839, "grad_norm": 0.2421875, "learning_rate": 2.884704844147576e-06, "loss": 2.7872, "num_input_tokens_seen": 9224847360, "step": 17595 }, { "epoch": 0.8535350779881306, "grad_norm": 0.2470703125, "learning_rate": 2.8753635127939937e-06, "loss": 2.7634, "num_input_tokens_seen": 9227468800, "step": 17600 }, { "epoch": 0.8537775595443772, "grad_norm": 0.232421875, "learning_rate": 2.8660364077200824e-06, "loss": 2.776, "num_input_tokens_seen": 9230090240, "step": 17605 }, { "epoch": 0.8540200411006238, "grad_norm": 0.23828125, "learning_rate": 2.8567235349232334e-06, "loss": 2.7772, "num_input_tokens_seen": 9232711680, "step": 17610 }, { "epoch": 0.8542625226568704, "grad_norm": 0.248046875, "learning_rate": 2.8474249003916762e-06, "loss": 2.7883, "num_input_tokens_seen": 9235333120, "step": 17615 }, { "epoch": 0.854505004213117, "grad_norm": 0.2431640625, "learning_rate": 2.838140510104498e-06, "loss": 2.7755, "num_input_tokens_seen": 9237954560, "step": 17620 }, { "epoch": 0.8547474857693637, "grad_norm": 0.240234375, "learning_rate": 2.828870370031614e-06, "loss": 2.7773, "num_input_tokens_seen": 9240576000, "step": 17625 }, { "epoch": 0.8549899673256103, "grad_norm": 0.2421875, "learning_rate": 2.8196144861337896e-06, "loss": 2.7741, "num_input_tokens_seen": 9243197440, "step": 17630 }, { "epoch": 0.8552324488818569, "grad_norm": 0.2412109375, "learning_rate": 2.8103728643626064e-06, "loss": 2.7591, "num_input_tokens_seen": 9245818880, "step": 17635 }, { "epoch": 0.8554749304381035, "grad_norm": 0.2392578125, "learning_rate": 2.8011455106604882e-06, "loss": 2.7689, "num_input_tokens_seen": 9248440320, "step": 17640 }, { "epoch": 0.8557174119943501, "grad_norm": 0.24609375, "learning_rate": 2.7919324309606836e-06, "loss": 2.782, "num_input_tokens_seen": 9251061760, "step": 17645 }, { "epoch": 0.8559598935505968, "grad_norm": 0.240234375, "learning_rate": 2.7827336311872563e-06, "loss": 2.7647, "num_input_tokens_seen": 9253683200, "step": 17650 }, { "epoch": 0.8562023751068434, "grad_norm": 0.23828125, "learning_rate": 2.773549117255095e-06, "loss": 2.7577, "num_input_tokens_seen": 9256304640, "step": 17655 }, { "epoch": 0.8564448566630901, "grad_norm": 0.2412109375, "learning_rate": 2.7643788950698996e-06, "loss": 2.7694, "num_input_tokens_seen": 9258926080, "step": 17660 }, { "epoch": 0.8566873382193367, "grad_norm": 0.25390625, "learning_rate": 2.7552229705281903e-06, "loss": 2.7797, "num_input_tokens_seen": 9261547520, "step": 17665 }, { "epoch": 0.8569298197755834, "grad_norm": 0.2392578125, "learning_rate": 2.7460813495172655e-06, "loss": 2.7775, "num_input_tokens_seen": 9264168960, "step": 17670 }, { "epoch": 0.85717230133183, "grad_norm": 0.236328125, "learning_rate": 2.736954037915254e-06, "loss": 2.7733, "num_input_tokens_seen": 9266790400, "step": 17675 }, { "epoch": 0.8574147828880766, "grad_norm": 0.2421875, "learning_rate": 2.7278410415910753e-06, "loss": 2.7777, "num_input_tokens_seen": 9269411840, "step": 17680 }, { "epoch": 0.8576572644443232, "grad_norm": 0.248046875, "learning_rate": 2.7187423664044392e-06, "loss": 2.7751, "num_input_tokens_seen": 9272033280, "step": 17685 }, { "epoch": 0.8578997460005698, "grad_norm": 0.240234375, "learning_rate": 2.709658018205852e-06, "loss": 2.7821, "num_input_tokens_seen": 9274654720, "step": 17690 }, { "epoch": 0.8581422275568165, "grad_norm": 0.2431640625, "learning_rate": 2.7005880028366127e-06, "loss": 2.7772, "num_input_tokens_seen": 9277276160, "step": 17695 }, { "epoch": 0.8583847091130631, "grad_norm": 0.240234375, "learning_rate": 2.6915323261287902e-06, "loss": 2.7609, "num_input_tokens_seen": 9279897600, "step": 17700 }, { "epoch": 0.8583847091130631, "eval_accuracy": 0.4559599413776258, "eval_loss": 2.7415964603424072, "eval_runtime": 5.9052, "eval_samples_per_second": 50.803, "eval_steps_per_second": 6.435, "num_input_tokens_seen": 9279897600, "step": 17700 }, { "epoch": 0.8586271906693097, "grad_norm": 0.240234375, "learning_rate": 2.68249099390524e-06, "loss": 2.7497, "num_input_tokens_seen": 9282519040, "step": 17705 }, { "epoch": 0.8588696722255563, "grad_norm": 0.2421875, "learning_rate": 2.6734640119795956e-06, "loss": 2.7744, "num_input_tokens_seen": 9285140480, "step": 17710 }, { "epoch": 0.859112153781803, "grad_norm": 0.2421875, "learning_rate": 2.6644513861562692e-06, "loss": 2.7646, "num_input_tokens_seen": 9287761920, "step": 17715 }, { "epoch": 0.8593546353380496, "grad_norm": 0.2412109375, "learning_rate": 2.6554531222304334e-06, "loss": 2.7827, "num_input_tokens_seen": 9290383360, "step": 17720 }, { "epoch": 0.8595971168942962, "grad_norm": 0.2490234375, "learning_rate": 2.6464692259880326e-06, "loss": 2.7679, "num_input_tokens_seen": 9293004800, "step": 17725 }, { "epoch": 0.8598395984505428, "grad_norm": 0.24609375, "learning_rate": 2.637499703205759e-06, "loss": 2.7727, "num_input_tokens_seen": 9295626240, "step": 17730 }, { "epoch": 0.8600820800067894, "grad_norm": 0.236328125, "learning_rate": 2.628544559651075e-06, "loss": 2.7729, "num_input_tokens_seen": 9298247680, "step": 17735 }, { "epoch": 0.8603245615630362, "grad_norm": 0.2373046875, "learning_rate": 2.619603801082193e-06, "loss": 2.7817, "num_input_tokens_seen": 9300869120, "step": 17740 }, { "epoch": 0.8605670431192828, "grad_norm": 0.236328125, "learning_rate": 2.6106774332480795e-06, "loss": 2.7744, "num_input_tokens_seen": 9303490560, "step": 17745 }, { "epoch": 0.8608095246755294, "grad_norm": 0.244140625, "learning_rate": 2.6017654618884446e-06, "loss": 2.7788, "num_input_tokens_seen": 9306112000, "step": 17750 }, { "epoch": 0.861052006231776, "grad_norm": 0.2353515625, "learning_rate": 2.592867892733744e-06, "loss": 2.7656, "num_input_tokens_seen": 9308733440, "step": 17755 }, { "epoch": 0.8612944877880226, "grad_norm": 0.244140625, "learning_rate": 2.58398473150516e-06, "loss": 2.7706, "num_input_tokens_seen": 9311354880, "step": 17760 }, { "epoch": 0.8615369693442693, "grad_norm": 0.232421875, "learning_rate": 2.5751159839146306e-06, "loss": 2.785, "num_input_tokens_seen": 9313976320, "step": 17765 }, { "epoch": 0.8617794509005159, "grad_norm": 0.244140625, "learning_rate": 2.566261655664812e-06, "loss": 2.7827, "num_input_tokens_seen": 9316597760, "step": 17770 }, { "epoch": 0.8620219324567625, "grad_norm": 0.2373046875, "learning_rate": 2.557421752449096e-06, "loss": 2.7805, "num_input_tokens_seen": 9319219200, "step": 17775 }, { "epoch": 0.8622644140130091, "grad_norm": 0.2373046875, "learning_rate": 2.5485962799515926e-06, "loss": 2.7767, "num_input_tokens_seen": 9321840640, "step": 17780 }, { "epoch": 0.8625068955692557, "grad_norm": 0.24609375, "learning_rate": 2.5397852438471424e-06, "loss": 2.7797, "num_input_tokens_seen": 9324462080, "step": 17785 }, { "epoch": 0.8627493771255024, "grad_norm": 0.2353515625, "learning_rate": 2.5309886498012858e-06, "loss": 2.7749, "num_input_tokens_seen": 9327083520, "step": 17790 }, { "epoch": 0.862991858681749, "grad_norm": 0.23828125, "learning_rate": 2.5222065034702953e-06, "loss": 2.7847, "num_input_tokens_seen": 9329704960, "step": 17795 }, { "epoch": 0.8632343402379956, "grad_norm": 0.2412109375, "learning_rate": 2.5134388105011423e-06, "loss": 2.7751, "num_input_tokens_seen": 9332326400, "step": 17800 }, { "epoch": 0.8634768217942422, "grad_norm": 0.2392578125, "learning_rate": 2.50468557653151e-06, "loss": 2.7782, "num_input_tokens_seen": 9334947840, "step": 17805 }, { "epoch": 0.8637193033504889, "grad_norm": 0.2421875, "learning_rate": 2.495946807189781e-06, "loss": 2.7754, "num_input_tokens_seen": 9337569280, "step": 17810 }, { "epoch": 0.8639617849067356, "grad_norm": 0.2421875, "learning_rate": 2.4872225080950413e-06, "loss": 2.7742, "num_input_tokens_seen": 9340190720, "step": 17815 }, { "epoch": 0.8642042664629822, "grad_norm": 0.240234375, "learning_rate": 2.4785126848570677e-06, "loss": 2.7639, "num_input_tokens_seen": 9342812160, "step": 17820 }, { "epoch": 0.8644467480192288, "grad_norm": 0.234375, "learning_rate": 2.4698173430763333e-06, "loss": 2.7779, "num_input_tokens_seen": 9345433600, "step": 17825 }, { "epoch": 0.8646892295754754, "grad_norm": 0.23828125, "learning_rate": 2.4611364883439956e-06, "loss": 2.7735, "num_input_tokens_seen": 9348055040, "step": 17830 }, { "epoch": 0.8649317111317221, "grad_norm": 0.2392578125, "learning_rate": 2.4524701262418987e-06, "loss": 2.7766, "num_input_tokens_seen": 9350676480, "step": 17835 }, { "epoch": 0.8651741926879687, "grad_norm": 0.2373046875, "learning_rate": 2.4438182623425674e-06, "loss": 2.7746, "num_input_tokens_seen": 9353297920, "step": 17840 }, { "epoch": 0.8654166742442153, "grad_norm": 0.23828125, "learning_rate": 2.4351809022092027e-06, "loss": 2.772, "num_input_tokens_seen": 9355919360, "step": 17845 }, { "epoch": 0.8656591558004619, "grad_norm": 0.23828125, "learning_rate": 2.4265580513956887e-06, "loss": 2.7688, "num_input_tokens_seen": 9358540800, "step": 17850 }, { "epoch": 0.8659016373567086, "grad_norm": 0.2431640625, "learning_rate": 2.417949715446563e-06, "loss": 2.7797, "num_input_tokens_seen": 9361162240, "step": 17855 }, { "epoch": 0.8661441189129552, "grad_norm": 0.2470703125, "learning_rate": 2.409355899897045e-06, "loss": 2.7749, "num_input_tokens_seen": 9363783680, "step": 17860 }, { "epoch": 0.8663866004692018, "grad_norm": 0.2431640625, "learning_rate": 2.4007766102730064e-06, "loss": 2.7594, "num_input_tokens_seen": 9366405120, "step": 17865 }, { "epoch": 0.8666290820254484, "grad_norm": 0.244140625, "learning_rate": 2.392211852090989e-06, "loss": 2.7497, "num_input_tokens_seen": 9369026560, "step": 17870 }, { "epoch": 0.866871563581695, "grad_norm": 0.2373046875, "learning_rate": 2.383661630858186e-06, "loss": 2.7691, "num_input_tokens_seen": 9371648000, "step": 17875 }, { "epoch": 0.8671140451379417, "grad_norm": 0.2373046875, "learning_rate": 2.3751259520724434e-06, "loss": 2.7691, "num_input_tokens_seen": 9374269440, "step": 17880 }, { "epoch": 0.8673565266941883, "grad_norm": 0.24609375, "learning_rate": 2.3666048212222512e-06, "loss": 2.7712, "num_input_tokens_seen": 9376890880, "step": 17885 }, { "epoch": 0.8675990082504349, "grad_norm": 0.2373046875, "learning_rate": 2.358098243786755e-06, "loss": 2.7797, "num_input_tokens_seen": 9379512320, "step": 17890 }, { "epoch": 0.8678414898066816, "grad_norm": 0.25, "learning_rate": 2.3496062252357343e-06, "loss": 2.7765, "num_input_tokens_seen": 9382133760, "step": 17895 }, { "epoch": 0.8680839713629283, "grad_norm": 0.244140625, "learning_rate": 2.3411287710296104e-06, "loss": 2.7689, "num_input_tokens_seen": 9384755200, "step": 17900 }, { "epoch": 0.8683264529191749, "grad_norm": 0.2431640625, "learning_rate": 2.3326658866194422e-06, "loss": 2.7686, "num_input_tokens_seen": 9387376640, "step": 17905 }, { "epoch": 0.8685689344754215, "grad_norm": 0.2431640625, "learning_rate": 2.3242175774469215e-06, "loss": 2.774, "num_input_tokens_seen": 9389998080, "step": 17910 }, { "epoch": 0.8688114160316681, "grad_norm": 0.2421875, "learning_rate": 2.31578384894435e-06, "loss": 2.7767, "num_input_tokens_seen": 9392619520, "step": 17915 }, { "epoch": 0.8690538975879147, "grad_norm": 0.2392578125, "learning_rate": 2.3073647065346788e-06, "loss": 2.7694, "num_input_tokens_seen": 9395240960, "step": 17920 }, { "epoch": 0.8692963791441614, "grad_norm": 0.251953125, "learning_rate": 2.2989601556314634e-06, "loss": 2.769, "num_input_tokens_seen": 9397862400, "step": 17925 }, { "epoch": 0.869538860700408, "grad_norm": 0.23828125, "learning_rate": 2.2905702016388864e-06, "loss": 2.7607, "num_input_tokens_seen": 9400483840, "step": 17930 }, { "epoch": 0.8697813422566546, "grad_norm": 0.2431640625, "learning_rate": 2.2821948499517383e-06, "loss": 2.7688, "num_input_tokens_seen": 9403105280, "step": 17935 }, { "epoch": 0.8700238238129012, "grad_norm": 0.248046875, "learning_rate": 2.2738341059554274e-06, "loss": 2.7786, "num_input_tokens_seen": 9405726720, "step": 17940 }, { "epoch": 0.8702663053691478, "grad_norm": 0.251953125, "learning_rate": 2.2654879750259567e-06, "loss": 2.7688, "num_input_tokens_seen": 9408348160, "step": 17945 }, { "epoch": 0.8705087869253945, "grad_norm": 0.2392578125, "learning_rate": 2.257156462529947e-06, "loss": 2.7638, "num_input_tokens_seen": 9410969600, "step": 17950 }, { "epoch": 0.8707512684816411, "grad_norm": 0.2421875, "learning_rate": 2.2488395738246127e-06, "loss": 2.7643, "num_input_tokens_seen": 9413591040, "step": 17955 }, { "epoch": 0.8709937500378877, "grad_norm": 0.251953125, "learning_rate": 2.2405373142577598e-06, "loss": 2.7659, "num_input_tokens_seen": 9416212480, "step": 17960 }, { "epoch": 0.8712362315941343, "grad_norm": 0.23046875, "learning_rate": 2.2322496891678008e-06, "loss": 2.7605, "num_input_tokens_seen": 9418833920, "step": 17965 }, { "epoch": 0.871478713150381, "grad_norm": 0.240234375, "learning_rate": 2.2239767038837235e-06, "loss": 2.7799, "num_input_tokens_seen": 9421455360, "step": 17970 }, { "epoch": 0.8717211947066277, "grad_norm": 0.2451171875, "learning_rate": 2.2157183637251166e-06, "loss": 2.7794, "num_input_tokens_seen": 9424076800, "step": 17975 }, { "epoch": 0.8719636762628743, "grad_norm": 0.2490234375, "learning_rate": 2.2074746740021357e-06, "loss": 2.7728, "num_input_tokens_seen": 9426698240, "step": 17980 }, { "epoch": 0.8722061578191209, "grad_norm": 0.240234375, "learning_rate": 2.199245640015529e-06, "loss": 2.7765, "num_input_tokens_seen": 9429319680, "step": 17985 }, { "epoch": 0.8724486393753675, "grad_norm": 0.2470703125, "learning_rate": 2.1910312670566173e-06, "loss": 2.7675, "num_input_tokens_seen": 9431941120, "step": 17990 }, { "epoch": 0.8726911209316142, "grad_norm": 0.2412109375, "learning_rate": 2.1828315604072892e-06, "loss": 2.7799, "num_input_tokens_seen": 9434562560, "step": 17995 }, { "epoch": 0.8729336024878608, "grad_norm": 0.23828125, "learning_rate": 2.1746465253400155e-06, "loss": 2.7753, "num_input_tokens_seen": 9437184000, "step": 18000 }, { "epoch": 0.8729336024878608, "eval_accuracy": 0.45591271779840414, "eval_loss": 2.741586685180664, "eval_runtime": 5.9258, "eval_samples_per_second": 50.626, "eval_steps_per_second": 6.413, "num_input_tokens_seen": 9437184000, "step": 18000 }, { "epoch": 0.8731760840441074, "grad_norm": 0.2392578125, "learning_rate": 2.1664761671178286e-06, "loss": 2.7627, "num_input_tokens_seen": 9439805440, "step": 18005 }, { "epoch": 0.873418565600354, "grad_norm": 0.2431640625, "learning_rate": 2.1583204909943033e-06, "loss": 2.7716, "num_input_tokens_seen": 9442426880, "step": 18010 }, { "epoch": 0.8736610471566006, "grad_norm": 0.2431640625, "learning_rate": 2.1501795022136032e-06, "loss": 2.7705, "num_input_tokens_seen": 9445048320, "step": 18015 }, { "epoch": 0.8739035287128473, "grad_norm": 0.2373046875, "learning_rate": 2.1420532060104304e-06, "loss": 2.7747, "num_input_tokens_seen": 9447669760, "step": 18020 }, { "epoch": 0.8741460102690939, "grad_norm": 0.248046875, "learning_rate": 2.133941607610043e-06, "loss": 2.779, "num_input_tokens_seen": 9450291200, "step": 18025 }, { "epoch": 0.8743884918253405, "grad_norm": 0.2451171875, "learning_rate": 2.1258447122282534e-06, "loss": 2.7875, "num_input_tokens_seen": 9452912640, "step": 18030 }, { "epoch": 0.8746309733815871, "grad_norm": 0.2490234375, "learning_rate": 2.1177625250714207e-06, "loss": 2.7781, "num_input_tokens_seen": 9455534080, "step": 18035 }, { "epoch": 0.8748734549378337, "grad_norm": 0.240234375, "learning_rate": 2.1096950513364273e-06, "loss": 2.7652, "num_input_tokens_seen": 9458155520, "step": 18040 }, { "epoch": 0.8751159364940804, "grad_norm": 0.2431640625, "learning_rate": 2.101642296210715e-06, "loss": 2.7838, "num_input_tokens_seen": 9460776960, "step": 18045 }, { "epoch": 0.875358418050327, "grad_norm": 0.2353515625, "learning_rate": 2.093604264872262e-06, "loss": 2.7826, "num_input_tokens_seen": 9463398400, "step": 18050 }, { "epoch": 0.8756008996065737, "grad_norm": 0.2431640625, "learning_rate": 2.0855809624895695e-06, "loss": 2.7639, "num_input_tokens_seen": 9466019840, "step": 18055 }, { "epoch": 0.8758433811628203, "grad_norm": 0.244140625, "learning_rate": 2.077572394221669e-06, "loss": 2.7792, "num_input_tokens_seen": 9468641280, "step": 18060 }, { "epoch": 0.876085862719067, "grad_norm": 0.25, "learning_rate": 2.069578565218125e-06, "loss": 2.7733, "num_input_tokens_seen": 9471262720, "step": 18065 }, { "epoch": 0.8763283442753136, "grad_norm": 0.2333984375, "learning_rate": 2.0615994806190143e-06, "loss": 2.7806, "num_input_tokens_seen": 9473884160, "step": 18070 }, { "epoch": 0.8765708258315602, "grad_norm": 0.236328125, "learning_rate": 2.0536351455549383e-06, "loss": 2.7733, "num_input_tokens_seen": 9476505600, "step": 18075 }, { "epoch": 0.8768133073878068, "grad_norm": 0.240234375, "learning_rate": 2.045685565147015e-06, "loss": 2.7788, "num_input_tokens_seen": 9479127040, "step": 18080 }, { "epoch": 0.8770557889440534, "grad_norm": 0.2412109375, "learning_rate": 2.037750744506878e-06, "loss": 2.7666, "num_input_tokens_seen": 9481748480, "step": 18085 }, { "epoch": 0.8772982705003001, "grad_norm": 0.2470703125, "learning_rate": 2.0298306887366616e-06, "loss": 2.7749, "num_input_tokens_seen": 9484369920, "step": 18090 }, { "epoch": 0.8775407520565467, "grad_norm": 0.2431640625, "learning_rate": 2.0219254029290174e-06, "loss": 2.7599, "num_input_tokens_seen": 9486991360, "step": 18095 }, { "epoch": 0.8777832336127933, "grad_norm": 0.2421875, "learning_rate": 2.014034892167083e-06, "loss": 2.7807, "num_input_tokens_seen": 9489612800, "step": 18100 }, { "epoch": 0.8780257151690399, "grad_norm": 0.240234375, "learning_rate": 2.006159161524515e-06, "loss": 2.791, "num_input_tokens_seen": 9492234240, "step": 18105 }, { "epoch": 0.8782681967252866, "grad_norm": 0.244140625, "learning_rate": 1.9982982160654586e-06, "loss": 2.769, "num_input_tokens_seen": 9494855680, "step": 18110 }, { "epoch": 0.8785106782815332, "grad_norm": 0.2412109375, "learning_rate": 1.9904520608445444e-06, "loss": 2.7708, "num_input_tokens_seen": 9497477120, "step": 18115 }, { "epoch": 0.8787531598377798, "grad_norm": 0.2412109375, "learning_rate": 1.9826207009069038e-06, "loss": 2.78, "num_input_tokens_seen": 9500098560, "step": 18120 }, { "epoch": 0.8789956413940264, "grad_norm": 0.2431640625, "learning_rate": 1.9748041412881473e-06, "loss": 2.7839, "num_input_tokens_seen": 9502720000, "step": 18125 }, { "epoch": 0.8792381229502731, "grad_norm": 0.236328125, "learning_rate": 1.96700238701438e-06, "loss": 2.7713, "num_input_tokens_seen": 9505341440, "step": 18130 }, { "epoch": 0.8794806045065198, "grad_norm": 0.2470703125, "learning_rate": 1.9592154431021666e-06, "loss": 2.7611, "num_input_tokens_seen": 9507962880, "step": 18135 }, { "epoch": 0.8797230860627664, "grad_norm": 0.248046875, "learning_rate": 1.951443314558565e-06, "loss": 2.7761, "num_input_tokens_seen": 9510584320, "step": 18140 }, { "epoch": 0.879965567619013, "grad_norm": 0.2431640625, "learning_rate": 1.9436860063811042e-06, "loss": 2.7841, "num_input_tokens_seen": 9513205760, "step": 18145 }, { "epoch": 0.8802080491752596, "grad_norm": 0.2421875, "learning_rate": 1.9359435235577818e-06, "loss": 2.7725, "num_input_tokens_seen": 9515827200, "step": 18150 }, { "epoch": 0.8804505307315063, "grad_norm": 0.2431640625, "learning_rate": 1.9282158710670627e-06, "loss": 2.7834, "num_input_tokens_seen": 9518448640, "step": 18155 }, { "epoch": 0.8806930122877529, "grad_norm": 0.25, "learning_rate": 1.9205030538778756e-06, "loss": 2.7838, "num_input_tokens_seen": 9521070080, "step": 18160 }, { "epoch": 0.8809354938439995, "grad_norm": 0.2421875, "learning_rate": 1.9128050769496086e-06, "loss": 2.7681, "num_input_tokens_seen": 9523691520, "step": 18165 }, { "epoch": 0.8811779754002461, "grad_norm": 0.244140625, "learning_rate": 1.9051219452321106e-06, "loss": 2.7725, "num_input_tokens_seen": 9526312960, "step": 18170 }, { "epoch": 0.8814204569564927, "grad_norm": 0.2353515625, "learning_rate": 1.8974536636656825e-06, "loss": 2.7649, "num_input_tokens_seen": 9528934400, "step": 18175 }, { "epoch": 0.8816629385127394, "grad_norm": 0.244140625, "learning_rate": 1.8898002371810774e-06, "loss": 2.7761, "num_input_tokens_seen": 9531555840, "step": 18180 }, { "epoch": 0.881905420068986, "grad_norm": 0.2421875, "learning_rate": 1.8821616706995004e-06, "loss": 2.7944, "num_input_tokens_seen": 9534177280, "step": 18185 }, { "epoch": 0.8821479016252326, "grad_norm": 0.2470703125, "learning_rate": 1.8745379691325947e-06, "loss": 2.7843, "num_input_tokens_seen": 9536798720, "step": 18190 }, { "epoch": 0.8823903831814792, "grad_norm": 0.244140625, "learning_rate": 1.866929137382445e-06, "loss": 2.7689, "num_input_tokens_seen": 9539420160, "step": 18195 }, { "epoch": 0.8826328647377258, "grad_norm": 0.2392578125, "learning_rate": 1.8593351803415788e-06, "loss": 2.7867, "num_input_tokens_seen": 9542041600, "step": 18200 }, { "epoch": 0.8828753462939725, "grad_norm": 0.2451171875, "learning_rate": 1.8517561028929597e-06, "loss": 2.7613, "num_input_tokens_seen": 9544663040, "step": 18205 }, { "epoch": 0.8831178278502192, "grad_norm": 0.240234375, "learning_rate": 1.8441919099099813e-06, "loss": 2.7643, "num_input_tokens_seen": 9547284480, "step": 18210 }, { "epoch": 0.8833603094064658, "grad_norm": 0.232421875, "learning_rate": 1.8366426062564696e-06, "loss": 2.7662, "num_input_tokens_seen": 9549905920, "step": 18215 }, { "epoch": 0.8836027909627124, "grad_norm": 0.2431640625, "learning_rate": 1.8291081967866692e-06, "loss": 2.7678, "num_input_tokens_seen": 9552527360, "step": 18220 }, { "epoch": 0.8838452725189591, "grad_norm": 0.2470703125, "learning_rate": 1.8215886863452548e-06, "loss": 2.7744, "num_input_tokens_seen": 9555148800, "step": 18225 }, { "epoch": 0.8840877540752057, "grad_norm": 0.236328125, "learning_rate": 1.8140840797673198e-06, "loss": 2.7649, "num_input_tokens_seen": 9557770240, "step": 18230 }, { "epoch": 0.8843302356314523, "grad_norm": 0.244140625, "learning_rate": 1.8065943818783736e-06, "loss": 2.777, "num_input_tokens_seen": 9560391680, "step": 18235 }, { "epoch": 0.8845727171876989, "grad_norm": 0.2470703125, "learning_rate": 1.7991195974943364e-06, "loss": 2.7729, "num_input_tokens_seen": 9563013120, "step": 18240 }, { "epoch": 0.8848151987439455, "grad_norm": 0.251953125, "learning_rate": 1.791659731421541e-06, "loss": 2.7799, "num_input_tokens_seen": 9565634560, "step": 18245 }, { "epoch": 0.8850576803001922, "grad_norm": 0.2412109375, "learning_rate": 1.7842147884567368e-06, "loss": 2.7769, "num_input_tokens_seen": 9568256000, "step": 18250 }, { "epoch": 0.8853001618564388, "grad_norm": 0.240234375, "learning_rate": 1.7767847733870523e-06, "loss": 2.7899, "num_input_tokens_seen": 9570877440, "step": 18255 }, { "epoch": 0.8855426434126854, "grad_norm": 0.2470703125, "learning_rate": 1.769369690990047e-06, "loss": 2.7757, "num_input_tokens_seen": 9573498880, "step": 18260 }, { "epoch": 0.885785124968932, "grad_norm": 0.2431640625, "learning_rate": 1.7619695460336593e-06, "loss": 2.7993, "num_input_tokens_seen": 9576120320, "step": 18265 }, { "epoch": 0.8860276065251786, "grad_norm": 0.2373046875, "learning_rate": 1.7545843432762305e-06, "loss": 2.7668, "num_input_tokens_seen": 9578741760, "step": 18270 }, { "epoch": 0.8862700880814253, "grad_norm": 0.248046875, "learning_rate": 1.7472140874664921e-06, "loss": 2.7855, "num_input_tokens_seen": 9581363200, "step": 18275 }, { "epoch": 0.8865125696376719, "grad_norm": 0.2392578125, "learning_rate": 1.7398587833435593e-06, "loss": 2.7856, "num_input_tokens_seen": 9583984640, "step": 18280 }, { "epoch": 0.8867550511939185, "grad_norm": 0.2412109375, "learning_rate": 1.732518435636954e-06, "loss": 2.778, "num_input_tokens_seen": 9586606080, "step": 18285 }, { "epoch": 0.8869975327501652, "grad_norm": 0.244140625, "learning_rate": 1.7251930490665509e-06, "loss": 2.7769, "num_input_tokens_seen": 9589227520, "step": 18290 }, { "epoch": 0.8872400143064119, "grad_norm": 0.24609375, "learning_rate": 1.7178826283426235e-06, "loss": 2.7797, "num_input_tokens_seen": 9591848960, "step": 18295 }, { "epoch": 0.8874824958626585, "grad_norm": 0.2451171875, "learning_rate": 1.7105871781658178e-06, "loss": 2.7674, "num_input_tokens_seen": 9594470400, "step": 18300 }, { "epoch": 0.8874824958626585, "eval_accuracy": 0.4559631981761928, "eval_loss": 2.741525888442993, "eval_runtime": 5.9551, "eval_samples_per_second": 50.377, "eval_steps_per_second": 6.381, "num_input_tokens_seen": 9594470400, "step": 18300 }, { "epoch": 0.8877249774189051, "grad_norm": 0.2451171875, "learning_rate": 1.703306703227156e-06, "loss": 2.7576, "num_input_tokens_seen": 9597091840, "step": 18305 }, { "epoch": 0.8879674589751517, "grad_norm": 0.240234375, "learning_rate": 1.6960412082080295e-06, "loss": 2.7891, "num_input_tokens_seen": 9599713280, "step": 18310 }, { "epoch": 0.8882099405313983, "grad_norm": 0.2431640625, "learning_rate": 1.688790697780196e-06, "loss": 2.7742, "num_input_tokens_seen": 9602334720, "step": 18315 }, { "epoch": 0.888452422087645, "grad_norm": 0.248046875, "learning_rate": 1.6815551766057757e-06, "loss": 2.7634, "num_input_tokens_seen": 9604956160, "step": 18320 }, { "epoch": 0.8886949036438916, "grad_norm": 0.2373046875, "learning_rate": 1.6743346493372587e-06, "loss": 2.7755, "num_input_tokens_seen": 9607577600, "step": 18325 }, { "epoch": 0.8889373852001382, "grad_norm": 0.2431640625, "learning_rate": 1.6671291206174805e-06, "loss": 2.7775, "num_input_tokens_seen": 9610199040, "step": 18330 }, { "epoch": 0.8891798667563848, "grad_norm": 0.244140625, "learning_rate": 1.6599385950796547e-06, "loss": 2.7788, "num_input_tokens_seen": 9612820480, "step": 18335 }, { "epoch": 0.8894223483126315, "grad_norm": 0.2421875, "learning_rate": 1.6527630773473247e-06, "loss": 2.7869, "num_input_tokens_seen": 9615441920, "step": 18340 }, { "epoch": 0.8896648298688781, "grad_norm": 0.2431640625, "learning_rate": 1.6456025720343993e-06, "loss": 2.7755, "num_input_tokens_seen": 9618063360, "step": 18345 }, { "epoch": 0.8899073114251247, "grad_norm": 0.236328125, "learning_rate": 1.6384570837451236e-06, "loss": 2.7877, "num_input_tokens_seen": 9620684800, "step": 18350 }, { "epoch": 0.8901497929813713, "grad_norm": 0.2490234375, "learning_rate": 1.6313266170740915e-06, "loss": 2.7855, "num_input_tokens_seen": 9623306240, "step": 18355 }, { "epoch": 0.8903922745376179, "grad_norm": 0.2412109375, "learning_rate": 1.6242111766062384e-06, "loss": 2.7603, "num_input_tokens_seen": 9625927680, "step": 18360 }, { "epoch": 0.8906347560938647, "grad_norm": 0.23828125, "learning_rate": 1.6171107669168378e-06, "loss": 2.7888, "num_input_tokens_seen": 9628549120, "step": 18365 }, { "epoch": 0.8908772376501113, "grad_norm": 0.2421875, "learning_rate": 1.6100253925714987e-06, "loss": 2.7791, "num_input_tokens_seen": 9631170560, "step": 18370 }, { "epoch": 0.8911197192063579, "grad_norm": 0.2431640625, "learning_rate": 1.6029550581261598e-06, "loss": 2.7952, "num_input_tokens_seen": 9633792000, "step": 18375 }, { "epoch": 0.8913622007626045, "grad_norm": 0.240234375, "learning_rate": 1.5958997681270876e-06, "loss": 2.7777, "num_input_tokens_seen": 9636413440, "step": 18380 }, { "epoch": 0.8916046823188511, "grad_norm": 0.240234375, "learning_rate": 1.588859527110878e-06, "loss": 2.7779, "num_input_tokens_seen": 9639034880, "step": 18385 }, { "epoch": 0.8918471638750978, "grad_norm": 0.2373046875, "learning_rate": 1.5818343396044543e-06, "loss": 2.7775, "num_input_tokens_seen": 9641656320, "step": 18390 }, { "epoch": 0.8920896454313444, "grad_norm": 0.248046875, "learning_rate": 1.574824210125056e-06, "loss": 2.7864, "num_input_tokens_seen": 9644277760, "step": 18395 }, { "epoch": 0.892332126987591, "grad_norm": 0.236328125, "learning_rate": 1.5678291431802355e-06, "loss": 2.7854, "num_input_tokens_seen": 9646899200, "step": 18400 }, { "epoch": 0.8925746085438376, "grad_norm": 0.2490234375, "learning_rate": 1.560849143267873e-06, "loss": 2.7777, "num_input_tokens_seen": 9649520640, "step": 18405 }, { "epoch": 0.8928170901000843, "grad_norm": 0.2392578125, "learning_rate": 1.5538842148761418e-06, "loss": 2.7695, "num_input_tokens_seen": 9652142080, "step": 18410 }, { "epoch": 0.8930595716563309, "grad_norm": 0.2412109375, "learning_rate": 1.5469343624835403e-06, "loss": 2.7667, "num_input_tokens_seen": 9654763520, "step": 18415 }, { "epoch": 0.8933020532125775, "grad_norm": 0.2392578125, "learning_rate": 1.5399995905588633e-06, "loss": 2.7874, "num_input_tokens_seen": 9657384960, "step": 18420 }, { "epoch": 0.8935445347688241, "grad_norm": 0.2490234375, "learning_rate": 1.5330799035612187e-06, "loss": 2.7733, "num_input_tokens_seen": 9660006400, "step": 18425 }, { "epoch": 0.8937870163250707, "grad_norm": 0.24609375, "learning_rate": 1.5261753059400003e-06, "loss": 2.7813, "num_input_tokens_seen": 9662627840, "step": 18430 }, { "epoch": 0.8940294978813174, "grad_norm": 0.23828125, "learning_rate": 1.519285802134915e-06, "loss": 2.7694, "num_input_tokens_seen": 9665249280, "step": 18435 }, { "epoch": 0.894271979437564, "grad_norm": 0.2412109375, "learning_rate": 1.512411396575955e-06, "loss": 2.7832, "num_input_tokens_seen": 9667870720, "step": 18440 }, { "epoch": 0.8945144609938107, "grad_norm": 0.25, "learning_rate": 1.5055520936834016e-06, "loss": 2.7733, "num_input_tokens_seen": 9670492160, "step": 18445 }, { "epoch": 0.8947569425500573, "grad_norm": 0.25, "learning_rate": 1.498707897867835e-06, "loss": 2.7886, "num_input_tokens_seen": 9673113600, "step": 18450 }, { "epoch": 0.894999424106304, "grad_norm": 0.2392578125, "learning_rate": 1.491878813530115e-06, "loss": 2.7727, "num_input_tokens_seen": 9675735040, "step": 18455 }, { "epoch": 0.8952419056625506, "grad_norm": 0.2451171875, "learning_rate": 1.485064845061382e-06, "loss": 2.7743, "num_input_tokens_seen": 9678356480, "step": 18460 }, { "epoch": 0.8954843872187972, "grad_norm": 0.248046875, "learning_rate": 1.4782659968430646e-06, "loss": 2.7796, "num_input_tokens_seen": 9680977920, "step": 18465 }, { "epoch": 0.8957268687750438, "grad_norm": 0.236328125, "learning_rate": 1.4714822732468658e-06, "loss": 2.7592, "num_input_tokens_seen": 9683599360, "step": 18470 }, { "epoch": 0.8959693503312904, "grad_norm": 0.251953125, "learning_rate": 1.4647136786347548e-06, "loss": 2.7815, "num_input_tokens_seen": 9686220800, "step": 18475 }, { "epoch": 0.8962118318875371, "grad_norm": 0.2392578125, "learning_rate": 1.4579602173589862e-06, "loss": 2.7768, "num_input_tokens_seen": 9688842240, "step": 18480 }, { "epoch": 0.8964543134437837, "grad_norm": 0.2421875, "learning_rate": 1.4512218937620752e-06, "loss": 2.7648, "num_input_tokens_seen": 9691463680, "step": 18485 }, { "epoch": 0.8966967950000303, "grad_norm": 0.2451171875, "learning_rate": 1.444498712176809e-06, "loss": 2.7862, "num_input_tokens_seen": 9694085120, "step": 18490 }, { "epoch": 0.8969392765562769, "grad_norm": 0.2412109375, "learning_rate": 1.4377906769262317e-06, "loss": 2.7747, "num_input_tokens_seen": 9696706560, "step": 18495 }, { "epoch": 0.8971817581125235, "grad_norm": 0.234375, "learning_rate": 1.4310977923236547e-06, "loss": 2.78, "num_input_tokens_seen": 9699328000, "step": 18500 }, { "epoch": 0.8974242396687702, "grad_norm": 0.244140625, "learning_rate": 1.4244200626726462e-06, "loss": 2.7744, "num_input_tokens_seen": 9701949440, "step": 18505 }, { "epoch": 0.8976667212250168, "grad_norm": 0.2353515625, "learning_rate": 1.4177574922670218e-06, "loss": 2.769, "num_input_tokens_seen": 9704570880, "step": 18510 }, { "epoch": 0.8979092027812634, "grad_norm": 0.25, "learning_rate": 1.4111100853908627e-06, "loss": 2.7785, "num_input_tokens_seen": 9707192320, "step": 18515 }, { "epoch": 0.89815168433751, "grad_norm": 0.2451171875, "learning_rate": 1.4044778463184915e-06, "loss": 2.7619, "num_input_tokens_seen": 9709813760, "step": 18520 }, { "epoch": 0.8983941658937568, "grad_norm": 0.240234375, "learning_rate": 1.3978607793144776e-06, "loss": 2.7747, "num_input_tokens_seen": 9712435200, "step": 18525 }, { "epoch": 0.8986366474500034, "grad_norm": 0.2373046875, "learning_rate": 1.3912588886336398e-06, "loss": 2.7821, "num_input_tokens_seen": 9715056640, "step": 18530 }, { "epoch": 0.89887912900625, "grad_norm": 0.25, "learning_rate": 1.3846721785210292e-06, "loss": 2.7866, "num_input_tokens_seen": 9717678080, "step": 18535 }, { "epoch": 0.8991216105624966, "grad_norm": 0.2431640625, "learning_rate": 1.3781006532119445e-06, "loss": 2.7765, "num_input_tokens_seen": 9720299520, "step": 18540 }, { "epoch": 0.8993640921187432, "grad_norm": 0.2470703125, "learning_rate": 1.3715443169319191e-06, "loss": 2.7961, "num_input_tokens_seen": 9722920960, "step": 18545 }, { "epoch": 0.8996065736749899, "grad_norm": 0.24609375, "learning_rate": 1.3650031738967172e-06, "loss": 2.7744, "num_input_tokens_seen": 9725542400, "step": 18550 }, { "epoch": 0.8998490552312365, "grad_norm": 0.2392578125, "learning_rate": 1.3584772283123353e-06, "loss": 2.777, "num_input_tokens_seen": 9728163840, "step": 18555 }, { "epoch": 0.9000915367874831, "grad_norm": 0.240234375, "learning_rate": 1.3519664843749947e-06, "loss": 2.7662, "num_input_tokens_seen": 9730785280, "step": 18560 }, { "epoch": 0.9003340183437297, "grad_norm": 0.2470703125, "learning_rate": 1.345470946271149e-06, "loss": 2.7771, "num_input_tokens_seen": 9733406720, "step": 18565 }, { "epoch": 0.9005764998999763, "grad_norm": 0.244140625, "learning_rate": 1.3389906181774658e-06, "loss": 2.7581, "num_input_tokens_seen": 9736028160, "step": 18570 }, { "epoch": 0.900818981456223, "grad_norm": 0.2412109375, "learning_rate": 1.332525504260837e-06, "loss": 2.7769, "num_input_tokens_seen": 9738649600, "step": 18575 }, { "epoch": 0.9010614630124696, "grad_norm": 0.23828125, "learning_rate": 1.3260756086783732e-06, "loss": 2.7798, "num_input_tokens_seen": 9741271040, "step": 18580 }, { "epoch": 0.9013039445687162, "grad_norm": 0.2333984375, "learning_rate": 1.3196409355773986e-06, "loss": 2.7692, "num_input_tokens_seen": 9743892480, "step": 18585 }, { "epoch": 0.9015464261249628, "grad_norm": 0.2392578125, "learning_rate": 1.3132214890954453e-06, "loss": 2.7824, "num_input_tokens_seen": 9746513920, "step": 18590 }, { "epoch": 0.9017889076812095, "grad_norm": 0.240234375, "learning_rate": 1.3068172733602613e-06, "loss": 2.7804, "num_input_tokens_seen": 9749135360, "step": 18595 }, { "epoch": 0.9020313892374561, "grad_norm": 0.2412109375, "learning_rate": 1.3004282924897915e-06, "loss": 2.7601, "num_input_tokens_seen": 9751756800, "step": 18600 }, { "epoch": 0.9020313892374561, "eval_accuracy": 0.4559957661618629, "eval_loss": 2.741598606109619, "eval_runtime": 5.8657, "eval_samples_per_second": 51.145, "eval_steps_per_second": 6.478, "num_input_tokens_seen": 9751756800, "step": 18600 }, { "epoch": 0.9022738707937028, "grad_norm": 0.2392578125, "learning_rate": 1.294054550592194e-06, "loss": 2.7642, "num_input_tokens_seen": 9754378240, "step": 18605 }, { "epoch": 0.9025163523499494, "grad_norm": 0.240234375, "learning_rate": 1.2876960517658242e-06, "loss": 2.7627, "num_input_tokens_seen": 9756999680, "step": 18610 }, { "epoch": 0.902758833906196, "grad_norm": 0.2421875, "learning_rate": 1.2813528000992337e-06, "loss": 2.7592, "num_input_tokens_seen": 9759621120, "step": 18615 }, { "epoch": 0.9030013154624427, "grad_norm": 0.248046875, "learning_rate": 1.2750247996711789e-06, "loss": 2.7852, "num_input_tokens_seen": 9762242560, "step": 18620 }, { "epoch": 0.9032437970186893, "grad_norm": 0.2421875, "learning_rate": 1.2687120545506054e-06, "loss": 2.7813, "num_input_tokens_seen": 9764864000, "step": 18625 }, { "epoch": 0.9034862785749359, "grad_norm": 0.240234375, "learning_rate": 1.262414568796641e-06, "loss": 2.7722, "num_input_tokens_seen": 9767485440, "step": 18630 }, { "epoch": 0.9037287601311825, "grad_norm": 0.2412109375, "learning_rate": 1.2561323464586105e-06, "loss": 2.7861, "num_input_tokens_seen": 9770106880, "step": 18635 }, { "epoch": 0.9039712416874292, "grad_norm": 0.2431640625, "learning_rate": 1.2498653915760216e-06, "loss": 2.7734, "num_input_tokens_seen": 9772728320, "step": 18640 }, { "epoch": 0.9042137232436758, "grad_norm": 0.2451171875, "learning_rate": 1.2436137081785677e-06, "loss": 2.7927, "num_input_tokens_seen": 9775349760, "step": 18645 }, { "epoch": 0.9044562047999224, "grad_norm": 0.2392578125, "learning_rate": 1.2373773002861161e-06, "loss": 2.7753, "num_input_tokens_seen": 9777971200, "step": 18650 }, { "epoch": 0.904698686356169, "grad_norm": 0.23828125, "learning_rate": 1.231156171908726e-06, "loss": 2.7794, "num_input_tokens_seen": 9780592640, "step": 18655 }, { "epoch": 0.9049411679124156, "grad_norm": 0.24609375, "learning_rate": 1.2249503270466089e-06, "loss": 2.7738, "num_input_tokens_seen": 9783214080, "step": 18660 }, { "epoch": 0.9051836494686623, "grad_norm": 0.236328125, "learning_rate": 1.2187597696901698e-06, "loss": 2.7723, "num_input_tokens_seen": 9785835520, "step": 18665 }, { "epoch": 0.9054261310249089, "grad_norm": 0.2412109375, "learning_rate": 1.212584503819969e-06, "loss": 2.7683, "num_input_tokens_seen": 9788456960, "step": 18670 }, { "epoch": 0.9056686125811555, "grad_norm": 0.2431640625, "learning_rate": 1.2064245334067526e-06, "loss": 2.791, "num_input_tokens_seen": 9791078400, "step": 18675 }, { "epoch": 0.9059110941374022, "grad_norm": 0.248046875, "learning_rate": 1.2002798624114102e-06, "loss": 2.7764, "num_input_tokens_seen": 9793699840, "step": 18680 }, { "epoch": 0.9061535756936488, "grad_norm": 0.2353515625, "learning_rate": 1.1941504947850125e-06, "loss": 2.7651, "num_input_tokens_seen": 9796321280, "step": 18685 }, { "epoch": 0.9063960572498955, "grad_norm": 0.2421875, "learning_rate": 1.18803643446877e-06, "loss": 2.7711, "num_input_tokens_seen": 9798942720, "step": 18690 }, { "epoch": 0.9066385388061421, "grad_norm": 0.240234375, "learning_rate": 1.1819376853940688e-06, "loss": 2.7745, "num_input_tokens_seen": 9801564160, "step": 18695 }, { "epoch": 0.9068810203623887, "grad_norm": 0.251953125, "learning_rate": 1.1758542514824416e-06, "loss": 2.7726, "num_input_tokens_seen": 9804185600, "step": 18700 }, { "epoch": 0.9071235019186353, "grad_norm": 0.240234375, "learning_rate": 1.169786136645573e-06, "loss": 2.7618, "num_input_tokens_seen": 9806807040, "step": 18705 }, { "epoch": 0.907365983474882, "grad_norm": 0.248046875, "learning_rate": 1.1637333447853006e-06, "loss": 2.7598, "num_input_tokens_seen": 9809428480, "step": 18710 }, { "epoch": 0.9076084650311286, "grad_norm": 0.2431640625, "learning_rate": 1.1576958797936105e-06, "loss": 2.7753, "num_input_tokens_seen": 9812049920, "step": 18715 }, { "epoch": 0.9078509465873752, "grad_norm": 0.2412109375, "learning_rate": 1.1516737455526228e-06, "loss": 2.7694, "num_input_tokens_seen": 9814671360, "step": 18720 }, { "epoch": 0.9080934281436218, "grad_norm": 0.2373046875, "learning_rate": 1.1456669459346091e-06, "loss": 2.7757, "num_input_tokens_seen": 9817292800, "step": 18725 }, { "epoch": 0.9083359096998684, "grad_norm": 0.2412109375, "learning_rate": 1.139675484801986e-06, "loss": 2.7682, "num_input_tokens_seen": 9819914240, "step": 18730 }, { "epoch": 0.9085783912561151, "grad_norm": 0.2412109375, "learning_rate": 1.133699366007293e-06, "loss": 2.7597, "num_input_tokens_seen": 9822535680, "step": 18735 }, { "epoch": 0.9088208728123617, "grad_norm": 0.25, "learning_rate": 1.1277385933932183e-06, "loss": 2.777, "num_input_tokens_seen": 9825157120, "step": 18740 }, { "epoch": 0.9090633543686083, "grad_norm": 0.2353515625, "learning_rate": 1.1217931707925704e-06, "loss": 2.784, "num_input_tokens_seen": 9827778560, "step": 18745 }, { "epoch": 0.9093058359248549, "grad_norm": 0.23828125, "learning_rate": 1.1158631020282972e-06, "loss": 2.7776, "num_input_tokens_seen": 9830400000, "step": 18750 }, { "epoch": 0.9095483174811015, "grad_norm": 0.2412109375, "learning_rate": 1.1099483909134678e-06, "loss": 2.7726, "num_input_tokens_seen": 9833021440, "step": 18755 }, { "epoch": 0.9097907990373483, "grad_norm": 0.240234375, "learning_rate": 1.1040490412512787e-06, "loss": 2.7638, "num_input_tokens_seen": 9835642880, "step": 18760 }, { "epoch": 0.9100332805935949, "grad_norm": 0.244140625, "learning_rate": 1.0981650568350487e-06, "loss": 2.7806, "num_input_tokens_seen": 9838264320, "step": 18765 }, { "epoch": 0.9102757621498415, "grad_norm": 0.2412109375, "learning_rate": 1.0922964414482151e-06, "loss": 2.7563, "num_input_tokens_seen": 9840885760, "step": 18770 }, { "epoch": 0.9105182437060881, "grad_norm": 0.240234375, "learning_rate": 1.086443198864337e-06, "loss": 2.7906, "num_input_tokens_seen": 9843507200, "step": 18775 }, { "epoch": 0.9107607252623348, "grad_norm": 0.2353515625, "learning_rate": 1.0806053328470843e-06, "loss": 2.7728, "num_input_tokens_seen": 9846128640, "step": 18780 }, { "epoch": 0.9110032068185814, "grad_norm": 0.244140625, "learning_rate": 1.0747828471502435e-06, "loss": 2.7883, "num_input_tokens_seen": 9848750080, "step": 18785 }, { "epoch": 0.911245688374828, "grad_norm": 0.2392578125, "learning_rate": 1.0689757455177057e-06, "loss": 2.7661, "num_input_tokens_seen": 9851371520, "step": 18790 }, { "epoch": 0.9114881699310746, "grad_norm": 0.2431640625, "learning_rate": 1.0631840316834785e-06, "loss": 2.7722, "num_input_tokens_seen": 9853992960, "step": 18795 }, { "epoch": 0.9117306514873212, "grad_norm": 0.24609375, "learning_rate": 1.0574077093716661e-06, "loss": 2.7715, "num_input_tokens_seen": 9856614400, "step": 18800 }, { "epoch": 0.9119731330435679, "grad_norm": 0.244140625, "learning_rate": 1.051646782296481e-06, "loss": 2.7717, "num_input_tokens_seen": 9859235840, "step": 18805 }, { "epoch": 0.9122156145998145, "grad_norm": 0.2421875, "learning_rate": 1.0459012541622376e-06, "loss": 2.7854, "num_input_tokens_seen": 9861857280, "step": 18810 }, { "epoch": 0.9124580961560611, "grad_norm": 0.23828125, "learning_rate": 1.040171128663342e-06, "loss": 2.7665, "num_input_tokens_seen": 9864478720, "step": 18815 }, { "epoch": 0.9127005777123077, "grad_norm": 0.2392578125, "learning_rate": 1.0344564094843023e-06, "loss": 2.7631, "num_input_tokens_seen": 9867100160, "step": 18820 }, { "epoch": 0.9129430592685543, "grad_norm": 0.2431640625, "learning_rate": 1.028757100299721e-06, "loss": 2.7683, "num_input_tokens_seen": 9869721600, "step": 18825 }, { "epoch": 0.913185540824801, "grad_norm": 0.2421875, "learning_rate": 1.0230732047742857e-06, "loss": 2.7905, "num_input_tokens_seen": 9872343040, "step": 18830 }, { "epoch": 0.9134280223810476, "grad_norm": 0.2470703125, "learning_rate": 1.0174047265627818e-06, "loss": 2.7753, "num_input_tokens_seen": 9874964480, "step": 18835 }, { "epoch": 0.9136705039372943, "grad_norm": 0.2421875, "learning_rate": 1.011751669310071e-06, "loss": 2.77, "num_input_tokens_seen": 9877585920, "step": 18840 }, { "epoch": 0.9139129854935409, "grad_norm": 0.2373046875, "learning_rate": 1.006114036651107e-06, "loss": 2.7594, "num_input_tokens_seen": 9880207360, "step": 18845 }, { "epoch": 0.9141554670497876, "grad_norm": 0.25, "learning_rate": 1.0004918322109258e-06, "loss": 2.7693, "num_input_tokens_seen": 9882828800, "step": 18850 }, { "epoch": 0.9143979486060342, "grad_norm": 0.2412109375, "learning_rate": 9.948850596046332e-07, "loss": 2.7856, "num_input_tokens_seen": 9885450240, "step": 18855 }, { "epoch": 0.9146404301622808, "grad_norm": 0.236328125, "learning_rate": 9.892937224374261e-07, "loss": 2.7889, "num_input_tokens_seen": 9888071680, "step": 18860 }, { "epoch": 0.9148829117185274, "grad_norm": 0.2421875, "learning_rate": 9.837178243045641e-07, "loss": 2.7731, "num_input_tokens_seen": 9890693120, "step": 18865 }, { "epoch": 0.915125393274774, "grad_norm": 0.2451171875, "learning_rate": 9.781573687913909e-07, "loss": 2.7776, "num_input_tokens_seen": 9893314560, "step": 18870 }, { "epoch": 0.9153678748310207, "grad_norm": 0.2412109375, "learning_rate": 9.726123594733072e-07, "loss": 2.7727, "num_input_tokens_seen": 9895936000, "step": 18875 }, { "epoch": 0.9156103563872673, "grad_norm": 0.244140625, "learning_rate": 9.67082799915789e-07, "loss": 2.7768, "num_input_tokens_seen": 9898557440, "step": 18880 }, { "epoch": 0.9158528379435139, "grad_norm": 0.2412109375, "learning_rate": 9.615686936743834e-07, "loss": 2.7858, "num_input_tokens_seen": 9901178880, "step": 18885 }, { "epoch": 0.9160953194997605, "grad_norm": 0.2421875, "learning_rate": 9.560700442946906e-07, "loss": 2.7612, "num_input_tokens_seen": 9903800320, "step": 18890 }, { "epoch": 0.9163378010560072, "grad_norm": 0.2470703125, "learning_rate": 9.50586855312377e-07, "loss": 2.7813, "num_input_tokens_seen": 9906421760, "step": 18895 }, { "epoch": 0.9165802826122538, "grad_norm": 0.23828125, "learning_rate": 9.451191302531693e-07, "loss": 2.7823, "num_input_tokens_seen": 9909043200, "step": 18900 }, { "epoch": 0.9165802826122538, "eval_accuracy": 0.45599088096401236, "eval_loss": 2.7415921688079834, "eval_runtime": 5.8578, "eval_samples_per_second": 51.213, "eval_steps_per_second": 6.487, "num_input_tokens_seen": 9909043200, "step": 18900 }, { "epoch": 0.9168227641685004, "grad_norm": 0.236328125, "learning_rate": 9.396668726328467e-07, "loss": 2.767, "num_input_tokens_seen": 9911664640, "step": 18905 }, { "epoch": 0.917065245724747, "grad_norm": 0.240234375, "learning_rate": 9.342300859572467e-07, "loss": 2.756, "num_input_tokens_seen": 9914286080, "step": 18910 }, { "epoch": 0.9173077272809936, "grad_norm": 0.23828125, "learning_rate": 9.288087737222562e-07, "loss": 2.7867, "num_input_tokens_seen": 9916907520, "step": 18915 }, { "epoch": 0.9175502088372404, "grad_norm": 0.2451171875, "learning_rate": 9.234029394138116e-07, "loss": 2.7726, "num_input_tokens_seen": 9919528960, "step": 18920 }, { "epoch": 0.917792690393487, "grad_norm": 0.2451171875, "learning_rate": 9.18012586507902e-07, "loss": 2.7795, "num_input_tokens_seen": 9922150400, "step": 18925 }, { "epoch": 0.9180351719497336, "grad_norm": 0.240234375, "learning_rate": 9.126377184705576e-07, "loss": 2.7705, "num_input_tokens_seen": 9924771840, "step": 18930 }, { "epoch": 0.9182776535059802, "grad_norm": 0.244140625, "learning_rate": 9.072783387578499e-07, "loss": 2.7789, "num_input_tokens_seen": 9927393280, "step": 18935 }, { "epoch": 0.9185201350622269, "grad_norm": 0.2451171875, "learning_rate": 9.019344508158945e-07, "loss": 2.7743, "num_input_tokens_seen": 9930014720, "step": 18940 }, { "epoch": 0.9187626166184735, "grad_norm": 0.248046875, "learning_rate": 8.9660605808084e-07, "loss": 2.774, "num_input_tokens_seen": 9932636160, "step": 18945 }, { "epoch": 0.9190050981747201, "grad_norm": 0.2392578125, "learning_rate": 8.912931639788847e-07, "loss": 2.7861, "num_input_tokens_seen": 9935257600, "step": 18950 }, { "epoch": 0.9192475797309667, "grad_norm": 0.2431640625, "learning_rate": 8.859957719262429e-07, "loss": 2.7704, "num_input_tokens_seen": 9937879040, "step": 18955 }, { "epoch": 0.9194900612872133, "grad_norm": 0.251953125, "learning_rate": 8.807138853291818e-07, "loss": 2.7902, "num_input_tokens_seen": 9940500480, "step": 18960 }, { "epoch": 0.91973254284346, "grad_norm": 0.2431640625, "learning_rate": 8.754475075839846e-07, "loss": 2.7725, "num_input_tokens_seen": 9943121920, "step": 18965 }, { "epoch": 0.9199750243997066, "grad_norm": 0.2392578125, "learning_rate": 8.701966420769591e-07, "loss": 2.7651, "num_input_tokens_seen": 9945743360, "step": 18970 }, { "epoch": 0.9202175059559532, "grad_norm": 0.2431640625, "learning_rate": 8.649612921844491e-07, "loss": 2.7622, "num_input_tokens_seen": 9948364800, "step": 18975 }, { "epoch": 0.9204599875121998, "grad_norm": 0.244140625, "learning_rate": 8.597414612728172e-07, "loss": 2.7767, "num_input_tokens_seen": 9950986240, "step": 18980 }, { "epoch": 0.9207024690684464, "grad_norm": 0.2431640625, "learning_rate": 8.54537152698448e-07, "loss": 2.7723, "num_input_tokens_seen": 9953607680, "step": 18985 }, { "epoch": 0.9209449506246931, "grad_norm": 0.24609375, "learning_rate": 8.493483698077398e-07, "loss": 2.7736, "num_input_tokens_seen": 9956229120, "step": 18990 }, { "epoch": 0.9211874321809398, "grad_norm": 0.2412109375, "learning_rate": 8.441751159371209e-07, "loss": 2.7762, "num_input_tokens_seen": 9958850560, "step": 18995 }, { "epoch": 0.9214299137371864, "grad_norm": 0.255859375, "learning_rate": 8.390173944130192e-07, "loss": 2.7613, "num_input_tokens_seen": 9961472000, "step": 19000 }, { "epoch": 0.921672395293433, "grad_norm": 0.2392578125, "learning_rate": 8.338752085518819e-07, "loss": 2.7686, "num_input_tokens_seen": 9964093440, "step": 19005 }, { "epoch": 0.9219148768496797, "grad_norm": 0.240234375, "learning_rate": 8.28748561660167e-07, "loss": 2.7632, "num_input_tokens_seen": 9966714880, "step": 19010 }, { "epoch": 0.9221573584059263, "grad_norm": 0.25, "learning_rate": 8.23637457034343e-07, "loss": 2.7734, "num_input_tokens_seen": 9969336320, "step": 19015 }, { "epoch": 0.9223998399621729, "grad_norm": 0.2421875, "learning_rate": 8.185418979608811e-07, "loss": 2.7805, "num_input_tokens_seen": 9971957760, "step": 19020 }, { "epoch": 0.9226423215184195, "grad_norm": 0.2314453125, "learning_rate": 8.134618877162631e-07, "loss": 2.7605, "num_input_tokens_seen": 9974579200, "step": 19025 }, { "epoch": 0.9228848030746661, "grad_norm": 0.2421875, "learning_rate": 8.083974295669566e-07, "loss": 2.7682, "num_input_tokens_seen": 9977200640, "step": 19030 }, { "epoch": 0.9231272846309128, "grad_norm": 0.2392578125, "learning_rate": 8.033485267694457e-07, "loss": 2.7782, "num_input_tokens_seen": 9979822080, "step": 19035 }, { "epoch": 0.9233697661871594, "grad_norm": 0.2392578125, "learning_rate": 7.983151825702085e-07, "loss": 2.7764, "num_input_tokens_seen": 9982443520, "step": 19040 }, { "epoch": 0.923612247743406, "grad_norm": 0.2392578125, "learning_rate": 7.932974002057115e-07, "loss": 2.7665, "num_input_tokens_seen": 9985064960, "step": 19045 }, { "epoch": 0.9238547292996526, "grad_norm": 0.2431640625, "learning_rate": 7.882951829024237e-07, "loss": 2.7671, "num_input_tokens_seen": 9987686400, "step": 19050 }, { "epoch": 0.9240972108558992, "grad_norm": 0.2421875, "learning_rate": 7.833085338768003e-07, "loss": 2.7761, "num_input_tokens_seen": 9990307840, "step": 19055 }, { "epoch": 0.9243396924121459, "grad_norm": 0.234375, "learning_rate": 7.783374563352902e-07, "loss": 2.7673, "num_input_tokens_seen": 9992929280, "step": 19060 }, { "epoch": 0.9245821739683925, "grad_norm": 0.240234375, "learning_rate": 7.73381953474328e-07, "loss": 2.7913, "num_input_tokens_seen": 9995550720, "step": 19065 }, { "epoch": 0.9248246555246391, "grad_norm": 0.2451171875, "learning_rate": 7.68442028480329e-07, "loss": 2.7814, "num_input_tokens_seen": 9998172160, "step": 19070 }, { "epoch": 0.9250671370808858, "grad_norm": 0.2412109375, "learning_rate": 7.635176845296966e-07, "loss": 2.7733, "num_input_tokens_seen": 10000793600, "step": 19075 }, { "epoch": 0.9253096186371325, "grad_norm": 0.2412109375, "learning_rate": 7.586089247888173e-07, "loss": 2.7804, "num_input_tokens_seen": 10003415040, "step": 19080 }, { "epoch": 0.9255521001933791, "grad_norm": 0.2373046875, "learning_rate": 7.537157524140554e-07, "loss": 2.7858, "num_input_tokens_seen": 10006036480, "step": 19085 }, { "epoch": 0.9257945817496257, "grad_norm": 0.2431640625, "learning_rate": 7.488381705517494e-07, "loss": 2.7652, "num_input_tokens_seen": 10008657920, "step": 19090 }, { "epoch": 0.9260370633058723, "grad_norm": 0.2412109375, "learning_rate": 7.439761823382129e-07, "loss": 2.7694, "num_input_tokens_seen": 10011279360, "step": 19095 }, { "epoch": 0.9262795448621189, "grad_norm": 0.240234375, "learning_rate": 7.391297908997341e-07, "loss": 2.762, "num_input_tokens_seen": 10013900800, "step": 19100 }, { "epoch": 0.9265220264183656, "grad_norm": 0.2431640625, "learning_rate": 7.342989993525784e-07, "loss": 2.7734, "num_input_tokens_seen": 10016522240, "step": 19105 }, { "epoch": 0.9267645079746122, "grad_norm": 0.2353515625, "learning_rate": 7.294838108029722e-07, "loss": 2.768, "num_input_tokens_seen": 10019143680, "step": 19110 }, { "epoch": 0.9270069895308588, "grad_norm": 0.25, "learning_rate": 7.246842283471084e-07, "loss": 2.7644, "num_input_tokens_seen": 10021765120, "step": 19115 }, { "epoch": 0.9272494710871054, "grad_norm": 0.236328125, "learning_rate": 7.199002550711542e-07, "loss": 2.7643, "num_input_tokens_seen": 10024386560, "step": 19120 }, { "epoch": 0.927491952643352, "grad_norm": 0.248046875, "learning_rate": 7.151318940512325e-07, "loss": 2.7756, "num_input_tokens_seen": 10027008000, "step": 19125 }, { "epoch": 0.9277344341995987, "grad_norm": 0.2392578125, "learning_rate": 7.103791483534267e-07, "loss": 2.7716, "num_input_tokens_seen": 10029629440, "step": 19130 }, { "epoch": 0.9279769157558453, "grad_norm": 0.244140625, "learning_rate": 7.056420210337866e-07, "loss": 2.7803, "num_input_tokens_seen": 10032250880, "step": 19135 }, { "epoch": 0.9282193973120919, "grad_norm": 0.2392578125, "learning_rate": 7.009205151383119e-07, "loss": 2.778, "num_input_tokens_seen": 10034872320, "step": 19140 }, { "epoch": 0.9284618788683385, "grad_norm": 0.2412109375, "learning_rate": 6.962146337029573e-07, "loss": 2.7613, "num_input_tokens_seen": 10037493760, "step": 19145 }, { "epoch": 0.9287043604245852, "grad_norm": 0.244140625, "learning_rate": 6.915243797536442e-07, "loss": 2.7707, "num_input_tokens_seen": 10040115200, "step": 19150 }, { "epoch": 0.9289468419808319, "grad_norm": 0.2353515625, "learning_rate": 6.868497563062237e-07, "loss": 2.7964, "num_input_tokens_seen": 10042736640, "step": 19155 }, { "epoch": 0.9291893235370785, "grad_norm": 0.251953125, "learning_rate": 6.821907663665111e-07, "loss": 2.7723, "num_input_tokens_seen": 10045358080, "step": 19160 }, { "epoch": 0.9294318050933251, "grad_norm": 0.2470703125, "learning_rate": 6.775474129302711e-07, "loss": 2.7703, "num_input_tokens_seen": 10047979520, "step": 19165 }, { "epoch": 0.9296742866495717, "grad_norm": 0.2412109375, "learning_rate": 6.729196989832043e-07, "loss": 2.7921, "num_input_tokens_seen": 10050600960, "step": 19170 }, { "epoch": 0.9299167682058184, "grad_norm": 0.244140625, "learning_rate": 6.683076275009581e-07, "loss": 2.7561, "num_input_tokens_seen": 10053222400, "step": 19175 }, { "epoch": 0.930159249762065, "grad_norm": 0.2431640625, "learning_rate": 6.637112014491298e-07, "loss": 2.7668, "num_input_tokens_seen": 10055843840, "step": 19180 }, { "epoch": 0.9304017313183116, "grad_norm": 0.2353515625, "learning_rate": 6.591304237832441e-07, "loss": 2.7632, "num_input_tokens_seen": 10058465280, "step": 19185 }, { "epoch": 0.9306442128745582, "grad_norm": 0.23828125, "learning_rate": 6.545652974487754e-07, "loss": 2.7815, "num_input_tokens_seen": 10061086720, "step": 19190 }, { "epoch": 0.9308866944308049, "grad_norm": 0.2412109375, "learning_rate": 6.500158253811228e-07, "loss": 2.7864, "num_input_tokens_seen": 10063708160, "step": 19195 }, { "epoch": 0.9311291759870515, "grad_norm": 0.248046875, "learning_rate": 6.45482010505627e-07, "loss": 2.7767, "num_input_tokens_seen": 10066329600, "step": 19200 }, { "epoch": 0.9311291759870515, "eval_accuracy": 0.45601367855398145, "eval_loss": 2.7415668964385986, "eval_runtime": 5.8962, "eval_samples_per_second": 50.88, "eval_steps_per_second": 6.445, "num_input_tokens_seen": 10066329600, "step": 19200 }, { "epoch": 0.9313716575432981, "grad_norm": 0.244140625, "learning_rate": 6.409638557375613e-07, "loss": 2.7922, "num_input_tokens_seen": 10068951040, "step": 19205 }, { "epoch": 0.9316141390995447, "grad_norm": 0.24609375, "learning_rate": 6.364613639821243e-07, "loss": 2.7786, "num_input_tokens_seen": 10071572480, "step": 19210 }, { "epoch": 0.9318566206557913, "grad_norm": 0.251953125, "learning_rate": 6.319745381344527e-07, "loss": 2.7652, "num_input_tokens_seen": 10074193920, "step": 19215 }, { "epoch": 0.932099102212038, "grad_norm": 0.2392578125, "learning_rate": 6.275033810795944e-07, "loss": 2.7671, "num_input_tokens_seen": 10076815360, "step": 19220 }, { "epoch": 0.9323415837682846, "grad_norm": 0.244140625, "learning_rate": 6.230478956925384e-07, "loss": 2.7792, "num_input_tokens_seen": 10079436800, "step": 19225 }, { "epoch": 0.9325840653245313, "grad_norm": 0.2451171875, "learning_rate": 6.186080848381876e-07, "loss": 2.774, "num_input_tokens_seen": 10082058240, "step": 19230 }, { "epoch": 0.9328265468807779, "grad_norm": 0.2421875, "learning_rate": 6.141839513713666e-07, "loss": 2.7823, "num_input_tokens_seen": 10084679680, "step": 19235 }, { "epoch": 0.9330690284370246, "grad_norm": 0.2421875, "learning_rate": 6.097754981368192e-07, "loss": 2.7738, "num_input_tokens_seen": 10087301120, "step": 19240 }, { "epoch": 0.9333115099932712, "grad_norm": 0.2412109375, "learning_rate": 6.05382727969217e-07, "loss": 2.7682, "num_input_tokens_seen": 10089922560, "step": 19245 }, { "epoch": 0.9335539915495178, "grad_norm": 0.2353515625, "learning_rate": 6.010056436931311e-07, "loss": 2.777, "num_input_tokens_seen": 10092544000, "step": 19250 }, { "epoch": 0.9337964731057644, "grad_norm": 0.2373046875, "learning_rate": 5.966442481230544e-07, "loss": 2.7631, "num_input_tokens_seen": 10095165440, "step": 19255 }, { "epoch": 0.934038954662011, "grad_norm": 0.248046875, "learning_rate": 5.922985440633965e-07, "loss": 2.7715, "num_input_tokens_seen": 10097786880, "step": 19260 }, { "epoch": 0.9342814362182577, "grad_norm": 0.240234375, "learning_rate": 5.879685343084668e-07, "loss": 2.7787, "num_input_tokens_seen": 10100408320, "step": 19265 }, { "epoch": 0.9345239177745043, "grad_norm": 0.2421875, "learning_rate": 5.836542216424907e-07, "loss": 2.7713, "num_input_tokens_seen": 10103029760, "step": 19270 }, { "epoch": 0.9347663993307509, "grad_norm": 0.2431640625, "learning_rate": 5.793556088396018e-07, "loss": 2.7874, "num_input_tokens_seen": 10105651200, "step": 19275 }, { "epoch": 0.9350088808869975, "grad_norm": 0.240234375, "learning_rate": 5.750726986638283e-07, "loss": 2.7682, "num_input_tokens_seen": 10108272640, "step": 19280 }, { "epoch": 0.9352513624432441, "grad_norm": 0.25390625, "learning_rate": 5.708054938691115e-07, "loss": 2.7856, "num_input_tokens_seen": 10110894080, "step": 19285 }, { "epoch": 0.9354938439994908, "grad_norm": 0.2470703125, "learning_rate": 5.665539971992928e-07, "loss": 2.7906, "num_input_tokens_seen": 10113515520, "step": 19290 }, { "epoch": 0.9357363255557374, "grad_norm": 0.2470703125, "learning_rate": 5.623182113881048e-07, "loss": 2.7751, "num_input_tokens_seen": 10116136960, "step": 19295 }, { "epoch": 0.935978807111984, "grad_norm": 0.244140625, "learning_rate": 5.580981391591911e-07, "loss": 2.7692, "num_input_tokens_seen": 10118758400, "step": 19300 }, { "epoch": 0.9362212886682306, "grad_norm": 0.2470703125, "learning_rate": 5.538937832260838e-07, "loss": 2.7718, "num_input_tokens_seen": 10121379840, "step": 19305 }, { "epoch": 0.9364637702244774, "grad_norm": 0.23828125, "learning_rate": 5.497051462922093e-07, "loss": 2.787, "num_input_tokens_seen": 10124001280, "step": 19310 }, { "epoch": 0.936706251780724, "grad_norm": 0.248046875, "learning_rate": 5.455322310508826e-07, "loss": 2.7713, "num_input_tokens_seen": 10126622720, "step": 19315 }, { "epoch": 0.9369487333369706, "grad_norm": 0.2431640625, "learning_rate": 5.413750401853213e-07, "loss": 2.7678, "num_input_tokens_seen": 10129244160, "step": 19320 }, { "epoch": 0.9371912148932172, "grad_norm": 0.2431640625, "learning_rate": 5.372335763686203e-07, "loss": 2.7763, "num_input_tokens_seen": 10131865600, "step": 19325 }, { "epoch": 0.9374336964494638, "grad_norm": 0.2373046875, "learning_rate": 5.331078422637692e-07, "loss": 2.7631, "num_input_tokens_seen": 10134487040, "step": 19330 }, { "epoch": 0.9376761780057105, "grad_norm": 0.2421875, "learning_rate": 5.289978405236429e-07, "loss": 2.77, "num_input_tokens_seen": 10137108480, "step": 19335 }, { "epoch": 0.9379186595619571, "grad_norm": 0.240234375, "learning_rate": 5.249035737909913e-07, "loss": 2.7744, "num_input_tokens_seen": 10139729920, "step": 19340 }, { "epoch": 0.9381611411182037, "grad_norm": 0.236328125, "learning_rate": 5.208250446984586e-07, "loss": 2.7839, "num_input_tokens_seen": 10142351360, "step": 19345 }, { "epoch": 0.9384036226744503, "grad_norm": 0.2421875, "learning_rate": 5.167622558685609e-07, "loss": 2.7684, "num_input_tokens_seen": 10144972800, "step": 19350 }, { "epoch": 0.9386461042306969, "grad_norm": 0.240234375, "learning_rate": 5.127152099137028e-07, "loss": 2.765, "num_input_tokens_seen": 10147594240, "step": 19355 }, { "epoch": 0.9388885857869436, "grad_norm": 0.248046875, "learning_rate": 5.086839094361557e-07, "loss": 2.7745, "num_input_tokens_seen": 10150215680, "step": 19360 }, { "epoch": 0.9391310673431902, "grad_norm": 0.2392578125, "learning_rate": 5.046683570280708e-07, "loss": 2.7613, "num_input_tokens_seen": 10152837120, "step": 19365 }, { "epoch": 0.9393735488994368, "grad_norm": 0.2412109375, "learning_rate": 5.006685552714802e-07, "loss": 2.7718, "num_input_tokens_seen": 10155458560, "step": 19370 }, { "epoch": 0.9396160304556834, "grad_norm": 0.2353515625, "learning_rate": 4.966845067382708e-07, "loss": 2.761, "num_input_tokens_seen": 10158080000, "step": 19375 }, { "epoch": 0.93985851201193, "grad_norm": 0.2412109375, "learning_rate": 4.927162139902186e-07, "loss": 2.7875, "num_input_tokens_seen": 10160701440, "step": 19380 }, { "epoch": 0.9401009935681767, "grad_norm": 0.240234375, "learning_rate": 4.887636795789574e-07, "loss": 2.7785, "num_input_tokens_seen": 10163322880, "step": 19385 }, { "epoch": 0.9403434751244234, "grad_norm": 0.240234375, "learning_rate": 4.848269060459904e-07, "loss": 2.7833, "num_input_tokens_seen": 10165944320, "step": 19390 }, { "epoch": 0.94058595668067, "grad_norm": 0.248046875, "learning_rate": 4.809058959226925e-07, "loss": 2.781, "num_input_tokens_seen": 10168565760, "step": 19395 }, { "epoch": 0.9408284382369166, "grad_norm": 0.251953125, "learning_rate": 4.770006517302917e-07, "loss": 2.7736, "num_input_tokens_seen": 10171187200, "step": 19400 }, { "epoch": 0.9410709197931633, "grad_norm": 0.2451171875, "learning_rate": 4.7311117597989007e-07, "loss": 2.7747, "num_input_tokens_seen": 10173808640, "step": 19405 }, { "epoch": 0.9413134013494099, "grad_norm": 0.2421875, "learning_rate": 4.692374711724401e-07, "loss": 2.7705, "num_input_tokens_seen": 10176430080, "step": 19410 }, { "epoch": 0.9415558829056565, "grad_norm": 0.2412109375, "learning_rate": 4.6537953979876035e-07, "loss": 2.7655, "num_input_tokens_seen": 10179051520, "step": 19415 }, { "epoch": 0.9417983644619031, "grad_norm": 0.25, "learning_rate": 4.615373843395249e-07, "loss": 2.7514, "num_input_tokens_seen": 10181672960, "step": 19420 }, { "epoch": 0.9420408460181497, "grad_norm": 0.2412109375, "learning_rate": 4.577110072652657e-07, "loss": 2.7658, "num_input_tokens_seen": 10184294400, "step": 19425 }, { "epoch": 0.9422833275743964, "grad_norm": 0.248046875, "learning_rate": 4.5390041103636484e-07, "loss": 2.7706, "num_input_tokens_seen": 10186915840, "step": 19430 }, { "epoch": 0.942525809130643, "grad_norm": 0.2373046875, "learning_rate": 4.501055981030594e-07, "loss": 2.7677, "num_input_tokens_seen": 10189537280, "step": 19435 }, { "epoch": 0.9427682906868896, "grad_norm": 0.2421875, "learning_rate": 4.463265709054365e-07, "loss": 2.7691, "num_input_tokens_seen": 10192158720, "step": 19440 }, { "epoch": 0.9430107722431362, "grad_norm": 0.2421875, "learning_rate": 4.425633318734357e-07, "loss": 2.7725, "num_input_tokens_seen": 10194780160, "step": 19445 }, { "epoch": 0.9432532537993829, "grad_norm": 0.244140625, "learning_rate": 4.3881588342684357e-07, "loss": 2.7773, "num_input_tokens_seen": 10197401600, "step": 19450 }, { "epoch": 0.9434957353556295, "grad_norm": 0.2431640625, "learning_rate": 4.350842279752937e-07, "loss": 2.7832, "num_input_tokens_seen": 10200023040, "step": 19455 }, { "epoch": 0.9437382169118761, "grad_norm": 0.2392578125, "learning_rate": 4.3136836791826395e-07, "loss": 2.7669, "num_input_tokens_seen": 10202644480, "step": 19460 }, { "epoch": 0.9439806984681227, "grad_norm": 0.2451171875, "learning_rate": 4.276683056450737e-07, "loss": 2.7769, "num_input_tokens_seen": 10205265920, "step": 19465 }, { "epoch": 0.9442231800243694, "grad_norm": 0.23828125, "learning_rate": 4.239840435348863e-07, "loss": 2.7749, "num_input_tokens_seen": 10207887360, "step": 19470 }, { "epoch": 0.9444656615806161, "grad_norm": 0.2392578125, "learning_rate": 4.203155839567069e-07, "loss": 2.7921, "num_input_tokens_seen": 10210508800, "step": 19475 }, { "epoch": 0.9447081431368627, "grad_norm": 0.2421875, "learning_rate": 4.166629292693791e-07, "loss": 2.7716, "num_input_tokens_seen": 10213130240, "step": 19480 }, { "epoch": 0.9449506246931093, "grad_norm": 0.236328125, "learning_rate": 4.1302608182157697e-07, "loss": 2.7795, "num_input_tokens_seen": 10215751680, "step": 19485 }, { "epoch": 0.9451931062493559, "grad_norm": 0.2431640625, "learning_rate": 4.0940504395182435e-07, "loss": 2.7746, "num_input_tokens_seen": 10218373120, "step": 19490 }, { "epoch": 0.9454355878056026, "grad_norm": 0.244140625, "learning_rate": 4.0579981798846423e-07, "loss": 2.769, "num_input_tokens_seen": 10220994560, "step": 19495 }, { "epoch": 0.9456780693618492, "grad_norm": 0.2470703125, "learning_rate": 4.0221040624968397e-07, "loss": 2.7759, "num_input_tokens_seen": 10223616000, "step": 19500 }, { "epoch": 0.9456780693618492, "eval_accuracy": 0.4559973945611464, "eval_loss": 2.7415828704833984, "eval_runtime": 5.8729, "eval_samples_per_second": 51.082, "eval_steps_per_second": 6.47, "num_input_tokens_seen": 10223616000, "step": 19500 }, { "epoch": 0.9459205509180958, "grad_norm": 0.244140625, "learning_rate": 3.986368110434929e-07, "loss": 2.7689, "num_input_tokens_seen": 10226237440, "step": 19505 }, { "epoch": 0.9461630324743424, "grad_norm": 0.24609375, "learning_rate": 3.950790346677391e-07, "loss": 2.7671, "num_input_tokens_seen": 10228858880, "step": 19510 }, { "epoch": 0.946405514030589, "grad_norm": 0.244140625, "learning_rate": 3.915370794100953e-07, "loss": 2.7709, "num_input_tokens_seen": 10231480320, "step": 19515 }, { "epoch": 0.9466479955868357, "grad_norm": 0.240234375, "learning_rate": 3.880109475480592e-07, "loss": 2.7691, "num_input_tokens_seen": 10234101760, "step": 19520 }, { "epoch": 0.9468904771430823, "grad_norm": 0.236328125, "learning_rate": 3.84500641348956e-07, "loss": 2.7686, "num_input_tokens_seen": 10236723200, "step": 19525 }, { "epoch": 0.9471329586993289, "grad_norm": 0.236328125, "learning_rate": 3.81006163069933e-07, "loss": 2.7714, "num_input_tokens_seen": 10239344640, "step": 19530 }, { "epoch": 0.9473754402555755, "grad_norm": 0.2353515625, "learning_rate": 3.775275149579649e-07, "loss": 2.7857, "num_input_tokens_seen": 10241966080, "step": 19535 }, { "epoch": 0.9476179218118221, "grad_norm": 0.2470703125, "learning_rate": 3.740646992498431e-07, "loss": 2.7812, "num_input_tokens_seen": 10244587520, "step": 19540 }, { "epoch": 0.9478604033680689, "grad_norm": 0.2490234375, "learning_rate": 3.706177181721782e-07, "loss": 2.7824, "num_input_tokens_seen": 10247208960, "step": 19545 }, { "epoch": 0.9481028849243155, "grad_norm": 0.2421875, "learning_rate": 3.6718657394140264e-07, "loss": 2.7808, "num_input_tokens_seen": 10249830400, "step": 19550 }, { "epoch": 0.9483453664805621, "grad_norm": 0.2392578125, "learning_rate": 3.6377126876376286e-07, "loss": 2.7664, "num_input_tokens_seen": 10252451840, "step": 19555 }, { "epoch": 0.9485878480368087, "grad_norm": 0.248046875, "learning_rate": 3.6037180483532163e-07, "loss": 2.7897, "num_input_tokens_seen": 10255073280, "step": 19560 }, { "epoch": 0.9488303295930554, "grad_norm": 0.236328125, "learning_rate": 3.569881843419526e-07, "loss": 2.779, "num_input_tokens_seen": 10257694720, "step": 19565 }, { "epoch": 0.949072811149302, "grad_norm": 0.2412109375, "learning_rate": 3.5362040945934873e-07, "loss": 2.7659, "num_input_tokens_seen": 10260316160, "step": 19570 }, { "epoch": 0.9493152927055486, "grad_norm": 0.2431640625, "learning_rate": 3.5026848235300836e-07, "loss": 2.7726, "num_input_tokens_seen": 10262937600, "step": 19575 }, { "epoch": 0.9495577742617952, "grad_norm": 0.240234375, "learning_rate": 3.4693240517824076e-07, "loss": 2.7706, "num_input_tokens_seen": 10265559040, "step": 19580 }, { "epoch": 0.9498002558180418, "grad_norm": 0.2451171875, "learning_rate": 3.4361218008016893e-07, "loss": 2.7792, "num_input_tokens_seen": 10268180480, "step": 19585 }, { "epoch": 0.9500427373742885, "grad_norm": 0.25, "learning_rate": 3.4030780919371284e-07, "loss": 2.7616, "num_input_tokens_seen": 10270801920, "step": 19590 }, { "epoch": 0.9502852189305351, "grad_norm": 0.244140625, "learning_rate": 3.3701929464360905e-07, "loss": 2.7807, "num_input_tokens_seen": 10273423360, "step": 19595 }, { "epoch": 0.9505277004867817, "grad_norm": 0.244140625, "learning_rate": 3.3374663854438825e-07, "loss": 2.7795, "num_input_tokens_seen": 10276044800, "step": 19600 }, { "epoch": 0.9507701820430283, "grad_norm": 0.23828125, "learning_rate": 3.304898430003894e-07, "loss": 2.7736, "num_input_tokens_seen": 10278666240, "step": 19605 }, { "epoch": 0.9510126635992749, "grad_norm": 0.248046875, "learning_rate": 3.272489101057541e-07, "loss": 2.7625, "num_input_tokens_seen": 10281287680, "step": 19610 }, { "epoch": 0.9512551451555216, "grad_norm": 0.23828125, "learning_rate": 3.2402384194442635e-07, "loss": 2.7683, "num_input_tokens_seen": 10283909120, "step": 19615 }, { "epoch": 0.9514976267117682, "grad_norm": 0.23828125, "learning_rate": 3.2081464059013635e-07, "loss": 2.7761, "num_input_tokens_seen": 10286530560, "step": 19620 }, { "epoch": 0.9517401082680149, "grad_norm": 0.240234375, "learning_rate": 3.1762130810642497e-07, "loss": 2.7908, "num_input_tokens_seen": 10289152000, "step": 19625 }, { "epoch": 0.9519825898242615, "grad_norm": 0.2421875, "learning_rate": 3.144438465466276e-07, "loss": 2.7889, "num_input_tokens_seen": 10291773440, "step": 19630 }, { "epoch": 0.9522250713805082, "grad_norm": 0.2353515625, "learning_rate": 3.1128225795386545e-07, "loss": 2.7676, "num_input_tokens_seen": 10294394880, "step": 19635 }, { "epoch": 0.9524675529367548, "grad_norm": 0.240234375, "learning_rate": 3.0813654436106787e-07, "loss": 2.7925, "num_input_tokens_seen": 10297016320, "step": 19640 }, { "epoch": 0.9527100344930014, "grad_norm": 0.24609375, "learning_rate": 3.0500670779094186e-07, "loss": 2.7682, "num_input_tokens_seen": 10299637760, "step": 19645 }, { "epoch": 0.952952516049248, "grad_norm": 0.240234375, "learning_rate": 3.0189275025599706e-07, "loss": 2.7753, "num_input_tokens_seen": 10302259200, "step": 19650 }, { "epoch": 0.9531949976054946, "grad_norm": 0.23828125, "learning_rate": 2.9879467375852065e-07, "loss": 2.7743, "num_input_tokens_seen": 10304880640, "step": 19655 }, { "epoch": 0.9534374791617413, "grad_norm": 0.24609375, "learning_rate": 2.957124802906025e-07, "loss": 2.7639, "num_input_tokens_seen": 10307502080, "step": 19660 }, { "epoch": 0.9536799607179879, "grad_norm": 0.2392578125, "learning_rate": 2.926461718341073e-07, "loss": 2.7674, "num_input_tokens_seen": 10310123520, "step": 19665 }, { "epoch": 0.9539224422742345, "grad_norm": 0.244140625, "learning_rate": 2.895957503606939e-07, "loss": 2.7662, "num_input_tokens_seen": 10312744960, "step": 19670 }, { "epoch": 0.9541649238304811, "grad_norm": 0.244140625, "learning_rate": 2.8656121783180447e-07, "loss": 2.7779, "num_input_tokens_seen": 10315366400, "step": 19675 }, { "epoch": 0.9544074053867277, "grad_norm": 0.234375, "learning_rate": 2.835425761986532e-07, "loss": 2.7761, "num_input_tokens_seen": 10317987840, "step": 19680 }, { "epoch": 0.9546498869429744, "grad_norm": 0.2373046875, "learning_rate": 2.805398274022514e-07, "loss": 2.7715, "num_input_tokens_seen": 10320609280, "step": 19685 }, { "epoch": 0.954892368499221, "grad_norm": 0.25, "learning_rate": 2.775529733733878e-07, "loss": 2.7503, "num_input_tokens_seen": 10323230720, "step": 19690 }, { "epoch": 0.9551348500554676, "grad_norm": 0.2392578125, "learning_rate": 2.7458201603262344e-07, "loss": 2.7773, "num_input_tokens_seen": 10325852160, "step": 19695 }, { "epoch": 0.9553773316117142, "grad_norm": 0.234375, "learning_rate": 2.7162695729030517e-07, "loss": 2.7728, "num_input_tokens_seen": 10328473600, "step": 19700 }, { "epoch": 0.955619813167961, "grad_norm": 0.2353515625, "learning_rate": 2.6868779904655475e-07, "loss": 2.7788, "num_input_tokens_seen": 10331095040, "step": 19705 }, { "epoch": 0.9558622947242076, "grad_norm": 0.240234375, "learning_rate": 2.657645431912714e-07, "loss": 2.7761, "num_input_tokens_seen": 10333716480, "step": 19710 }, { "epoch": 0.9561047762804542, "grad_norm": 0.23828125, "learning_rate": 2.628571916041184e-07, "loss": 2.7804, "num_input_tokens_seen": 10336337920, "step": 19715 }, { "epoch": 0.9563472578367008, "grad_norm": 0.2412109375, "learning_rate": 2.5996574615455015e-07, "loss": 2.7756, "num_input_tokens_seen": 10338959360, "step": 19720 }, { "epoch": 0.9565897393929474, "grad_norm": 0.2392578125, "learning_rate": 2.570902087017768e-07, "loss": 2.7901, "num_input_tokens_seen": 10341580800, "step": 19725 }, { "epoch": 0.9568322209491941, "grad_norm": 0.2470703125, "learning_rate": 2.5423058109479427e-07, "loss": 2.7712, "num_input_tokens_seen": 10344202240, "step": 19730 }, { "epoch": 0.9570747025054407, "grad_norm": 0.2373046875, "learning_rate": 2.513868651723539e-07, "loss": 2.7652, "num_input_tokens_seen": 10346823680, "step": 19735 }, { "epoch": 0.9573171840616873, "grad_norm": 0.2431640625, "learning_rate": 2.485590627629902e-07, "loss": 2.7714, "num_input_tokens_seen": 10349445120, "step": 19740 }, { "epoch": 0.9575596656179339, "grad_norm": 0.24609375, "learning_rate": 2.457471756849905e-07, "loss": 2.7796, "num_input_tokens_seen": 10352066560, "step": 19745 }, { "epoch": 0.9578021471741806, "grad_norm": 0.234375, "learning_rate": 2.429512057464195e-07, "loss": 2.7815, "num_input_tokens_seen": 10354688000, "step": 19750 }, { "epoch": 0.9580446287304272, "grad_norm": 0.2373046875, "learning_rate": 2.40171154745103e-07, "loss": 2.7736, "num_input_tokens_seen": 10357309440, "step": 19755 }, { "epoch": 0.9582871102866738, "grad_norm": 0.2392578125, "learning_rate": 2.3740702446863327e-07, "loss": 2.7773, "num_input_tokens_seen": 10359930880, "step": 19760 }, { "epoch": 0.9585295918429204, "grad_norm": 0.24609375, "learning_rate": 2.3465881669435807e-07, "loss": 2.7771, "num_input_tokens_seen": 10362552320, "step": 19765 }, { "epoch": 0.958772073399167, "grad_norm": 0.2353515625, "learning_rate": 2.3192653318939994e-07, "loss": 2.7682, "num_input_tokens_seen": 10365173760, "step": 19770 }, { "epoch": 0.9590145549554137, "grad_norm": 0.2412109375, "learning_rate": 2.2921017571062575e-07, "loss": 2.7875, "num_input_tokens_seen": 10367795200, "step": 19775 }, { "epoch": 0.9592570365116603, "grad_norm": 0.2451171875, "learning_rate": 2.2650974600467444e-07, "loss": 2.7694, "num_input_tokens_seen": 10370416640, "step": 19780 }, { "epoch": 0.959499518067907, "grad_norm": 0.251953125, "learning_rate": 2.238252458079404e-07, "loss": 2.7656, "num_input_tokens_seen": 10373038080, "step": 19785 }, { "epoch": 0.9597419996241536, "grad_norm": 0.2421875, "learning_rate": 2.2115667684657337e-07, "loss": 2.7817, "num_input_tokens_seen": 10375659520, "step": 19790 }, { "epoch": 0.9599844811804003, "grad_norm": 0.2412109375, "learning_rate": 2.1850404083647857e-07, "loss": 2.7811, "num_input_tokens_seen": 10378280960, "step": 19795 }, { "epoch": 0.9602269627366469, "grad_norm": 0.248046875, "learning_rate": 2.158673394833166e-07, "loss": 2.7722, "num_input_tokens_seen": 10380902400, "step": 19800 }, { "epoch": 0.9602269627366469, "eval_accuracy": 0.4560039081582804, "eval_loss": 2.741548776626587, "eval_runtime": 5.9407, "eval_samples_per_second": 50.499, "eval_steps_per_second": 6.397, "num_input_tokens_seen": 10380902400, "step": 19800 }, { "epoch": 0.9604694442928935, "grad_norm": 0.2412109375, "learning_rate": 2.1324657448250628e-07, "loss": 2.7782, "num_input_tokens_seen": 10383523840, "step": 19805 }, { "epoch": 0.9607119258491401, "grad_norm": 0.2412109375, "learning_rate": 2.106417475192135e-07, "loss": 2.7799, "num_input_tokens_seen": 10386145280, "step": 19810 }, { "epoch": 0.9609544074053867, "grad_norm": 0.236328125, "learning_rate": 2.0805286026835958e-07, "loss": 2.7777, "num_input_tokens_seen": 10388766720, "step": 19815 }, { "epoch": 0.9611968889616334, "grad_norm": 0.255859375, "learning_rate": 2.054799143946129e-07, "loss": 2.7767, "num_input_tokens_seen": 10391388160, "step": 19820 }, { "epoch": 0.96143937051788, "grad_norm": 0.2412109375, "learning_rate": 2.029229115523973e-07, "loss": 2.7986, "num_input_tokens_seen": 10394009600, "step": 19825 }, { "epoch": 0.9616818520741266, "grad_norm": 0.2421875, "learning_rate": 2.003818533858809e-07, "loss": 2.7712, "num_input_tokens_seen": 10396631040, "step": 19830 }, { "epoch": 0.9619243336303732, "grad_norm": 0.2392578125, "learning_rate": 1.9785674152897616e-07, "loss": 2.7747, "num_input_tokens_seen": 10399252480, "step": 19835 }, { "epoch": 0.9621668151866198, "grad_norm": 0.2353515625, "learning_rate": 1.9534757760534817e-07, "loss": 2.7692, "num_input_tokens_seen": 10401873920, "step": 19840 }, { "epoch": 0.9624092967428665, "grad_norm": 0.2431640625, "learning_rate": 1.9285436322840633e-07, "loss": 2.762, "num_input_tokens_seen": 10404495360, "step": 19845 }, { "epoch": 0.9626517782991131, "grad_norm": 0.2431640625, "learning_rate": 1.9037710000130438e-07, "loss": 2.7655, "num_input_tokens_seen": 10407116800, "step": 19850 }, { "epoch": 0.9628942598553597, "grad_norm": 0.24609375, "learning_rate": 1.87915789516932e-07, "loss": 2.7766, "num_input_tokens_seen": 10409738240, "step": 19855 }, { "epoch": 0.9631367414116064, "grad_norm": 0.2421875, "learning_rate": 1.8547043335793435e-07, "loss": 2.7652, "num_input_tokens_seen": 10412359680, "step": 19860 }, { "epoch": 0.963379222967853, "grad_norm": 0.248046875, "learning_rate": 1.830410330966842e-07, "loss": 2.779, "num_input_tokens_seen": 10414981120, "step": 19865 }, { "epoch": 0.9636217045240997, "grad_norm": 0.2421875, "learning_rate": 1.8062759029530696e-07, "loss": 2.7745, "num_input_tokens_seen": 10417602560, "step": 19870 }, { "epoch": 0.9638641860803463, "grad_norm": 0.2373046875, "learning_rate": 1.7823010650565852e-07, "loss": 2.7718, "num_input_tokens_seen": 10420224000, "step": 19875 }, { "epoch": 0.9641066676365929, "grad_norm": 0.25, "learning_rate": 1.758485832693335e-07, "loss": 2.7801, "num_input_tokens_seen": 10422845440, "step": 19880 }, { "epoch": 0.9643491491928395, "grad_norm": 0.2421875, "learning_rate": 1.7348302211767087e-07, "loss": 2.7739, "num_input_tokens_seen": 10425466880, "step": 19885 }, { "epoch": 0.9645916307490862, "grad_norm": 0.248046875, "learning_rate": 1.7113342457173996e-07, "loss": 2.7781, "num_input_tokens_seen": 10428088320, "step": 19890 }, { "epoch": 0.9648341123053328, "grad_norm": 0.2412109375, "learning_rate": 1.6879979214234898e-07, "loss": 2.7845, "num_input_tokens_seen": 10430709760, "step": 19895 }, { "epoch": 0.9650765938615794, "grad_norm": 0.2451171875, "learning_rate": 1.664821263300309e-07, "loss": 2.7658, "num_input_tokens_seen": 10433331200, "step": 19900 }, { "epoch": 0.965319075417826, "grad_norm": 0.244140625, "learning_rate": 1.641804286250659e-07, "loss": 2.753, "num_input_tokens_seen": 10435952640, "step": 19905 }, { "epoch": 0.9655615569740726, "grad_norm": 0.2421875, "learning_rate": 1.6189470050745615e-07, "loss": 2.7827, "num_input_tokens_seen": 10438574080, "step": 19910 }, { "epoch": 0.9658040385303193, "grad_norm": 0.2431640625, "learning_rate": 1.596249434469399e-07, "loss": 2.7746, "num_input_tokens_seen": 10441195520, "step": 19915 }, { "epoch": 0.9660465200865659, "grad_norm": 0.2373046875, "learning_rate": 1.573711589029858e-07, "loss": 2.7763, "num_input_tokens_seen": 10443816960, "step": 19920 }, { "epoch": 0.9662890016428125, "grad_norm": 0.2412109375, "learning_rate": 1.5513334832479575e-07, "loss": 2.7699, "num_input_tokens_seen": 10446438400, "step": 19925 }, { "epoch": 0.9665314831990591, "grad_norm": 0.2421875, "learning_rate": 1.5291151315129093e-07, "loss": 2.7926, "num_input_tokens_seen": 10449059840, "step": 19930 }, { "epoch": 0.9667739647553057, "grad_norm": 0.240234375, "learning_rate": 1.5070565481112297e-07, "loss": 2.7908, "num_input_tokens_seen": 10451681280, "step": 19935 }, { "epoch": 0.9670164463115525, "grad_norm": 0.2421875, "learning_rate": 1.4851577472267952e-07, "loss": 2.7765, "num_input_tokens_seen": 10454302720, "step": 19940 }, { "epoch": 0.9672589278677991, "grad_norm": 0.2421875, "learning_rate": 1.46341874294062e-07, "loss": 2.7696, "num_input_tokens_seen": 10456924160, "step": 19945 }, { "epoch": 0.9675014094240457, "grad_norm": 0.2431640625, "learning_rate": 1.441839549231022e-07, "loss": 2.7869, "num_input_tokens_seen": 10459545600, "step": 19950 }, { "epoch": 0.9677438909802923, "grad_norm": 0.2431640625, "learning_rate": 1.4204201799735973e-07, "loss": 2.7702, "num_input_tokens_seen": 10462167040, "step": 19955 }, { "epoch": 0.967986372536539, "grad_norm": 0.2490234375, "learning_rate": 1.399160648941078e-07, "loss": 2.7836, "num_input_tokens_seen": 10464788480, "step": 19960 }, { "epoch": 0.9682288540927856, "grad_norm": 0.2431640625, "learning_rate": 1.3780609698035296e-07, "loss": 2.762, "num_input_tokens_seen": 10467409920, "step": 19965 }, { "epoch": 0.9684713356490322, "grad_norm": 0.2353515625, "learning_rate": 1.357121156128127e-07, "loss": 2.7724, "num_input_tokens_seen": 10470031360, "step": 19970 }, { "epoch": 0.9687138172052788, "grad_norm": 0.236328125, "learning_rate": 1.3363412213793226e-07, "loss": 2.7693, "num_input_tokens_seen": 10472652800, "step": 19975 }, { "epoch": 0.9689562987615254, "grad_norm": 0.2421875, "learning_rate": 1.3157211789187885e-07, "loss": 2.7787, "num_input_tokens_seen": 10475274240, "step": 19980 }, { "epoch": 0.9691987803177721, "grad_norm": 0.2373046875, "learning_rate": 1.29526104200528e-07, "loss": 2.7758, "num_input_tokens_seen": 10477895680, "step": 19985 }, { "epoch": 0.9694412618740187, "grad_norm": 0.2431640625, "learning_rate": 1.2749608237948296e-07, "loss": 2.7705, "num_input_tokens_seen": 10480517120, "step": 19990 }, { "epoch": 0.9696837434302653, "grad_norm": 0.2412109375, "learning_rate": 1.2548205373405508e-07, "loss": 2.7784, "num_input_tokens_seen": 10483138560, "step": 19995 }, { "epoch": 0.9699262249865119, "grad_norm": 0.2451171875, "learning_rate": 1.2348401955928623e-07, "loss": 2.7826, "num_input_tokens_seen": 10485760000, "step": 20000 }, { "epoch": 0.9701687065427586, "grad_norm": 0.2373046875, "learning_rate": 1.215019811399154e-07, "loss": 2.7817, "num_input_tokens_seen": 10488381440, "step": 20005 }, { "epoch": 0.9704111880990052, "grad_norm": 0.2421875, "learning_rate": 1.1953593975041477e-07, "loss": 2.771, "num_input_tokens_seen": 10491002880, "step": 20010 }, { "epoch": 0.9706536696552518, "grad_norm": 0.244140625, "learning_rate": 1.1758589665495368e-07, "loss": 2.7607, "num_input_tokens_seen": 10493624320, "step": 20015 }, { "epoch": 0.9708961512114985, "grad_norm": 0.23828125, "learning_rate": 1.1565185310742632e-07, "loss": 2.7761, "num_input_tokens_seen": 10496245760, "step": 20020 }, { "epoch": 0.9711386327677451, "grad_norm": 0.2431640625, "learning_rate": 1.1373381035143239e-07, "loss": 2.7788, "num_input_tokens_seen": 10498867200, "step": 20025 }, { "epoch": 0.9713811143239918, "grad_norm": 0.2431640625, "learning_rate": 1.1183176962028808e-07, "loss": 2.7646, "num_input_tokens_seen": 10501488640, "step": 20030 }, { "epoch": 0.9716235958802384, "grad_norm": 0.2490234375, "learning_rate": 1.0994573213701509e-07, "loss": 2.7843, "num_input_tokens_seen": 10504110080, "step": 20035 }, { "epoch": 0.971866077436485, "grad_norm": 0.232421875, "learning_rate": 1.0807569911434611e-07, "loss": 2.781, "num_input_tokens_seen": 10506731520, "step": 20040 }, { "epoch": 0.9721085589927316, "grad_norm": 0.2392578125, "learning_rate": 1.0622167175472763e-07, "loss": 2.7677, "num_input_tokens_seen": 10509352960, "step": 20045 }, { "epoch": 0.9723510405489783, "grad_norm": 0.2421875, "learning_rate": 1.0438365125031157e-07, "loss": 2.7833, "num_input_tokens_seen": 10511974400, "step": 20050 }, { "epoch": 0.9725935221052249, "grad_norm": 0.2421875, "learning_rate": 1.0256163878295255e-07, "loss": 2.7805, "num_input_tokens_seen": 10514595840, "step": 20055 }, { "epoch": 0.9728360036614715, "grad_norm": 0.2431640625, "learning_rate": 1.0075563552421896e-07, "loss": 2.7851, "num_input_tokens_seen": 10517217280, "step": 20060 }, { "epoch": 0.9730784852177181, "grad_norm": 0.240234375, "learning_rate": 9.89656426353791e-08, "loss": 2.7839, "num_input_tokens_seen": 10519838720, "step": 20065 }, { "epoch": 0.9733209667739647, "grad_norm": 0.25, "learning_rate": 9.71916612674123e-08, "loss": 2.7593, "num_input_tokens_seen": 10522460160, "step": 20070 }, { "epoch": 0.9735634483302114, "grad_norm": 0.2412109375, "learning_rate": 9.543369256100055e-08, "loss": 2.7544, "num_input_tokens_seen": 10525081600, "step": 20075 }, { "epoch": 0.973805929886458, "grad_norm": 0.2451171875, "learning_rate": 9.369173764652572e-08, "loss": 2.7705, "num_input_tokens_seen": 10527703040, "step": 20080 }, { "epoch": 0.9740484114427046, "grad_norm": 0.23828125, "learning_rate": 9.196579764407797e-08, "loss": 2.7658, "num_input_tokens_seen": 10530324480, "step": 20085 }, { "epoch": 0.9742908929989512, "grad_norm": 0.2412109375, "learning_rate": 9.025587366344456e-08, "loss": 2.7833, "num_input_tokens_seen": 10532945920, "step": 20090 }, { "epoch": 0.9745333745551978, "grad_norm": 0.24609375, "learning_rate": 8.856196680412099e-08, "loss": 2.7785, "num_input_tokens_seen": 10535567360, "step": 20095 }, { "epoch": 0.9747758561114446, "grad_norm": 0.255859375, "learning_rate": 8.688407815529709e-08, "loss": 2.7764, "num_input_tokens_seen": 10538188800, "step": 20100 }, { "epoch": 0.9747758561114446, "eval_accuracy": 0.4559745969711773, "eval_loss": 2.741562604904175, "eval_runtime": 5.908, "eval_samples_per_second": 50.778, "eval_steps_per_second": 6.432, "num_input_tokens_seen": 10538188800, "step": 20100 }, { "epoch": 0.9750183376676912, "grad_norm": 0.2421875, "learning_rate": 8.522220879586818e-08, "loss": 2.771, "num_input_tokens_seen": 10540810240, "step": 20105 }, { "epoch": 0.9752608192239378, "grad_norm": 0.236328125, "learning_rate": 8.357635979442668e-08, "loss": 2.7807, "num_input_tokens_seen": 10543431680, "step": 20110 }, { "epoch": 0.9755033007801844, "grad_norm": 0.248046875, "learning_rate": 8.194653220926219e-08, "loss": 2.7655, "num_input_tokens_seen": 10546053120, "step": 20115 }, { "epoch": 0.9757457823364311, "grad_norm": 0.2451171875, "learning_rate": 8.033272708836414e-08, "loss": 2.7725, "num_input_tokens_seen": 10548674560, "step": 20120 }, { "epoch": 0.9759882638926777, "grad_norm": 0.25, "learning_rate": 7.873494546941917e-08, "loss": 2.7939, "num_input_tokens_seen": 10551296000, "step": 20125 }, { "epoch": 0.9762307454489243, "grad_norm": 0.2392578125, "learning_rate": 7.7153188379811e-08, "loss": 2.7654, "num_input_tokens_seen": 10553917440, "step": 20130 }, { "epoch": 0.9764732270051709, "grad_norm": 0.2451171875, "learning_rate": 7.558745683662049e-08, "loss": 2.7739, "num_input_tokens_seen": 10556538880, "step": 20135 }, { "epoch": 0.9767157085614175, "grad_norm": 0.23828125, "learning_rate": 7.40377518466201e-08, "loss": 2.7829, "num_input_tokens_seen": 10559160320, "step": 20140 }, { "epoch": 0.9769581901176642, "grad_norm": 0.248046875, "learning_rate": 7.250407440628493e-08, "loss": 2.7772, "num_input_tokens_seen": 10561781760, "step": 20145 }, { "epoch": 0.9772006716739108, "grad_norm": 0.248046875, "learning_rate": 7.098642550177336e-08, "loss": 2.7752, "num_input_tokens_seen": 10564403200, "step": 20150 }, { "epoch": 0.9774431532301574, "grad_norm": 0.2392578125, "learning_rate": 6.948480610894648e-08, "loss": 2.7711, "num_input_tokens_seen": 10567024640, "step": 20155 }, { "epoch": 0.977685634786404, "grad_norm": 0.23828125, "learning_rate": 6.799921719335411e-08, "loss": 2.7867, "num_input_tokens_seen": 10569646080, "step": 20160 }, { "epoch": 0.9779281163426506, "grad_norm": 0.240234375, "learning_rate": 6.65296597102405e-08, "loss": 2.78, "num_input_tokens_seen": 10572267520, "step": 20165 }, { "epoch": 0.9781705978988973, "grad_norm": 0.2431640625, "learning_rate": 6.507613460453865e-08, "loss": 2.7664, "num_input_tokens_seen": 10574888960, "step": 20170 }, { "epoch": 0.978413079455144, "grad_norm": 0.2392578125, "learning_rate": 6.363864281087595e-08, "loss": 2.7752, "num_input_tokens_seen": 10577510400, "step": 20175 }, { "epoch": 0.9786555610113906, "grad_norm": 0.2421875, "learning_rate": 6.221718525356579e-08, "loss": 2.7806, "num_input_tokens_seen": 10580131840, "step": 20180 }, { "epoch": 0.9788980425676372, "grad_norm": 0.240234375, "learning_rate": 6.081176284661594e-08, "loss": 2.7785, "num_input_tokens_seen": 10582753280, "step": 20185 }, { "epoch": 0.9791405241238839, "grad_norm": 0.23828125, "learning_rate": 5.9422376493722953e-08, "loss": 2.7799, "num_input_tokens_seen": 10585374720, "step": 20190 }, { "epoch": 0.9793830056801305, "grad_norm": 0.244140625, "learning_rate": 5.804902708826665e-08, "loss": 2.7928, "num_input_tokens_seen": 10587996160, "step": 20195 }, { "epoch": 0.9796254872363771, "grad_norm": 0.2421875, "learning_rate": 5.669171551332675e-08, "loss": 2.7681, "num_input_tokens_seen": 10590617600, "step": 20200 }, { "epoch": 0.9798679687926237, "grad_norm": 0.2392578125, "learning_rate": 5.535044264165512e-08, "loss": 2.7884, "num_input_tokens_seen": 10593239040, "step": 20205 }, { "epoch": 0.9801104503488703, "grad_norm": 0.2412109375, "learning_rate": 5.402520933570354e-08, "loss": 2.7794, "num_input_tokens_seen": 10595860480, "step": 20210 }, { "epoch": 0.980352931905117, "grad_norm": 0.23828125, "learning_rate": 5.271601644760426e-08, "loss": 2.7662, "num_input_tokens_seen": 10598481920, "step": 20215 }, { "epoch": 0.9805954134613636, "grad_norm": 0.248046875, "learning_rate": 5.1422864819178354e-08, "loss": 2.7762, "num_input_tokens_seen": 10601103360, "step": 20220 }, { "epoch": 0.9808378950176102, "grad_norm": 0.244140625, "learning_rate": 5.0145755281924556e-08, "loss": 2.7787, "num_input_tokens_seen": 10603724800, "step": 20225 }, { "epoch": 0.9810803765738568, "grad_norm": 0.2431640625, "learning_rate": 4.888468865703877e-08, "loss": 2.7703, "num_input_tokens_seen": 10606346240, "step": 20230 }, { "epoch": 0.9813228581301034, "grad_norm": 0.2392578125, "learning_rate": 4.76396657553918e-08, "loss": 2.7845, "num_input_tokens_seen": 10608967680, "step": 20235 }, { "epoch": 0.9815653396863501, "grad_norm": 0.23828125, "learning_rate": 4.6410687377540505e-08, "loss": 2.7719, "num_input_tokens_seen": 10611589120, "step": 20240 }, { "epoch": 0.9818078212425967, "grad_norm": 0.2490234375, "learning_rate": 4.519775431372775e-08, "loss": 2.779, "num_input_tokens_seen": 10614210560, "step": 20245 }, { "epoch": 0.9820503027988433, "grad_norm": 0.240234375, "learning_rate": 4.40008673438741e-08, "loss": 2.77, "num_input_tokens_seen": 10616832000, "step": 20250 }, { "epoch": 0.98229278435509, "grad_norm": 0.232421875, "learning_rate": 4.282002723758616e-08, "loss": 2.7562, "num_input_tokens_seen": 10619453440, "step": 20255 }, { "epoch": 0.9825352659113367, "grad_norm": 0.244140625, "learning_rate": 4.165523475415656e-08, "loss": 2.7774, "num_input_tokens_seen": 10622074880, "step": 20260 }, { "epoch": 0.9827777474675833, "grad_norm": 0.2490234375, "learning_rate": 4.05064906425473e-08, "loss": 2.7768, "num_input_tokens_seen": 10624696320, "step": 20265 }, { "epoch": 0.9830202290238299, "grad_norm": 0.2333984375, "learning_rate": 3.937379564141197e-08, "loss": 2.7821, "num_input_tokens_seen": 10627317760, "step": 20270 }, { "epoch": 0.9832627105800765, "grad_norm": 0.24609375, "learning_rate": 3.8257150479079073e-08, "loss": 2.7754, "num_input_tokens_seen": 10629939200, "step": 20275 }, { "epoch": 0.9835051921363231, "grad_norm": 0.2333984375, "learning_rate": 3.71565558735576e-08, "loss": 2.7737, "num_input_tokens_seen": 10632560640, "step": 20280 }, { "epoch": 0.9837476736925698, "grad_norm": 0.240234375, "learning_rate": 3.6072012532539776e-08, "loss": 2.7817, "num_input_tokens_seen": 10635182080, "step": 20285 }, { "epoch": 0.9839901552488164, "grad_norm": 0.2431640625, "learning_rate": 3.500352115339001e-08, "loss": 2.7746, "num_input_tokens_seen": 10637803520, "step": 20290 }, { "epoch": 0.984232636805063, "grad_norm": 0.2392578125, "learning_rate": 3.39510824231587e-08, "loss": 2.763, "num_input_tokens_seen": 10640424960, "step": 20295 }, { "epoch": 0.9844751183613096, "grad_norm": 0.24609375, "learning_rate": 3.2914697018565645e-08, "loss": 2.7655, "num_input_tokens_seen": 10643046400, "step": 20300 }, { "epoch": 0.9847175999175563, "grad_norm": 0.240234375, "learning_rate": 3.189436560601944e-08, "loss": 2.7836, "num_input_tokens_seen": 10645667840, "step": 20305 }, { "epoch": 0.9849600814738029, "grad_norm": 0.2373046875, "learning_rate": 3.0890088841595276e-08, "loss": 2.7791, "num_input_tokens_seen": 10648289280, "step": 20310 }, { "epoch": 0.9852025630300495, "grad_norm": 0.2431640625, "learning_rate": 2.990186737104883e-08, "loss": 2.7727, "num_input_tokens_seen": 10650910720, "step": 20315 }, { "epoch": 0.9854450445862961, "grad_norm": 0.244140625, "learning_rate": 2.8929701829816247e-08, "loss": 2.779, "num_input_tokens_seen": 10653532160, "step": 20320 }, { "epoch": 0.9856875261425427, "grad_norm": 0.248046875, "learning_rate": 2.7973592843003048e-08, "loss": 2.7835, "num_input_tokens_seen": 10656153600, "step": 20325 }, { "epoch": 0.9859300076987894, "grad_norm": 0.2431640625, "learning_rate": 2.7033541025395237e-08, "loss": 2.7604, "num_input_tokens_seen": 10658775040, "step": 20330 }, { "epoch": 0.9861724892550361, "grad_norm": 0.2353515625, "learning_rate": 2.610954698145096e-08, "loss": 2.7776, "num_input_tokens_seen": 10661396480, "step": 20335 }, { "epoch": 0.9864149708112827, "grad_norm": 0.24609375, "learning_rate": 2.5201611305303296e-08, "loss": 2.7612, "num_input_tokens_seen": 10664017920, "step": 20340 }, { "epoch": 0.9866574523675293, "grad_norm": 0.2392578125, "learning_rate": 2.4309734580760248e-08, "loss": 2.7758, "num_input_tokens_seen": 10666639360, "step": 20345 }, { "epoch": 0.986899933923776, "grad_norm": 0.2392578125, "learning_rate": 2.343391738130751e-08, "loss": 2.7584, "num_input_tokens_seen": 10669260800, "step": 20350 }, { "epoch": 0.9871424154800226, "grad_norm": 0.2451171875, "learning_rate": 2.2574160270100174e-08, "loss": 2.7869, "num_input_tokens_seen": 10671882240, "step": 20355 }, { "epoch": 0.9873848970362692, "grad_norm": 0.2421875, "learning_rate": 2.1730463799965463e-08, "loss": 2.7762, "num_input_tokens_seen": 10674503680, "step": 20360 }, { "epoch": 0.9876273785925158, "grad_norm": 0.2392578125, "learning_rate": 2.090282851340275e-08, "loss": 2.7877, "num_input_tokens_seen": 10677125120, "step": 20365 }, { "epoch": 0.9878698601487624, "grad_norm": 0.2470703125, "learning_rate": 2.0091254942594674e-08, "loss": 2.7722, "num_input_tokens_seen": 10679746560, "step": 20370 }, { "epoch": 0.9881123417050091, "grad_norm": 0.2353515625, "learning_rate": 1.9295743609382134e-08, "loss": 2.7713, "num_input_tokens_seen": 10682368000, "step": 20375 }, { "epoch": 0.9883548232612557, "grad_norm": 0.2333984375, "learning_rate": 1.8516295025283735e-08, "loss": 2.7799, "num_input_tokens_seen": 10684989440, "step": 20380 }, { "epoch": 0.9885973048175023, "grad_norm": 0.2412109375, "learning_rate": 1.7752909691493013e-08, "loss": 2.7776, "num_input_tokens_seen": 10687610880, "step": 20385 }, { "epoch": 0.9888397863737489, "grad_norm": 0.24609375, "learning_rate": 1.7005588098870094e-08, "loss": 2.7842, "num_input_tokens_seen": 10690232320, "step": 20390 }, { "epoch": 0.9890822679299955, "grad_norm": 0.240234375, "learning_rate": 1.6274330727947262e-08, "loss": 2.7785, "num_input_tokens_seen": 10692853760, "step": 20395 }, { "epoch": 0.9893247494862422, "grad_norm": 0.2353515625, "learning_rate": 1.5559138048928946e-08, "loss": 2.7724, "num_input_tokens_seen": 10695475200, "step": 20400 }, { "epoch": 0.9893247494862422, "eval_accuracy": 0.4559290017912392, "eval_loss": 2.741572380065918, "eval_runtime": 5.8791, "eval_samples_per_second": 51.028, "eval_steps_per_second": 6.464, "num_input_tokens_seen": 10695475200, "step": 20400 }, { "epoch": 0.9895672310424888, "grad_norm": 0.24609375, "learning_rate": 1.4860010521683399e-08, "loss": 2.7642, "num_input_tokens_seen": 10698096640, "step": 20405 }, { "epoch": 0.9898097125987355, "grad_norm": 0.2451171875, "learning_rate": 1.4176948595762129e-08, "loss": 2.7712, "num_input_tokens_seen": 10700718080, "step": 20410 }, { "epoch": 0.9900521941549821, "grad_norm": 0.2470703125, "learning_rate": 1.3509952710372143e-08, "loss": 2.7852, "num_input_tokens_seen": 10703339520, "step": 20415 }, { "epoch": 0.9902946757112288, "grad_norm": 0.25390625, "learning_rate": 1.2859023294395367e-08, "loss": 2.7632, "num_input_tokens_seen": 10705960960, "step": 20420 }, { "epoch": 0.9905371572674754, "grad_norm": 0.23828125, "learning_rate": 1.222416076638866e-08, "loss": 2.7766, "num_input_tokens_seen": 10708582400, "step": 20425 }, { "epoch": 0.990779638823722, "grad_norm": 0.240234375, "learning_rate": 1.1605365534569924e-08, "loss": 2.7627, "num_input_tokens_seen": 10711203840, "step": 20430 }, { "epoch": 0.9910221203799686, "grad_norm": 0.244140625, "learning_rate": 1.1002637996826437e-08, "loss": 2.774, "num_input_tokens_seen": 10713825280, "step": 20435 }, { "epoch": 0.9912646019362152, "grad_norm": 0.2392578125, "learning_rate": 1.041597854071763e-08, "loss": 2.7797, "num_input_tokens_seen": 10716446720, "step": 20440 }, { "epoch": 0.9915070834924619, "grad_norm": 0.2490234375, "learning_rate": 9.845387543469532e-09, "loss": 2.7749, "num_input_tokens_seen": 10719068160, "step": 20445 }, { "epoch": 0.9917495650487085, "grad_norm": 0.234375, "learning_rate": 9.290865371977542e-09, "loss": 2.7768, "num_input_tokens_seen": 10721689600, "step": 20450 }, { "epoch": 0.9919920466049551, "grad_norm": 0.2392578125, "learning_rate": 8.752412382798114e-09, "loss": 2.7748, "num_input_tokens_seen": 10724311040, "step": 20455 }, { "epoch": 0.9922345281612017, "grad_norm": 0.2412109375, "learning_rate": 8.230028922162625e-09, "loss": 2.7646, "num_input_tokens_seen": 10726932480, "step": 20460 }, { "epoch": 0.9924770097174483, "grad_norm": 0.240234375, "learning_rate": 7.723715325966274e-09, "loss": 2.7714, "num_input_tokens_seen": 10729553920, "step": 20465 }, { "epoch": 0.992719491273695, "grad_norm": 0.2392578125, "learning_rate": 7.233471919773638e-09, "loss": 2.7831, "num_input_tokens_seen": 10732175360, "step": 20470 }, { "epoch": 0.9929619728299416, "grad_norm": 0.244140625, "learning_rate": 6.759299018813114e-09, "loss": 2.7739, "num_input_tokens_seen": 10734796800, "step": 20475 }, { "epoch": 0.9932044543861882, "grad_norm": 0.24609375, "learning_rate": 6.301196927979702e-09, "loss": 2.7904, "num_input_tokens_seen": 10737418240, "step": 20480 }, { "epoch": 0.9934469359424348, "grad_norm": 0.244140625, "learning_rate": 5.859165941837774e-09, "loss": 2.7693, "num_input_tokens_seen": 10740039680, "step": 20485 }, { "epoch": 0.9936894174986816, "grad_norm": 0.2392578125, "learning_rate": 5.433206344615527e-09, "loss": 2.7836, "num_input_tokens_seen": 10742661120, "step": 20490 }, { "epoch": 0.9939318990549282, "grad_norm": 0.2431640625, "learning_rate": 5.0233184102049805e-09, "loss": 2.7773, "num_input_tokens_seen": 10745282560, "step": 20495 }, { "epoch": 0.9941743806111748, "grad_norm": 0.25, "learning_rate": 4.6295024021703045e-09, "loss": 2.7832, "num_input_tokens_seen": 10747904000, "step": 20500 }, { "epoch": 0.9944168621674214, "grad_norm": 0.2421875, "learning_rate": 4.251758573736719e-09, "loss": 2.7662, "num_input_tokens_seen": 10750525440, "step": 20505 }, { "epoch": 0.994659343723668, "grad_norm": 0.2421875, "learning_rate": 3.8900871677960415e-09, "loss": 2.781, "num_input_tokens_seen": 10753146880, "step": 20510 }, { "epoch": 0.9949018252799147, "grad_norm": 0.248046875, "learning_rate": 3.5444884169039126e-09, "loss": 2.7735, "num_input_tokens_seen": 10755768320, "step": 20515 }, { "epoch": 0.9951443068361613, "grad_norm": 0.240234375, "learning_rate": 3.2149625432825736e-09, "loss": 2.7776, "num_input_tokens_seen": 10758389760, "step": 20520 }, { "epoch": 0.9953867883924079, "grad_norm": 0.25, "learning_rate": 2.901509758820864e-09, "loss": 2.7735, "num_input_tokens_seen": 10761011200, "step": 20525 }, { "epoch": 0.9956292699486545, "grad_norm": 0.2392578125, "learning_rate": 2.6041302650686717e-09, "loss": 2.7768, "num_input_tokens_seen": 10763632640, "step": 20530 }, { "epoch": 0.9958717515049011, "grad_norm": 0.2451171875, "learning_rate": 2.3228242532452592e-09, "loss": 2.7728, "num_input_tokens_seen": 10766254080, "step": 20535 }, { "epoch": 0.9961142330611478, "grad_norm": 0.25, "learning_rate": 2.0575919042309378e-09, "loss": 2.7801, "num_input_tokens_seen": 10768875520, "step": 20540 }, { "epoch": 0.9963567146173944, "grad_norm": 0.2412109375, "learning_rate": 1.8084333885698413e-09, "loss": 2.7743, "num_input_tokens_seen": 10771496960, "step": 20545 }, { "epoch": 0.996599196173641, "grad_norm": 0.2373046875, "learning_rate": 1.575348866475479e-09, "loss": 2.7709, "num_input_tokens_seen": 10774118400, "step": 20550 }, { "epoch": 0.9968416777298876, "grad_norm": 0.236328125, "learning_rate": 1.3583384878224081e-09, "loss": 2.7763, "num_input_tokens_seen": 10776739840, "step": 20555 }, { "epoch": 0.9970841592861343, "grad_norm": 0.23828125, "learning_rate": 1.1574023921490095e-09, "loss": 2.7781, "num_input_tokens_seen": 10779361280, "step": 20560 }, { "epoch": 0.9973266408423809, "grad_norm": 0.24609375, "learning_rate": 9.725407086574878e-10, "loss": 2.7687, "num_input_tokens_seen": 10781982720, "step": 20565 }, { "epoch": 0.9975691223986276, "grad_norm": 0.2421875, "learning_rate": 8.037535562166465e-10, "loss": 2.7821, "num_input_tokens_seen": 10784604160, "step": 20570 }, { "epoch": 0.9978116039548742, "grad_norm": 0.24609375, "learning_rate": 6.510410433563374e-10, "loss": 2.786, "num_input_tokens_seen": 10787225600, "step": 20575 }, { "epoch": 0.9980540855111208, "grad_norm": 0.2490234375, "learning_rate": 5.144032682730116e-10, "loss": 2.7812, "num_input_tokens_seen": 10789847040, "step": 20580 }, { "epoch": 0.9982965670673675, "grad_norm": 0.2421875, "learning_rate": 3.9384031882416797e-10, "loss": 2.786, "num_input_tokens_seen": 10792468480, "step": 20585 }, { "epoch": 0.9985390486236141, "grad_norm": 0.240234375, "learning_rate": 2.8935227253112927e-10, "loss": 2.7736, "num_input_tokens_seen": 10795089920, "step": 20590 }, { "epoch": 0.9987815301798607, "grad_norm": 0.2392578125, "learning_rate": 2.0093919658459303e-10, "loss": 2.7753, "num_input_tokens_seen": 10797711360, "step": 20595 }, { "epoch": 0.9990240117361073, "grad_norm": 0.2392578125, "learning_rate": 1.2860114783352917e-10, "loss": 2.765, "num_input_tokens_seen": 10800332800, "step": 20600 }, { "epoch": 0.999266493292354, "grad_norm": 0.2373046875, "learning_rate": 7.233817278795574e-11, "loss": 2.7774, "num_input_tokens_seen": 10802954240, "step": 20605 }, { "epoch": 0.9995089748486006, "grad_norm": 0.2392578125, "learning_rate": 3.215030763004112e-11, "loss": 2.7738, "num_input_tokens_seen": 10805575680, "step": 20610 }, { "epoch": 0.9997514564048472, "grad_norm": 0.2431640625, "learning_rate": 8.037578200226214e-12, "loss": 2.7771, "num_input_tokens_seen": 10808197120, "step": 20615 }, { "epoch": 0.9999939379610938, "grad_norm": 0.2353515625, "learning_rate": 0.0, "loss": 2.7621, "num_input_tokens_seen": 10810818560, "step": 20620 }, { "epoch": 0.9999939379610938, "num_input_tokens_seen": 10810818560, "step": 20620, "total_flos": 1.199767969182253e+19, "train_loss": 2.7848671950609445, "train_runtime": 243040.2706, "train_samples_per_second": 21.72, "train_steps_per_second": 0.085 } ], "logging_steps": 5, "max_steps": 20620, "num_input_tokens_seen": 10810818560, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.199767969182253e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }