{ "best_metric": 0.038246750831604004, "best_model_checkpoint": "saves/qwen-8b/lora/sft/checkpoint-20000", "epoch": 2.4024024024024024, "eval_steps": 250, "global_step": 20000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006006006006006006, "grad_norm": 0.1276276707649231, "learning_rate": 9.999999015013712e-05, "loss": 0.7206, "step": 5 }, { "epoch": 0.0012012012012012011, "grad_norm": 0.11029542982578278, "learning_rate": 9.999996052155599e-05, "loss": 0.7604, "step": 10 }, { "epoch": 0.0018018018018018018, "grad_norm": 0.11351778358221054, "learning_rate": 9.999991111422876e-05, "loss": 0.6428, "step": 15 }, { "epoch": 0.0024024024024024023, "grad_norm": 0.08558386564254761, "learning_rate": 9.9999841928175e-05, "loss": 0.6202, "step": 20 }, { "epoch": 0.003003003003003003, "grad_norm": 0.10890918225049973, "learning_rate": 9.999975296342206e-05, "loss": 0.5867, "step": 25 }, { "epoch": 0.0036036036036036037, "grad_norm": 0.1286676824092865, "learning_rate": 9.999964422000514e-05, "loss": 0.7121, "step": 30 }, { "epoch": 0.004204204204204204, "grad_norm": 0.12503540515899658, "learning_rate": 9.999951569796724e-05, "loss": 0.7087, "step": 35 }, { "epoch": 0.004804804804804805, "grad_norm": 0.12578998506069183, "learning_rate": 9.999936739735923e-05, "loss": 0.5874, "step": 40 }, { "epoch": 0.005405405405405406, "grad_norm": 0.1145617812871933, "learning_rate": 9.999919931823976e-05, "loss": 0.5829, "step": 45 }, { "epoch": 0.006006006006006006, "grad_norm": 0.09650918841362, "learning_rate": 9.999901146067532e-05, "loss": 0.5985, "step": 50 }, { "epoch": 0.006606606606606606, "grad_norm": 0.10375821590423584, "learning_rate": 9.999880382474021e-05, "loss": 0.5882, "step": 55 }, { "epoch": 0.007207207207207207, "grad_norm": 0.10667344927787781, "learning_rate": 9.999857641051658e-05, "loss": 0.5403, "step": 60 }, { "epoch": 0.007807807807807808, "grad_norm": 0.1100311204791069, "learning_rate": 9.999832921809437e-05, "loss": 0.6201, "step": 65 }, { "epoch": 0.008408408408408409, "grad_norm": 0.10455172508955002, "learning_rate": 9.999806224757138e-05, "loss": 0.6557, "step": 70 }, { "epoch": 0.009009009009009009, "grad_norm": 0.11311007291078568, "learning_rate": 9.999777549905322e-05, "loss": 0.6266, "step": 75 }, { "epoch": 0.00960960960960961, "grad_norm": 0.1126667782664299, "learning_rate": 9.999746897265331e-05, "loss": 0.5493, "step": 80 }, { "epoch": 0.01021021021021021, "grad_norm": 0.1300688236951828, "learning_rate": 9.999714266849292e-05, "loss": 0.5692, "step": 85 }, { "epoch": 0.010810810810810811, "grad_norm": 0.08558473736047745, "learning_rate": 9.999679658670111e-05, "loss": 0.5598, "step": 90 }, { "epoch": 0.011411411411411412, "grad_norm": 0.10892773419618607, "learning_rate": 9.999643072741478e-05, "loss": 0.6837, "step": 95 }, { "epoch": 0.012012012012012012, "grad_norm": 0.09890025109052658, "learning_rate": 9.999604509077867e-05, "loss": 0.6372, "step": 100 }, { "epoch": 0.012612612612612612, "grad_norm": 0.11107968538999557, "learning_rate": 9.999563967694532e-05, "loss": 0.6601, "step": 105 }, { "epoch": 0.013213213213213212, "grad_norm": 0.10226837545633316, "learning_rate": 9.99952144860751e-05, "loss": 0.5446, "step": 110 }, { "epoch": 0.013813813813813814, "grad_norm": 0.0968766137957573, "learning_rate": 9.999476951833621e-05, "loss": 0.5698, "step": 115 }, { "epoch": 0.014414414414414415, "grad_norm": 0.10840737074613571, "learning_rate": 9.999430477390466e-05, "loss": 0.6412, "step": 120 }, { "epoch": 0.015015015015015015, "grad_norm": 0.10740071535110474, "learning_rate": 9.999382025296431e-05, "loss": 0.6262, "step": 125 }, { "epoch": 0.015615615615615615, "grad_norm": 0.10787483304738998, "learning_rate": 9.99933159557068e-05, "loss": 0.6068, "step": 130 }, { "epoch": 0.016216216216216217, "grad_norm": 0.10583048313856125, "learning_rate": 9.999279188233164e-05, "loss": 0.6107, "step": 135 }, { "epoch": 0.016816816816816817, "grad_norm": 0.11268199980258942, "learning_rate": 9.999224803304612e-05, "loss": 0.5476, "step": 140 }, { "epoch": 0.017417417417417418, "grad_norm": 0.11169681698083878, "learning_rate": 9.999168440806538e-05, "loss": 0.6548, "step": 145 }, { "epoch": 0.018018018018018018, "grad_norm": 0.1366148591041565, "learning_rate": 9.999110100761237e-05, "loss": 0.5648, "step": 150 }, { "epoch": 0.018618618618618618, "grad_norm": 0.11121531575918198, "learning_rate": 9.99904978319179e-05, "loss": 0.5888, "step": 155 }, { "epoch": 0.01921921921921922, "grad_norm": 0.13757368922233582, "learning_rate": 9.998987488122054e-05, "loss": 0.6201, "step": 160 }, { "epoch": 0.01981981981981982, "grad_norm": 0.12323565781116486, "learning_rate": 9.998923215576672e-05, "loss": 0.7153, "step": 165 }, { "epoch": 0.02042042042042042, "grad_norm": 0.10789189487695694, "learning_rate": 9.998856965581069e-05, "loss": 0.61, "step": 170 }, { "epoch": 0.021021021021021023, "grad_norm": 0.10350632667541504, "learning_rate": 9.998788738161454e-05, "loss": 0.5933, "step": 175 }, { "epoch": 0.021621621621621623, "grad_norm": 0.11346318572759628, "learning_rate": 9.998718533344811e-05, "loss": 0.5772, "step": 180 }, { "epoch": 0.022222222222222223, "grad_norm": 0.09915696084499359, "learning_rate": 9.998646351158915e-05, "loss": 0.5585, "step": 185 }, { "epoch": 0.022822822822822823, "grad_norm": 0.09726600348949432, "learning_rate": 9.998572191632319e-05, "loss": 0.6143, "step": 190 }, { "epoch": 0.023423423423423424, "grad_norm": 0.10023459047079086, "learning_rate": 9.998496054794358e-05, "loss": 0.5735, "step": 195 }, { "epoch": 0.024024024024024024, "grad_norm": 0.09451427310705185, "learning_rate": 9.998417940675148e-05, "loss": 0.538, "step": 200 }, { "epoch": 0.024624624624624624, "grad_norm": 0.10174587368965149, "learning_rate": 9.998337849305594e-05, "loss": 0.6243, "step": 205 }, { "epoch": 0.025225225225225224, "grad_norm": 0.08829119056463242, "learning_rate": 9.998255780717375e-05, "loss": 0.553, "step": 210 }, { "epoch": 0.025825825825825825, "grad_norm": 0.10895121842622757, "learning_rate": 9.998171734942956e-05, "loss": 0.5862, "step": 215 }, { "epoch": 0.026426426426426425, "grad_norm": 0.11514592170715332, "learning_rate": 9.998085712015582e-05, "loss": 0.6394, "step": 220 }, { "epoch": 0.02702702702702703, "grad_norm": 0.10050612688064575, "learning_rate": 9.997997711969283e-05, "loss": 0.5616, "step": 225 }, { "epoch": 0.02762762762762763, "grad_norm": 0.11400890350341797, "learning_rate": 9.997907734838868e-05, "loss": 0.5646, "step": 230 }, { "epoch": 0.02822822822822823, "grad_norm": 0.10154697299003601, "learning_rate": 9.997815780659931e-05, "loss": 0.6241, "step": 235 }, { "epoch": 0.02882882882882883, "grad_norm": 0.09731791168451309, "learning_rate": 9.997721849468848e-05, "loss": 0.5987, "step": 240 }, { "epoch": 0.02942942942942943, "grad_norm": 0.09646134078502655, "learning_rate": 9.997625941302775e-05, "loss": 0.6354, "step": 245 }, { "epoch": 0.03003003003003003, "grad_norm": 0.11228446662425995, "learning_rate": 9.99752805619965e-05, "loss": 0.6073, "step": 250 }, { "epoch": 0.03003003003003003, "eval_loss": 0.5788276791572571, "eval_runtime": 35.6288, "eval_samples_per_second": 22.454, "eval_steps_per_second": 5.613, "step": 250 }, { "epoch": 0.03063063063063063, "grad_norm": 0.10133231431245804, "learning_rate": 9.997428194198196e-05, "loss": 0.548, "step": 255 }, { "epoch": 0.03123123123123123, "grad_norm": 0.10743261128664017, "learning_rate": 9.99732635533791e-05, "loss": 0.6082, "step": 260 }, { "epoch": 0.03183183183183183, "grad_norm": 0.10012677311897278, "learning_rate": 9.997222539659085e-05, "loss": 0.6075, "step": 265 }, { "epoch": 0.032432432432432434, "grad_norm": 0.10682181268930435, "learning_rate": 9.997116747202783e-05, "loss": 0.568, "step": 270 }, { "epoch": 0.03303303303303303, "grad_norm": 0.12774191796779633, "learning_rate": 9.997008978010854e-05, "loss": 0.5942, "step": 275 }, { "epoch": 0.033633633633633635, "grad_norm": 0.1001734733581543, "learning_rate": 9.996899232125929e-05, "loss": 0.5681, "step": 280 }, { "epoch": 0.03423423423423423, "grad_norm": 0.09887179732322693, "learning_rate": 9.99678750959142e-05, "loss": 0.6101, "step": 285 }, { "epoch": 0.034834834834834835, "grad_norm": 0.1097242683172226, "learning_rate": 9.996673810451525e-05, "loss": 0.5765, "step": 290 }, { "epoch": 0.03543543543543543, "grad_norm": 0.119800865650177, "learning_rate": 9.996558134751214e-05, "loss": 0.5942, "step": 295 }, { "epoch": 0.036036036036036036, "grad_norm": 0.11146628856658936, "learning_rate": 9.996440482536252e-05, "loss": 0.581, "step": 300 }, { "epoch": 0.03663663663663664, "grad_norm": 0.10826515406370163, "learning_rate": 9.996320853853176e-05, "loss": 0.5716, "step": 305 }, { "epoch": 0.037237237237237236, "grad_norm": 0.10633689910173416, "learning_rate": 9.996199248749308e-05, "loss": 0.5926, "step": 310 }, { "epoch": 0.03783783783783784, "grad_norm": 0.13376714289188385, "learning_rate": 9.996075667272753e-05, "loss": 0.6925, "step": 315 }, { "epoch": 0.03843843843843844, "grad_norm": 0.1069822907447815, "learning_rate": 9.995950109472398e-05, "loss": 0.6107, "step": 320 }, { "epoch": 0.03903903903903904, "grad_norm": 0.09809288382530212, "learning_rate": 9.995822575397908e-05, "loss": 0.5597, "step": 325 }, { "epoch": 0.03963963963963964, "grad_norm": 0.11523476243019104, "learning_rate": 9.995693065099732e-05, "loss": 0.581, "step": 330 }, { "epoch": 0.04024024024024024, "grad_norm": 0.10354334861040115, "learning_rate": 9.995561578629105e-05, "loss": 0.5803, "step": 335 }, { "epoch": 0.04084084084084084, "grad_norm": 0.10784221440553665, "learning_rate": 9.995428116038035e-05, "loss": 0.5614, "step": 340 }, { "epoch": 0.04144144144144144, "grad_norm": 0.10870096832513809, "learning_rate": 9.99529267737932e-05, "loss": 0.636, "step": 345 }, { "epoch": 0.042042042042042045, "grad_norm": 0.10280580818653107, "learning_rate": 9.995155262706534e-05, "loss": 0.6333, "step": 350 }, { "epoch": 0.04264264264264264, "grad_norm": 0.12184490263462067, "learning_rate": 9.995015872074036e-05, "loss": 0.5996, "step": 355 }, { "epoch": 0.043243243243243246, "grad_norm": 0.10824449360370636, "learning_rate": 9.994874505536967e-05, "loss": 0.5483, "step": 360 }, { "epoch": 0.04384384384384384, "grad_norm": 0.1244700476527214, "learning_rate": 9.994731163151244e-05, "loss": 0.524, "step": 365 }, { "epoch": 0.044444444444444446, "grad_norm": 0.10343901067972183, "learning_rate": 9.994585844973574e-05, "loss": 0.5635, "step": 370 }, { "epoch": 0.04504504504504504, "grad_norm": 0.13498111069202423, "learning_rate": 9.994438551061437e-05, "loss": 0.6507, "step": 375 }, { "epoch": 0.04564564564564565, "grad_norm": 0.10724303871393204, "learning_rate": 9.994289281473104e-05, "loss": 0.6062, "step": 380 }, { "epoch": 0.04624624624624624, "grad_norm": 0.1190914437174797, "learning_rate": 9.994138036267617e-05, "loss": 0.606, "step": 385 }, { "epoch": 0.04684684684684685, "grad_norm": 0.10737879574298859, "learning_rate": 9.993984815504809e-05, "loss": 0.5966, "step": 390 }, { "epoch": 0.04744744744744745, "grad_norm": 0.10683471709489822, "learning_rate": 9.993829619245288e-05, "loss": 0.6322, "step": 395 }, { "epoch": 0.04804804804804805, "grad_norm": 0.12433134764432907, "learning_rate": 9.993672447550446e-05, "loss": 0.6776, "step": 400 }, { "epoch": 0.04864864864864865, "grad_norm": 0.10516783595085144, "learning_rate": 9.993513300482457e-05, "loss": 0.5399, "step": 405 }, { "epoch": 0.04924924924924925, "grad_norm": 0.10631490498781204, "learning_rate": 9.993352178104275e-05, "loss": 0.5712, "step": 410 }, { "epoch": 0.04984984984984985, "grad_norm": 0.10908538848161697, "learning_rate": 9.993189080479637e-05, "loss": 0.6396, "step": 415 }, { "epoch": 0.05045045045045045, "grad_norm": 0.13340666890144348, "learning_rate": 9.993024007673059e-05, "loss": 0.6671, "step": 420 }, { "epoch": 0.05105105105105105, "grad_norm": 0.12529441714286804, "learning_rate": 9.992856959749841e-05, "loss": 0.5756, "step": 425 }, { "epoch": 0.05165165165165165, "grad_norm": 0.10432100296020508, "learning_rate": 9.992687936776063e-05, "loss": 0.6271, "step": 430 }, { "epoch": 0.05225225225225225, "grad_norm": 0.11412281543016434, "learning_rate": 9.992516938818585e-05, "loss": 0.5768, "step": 435 }, { "epoch": 0.05285285285285285, "grad_norm": 0.1088799387216568, "learning_rate": 9.992343965945051e-05, "loss": 0.631, "step": 440 }, { "epoch": 0.05345345345345345, "grad_norm": 0.11275801062583923, "learning_rate": 9.992169018223884e-05, "loss": 0.6116, "step": 445 }, { "epoch": 0.05405405405405406, "grad_norm": 0.10910513252019882, "learning_rate": 9.99199209572429e-05, "loss": 0.5581, "step": 450 }, { "epoch": 0.054654654654654654, "grad_norm": 0.09757370501756668, "learning_rate": 9.991813198516252e-05, "loss": 0.5765, "step": 455 }, { "epoch": 0.05525525525525526, "grad_norm": 0.1048395037651062, "learning_rate": 9.991632326670542e-05, "loss": 0.5965, "step": 460 }, { "epoch": 0.055855855855855854, "grad_norm": 0.11976464092731476, "learning_rate": 9.991449480258704e-05, "loss": 0.5559, "step": 465 }, { "epoch": 0.05645645645645646, "grad_norm": 0.10895799845457077, "learning_rate": 9.99126465935307e-05, "loss": 0.6067, "step": 470 }, { "epoch": 0.057057057057057055, "grad_norm": 0.12459875643253326, "learning_rate": 9.99107786402675e-05, "loss": 0.5871, "step": 475 }, { "epoch": 0.05765765765765766, "grad_norm": 0.09745662659406662, "learning_rate": 9.990889094353637e-05, "loss": 0.5714, "step": 480 }, { "epoch": 0.058258258258258255, "grad_norm": 0.09854046255350113, "learning_rate": 9.9906983504084e-05, "loss": 0.5477, "step": 485 }, { "epoch": 0.05885885885885886, "grad_norm": 0.10370152443647385, "learning_rate": 9.990505632266498e-05, "loss": 0.5688, "step": 490 }, { "epoch": 0.05945945945945946, "grad_norm": 0.1205800324678421, "learning_rate": 9.990310940004159e-05, "loss": 0.6448, "step": 495 }, { "epoch": 0.06006006006006006, "grad_norm": 0.12087002396583557, "learning_rate": 9.990114273698406e-05, "loss": 0.5661, "step": 500 }, { "epoch": 0.06006006006006006, "eval_loss": 0.5705558657646179, "eval_runtime": 35.6595, "eval_samples_per_second": 22.434, "eval_steps_per_second": 5.609, "step": 500 }, { "epoch": 0.06066066066066066, "grad_norm": 0.10749085247516632, "learning_rate": 9.989915633427028e-05, "loss": 0.5849, "step": 505 }, { "epoch": 0.06126126126126126, "grad_norm": 0.10628439486026764, "learning_rate": 9.989715019268606e-05, "loss": 0.5916, "step": 510 }, { "epoch": 0.061861861861861864, "grad_norm": 0.10315616428852081, "learning_rate": 9.989512431302497e-05, "loss": 0.5358, "step": 515 }, { "epoch": 0.06246246246246246, "grad_norm": 0.14266809821128845, "learning_rate": 9.989307869608841e-05, "loss": 0.6284, "step": 520 }, { "epoch": 0.06306306306306306, "grad_norm": 0.14430400729179382, "learning_rate": 9.989101334268555e-05, "loss": 0.5701, "step": 525 }, { "epoch": 0.06366366366366366, "grad_norm": 0.12293283641338348, "learning_rate": 9.988892825363343e-05, "loss": 0.5759, "step": 530 }, { "epoch": 0.06426426426426426, "grad_norm": 0.1240479052066803, "learning_rate": 9.988682342975682e-05, "loss": 0.6167, "step": 535 }, { "epoch": 0.06486486486486487, "grad_norm": 0.12589618563652039, "learning_rate": 9.988469887188837e-05, "loss": 0.596, "step": 540 }, { "epoch": 0.06546546546546547, "grad_norm": 0.11583826690912247, "learning_rate": 9.988255458086848e-05, "loss": 0.5746, "step": 545 }, { "epoch": 0.06606606606606606, "grad_norm": 0.12019343674182892, "learning_rate": 9.988039055754538e-05, "loss": 0.5773, "step": 550 }, { "epoch": 0.06666666666666667, "grad_norm": 0.1159575954079628, "learning_rate": 9.987820680277514e-05, "loss": 0.6047, "step": 555 }, { "epoch": 0.06726726726726727, "grad_norm": 0.1157040074467659, "learning_rate": 9.987600331742152e-05, "loss": 0.6517, "step": 560 }, { "epoch": 0.06786786786786787, "grad_norm": 0.11724773794412613, "learning_rate": 9.987378010235625e-05, "loss": 0.6074, "step": 565 }, { "epoch": 0.06846846846846846, "grad_norm": 0.11794179677963257, "learning_rate": 9.987153715845874e-05, "loss": 0.5875, "step": 570 }, { "epoch": 0.06906906906906907, "grad_norm": 0.12677189707756042, "learning_rate": 9.986927448661623e-05, "loss": 0.6014, "step": 575 }, { "epoch": 0.06966966966966967, "grad_norm": 0.1056021898984909, "learning_rate": 9.98669920877238e-05, "loss": 0.5983, "step": 580 }, { "epoch": 0.07027027027027027, "grad_norm": 0.10002574324607849, "learning_rate": 9.98646899626843e-05, "loss": 0.5524, "step": 585 }, { "epoch": 0.07087087087087086, "grad_norm": 0.12917128205299377, "learning_rate": 9.986236811240841e-05, "loss": 0.6182, "step": 590 }, { "epoch": 0.07147147147147147, "grad_norm": 0.11160294711589813, "learning_rate": 9.986002653781457e-05, "loss": 0.5808, "step": 595 }, { "epoch": 0.07207207207207207, "grad_norm": 0.12028118222951889, "learning_rate": 9.985766523982906e-05, "loss": 0.5942, "step": 600 }, { "epoch": 0.07267267267267268, "grad_norm": 0.1268133968114853, "learning_rate": 9.985528421938595e-05, "loss": 0.6005, "step": 605 }, { "epoch": 0.07327327327327328, "grad_norm": 0.12729580700397491, "learning_rate": 9.985288347742713e-05, "loss": 0.5953, "step": 610 }, { "epoch": 0.07387387387387387, "grad_norm": 0.11391682922840118, "learning_rate": 9.985046301490224e-05, "loss": 0.5797, "step": 615 }, { "epoch": 0.07447447447447447, "grad_norm": 0.11333835124969482, "learning_rate": 9.98480228327688e-05, "loss": 0.5463, "step": 620 }, { "epoch": 0.07507507507507508, "grad_norm": 0.11474984139204025, "learning_rate": 9.984556293199204e-05, "loss": 0.661, "step": 625 }, { "epoch": 0.07567567567567568, "grad_norm": 0.11972816288471222, "learning_rate": 9.984308331354505e-05, "loss": 0.5481, "step": 630 }, { "epoch": 0.07627627627627627, "grad_norm": 0.12083037197589874, "learning_rate": 9.984058397840874e-05, "loss": 0.6251, "step": 635 }, { "epoch": 0.07687687687687687, "grad_norm": 0.10288316756486893, "learning_rate": 9.983806492757173e-05, "loss": 0.5038, "step": 640 }, { "epoch": 0.07747747747747748, "grad_norm": 0.11947996914386749, "learning_rate": 9.983552616203054e-05, "loss": 0.5844, "step": 645 }, { "epoch": 0.07807807807807808, "grad_norm": 0.11687720566987991, "learning_rate": 9.983296768278941e-05, "loss": 0.6452, "step": 650 }, { "epoch": 0.07867867867867868, "grad_norm": 0.12168864905834198, "learning_rate": 9.983038949086043e-05, "loss": 0.55, "step": 655 }, { "epoch": 0.07927927927927927, "grad_norm": 0.11078694462776184, "learning_rate": 9.982779158726346e-05, "loss": 0.5744, "step": 660 }, { "epoch": 0.07987987987987988, "grad_norm": 0.12191225588321686, "learning_rate": 9.982517397302617e-05, "loss": 0.6319, "step": 665 }, { "epoch": 0.08048048048048048, "grad_norm": 0.11814052611589432, "learning_rate": 9.982253664918404e-05, "loss": 0.5397, "step": 670 }, { "epoch": 0.08108108108108109, "grad_norm": 0.11518977582454681, "learning_rate": 9.98198796167803e-05, "loss": 0.6731, "step": 675 }, { "epoch": 0.08168168168168168, "grad_norm": 0.10897009819746017, "learning_rate": 9.9817202876866e-05, "loss": 0.5274, "step": 680 }, { "epoch": 0.08228228228228228, "grad_norm": 0.12917213141918182, "learning_rate": 9.981450643050004e-05, "loss": 0.6496, "step": 685 }, { "epoch": 0.08288288288288288, "grad_norm": 0.11243332922458649, "learning_rate": 9.981179027874903e-05, "loss": 0.5684, "step": 690 }, { "epoch": 0.08348348348348349, "grad_norm": 0.1249338760972023, "learning_rate": 9.980905442268742e-05, "loss": 0.589, "step": 695 }, { "epoch": 0.08408408408408409, "grad_norm": 0.13728263974189758, "learning_rate": 9.980629886339745e-05, "loss": 0.5862, "step": 700 }, { "epoch": 0.08468468468468468, "grad_norm": 0.1067405566573143, "learning_rate": 9.980352360196915e-05, "loss": 0.5325, "step": 705 }, { "epoch": 0.08528528528528528, "grad_norm": 0.12034221738576889, "learning_rate": 9.980072863950034e-05, "loss": 0.6633, "step": 710 }, { "epoch": 0.08588588588588589, "grad_norm": 0.12357219308614731, "learning_rate": 9.979791397709665e-05, "loss": 0.5595, "step": 715 }, { "epoch": 0.08648648648648649, "grad_norm": 0.12757450342178345, "learning_rate": 9.979507961587146e-05, "loss": 0.566, "step": 720 }, { "epoch": 0.08708708708708708, "grad_norm": 0.12144903093576431, "learning_rate": 9.979222555694603e-05, "loss": 0.5574, "step": 725 }, { "epoch": 0.08768768768768768, "grad_norm": 0.1184130534529686, "learning_rate": 9.978935180144929e-05, "loss": 0.6473, "step": 730 }, { "epoch": 0.08828828828828829, "grad_norm": 0.1136375144124031, "learning_rate": 9.978645835051807e-05, "loss": 0.56, "step": 735 }, { "epoch": 0.08888888888888889, "grad_norm": 0.11569700390100479, "learning_rate": 9.978354520529694e-05, "loss": 0.6062, "step": 740 }, { "epoch": 0.0894894894894895, "grad_norm": 0.1249203309416771, "learning_rate": 9.978061236693825e-05, "loss": 0.4966, "step": 745 }, { "epoch": 0.09009009009009009, "grad_norm": 0.1043449193239212, "learning_rate": 9.977765983660219e-05, "loss": 0.581, "step": 750 }, { "epoch": 0.09009009009009009, "eval_loss": 0.556373655796051, "eval_runtime": 35.7127, "eval_samples_per_second": 22.401, "eval_steps_per_second": 5.6, "step": 750 }, { "epoch": 0.09069069069069069, "grad_norm": 0.12919944524765015, "learning_rate": 9.977468761545668e-05, "loss": 0.5873, "step": 755 }, { "epoch": 0.0912912912912913, "grad_norm": 0.13691309094429016, "learning_rate": 9.977169570467746e-05, "loss": 0.5736, "step": 760 }, { "epoch": 0.0918918918918919, "grad_norm": 0.11516213417053223, "learning_rate": 9.976868410544807e-05, "loss": 0.6117, "step": 765 }, { "epoch": 0.09249249249249249, "grad_norm": 0.1293349415063858, "learning_rate": 9.97656528189598e-05, "loss": 0.6001, "step": 770 }, { "epoch": 0.09309309309309309, "grad_norm": 0.13094763457775116, "learning_rate": 9.976260184641178e-05, "loss": 0.5834, "step": 775 }, { "epoch": 0.0936936936936937, "grad_norm": 0.1106487512588501, "learning_rate": 9.975953118901087e-05, "loss": 0.5727, "step": 780 }, { "epoch": 0.0942942942942943, "grad_norm": 0.11776825040578842, "learning_rate": 9.975644084797177e-05, "loss": 0.57, "step": 785 }, { "epoch": 0.0948948948948949, "grad_norm": 0.13150927424430847, "learning_rate": 9.975333082451697e-05, "loss": 0.5947, "step": 790 }, { "epoch": 0.09549549549549549, "grad_norm": 0.10551803559064865, "learning_rate": 9.975020111987665e-05, "loss": 0.5666, "step": 795 }, { "epoch": 0.0960960960960961, "grad_norm": 0.12537789344787598, "learning_rate": 9.974705173528888e-05, "loss": 0.59, "step": 800 }, { "epoch": 0.0966966966966967, "grad_norm": 0.10980799794197083, "learning_rate": 9.974388267199948e-05, "loss": 0.6458, "step": 805 }, { "epoch": 0.0972972972972973, "grad_norm": 0.13737636804580688, "learning_rate": 9.974069393126204e-05, "loss": 0.6085, "step": 810 }, { "epoch": 0.09789789789789789, "grad_norm": 0.11622725427150726, "learning_rate": 9.973748551433797e-05, "loss": 0.5489, "step": 815 }, { "epoch": 0.0984984984984985, "grad_norm": 0.12641353905200958, "learning_rate": 9.973425742249642e-05, "loss": 0.5548, "step": 820 }, { "epoch": 0.0990990990990991, "grad_norm": 0.13305918872356415, "learning_rate": 9.973100965701434e-05, "loss": 0.5867, "step": 825 }, { "epoch": 0.0996996996996997, "grad_norm": 0.11327166110277176, "learning_rate": 9.972774221917649e-05, "loss": 0.5836, "step": 830 }, { "epoch": 0.1003003003003003, "grad_norm": 0.11508522182703018, "learning_rate": 9.972445511027536e-05, "loss": 0.5103, "step": 835 }, { "epoch": 0.1009009009009009, "grad_norm": 0.13816924393177032, "learning_rate": 9.972114833161127e-05, "loss": 0.5635, "step": 840 }, { "epoch": 0.1015015015015015, "grad_norm": 0.11702617257833481, "learning_rate": 9.971782188449227e-05, "loss": 0.581, "step": 845 }, { "epoch": 0.1021021021021021, "grad_norm": 0.11020959168672562, "learning_rate": 9.971447577023427e-05, "loss": 0.5462, "step": 850 }, { "epoch": 0.10270270270270271, "grad_norm": 0.10916626453399658, "learning_rate": 9.971110999016087e-05, "loss": 0.5908, "step": 855 }, { "epoch": 0.1033033033033033, "grad_norm": 0.1086755096912384, "learning_rate": 9.97077245456035e-05, "loss": 0.5541, "step": 860 }, { "epoch": 0.1039039039039039, "grad_norm": 0.11078358441591263, "learning_rate": 9.970431943790135e-05, "loss": 0.5465, "step": 865 }, { "epoch": 0.1045045045045045, "grad_norm": 0.1155092641711235, "learning_rate": 9.970089466840141e-05, "loss": 0.5377, "step": 870 }, { "epoch": 0.10510510510510511, "grad_norm": 0.15020284056663513, "learning_rate": 9.969745023845842e-05, "loss": 0.6166, "step": 875 }, { "epoch": 0.1057057057057057, "grad_norm": 0.13164208829402924, "learning_rate": 9.969398614943493e-05, "loss": 0.6068, "step": 880 }, { "epoch": 0.1063063063063063, "grad_norm": 0.1536552608013153, "learning_rate": 9.969050240270123e-05, "loss": 0.6202, "step": 885 }, { "epoch": 0.1069069069069069, "grad_norm": 0.12868870794773102, "learning_rate": 9.968699899963542e-05, "loss": 0.5557, "step": 890 }, { "epoch": 0.10750750750750751, "grad_norm": 0.14724482595920563, "learning_rate": 9.968347594162335e-05, "loss": 0.5372, "step": 895 }, { "epoch": 0.10810810810810811, "grad_norm": 0.12083470076322556, "learning_rate": 9.967993323005867e-05, "loss": 0.5613, "step": 900 }, { "epoch": 0.1087087087087087, "grad_norm": 0.14249064028263092, "learning_rate": 9.967637086634273e-05, "loss": 0.579, "step": 905 }, { "epoch": 0.10930930930930931, "grad_norm": 0.12009460479021072, "learning_rate": 9.96727888518848e-05, "loss": 0.61, "step": 910 }, { "epoch": 0.10990990990990991, "grad_norm": 0.13391336798667908, "learning_rate": 9.966918718810178e-05, "loss": 0.5847, "step": 915 }, { "epoch": 0.11051051051051052, "grad_norm": 0.1236066222190857, "learning_rate": 9.96655658764184e-05, "loss": 0.5946, "step": 920 }, { "epoch": 0.1111111111111111, "grad_norm": 0.11532710492610931, "learning_rate": 9.966192491826719e-05, "loss": 0.5679, "step": 925 }, { "epoch": 0.11171171171171171, "grad_norm": 0.13501344621181488, "learning_rate": 9.965826431508838e-05, "loss": 0.572, "step": 930 }, { "epoch": 0.11231231231231231, "grad_norm": 0.13786637783050537, "learning_rate": 9.965458406833007e-05, "loss": 0.5867, "step": 935 }, { "epoch": 0.11291291291291292, "grad_norm": 0.13774774968624115, "learning_rate": 9.965088417944804e-05, "loss": 0.6295, "step": 940 }, { "epoch": 0.11351351351351352, "grad_norm": 0.11815715581178665, "learning_rate": 9.964716464990587e-05, "loss": 0.5813, "step": 945 }, { "epoch": 0.11411411411411411, "grad_norm": 0.14110974967479706, "learning_rate": 9.964342548117492e-05, "loss": 0.5655, "step": 950 }, { "epoch": 0.11471471471471471, "grad_norm": 0.13697779178619385, "learning_rate": 9.963966667473432e-05, "loss": 0.5126, "step": 955 }, { "epoch": 0.11531531531531532, "grad_norm": 0.1307583451271057, "learning_rate": 9.963588823207095e-05, "loss": 0.5642, "step": 960 }, { "epoch": 0.11591591591591592, "grad_norm": 0.13687662780284882, "learning_rate": 9.96320901546795e-05, "loss": 0.5406, "step": 965 }, { "epoch": 0.11651651651651651, "grad_norm": 0.1363164782524109, "learning_rate": 9.962827244406235e-05, "loss": 0.5358, "step": 970 }, { "epoch": 0.11711711711711711, "grad_norm": 0.11956089735031128, "learning_rate": 9.962443510172969e-05, "loss": 0.5797, "step": 975 }, { "epoch": 0.11771771771771772, "grad_norm": 0.13000960648059845, "learning_rate": 9.962057812919954e-05, "loss": 0.5879, "step": 980 }, { "epoch": 0.11831831831831832, "grad_norm": 0.12119048088788986, "learning_rate": 9.961670152799756e-05, "loss": 0.4789, "step": 985 }, { "epoch": 0.11891891891891893, "grad_norm": 0.11234846711158752, "learning_rate": 9.961280529965726e-05, "loss": 0.5643, "step": 990 }, { "epoch": 0.11951951951951952, "grad_norm": 0.12192381918430328, "learning_rate": 9.960888944571989e-05, "loss": 0.562, "step": 995 }, { "epoch": 0.12012012012012012, "grad_norm": 0.11708851158618927, "learning_rate": 9.960495396773448e-05, "loss": 0.6252, "step": 1000 }, { "epoch": 0.12012012012012012, "eval_loss": 0.5504798889160156, "eval_runtime": 35.636, "eval_samples_per_second": 22.449, "eval_steps_per_second": 5.612, "step": 1000 }, { "epoch": 0.12072072072072072, "grad_norm": 0.11086433380842209, "learning_rate": 9.960099886725778e-05, "loss": 0.5448, "step": 1005 }, { "epoch": 0.12132132132132133, "grad_norm": 0.13834629952907562, "learning_rate": 9.959702414585434e-05, "loss": 0.5544, "step": 1010 }, { "epoch": 0.12192192192192192, "grad_norm": 0.13374705612659454, "learning_rate": 9.959302980509648e-05, "loss": 0.6367, "step": 1015 }, { "epoch": 0.12252252252252252, "grad_norm": 0.12303639948368073, "learning_rate": 9.958901584656424e-05, "loss": 0.6276, "step": 1020 }, { "epoch": 0.12312312312312312, "grad_norm": 0.11851788312196732, "learning_rate": 9.958498227184545e-05, "loss": 0.5434, "step": 1025 }, { "epoch": 0.12372372372372373, "grad_norm": 0.118187315762043, "learning_rate": 9.95809290825357e-05, "loss": 0.4774, "step": 1030 }, { "epoch": 0.12432432432432433, "grad_norm": 0.11095068603754044, "learning_rate": 9.95768562802383e-05, "loss": 0.5365, "step": 1035 }, { "epoch": 0.12492492492492492, "grad_norm": 0.15384650230407715, "learning_rate": 9.957276386656438e-05, "loss": 0.5857, "step": 1040 }, { "epoch": 0.12552552552552554, "grad_norm": 0.14868667721748352, "learning_rate": 9.95686518431328e-05, "loss": 0.6165, "step": 1045 }, { "epoch": 0.12612612612612611, "grad_norm": 0.13861216604709625, "learning_rate": 9.956452021157015e-05, "loss": 0.6233, "step": 1050 }, { "epoch": 0.12672672672672672, "grad_norm": 0.11492381244897842, "learning_rate": 9.956036897351082e-05, "loss": 0.518, "step": 1055 }, { "epoch": 0.12732732732732732, "grad_norm": 0.14138482511043549, "learning_rate": 9.955619813059695e-05, "loss": 0.5503, "step": 1060 }, { "epoch": 0.12792792792792793, "grad_norm": 0.13464735448360443, "learning_rate": 9.955200768447839e-05, "loss": 0.5786, "step": 1065 }, { "epoch": 0.12852852852852853, "grad_norm": 0.15015809237957, "learning_rate": 9.954779763681279e-05, "loss": 0.6308, "step": 1070 }, { "epoch": 0.12912912912912913, "grad_norm": 0.1323210448026657, "learning_rate": 9.954356798926556e-05, "loss": 0.5932, "step": 1075 }, { "epoch": 0.12972972972972974, "grad_norm": 0.13801470398902893, "learning_rate": 9.953931874350981e-05, "loss": 0.6251, "step": 1080 }, { "epoch": 0.13033033033033034, "grad_norm": 0.14785467088222504, "learning_rate": 9.953504990122645e-05, "loss": 0.5704, "step": 1085 }, { "epoch": 0.13093093093093094, "grad_norm": 0.1381571739912033, "learning_rate": 9.953076146410414e-05, "loss": 0.5665, "step": 1090 }, { "epoch": 0.13153153153153152, "grad_norm": 0.12187087535858154, "learning_rate": 9.952645343383926e-05, "loss": 0.581, "step": 1095 }, { "epoch": 0.13213213213213212, "grad_norm": 0.12892118096351624, "learning_rate": 9.952212581213598e-05, "loss": 0.5671, "step": 1100 }, { "epoch": 0.13273273273273273, "grad_norm": 0.14864008128643036, "learning_rate": 9.95177786007062e-05, "loss": 0.6301, "step": 1105 }, { "epoch": 0.13333333333333333, "grad_norm": 0.1285526156425476, "learning_rate": 9.951341180126954e-05, "loss": 0.5563, "step": 1110 }, { "epoch": 0.13393393393393394, "grad_norm": 0.12077897042036057, "learning_rate": 9.950902541555342e-05, "loss": 0.5541, "step": 1115 }, { "epoch": 0.13453453453453454, "grad_norm": 0.14453986287117004, "learning_rate": 9.9504619445293e-05, "loss": 0.5592, "step": 1120 }, { "epoch": 0.13513513513513514, "grad_norm": 0.11925803869962692, "learning_rate": 9.950019389223113e-05, "loss": 0.5096, "step": 1125 }, { "epoch": 0.13573573573573575, "grad_norm": 0.1241493970155716, "learning_rate": 9.949574875811849e-05, "loss": 0.5401, "step": 1130 }, { "epoch": 0.13633633633633635, "grad_norm": 0.12902575731277466, "learning_rate": 9.949128404471346e-05, "loss": 0.5652, "step": 1135 }, { "epoch": 0.13693693693693693, "grad_norm": 0.1486339122056961, "learning_rate": 9.948679975378215e-05, "loss": 0.6048, "step": 1140 }, { "epoch": 0.13753753753753753, "grad_norm": 0.11694955080747604, "learning_rate": 9.948229588709843e-05, "loss": 0.5372, "step": 1145 }, { "epoch": 0.13813813813813813, "grad_norm": 0.14657153189182281, "learning_rate": 9.947777244644395e-05, "loss": 0.6111, "step": 1150 }, { "epoch": 0.13873873873873874, "grad_norm": 0.1201421245932579, "learning_rate": 9.947322943360805e-05, "loss": 0.4755, "step": 1155 }, { "epoch": 0.13933933933933934, "grad_norm": 0.14225557446479797, "learning_rate": 9.946866685038782e-05, "loss": 0.6418, "step": 1160 }, { "epoch": 0.13993993993993994, "grad_norm": 0.11903166025876999, "learning_rate": 9.946408469858814e-05, "loss": 0.5509, "step": 1165 }, { "epoch": 0.14054054054054055, "grad_norm": 0.14576730132102966, "learning_rate": 9.945948298002159e-05, "loss": 0.5591, "step": 1170 }, { "epoch": 0.14114114114114115, "grad_norm": 0.1285872757434845, "learning_rate": 9.945486169650846e-05, "loss": 0.5596, "step": 1175 }, { "epoch": 0.14174174174174173, "grad_norm": 0.12496671825647354, "learning_rate": 9.945022084987686e-05, "loss": 0.4635, "step": 1180 }, { "epoch": 0.14234234234234233, "grad_norm": 0.1366618573665619, "learning_rate": 9.944556044196254e-05, "loss": 0.621, "step": 1185 }, { "epoch": 0.14294294294294294, "grad_norm": 0.14241304993629456, "learning_rate": 9.944088047460908e-05, "loss": 0.5102, "step": 1190 }, { "epoch": 0.14354354354354354, "grad_norm": 0.13404640555381775, "learning_rate": 9.943618094966778e-05, "loss": 0.4964, "step": 1195 }, { "epoch": 0.14414414414414414, "grad_norm": 0.1323569267988205, "learning_rate": 9.943146186899763e-05, "loss": 0.6184, "step": 1200 }, { "epoch": 0.14474474474474475, "grad_norm": 0.15703721344470978, "learning_rate": 9.942672323446535e-05, "loss": 0.5274, "step": 1205 }, { "epoch": 0.14534534534534535, "grad_norm": 0.13442037999629974, "learning_rate": 9.942196504794548e-05, "loss": 0.5314, "step": 1210 }, { "epoch": 0.14594594594594595, "grad_norm": 0.14303931593894958, "learning_rate": 9.941718731132021e-05, "loss": 0.5491, "step": 1215 }, { "epoch": 0.14654654654654656, "grad_norm": 0.14930444955825806, "learning_rate": 9.941239002647951e-05, "loss": 0.5963, "step": 1220 }, { "epoch": 0.14714714714714713, "grad_norm": 0.1592545211315155, "learning_rate": 9.940757319532106e-05, "loss": 0.6314, "step": 1225 }, { "epoch": 0.14774774774774774, "grad_norm": 0.12300348281860352, "learning_rate": 9.940273681975028e-05, "loss": 0.5336, "step": 1230 }, { "epoch": 0.14834834834834834, "grad_norm": 0.14255018532276154, "learning_rate": 9.939788090168029e-05, "loss": 0.5748, "step": 1235 }, { "epoch": 0.14894894894894894, "grad_norm": 0.12294194102287292, "learning_rate": 9.939300544303203e-05, "loss": 0.4948, "step": 1240 }, { "epoch": 0.14954954954954955, "grad_norm": 0.13256989419460297, "learning_rate": 9.938811044573408e-05, "loss": 0.6179, "step": 1245 }, { "epoch": 0.15015015015015015, "grad_norm": 0.13299089670181274, "learning_rate": 9.938319591172276e-05, "loss": 0.5523, "step": 1250 }, { "epoch": 0.15015015015015015, "eval_loss": 0.5406990051269531, "eval_runtime": 35.8435, "eval_samples_per_second": 22.319, "eval_steps_per_second": 5.58, "step": 1250 }, { "epoch": 0.15075075075075076, "grad_norm": 0.14382098615169525, "learning_rate": 9.93782618429422e-05, "loss": 0.5333, "step": 1255 }, { "epoch": 0.15135135135135136, "grad_norm": 0.15322475135326385, "learning_rate": 9.937330824134411e-05, "loss": 0.5857, "step": 1260 }, { "epoch": 0.15195195195195196, "grad_norm": 0.1334904432296753, "learning_rate": 9.936833510888808e-05, "loss": 0.5689, "step": 1265 }, { "epoch": 0.15255255255255254, "grad_norm": 0.1450200080871582, "learning_rate": 9.93633424475413e-05, "loss": 0.5308, "step": 1270 }, { "epoch": 0.15315315315315314, "grad_norm": 0.16856804490089417, "learning_rate": 9.935833025927881e-05, "loss": 0.6035, "step": 1275 }, { "epoch": 0.15375375375375375, "grad_norm": 0.11449337005615234, "learning_rate": 9.935329854608328e-05, "loss": 0.5383, "step": 1280 }, { "epoch": 0.15435435435435435, "grad_norm": 0.14133980870246887, "learning_rate": 9.93482473099451e-05, "loss": 0.6096, "step": 1285 }, { "epoch": 0.15495495495495495, "grad_norm": 0.14201879501342773, "learning_rate": 9.934317655286246e-05, "loss": 0.5111, "step": 1290 }, { "epoch": 0.15555555555555556, "grad_norm": 0.13272619247436523, "learning_rate": 9.93380862768412e-05, "loss": 0.553, "step": 1295 }, { "epoch": 0.15615615615615616, "grad_norm": 0.16790813207626343, "learning_rate": 9.93329764838949e-05, "loss": 0.5438, "step": 1300 }, { "epoch": 0.15675675675675677, "grad_norm": 0.1232328861951828, "learning_rate": 9.93278471760449e-05, "loss": 0.5568, "step": 1305 }, { "epoch": 0.15735735735735737, "grad_norm": 0.1317184567451477, "learning_rate": 9.93226983553202e-05, "loss": 0.5557, "step": 1310 }, { "epoch": 0.15795795795795795, "grad_norm": 0.13694261014461517, "learning_rate": 9.931753002375755e-05, "loss": 0.5952, "step": 1315 }, { "epoch": 0.15855855855855855, "grad_norm": 0.14206798374652863, "learning_rate": 9.931234218340142e-05, "loss": 0.6113, "step": 1320 }, { "epoch": 0.15915915915915915, "grad_norm": 0.14540702104568481, "learning_rate": 9.930713483630398e-05, "loss": 0.5499, "step": 1325 }, { "epoch": 0.15975975975975976, "grad_norm": 0.135905459523201, "learning_rate": 9.930190798452515e-05, "loss": 0.5708, "step": 1330 }, { "epoch": 0.16036036036036036, "grad_norm": 0.1379026472568512, "learning_rate": 9.929666163013251e-05, "loss": 0.5838, "step": 1335 }, { "epoch": 0.16096096096096096, "grad_norm": 0.14682543277740479, "learning_rate": 9.929139577520143e-05, "loss": 0.5267, "step": 1340 }, { "epoch": 0.16156156156156157, "grad_norm": 0.13486546277999878, "learning_rate": 9.92861104218149e-05, "loss": 0.5689, "step": 1345 }, { "epoch": 0.16216216216216217, "grad_norm": 0.14320464432239532, "learning_rate": 9.928080557206374e-05, "loss": 0.5452, "step": 1350 }, { "epoch": 0.16276276276276277, "grad_norm": 0.1397445648908615, "learning_rate": 9.927548122804636e-05, "loss": 0.475, "step": 1355 }, { "epoch": 0.16336336336336335, "grad_norm": 0.13572080433368683, "learning_rate": 9.927013739186896e-05, "loss": 0.5778, "step": 1360 }, { "epoch": 0.16396396396396395, "grad_norm": 0.1294446587562561, "learning_rate": 9.926477406564543e-05, "loss": 0.5633, "step": 1365 }, { "epoch": 0.16456456456456456, "grad_norm": 0.13914752006530762, "learning_rate": 9.925939125149737e-05, "loss": 0.493, "step": 1370 }, { "epoch": 0.16516516516516516, "grad_norm": 0.14273406565189362, "learning_rate": 9.925398895155408e-05, "loss": 0.5125, "step": 1375 }, { "epoch": 0.16576576576576577, "grad_norm": 0.1549476832151413, "learning_rate": 9.924856716795259e-05, "loss": 0.5856, "step": 1380 }, { "epoch": 0.16636636636636637, "grad_norm": 0.1278187483549118, "learning_rate": 9.924312590283759e-05, "loss": 0.5287, "step": 1385 }, { "epoch": 0.16696696696696697, "grad_norm": 0.11236418783664703, "learning_rate": 9.923766515836158e-05, "loss": 0.5092, "step": 1390 }, { "epoch": 0.16756756756756758, "grad_norm": 0.13451069593429565, "learning_rate": 9.923218493668462e-05, "loss": 0.5557, "step": 1395 }, { "epoch": 0.16816816816816818, "grad_norm": 0.1335114985704422, "learning_rate": 9.922668523997459e-05, "loss": 0.5246, "step": 1400 }, { "epoch": 0.16876876876876876, "grad_norm": 0.13806520402431488, "learning_rate": 9.922116607040701e-05, "loss": 0.547, "step": 1405 }, { "epoch": 0.16936936936936936, "grad_norm": 0.13814181089401245, "learning_rate": 9.921562743016515e-05, "loss": 0.6071, "step": 1410 }, { "epoch": 0.16996996996996996, "grad_norm": 0.17738781869411469, "learning_rate": 9.921006932143995e-05, "loss": 0.6035, "step": 1415 }, { "epoch": 0.17057057057057057, "grad_norm": 0.13893504440784454, "learning_rate": 9.920449174643006e-05, "loss": 0.5531, "step": 1420 }, { "epoch": 0.17117117117117117, "grad_norm": 0.1506422758102417, "learning_rate": 9.919889470734183e-05, "loss": 0.5554, "step": 1425 }, { "epoch": 0.17177177177177178, "grad_norm": 0.1395304948091507, "learning_rate": 9.91932782063893e-05, "loss": 0.5718, "step": 1430 }, { "epoch": 0.17237237237237238, "grad_norm": 0.13701309263706207, "learning_rate": 9.918764224579425e-05, "loss": 0.5424, "step": 1435 }, { "epoch": 0.17297297297297298, "grad_norm": 0.1376919001340866, "learning_rate": 9.91819868277861e-05, "loss": 0.5977, "step": 1440 }, { "epoch": 0.1735735735735736, "grad_norm": 0.15341120958328247, "learning_rate": 9.9176311954602e-05, "loss": 0.5686, "step": 1445 }, { "epoch": 0.17417417417417416, "grad_norm": 0.15332254767417908, "learning_rate": 9.917061762848677e-05, "loss": 0.5695, "step": 1450 }, { "epoch": 0.17477477477477477, "grad_norm": 0.13192585110664368, "learning_rate": 9.916490385169297e-05, "loss": 0.5524, "step": 1455 }, { "epoch": 0.17537537537537537, "grad_norm": 0.16271525621414185, "learning_rate": 9.915917062648083e-05, "loss": 0.625, "step": 1460 }, { "epoch": 0.17597597597597597, "grad_norm": 0.15786905586719513, "learning_rate": 9.915341795511826e-05, "loss": 0.4909, "step": 1465 }, { "epoch": 0.17657657657657658, "grad_norm": 0.17195844650268555, "learning_rate": 9.914764583988087e-05, "loss": 0.5432, "step": 1470 }, { "epoch": 0.17717717717717718, "grad_norm": 0.14075967669487, "learning_rate": 9.914185428305198e-05, "loss": 0.4958, "step": 1475 }, { "epoch": 0.17777777777777778, "grad_norm": 0.14056497812271118, "learning_rate": 9.913604328692258e-05, "loss": 0.585, "step": 1480 }, { "epoch": 0.1783783783783784, "grad_norm": 0.16021881997585297, "learning_rate": 9.913021285379136e-05, "loss": 0.5645, "step": 1485 }, { "epoch": 0.178978978978979, "grad_norm": 0.1612151712179184, "learning_rate": 9.912436298596469e-05, "loss": 0.5109, "step": 1490 }, { "epoch": 0.17957957957957957, "grad_norm": 0.15164624154567719, "learning_rate": 9.91184936857566e-05, "loss": 0.5532, "step": 1495 }, { "epoch": 0.18018018018018017, "grad_norm": 0.15083666145801544, "learning_rate": 9.911260495548892e-05, "loss": 0.5337, "step": 1500 }, { "epoch": 0.18018018018018017, "eval_loss": 0.532286524772644, "eval_runtime": 37.2562, "eval_samples_per_second": 21.473, "eval_steps_per_second": 5.368, "step": 1500 }, { "epoch": 0.18078078078078078, "grad_norm": 0.1541498303413391, "learning_rate": 9.910669679749101e-05, "loss": 0.5712, "step": 1505 }, { "epoch": 0.18138138138138138, "grad_norm": 0.16341352462768555, "learning_rate": 9.910076921410003e-05, "loss": 0.5631, "step": 1510 }, { "epoch": 0.18198198198198198, "grad_norm": 0.15230105817317963, "learning_rate": 9.909482220766077e-05, "loss": 0.5518, "step": 1515 }, { "epoch": 0.1825825825825826, "grad_norm": 0.12905479967594147, "learning_rate": 9.908885578052573e-05, "loss": 0.5093, "step": 1520 }, { "epoch": 0.1831831831831832, "grad_norm": 0.15421278774738312, "learning_rate": 9.908286993505509e-05, "loss": 0.5395, "step": 1525 }, { "epoch": 0.1837837837837838, "grad_norm": 0.13414278626441956, "learning_rate": 9.907686467361667e-05, "loss": 0.5682, "step": 1530 }, { "epoch": 0.1843843843843844, "grad_norm": 0.14239756762981415, "learning_rate": 9.907083999858601e-05, "loss": 0.5906, "step": 1535 }, { "epoch": 0.18498498498498497, "grad_norm": 0.1477949470281601, "learning_rate": 9.906479591234634e-05, "loss": 0.5854, "step": 1540 }, { "epoch": 0.18558558558558558, "grad_norm": 0.1359156221151352, "learning_rate": 9.905873241728856e-05, "loss": 0.5209, "step": 1545 }, { "epoch": 0.18618618618618618, "grad_norm": 0.1480684131383896, "learning_rate": 9.90526495158112e-05, "loss": 0.6048, "step": 1550 }, { "epoch": 0.18678678678678678, "grad_norm": 0.16773481667041779, "learning_rate": 9.904654721032053e-05, "loss": 0.5303, "step": 1555 }, { "epoch": 0.1873873873873874, "grad_norm": 0.1348617523908615, "learning_rate": 9.904042550323047e-05, "loss": 0.5531, "step": 1560 }, { "epoch": 0.187987987987988, "grad_norm": 0.15043242275714874, "learning_rate": 9.90342843969626e-05, "loss": 0.5665, "step": 1565 }, { "epoch": 0.1885885885885886, "grad_norm": 0.15387991070747375, "learning_rate": 9.902812389394622e-05, "loss": 0.5394, "step": 1570 }, { "epoch": 0.1891891891891892, "grad_norm": 0.15662983059883118, "learning_rate": 9.902194399661826e-05, "loss": 0.6264, "step": 1575 }, { "epoch": 0.1897897897897898, "grad_norm": 0.14307186007499695, "learning_rate": 9.901574470742332e-05, "loss": 0.5138, "step": 1580 }, { "epoch": 0.19039039039039038, "grad_norm": 0.17086243629455566, "learning_rate": 9.900952602881369e-05, "loss": 0.5255, "step": 1585 }, { "epoch": 0.19099099099099098, "grad_norm": 0.16943460702896118, "learning_rate": 9.900328796324933e-05, "loss": 0.5263, "step": 1590 }, { "epoch": 0.1915915915915916, "grad_norm": 0.13184261322021484, "learning_rate": 9.899703051319786e-05, "loss": 0.5284, "step": 1595 }, { "epoch": 0.1921921921921922, "grad_norm": 0.16626524925231934, "learning_rate": 9.899075368113459e-05, "loss": 0.5661, "step": 1600 }, { "epoch": 0.1927927927927928, "grad_norm": 0.15596666932106018, "learning_rate": 9.898445746954246e-05, "loss": 0.5182, "step": 1605 }, { "epoch": 0.1933933933933934, "grad_norm": 0.13959796726703644, "learning_rate": 9.897814188091209e-05, "loss": 0.5516, "step": 1610 }, { "epoch": 0.193993993993994, "grad_norm": 0.1410140097141266, "learning_rate": 9.89718069177418e-05, "loss": 0.5306, "step": 1615 }, { "epoch": 0.1945945945945946, "grad_norm": 0.16129790246486664, "learning_rate": 9.896545258253751e-05, "loss": 0.6369, "step": 1620 }, { "epoch": 0.19519519519519518, "grad_norm": 0.13550494611263275, "learning_rate": 9.895907887781286e-05, "loss": 0.4805, "step": 1625 }, { "epoch": 0.19579579579579579, "grad_norm": 0.15120775997638702, "learning_rate": 9.895268580608912e-05, "loss": 0.5868, "step": 1630 }, { "epoch": 0.1963963963963964, "grad_norm": 0.19538648426532745, "learning_rate": 9.894627336989524e-05, "loss": 0.5838, "step": 1635 }, { "epoch": 0.196996996996997, "grad_norm": 0.15177778899669647, "learning_rate": 9.893984157176781e-05, "loss": 0.5651, "step": 1640 }, { "epoch": 0.1975975975975976, "grad_norm": 0.17927424609661102, "learning_rate": 9.89333904142511e-05, "loss": 0.5452, "step": 1645 }, { "epoch": 0.1981981981981982, "grad_norm": 0.14731010794639587, "learning_rate": 9.892691989989701e-05, "loss": 0.5117, "step": 1650 }, { "epoch": 0.1987987987987988, "grad_norm": 0.15999050438404083, "learning_rate": 9.892043003126515e-05, "loss": 0.5252, "step": 1655 }, { "epoch": 0.1993993993993994, "grad_norm": 0.14248783886432648, "learning_rate": 9.891392081092272e-05, "loss": 0.578, "step": 1660 }, { "epoch": 0.2, "grad_norm": 0.12995529174804688, "learning_rate": 9.890739224144461e-05, "loss": 0.5536, "step": 1665 }, { "epoch": 0.2006006006006006, "grad_norm": 0.1568031758069992, "learning_rate": 9.890084432541337e-05, "loss": 0.5022, "step": 1670 }, { "epoch": 0.2012012012012012, "grad_norm": 0.17728778719902039, "learning_rate": 9.889427706541918e-05, "loss": 0.5913, "step": 1675 }, { "epoch": 0.2018018018018018, "grad_norm": 0.18074369430541992, "learning_rate": 9.888769046405991e-05, "loss": 0.5252, "step": 1680 }, { "epoch": 0.2024024024024024, "grad_norm": 0.15932361781597137, "learning_rate": 9.888108452394105e-05, "loss": 0.5609, "step": 1685 }, { "epoch": 0.203003003003003, "grad_norm": 0.15575721859931946, "learning_rate": 9.887445924767571e-05, "loss": 0.6251, "step": 1690 }, { "epoch": 0.2036036036036036, "grad_norm": 0.14726823568344116, "learning_rate": 9.886781463788474e-05, "loss": 0.5817, "step": 1695 }, { "epoch": 0.2042042042042042, "grad_norm": 0.1656755954027176, "learning_rate": 9.886115069719654e-05, "loss": 0.4974, "step": 1700 }, { "epoch": 0.2048048048048048, "grad_norm": 0.169316828250885, "learning_rate": 9.885446742824722e-05, "loss": 0.5278, "step": 1705 }, { "epoch": 0.20540540540540542, "grad_norm": 0.14765839278697968, "learning_rate": 9.884776483368052e-05, "loss": 0.5993, "step": 1710 }, { "epoch": 0.206006006006006, "grad_norm": 0.16174598038196564, "learning_rate": 9.884104291614779e-05, "loss": 0.6095, "step": 1715 }, { "epoch": 0.2066066066066066, "grad_norm": 0.157635897397995, "learning_rate": 9.88343016783081e-05, "loss": 0.548, "step": 1720 }, { "epoch": 0.2072072072072072, "grad_norm": 0.15818659961223602, "learning_rate": 9.88275411228281e-05, "loss": 0.5238, "step": 1725 }, { "epoch": 0.2078078078078078, "grad_norm": 0.16762599349021912, "learning_rate": 9.882076125238206e-05, "loss": 0.5897, "step": 1730 }, { "epoch": 0.2084084084084084, "grad_norm": 0.16823016107082367, "learning_rate": 9.881396206965199e-05, "loss": 0.5364, "step": 1735 }, { "epoch": 0.209009009009009, "grad_norm": 0.18279995024204254, "learning_rate": 9.880714357732743e-05, "loss": 0.5928, "step": 1740 }, { "epoch": 0.20960960960960962, "grad_norm": 0.13614603877067566, "learning_rate": 9.880030577810564e-05, "loss": 0.5266, "step": 1745 }, { "epoch": 0.21021021021021022, "grad_norm": 0.16419890522956848, "learning_rate": 9.879344867469145e-05, "loss": 0.5925, "step": 1750 }, { "epoch": 0.21021021021021022, "eval_loss": 0.5199762582778931, "eval_runtime": 35.5614, "eval_samples_per_second": 22.496, "eval_steps_per_second": 5.624, "step": 1750 }, { "epoch": 0.21081081081081082, "grad_norm": 0.17191119492053986, "learning_rate": 9.87865722697974e-05, "loss": 0.5782, "step": 1755 }, { "epoch": 0.2114114114114114, "grad_norm": 0.18159760534763336, "learning_rate": 9.877967656614359e-05, "loss": 0.5149, "step": 1760 }, { "epoch": 0.212012012012012, "grad_norm": 0.16005107760429382, "learning_rate": 9.87727615664578e-05, "loss": 0.5673, "step": 1765 }, { "epoch": 0.2126126126126126, "grad_norm": 0.16644255816936493, "learning_rate": 9.876582727347545e-05, "loss": 0.5478, "step": 1770 }, { "epoch": 0.2132132132132132, "grad_norm": 0.1638658046722412, "learning_rate": 9.875887368993957e-05, "loss": 0.5637, "step": 1775 }, { "epoch": 0.2138138138138138, "grad_norm": 0.14520032703876495, "learning_rate": 9.87519008186008e-05, "loss": 0.5773, "step": 1780 }, { "epoch": 0.21441441441441442, "grad_norm": 0.15663036704063416, "learning_rate": 9.874490866221747e-05, "loss": 0.5455, "step": 1785 }, { "epoch": 0.21501501501501502, "grad_norm": 0.16514965891838074, "learning_rate": 9.873789722355546e-05, "loss": 0.542, "step": 1790 }, { "epoch": 0.21561561561561562, "grad_norm": 0.1629652976989746, "learning_rate": 9.873086650538837e-05, "loss": 0.6072, "step": 1795 }, { "epoch": 0.21621621621621623, "grad_norm": 0.1657508760690689, "learning_rate": 9.872381651049734e-05, "loss": 0.515, "step": 1800 }, { "epoch": 0.2168168168168168, "grad_norm": 0.15766561031341553, "learning_rate": 9.87167472416712e-05, "loss": 0.5516, "step": 1805 }, { "epoch": 0.2174174174174174, "grad_norm": 0.14582960307598114, "learning_rate": 9.870965870170636e-05, "loss": 0.5225, "step": 1810 }, { "epoch": 0.218018018018018, "grad_norm": 0.15485908091068268, "learning_rate": 9.870255089340689e-05, "loss": 0.5809, "step": 1815 }, { "epoch": 0.21861861861861862, "grad_norm": 0.17167295515537262, "learning_rate": 9.869542381958445e-05, "loss": 0.5067, "step": 1820 }, { "epoch": 0.21921921921921922, "grad_norm": 0.18673966825008392, "learning_rate": 9.868827748305833e-05, "loss": 0.601, "step": 1825 }, { "epoch": 0.21981981981981982, "grad_norm": 0.1618594527244568, "learning_rate": 9.868111188665544e-05, "loss": 0.5431, "step": 1830 }, { "epoch": 0.22042042042042043, "grad_norm": 0.17336180806159973, "learning_rate": 9.867392703321032e-05, "loss": 0.544, "step": 1835 }, { "epoch": 0.22102102102102103, "grad_norm": 0.15530580282211304, "learning_rate": 9.866672292556513e-05, "loss": 0.5428, "step": 1840 }, { "epoch": 0.22162162162162163, "grad_norm": 0.1728203147649765, "learning_rate": 9.865949956656964e-05, "loss": 0.4687, "step": 1845 }, { "epoch": 0.2222222222222222, "grad_norm": 0.17873457074165344, "learning_rate": 9.86522569590812e-05, "loss": 0.5932, "step": 1850 }, { "epoch": 0.2228228228228228, "grad_norm": 0.16019093990325928, "learning_rate": 9.864499510596483e-05, "loss": 0.5252, "step": 1855 }, { "epoch": 0.22342342342342342, "grad_norm": 0.1702725887298584, "learning_rate": 9.863771401009314e-05, "loss": 0.5088, "step": 1860 }, { "epoch": 0.22402402402402402, "grad_norm": 0.1690736711025238, "learning_rate": 9.863041367434633e-05, "loss": 0.6143, "step": 1865 }, { "epoch": 0.22462462462462462, "grad_norm": 0.15510286390781403, "learning_rate": 9.862309410161227e-05, "loss": 0.5828, "step": 1870 }, { "epoch": 0.22522522522522523, "grad_norm": 0.1484537273645401, "learning_rate": 9.861575529478637e-05, "loss": 0.4891, "step": 1875 }, { "epoch": 0.22582582582582583, "grad_norm": 0.16495390236377716, "learning_rate": 9.860839725677168e-05, "loss": 0.5181, "step": 1880 }, { "epoch": 0.22642642642642644, "grad_norm": 0.15596404671669006, "learning_rate": 9.860101999047888e-05, "loss": 0.5655, "step": 1885 }, { "epoch": 0.22702702702702704, "grad_norm": 0.14629977941513062, "learning_rate": 9.859362349882621e-05, "loss": 0.5182, "step": 1890 }, { "epoch": 0.22762762762762762, "grad_norm": 0.16224302351474762, "learning_rate": 9.858620778473958e-05, "loss": 0.5392, "step": 1895 }, { "epoch": 0.22822822822822822, "grad_norm": 0.16578389704227448, "learning_rate": 9.85787728511524e-05, "loss": 0.5447, "step": 1900 }, { "epoch": 0.22882882882882882, "grad_norm": 0.1378798633813858, "learning_rate": 9.857131870100579e-05, "loss": 0.4545, "step": 1905 }, { "epoch": 0.22942942942942943, "grad_norm": 0.16701015830039978, "learning_rate": 9.856384533724841e-05, "loss": 0.5554, "step": 1910 }, { "epoch": 0.23003003003003003, "grad_norm": 0.17047229409217834, "learning_rate": 9.855635276283656e-05, "loss": 0.5063, "step": 1915 }, { "epoch": 0.23063063063063063, "grad_norm": 0.19112730026245117, "learning_rate": 9.854884098073409e-05, "loss": 0.5325, "step": 1920 }, { "epoch": 0.23123123123123124, "grad_norm": 0.16093651950359344, "learning_rate": 9.854130999391249e-05, "loss": 0.4856, "step": 1925 }, { "epoch": 0.23183183183183184, "grad_norm": 0.15758563578128815, "learning_rate": 9.853375980535082e-05, "loss": 0.5477, "step": 1930 }, { "epoch": 0.23243243243243245, "grad_norm": 0.15003785490989685, "learning_rate": 9.852619041803576e-05, "loss": 0.5487, "step": 1935 }, { "epoch": 0.23303303303303302, "grad_norm": 0.1630222648382187, "learning_rate": 9.851860183496155e-05, "loss": 0.4892, "step": 1940 }, { "epoch": 0.23363363363363363, "grad_norm": 0.14337176084518433, "learning_rate": 9.851099405913009e-05, "loss": 0.4906, "step": 1945 }, { "epoch": 0.23423423423423423, "grad_norm": 0.14522404968738556, "learning_rate": 9.850336709355079e-05, "loss": 0.5897, "step": 1950 }, { "epoch": 0.23483483483483483, "grad_norm": 0.18825727701187134, "learning_rate": 9.849572094124069e-05, "loss": 0.5179, "step": 1955 }, { "epoch": 0.23543543543543544, "grad_norm": 0.15544892847537994, "learning_rate": 9.848805560522444e-05, "loss": 0.5082, "step": 1960 }, { "epoch": 0.23603603603603604, "grad_norm": 0.1687832772731781, "learning_rate": 9.848037108853423e-05, "loss": 0.5303, "step": 1965 }, { "epoch": 0.23663663663663664, "grad_norm": 0.14829890429973602, "learning_rate": 9.84726673942099e-05, "loss": 0.4956, "step": 1970 }, { "epoch": 0.23723723723723725, "grad_norm": 0.17282085120677948, "learning_rate": 9.846494452529879e-05, "loss": 0.5406, "step": 1975 }, { "epoch": 0.23783783783783785, "grad_norm": 0.14352402091026306, "learning_rate": 9.845720248485593e-05, "loss": 0.5014, "step": 1980 }, { "epoch": 0.23843843843843843, "grad_norm": 0.13369005918502808, "learning_rate": 9.844944127594385e-05, "loss": 0.4679, "step": 1985 }, { "epoch": 0.23903903903903903, "grad_norm": 0.16212862730026245, "learning_rate": 9.84416609016327e-05, "loss": 0.5333, "step": 1990 }, { "epoch": 0.23963963963963963, "grad_norm": 0.1569916158914566, "learning_rate": 9.843386136500018e-05, "loss": 0.57, "step": 1995 }, { "epoch": 0.24024024024024024, "grad_norm": 0.16127009689807892, "learning_rate": 9.842604266913165e-05, "loss": 0.5064, "step": 2000 }, { "epoch": 0.24024024024024024, "eval_loss": 0.5154778361320496, "eval_runtime": 35.5905, "eval_samples_per_second": 22.478, "eval_steps_per_second": 5.619, "step": 2000 }, { "epoch": 0.24084084084084084, "grad_norm": 0.16832779347896576, "learning_rate": 9.841820481711992e-05, "loss": 0.5872, "step": 2005 }, { "epoch": 0.24144144144144145, "grad_norm": 0.1714124083518982, "learning_rate": 9.84103478120655e-05, "loss": 0.5264, "step": 2010 }, { "epoch": 0.24204204204204205, "grad_norm": 0.17094390094280243, "learning_rate": 9.840247165707642e-05, "loss": 0.5507, "step": 2015 }, { "epoch": 0.24264264264264265, "grad_norm": 0.1704624891281128, "learning_rate": 9.839457635526827e-05, "loss": 0.5719, "step": 2020 }, { "epoch": 0.24324324324324326, "grad_norm": 0.17609746754169464, "learning_rate": 9.838666190976427e-05, "loss": 0.5424, "step": 2025 }, { "epoch": 0.24384384384384383, "grad_norm": 0.15378032624721527, "learning_rate": 9.837872832369515e-05, "loss": 0.5369, "step": 2030 }, { "epoch": 0.24444444444444444, "grad_norm": 0.2722972333431244, "learning_rate": 9.837077560019925e-05, "loss": 0.4998, "step": 2035 }, { "epoch": 0.24504504504504504, "grad_norm": 0.1851070672273636, "learning_rate": 9.836280374242248e-05, "loss": 0.5136, "step": 2040 }, { "epoch": 0.24564564564564564, "grad_norm": 0.17583970725536346, "learning_rate": 9.835481275351828e-05, "loss": 0.5538, "step": 2045 }, { "epoch": 0.24624624624624625, "grad_norm": 0.18735957145690918, "learning_rate": 9.834680263664771e-05, "loss": 0.5468, "step": 2050 }, { "epoch": 0.24684684684684685, "grad_norm": 0.17340996861457825, "learning_rate": 9.833877339497939e-05, "loss": 0.6059, "step": 2055 }, { "epoch": 0.24744744744744746, "grad_norm": 0.1732621043920517, "learning_rate": 9.833072503168945e-05, "loss": 0.5425, "step": 2060 }, { "epoch": 0.24804804804804806, "grad_norm": 0.1660718470811844, "learning_rate": 9.832265754996164e-05, "loss": 0.4991, "step": 2065 }, { "epoch": 0.24864864864864866, "grad_norm": 0.18556509912014008, "learning_rate": 9.831457095298728e-05, "loss": 0.5401, "step": 2070 }, { "epoch": 0.24924924924924924, "grad_norm": 0.1343226432800293, "learning_rate": 9.830646524396518e-05, "loss": 0.5304, "step": 2075 }, { "epoch": 0.24984984984984984, "grad_norm": 0.17407070100307465, "learning_rate": 9.82983404261018e-05, "loss": 0.5154, "step": 2080 }, { "epoch": 0.25045045045045045, "grad_norm": 0.17739513516426086, "learning_rate": 9.829019650261111e-05, "loss": 0.5223, "step": 2085 }, { "epoch": 0.2510510510510511, "grad_norm": 0.15243694186210632, "learning_rate": 9.828203347671462e-05, "loss": 0.485, "step": 2090 }, { "epoch": 0.25165165165165165, "grad_norm": 0.17251120507717133, "learning_rate": 9.827385135164145e-05, "loss": 0.5174, "step": 2095 }, { "epoch": 0.25225225225225223, "grad_norm": 0.14546386897563934, "learning_rate": 9.82656501306282e-05, "loss": 0.5192, "step": 2100 }, { "epoch": 0.25285285285285286, "grad_norm": 0.1839355081319809, "learning_rate": 9.825742981691915e-05, "loss": 0.5414, "step": 2105 }, { "epoch": 0.25345345345345344, "grad_norm": 0.17075906693935394, "learning_rate": 9.824919041376597e-05, "loss": 0.4985, "step": 2110 }, { "epoch": 0.25405405405405407, "grad_norm": 0.20638392865657806, "learning_rate": 9.8240931924428e-05, "loss": 0.5412, "step": 2115 }, { "epoch": 0.25465465465465464, "grad_norm": 0.2239358276128769, "learning_rate": 9.82326543521721e-05, "loss": 0.55, "step": 2120 }, { "epoch": 0.2552552552552553, "grad_norm": 0.16201543807983398, "learning_rate": 9.822435770027267e-05, "loss": 0.5412, "step": 2125 }, { "epoch": 0.25585585585585585, "grad_norm": 0.17090320587158203, "learning_rate": 9.821604197201166e-05, "loss": 0.5311, "step": 2130 }, { "epoch": 0.2564564564564565, "grad_norm": 0.19192443788051605, "learning_rate": 9.820770717067856e-05, "loss": 0.5618, "step": 2135 }, { "epoch": 0.25705705705705706, "grad_norm": 0.19195041060447693, "learning_rate": 9.81993532995704e-05, "loss": 0.5095, "step": 2140 }, { "epoch": 0.25765765765765763, "grad_norm": 0.16927067935466766, "learning_rate": 9.819098036199178e-05, "loss": 0.5586, "step": 2145 }, { "epoch": 0.25825825825825827, "grad_norm": 0.18273282051086426, "learning_rate": 9.818258836125482e-05, "loss": 0.5314, "step": 2150 }, { "epoch": 0.25885885885885884, "grad_norm": 0.17846918106079102, "learning_rate": 9.81741773006792e-05, "loss": 0.5399, "step": 2155 }, { "epoch": 0.2594594594594595, "grad_norm": 0.15504570305347443, "learning_rate": 9.81657471835921e-05, "loss": 0.4769, "step": 2160 }, { "epoch": 0.26006006006006005, "grad_norm": 0.17235280573368073, "learning_rate": 9.815729801332832e-05, "loss": 0.633, "step": 2165 }, { "epoch": 0.2606606606606607, "grad_norm": 0.20063692331314087, "learning_rate": 9.814882979323008e-05, "loss": 0.5649, "step": 2170 }, { "epoch": 0.26126126126126126, "grad_norm": 0.17492002248764038, "learning_rate": 9.814034252664723e-05, "loss": 0.5565, "step": 2175 }, { "epoch": 0.2618618618618619, "grad_norm": 0.21677549183368683, "learning_rate": 9.813183621693711e-05, "loss": 0.5253, "step": 2180 }, { "epoch": 0.26246246246246246, "grad_norm": 0.1620025932788849, "learning_rate": 9.812331086746462e-05, "loss": 0.5449, "step": 2185 }, { "epoch": 0.26306306306306304, "grad_norm": 0.2064737230539322, "learning_rate": 9.811476648160216e-05, "loss": 0.5254, "step": 2190 }, { "epoch": 0.26366366366366367, "grad_norm": 0.1585373878479004, "learning_rate": 9.81062030627297e-05, "loss": 0.4856, "step": 2195 }, { "epoch": 0.26426426426426425, "grad_norm": 0.1649765521287918, "learning_rate": 9.809762061423469e-05, "loss": 0.5199, "step": 2200 }, { "epoch": 0.2648648648648649, "grad_norm": 0.17186199128627777, "learning_rate": 9.808901913951216e-05, "loss": 0.496, "step": 2205 }, { "epoch": 0.26546546546546546, "grad_norm": 0.16912420094013214, "learning_rate": 9.808039864196464e-05, "loss": 0.5118, "step": 2210 }, { "epoch": 0.2660660660660661, "grad_norm": 0.17357052862644196, "learning_rate": 9.807175912500215e-05, "loss": 0.5153, "step": 2215 }, { "epoch": 0.26666666666666666, "grad_norm": 0.1911926567554474, "learning_rate": 9.806310059204229e-05, "loss": 0.5019, "step": 2220 }, { "epoch": 0.2672672672672673, "grad_norm": 0.20069928467273712, "learning_rate": 9.805442304651018e-05, "loss": 0.4793, "step": 2225 }, { "epoch": 0.26786786786786787, "grad_norm": 0.18133674561977386, "learning_rate": 9.804572649183841e-05, "loss": 0.5091, "step": 2230 }, { "epoch": 0.26846846846846845, "grad_norm": 0.215446338057518, "learning_rate": 9.803701093146715e-05, "loss": 0.5845, "step": 2235 }, { "epoch": 0.2690690690690691, "grad_norm": 0.1596394032239914, "learning_rate": 9.802827636884405e-05, "loss": 0.5082, "step": 2240 }, { "epoch": 0.26966966966966965, "grad_norm": 0.2033008188009262, "learning_rate": 9.801952280742426e-05, "loss": 0.5612, "step": 2245 }, { "epoch": 0.2702702702702703, "grad_norm": 0.20107373595237732, "learning_rate": 9.801075025067053e-05, "loss": 0.5786, "step": 2250 }, { "epoch": 0.2702702702702703, "eval_loss": 0.5005493760108948, "eval_runtime": 35.6032, "eval_samples_per_second": 22.47, "eval_steps_per_second": 5.617, "step": 2250 }, { "epoch": 0.27087087087087086, "grad_norm": 0.17059586942195892, "learning_rate": 9.800195870205299e-05, "loss": 0.5403, "step": 2255 }, { "epoch": 0.2714714714714715, "grad_norm": 0.200556218624115, "learning_rate": 9.799314816504942e-05, "loss": 0.5233, "step": 2260 }, { "epoch": 0.27207207207207207, "grad_norm": 0.17981156706809998, "learning_rate": 9.798431864314506e-05, "loss": 0.5067, "step": 2265 }, { "epoch": 0.2726726726726727, "grad_norm": 0.18970321118831635, "learning_rate": 9.797547013983259e-05, "loss": 0.5582, "step": 2270 }, { "epoch": 0.2732732732732733, "grad_norm": 0.20475243031978607, "learning_rate": 9.796660265861228e-05, "loss": 0.5416, "step": 2275 }, { "epoch": 0.27387387387387385, "grad_norm": 0.15468288958072662, "learning_rate": 9.795771620299192e-05, "loss": 0.4692, "step": 2280 }, { "epoch": 0.2744744744744745, "grad_norm": 0.20453935861587524, "learning_rate": 9.794881077648674e-05, "loss": 0.554, "step": 2285 }, { "epoch": 0.27507507507507506, "grad_norm": 0.18685589730739594, "learning_rate": 9.793988638261952e-05, "loss": 0.5307, "step": 2290 }, { "epoch": 0.2756756756756757, "grad_norm": 0.18669554591178894, "learning_rate": 9.793094302492051e-05, "loss": 0.549, "step": 2295 }, { "epoch": 0.27627627627627627, "grad_norm": 0.189845472574234, "learning_rate": 9.79219807069275e-05, "loss": 0.4745, "step": 2300 }, { "epoch": 0.2768768768768769, "grad_norm": 0.15178772807121277, "learning_rate": 9.791299943218575e-05, "loss": 0.4152, "step": 2305 }, { "epoch": 0.2774774774774775, "grad_norm": 0.1891813725233078, "learning_rate": 9.790399920424806e-05, "loss": 0.4991, "step": 2310 }, { "epoch": 0.27807807807807805, "grad_norm": 0.19077537953853607, "learning_rate": 9.789498002667465e-05, "loss": 0.5572, "step": 2315 }, { "epoch": 0.2786786786786787, "grad_norm": 0.19744697213172913, "learning_rate": 9.78859419030333e-05, "loss": 0.5025, "step": 2320 }, { "epoch": 0.27927927927927926, "grad_norm": 0.21802768111228943, "learning_rate": 9.787688483689928e-05, "loss": 0.5121, "step": 2325 }, { "epoch": 0.2798798798798799, "grad_norm": 0.1979355812072754, "learning_rate": 9.786780883185534e-05, "loss": 0.5216, "step": 2330 }, { "epoch": 0.28048048048048047, "grad_norm": 0.18332335352897644, "learning_rate": 9.785871389149171e-05, "loss": 0.5519, "step": 2335 }, { "epoch": 0.2810810810810811, "grad_norm": 0.21532176434993744, "learning_rate": 9.784960001940613e-05, "loss": 0.5385, "step": 2340 }, { "epoch": 0.2816816816816817, "grad_norm": 0.2113877236843109, "learning_rate": 9.784046721920384e-05, "loss": 0.5273, "step": 2345 }, { "epoch": 0.2822822822822823, "grad_norm": 0.202877476811409, "learning_rate": 9.783131549449752e-05, "loss": 0.5381, "step": 2350 }, { "epoch": 0.2828828828828829, "grad_norm": 0.2112427055835724, "learning_rate": 9.782214484890736e-05, "loss": 0.5083, "step": 2355 }, { "epoch": 0.28348348348348346, "grad_norm": 0.193809375166893, "learning_rate": 9.781295528606108e-05, "loss": 0.5206, "step": 2360 }, { "epoch": 0.2840840840840841, "grad_norm": 0.16697217524051666, "learning_rate": 9.78037468095938e-05, "loss": 0.5498, "step": 2365 }, { "epoch": 0.28468468468468466, "grad_norm": 0.1831083744764328, "learning_rate": 9.779451942314822e-05, "loss": 0.5008, "step": 2370 }, { "epoch": 0.2852852852852853, "grad_norm": 0.1841299831867218, "learning_rate": 9.77852731303744e-05, "loss": 0.5426, "step": 2375 }, { "epoch": 0.28588588588588587, "grad_norm": 0.1795235127210617, "learning_rate": 9.777600793492998e-05, "loss": 0.5489, "step": 2380 }, { "epoch": 0.2864864864864865, "grad_norm": 0.19949278235435486, "learning_rate": 9.776672384048005e-05, "loss": 0.5087, "step": 2385 }, { "epoch": 0.2870870870870871, "grad_norm": 0.18009242415428162, "learning_rate": 9.775742085069715e-05, "loss": 0.5096, "step": 2390 }, { "epoch": 0.2876876876876877, "grad_norm": 0.17108169198036194, "learning_rate": 9.774809896926133e-05, "loss": 0.5274, "step": 2395 }, { "epoch": 0.2882882882882883, "grad_norm": 0.1928662210702896, "learning_rate": 9.773875819986007e-05, "loss": 0.5499, "step": 2400 }, { "epoch": 0.28888888888888886, "grad_norm": 0.23090901970863342, "learning_rate": 9.772939854618836e-05, "loss": 0.553, "step": 2405 }, { "epoch": 0.2894894894894895, "grad_norm": 0.18499891459941864, "learning_rate": 9.772002001194866e-05, "loss": 0.4899, "step": 2410 }, { "epoch": 0.29009009009009007, "grad_norm": 0.16356371343135834, "learning_rate": 9.771062260085089e-05, "loss": 0.4746, "step": 2415 }, { "epoch": 0.2906906906906907, "grad_norm": 0.16599981486797333, "learning_rate": 9.770120631661239e-05, "loss": 0.5046, "step": 2420 }, { "epoch": 0.2912912912912913, "grad_norm": 0.17277301847934723, "learning_rate": 9.769177116295805e-05, "loss": 0.5564, "step": 2425 }, { "epoch": 0.2918918918918919, "grad_norm": 0.16423299908638, "learning_rate": 9.768231714362015e-05, "loss": 0.494, "step": 2430 }, { "epoch": 0.2924924924924925, "grad_norm": 0.21444030106067657, "learning_rate": 9.767284426233849e-05, "loss": 0.4991, "step": 2435 }, { "epoch": 0.2930930930930931, "grad_norm": 0.22856414318084717, "learning_rate": 9.766335252286031e-05, "loss": 0.5258, "step": 2440 }, { "epoch": 0.2936936936936937, "grad_norm": 0.18524369597434998, "learning_rate": 9.765384192894031e-05, "loss": 0.4881, "step": 2445 }, { "epoch": 0.29429429429429427, "grad_norm": 0.19163531064987183, "learning_rate": 9.764431248434062e-05, "loss": 0.5674, "step": 2450 }, { "epoch": 0.2948948948948949, "grad_norm": 0.19218379259109497, "learning_rate": 9.763476419283086e-05, "loss": 0.6041, "step": 2455 }, { "epoch": 0.2954954954954955, "grad_norm": 0.17554765939712524, "learning_rate": 9.762519705818813e-05, "loss": 0.5599, "step": 2460 }, { "epoch": 0.2960960960960961, "grad_norm": 0.19090323150157928, "learning_rate": 9.761561108419691e-05, "loss": 0.5389, "step": 2465 }, { "epoch": 0.2966966966966967, "grad_norm": 0.20730207860469818, "learning_rate": 9.76060062746492e-05, "loss": 0.4687, "step": 2470 }, { "epoch": 0.2972972972972973, "grad_norm": 0.19577758014202118, "learning_rate": 9.75963826333444e-05, "loss": 0.5313, "step": 2475 }, { "epoch": 0.2978978978978979, "grad_norm": 0.22707220911979675, "learning_rate": 9.75867401640894e-05, "loss": 0.488, "step": 2480 }, { "epoch": 0.2984984984984985, "grad_norm": 0.19406184554100037, "learning_rate": 9.757707887069854e-05, "loss": 0.5086, "step": 2485 }, { "epoch": 0.2990990990990991, "grad_norm": 0.16421322524547577, "learning_rate": 9.756739875699354e-05, "loss": 0.4937, "step": 2490 }, { "epoch": 0.2996996996996997, "grad_norm": 0.20811273157596588, "learning_rate": 9.755769982680367e-05, "loss": 0.5175, "step": 2495 }, { "epoch": 0.3003003003003003, "grad_norm": 0.1713363230228424, "learning_rate": 9.754798208396554e-05, "loss": 0.444, "step": 2500 }, { "epoch": 0.3003003003003003, "eval_loss": 0.48717838525772095, "eval_runtime": 35.6296, "eval_samples_per_second": 22.453, "eval_steps_per_second": 5.613, "step": 2500 }, { "epoch": 0.3009009009009009, "grad_norm": 0.15711075067520142, "learning_rate": 9.753824553232327e-05, "loss": 0.4657, "step": 2505 }, { "epoch": 0.3015015015015015, "grad_norm": 0.18782839179039001, "learning_rate": 9.752849017572841e-05, "loss": 0.5125, "step": 2510 }, { "epoch": 0.3021021021021021, "grad_norm": 0.2134360820055008, "learning_rate": 9.751871601803993e-05, "loss": 0.4831, "step": 2515 }, { "epoch": 0.3027027027027027, "grad_norm": 0.21311652660369873, "learning_rate": 9.750892306312423e-05, "loss": 0.5408, "step": 2520 }, { "epoch": 0.3033033033033033, "grad_norm": 0.17872583866119385, "learning_rate": 9.749911131485516e-05, "loss": 0.5442, "step": 2525 }, { "epoch": 0.3039039039039039, "grad_norm": 0.22014202177524567, "learning_rate": 9.748928077711402e-05, "loss": 0.5266, "step": 2530 }, { "epoch": 0.3045045045045045, "grad_norm": 0.21009013056755066, "learning_rate": 9.74794314537895e-05, "loss": 0.501, "step": 2535 }, { "epoch": 0.3051051051051051, "grad_norm": 0.20648615062236786, "learning_rate": 9.74695633487778e-05, "loss": 0.4969, "step": 2540 }, { "epoch": 0.3057057057057057, "grad_norm": 0.18218368291854858, "learning_rate": 9.745967646598245e-05, "loss": 0.5197, "step": 2545 }, { "epoch": 0.3063063063063063, "grad_norm": 0.17514494061470032, "learning_rate": 9.744977080931448e-05, "loss": 0.4872, "step": 2550 }, { "epoch": 0.3069069069069069, "grad_norm": 0.19781029224395752, "learning_rate": 9.743984638269233e-05, "loss": 0.5385, "step": 2555 }, { "epoch": 0.3075075075075075, "grad_norm": 0.21139483153820038, "learning_rate": 9.742990319004182e-05, "loss": 0.5162, "step": 2560 }, { "epoch": 0.3081081081081081, "grad_norm": 0.2510957419872284, "learning_rate": 9.741994123529626e-05, "loss": 0.5231, "step": 2565 }, { "epoch": 0.3087087087087087, "grad_norm": 0.20459957420825958, "learning_rate": 9.740996052239635e-05, "loss": 0.4739, "step": 2570 }, { "epoch": 0.30930930930930933, "grad_norm": 0.18105819821357727, "learning_rate": 9.739996105529021e-05, "loss": 0.513, "step": 2575 }, { "epoch": 0.3099099099099099, "grad_norm": 0.20429784059524536, "learning_rate": 9.738994283793336e-05, "loss": 0.4921, "step": 2580 }, { "epoch": 0.3105105105105105, "grad_norm": 0.2193429321050644, "learning_rate": 9.737990587428881e-05, "loss": 0.5081, "step": 2585 }, { "epoch": 0.3111111111111111, "grad_norm": 0.22544243931770325, "learning_rate": 9.736985016832689e-05, "loss": 0.4762, "step": 2590 }, { "epoch": 0.3117117117117117, "grad_norm": 0.22182606160640717, "learning_rate": 9.735977572402541e-05, "loss": 0.5515, "step": 2595 }, { "epoch": 0.3123123123123123, "grad_norm": 0.2171258181333542, "learning_rate": 9.734968254536955e-05, "loss": 0.5324, "step": 2600 }, { "epoch": 0.3129129129129129, "grad_norm": 0.1789737343788147, "learning_rate": 9.733957063635196e-05, "loss": 0.4967, "step": 2605 }, { "epoch": 0.31351351351351353, "grad_norm": 0.18293219804763794, "learning_rate": 9.732944000097259e-05, "loss": 0.5441, "step": 2610 }, { "epoch": 0.3141141141141141, "grad_norm": 0.20862580835819244, "learning_rate": 9.731929064323895e-05, "loss": 0.4834, "step": 2615 }, { "epoch": 0.31471471471471474, "grad_norm": 0.1822856217622757, "learning_rate": 9.730912256716582e-05, "loss": 0.5704, "step": 2620 }, { "epoch": 0.3153153153153153, "grad_norm": 0.2051629275083542, "learning_rate": 9.729893577677547e-05, "loss": 0.5149, "step": 2625 }, { "epoch": 0.3159159159159159, "grad_norm": 0.20480480790138245, "learning_rate": 9.728873027609752e-05, "loss": 0.5412, "step": 2630 }, { "epoch": 0.3165165165165165, "grad_norm": 0.17666372656822205, "learning_rate": 9.727850606916902e-05, "loss": 0.5019, "step": 2635 }, { "epoch": 0.3171171171171171, "grad_norm": 0.19383026659488678, "learning_rate": 9.726826316003442e-05, "loss": 0.5269, "step": 2640 }, { "epoch": 0.31771771771771773, "grad_norm": 0.1739753782749176, "learning_rate": 9.725800155274556e-05, "loss": 0.4867, "step": 2645 }, { "epoch": 0.3183183183183183, "grad_norm": 0.18919163942337036, "learning_rate": 9.724772125136168e-05, "loss": 0.525, "step": 2650 }, { "epoch": 0.31891891891891894, "grad_norm": 0.2092183381319046, "learning_rate": 9.723742225994938e-05, "loss": 0.5567, "step": 2655 }, { "epoch": 0.3195195195195195, "grad_norm": 0.1984640508890152, "learning_rate": 9.722710458258276e-05, "loss": 0.4905, "step": 2660 }, { "epoch": 0.32012012012012014, "grad_norm": 0.21197611093521118, "learning_rate": 9.721676822334315e-05, "loss": 0.4638, "step": 2665 }, { "epoch": 0.3207207207207207, "grad_norm": 0.21106930077075958, "learning_rate": 9.72064131863194e-05, "loss": 0.5742, "step": 2670 }, { "epoch": 0.3213213213213213, "grad_norm": 0.1765323430299759, "learning_rate": 9.719603947560771e-05, "loss": 0.4783, "step": 2675 }, { "epoch": 0.3219219219219219, "grad_norm": 0.18311339616775513, "learning_rate": 9.718564709531167e-05, "loss": 0.4522, "step": 2680 }, { "epoch": 0.3225225225225225, "grad_norm": 0.19145925343036652, "learning_rate": 9.717523604954223e-05, "loss": 0.518, "step": 2685 }, { "epoch": 0.32312312312312313, "grad_norm": 0.19816061854362488, "learning_rate": 9.716480634241773e-05, "loss": 0.5503, "step": 2690 }, { "epoch": 0.3237237237237237, "grad_norm": 0.19963683187961578, "learning_rate": 9.715435797806395e-05, "loss": 0.4971, "step": 2695 }, { "epoch": 0.32432432432432434, "grad_norm": 0.18355773389339447, "learning_rate": 9.714389096061396e-05, "loss": 0.5028, "step": 2700 }, { "epoch": 0.3249249249249249, "grad_norm": 0.2459624856710434, "learning_rate": 9.713340529420826e-05, "loss": 0.5526, "step": 2705 }, { "epoch": 0.32552552552552555, "grad_norm": 0.2084140181541443, "learning_rate": 9.712290098299475e-05, "loss": 0.5147, "step": 2710 }, { "epoch": 0.3261261261261261, "grad_norm": 0.18029722571372986, "learning_rate": 9.711237803112865e-05, "loss": 0.5202, "step": 2715 }, { "epoch": 0.3267267267267267, "grad_norm": 0.16493675112724304, "learning_rate": 9.710183644277257e-05, "loss": 0.4528, "step": 2720 }, { "epoch": 0.32732732732732733, "grad_norm": 0.23521707952022552, "learning_rate": 9.709127622209652e-05, "loss": 0.5859, "step": 2725 }, { "epoch": 0.3279279279279279, "grad_norm": 0.1886672079563141, "learning_rate": 9.708069737327786e-05, "loss": 0.5038, "step": 2730 }, { "epoch": 0.32852852852852854, "grad_norm": 0.19265535473823547, "learning_rate": 9.707009990050131e-05, "loss": 0.4781, "step": 2735 }, { "epoch": 0.3291291291291291, "grad_norm": 0.1997983306646347, "learning_rate": 9.705948380795897e-05, "loss": 0.5192, "step": 2740 }, { "epoch": 0.32972972972972975, "grad_norm": 0.2557390034198761, "learning_rate": 9.704884909985031e-05, "loss": 0.495, "step": 2745 }, { "epoch": 0.3303303303303303, "grad_norm": 0.1663554459810257, "learning_rate": 9.703819578038216e-05, "loss": 0.5106, "step": 2750 }, { "epoch": 0.3303303303303303, "eval_loss": 0.4807363748550415, "eval_runtime": 35.697, "eval_samples_per_second": 22.411, "eval_steps_per_second": 5.603, "step": 2750 }, { "epoch": 0.33093093093093096, "grad_norm": 0.21156282722949982, "learning_rate": 9.70275238537687e-05, "loss": 0.5359, "step": 2755 }, { "epoch": 0.33153153153153153, "grad_norm": 0.19707150757312775, "learning_rate": 9.70168333242315e-05, "loss": 0.5322, "step": 2760 }, { "epoch": 0.3321321321321321, "grad_norm": 0.1987551748752594, "learning_rate": 9.700612419599943e-05, "loss": 0.5211, "step": 2765 }, { "epoch": 0.33273273273273274, "grad_norm": 0.1993408352136612, "learning_rate": 9.69953964733088e-05, "loss": 0.4963, "step": 2770 }, { "epoch": 0.3333333333333333, "grad_norm": 0.17729812860488892, "learning_rate": 9.69846501604032e-05, "loss": 0.5273, "step": 2775 }, { "epoch": 0.33393393393393395, "grad_norm": 0.2088416963815689, "learning_rate": 9.69738852615336e-05, "loss": 0.5219, "step": 2780 }, { "epoch": 0.3345345345345345, "grad_norm": 0.214419886469841, "learning_rate": 9.696310178095835e-05, "loss": 0.5371, "step": 2785 }, { "epoch": 0.33513513513513515, "grad_norm": 0.20148953795433044, "learning_rate": 9.695229972294314e-05, "loss": 0.5254, "step": 2790 }, { "epoch": 0.33573573573573573, "grad_norm": 0.21607989072799683, "learning_rate": 9.694147909176097e-05, "loss": 0.502, "step": 2795 }, { "epoch": 0.33633633633633636, "grad_norm": 0.22730080783367157, "learning_rate": 9.69306398916922e-05, "loss": 0.5142, "step": 2800 }, { "epoch": 0.33693693693693694, "grad_norm": 0.2318321019411087, "learning_rate": 9.691978212702459e-05, "loss": 0.558, "step": 2805 }, { "epoch": 0.3375375375375375, "grad_norm": 0.19326099753379822, "learning_rate": 9.690890580205318e-05, "loss": 0.5068, "step": 2810 }, { "epoch": 0.33813813813813814, "grad_norm": 0.22810117900371552, "learning_rate": 9.689801092108037e-05, "loss": 0.5672, "step": 2815 }, { "epoch": 0.3387387387387387, "grad_norm": 0.22707174718379974, "learning_rate": 9.688709748841591e-05, "loss": 0.4705, "step": 2820 }, { "epoch": 0.33933933933933935, "grad_norm": 0.24668732285499573, "learning_rate": 9.68761655083769e-05, "loss": 0.5207, "step": 2825 }, { "epoch": 0.33993993993993993, "grad_norm": 0.21590447425842285, "learning_rate": 9.686521498528774e-05, "loss": 0.4787, "step": 2830 }, { "epoch": 0.34054054054054056, "grad_norm": 0.20210616290569305, "learning_rate": 9.68542459234802e-05, "loss": 0.5231, "step": 2835 }, { "epoch": 0.34114114114114114, "grad_norm": 0.17560534179210663, "learning_rate": 9.684325832729335e-05, "loss": 0.4835, "step": 2840 }, { "epoch": 0.34174174174174177, "grad_norm": 0.17611800134181976, "learning_rate": 9.683225220107363e-05, "loss": 0.5273, "step": 2845 }, { "epoch": 0.34234234234234234, "grad_norm": 0.2211238294839859, "learning_rate": 9.682122754917479e-05, "loss": 0.493, "step": 2850 }, { "epoch": 0.3429429429429429, "grad_norm": 0.20786675810813904, "learning_rate": 9.681018437595789e-05, "loss": 0.5369, "step": 2855 }, { "epoch": 0.34354354354354355, "grad_norm": 0.2106047421693802, "learning_rate": 9.679912268579136e-05, "loss": 0.4923, "step": 2860 }, { "epoch": 0.3441441441441441, "grad_norm": 0.21731366217136383, "learning_rate": 9.678804248305091e-05, "loss": 0.559, "step": 2865 }, { "epoch": 0.34474474474474476, "grad_norm": 0.22532141208648682, "learning_rate": 9.67769437721196e-05, "loss": 0.5048, "step": 2870 }, { "epoch": 0.34534534534534533, "grad_norm": 0.23368632793426514, "learning_rate": 9.676582655738781e-05, "loss": 0.4887, "step": 2875 }, { "epoch": 0.34594594594594597, "grad_norm": 0.2270139902830124, "learning_rate": 9.675469084325324e-05, "loss": 0.5017, "step": 2880 }, { "epoch": 0.34654654654654654, "grad_norm": 0.22090162336826324, "learning_rate": 9.674353663412091e-05, "loss": 0.5136, "step": 2885 }, { "epoch": 0.3471471471471472, "grad_norm": 0.18990397453308105, "learning_rate": 9.67323639344031e-05, "loss": 0.498, "step": 2890 }, { "epoch": 0.34774774774774775, "grad_norm": 0.24817807972431183, "learning_rate": 9.672117274851952e-05, "loss": 0.5257, "step": 2895 }, { "epoch": 0.3483483483483483, "grad_norm": 0.23883749544620514, "learning_rate": 9.670996308089708e-05, "loss": 0.455, "step": 2900 }, { "epoch": 0.34894894894894896, "grad_norm": 0.21196013689041138, "learning_rate": 9.669873493597006e-05, "loss": 0.4734, "step": 2905 }, { "epoch": 0.34954954954954953, "grad_norm": 0.2275170087814331, "learning_rate": 9.668748831818005e-05, "loss": 0.4953, "step": 2910 }, { "epoch": 0.35015015015015016, "grad_norm": 0.20784740149974823, "learning_rate": 9.66762232319759e-05, "loss": 0.5077, "step": 2915 }, { "epoch": 0.35075075075075074, "grad_norm": 0.22956329584121704, "learning_rate": 9.666493968181383e-05, "loss": 0.4433, "step": 2920 }, { "epoch": 0.35135135135135137, "grad_norm": 0.24458478391170502, "learning_rate": 9.665363767215732e-05, "loss": 0.528, "step": 2925 }, { "epoch": 0.35195195195195195, "grad_norm": 0.21098071336746216, "learning_rate": 9.664231720747718e-05, "loss": 0.4942, "step": 2930 }, { "epoch": 0.3525525525525526, "grad_norm": 0.19882138073444366, "learning_rate": 9.663097829225148e-05, "loss": 0.4704, "step": 2935 }, { "epoch": 0.35315315315315315, "grad_norm": 0.21656285226345062, "learning_rate": 9.661962093096563e-05, "loss": 0.4909, "step": 2940 }, { "epoch": 0.35375375375375373, "grad_norm": 0.216679185628891, "learning_rate": 9.66082451281123e-05, "loss": 0.4855, "step": 2945 }, { "epoch": 0.35435435435435436, "grad_norm": 0.20689897239208221, "learning_rate": 9.659685088819152e-05, "loss": 0.4355, "step": 2950 }, { "epoch": 0.35495495495495494, "grad_norm": 0.22368694841861725, "learning_rate": 9.658543821571054e-05, "loss": 0.5387, "step": 2955 }, { "epoch": 0.35555555555555557, "grad_norm": 0.22052088379859924, "learning_rate": 9.657400711518394e-05, "loss": 0.5567, "step": 2960 }, { "epoch": 0.35615615615615615, "grad_norm": 0.2549915313720703, "learning_rate": 9.656255759113355e-05, "loss": 0.5356, "step": 2965 }, { "epoch": 0.3567567567567568, "grad_norm": 0.24571342766284943, "learning_rate": 9.655108964808857e-05, "loss": 0.5178, "step": 2970 }, { "epoch": 0.35735735735735735, "grad_norm": 0.2269880622625351, "learning_rate": 9.653960329058538e-05, "loss": 0.4883, "step": 2975 }, { "epoch": 0.357957957957958, "grad_norm": 0.23701144754886627, "learning_rate": 9.652809852316774e-05, "loss": 0.5237, "step": 2980 }, { "epoch": 0.35855855855855856, "grad_norm": 0.23912853002548218, "learning_rate": 9.651657535038663e-05, "loss": 0.5064, "step": 2985 }, { "epoch": 0.35915915915915914, "grad_norm": 0.22557851672172546, "learning_rate": 9.650503377680035e-05, "loss": 0.4864, "step": 2990 }, { "epoch": 0.35975975975975977, "grad_norm": 0.2128036916255951, "learning_rate": 9.649347380697445e-05, "loss": 0.5378, "step": 2995 }, { "epoch": 0.36036036036036034, "grad_norm": 0.23978100717067719, "learning_rate": 9.648189544548173e-05, "loss": 0.6153, "step": 3000 }, { "epoch": 0.36036036036036034, "eval_loss": 0.47244203090667725, "eval_runtime": 35.6635, "eval_samples_per_second": 22.432, "eval_steps_per_second": 5.608, "step": 3000 }, { "epoch": 0.360960960960961, "grad_norm": 0.22764725983142853, "learning_rate": 9.647029869690238e-05, "loss": 0.4915, "step": 3005 }, { "epoch": 0.36156156156156155, "grad_norm": 0.23097966611385345, "learning_rate": 9.645868356582373e-05, "loss": 0.4695, "step": 3010 }, { "epoch": 0.3621621621621622, "grad_norm": 0.20730188488960266, "learning_rate": 9.644705005684045e-05, "loss": 0.4611, "step": 3015 }, { "epoch": 0.36276276276276276, "grad_norm": 0.22593528032302856, "learning_rate": 9.643539817455448e-05, "loss": 0.4925, "step": 3020 }, { "epoch": 0.3633633633633634, "grad_norm": 0.18994325399398804, "learning_rate": 9.642372792357501e-05, "loss": 0.5304, "step": 3025 }, { "epoch": 0.36396396396396397, "grad_norm": 0.20703668892383575, "learning_rate": 9.64120393085185e-05, "loss": 0.4805, "step": 3030 }, { "epoch": 0.36456456456456454, "grad_norm": 0.20578399300575256, "learning_rate": 9.640033233400867e-05, "loss": 0.5288, "step": 3035 }, { "epoch": 0.3651651651651652, "grad_norm": 0.24444563686847687, "learning_rate": 9.638860700467652e-05, "loss": 0.494, "step": 3040 }, { "epoch": 0.36576576576576575, "grad_norm": 0.20720888674259186, "learning_rate": 9.637686332516029e-05, "loss": 0.5434, "step": 3045 }, { "epoch": 0.3663663663663664, "grad_norm": 0.2322540134191513, "learning_rate": 9.63651013001055e-05, "loss": 0.4814, "step": 3050 }, { "epoch": 0.36696696696696696, "grad_norm": 0.2209395170211792, "learning_rate": 9.635332093416491e-05, "loss": 0.4871, "step": 3055 }, { "epoch": 0.3675675675675676, "grad_norm": 0.242445707321167, "learning_rate": 9.634152223199855e-05, "loss": 0.4968, "step": 3060 }, { "epoch": 0.36816816816816816, "grad_norm": 0.2292807400226593, "learning_rate": 9.632970519827367e-05, "loss": 0.5409, "step": 3065 }, { "epoch": 0.3687687687687688, "grad_norm": 0.22164930403232574, "learning_rate": 9.631786983766482e-05, "loss": 0.4993, "step": 3070 }, { "epoch": 0.36936936936936937, "grad_norm": 0.21160635352134705, "learning_rate": 9.630601615485378e-05, "loss": 0.4895, "step": 3075 }, { "epoch": 0.36996996996996995, "grad_norm": 0.21484865248203278, "learning_rate": 9.629414415452954e-05, "loss": 0.4554, "step": 3080 }, { "epoch": 0.3705705705705706, "grad_norm": 0.19489717483520508, "learning_rate": 9.62822538413884e-05, "loss": 0.4545, "step": 3085 }, { "epoch": 0.37117117117117115, "grad_norm": 0.22546513378620148, "learning_rate": 9.627034522013386e-05, "loss": 0.4997, "step": 3090 }, { "epoch": 0.3717717717717718, "grad_norm": 0.22963769733905792, "learning_rate": 9.625841829547668e-05, "loss": 0.5092, "step": 3095 }, { "epoch": 0.37237237237237236, "grad_norm": 0.2154102623462677, "learning_rate": 9.624647307213485e-05, "loss": 0.4711, "step": 3100 }, { "epoch": 0.372972972972973, "grad_norm": 0.2301490157842636, "learning_rate": 9.623450955483363e-05, "loss": 0.4809, "step": 3105 }, { "epoch": 0.37357357357357357, "grad_norm": 0.2505705952644348, "learning_rate": 9.622252774830545e-05, "loss": 0.4854, "step": 3110 }, { "epoch": 0.3741741741741742, "grad_norm": 0.23982185125350952, "learning_rate": 9.621052765729006e-05, "loss": 0.5379, "step": 3115 }, { "epoch": 0.3747747747747748, "grad_norm": 0.24677838385105133, "learning_rate": 9.619850928653436e-05, "loss": 0.4954, "step": 3120 }, { "epoch": 0.37537537537537535, "grad_norm": 0.20734712481498718, "learning_rate": 9.618647264079253e-05, "loss": 0.4826, "step": 3125 }, { "epoch": 0.375975975975976, "grad_norm": 0.2506435513496399, "learning_rate": 9.617441772482598e-05, "loss": 0.4749, "step": 3130 }, { "epoch": 0.37657657657657656, "grad_norm": 0.2044931799173355, "learning_rate": 9.616234454340332e-05, "loss": 0.5429, "step": 3135 }, { "epoch": 0.3771771771771772, "grad_norm": 0.23018285632133484, "learning_rate": 9.615025310130044e-05, "loss": 0.5125, "step": 3140 }, { "epoch": 0.37777777777777777, "grad_norm": 0.2430395931005478, "learning_rate": 9.613814340330036e-05, "loss": 0.4755, "step": 3145 }, { "epoch": 0.3783783783783784, "grad_norm": 0.2155851274728775, "learning_rate": 9.612601545419342e-05, "loss": 0.4792, "step": 3150 }, { "epoch": 0.378978978978979, "grad_norm": 0.20493851602077484, "learning_rate": 9.611386925877711e-05, "loss": 0.5253, "step": 3155 }, { "epoch": 0.3795795795795796, "grad_norm": 0.2408454418182373, "learning_rate": 9.610170482185619e-05, "loss": 0.4542, "step": 3160 }, { "epoch": 0.3801801801801802, "grad_norm": 0.22176580131053925, "learning_rate": 9.608952214824257e-05, "loss": 0.4956, "step": 3165 }, { "epoch": 0.38078078078078076, "grad_norm": 0.22562333941459656, "learning_rate": 9.607732124275545e-05, "loss": 0.4627, "step": 3170 }, { "epoch": 0.3813813813813814, "grad_norm": 0.2003718614578247, "learning_rate": 9.60651021102212e-05, "loss": 0.3945, "step": 3175 }, { "epoch": 0.38198198198198197, "grad_norm": 0.24408352375030518, "learning_rate": 9.605286475547339e-05, "loss": 0.4303, "step": 3180 }, { "epoch": 0.3825825825825826, "grad_norm": 0.24553069472312927, "learning_rate": 9.604060918335283e-05, "loss": 0.4767, "step": 3185 }, { "epoch": 0.3831831831831832, "grad_norm": 0.23498015105724335, "learning_rate": 9.602833539870753e-05, "loss": 0.4424, "step": 3190 }, { "epoch": 0.3837837837837838, "grad_norm": 0.22736471891403198, "learning_rate": 9.601604340639265e-05, "loss": 0.4697, "step": 3195 }, { "epoch": 0.3843843843843844, "grad_norm": 0.23163831233978271, "learning_rate": 9.600373321127065e-05, "loss": 0.4637, "step": 3200 }, { "epoch": 0.384984984984985, "grad_norm": 0.27113768458366394, "learning_rate": 9.599140481821112e-05, "loss": 0.4072, "step": 3205 }, { "epoch": 0.3855855855855856, "grad_norm": 0.26473140716552734, "learning_rate": 9.597905823209086e-05, "loss": 0.5052, "step": 3210 }, { "epoch": 0.38618618618618616, "grad_norm": 0.23976100981235504, "learning_rate": 9.596669345779388e-05, "loss": 0.5035, "step": 3215 }, { "epoch": 0.3867867867867868, "grad_norm": 0.19085469841957092, "learning_rate": 9.595431050021135e-05, "loss": 0.4899, "step": 3220 }, { "epoch": 0.38738738738738737, "grad_norm": 0.23537762463092804, "learning_rate": 9.594190936424173e-05, "loss": 0.4674, "step": 3225 }, { "epoch": 0.387987987987988, "grad_norm": 0.2305203676223755, "learning_rate": 9.592949005479053e-05, "loss": 0.4776, "step": 3230 }, { "epoch": 0.3885885885885886, "grad_norm": 0.23646143078804016, "learning_rate": 9.591705257677054e-05, "loss": 0.4402, "step": 3235 }, { "epoch": 0.3891891891891892, "grad_norm": 0.2590693533420563, "learning_rate": 9.590459693510177e-05, "loss": 0.5002, "step": 3240 }, { "epoch": 0.3897897897897898, "grad_norm": 0.2276909500360489, "learning_rate": 9.58921231347113e-05, "loss": 0.4744, "step": 3245 }, { "epoch": 0.39039039039039036, "grad_norm": 0.24374452233314514, "learning_rate": 9.587963118053347e-05, "loss": 0.5043, "step": 3250 }, { "epoch": 0.39039039039039036, "eval_loss": 0.4580923318862915, "eval_runtime": 35.5771, "eval_samples_per_second": 22.486, "eval_steps_per_second": 5.622, "step": 3250 }, { "epoch": 0.390990990990991, "grad_norm": 0.21707071363925934, "learning_rate": 9.586712107750982e-05, "loss": 0.5061, "step": 3255 }, { "epoch": 0.39159159159159157, "grad_norm": 0.28769123554229736, "learning_rate": 9.5854592830589e-05, "loss": 0.4688, "step": 3260 }, { "epoch": 0.3921921921921922, "grad_norm": 0.2628069221973419, "learning_rate": 9.584204644472688e-05, "loss": 0.4614, "step": 3265 }, { "epoch": 0.3927927927927928, "grad_norm": 0.2501368820667267, "learning_rate": 9.582948192488652e-05, "loss": 0.4461, "step": 3270 }, { "epoch": 0.3933933933933934, "grad_norm": 0.2464858740568161, "learning_rate": 9.581689927603812e-05, "loss": 0.4968, "step": 3275 }, { "epoch": 0.393993993993994, "grad_norm": 0.21707935631275177, "learning_rate": 9.580429850315906e-05, "loss": 0.4345, "step": 3280 }, { "epoch": 0.3945945945945946, "grad_norm": 0.1674998700618744, "learning_rate": 9.57916796112339e-05, "loss": 0.4617, "step": 3285 }, { "epoch": 0.3951951951951952, "grad_norm": 0.2718343436717987, "learning_rate": 9.577904260525436e-05, "loss": 0.4988, "step": 3290 }, { "epoch": 0.39579579579579577, "grad_norm": 0.23669962584972382, "learning_rate": 9.576638749021933e-05, "loss": 0.4525, "step": 3295 }, { "epoch": 0.3963963963963964, "grad_norm": 0.278176486492157, "learning_rate": 9.575371427113484e-05, "loss": 0.5211, "step": 3300 }, { "epoch": 0.396996996996997, "grad_norm": 0.19819344580173492, "learning_rate": 9.574102295301414e-05, "loss": 0.4779, "step": 3305 }, { "epoch": 0.3975975975975976, "grad_norm": 0.22306062281131744, "learning_rate": 9.572831354087756e-05, "loss": 0.4847, "step": 3310 }, { "epoch": 0.3981981981981982, "grad_norm": 0.23362375795841217, "learning_rate": 9.571558603975266e-05, "loss": 0.4081, "step": 3315 }, { "epoch": 0.3987987987987988, "grad_norm": 0.21090473234653473, "learning_rate": 9.570284045467412e-05, "loss": 0.4819, "step": 3320 }, { "epoch": 0.3993993993993994, "grad_norm": 0.23887136578559875, "learning_rate": 9.569007679068376e-05, "loss": 0.4929, "step": 3325 }, { "epoch": 0.4, "grad_norm": 0.23648279905319214, "learning_rate": 9.567729505283057e-05, "loss": 0.4871, "step": 3330 }, { "epoch": 0.4006006006006006, "grad_norm": 0.2086629420518875, "learning_rate": 9.566449524617069e-05, "loss": 0.4621, "step": 3335 }, { "epoch": 0.4012012012012012, "grad_norm": 0.20192669332027435, "learning_rate": 9.565167737576744e-05, "loss": 0.4218, "step": 3340 }, { "epoch": 0.4018018018018018, "grad_norm": 0.2431265413761139, "learning_rate": 9.563884144669122e-05, "loss": 0.5005, "step": 3345 }, { "epoch": 0.4024024024024024, "grad_norm": 0.30352410674095154, "learning_rate": 9.56259874640196e-05, "loss": 0.5186, "step": 3350 }, { "epoch": 0.403003003003003, "grad_norm": 0.27248936891555786, "learning_rate": 9.561311543283733e-05, "loss": 0.5055, "step": 3355 }, { "epoch": 0.4036036036036036, "grad_norm": 0.3054885268211365, "learning_rate": 9.560022535823623e-05, "loss": 0.4541, "step": 3360 }, { "epoch": 0.4042042042042042, "grad_norm": 0.24470002949237823, "learning_rate": 9.558731724531531e-05, "loss": 0.4806, "step": 3365 }, { "epoch": 0.4048048048048048, "grad_norm": 0.22992166876792908, "learning_rate": 9.55743910991807e-05, "loss": 0.4664, "step": 3370 }, { "epoch": 0.40540540540540543, "grad_norm": 0.2561578154563904, "learning_rate": 9.556144692494568e-05, "loss": 0.5623, "step": 3375 }, { "epoch": 0.406006006006006, "grad_norm": 0.25261953473091125, "learning_rate": 9.554848472773061e-05, "loss": 0.4414, "step": 3380 }, { "epoch": 0.4066066066066066, "grad_norm": 0.21225038170814514, "learning_rate": 9.553550451266304e-05, "loss": 0.4548, "step": 3385 }, { "epoch": 0.4072072072072072, "grad_norm": 0.20666995644569397, "learning_rate": 9.552250628487761e-05, "loss": 0.4819, "step": 3390 }, { "epoch": 0.4078078078078078, "grad_norm": 0.2576189339160919, "learning_rate": 9.55094900495161e-05, "loss": 0.4577, "step": 3395 }, { "epoch": 0.4084084084084084, "grad_norm": 0.2355472445487976, "learning_rate": 9.549645581172741e-05, "loss": 0.4501, "step": 3400 }, { "epoch": 0.409009009009009, "grad_norm": 0.2224271148443222, "learning_rate": 9.548340357666759e-05, "loss": 0.4815, "step": 3405 }, { "epoch": 0.4096096096096096, "grad_norm": 0.2793062627315521, "learning_rate": 9.547033334949972e-05, "loss": 0.4471, "step": 3410 }, { "epoch": 0.4102102102102102, "grad_norm": 0.2566879689693451, "learning_rate": 9.545724513539411e-05, "loss": 0.5139, "step": 3415 }, { "epoch": 0.41081081081081083, "grad_norm": 0.22943030297756195, "learning_rate": 9.54441389395281e-05, "loss": 0.482, "step": 3420 }, { "epoch": 0.4114114114114114, "grad_norm": 0.22264313697814941, "learning_rate": 9.54310147670862e-05, "loss": 0.4776, "step": 3425 }, { "epoch": 0.412012012012012, "grad_norm": 0.2629396915435791, "learning_rate": 9.541787262326001e-05, "loss": 0.4684, "step": 3430 }, { "epoch": 0.4126126126126126, "grad_norm": 0.22405704855918884, "learning_rate": 9.540471251324821e-05, "loss": 0.4368, "step": 3435 }, { "epoch": 0.4132132132132132, "grad_norm": 0.23231935501098633, "learning_rate": 9.539153444225665e-05, "loss": 0.5391, "step": 3440 }, { "epoch": 0.4138138138138138, "grad_norm": 0.25214463472366333, "learning_rate": 9.537833841549821e-05, "loss": 0.4723, "step": 3445 }, { "epoch": 0.4144144144144144, "grad_norm": 0.250463604927063, "learning_rate": 9.536512443819294e-05, "loss": 0.451, "step": 3450 }, { "epoch": 0.41501501501501503, "grad_norm": 0.24605700373649597, "learning_rate": 9.535189251556795e-05, "loss": 0.5072, "step": 3455 }, { "epoch": 0.4156156156156156, "grad_norm": 0.2573533356189728, "learning_rate": 9.533864265285746e-05, "loss": 0.4928, "step": 3460 }, { "epoch": 0.41621621621621624, "grad_norm": 0.22676625847816467, "learning_rate": 9.532537485530279e-05, "loss": 0.4481, "step": 3465 }, { "epoch": 0.4168168168168168, "grad_norm": 0.2723495066165924, "learning_rate": 9.531208912815235e-05, "loss": 0.5058, "step": 3470 }, { "epoch": 0.4174174174174174, "grad_norm": 0.2828966975212097, "learning_rate": 9.529878547666164e-05, "loss": 0.4567, "step": 3475 }, { "epoch": 0.418018018018018, "grad_norm": 0.23241691291332245, "learning_rate": 9.528546390609329e-05, "loss": 0.5008, "step": 3480 }, { "epoch": 0.4186186186186186, "grad_norm": 0.21605411171913147, "learning_rate": 9.527212442171694e-05, "loss": 0.4309, "step": 3485 }, { "epoch": 0.41921921921921923, "grad_norm": 0.2571276128292084, "learning_rate": 9.525876702880937e-05, "loss": 0.517, "step": 3490 }, { "epoch": 0.4198198198198198, "grad_norm": 0.317888468503952, "learning_rate": 9.524539173265444e-05, "loss": 0.4841, "step": 3495 }, { "epoch": 0.42042042042042044, "grad_norm": 0.2505663335323334, "learning_rate": 9.52319985385431e-05, "loss": 0.4586, "step": 3500 }, { "epoch": 0.42042042042042044, "eval_loss": 0.4424484968185425, "eval_runtime": 35.6127, "eval_samples_per_second": 22.464, "eval_steps_per_second": 5.616, "step": 3500 }, { "epoch": 0.421021021021021, "grad_norm": 0.25227388739585876, "learning_rate": 9.521858745177332e-05, "loss": 0.4151, "step": 3505 }, { "epoch": 0.42162162162162165, "grad_norm": 0.24687834084033966, "learning_rate": 9.520515847765025e-05, "loss": 0.4532, "step": 3510 }, { "epoch": 0.4222222222222222, "grad_norm": 0.24203993380069733, "learning_rate": 9.519171162148604e-05, "loss": 0.4706, "step": 3515 }, { "epoch": 0.4228228228228228, "grad_norm": 0.2538060247898102, "learning_rate": 9.517824688859991e-05, "loss": 0.4953, "step": 3520 }, { "epoch": 0.42342342342342343, "grad_norm": 0.22765670716762543, "learning_rate": 9.516476428431819e-05, "loss": 0.474, "step": 3525 }, { "epoch": 0.424024024024024, "grad_norm": 0.266476571559906, "learning_rate": 9.515126381397429e-05, "loss": 0.4863, "step": 3530 }, { "epoch": 0.42462462462462464, "grad_norm": 0.2860608696937561, "learning_rate": 9.513774548290862e-05, "loss": 0.497, "step": 3535 }, { "epoch": 0.4252252252252252, "grad_norm": 0.30343401432037354, "learning_rate": 9.51242092964687e-05, "loss": 0.4592, "step": 3540 }, { "epoch": 0.42582582582582584, "grad_norm": 0.30189409852027893, "learning_rate": 9.511065526000915e-05, "loss": 0.4557, "step": 3545 }, { "epoch": 0.4264264264264264, "grad_norm": 0.2727750241756439, "learning_rate": 9.509708337889159e-05, "loss": 0.4625, "step": 3550 }, { "epoch": 0.42702702702702705, "grad_norm": 0.27454304695129395, "learning_rate": 9.50834936584847e-05, "loss": 0.5036, "step": 3555 }, { "epoch": 0.4276276276276276, "grad_norm": 0.2149980366230011, "learning_rate": 9.506988610416425e-05, "loss": 0.4307, "step": 3560 }, { "epoch": 0.4282282282282282, "grad_norm": 0.30365270376205444, "learning_rate": 9.505626072131306e-05, "loss": 0.4864, "step": 3565 }, { "epoch": 0.42882882882882883, "grad_norm": 0.24695612490177155, "learning_rate": 9.5042617515321e-05, "loss": 0.4211, "step": 3570 }, { "epoch": 0.4294294294294294, "grad_norm": 0.2488124817609787, "learning_rate": 9.502895649158496e-05, "loss": 0.4793, "step": 3575 }, { "epoch": 0.43003003003003004, "grad_norm": 0.21579088270664215, "learning_rate": 9.501527765550893e-05, "loss": 0.4628, "step": 3580 }, { "epoch": 0.4306306306306306, "grad_norm": 0.29319724440574646, "learning_rate": 9.500158101250389e-05, "loss": 0.5088, "step": 3585 }, { "epoch": 0.43123123123123125, "grad_norm": 0.2599899172782898, "learning_rate": 9.498786656798793e-05, "loss": 0.4622, "step": 3590 }, { "epoch": 0.4318318318318318, "grad_norm": 0.229834645986557, "learning_rate": 9.497413432738612e-05, "loss": 0.4755, "step": 3595 }, { "epoch": 0.43243243243243246, "grad_norm": 0.2928219735622406, "learning_rate": 9.496038429613056e-05, "loss": 0.4545, "step": 3600 }, { "epoch": 0.43303303303303303, "grad_norm": 0.25314855575561523, "learning_rate": 9.49466164796605e-05, "loss": 0.4667, "step": 3605 }, { "epoch": 0.4336336336336336, "grad_norm": 0.2712530791759491, "learning_rate": 9.493283088342209e-05, "loss": 0.4864, "step": 3610 }, { "epoch": 0.43423423423423424, "grad_norm": 0.24743470549583435, "learning_rate": 9.491902751286857e-05, "loss": 0.4741, "step": 3615 }, { "epoch": 0.4348348348348348, "grad_norm": 0.2904561460018158, "learning_rate": 9.490520637346026e-05, "loss": 0.5047, "step": 3620 }, { "epoch": 0.43543543543543545, "grad_norm": 0.23873092234134674, "learning_rate": 9.489136747066441e-05, "loss": 0.4633, "step": 3625 }, { "epoch": 0.436036036036036, "grad_norm": 0.27880802750587463, "learning_rate": 9.487751080995535e-05, "loss": 0.4693, "step": 3630 }, { "epoch": 0.43663663663663665, "grad_norm": 0.28825628757476807, "learning_rate": 9.486363639681447e-05, "loss": 0.4463, "step": 3635 }, { "epoch": 0.43723723723723723, "grad_norm": 0.2847222685813904, "learning_rate": 9.48497442367301e-05, "loss": 0.4602, "step": 3640 }, { "epoch": 0.43783783783783786, "grad_norm": 0.25332358479499817, "learning_rate": 9.483583433519769e-05, "loss": 0.4772, "step": 3645 }, { "epoch": 0.43843843843843844, "grad_norm": 0.25083234906196594, "learning_rate": 9.482190669771958e-05, "loss": 0.3913, "step": 3650 }, { "epoch": 0.439039039039039, "grad_norm": 0.2363145500421524, "learning_rate": 9.480796132980526e-05, "loss": 0.4521, "step": 3655 }, { "epoch": 0.43963963963963965, "grad_norm": 0.21178728342056274, "learning_rate": 9.479399823697115e-05, "loss": 0.4516, "step": 3660 }, { "epoch": 0.4402402402402402, "grad_norm": 0.3048122823238373, "learning_rate": 9.47800174247407e-05, "loss": 0.4607, "step": 3665 }, { "epoch": 0.44084084084084085, "grad_norm": 0.30136120319366455, "learning_rate": 9.476601889864436e-05, "loss": 0.4843, "step": 3670 }, { "epoch": 0.44144144144144143, "grad_norm": 0.2415563464164734, "learning_rate": 9.475200266421962e-05, "loss": 0.4694, "step": 3675 }, { "epoch": 0.44204204204204206, "grad_norm": 0.25498002767562866, "learning_rate": 9.473796872701097e-05, "loss": 0.4571, "step": 3680 }, { "epoch": 0.44264264264264264, "grad_norm": 0.2813263237476349, "learning_rate": 9.472391709256986e-05, "loss": 0.4851, "step": 3685 }, { "epoch": 0.44324324324324327, "grad_norm": 0.23250964283943176, "learning_rate": 9.470984776645478e-05, "loss": 0.4642, "step": 3690 }, { "epoch": 0.44384384384384384, "grad_norm": 0.2851886451244354, "learning_rate": 9.469576075423119e-05, "loss": 0.5034, "step": 3695 }, { "epoch": 0.4444444444444444, "grad_norm": 0.22036044299602509, "learning_rate": 9.468165606147158e-05, "loss": 0.4534, "step": 3700 }, { "epoch": 0.44504504504504505, "grad_norm": 0.22608880698680878, "learning_rate": 9.466753369375544e-05, "loss": 0.4113, "step": 3705 }, { "epoch": 0.4456456456456456, "grad_norm": 0.28404590487480164, "learning_rate": 9.465339365666918e-05, "loss": 0.4374, "step": 3710 }, { "epoch": 0.44624624624624626, "grad_norm": 0.29327112436294556, "learning_rate": 9.463923595580628e-05, "loss": 0.4706, "step": 3715 }, { "epoch": 0.44684684684684683, "grad_norm": 0.2675343453884125, "learning_rate": 9.462506059676717e-05, "loss": 0.4486, "step": 3720 }, { "epoch": 0.44744744744744747, "grad_norm": 0.2927496135234833, "learning_rate": 9.461086758515926e-05, "loss": 0.4567, "step": 3725 }, { "epoch": 0.44804804804804804, "grad_norm": 0.2421569973230362, "learning_rate": 9.459665692659698e-05, "loss": 0.4276, "step": 3730 }, { "epoch": 0.4486486486486487, "grad_norm": 0.2948514521121979, "learning_rate": 9.458242862670169e-05, "loss": 0.4773, "step": 3735 }, { "epoch": 0.44924924924924925, "grad_norm": 0.2598010003566742, "learning_rate": 9.456818269110176e-05, "loss": 0.4914, "step": 3740 }, { "epoch": 0.4498498498498498, "grad_norm": 0.23903946578502655, "learning_rate": 9.455391912543252e-05, "loss": 0.4357, "step": 3745 }, { "epoch": 0.45045045045045046, "grad_norm": 0.251955509185791, "learning_rate": 9.453963793533631e-05, "loss": 0.4996, "step": 3750 }, { "epoch": 0.45045045045045046, "eval_loss": 0.4331132173538208, "eval_runtime": 35.6334, "eval_samples_per_second": 22.451, "eval_steps_per_second": 5.613, "step": 3750 }, { "epoch": 0.45105105105105103, "grad_norm": 0.31112930178642273, "learning_rate": 9.452533912646239e-05, "loss": 0.4731, "step": 3755 }, { "epoch": 0.45165165165165166, "grad_norm": 0.2766417860984802, "learning_rate": 9.451102270446703e-05, "loss": 0.4503, "step": 3760 }, { "epoch": 0.45225225225225224, "grad_norm": 0.25546813011169434, "learning_rate": 9.449668867501343e-05, "loss": 0.4644, "step": 3765 }, { "epoch": 0.45285285285285287, "grad_norm": 0.24376080930233002, "learning_rate": 9.44823370437718e-05, "loss": 0.453, "step": 3770 }, { "epoch": 0.45345345345345345, "grad_norm": 0.29174941778182983, "learning_rate": 9.44679678164193e-05, "loss": 0.4494, "step": 3775 }, { "epoch": 0.4540540540540541, "grad_norm": 0.27715715765953064, "learning_rate": 9.445358099863998e-05, "loss": 0.386, "step": 3780 }, { "epoch": 0.45465465465465466, "grad_norm": 0.2590334415435791, "learning_rate": 9.443917659612499e-05, "loss": 0.4672, "step": 3785 }, { "epoch": 0.45525525525525523, "grad_norm": 0.30765074491500854, "learning_rate": 9.44247546145723e-05, "loss": 0.4694, "step": 3790 }, { "epoch": 0.45585585585585586, "grad_norm": 0.30974051356315613, "learning_rate": 9.441031505968692e-05, "loss": 0.4396, "step": 3795 }, { "epoch": 0.45645645645645644, "grad_norm": 0.2814948260784149, "learning_rate": 9.439585793718075e-05, "loss": 0.4841, "step": 3800 }, { "epoch": 0.45705705705705707, "grad_norm": 0.2723504304885864, "learning_rate": 9.438138325277269e-05, "loss": 0.4723, "step": 3805 }, { "epoch": 0.45765765765765765, "grad_norm": 0.2845655679702759, "learning_rate": 9.436689101218856e-05, "loss": 0.4633, "step": 3810 }, { "epoch": 0.4582582582582583, "grad_norm": 0.2787138521671295, "learning_rate": 9.435238122116112e-05, "loss": 0.4433, "step": 3815 }, { "epoch": 0.45885885885885885, "grad_norm": 0.28986725211143494, "learning_rate": 9.433785388543012e-05, "loss": 0.4294, "step": 3820 }, { "epoch": 0.4594594594594595, "grad_norm": 0.26881736516952515, "learning_rate": 9.432330901074218e-05, "loss": 0.4476, "step": 3825 }, { "epoch": 0.46006006006006006, "grad_norm": 0.23753543198108673, "learning_rate": 9.430874660285092e-05, "loss": 0.4544, "step": 3830 }, { "epoch": 0.46066066066066064, "grad_norm": 0.24957028031349182, "learning_rate": 9.429416666751683e-05, "loss": 0.4322, "step": 3835 }, { "epoch": 0.46126126126126127, "grad_norm": 0.28791627287864685, "learning_rate": 9.42795692105074e-05, "loss": 0.4256, "step": 3840 }, { "epoch": 0.46186186186186184, "grad_norm": 0.2748839855194092, "learning_rate": 9.4264954237597e-05, "loss": 0.4783, "step": 3845 }, { "epoch": 0.4624624624624625, "grad_norm": 0.2614935636520386, "learning_rate": 9.425032175456699e-05, "loss": 0.447, "step": 3850 }, { "epoch": 0.46306306306306305, "grad_norm": 0.27256256341934204, "learning_rate": 9.423567176720558e-05, "loss": 0.4418, "step": 3855 }, { "epoch": 0.4636636636636637, "grad_norm": 0.2694026231765747, "learning_rate": 9.422100428130797e-05, "loss": 0.4641, "step": 3860 }, { "epoch": 0.46426426426426426, "grad_norm": 0.3331802785396576, "learning_rate": 9.420631930267623e-05, "loss": 0.4657, "step": 3865 }, { "epoch": 0.4648648648648649, "grad_norm": 0.24749064445495605, "learning_rate": 9.41916168371194e-05, "loss": 0.4974, "step": 3870 }, { "epoch": 0.46546546546546547, "grad_norm": 0.25842297077178955, "learning_rate": 9.417689689045337e-05, "loss": 0.4185, "step": 3875 }, { "epoch": 0.46606606606606604, "grad_norm": 0.25025177001953125, "learning_rate": 9.416215946850104e-05, "loss": 0.4707, "step": 3880 }, { "epoch": 0.4666666666666667, "grad_norm": 0.24062250554561615, "learning_rate": 9.414740457709213e-05, "loss": 0.4227, "step": 3885 }, { "epoch": 0.46726726726726725, "grad_norm": 0.29786667227745056, "learning_rate": 9.41326322220633e-05, "loss": 0.4499, "step": 3890 }, { "epoch": 0.4678678678678679, "grad_norm": 0.2728450298309326, "learning_rate": 9.411784240925818e-05, "loss": 0.404, "step": 3895 }, { "epoch": 0.46846846846846846, "grad_norm": 0.25382304191589355, "learning_rate": 9.410303514452721e-05, "loss": 0.4989, "step": 3900 }, { "epoch": 0.4690690690690691, "grad_norm": 0.2877455949783325, "learning_rate": 9.408821043372777e-05, "loss": 0.4942, "step": 3905 }, { "epoch": 0.46966966966966966, "grad_norm": 0.27008742094039917, "learning_rate": 9.40733682827242e-05, "loss": 0.4847, "step": 3910 }, { "epoch": 0.4702702702702703, "grad_norm": 0.2753797471523285, "learning_rate": 9.405850869738764e-05, "loss": 0.4571, "step": 3915 }, { "epoch": 0.4708708708708709, "grad_norm": 0.30681371688842773, "learning_rate": 9.40436316835962e-05, "loss": 0.4697, "step": 3920 }, { "epoch": 0.47147147147147145, "grad_norm": 0.2850700616836548, "learning_rate": 9.402873724723483e-05, "loss": 0.4297, "step": 3925 }, { "epoch": 0.4720720720720721, "grad_norm": 0.2790954113006592, "learning_rate": 9.401382539419544e-05, "loss": 0.4217, "step": 3930 }, { "epoch": 0.47267267267267266, "grad_norm": 0.2670901119709015, "learning_rate": 9.399889613037675e-05, "loss": 0.4696, "step": 3935 }, { "epoch": 0.4732732732732733, "grad_norm": 0.3313315808773041, "learning_rate": 9.398394946168443e-05, "loss": 0.5239, "step": 3940 }, { "epoch": 0.47387387387387386, "grad_norm": 0.2873570919036865, "learning_rate": 9.396898539403101e-05, "loss": 0.4921, "step": 3945 }, { "epoch": 0.4744744744744745, "grad_norm": 0.28026047348976135, "learning_rate": 9.395400393333589e-05, "loss": 0.4495, "step": 3950 }, { "epoch": 0.47507507507507507, "grad_norm": 0.29478156566619873, "learning_rate": 9.393900508552538e-05, "loss": 0.4111, "step": 3955 }, { "epoch": 0.4756756756756757, "grad_norm": 0.2536068558692932, "learning_rate": 9.392398885653266e-05, "loss": 0.4605, "step": 3960 }, { "epoch": 0.4762762762762763, "grad_norm": 0.27553731203079224, "learning_rate": 9.390895525229775e-05, "loss": 0.4648, "step": 3965 }, { "epoch": 0.47687687687687685, "grad_norm": 0.2897990643978119, "learning_rate": 9.38939042787676e-05, "loss": 0.4725, "step": 3970 }, { "epoch": 0.4774774774774775, "grad_norm": 0.2889381945133209, "learning_rate": 9.3878835941896e-05, "loss": 0.4532, "step": 3975 }, { "epoch": 0.47807807807807806, "grad_norm": 0.33516576886177063, "learning_rate": 9.386375024764358e-05, "loss": 0.4689, "step": 3980 }, { "epoch": 0.4786786786786787, "grad_norm": 0.2570089101791382, "learning_rate": 9.38486472019779e-05, "loss": 0.4283, "step": 3985 }, { "epoch": 0.47927927927927927, "grad_norm": 0.2153373807668686, "learning_rate": 9.383352681087333e-05, "loss": 0.4194, "step": 3990 }, { "epoch": 0.4798798798798799, "grad_norm": 0.26292797923088074, "learning_rate": 9.381838908031116e-05, "loss": 0.4445, "step": 3995 }, { "epoch": 0.4804804804804805, "grad_norm": 0.26263949275016785, "learning_rate": 9.380323401627944e-05, "loss": 0.4583, "step": 4000 }, { "epoch": 0.4804804804804805, "eval_loss": 0.4252912998199463, "eval_runtime": 35.577, "eval_samples_per_second": 22.486, "eval_steps_per_second": 5.622, "step": 4000 }, { "epoch": 0.4810810810810811, "grad_norm": 0.294264018535614, "learning_rate": 9.378806162477319e-05, "loss": 0.5011, "step": 4005 }, { "epoch": 0.4816816816816817, "grad_norm": 0.25689366459846497, "learning_rate": 9.37728719117942e-05, "loss": 0.4251, "step": 4010 }, { "epoch": 0.48228228228228226, "grad_norm": 0.26754069328308105, "learning_rate": 9.375766488335117e-05, "loss": 0.4507, "step": 4015 }, { "epoch": 0.4828828828828829, "grad_norm": 0.23651185631752014, "learning_rate": 9.37424405454596e-05, "loss": 0.3836, "step": 4020 }, { "epoch": 0.48348348348348347, "grad_norm": 0.2558950185775757, "learning_rate": 9.372719890414187e-05, "loss": 0.4634, "step": 4025 }, { "epoch": 0.4840840840840841, "grad_norm": 0.24600175023078918, "learning_rate": 9.371193996542721e-05, "loss": 0.4459, "step": 4030 }, { "epoch": 0.4846846846846847, "grad_norm": 0.2958679795265198, "learning_rate": 9.369666373535169e-05, "loss": 0.4254, "step": 4035 }, { "epoch": 0.4852852852852853, "grad_norm": 0.23160912096500397, "learning_rate": 9.368137021995815e-05, "loss": 0.4714, "step": 4040 }, { "epoch": 0.4858858858858859, "grad_norm": 0.2944106459617615, "learning_rate": 9.366605942529637e-05, "loss": 0.4583, "step": 4045 }, { "epoch": 0.4864864864864865, "grad_norm": 0.32667234539985657, "learning_rate": 9.36507313574229e-05, "loss": 0.4697, "step": 4050 }, { "epoch": 0.4870870870870871, "grad_norm": 0.2736036479473114, "learning_rate": 9.363538602240119e-05, "loss": 0.4414, "step": 4055 }, { "epoch": 0.48768768768768767, "grad_norm": 0.27527615427970886, "learning_rate": 9.36200234263014e-05, "loss": 0.4284, "step": 4060 }, { "epoch": 0.4882882882882883, "grad_norm": 0.3154074251651764, "learning_rate": 9.360464357520067e-05, "loss": 0.5026, "step": 4065 }, { "epoch": 0.4888888888888889, "grad_norm": 0.24428270757198334, "learning_rate": 9.358924647518282e-05, "loss": 0.404, "step": 4070 }, { "epoch": 0.4894894894894895, "grad_norm": 0.31537002325057983, "learning_rate": 9.357383213233861e-05, "loss": 0.4414, "step": 4075 }, { "epoch": 0.4900900900900901, "grad_norm": 0.27242958545684814, "learning_rate": 9.355840055276556e-05, "loss": 0.459, "step": 4080 }, { "epoch": 0.4906906906906907, "grad_norm": 0.26572486758232117, "learning_rate": 9.354295174256801e-05, "loss": 0.4567, "step": 4085 }, { "epoch": 0.4912912912912913, "grad_norm": 0.27613312005996704, "learning_rate": 9.352748570785713e-05, "loss": 0.4078, "step": 4090 }, { "epoch": 0.4918918918918919, "grad_norm": 0.28778305649757385, "learning_rate": 9.351200245475089e-05, "loss": 0.4763, "step": 4095 }, { "epoch": 0.4924924924924925, "grad_norm": 0.2761375606060028, "learning_rate": 9.349650198937411e-05, "loss": 0.3981, "step": 4100 }, { "epoch": 0.49309309309309307, "grad_norm": 0.2764434218406677, "learning_rate": 9.348098431785837e-05, "loss": 0.386, "step": 4105 }, { "epoch": 0.4936936936936937, "grad_norm": 0.30036839842796326, "learning_rate": 9.34654494463421e-05, "loss": 0.4823, "step": 4110 }, { "epoch": 0.4942942942942943, "grad_norm": 0.30533745884895325, "learning_rate": 9.344989738097047e-05, "loss": 0.4344, "step": 4115 }, { "epoch": 0.4948948948948949, "grad_norm": 0.36635449528694153, "learning_rate": 9.343432812789551e-05, "loss": 0.4805, "step": 4120 }, { "epoch": 0.4954954954954955, "grad_norm": 0.33037155866622925, "learning_rate": 9.341874169327604e-05, "loss": 0.4222, "step": 4125 }, { "epoch": 0.4960960960960961, "grad_norm": 0.31396031379699707, "learning_rate": 9.340313808327768e-05, "loss": 0.4369, "step": 4130 }, { "epoch": 0.4966966966966967, "grad_norm": 0.32045847177505493, "learning_rate": 9.338751730407278e-05, "loss": 0.4751, "step": 4135 }, { "epoch": 0.4972972972972973, "grad_norm": 0.280370831489563, "learning_rate": 9.33718793618406e-05, "loss": 0.4844, "step": 4140 }, { "epoch": 0.4978978978978979, "grad_norm": 0.26197347044944763, "learning_rate": 9.335622426276707e-05, "loss": 0.4052, "step": 4145 }, { "epoch": 0.4984984984984985, "grad_norm": 0.24671530723571777, "learning_rate": 9.334055201304499e-05, "loss": 0.4077, "step": 4150 }, { "epoch": 0.4990990990990991, "grad_norm": 0.3186667263507843, "learning_rate": 9.332486261887388e-05, "loss": 0.4672, "step": 4155 }, { "epoch": 0.4996996996996997, "grad_norm": 0.3468276262283325, "learning_rate": 9.330915608646012e-05, "loss": 0.4303, "step": 4160 }, { "epoch": 0.5003003003003003, "grad_norm": 0.3199736773967743, "learning_rate": 9.32934324220168e-05, "loss": 0.4752, "step": 4165 }, { "epoch": 0.5009009009009009, "grad_norm": 0.26355046033859253, "learning_rate": 9.32776916317638e-05, "loss": 0.4574, "step": 4170 }, { "epoch": 0.5015015015015015, "grad_norm": 0.2576383054256439, "learning_rate": 9.326193372192783e-05, "loss": 0.4284, "step": 4175 }, { "epoch": 0.5021021021021022, "grad_norm": 0.29025712609291077, "learning_rate": 9.324615869874229e-05, "loss": 0.3747, "step": 4180 }, { "epoch": 0.5027027027027027, "grad_norm": 0.2769593894481659, "learning_rate": 9.323036656844739e-05, "loss": 0.4554, "step": 4185 }, { "epoch": 0.5033033033033033, "grad_norm": 0.38268837332725525, "learning_rate": 9.321455733729014e-05, "loss": 0.4523, "step": 4190 }, { "epoch": 0.5039039039039039, "grad_norm": 0.31293413043022156, "learning_rate": 9.319873101152423e-05, "loss": 0.44, "step": 4195 }, { "epoch": 0.5045045045045045, "grad_norm": 0.34711217880249023, "learning_rate": 9.31828875974102e-05, "loss": 0.396, "step": 4200 }, { "epoch": 0.5051051051051051, "grad_norm": 0.28682348132133484, "learning_rate": 9.31670271012153e-05, "loss": 0.4043, "step": 4205 }, { "epoch": 0.5057057057057057, "grad_norm": 0.30304649472236633, "learning_rate": 9.315114952921356e-05, "loss": 0.4867, "step": 4210 }, { "epoch": 0.5063063063063064, "grad_norm": 0.3410811126232147, "learning_rate": 9.313525488768573e-05, "loss": 0.4326, "step": 4215 }, { "epoch": 0.5069069069069069, "grad_norm": 0.26315030455589294, "learning_rate": 9.311934318291937e-05, "loss": 0.4231, "step": 4220 }, { "epoch": 0.5075075075075075, "grad_norm": 0.3111753761768341, "learning_rate": 9.310341442120871e-05, "loss": 0.4761, "step": 4225 }, { "epoch": 0.5081081081081081, "grad_norm": 0.273722380399704, "learning_rate": 9.308746860885482e-05, "loss": 0.4434, "step": 4230 }, { "epoch": 0.5087087087087087, "grad_norm": 0.3247709274291992, "learning_rate": 9.307150575216545e-05, "loss": 0.414, "step": 4235 }, { "epoch": 0.5093093093093093, "grad_norm": 0.33878350257873535, "learning_rate": 9.305552585745511e-05, "loss": 0.4901, "step": 4240 }, { "epoch": 0.5099099099099099, "grad_norm": 0.3130566477775574, "learning_rate": 9.303952893104504e-05, "loss": 0.4206, "step": 4245 }, { "epoch": 0.5105105105105106, "grad_norm": 0.26403722167015076, "learning_rate": 9.302351497926325e-05, "loss": 0.3785, "step": 4250 }, { "epoch": 0.5105105105105106, "eval_loss": 0.4109116792678833, "eval_runtime": 35.6573, "eval_samples_per_second": 22.436, "eval_steps_per_second": 5.609, "step": 4250 }, { "epoch": 0.5111111111111111, "grad_norm": 0.27747008204460144, "learning_rate": 9.300748400844446e-05, "loss": 0.4567, "step": 4255 }, { "epoch": 0.5117117117117117, "grad_norm": 0.35479068756103516, "learning_rate": 9.29914360249301e-05, "loss": 0.4504, "step": 4260 }, { "epoch": 0.5123123123123123, "grad_norm": 0.2651272118091583, "learning_rate": 9.297537103506838e-05, "loss": 0.4214, "step": 4265 }, { "epoch": 0.512912912912913, "grad_norm": 0.265099436044693, "learning_rate": 9.29592890452142e-05, "loss": 0.4077, "step": 4270 }, { "epoch": 0.5135135135135135, "grad_norm": 0.31767547130584717, "learning_rate": 9.294319006172921e-05, "loss": 0.4426, "step": 4275 }, { "epoch": 0.5141141141141141, "grad_norm": 0.31464385986328125, "learning_rate": 9.292707409098174e-05, "loss": 0.4308, "step": 4280 }, { "epoch": 0.5147147147147147, "grad_norm": 0.2928173840045929, "learning_rate": 9.291094113934689e-05, "loss": 0.4406, "step": 4285 }, { "epoch": 0.5153153153153153, "grad_norm": 0.2902061939239502, "learning_rate": 9.289479121320648e-05, "loss": 0.4676, "step": 4290 }, { "epoch": 0.5159159159159159, "grad_norm": 0.26935285329818726, "learning_rate": 9.287862431894897e-05, "loss": 0.4455, "step": 4295 }, { "epoch": 0.5165165165165165, "grad_norm": 0.24586215615272522, "learning_rate": 9.286244046296961e-05, "loss": 0.4804, "step": 4300 }, { "epoch": 0.5171171171171172, "grad_norm": 0.2868068814277649, "learning_rate": 9.284623965167035e-05, "loss": 0.4312, "step": 4305 }, { "epoch": 0.5177177177177177, "grad_norm": 0.3604901134967804, "learning_rate": 9.28300218914598e-05, "loss": 0.4556, "step": 4310 }, { "epoch": 0.5183183183183183, "grad_norm": 0.3339654207229614, "learning_rate": 9.281378718875332e-05, "loss": 0.441, "step": 4315 }, { "epoch": 0.518918918918919, "grad_norm": 0.2429099678993225, "learning_rate": 9.279753554997295e-05, "loss": 0.4353, "step": 4320 }, { "epoch": 0.5195195195195195, "grad_norm": 0.3159288167953491, "learning_rate": 9.278126698154743e-05, "loss": 0.4532, "step": 4325 }, { "epoch": 0.5201201201201201, "grad_norm": 0.30051764845848083, "learning_rate": 9.276498148991222e-05, "loss": 0.4082, "step": 4330 }, { "epoch": 0.5207207207207207, "grad_norm": 0.34233060479164124, "learning_rate": 9.274867908150944e-05, "loss": 0.4422, "step": 4335 }, { "epoch": 0.5213213213213214, "grad_norm": 0.33868885040283203, "learning_rate": 9.273235976278794e-05, "loss": 0.4399, "step": 4340 }, { "epoch": 0.5219219219219219, "grad_norm": 0.26910141110420227, "learning_rate": 9.27160235402032e-05, "loss": 0.4676, "step": 4345 }, { "epoch": 0.5225225225225225, "grad_norm": 0.2950465977191925, "learning_rate": 9.269967042021747e-05, "loss": 0.437, "step": 4350 }, { "epoch": 0.5231231231231231, "grad_norm": 0.289869099855423, "learning_rate": 9.268330040929962e-05, "loss": 0.4739, "step": 4355 }, { "epoch": 0.5237237237237238, "grad_norm": 0.3041685223579407, "learning_rate": 9.26669135139252e-05, "loss": 0.4482, "step": 4360 }, { "epoch": 0.5243243243243243, "grad_norm": 0.3370480239391327, "learning_rate": 9.265050974057649e-05, "loss": 0.4397, "step": 4365 }, { "epoch": 0.5249249249249249, "grad_norm": 0.2817637324333191, "learning_rate": 9.26340890957424e-05, "loss": 0.3884, "step": 4370 }, { "epoch": 0.5255255255255256, "grad_norm": 0.29471760988235474, "learning_rate": 9.261765158591855e-05, "loss": 0.41, "step": 4375 }, { "epoch": 0.5261261261261261, "grad_norm": 0.3439834415912628, "learning_rate": 9.260119721760721e-05, "loss": 0.4138, "step": 4380 }, { "epoch": 0.5267267267267267, "grad_norm": 0.2582058906555176, "learning_rate": 9.258472599731728e-05, "loss": 0.3942, "step": 4385 }, { "epoch": 0.5273273273273273, "grad_norm": 0.28308677673339844, "learning_rate": 9.256823793156441e-05, "loss": 0.4389, "step": 4390 }, { "epoch": 0.527927927927928, "grad_norm": 0.3756570518016815, "learning_rate": 9.255173302687085e-05, "loss": 0.4692, "step": 4395 }, { "epoch": 0.5285285285285285, "grad_norm": 0.25984251499176025, "learning_rate": 9.253521128976554e-05, "loss": 0.4154, "step": 4400 }, { "epoch": 0.5291291291291291, "grad_norm": 0.28260019421577454, "learning_rate": 9.251867272678408e-05, "loss": 0.4026, "step": 4405 }, { "epoch": 0.5297297297297298, "grad_norm": 0.2700667381286621, "learning_rate": 9.25021173444687e-05, "loss": 0.4194, "step": 4410 }, { "epoch": 0.5303303303303303, "grad_norm": 0.27166318893432617, "learning_rate": 9.24855451493683e-05, "loss": 0.4413, "step": 4415 }, { "epoch": 0.5309309309309309, "grad_norm": 0.34264490008354187, "learning_rate": 9.246895614803843e-05, "loss": 0.4665, "step": 4420 }, { "epoch": 0.5315315315315315, "grad_norm": 0.296682208776474, "learning_rate": 9.24523503470413e-05, "loss": 0.4124, "step": 4425 }, { "epoch": 0.5321321321321322, "grad_norm": 0.29106104373931885, "learning_rate": 9.243572775294573e-05, "loss": 0.4239, "step": 4430 }, { "epoch": 0.5327327327327327, "grad_norm": 0.3212127983570099, "learning_rate": 9.241908837232722e-05, "loss": 0.446, "step": 4435 }, { "epoch": 0.5333333333333333, "grad_norm": 0.34617817401885986, "learning_rate": 9.240243221176791e-05, "loss": 0.4625, "step": 4440 }, { "epoch": 0.533933933933934, "grad_norm": 0.2617619037628174, "learning_rate": 9.238575927785655e-05, "loss": 0.4127, "step": 4445 }, { "epoch": 0.5345345345345346, "grad_norm": 0.3500515818595886, "learning_rate": 9.236906957718854e-05, "loss": 0.4488, "step": 4450 }, { "epoch": 0.5351351351351351, "grad_norm": 0.2569849491119385, "learning_rate": 9.235236311636593e-05, "loss": 0.4422, "step": 4455 }, { "epoch": 0.5357357357357357, "grad_norm": 0.3044881522655487, "learning_rate": 9.233563990199735e-05, "loss": 0.4279, "step": 4460 }, { "epoch": 0.5363363363363364, "grad_norm": 0.35698455572128296, "learning_rate": 9.231889994069811e-05, "loss": 0.4296, "step": 4465 }, { "epoch": 0.5369369369369369, "grad_norm": 0.3179773986339569, "learning_rate": 9.230214323909012e-05, "loss": 0.3862, "step": 4470 }, { "epoch": 0.5375375375375375, "grad_norm": 0.3401271104812622, "learning_rate": 9.228536980380191e-05, "loss": 0.4274, "step": 4475 }, { "epoch": 0.5381381381381382, "grad_norm": 0.36162683367729187, "learning_rate": 9.226857964146866e-05, "loss": 0.3976, "step": 4480 }, { "epoch": 0.5387387387387388, "grad_norm": 0.322964608669281, "learning_rate": 9.225177275873211e-05, "loss": 0.4376, "step": 4485 }, { "epoch": 0.5393393393393393, "grad_norm": 0.33346307277679443, "learning_rate": 9.223494916224066e-05, "loss": 0.4482, "step": 4490 }, { "epoch": 0.5399399399399399, "grad_norm": 0.2842477262020111, "learning_rate": 9.221810885864933e-05, "loss": 0.3776, "step": 4495 }, { "epoch": 0.5405405405405406, "grad_norm": 0.3431280851364136, "learning_rate": 9.220125185461967e-05, "loss": 0.4224, "step": 4500 }, { "epoch": 0.5405405405405406, "eval_loss": 0.39646750688552856, "eval_runtime": 35.5976, "eval_samples_per_second": 22.473, "eval_steps_per_second": 5.618, "step": 4500 }, { "epoch": 0.5411411411411411, "grad_norm": 0.3617427945137024, "learning_rate": 9.218437815681996e-05, "loss": 0.4425, "step": 4505 }, { "epoch": 0.5417417417417417, "grad_norm": 0.31604188680648804, "learning_rate": 9.216748777192498e-05, "loss": 0.3861, "step": 4510 }, { "epoch": 0.5423423423423424, "grad_norm": 0.270926296710968, "learning_rate": 9.215058070661615e-05, "loss": 0.4568, "step": 4515 }, { "epoch": 0.542942942942943, "grad_norm": 0.33962738513946533, "learning_rate": 9.21336569675815e-05, "loss": 0.3911, "step": 4520 }, { "epoch": 0.5435435435435435, "grad_norm": 0.3347056806087494, "learning_rate": 9.211671656151563e-05, "loss": 0.4243, "step": 4525 }, { "epoch": 0.5441441441441441, "grad_norm": 0.31746459007263184, "learning_rate": 9.209975949511974e-05, "loss": 0.4082, "step": 4530 }, { "epoch": 0.5447447447447448, "grad_norm": 0.36672934889793396, "learning_rate": 9.208278577510163e-05, "loss": 0.409, "step": 4535 }, { "epoch": 0.5453453453453454, "grad_norm": 0.3124452531337738, "learning_rate": 9.20657954081757e-05, "loss": 0.3757, "step": 4540 }, { "epoch": 0.5459459459459459, "grad_norm": 0.3697468340396881, "learning_rate": 9.20487884010629e-05, "loss": 0.3845, "step": 4545 }, { "epoch": 0.5465465465465466, "grad_norm": 0.3563442826271057, "learning_rate": 9.203176476049079e-05, "loss": 0.4451, "step": 4550 }, { "epoch": 0.5471471471471472, "grad_norm": 0.22961801290512085, "learning_rate": 9.20147244931935e-05, "loss": 0.4466, "step": 4555 }, { "epoch": 0.5477477477477477, "grad_norm": 0.30496376752853394, "learning_rate": 9.199766760591174e-05, "loss": 0.3924, "step": 4560 }, { "epoch": 0.5483483483483483, "grad_norm": 0.3423655331134796, "learning_rate": 9.198059410539275e-05, "loss": 0.4239, "step": 4565 }, { "epoch": 0.548948948948949, "grad_norm": 0.3447710871696472, "learning_rate": 9.196350399839044e-05, "loss": 0.4254, "step": 4570 }, { "epoch": 0.5495495495495496, "grad_norm": 0.3405103385448456, "learning_rate": 9.194639729166523e-05, "loss": 0.4787, "step": 4575 }, { "epoch": 0.5501501501501501, "grad_norm": 0.2896510362625122, "learning_rate": 9.192927399198408e-05, "loss": 0.3943, "step": 4580 }, { "epoch": 0.5507507507507508, "grad_norm": 0.32498177886009216, "learning_rate": 9.191213410612056e-05, "loss": 0.4402, "step": 4585 }, { "epoch": 0.5513513513513514, "grad_norm": 0.2963517904281616, "learning_rate": 9.189497764085477e-05, "loss": 0.4015, "step": 4590 }, { "epoch": 0.5519519519519519, "grad_norm": 0.3304550349712372, "learning_rate": 9.187780460297341e-05, "loss": 0.3991, "step": 4595 }, { "epoch": 0.5525525525525525, "grad_norm": 0.25571829080581665, "learning_rate": 9.186061499926968e-05, "loss": 0.3949, "step": 4600 }, { "epoch": 0.5531531531531532, "grad_norm": 0.30609068274497986, "learning_rate": 9.184340883654339e-05, "loss": 0.3866, "step": 4605 }, { "epoch": 0.5537537537537538, "grad_norm": 0.2816171646118164, "learning_rate": 9.182618612160084e-05, "loss": 0.4492, "step": 4610 }, { "epoch": 0.5543543543543543, "grad_norm": 0.29632723331451416, "learning_rate": 9.180894686125492e-05, "loss": 0.4284, "step": 4615 }, { "epoch": 0.554954954954955, "grad_norm": 0.3231966495513916, "learning_rate": 9.179169106232507e-05, "loss": 0.3891, "step": 4620 }, { "epoch": 0.5555555555555556, "grad_norm": 0.2741132974624634, "learning_rate": 9.177441873163723e-05, "loss": 0.3458, "step": 4625 }, { "epoch": 0.5561561561561561, "grad_norm": 0.25370296835899353, "learning_rate": 9.175712987602395e-05, "loss": 0.4313, "step": 4630 }, { "epoch": 0.5567567567567567, "grad_norm": 0.36991292238235474, "learning_rate": 9.173982450232424e-05, "loss": 0.381, "step": 4635 }, { "epoch": 0.5573573573573574, "grad_norm": 0.33524763584136963, "learning_rate": 9.172250261738367e-05, "loss": 0.411, "step": 4640 }, { "epoch": 0.557957957957958, "grad_norm": 0.29027846455574036, "learning_rate": 9.170516422805435e-05, "loss": 0.4039, "step": 4645 }, { "epoch": 0.5585585585585585, "grad_norm": 0.284198522567749, "learning_rate": 9.168780934119494e-05, "loss": 0.4276, "step": 4650 }, { "epoch": 0.5591591591591591, "grad_norm": 0.3591119349002838, "learning_rate": 9.167043796367061e-05, "loss": 0.4115, "step": 4655 }, { "epoch": 0.5597597597597598, "grad_norm": 0.35682356357574463, "learning_rate": 9.165305010235301e-05, "loss": 0.4331, "step": 4660 }, { "epoch": 0.5603603603603604, "grad_norm": 0.4038154184818268, "learning_rate": 9.163564576412037e-05, "loss": 0.4335, "step": 4665 }, { "epoch": 0.5609609609609609, "grad_norm": 0.2744685709476471, "learning_rate": 9.161822495585741e-05, "loss": 0.3959, "step": 4670 }, { "epoch": 0.5615615615615616, "grad_norm": 0.36517074704170227, "learning_rate": 9.160078768445537e-05, "loss": 0.4722, "step": 4675 }, { "epoch": 0.5621621621621622, "grad_norm": 0.2964160740375519, "learning_rate": 9.158333395681203e-05, "loss": 0.4154, "step": 4680 }, { "epoch": 0.5627627627627627, "grad_norm": 0.3045138120651245, "learning_rate": 9.156586377983158e-05, "loss": 0.4513, "step": 4685 }, { "epoch": 0.5633633633633633, "grad_norm": 0.2858031690120697, "learning_rate": 9.154837716042487e-05, "loss": 0.399, "step": 4690 }, { "epoch": 0.563963963963964, "grad_norm": 0.3186417818069458, "learning_rate": 9.153087410550914e-05, "loss": 0.3787, "step": 4695 }, { "epoch": 0.5645645645645646, "grad_norm": 0.401591956615448, "learning_rate": 9.151335462200814e-05, "loss": 0.4507, "step": 4700 }, { "epoch": 0.5651651651651651, "grad_norm": 0.27890413999557495, "learning_rate": 9.149581871685218e-05, "loss": 0.374, "step": 4705 }, { "epoch": 0.5657657657657658, "grad_norm": 0.34605100750923157, "learning_rate": 9.147826639697803e-05, "loss": 0.4303, "step": 4710 }, { "epoch": 0.5663663663663664, "grad_norm": 0.2923938035964966, "learning_rate": 9.146069766932893e-05, "loss": 0.3849, "step": 4715 }, { "epoch": 0.5669669669669669, "grad_norm": 0.32079771161079407, "learning_rate": 9.144311254085464e-05, "loss": 0.4424, "step": 4720 }, { "epoch": 0.5675675675675675, "grad_norm": 0.3526853024959564, "learning_rate": 9.142551101851143e-05, "loss": 0.3951, "step": 4725 }, { "epoch": 0.5681681681681682, "grad_norm": 0.3102889955043793, "learning_rate": 9.140789310926199e-05, "loss": 0.3816, "step": 4730 }, { "epoch": 0.5687687687687688, "grad_norm": 0.348457932472229, "learning_rate": 9.139025882007554e-05, "loss": 0.3873, "step": 4735 }, { "epoch": 0.5693693693693693, "grad_norm": 0.34294039011001587, "learning_rate": 9.137260815792776e-05, "loss": 0.4115, "step": 4740 }, { "epoch": 0.56996996996997, "grad_norm": 0.33280232548713684, "learning_rate": 9.135494112980083e-05, "loss": 0.3652, "step": 4745 }, { "epoch": 0.5705705705705706, "grad_norm": 0.39281317591667175, "learning_rate": 9.133725774268338e-05, "loss": 0.3946, "step": 4750 }, { "epoch": 0.5705705705705706, "eval_loss": 0.38407135009765625, "eval_runtime": 35.5507, "eval_samples_per_second": 22.503, "eval_steps_per_second": 5.626, "step": 4750 }, { "epoch": 0.5711711711711712, "grad_norm": 0.29195722937583923, "learning_rate": 9.131955800357053e-05, "loss": 0.442, "step": 4755 }, { "epoch": 0.5717717717717717, "grad_norm": 0.3438752293586731, "learning_rate": 9.130184191946385e-05, "loss": 0.3713, "step": 4760 }, { "epoch": 0.5723723723723724, "grad_norm": 0.33953338861465454, "learning_rate": 9.128410949737138e-05, "loss": 0.4078, "step": 4765 }, { "epoch": 0.572972972972973, "grad_norm": 0.36786186695098877, "learning_rate": 9.126636074430764e-05, "loss": 0.4506, "step": 4770 }, { "epoch": 0.5735735735735735, "grad_norm": 0.33084627985954285, "learning_rate": 9.124859566729358e-05, "loss": 0.457, "step": 4775 }, { "epoch": 0.5741741741741742, "grad_norm": 0.34048357605934143, "learning_rate": 9.123081427335665e-05, "loss": 0.4624, "step": 4780 }, { "epoch": 0.5747747747747748, "grad_norm": 0.3166431486606598, "learning_rate": 9.12130165695307e-05, "loss": 0.4316, "step": 4785 }, { "epoch": 0.5753753753753754, "grad_norm": 0.35750752687454224, "learning_rate": 9.119520256285608e-05, "loss": 0.4237, "step": 4790 }, { "epoch": 0.5759759759759759, "grad_norm": 0.3233676254749298, "learning_rate": 9.117737226037956e-05, "loss": 0.4279, "step": 4795 }, { "epoch": 0.5765765765765766, "grad_norm": 0.30795225501060486, "learning_rate": 9.115952566915436e-05, "loss": 0.3651, "step": 4800 }, { "epoch": 0.5771771771771772, "grad_norm": 0.3767455816268921, "learning_rate": 9.114166279624017e-05, "loss": 0.4354, "step": 4805 }, { "epoch": 0.5777777777777777, "grad_norm": 0.2709740996360779, "learning_rate": 9.112378364870309e-05, "loss": 0.396, "step": 4810 }, { "epoch": 0.5783783783783784, "grad_norm": 0.3398534953594208, "learning_rate": 9.110588823361566e-05, "loss": 0.4046, "step": 4815 }, { "epoch": 0.578978978978979, "grad_norm": 0.3292076289653778, "learning_rate": 9.108797655805689e-05, "loss": 0.3966, "step": 4820 }, { "epoch": 0.5795795795795796, "grad_norm": 0.3727780878543854, "learning_rate": 9.107004862911216e-05, "loss": 0.4112, "step": 4825 }, { "epoch": 0.5801801801801801, "grad_norm": 0.3289770781993866, "learning_rate": 9.105210445387333e-05, "loss": 0.4083, "step": 4830 }, { "epoch": 0.5807807807807808, "grad_norm": 0.39395871758461, "learning_rate": 9.103414403943868e-05, "loss": 0.3572, "step": 4835 }, { "epoch": 0.5813813813813814, "grad_norm": 0.3781627416610718, "learning_rate": 9.101616739291288e-05, "loss": 0.4168, "step": 4840 }, { "epoch": 0.581981981981982, "grad_norm": 0.3350948393344879, "learning_rate": 9.099817452140709e-05, "loss": 0.4191, "step": 4845 }, { "epoch": 0.5825825825825826, "grad_norm": 0.31275612115859985, "learning_rate": 9.09801654320388e-05, "loss": 0.4307, "step": 4850 }, { "epoch": 0.5831831831831832, "grad_norm": 0.3432002067565918, "learning_rate": 9.096214013193198e-05, "loss": 0.4171, "step": 4855 }, { "epoch": 0.5837837837837838, "grad_norm": 0.30382242798805237, "learning_rate": 9.094409862821698e-05, "loss": 0.4029, "step": 4860 }, { "epoch": 0.5843843843843843, "grad_norm": 0.35811981558799744, "learning_rate": 9.092604092803058e-05, "loss": 0.4422, "step": 4865 }, { "epoch": 0.584984984984985, "grad_norm": 0.2972864806652069, "learning_rate": 9.090796703851598e-05, "loss": 0.3555, "step": 4870 }, { "epoch": 0.5855855855855856, "grad_norm": 0.3713989555835724, "learning_rate": 9.088987696682275e-05, "loss": 0.3629, "step": 4875 }, { "epoch": 0.5861861861861862, "grad_norm": 0.3291909992694855, "learning_rate": 9.087177072010684e-05, "loss": 0.395, "step": 4880 }, { "epoch": 0.5867867867867868, "grad_norm": 0.3542977571487427, "learning_rate": 9.085364830553067e-05, "loss": 0.4634, "step": 4885 }, { "epoch": 0.5873873873873874, "grad_norm": 0.38173505663871765, "learning_rate": 9.083550973026302e-05, "loss": 0.3931, "step": 4890 }, { "epoch": 0.587987987987988, "grad_norm": 0.29966017603874207, "learning_rate": 9.081735500147904e-05, "loss": 0.3576, "step": 4895 }, { "epoch": 0.5885885885885885, "grad_norm": 0.3684738576412201, "learning_rate": 9.07991841263603e-05, "loss": 0.4445, "step": 4900 }, { "epoch": 0.5891891891891892, "grad_norm": 0.2916768193244934, "learning_rate": 9.078099711209475e-05, "loss": 0.3801, "step": 4905 }, { "epoch": 0.5897897897897898, "grad_norm": 0.36422377824783325, "learning_rate": 9.076279396587672e-05, "loss": 0.4423, "step": 4910 }, { "epoch": 0.5903903903903904, "grad_norm": 0.33675840497016907, "learning_rate": 9.074457469490694e-05, "loss": 0.3983, "step": 4915 }, { "epoch": 0.590990990990991, "grad_norm": 0.396847665309906, "learning_rate": 9.072633930639248e-05, "loss": 0.4162, "step": 4920 }, { "epoch": 0.5915915915915916, "grad_norm": 0.2722485661506653, "learning_rate": 9.070808780754681e-05, "loss": 0.3783, "step": 4925 }, { "epoch": 0.5921921921921922, "grad_norm": 0.35289111733436584, "learning_rate": 9.068982020558978e-05, "loss": 0.4248, "step": 4930 }, { "epoch": 0.5927927927927928, "grad_norm": 0.29830169677734375, "learning_rate": 9.06715365077476e-05, "loss": 0.4108, "step": 4935 }, { "epoch": 0.5933933933933934, "grad_norm": 0.2733657658100128, "learning_rate": 9.065323672125286e-05, "loss": 0.3841, "step": 4940 }, { "epoch": 0.593993993993994, "grad_norm": 0.33087727427482605, "learning_rate": 9.063492085334446e-05, "loss": 0.3879, "step": 4945 }, { "epoch": 0.5945945945945946, "grad_norm": 0.341102659702301, "learning_rate": 9.061658891126776e-05, "loss": 0.4004, "step": 4950 }, { "epoch": 0.5951951951951951, "grad_norm": 0.3307470381259918, "learning_rate": 9.059824090227438e-05, "loss": 0.4093, "step": 4955 }, { "epoch": 0.5957957957957958, "grad_norm": 0.2968612313270569, "learning_rate": 9.057987683362234e-05, "loss": 0.3832, "step": 4960 }, { "epoch": 0.5963963963963964, "grad_norm": 0.34879839420318604, "learning_rate": 9.056149671257606e-05, "loss": 0.4216, "step": 4965 }, { "epoch": 0.596996996996997, "grad_norm": 0.33287060260772705, "learning_rate": 9.05431005464062e-05, "loss": 0.4384, "step": 4970 }, { "epoch": 0.5975975975975976, "grad_norm": 0.3686003088951111, "learning_rate": 9.052468834238986e-05, "loss": 0.36, "step": 4975 }, { "epoch": 0.5981981981981982, "grad_norm": 0.3743337094783783, "learning_rate": 9.050626010781043e-05, "loss": 0.402, "step": 4980 }, { "epoch": 0.5987987987987988, "grad_norm": 0.3147016167640686, "learning_rate": 9.048781584995766e-05, "loss": 0.4415, "step": 4985 }, { "epoch": 0.5993993993993993, "grad_norm": 0.31453582644462585, "learning_rate": 9.04693555761277e-05, "loss": 0.4095, "step": 4990 }, { "epoch": 0.6, "grad_norm": 0.3484346866607666, "learning_rate": 9.04508792936229e-05, "loss": 0.3784, "step": 4995 }, { "epoch": 0.6006006006006006, "grad_norm": 0.3905284106731415, "learning_rate": 9.043238700975209e-05, "loss": 0.3904, "step": 5000 }, { "epoch": 0.6006006006006006, "eval_loss": 0.37757378816604614, "eval_runtime": 35.6434, "eval_samples_per_second": 22.445, "eval_steps_per_second": 5.611, "step": 5000 }, { "epoch": 0.6012012012012012, "grad_norm": 0.3124229311943054, "learning_rate": 9.041387873183029e-05, "loss": 0.4109, "step": 5005 }, { "epoch": 0.6018018018018018, "grad_norm": 0.28576046228408813, "learning_rate": 9.039535446717898e-05, "loss": 0.3467, "step": 5010 }, { "epoch": 0.6024024024024024, "grad_norm": 0.3818845748901367, "learning_rate": 9.037681422312586e-05, "loss": 0.3995, "step": 5015 }, { "epoch": 0.603003003003003, "grad_norm": 0.28922221064567566, "learning_rate": 9.0358258007005e-05, "loss": 0.3503, "step": 5020 }, { "epoch": 0.6036036036036037, "grad_norm": 0.37059420347213745, "learning_rate": 9.033968582615679e-05, "loss": 0.4039, "step": 5025 }, { "epoch": 0.6042042042042042, "grad_norm": 0.4044734239578247, "learning_rate": 9.03210976879279e-05, "loss": 0.4621, "step": 5030 }, { "epoch": 0.6048048048048048, "grad_norm": 0.3339507579803467, "learning_rate": 9.030249359967138e-05, "loss": 0.4037, "step": 5035 }, { "epoch": 0.6054054054054054, "grad_norm": 0.4029451608657837, "learning_rate": 9.02838735687465e-05, "loss": 0.4259, "step": 5040 }, { "epoch": 0.606006006006006, "grad_norm": 0.35424843430519104, "learning_rate": 9.026523760251891e-05, "loss": 0.3733, "step": 5045 }, { "epoch": 0.6066066066066066, "grad_norm": 0.37154632806777954, "learning_rate": 9.024658570836053e-05, "loss": 0.4732, "step": 5050 }, { "epoch": 0.6072072072072072, "grad_norm": 0.3752360939979553, "learning_rate": 9.02279178936496e-05, "loss": 0.4081, "step": 5055 }, { "epoch": 0.6078078078078079, "grad_norm": 0.34165915846824646, "learning_rate": 9.020923416577061e-05, "loss": 0.413, "step": 5060 }, { "epoch": 0.6084084084084084, "grad_norm": 0.3951336741447449, "learning_rate": 9.019053453211441e-05, "loss": 0.4257, "step": 5065 }, { "epoch": 0.609009009009009, "grad_norm": 0.4182293117046356, "learning_rate": 9.017181900007811e-05, "loss": 0.4486, "step": 5070 }, { "epoch": 0.6096096096096096, "grad_norm": 0.3688332736492157, "learning_rate": 9.015308757706511e-05, "loss": 0.3916, "step": 5075 }, { "epoch": 0.6102102102102102, "grad_norm": 0.36908355355262756, "learning_rate": 9.01343402704851e-05, "loss": 0.394, "step": 5080 }, { "epoch": 0.6108108108108108, "grad_norm": 0.39296767115592957, "learning_rate": 9.011557708775402e-05, "loss": 0.4116, "step": 5085 }, { "epoch": 0.6114114114114114, "grad_norm": 0.314471036195755, "learning_rate": 9.009679803629416e-05, "loss": 0.4018, "step": 5090 }, { "epoch": 0.612012012012012, "grad_norm": 0.3594004511833191, "learning_rate": 9.007800312353402e-05, "loss": 0.3745, "step": 5095 }, { "epoch": 0.6126126126126126, "grad_norm": 0.3893055021762848, "learning_rate": 9.005919235690842e-05, "loss": 0.4367, "step": 5100 }, { "epoch": 0.6132132132132132, "grad_norm": 0.41374874114990234, "learning_rate": 9.004036574385844e-05, "loss": 0.4236, "step": 5105 }, { "epoch": 0.6138138138138138, "grad_norm": 0.33323878049850464, "learning_rate": 9.00215232918314e-05, "loss": 0.3806, "step": 5110 }, { "epoch": 0.6144144144144145, "grad_norm": 0.3618185520172119, "learning_rate": 9.000266500828091e-05, "loss": 0.4042, "step": 5115 }, { "epoch": 0.615015015015015, "grad_norm": 0.283945232629776, "learning_rate": 8.998379090066687e-05, "loss": 0.4158, "step": 5120 }, { "epoch": 0.6156156156156156, "grad_norm": 0.34311872720718384, "learning_rate": 8.996490097645536e-05, "loss": 0.4343, "step": 5125 }, { "epoch": 0.6162162162162163, "grad_norm": 0.31982770562171936, "learning_rate": 8.99459952431188e-05, "loss": 0.3358, "step": 5130 }, { "epoch": 0.6168168168168168, "grad_norm": 0.3433046042919159, "learning_rate": 8.992707370813581e-05, "loss": 0.364, "step": 5135 }, { "epoch": 0.6174174174174174, "grad_norm": 0.33817145228385925, "learning_rate": 8.99081363789913e-05, "loss": 0.4177, "step": 5140 }, { "epoch": 0.618018018018018, "grad_norm": 0.31253111362457275, "learning_rate": 8.988918326317641e-05, "loss": 0.4195, "step": 5145 }, { "epoch": 0.6186186186186187, "grad_norm": 0.3355768322944641, "learning_rate": 8.98702143681885e-05, "loss": 0.3551, "step": 5150 }, { "epoch": 0.6192192192192192, "grad_norm": 0.4061967730522156, "learning_rate": 8.985122970153121e-05, "loss": 0.402, "step": 5155 }, { "epoch": 0.6198198198198198, "grad_norm": 0.303901731967926, "learning_rate": 8.983222927071442e-05, "loss": 0.3525, "step": 5160 }, { "epoch": 0.6204204204204204, "grad_norm": 0.385477215051651, "learning_rate": 8.98132130832542e-05, "loss": 0.3721, "step": 5165 }, { "epoch": 0.621021021021021, "grad_norm": 0.3199411630630493, "learning_rate": 8.97941811466729e-05, "loss": 0.4093, "step": 5170 }, { "epoch": 0.6216216216216216, "grad_norm": 0.33952972292900085, "learning_rate": 8.977513346849907e-05, "loss": 0.3733, "step": 5175 }, { "epoch": 0.6222222222222222, "grad_norm": 0.37760990858078003, "learning_rate": 8.975607005626752e-05, "loss": 0.4278, "step": 5180 }, { "epoch": 0.6228228228228229, "grad_norm": 0.3244675397872925, "learning_rate": 8.973699091751923e-05, "loss": 0.3954, "step": 5185 }, { "epoch": 0.6234234234234234, "grad_norm": 0.40450751781463623, "learning_rate": 8.971789605980148e-05, "loss": 0.4145, "step": 5190 }, { "epoch": 0.624024024024024, "grad_norm": 0.35623157024383545, "learning_rate": 8.96987854906677e-05, "loss": 0.3455, "step": 5195 }, { "epoch": 0.6246246246246246, "grad_norm": 0.3210749924182892, "learning_rate": 8.967965921767755e-05, "loss": 0.3772, "step": 5200 }, { "epoch": 0.6252252252252253, "grad_norm": 0.2864453196525574, "learning_rate": 8.966051724839691e-05, "loss": 0.3695, "step": 5205 }, { "epoch": 0.6258258258258258, "grad_norm": 0.38941943645477295, "learning_rate": 8.96413595903979e-05, "loss": 0.3886, "step": 5210 }, { "epoch": 0.6264264264264264, "grad_norm": 0.34969213604927063, "learning_rate": 8.962218625125875e-05, "loss": 0.4243, "step": 5215 }, { "epoch": 0.6270270270270271, "grad_norm": 0.3812369406223297, "learning_rate": 8.960299723856404e-05, "loss": 0.3804, "step": 5220 }, { "epoch": 0.6276276276276276, "grad_norm": 0.32315537333488464, "learning_rate": 8.958379255990441e-05, "loss": 0.3731, "step": 5225 }, { "epoch": 0.6282282282282282, "grad_norm": 0.42129507660865784, "learning_rate": 8.956457222287677e-05, "loss": 0.3576, "step": 5230 }, { "epoch": 0.6288288288288288, "grad_norm": 0.38810428977012634, "learning_rate": 8.95453362350842e-05, "loss": 0.3623, "step": 5235 }, { "epoch": 0.6294294294294295, "grad_norm": 0.38933542370796204, "learning_rate": 8.952608460413603e-05, "loss": 0.3985, "step": 5240 }, { "epoch": 0.63003003003003, "grad_norm": 0.3767377436161041, "learning_rate": 8.950681733764767e-05, "loss": 0.4433, "step": 5245 }, { "epoch": 0.6306306306306306, "grad_norm": 0.3073877692222595, "learning_rate": 8.948753444324078e-05, "loss": 0.3119, "step": 5250 }, { "epoch": 0.6306306306306306, "eval_loss": 0.3674429953098297, "eval_runtime": 35.5913, "eval_samples_per_second": 22.477, "eval_steps_per_second": 5.619, "step": 5250 }, { "epoch": 0.6312312312312313, "grad_norm": 0.430916965007782, "learning_rate": 8.946823592854323e-05, "loss": 0.4167, "step": 5255 }, { "epoch": 0.6318318318318318, "grad_norm": 0.3854207396507263, "learning_rate": 8.944892180118901e-05, "loss": 0.4014, "step": 5260 }, { "epoch": 0.6324324324324324, "grad_norm": 0.40012556314468384, "learning_rate": 8.942959206881832e-05, "loss": 0.3998, "step": 5265 }, { "epoch": 0.633033033033033, "grad_norm": 0.37446069717407227, "learning_rate": 8.94102467390775e-05, "loss": 0.4084, "step": 5270 }, { "epoch": 0.6336336336336337, "grad_norm": 0.32861629128456116, "learning_rate": 8.939088581961912e-05, "loss": 0.4207, "step": 5275 }, { "epoch": 0.6342342342342342, "grad_norm": 0.3228270709514618, "learning_rate": 8.937150931810185e-05, "loss": 0.3641, "step": 5280 }, { "epoch": 0.6348348348348348, "grad_norm": 0.3372856676578522, "learning_rate": 8.935211724219057e-05, "loss": 0.4199, "step": 5285 }, { "epoch": 0.6354354354354355, "grad_norm": 0.4280979335308075, "learning_rate": 8.933270959955631e-05, "loss": 0.4213, "step": 5290 }, { "epoch": 0.6360360360360361, "grad_norm": 0.29096171259880066, "learning_rate": 8.931328639787624e-05, "loss": 0.381, "step": 5295 }, { "epoch": 0.6366366366366366, "grad_norm": 0.3825353980064392, "learning_rate": 8.929384764483369e-05, "loss": 0.403, "step": 5300 }, { "epoch": 0.6372372372372372, "grad_norm": 0.47639000415802, "learning_rate": 8.927439334811817e-05, "loss": 0.3837, "step": 5305 }, { "epoch": 0.6378378378378379, "grad_norm": 0.4135724604129791, "learning_rate": 8.92549235154253e-05, "loss": 0.4371, "step": 5310 }, { "epoch": 0.6384384384384384, "grad_norm": 0.3919542133808136, "learning_rate": 8.923543815445688e-05, "loss": 0.3985, "step": 5315 }, { "epoch": 0.639039039039039, "grad_norm": 0.35299044847488403, "learning_rate": 8.921593727292083e-05, "loss": 0.4093, "step": 5320 }, { "epoch": 0.6396396396396397, "grad_norm": 0.41586896777153015, "learning_rate": 8.919642087853122e-05, "loss": 0.417, "step": 5325 }, { "epoch": 0.6402402402402403, "grad_norm": 0.35224875807762146, "learning_rate": 8.917688897900822e-05, "loss": 0.3912, "step": 5330 }, { "epoch": 0.6408408408408408, "grad_norm": 0.3455101251602173, "learning_rate": 8.915734158207822e-05, "loss": 0.3583, "step": 5335 }, { "epoch": 0.6414414414414414, "grad_norm": 0.34384745359420776, "learning_rate": 8.913777869547365e-05, "loss": 0.389, "step": 5340 }, { "epoch": 0.6420420420420421, "grad_norm": 0.3941856026649475, "learning_rate": 8.91182003269331e-05, "loss": 0.4044, "step": 5345 }, { "epoch": 0.6426426426426426, "grad_norm": 0.32409560680389404, "learning_rate": 8.909860648420131e-05, "loss": 0.3529, "step": 5350 }, { "epoch": 0.6432432432432432, "grad_norm": 0.3262473940849304, "learning_rate": 8.90789971750291e-05, "loss": 0.3897, "step": 5355 }, { "epoch": 0.6438438438438439, "grad_norm": 0.3769555687904358, "learning_rate": 8.905937240717346e-05, "loss": 0.3962, "step": 5360 }, { "epoch": 0.6444444444444445, "grad_norm": 0.3664250373840332, "learning_rate": 8.903973218839741e-05, "loss": 0.3887, "step": 5365 }, { "epoch": 0.645045045045045, "grad_norm": 0.3253674805164337, "learning_rate": 8.902007652647018e-05, "loss": 0.3534, "step": 5370 }, { "epoch": 0.6456456456456456, "grad_norm": 0.47549325227737427, "learning_rate": 8.900040542916703e-05, "loss": 0.4098, "step": 5375 }, { "epoch": 0.6462462462462463, "grad_norm": 0.3671705424785614, "learning_rate": 8.898071890426937e-05, "loss": 0.3674, "step": 5380 }, { "epoch": 0.6468468468468469, "grad_norm": 0.42792201042175293, "learning_rate": 8.896101695956472e-05, "loss": 0.361, "step": 5385 }, { "epoch": 0.6474474474474474, "grad_norm": 0.34030088782310486, "learning_rate": 8.894129960284667e-05, "loss": 0.3389, "step": 5390 }, { "epoch": 0.648048048048048, "grad_norm": 0.37272098660469055, "learning_rate": 8.89215668419149e-05, "loss": 0.3755, "step": 5395 }, { "epoch": 0.6486486486486487, "grad_norm": 0.4172622263431549, "learning_rate": 8.890181868457523e-05, "loss": 0.3551, "step": 5400 }, { "epoch": 0.6492492492492492, "grad_norm": 0.3918893337249756, "learning_rate": 8.888205513863952e-05, "loss": 0.3692, "step": 5405 }, { "epoch": 0.6498498498498498, "grad_norm": 0.36653268337249756, "learning_rate": 8.886227621192576e-05, "loss": 0.3525, "step": 5410 }, { "epoch": 0.6504504504504505, "grad_norm": 0.37922611832618713, "learning_rate": 8.884248191225797e-05, "loss": 0.4231, "step": 5415 }, { "epoch": 0.6510510510510511, "grad_norm": 0.384181946516037, "learning_rate": 8.882267224746632e-05, "loss": 0.3599, "step": 5420 }, { "epoch": 0.6516516516516516, "grad_norm": 0.33977892994880676, "learning_rate": 8.8802847225387e-05, "loss": 0.3971, "step": 5425 }, { "epoch": 0.6522522522522523, "grad_norm": 0.33268967270851135, "learning_rate": 8.878300685386232e-05, "loss": 0.3826, "step": 5430 }, { "epoch": 0.6528528528528529, "grad_norm": 0.36312827467918396, "learning_rate": 8.87631511407406e-05, "loss": 0.3465, "step": 5435 }, { "epoch": 0.6534534534534534, "grad_norm": 0.41879400610923767, "learning_rate": 8.874328009387632e-05, "loss": 0.3895, "step": 5440 }, { "epoch": 0.654054054054054, "grad_norm": 0.3923923969268799, "learning_rate": 8.872339372112994e-05, "loss": 0.375, "step": 5445 }, { "epoch": 0.6546546546546547, "grad_norm": 0.4057493209838867, "learning_rate": 8.870349203036804e-05, "loss": 0.3528, "step": 5450 }, { "epoch": 0.6552552552552553, "grad_norm": 0.326043963432312, "learning_rate": 8.868357502946318e-05, "loss": 0.3786, "step": 5455 }, { "epoch": 0.6558558558558558, "grad_norm": 0.3921876847743988, "learning_rate": 8.86636427262941e-05, "loss": 0.3363, "step": 5460 }, { "epoch": 0.6564564564564564, "grad_norm": 0.37249556183815, "learning_rate": 8.864369512874551e-05, "loss": 0.3595, "step": 5465 }, { "epoch": 0.6570570570570571, "grad_norm": 0.4267086386680603, "learning_rate": 8.862373224470815e-05, "loss": 0.4264, "step": 5470 }, { "epoch": 0.6576576576576577, "grad_norm": 0.34260231256484985, "learning_rate": 8.860375408207888e-05, "loss": 0.3976, "step": 5475 }, { "epoch": 0.6582582582582582, "grad_norm": 0.3625841438770294, "learning_rate": 8.858376064876056e-05, "loss": 0.3296, "step": 5480 }, { "epoch": 0.6588588588588589, "grad_norm": 0.41739052534103394, "learning_rate": 8.856375195266208e-05, "loss": 0.3798, "step": 5485 }, { "epoch": 0.6594594594594595, "grad_norm": 0.4534454941749573, "learning_rate": 8.85437280016984e-05, "loss": 0.385, "step": 5490 }, { "epoch": 0.66006006006006, "grad_norm": 0.3796202540397644, "learning_rate": 8.852368880379049e-05, "loss": 0.4401, "step": 5495 }, { "epoch": 0.6606606606606606, "grad_norm": 0.35005345940589905, "learning_rate": 8.850363436686537e-05, "loss": 0.3768, "step": 5500 }, { "epoch": 0.6606606606606606, "eval_loss": 0.3557297885417938, "eval_runtime": 35.5127, "eval_samples_per_second": 22.527, "eval_steps_per_second": 5.632, "step": 5500 }, { "epoch": 0.6612612612612613, "grad_norm": 0.3682941794395447, "learning_rate": 8.848356469885606e-05, "loss": 0.352, "step": 5505 }, { "epoch": 0.6618618618618619, "grad_norm": 0.4099908471107483, "learning_rate": 8.846347980770165e-05, "loss": 0.4246, "step": 5510 }, { "epoch": 0.6624624624624624, "grad_norm": 0.3958446681499481, "learning_rate": 8.84433797013472e-05, "loss": 0.3555, "step": 5515 }, { "epoch": 0.6630630630630631, "grad_norm": 0.28630247712135315, "learning_rate": 8.842326438774383e-05, "loss": 0.3137, "step": 5520 }, { "epoch": 0.6636636636636637, "grad_norm": 0.39440688490867615, "learning_rate": 8.840313387484867e-05, "loss": 0.3905, "step": 5525 }, { "epoch": 0.6642642642642642, "grad_norm": 0.42743274569511414, "learning_rate": 8.838298817062483e-05, "loss": 0.4187, "step": 5530 }, { "epoch": 0.6648648648648648, "grad_norm": 0.3959651589393616, "learning_rate": 8.836282728304145e-05, "loss": 0.3735, "step": 5535 }, { "epoch": 0.6654654654654655, "grad_norm": 0.34156426787376404, "learning_rate": 8.83426512200737e-05, "loss": 0.3353, "step": 5540 }, { "epoch": 0.6660660660660661, "grad_norm": 0.32064732909202576, "learning_rate": 8.832245998970271e-05, "loss": 0.3512, "step": 5545 }, { "epoch": 0.6666666666666666, "grad_norm": 0.4145539402961731, "learning_rate": 8.830225359991564e-05, "loss": 0.3382, "step": 5550 }, { "epoch": 0.6672672672672673, "grad_norm": 0.43516016006469727, "learning_rate": 8.828203205870564e-05, "loss": 0.4395, "step": 5555 }, { "epoch": 0.6678678678678679, "grad_norm": 0.37278711795806885, "learning_rate": 8.826179537407185e-05, "loss": 0.3943, "step": 5560 }, { "epoch": 0.6684684684684684, "grad_norm": 0.369424968957901, "learning_rate": 8.82415435540194e-05, "loss": 0.3275, "step": 5565 }, { "epoch": 0.669069069069069, "grad_norm": 0.3561498522758484, "learning_rate": 8.822127660655942e-05, "loss": 0.3346, "step": 5570 }, { "epoch": 0.6696696696696697, "grad_norm": 0.38225051760673523, "learning_rate": 8.820099453970899e-05, "loss": 0.3948, "step": 5575 }, { "epoch": 0.6702702702702703, "grad_norm": 0.399332731962204, "learning_rate": 8.81806973614912e-05, "loss": 0.375, "step": 5580 }, { "epoch": 0.6708708708708708, "grad_norm": 0.4058336019515991, "learning_rate": 8.81603850799351e-05, "loss": 0.4305, "step": 5585 }, { "epoch": 0.6714714714714715, "grad_norm": 0.4609101414680481, "learning_rate": 8.814005770307575e-05, "loss": 0.4438, "step": 5590 }, { "epoch": 0.6720720720720721, "grad_norm": 0.46701136231422424, "learning_rate": 8.811971523895415e-05, "loss": 0.4087, "step": 5595 }, { "epoch": 0.6726726726726727, "grad_norm": 0.32095766067504883, "learning_rate": 8.809935769561728e-05, "loss": 0.3499, "step": 5600 }, { "epoch": 0.6732732732732732, "grad_norm": 0.41197481751441956, "learning_rate": 8.807898508111806e-05, "loss": 0.4206, "step": 5605 }, { "epoch": 0.6738738738738739, "grad_norm": 0.42365097999572754, "learning_rate": 8.805859740351541e-05, "loss": 0.4077, "step": 5610 }, { "epoch": 0.6744744744744745, "grad_norm": 0.3006840944290161, "learning_rate": 8.803819467087417e-05, "loss": 0.3083, "step": 5615 }, { "epoch": 0.675075075075075, "grad_norm": 0.43815919756889343, "learning_rate": 8.80177768912652e-05, "loss": 0.35, "step": 5620 }, { "epoch": 0.6756756756756757, "grad_norm": 0.4547651410102844, "learning_rate": 8.799734407276521e-05, "loss": 0.4137, "step": 5625 }, { "epoch": 0.6762762762762763, "grad_norm": 0.3741016685962677, "learning_rate": 8.797689622345695e-05, "loss": 0.3489, "step": 5630 }, { "epoch": 0.6768768768768769, "grad_norm": 0.373638778924942, "learning_rate": 8.795643335142908e-05, "loss": 0.3916, "step": 5635 }, { "epoch": 0.6774774774774774, "grad_norm": 0.3880307972431183, "learning_rate": 8.79359554647762e-05, "loss": 0.4292, "step": 5640 }, { "epoch": 0.6780780780780781, "grad_norm": 0.4027876853942871, "learning_rate": 8.791546257159886e-05, "loss": 0.3855, "step": 5645 }, { "epoch": 0.6786786786786787, "grad_norm": 0.3801300823688507, "learning_rate": 8.789495468000354e-05, "loss": 0.3638, "step": 5650 }, { "epoch": 0.6792792792792792, "grad_norm": 0.34081804752349854, "learning_rate": 8.787443179810266e-05, "loss": 0.3597, "step": 5655 }, { "epoch": 0.6798798798798799, "grad_norm": 0.4241330027580261, "learning_rate": 8.785389393401455e-05, "loss": 0.3532, "step": 5660 }, { "epoch": 0.6804804804804805, "grad_norm": 0.35839948058128357, "learning_rate": 8.783334109586348e-05, "loss": 0.3605, "step": 5665 }, { "epoch": 0.6810810810810811, "grad_norm": 0.4927769899368286, "learning_rate": 8.781277329177966e-05, "loss": 0.4043, "step": 5670 }, { "epoch": 0.6816816816816816, "grad_norm": 0.3698582947254181, "learning_rate": 8.779219052989919e-05, "loss": 0.3446, "step": 5675 }, { "epoch": 0.6822822822822823, "grad_norm": 0.515845775604248, "learning_rate": 8.77715928183641e-05, "loss": 0.4091, "step": 5680 }, { "epoch": 0.6828828828828829, "grad_norm": 0.39872249960899353, "learning_rate": 8.775098016532235e-05, "loss": 0.405, "step": 5685 }, { "epoch": 0.6834834834834835, "grad_norm": 0.33597859740257263, "learning_rate": 8.773035257892778e-05, "loss": 0.3354, "step": 5690 }, { "epoch": 0.684084084084084, "grad_norm": 0.34901607036590576, "learning_rate": 8.770971006734015e-05, "loss": 0.3642, "step": 5695 }, { "epoch": 0.6846846846846847, "grad_norm": 0.4633309841156006, "learning_rate": 8.768905263872515e-05, "loss": 0.3454, "step": 5700 }, { "epoch": 0.6852852852852853, "grad_norm": 0.44013097882270813, "learning_rate": 8.766838030125432e-05, "loss": 0.3439, "step": 5705 }, { "epoch": 0.6858858858858858, "grad_norm": 0.36319929361343384, "learning_rate": 8.764769306310513e-05, "loss": 0.3189, "step": 5710 }, { "epoch": 0.6864864864864865, "grad_norm": 0.44178229570388794, "learning_rate": 8.762699093246096e-05, "loss": 0.3161, "step": 5715 }, { "epoch": 0.6870870870870871, "grad_norm": 0.4986707270145416, "learning_rate": 8.760627391751103e-05, "loss": 0.3449, "step": 5720 }, { "epoch": 0.6876876876876877, "grad_norm": 0.4556070566177368, "learning_rate": 8.75855420264505e-05, "loss": 0.3516, "step": 5725 }, { "epoch": 0.6882882882882883, "grad_norm": 0.45364683866500854, "learning_rate": 8.756479526748039e-05, "loss": 0.3842, "step": 5730 }, { "epoch": 0.6888888888888889, "grad_norm": 0.34849977493286133, "learning_rate": 8.754403364880759e-05, "loss": 0.3194, "step": 5735 }, { "epoch": 0.6894894894894895, "grad_norm": 0.39173176884651184, "learning_rate": 8.752325717864488e-05, "loss": 0.3344, "step": 5740 }, { "epoch": 0.69009009009009, "grad_norm": 0.34377187490463257, "learning_rate": 8.750246586521095e-05, "loss": 0.3627, "step": 5745 }, { "epoch": 0.6906906906906907, "grad_norm": 0.44872725009918213, "learning_rate": 8.74816597167303e-05, "loss": 0.3905, "step": 5750 }, { "epoch": 0.6906906906906907, "eval_loss": 0.33489829301834106, "eval_runtime": 35.5357, "eval_samples_per_second": 22.513, "eval_steps_per_second": 5.628, "step": 5750 }, { "epoch": 0.6912912912912913, "grad_norm": 0.38582900166511536, "learning_rate": 8.746083874143334e-05, "loss": 0.3528, "step": 5755 }, { "epoch": 0.6918918918918919, "grad_norm": 0.4613777697086334, "learning_rate": 8.744000294755632e-05, "loss": 0.3335, "step": 5760 }, { "epoch": 0.6924924924924925, "grad_norm": 0.36886975169181824, "learning_rate": 8.741915234334138e-05, "loss": 0.3598, "step": 5765 }, { "epoch": 0.6930930930930931, "grad_norm": 0.2886711061000824, "learning_rate": 8.739828693703647e-05, "loss": 0.3482, "step": 5770 }, { "epoch": 0.6936936936936937, "grad_norm": 0.35636454820632935, "learning_rate": 8.737740673689547e-05, "loss": 0.3472, "step": 5775 }, { "epoch": 0.6942942942942943, "grad_norm": 0.3754615783691406, "learning_rate": 8.735651175117805e-05, "loss": 0.3748, "step": 5780 }, { "epoch": 0.6948948948948949, "grad_norm": 0.4312724769115448, "learning_rate": 8.733560198814975e-05, "loss": 0.3662, "step": 5785 }, { "epoch": 0.6954954954954955, "grad_norm": 0.4254775643348694, "learning_rate": 8.731467745608195e-05, "loss": 0.3273, "step": 5790 }, { "epoch": 0.6960960960960961, "grad_norm": 0.3985266387462616, "learning_rate": 8.72937381632519e-05, "loss": 0.3426, "step": 5795 }, { "epoch": 0.6966966966966966, "grad_norm": 0.3750954866409302, "learning_rate": 8.727278411794261e-05, "loss": 0.3256, "step": 5800 }, { "epoch": 0.6972972972972973, "grad_norm": 0.5321087837219238, "learning_rate": 8.725181532844305e-05, "loss": 0.3643, "step": 5805 }, { "epoch": 0.6978978978978979, "grad_norm": 0.38525283336639404, "learning_rate": 8.723083180304791e-05, "loss": 0.3331, "step": 5810 }, { "epoch": 0.6984984984984985, "grad_norm": 0.4347771108150482, "learning_rate": 8.720983355005776e-05, "loss": 0.4098, "step": 5815 }, { "epoch": 0.6990990990990991, "grad_norm": 0.3714815676212311, "learning_rate": 8.7188820577779e-05, "loss": 0.3134, "step": 5820 }, { "epoch": 0.6996996996996997, "grad_norm": 0.3754033148288727, "learning_rate": 8.716779289452384e-05, "loss": 0.3197, "step": 5825 }, { "epoch": 0.7003003003003003, "grad_norm": 0.34821298718452454, "learning_rate": 8.714675050861029e-05, "loss": 0.3545, "step": 5830 }, { "epoch": 0.7009009009009008, "grad_norm": 0.3712867200374603, "learning_rate": 8.712569342836223e-05, "loss": 0.3837, "step": 5835 }, { "epoch": 0.7015015015015015, "grad_norm": 0.3959085941314697, "learning_rate": 8.710462166210931e-05, "loss": 0.368, "step": 5840 }, { "epoch": 0.7021021021021021, "grad_norm": 0.4257919192314148, "learning_rate": 8.708353521818697e-05, "loss": 0.3487, "step": 5845 }, { "epoch": 0.7027027027027027, "grad_norm": 0.35467055439949036, "learning_rate": 8.706243410493651e-05, "loss": 0.3392, "step": 5850 }, { "epoch": 0.7033033033033033, "grad_norm": 0.3705252408981323, "learning_rate": 8.704131833070501e-05, "loss": 0.4105, "step": 5855 }, { "epoch": 0.7039039039039039, "grad_norm": 0.3051586151123047, "learning_rate": 8.702018790384535e-05, "loss": 0.3708, "step": 5860 }, { "epoch": 0.7045045045045045, "grad_norm": 0.37856411933898926, "learning_rate": 8.699904283271617e-05, "loss": 0.3514, "step": 5865 }, { "epoch": 0.7051051051051052, "grad_norm": 0.4097536504268646, "learning_rate": 8.697788312568198e-05, "loss": 0.3417, "step": 5870 }, { "epoch": 0.7057057057057057, "grad_norm": 0.43204861879348755, "learning_rate": 8.695670879111301e-05, "loss": 0.3584, "step": 5875 }, { "epoch": 0.7063063063063063, "grad_norm": 0.3879355788230896, "learning_rate": 8.69355198373853e-05, "loss": 0.335, "step": 5880 }, { "epoch": 0.7069069069069069, "grad_norm": 0.5099337100982666, "learning_rate": 8.691431627288072e-05, "loss": 0.3692, "step": 5885 }, { "epoch": 0.7075075075075075, "grad_norm": 0.4005381166934967, "learning_rate": 8.68930981059868e-05, "loss": 0.3466, "step": 5890 }, { "epoch": 0.7081081081081081, "grad_norm": 0.4230159819126129, "learning_rate": 8.687186534509699e-05, "loss": 0.3351, "step": 5895 }, { "epoch": 0.7087087087087087, "grad_norm": 0.4088141620159149, "learning_rate": 8.68506179986104e-05, "loss": 0.3813, "step": 5900 }, { "epoch": 0.7093093093093094, "grad_norm": 0.3280819356441498, "learning_rate": 8.682935607493197e-05, "loss": 0.3401, "step": 5905 }, { "epoch": 0.7099099099099099, "grad_norm": 0.3707108497619629, "learning_rate": 8.68080795824724e-05, "loss": 0.32, "step": 5910 }, { "epoch": 0.7105105105105105, "grad_norm": 0.5042741298675537, "learning_rate": 8.678678852964812e-05, "loss": 0.3853, "step": 5915 }, { "epoch": 0.7111111111111111, "grad_norm": 0.4071110188961029, "learning_rate": 8.676548292488136e-05, "loss": 0.3667, "step": 5920 }, { "epoch": 0.7117117117117117, "grad_norm": 0.3619343936443329, "learning_rate": 8.67441627766001e-05, "loss": 0.3353, "step": 5925 }, { "epoch": 0.7123123123123123, "grad_norm": 0.42885512113571167, "learning_rate": 8.672282809323802e-05, "loss": 0.3706, "step": 5930 }, { "epoch": 0.7129129129129129, "grad_norm": 0.4062737822532654, "learning_rate": 8.670147888323466e-05, "loss": 0.3547, "step": 5935 }, { "epoch": 0.7135135135135136, "grad_norm": 0.3969167470932007, "learning_rate": 8.668011515503517e-05, "loss": 0.326, "step": 5940 }, { "epoch": 0.7141141141141141, "grad_norm": 0.4645400047302246, "learning_rate": 8.665873691709055e-05, "loss": 0.336, "step": 5945 }, { "epoch": 0.7147147147147147, "grad_norm": 0.3994089961051941, "learning_rate": 8.66373441778575e-05, "loss": 0.3598, "step": 5950 }, { "epoch": 0.7153153153153153, "grad_norm": 0.37047797441482544, "learning_rate": 8.661593694579845e-05, "loss": 0.3982, "step": 5955 }, { "epoch": 0.715915915915916, "grad_norm": 0.399649977684021, "learning_rate": 8.659451522938157e-05, "loss": 0.3593, "step": 5960 }, { "epoch": 0.7165165165165165, "grad_norm": 0.36607518792152405, "learning_rate": 8.657307903708077e-05, "loss": 0.3224, "step": 5965 }, { "epoch": 0.7171171171171171, "grad_norm": 0.4388699531555176, "learning_rate": 8.655162837737565e-05, "loss": 0.3652, "step": 5970 }, { "epoch": 0.7177177177177178, "grad_norm": 0.4220297634601593, "learning_rate": 8.653016325875158e-05, "loss": 0.3607, "step": 5975 }, { "epoch": 0.7183183183183183, "grad_norm": 0.41517767310142517, "learning_rate": 8.650868368969964e-05, "loss": 0.3531, "step": 5980 }, { "epoch": 0.7189189189189189, "grad_norm": 0.3969719409942627, "learning_rate": 8.648718967871661e-05, "loss": 0.3121, "step": 5985 }, { "epoch": 0.7195195195195195, "grad_norm": 0.49861007928848267, "learning_rate": 8.646568123430499e-05, "loss": 0.3661, "step": 5990 }, { "epoch": 0.7201201201201202, "grad_norm": 0.4132179915904999, "learning_rate": 8.644415836497295e-05, "loss": 0.3354, "step": 5995 }, { "epoch": 0.7207207207207207, "grad_norm": 0.4681933522224426, "learning_rate": 8.642262107923445e-05, "loss": 0.3979, "step": 6000 }, { "epoch": 0.7207207207207207, "eval_loss": 0.3170243203639984, "eval_runtime": 35.5466, "eval_samples_per_second": 22.506, "eval_steps_per_second": 5.626, "step": 6000 }, { "epoch": 0.7213213213213213, "grad_norm": 0.3149552643299103, "learning_rate": 8.64010693856091e-05, "loss": 0.3217, "step": 6005 }, { "epoch": 0.721921921921922, "grad_norm": 0.42974141240119934, "learning_rate": 8.637950329262219e-05, "loss": 0.3539, "step": 6010 }, { "epoch": 0.7225225225225225, "grad_norm": 0.3971770405769348, "learning_rate": 8.635792280880475e-05, "loss": 0.318, "step": 6015 }, { "epoch": 0.7231231231231231, "grad_norm": 0.4378364682197571, "learning_rate": 8.63363279426935e-05, "loss": 0.3837, "step": 6020 }, { "epoch": 0.7237237237237237, "grad_norm": 0.46137237548828125, "learning_rate": 8.631471870283082e-05, "loss": 0.3426, "step": 6025 }, { "epoch": 0.7243243243243244, "grad_norm": 0.4154767692089081, "learning_rate": 8.629309509776478e-05, "loss": 0.3469, "step": 6030 }, { "epoch": 0.7249249249249249, "grad_norm": 0.35032203793525696, "learning_rate": 8.627145713604916e-05, "loss": 0.3323, "step": 6035 }, { "epoch": 0.7255255255255255, "grad_norm": 0.434451162815094, "learning_rate": 8.624980482624339e-05, "loss": 0.3409, "step": 6040 }, { "epoch": 0.7261261261261261, "grad_norm": 0.4690226912498474, "learning_rate": 8.622813817691258e-05, "loss": 0.3693, "step": 6045 }, { "epoch": 0.7267267267267268, "grad_norm": 0.5112568140029907, "learning_rate": 8.620645719662754e-05, "loss": 0.3628, "step": 6050 }, { "epoch": 0.7273273273273273, "grad_norm": 0.3646400570869446, "learning_rate": 8.618476189396472e-05, "loss": 0.3293, "step": 6055 }, { "epoch": 0.7279279279279279, "grad_norm": 0.37316304445266724, "learning_rate": 8.616305227750624e-05, "loss": 0.3099, "step": 6060 }, { "epoch": 0.7285285285285286, "grad_norm": 0.4041762351989746, "learning_rate": 8.61413283558399e-05, "loss": 0.2997, "step": 6065 }, { "epoch": 0.7291291291291291, "grad_norm": 0.4494181275367737, "learning_rate": 8.611959013755912e-05, "loss": 0.3401, "step": 6070 }, { "epoch": 0.7297297297297297, "grad_norm": 0.43668776750564575, "learning_rate": 8.609783763126303e-05, "loss": 0.3148, "step": 6075 }, { "epoch": 0.7303303303303303, "grad_norm": 0.42859476804733276, "learning_rate": 8.607607084555636e-05, "loss": 0.3863, "step": 6080 }, { "epoch": 0.730930930930931, "grad_norm": 0.4075924754142761, "learning_rate": 8.605428978904953e-05, "loss": 0.3486, "step": 6085 }, { "epoch": 0.7315315315315315, "grad_norm": 0.39961525797843933, "learning_rate": 8.603249447035859e-05, "loss": 0.292, "step": 6090 }, { "epoch": 0.7321321321321321, "grad_norm": 0.4883679449558258, "learning_rate": 8.60106848981052e-05, "loss": 0.3576, "step": 6095 }, { "epoch": 0.7327327327327328, "grad_norm": 0.39501965045928955, "learning_rate": 8.59888610809167e-05, "loss": 0.3473, "step": 6100 }, { "epoch": 0.7333333333333333, "grad_norm": 0.4299990236759186, "learning_rate": 8.596702302742605e-05, "loss": 0.3048, "step": 6105 }, { "epoch": 0.7339339339339339, "grad_norm": 0.3688097298145294, "learning_rate": 8.594517074627186e-05, "loss": 0.3456, "step": 6110 }, { "epoch": 0.7345345345345345, "grad_norm": 0.44492146372795105, "learning_rate": 8.592330424609834e-05, "loss": 0.3273, "step": 6115 }, { "epoch": 0.7351351351351352, "grad_norm": 0.475625216960907, "learning_rate": 8.590142353555532e-05, "loss": 0.3544, "step": 6120 }, { "epoch": 0.7357357357357357, "grad_norm": 0.4551377594470978, "learning_rate": 8.587952862329829e-05, "loss": 0.362, "step": 6125 }, { "epoch": 0.7363363363363363, "grad_norm": 0.47384923696517944, "learning_rate": 8.585761951798832e-05, "loss": 0.3778, "step": 6130 }, { "epoch": 0.736936936936937, "grad_norm": 0.3864975869655609, "learning_rate": 8.583569622829213e-05, "loss": 0.2848, "step": 6135 }, { "epoch": 0.7375375375375376, "grad_norm": 0.4830896258354187, "learning_rate": 8.5813758762882e-05, "loss": 0.3299, "step": 6140 }, { "epoch": 0.7381381381381381, "grad_norm": 0.4643004238605499, "learning_rate": 8.579180713043587e-05, "loss": 0.3324, "step": 6145 }, { "epoch": 0.7387387387387387, "grad_norm": 0.46419695019721985, "learning_rate": 8.576984133963725e-05, "loss": 0.3263, "step": 6150 }, { "epoch": 0.7393393393393394, "grad_norm": 0.40200313925743103, "learning_rate": 8.57478613991753e-05, "loss": 0.3368, "step": 6155 }, { "epoch": 0.7399399399399399, "grad_norm": 0.441680371761322, "learning_rate": 8.572586731774468e-05, "loss": 0.3421, "step": 6160 }, { "epoch": 0.7405405405405405, "grad_norm": 0.43650439381599426, "learning_rate": 8.570385910404575e-05, "loss": 0.3844, "step": 6165 }, { "epoch": 0.7411411411411412, "grad_norm": 0.3936440050601959, "learning_rate": 8.568183676678438e-05, "loss": 0.3255, "step": 6170 }, { "epoch": 0.7417417417417418, "grad_norm": 0.5458399057388306, "learning_rate": 8.56598003146721e-05, "loss": 0.3524, "step": 6175 }, { "epoch": 0.7423423423423423, "grad_norm": 0.3615384101867676, "learning_rate": 8.563774975642595e-05, "loss": 0.3202, "step": 6180 }, { "epoch": 0.7429429429429429, "grad_norm": 0.40769365429878235, "learning_rate": 8.561568510076861e-05, "loss": 0.3557, "step": 6185 }, { "epoch": 0.7435435435435436, "grad_norm": 0.41414713859558105, "learning_rate": 8.559360635642828e-05, "loss": 0.2905, "step": 6190 }, { "epoch": 0.7441441441441441, "grad_norm": 0.46470022201538086, "learning_rate": 8.557151353213881e-05, "loss": 0.3417, "step": 6195 }, { "epoch": 0.7447447447447447, "grad_norm": 0.4674842953681946, "learning_rate": 8.554940663663953e-05, "loss": 0.3212, "step": 6200 }, { "epoch": 0.7453453453453454, "grad_norm": 0.43788832426071167, "learning_rate": 8.55272856786754e-05, "loss": 0.3357, "step": 6205 }, { "epoch": 0.745945945945946, "grad_norm": 0.42673784494400024, "learning_rate": 8.550515066699692e-05, "loss": 0.3096, "step": 6210 }, { "epoch": 0.7465465465465465, "grad_norm": 0.39619484543800354, "learning_rate": 8.548300161036016e-05, "loss": 0.3188, "step": 6215 }, { "epoch": 0.7471471471471471, "grad_norm": 0.42830052971839905, "learning_rate": 8.546083851752673e-05, "loss": 0.3581, "step": 6220 }, { "epoch": 0.7477477477477478, "grad_norm": 0.3399454355239868, "learning_rate": 8.543866139726381e-05, "loss": 0.3254, "step": 6225 }, { "epoch": 0.7483483483483484, "grad_norm": 0.38868388533592224, "learning_rate": 8.541647025834412e-05, "loss": 0.3096, "step": 6230 }, { "epoch": 0.7489489489489489, "grad_norm": 0.442039430141449, "learning_rate": 8.539426510954589e-05, "loss": 0.3312, "step": 6235 }, { "epoch": 0.7495495495495496, "grad_norm": 0.4817804992198944, "learning_rate": 8.537204595965298e-05, "loss": 0.3422, "step": 6240 }, { "epoch": 0.7501501501501502, "grad_norm": 0.42679134011268616, "learning_rate": 8.53498128174547e-05, "loss": 0.3781, "step": 6245 }, { "epoch": 0.7507507507507507, "grad_norm": 0.4844966530799866, "learning_rate": 8.532756569174593e-05, "loss": 0.3843, "step": 6250 }, { "epoch": 0.7507507507507507, "eval_loss": 0.30824774503707886, "eval_runtime": 35.7014, "eval_samples_per_second": 22.408, "eval_steps_per_second": 5.602, "step": 6250 }, { "epoch": 0.7513513513513513, "grad_norm": 0.3791470527648926, "learning_rate": 8.53053045913271e-05, "loss": 0.3248, "step": 6255 }, { "epoch": 0.751951951951952, "grad_norm": 0.4487055540084839, "learning_rate": 8.528302952500415e-05, "loss": 0.3045, "step": 6260 }, { "epoch": 0.7525525525525526, "grad_norm": 0.3538472354412079, "learning_rate": 8.526074050158855e-05, "loss": 0.2978, "step": 6265 }, { "epoch": 0.7531531531531531, "grad_norm": 0.475076824426651, "learning_rate": 8.523843752989724e-05, "loss": 0.3561, "step": 6270 }, { "epoch": 0.7537537537537538, "grad_norm": 0.4490582346916199, "learning_rate": 8.521612061875278e-05, "loss": 0.3931, "step": 6275 }, { "epoch": 0.7543543543543544, "grad_norm": 0.3989025354385376, "learning_rate": 8.519378977698316e-05, "loss": 0.3208, "step": 6280 }, { "epoch": 0.7549549549549549, "grad_norm": 0.41477465629577637, "learning_rate": 8.51714450134219e-05, "loss": 0.3535, "step": 6285 }, { "epoch": 0.7555555555555555, "grad_norm": 0.3471718430519104, "learning_rate": 8.514908633690806e-05, "loss": 0.3103, "step": 6290 }, { "epoch": 0.7561561561561562, "grad_norm": 0.3647453486919403, "learning_rate": 8.512671375628616e-05, "loss": 0.3069, "step": 6295 }, { "epoch": 0.7567567567567568, "grad_norm": 0.4987131357192993, "learning_rate": 8.510432728040623e-05, "loss": 0.3442, "step": 6300 }, { "epoch": 0.7573573573573573, "grad_norm": 0.4753042757511139, "learning_rate": 8.508192691812385e-05, "loss": 0.3547, "step": 6305 }, { "epoch": 0.757957957957958, "grad_norm": 0.39992815256118774, "learning_rate": 8.505951267829999e-05, "loss": 0.3786, "step": 6310 }, { "epoch": 0.7585585585585586, "grad_norm": 0.4165491759777069, "learning_rate": 8.50370845698012e-05, "loss": 0.313, "step": 6315 }, { "epoch": 0.7591591591591592, "grad_norm": 0.4802296459674835, "learning_rate": 8.50146426014995e-05, "loss": 0.3144, "step": 6320 }, { "epoch": 0.7597597597597597, "grad_norm": 0.34493401646614075, "learning_rate": 8.499218678227234e-05, "loss": 0.3359, "step": 6325 }, { "epoch": 0.7603603603603604, "grad_norm": 0.40848666429519653, "learning_rate": 8.49697171210027e-05, "loss": 0.3263, "step": 6330 }, { "epoch": 0.760960960960961, "grad_norm": 0.4861222803592682, "learning_rate": 8.494723362657902e-05, "loss": 0.3275, "step": 6335 }, { "epoch": 0.7615615615615615, "grad_norm": 0.4158020615577698, "learning_rate": 8.492473630789523e-05, "loss": 0.313, "step": 6340 }, { "epoch": 0.7621621621621621, "grad_norm": 0.3994755148887634, "learning_rate": 8.49022251738507e-05, "loss": 0.3762, "step": 6345 }, { "epoch": 0.7627627627627628, "grad_norm": 0.47359034419059753, "learning_rate": 8.487970023335028e-05, "loss": 0.312, "step": 6350 }, { "epoch": 0.7633633633633634, "grad_norm": 0.4214676320552826, "learning_rate": 8.485716149530428e-05, "loss": 0.3457, "step": 6355 }, { "epoch": 0.7639639639639639, "grad_norm": 0.4861663281917572, "learning_rate": 8.483460896862845e-05, "loss": 0.3491, "step": 6360 }, { "epoch": 0.7645645645645646, "grad_norm": 0.40490207076072693, "learning_rate": 8.481204266224403e-05, "loss": 0.293, "step": 6365 }, { "epoch": 0.7651651651651652, "grad_norm": 0.4203982949256897, "learning_rate": 8.478946258507767e-05, "loss": 0.3458, "step": 6370 }, { "epoch": 0.7657657657657657, "grad_norm": 0.4834545850753784, "learning_rate": 8.476686874606153e-05, "loss": 0.3688, "step": 6375 }, { "epoch": 0.7663663663663663, "grad_norm": 0.4755549430847168, "learning_rate": 8.474426115413314e-05, "loss": 0.3221, "step": 6380 }, { "epoch": 0.766966966966967, "grad_norm": 0.39933833479881287, "learning_rate": 8.47216398182355e-05, "loss": 0.3388, "step": 6385 }, { "epoch": 0.7675675675675676, "grad_norm": 0.474345326423645, "learning_rate": 8.469900474731707e-05, "loss": 0.3069, "step": 6390 }, { "epoch": 0.7681681681681681, "grad_norm": 0.46039968729019165, "learning_rate": 8.467635595033172e-05, "loss": 0.3501, "step": 6395 }, { "epoch": 0.7687687687687688, "grad_norm": 0.4588356614112854, "learning_rate": 8.465369343623875e-05, "loss": 0.3405, "step": 6400 }, { "epoch": 0.7693693693693694, "grad_norm": 0.3978899419307709, "learning_rate": 8.463101721400287e-05, "loss": 0.3406, "step": 6405 }, { "epoch": 0.76996996996997, "grad_norm": 0.3917028605937958, "learning_rate": 8.460832729259427e-05, "loss": 0.3561, "step": 6410 }, { "epoch": 0.7705705705705705, "grad_norm": 0.36762386560440063, "learning_rate": 8.458562368098849e-05, "loss": 0.357, "step": 6415 }, { "epoch": 0.7711711711711712, "grad_norm": 0.426001638174057, "learning_rate": 8.456290638816653e-05, "loss": 0.3219, "step": 6420 }, { "epoch": 0.7717717717717718, "grad_norm": 0.43328964710235596, "learning_rate": 8.454017542311478e-05, "loss": 0.3638, "step": 6425 }, { "epoch": 0.7723723723723723, "grad_norm": 0.39316022396087646, "learning_rate": 8.451743079482506e-05, "loss": 0.3212, "step": 6430 }, { "epoch": 0.772972972972973, "grad_norm": 0.37973538041114807, "learning_rate": 8.449467251229457e-05, "loss": 0.312, "step": 6435 }, { "epoch": 0.7735735735735736, "grad_norm": 0.5051682591438293, "learning_rate": 8.447190058452592e-05, "loss": 0.3516, "step": 6440 }, { "epoch": 0.7741741741741742, "grad_norm": 0.3958034813404083, "learning_rate": 8.444911502052715e-05, "loss": 0.3364, "step": 6445 }, { "epoch": 0.7747747747747747, "grad_norm": 0.36705729365348816, "learning_rate": 8.442631582931162e-05, "loss": 0.3391, "step": 6450 }, { "epoch": 0.7753753753753754, "grad_norm": 0.48368221521377563, "learning_rate": 8.440350301989817e-05, "loss": 0.3281, "step": 6455 }, { "epoch": 0.775975975975976, "grad_norm": 0.4401091933250427, "learning_rate": 8.438067660131092e-05, "loss": 0.2843, "step": 6460 }, { "epoch": 0.7765765765765765, "grad_norm": 0.4518408179283142, "learning_rate": 8.435783658257952e-05, "loss": 0.3332, "step": 6465 }, { "epoch": 0.7771771771771772, "grad_norm": 0.4035428464412689, "learning_rate": 8.433498297273885e-05, "loss": 0.3057, "step": 6470 }, { "epoch": 0.7777777777777778, "grad_norm": 0.3968808650970459, "learning_rate": 8.431211578082925e-05, "loss": 0.3064, "step": 6475 }, { "epoch": 0.7783783783783784, "grad_norm": 0.45929592847824097, "learning_rate": 8.428923501589642e-05, "loss": 0.361, "step": 6480 }, { "epoch": 0.7789789789789789, "grad_norm": 0.4726107716560364, "learning_rate": 8.426634068699144e-05, "loss": 0.3555, "step": 6485 }, { "epoch": 0.7795795795795796, "grad_norm": 0.4348137676715851, "learning_rate": 8.42434328031707e-05, "loss": 0.3134, "step": 6490 }, { "epoch": 0.7801801801801802, "grad_norm": 0.42450523376464844, "learning_rate": 8.422051137349604e-05, "loss": 0.2929, "step": 6495 }, { "epoch": 0.7807807807807807, "grad_norm": 0.3884715437889099, "learning_rate": 8.419757640703456e-05, "loss": 0.3505, "step": 6500 }, { "epoch": 0.7807807807807807, "eval_loss": 0.2983928620815277, "eval_runtime": 35.5006, "eval_samples_per_second": 22.535, "eval_steps_per_second": 5.634, "step": 6500 }, { "epoch": 0.7813813813813814, "grad_norm": 0.38903436064720154, "learning_rate": 8.417462791285879e-05, "loss": 0.319, "step": 6505 }, { "epoch": 0.781981981981982, "grad_norm": 0.5192430019378662, "learning_rate": 8.41516659000466e-05, "loss": 0.3627, "step": 6510 }, { "epoch": 0.7825825825825826, "grad_norm": 0.5047812461853027, "learning_rate": 8.412869037768118e-05, "loss": 0.3569, "step": 6515 }, { "epoch": 0.7831831831831831, "grad_norm": 0.4486142694950104, "learning_rate": 8.410570135485105e-05, "loss": 0.3639, "step": 6520 }, { "epoch": 0.7837837837837838, "grad_norm": 0.3849766254425049, "learning_rate": 8.408269884065014e-05, "loss": 0.366, "step": 6525 }, { "epoch": 0.7843843843843844, "grad_norm": 0.5203561186790466, "learning_rate": 8.405968284417766e-05, "loss": 0.3197, "step": 6530 }, { "epoch": 0.784984984984985, "grad_norm": 0.3983413279056549, "learning_rate": 8.403665337453817e-05, "loss": 0.297, "step": 6535 }, { "epoch": 0.7855855855855856, "grad_norm": 0.5210694670677185, "learning_rate": 8.401361044084154e-05, "loss": 0.3278, "step": 6540 }, { "epoch": 0.7861861861861862, "grad_norm": 0.3612450063228607, "learning_rate": 8.3990554052203e-05, "loss": 0.3049, "step": 6545 }, { "epoch": 0.7867867867867868, "grad_norm": 0.5003215670585632, "learning_rate": 8.39674842177431e-05, "loss": 0.3062, "step": 6550 }, { "epoch": 0.7873873873873873, "grad_norm": 0.4537302255630493, "learning_rate": 8.394440094658767e-05, "loss": 0.3214, "step": 6555 }, { "epoch": 0.787987987987988, "grad_norm": 0.4880407452583313, "learning_rate": 8.392130424786788e-05, "loss": 0.3067, "step": 6560 }, { "epoch": 0.7885885885885886, "grad_norm": 0.4855288863182068, "learning_rate": 8.389819413072024e-05, "loss": 0.3358, "step": 6565 }, { "epoch": 0.7891891891891892, "grad_norm": 0.45421022176742554, "learning_rate": 8.387507060428652e-05, "loss": 0.3018, "step": 6570 }, { "epoch": 0.7897897897897898, "grad_norm": 0.5397417545318604, "learning_rate": 8.38519336777138e-05, "loss": 0.345, "step": 6575 }, { "epoch": 0.7903903903903904, "grad_norm": 0.4351378083229065, "learning_rate": 8.38287833601545e-05, "loss": 0.3067, "step": 6580 }, { "epoch": 0.790990990990991, "grad_norm": 0.39667877554893494, "learning_rate": 8.380561966076632e-05, "loss": 0.294, "step": 6585 }, { "epoch": 0.7915915915915915, "grad_norm": 0.45850926637649536, "learning_rate": 8.378244258871221e-05, "loss": 0.3033, "step": 6590 }, { "epoch": 0.7921921921921922, "grad_norm": 0.4755285978317261, "learning_rate": 8.375925215316048e-05, "loss": 0.3158, "step": 6595 }, { "epoch": 0.7927927927927928, "grad_norm": 0.45534393191337585, "learning_rate": 8.373604836328466e-05, "loss": 0.3269, "step": 6600 }, { "epoch": 0.7933933933933934, "grad_norm": 0.4943680763244629, "learning_rate": 8.371283122826363e-05, "loss": 0.3118, "step": 6605 }, { "epoch": 0.793993993993994, "grad_norm": 0.3808231055736542, "learning_rate": 8.368960075728149e-05, "loss": 0.3331, "step": 6610 }, { "epoch": 0.7945945945945946, "grad_norm": 0.4319944381713867, "learning_rate": 8.366635695952765e-05, "loss": 0.3259, "step": 6615 }, { "epoch": 0.7951951951951952, "grad_norm": 0.4353392720222473, "learning_rate": 8.364309984419677e-05, "loss": 0.3113, "step": 6620 }, { "epoch": 0.7957957957957958, "grad_norm": 0.4579818546772003, "learning_rate": 8.361982942048878e-05, "loss": 0.2505, "step": 6625 }, { "epoch": 0.7963963963963964, "grad_norm": 0.44811052083969116, "learning_rate": 8.359654569760893e-05, "loss": 0.3219, "step": 6630 }, { "epoch": 0.796996996996997, "grad_norm": 0.476921409368515, "learning_rate": 8.357324868476762e-05, "loss": 0.3276, "step": 6635 }, { "epoch": 0.7975975975975976, "grad_norm": 0.5040744543075562, "learning_rate": 8.354993839118062e-05, "loss": 0.3516, "step": 6640 }, { "epoch": 0.7981981981981981, "grad_norm": 0.5249648094177246, "learning_rate": 8.352661482606888e-05, "loss": 0.3068, "step": 6645 }, { "epoch": 0.7987987987987988, "grad_norm": 0.45975515246391296, "learning_rate": 8.350327799865864e-05, "loss": 0.2917, "step": 6650 }, { "epoch": 0.7993993993993994, "grad_norm": 0.6015775799751282, "learning_rate": 8.347992791818137e-05, "loss": 0.3764, "step": 6655 }, { "epoch": 0.8, "grad_norm": 0.49470993876457214, "learning_rate": 8.345656459387376e-05, "loss": 0.3179, "step": 6660 }, { "epoch": 0.8006006006006006, "grad_norm": 0.434529572725296, "learning_rate": 8.343318803497779e-05, "loss": 0.3311, "step": 6665 }, { "epoch": 0.8012012012012012, "grad_norm": 0.3967740535736084, "learning_rate": 8.340979825074063e-05, "loss": 0.3022, "step": 6670 }, { "epoch": 0.8018018018018018, "grad_norm": 0.40132951736450195, "learning_rate": 8.338639525041472e-05, "loss": 0.295, "step": 6675 }, { "epoch": 0.8024024024024023, "grad_norm": 0.38449764251708984, "learning_rate": 8.33629790432577e-05, "loss": 0.3307, "step": 6680 }, { "epoch": 0.803003003003003, "grad_norm": 0.500756561756134, "learning_rate": 8.333954963853241e-05, "loss": 0.3057, "step": 6685 }, { "epoch": 0.8036036036036036, "grad_norm": 0.5168707370758057, "learning_rate": 8.331610704550698e-05, "loss": 0.2988, "step": 6690 }, { "epoch": 0.8042042042042042, "grad_norm": 0.33584412932395935, "learning_rate": 8.329265127345471e-05, "loss": 0.3074, "step": 6695 }, { "epoch": 0.8048048048048048, "grad_norm": 0.3804640769958496, "learning_rate": 8.326918233165412e-05, "loss": 0.3003, "step": 6700 }, { "epoch": 0.8054054054054054, "grad_norm": 0.5202556848526001, "learning_rate": 8.324570022938894e-05, "loss": 0.3428, "step": 6705 }, { "epoch": 0.806006006006006, "grad_norm": 0.4677339792251587, "learning_rate": 8.32222049759481e-05, "loss": 0.3413, "step": 6710 }, { "epoch": 0.8066066066066067, "grad_norm": 0.42975470423698425, "learning_rate": 8.319869658062575e-05, "loss": 0.3425, "step": 6715 }, { "epoch": 0.8072072072072072, "grad_norm": 0.396402508020401, "learning_rate": 8.317517505272125e-05, "loss": 0.308, "step": 6720 }, { "epoch": 0.8078078078078078, "grad_norm": 0.452930748462677, "learning_rate": 8.315164040153911e-05, "loss": 0.292, "step": 6725 }, { "epoch": 0.8084084084084084, "grad_norm": 0.4625903069972992, "learning_rate": 8.312809263638906e-05, "loss": 0.264, "step": 6730 }, { "epoch": 0.809009009009009, "grad_norm": 0.38552045822143555, "learning_rate": 8.310453176658599e-05, "loss": 0.28, "step": 6735 }, { "epoch": 0.8096096096096096, "grad_norm": 0.4750034511089325, "learning_rate": 8.308095780145002e-05, "loss": 0.3182, "step": 6740 }, { "epoch": 0.8102102102102102, "grad_norm": 0.49137282371520996, "learning_rate": 8.305737075030645e-05, "loss": 0.3417, "step": 6745 }, { "epoch": 0.8108108108108109, "grad_norm": 0.4060055911540985, "learning_rate": 8.30337706224857e-05, "loss": 0.2868, "step": 6750 }, { "epoch": 0.8108108108108109, "eval_loss": 0.2909846603870392, "eval_runtime": 35.4853, "eval_samples_per_second": 22.545, "eval_steps_per_second": 5.636, "step": 6750 }, { "epoch": 0.8114114114114114, "grad_norm": 0.4271671175956726, "learning_rate": 8.301015742732338e-05, "loss": 0.3264, "step": 6755 }, { "epoch": 0.812012012012012, "grad_norm": 0.4157504737377167, "learning_rate": 8.298653117416033e-05, "loss": 0.3168, "step": 6760 }, { "epoch": 0.8126126126126126, "grad_norm": 0.47310057282447815, "learning_rate": 8.296289187234248e-05, "loss": 0.3072, "step": 6765 }, { "epoch": 0.8132132132132132, "grad_norm": 0.4417066276073456, "learning_rate": 8.293923953122098e-05, "loss": 0.3141, "step": 6770 }, { "epoch": 0.8138138138138138, "grad_norm": 0.44005995988845825, "learning_rate": 8.291557416015208e-05, "loss": 0.3032, "step": 6775 }, { "epoch": 0.8144144144144144, "grad_norm": 0.457770437002182, "learning_rate": 8.289189576849722e-05, "loss": 0.2836, "step": 6780 }, { "epoch": 0.815015015015015, "grad_norm": 0.48804140090942383, "learning_rate": 8.286820436562301e-05, "loss": 0.3044, "step": 6785 }, { "epoch": 0.8156156156156156, "grad_norm": 0.3784973919391632, "learning_rate": 8.284449996090115e-05, "loss": 0.3148, "step": 6790 }, { "epoch": 0.8162162162162162, "grad_norm": 0.4097319543361664, "learning_rate": 8.282078256370853e-05, "loss": 0.2991, "step": 6795 }, { "epoch": 0.8168168168168168, "grad_norm": 0.45843109488487244, "learning_rate": 8.279705218342718e-05, "loss": 0.2848, "step": 6800 }, { "epoch": 0.8174174174174175, "grad_norm": 0.4852583110332489, "learning_rate": 8.277330882944422e-05, "loss": 0.3338, "step": 6805 }, { "epoch": 0.818018018018018, "grad_norm": 0.43163394927978516, "learning_rate": 8.274955251115195e-05, "loss": 0.3061, "step": 6810 }, { "epoch": 0.8186186186186186, "grad_norm": 0.456047922372818, "learning_rate": 8.272578323794778e-05, "loss": 0.3251, "step": 6815 }, { "epoch": 0.8192192192192193, "grad_norm": 0.42587724328041077, "learning_rate": 8.270200101923427e-05, "loss": 0.2674, "step": 6820 }, { "epoch": 0.8198198198198198, "grad_norm": 0.5281959772109985, "learning_rate": 8.267820586441901e-05, "loss": 0.2946, "step": 6825 }, { "epoch": 0.8204204204204204, "grad_norm": 0.4978925585746765, "learning_rate": 8.265439778291485e-05, "loss": 0.3074, "step": 6830 }, { "epoch": 0.821021021021021, "grad_norm": 0.575862467288971, "learning_rate": 8.263057678413963e-05, "loss": 0.3383, "step": 6835 }, { "epoch": 0.8216216216216217, "grad_norm": 0.38265636563301086, "learning_rate": 8.260674287751637e-05, "loss": 0.3126, "step": 6840 }, { "epoch": 0.8222222222222222, "grad_norm": 0.48499342799186707, "learning_rate": 8.258289607247314e-05, "loss": 0.3055, "step": 6845 }, { "epoch": 0.8228228228228228, "grad_norm": 0.44227099418640137, "learning_rate": 8.25590363784432e-05, "loss": 0.3088, "step": 6850 }, { "epoch": 0.8234234234234235, "grad_norm": 0.5235266089439392, "learning_rate": 8.253516380486481e-05, "loss": 0.311, "step": 6855 }, { "epoch": 0.824024024024024, "grad_norm": 0.4245067238807678, "learning_rate": 8.251127836118138e-05, "loss": 0.279, "step": 6860 }, { "epoch": 0.8246246246246246, "grad_norm": 0.441022127866745, "learning_rate": 8.24873800568414e-05, "loss": 0.3058, "step": 6865 }, { "epoch": 0.8252252252252252, "grad_norm": 0.5665812492370605, "learning_rate": 8.246346890129846e-05, "loss": 0.3386, "step": 6870 }, { "epoch": 0.8258258258258259, "grad_norm": 0.40924161672592163, "learning_rate": 8.24395449040112e-05, "loss": 0.2884, "step": 6875 }, { "epoch": 0.8264264264264264, "grad_norm": 0.5347919464111328, "learning_rate": 8.241560807444338e-05, "loss": 0.3203, "step": 6880 }, { "epoch": 0.827027027027027, "grad_norm": 0.418349027633667, "learning_rate": 8.239165842206381e-05, "loss": 0.2994, "step": 6885 }, { "epoch": 0.8276276276276276, "grad_norm": 0.36912989616394043, "learning_rate": 8.236769595634636e-05, "loss": 0.3056, "step": 6890 }, { "epoch": 0.8282282282282283, "grad_norm": 0.43872979283332825, "learning_rate": 8.234372068677003e-05, "loss": 0.3307, "step": 6895 }, { "epoch": 0.8288288288288288, "grad_norm": 0.44087788462638855, "learning_rate": 8.231973262281881e-05, "loss": 0.297, "step": 6900 }, { "epoch": 0.8294294294294294, "grad_norm": 0.4877927899360657, "learning_rate": 8.22957317739818e-05, "loss": 0.2723, "step": 6905 }, { "epoch": 0.8300300300300301, "grad_norm": 0.5307222604751587, "learning_rate": 8.227171814975312e-05, "loss": 0.3188, "step": 6910 }, { "epoch": 0.8306306306306306, "grad_norm": 0.49993765354156494, "learning_rate": 8.224769175963199e-05, "loss": 0.2528, "step": 6915 }, { "epoch": 0.8312312312312312, "grad_norm": 0.5039234757423401, "learning_rate": 8.222365261312264e-05, "loss": 0.3006, "step": 6920 }, { "epoch": 0.8318318318318318, "grad_norm": 0.4721224009990692, "learning_rate": 8.219960071973436e-05, "loss": 0.3348, "step": 6925 }, { "epoch": 0.8324324324324325, "grad_norm": 0.4163762032985687, "learning_rate": 8.21755360889815e-05, "loss": 0.3302, "step": 6930 }, { "epoch": 0.833033033033033, "grad_norm": 0.523654043674469, "learning_rate": 8.215145873038341e-05, "loss": 0.3182, "step": 6935 }, { "epoch": 0.8336336336336336, "grad_norm": 0.45208466053009033, "learning_rate": 8.212736865346451e-05, "loss": 0.3073, "step": 6940 }, { "epoch": 0.8342342342342343, "grad_norm": 0.45595982670783997, "learning_rate": 8.210326586775423e-05, "loss": 0.3296, "step": 6945 }, { "epoch": 0.8348348348348348, "grad_norm": 0.3884303569793701, "learning_rate": 8.207915038278705e-05, "loss": 0.3124, "step": 6950 }, { "epoch": 0.8354354354354354, "grad_norm": 0.38619038462638855, "learning_rate": 8.205502220810244e-05, "loss": 0.3087, "step": 6955 }, { "epoch": 0.836036036036036, "grad_norm": 0.412649005651474, "learning_rate": 8.203088135324493e-05, "loss": 0.3136, "step": 6960 }, { "epoch": 0.8366366366366367, "grad_norm": 0.388984352350235, "learning_rate": 8.2006727827764e-05, "loss": 0.3094, "step": 6965 }, { "epoch": 0.8372372372372372, "grad_norm": 0.41695263981819153, "learning_rate": 8.198256164121425e-05, "loss": 0.3187, "step": 6970 }, { "epoch": 0.8378378378378378, "grad_norm": 0.48263269662857056, "learning_rate": 8.195838280315517e-05, "loss": 0.2842, "step": 6975 }, { "epoch": 0.8384384384384385, "grad_norm": 0.5374370813369751, "learning_rate": 8.193419132315136e-05, "loss": 0.2778, "step": 6980 }, { "epoch": 0.8390390390390391, "grad_norm": 0.43641534447669983, "learning_rate": 8.190998721077232e-05, "loss": 0.3082, "step": 6985 }, { "epoch": 0.8396396396396396, "grad_norm": 0.39405977725982666, "learning_rate": 8.188577047559262e-05, "loss": 0.2925, "step": 6990 }, { "epoch": 0.8402402402402402, "grad_norm": 0.44023704528808594, "learning_rate": 8.186154112719182e-05, "loss": 0.3021, "step": 6995 }, { "epoch": 0.8408408408408409, "grad_norm": 0.49988189339637756, "learning_rate": 8.183729917515441e-05, "loss": 0.2456, "step": 7000 }, { "epoch": 0.8408408408408409, "eval_loss": 0.2692303955554962, "eval_runtime": 35.5071, "eval_samples_per_second": 22.531, "eval_steps_per_second": 5.633, "step": 7000 }, { "epoch": 0.8414414414414414, "grad_norm": 0.4299604296684265, "learning_rate": 8.181304462906995e-05, "loss": 0.2665, "step": 7005 }, { "epoch": 0.842042042042042, "grad_norm": 0.47068995237350464, "learning_rate": 8.178877749853289e-05, "loss": 0.2925, "step": 7010 }, { "epoch": 0.8426426426426427, "grad_norm": 0.42724624276161194, "learning_rate": 8.176449779314275e-05, "loss": 0.2625, "step": 7015 }, { "epoch": 0.8432432432432433, "grad_norm": 0.4183838367462158, "learning_rate": 8.174020552250395e-05, "loss": 0.2865, "step": 7020 }, { "epoch": 0.8438438438438438, "grad_norm": 0.4842805862426758, "learning_rate": 8.171590069622592e-05, "loss": 0.2766, "step": 7025 }, { "epoch": 0.8444444444444444, "grad_norm": 0.42208966612815857, "learning_rate": 8.169158332392306e-05, "loss": 0.3053, "step": 7030 }, { "epoch": 0.8450450450450451, "grad_norm": 0.4593719244003296, "learning_rate": 8.166725341521468e-05, "loss": 0.2857, "step": 7035 }, { "epoch": 0.8456456456456456, "grad_norm": 0.5179038643836975, "learning_rate": 8.164291097972516e-05, "loss": 0.2931, "step": 7040 }, { "epoch": 0.8462462462462462, "grad_norm": 0.5054352879524231, "learning_rate": 8.161855602708369e-05, "loss": 0.2943, "step": 7045 }, { "epoch": 0.8468468468468469, "grad_norm": 0.4590916633605957, "learning_rate": 8.159418856692453e-05, "loss": 0.266, "step": 7050 }, { "epoch": 0.8474474474474475, "grad_norm": 0.5188509821891785, "learning_rate": 8.156980860888683e-05, "loss": 0.2912, "step": 7055 }, { "epoch": 0.848048048048048, "grad_norm": 0.470938116312027, "learning_rate": 8.154541616261471e-05, "loss": 0.3015, "step": 7060 }, { "epoch": 0.8486486486486486, "grad_norm": 0.47591304779052734, "learning_rate": 8.152101123775719e-05, "loss": 0.2815, "step": 7065 }, { "epoch": 0.8492492492492493, "grad_norm": 0.45679664611816406, "learning_rate": 8.149659384396828e-05, "loss": 0.2826, "step": 7070 }, { "epoch": 0.8498498498498499, "grad_norm": 0.45807573199272156, "learning_rate": 8.14721639909069e-05, "loss": 0.3041, "step": 7075 }, { "epoch": 0.8504504504504504, "grad_norm": 0.4140782654285431, "learning_rate": 8.144772168823686e-05, "loss": 0.2855, "step": 7080 }, { "epoch": 0.851051051051051, "grad_norm": 0.43625256419181824, "learning_rate": 8.142326694562696e-05, "loss": 0.2625, "step": 7085 }, { "epoch": 0.8516516516516517, "grad_norm": 0.3945463299751282, "learning_rate": 8.139879977275088e-05, "loss": 0.3109, "step": 7090 }, { "epoch": 0.8522522522522522, "grad_norm": 0.35140371322631836, "learning_rate": 8.137432017928726e-05, "loss": 0.265, "step": 7095 }, { "epoch": 0.8528528528528528, "grad_norm": 0.5079392194747925, "learning_rate": 8.134982817491956e-05, "loss": 0.2919, "step": 7100 }, { "epoch": 0.8534534534534535, "grad_norm": 0.5279013514518738, "learning_rate": 8.132532376933626e-05, "loss": 0.2684, "step": 7105 }, { "epoch": 0.8540540540540541, "grad_norm": 0.4624316096305847, "learning_rate": 8.130080697223065e-05, "loss": 0.2912, "step": 7110 }, { "epoch": 0.8546546546546546, "grad_norm": 0.434151828289032, "learning_rate": 8.127627779330103e-05, "loss": 0.2955, "step": 7115 }, { "epoch": 0.8552552552552553, "grad_norm": 0.44436565041542053, "learning_rate": 8.125173624225047e-05, "loss": 0.3194, "step": 7120 }, { "epoch": 0.8558558558558559, "grad_norm": 0.48708978295326233, "learning_rate": 8.122718232878705e-05, "loss": 0.3302, "step": 7125 }, { "epoch": 0.8564564564564564, "grad_norm": 0.48794692754745483, "learning_rate": 8.120261606262367e-05, "loss": 0.3095, "step": 7130 }, { "epoch": 0.857057057057057, "grad_norm": 0.5296076536178589, "learning_rate": 8.117803745347815e-05, "loss": 0.3131, "step": 7135 }, { "epoch": 0.8576576576576577, "grad_norm": 0.4305223524570465, "learning_rate": 8.115344651107314e-05, "loss": 0.3127, "step": 7140 }, { "epoch": 0.8582582582582583, "grad_norm": 0.41394057869911194, "learning_rate": 8.112884324513625e-05, "loss": 0.2848, "step": 7145 }, { "epoch": 0.8588588588588588, "grad_norm": 0.4637286067008972, "learning_rate": 8.110422766539991e-05, "loss": 0.3343, "step": 7150 }, { "epoch": 0.8594594594594595, "grad_norm": 0.47156769037246704, "learning_rate": 8.10795997816014e-05, "loss": 0.2641, "step": 7155 }, { "epoch": 0.8600600600600601, "grad_norm": 0.515585720539093, "learning_rate": 8.105495960348297e-05, "loss": 0.2931, "step": 7160 }, { "epoch": 0.8606606606606607, "grad_norm": 0.5015640258789062, "learning_rate": 8.10303071407916e-05, "loss": 0.2683, "step": 7165 }, { "epoch": 0.8612612612612612, "grad_norm": 0.4792843759059906, "learning_rate": 8.100564240327924e-05, "loss": 0.3325, "step": 7170 }, { "epoch": 0.8618618618618619, "grad_norm": 0.47345492243766785, "learning_rate": 8.098096540070262e-05, "loss": 0.2901, "step": 7175 }, { "epoch": 0.8624624624624625, "grad_norm": 0.4682060480117798, "learning_rate": 8.095627614282336e-05, "loss": 0.2767, "step": 7180 }, { "epoch": 0.863063063063063, "grad_norm": 0.4313962161540985, "learning_rate": 8.093157463940792e-05, "loss": 0.3022, "step": 7185 }, { "epoch": 0.8636636636636636, "grad_norm": 0.43570706248283386, "learning_rate": 8.090686090022759e-05, "loss": 0.2828, "step": 7190 }, { "epoch": 0.8642642642642643, "grad_norm": 0.45561483502388, "learning_rate": 8.088213493505854e-05, "loss": 0.2596, "step": 7195 }, { "epoch": 0.8648648648648649, "grad_norm": 0.41750627756118774, "learning_rate": 8.085739675368174e-05, "loss": 0.3473, "step": 7200 }, { "epoch": 0.8654654654654654, "grad_norm": 0.4715013802051544, "learning_rate": 8.083264636588299e-05, "loss": 0.2798, "step": 7205 }, { "epoch": 0.8660660660660661, "grad_norm": 0.38323187828063965, "learning_rate": 8.080788378145291e-05, "loss": 0.2644, "step": 7210 }, { "epoch": 0.8666666666666667, "grad_norm": 0.46973270177841187, "learning_rate": 8.078310901018703e-05, "loss": 0.328, "step": 7215 }, { "epoch": 0.8672672672672672, "grad_norm": 0.44707220792770386, "learning_rate": 8.075832206188558e-05, "loss": 0.2875, "step": 7220 }, { "epoch": 0.8678678678678678, "grad_norm": 0.48569974303245544, "learning_rate": 8.073352294635367e-05, "loss": 0.2957, "step": 7225 }, { "epoch": 0.8684684684684685, "grad_norm": 0.4617443382740021, "learning_rate": 8.070871167340125e-05, "loss": 0.2565, "step": 7230 }, { "epoch": 0.8690690690690691, "grad_norm": 0.44316914677619934, "learning_rate": 8.068388825284304e-05, "loss": 0.2776, "step": 7235 }, { "epoch": 0.8696696696696696, "grad_norm": 0.4461076855659485, "learning_rate": 8.065905269449852e-05, "loss": 0.2697, "step": 7240 }, { "epoch": 0.8702702702702703, "grad_norm": 0.568824827671051, "learning_rate": 8.063420500819205e-05, "loss": 0.3026, "step": 7245 }, { "epoch": 0.8708708708708709, "grad_norm": 0.5436012744903564, "learning_rate": 8.060934520375279e-05, "loss": 0.3146, "step": 7250 }, { "epoch": 0.8708708708708709, "eval_loss": 0.25483617186546326, "eval_runtime": 35.4853, "eval_samples_per_second": 22.545, "eval_steps_per_second": 5.636, "step": 7250 }, { "epoch": 0.8714714714714715, "grad_norm": 0.3968980610370636, "learning_rate": 8.05844732910146e-05, "loss": 0.2831, "step": 7255 }, { "epoch": 0.872072072072072, "grad_norm": 0.4703877568244934, "learning_rate": 8.055958927981627e-05, "loss": 0.3158, "step": 7260 }, { "epoch": 0.8726726726726727, "grad_norm": 0.47599485516548157, "learning_rate": 8.053469318000122e-05, "loss": 0.2853, "step": 7265 }, { "epoch": 0.8732732732732733, "grad_norm": 0.43644478917121887, "learning_rate": 8.050978500141778e-05, "loss": 0.2971, "step": 7270 }, { "epoch": 0.8738738738738738, "grad_norm": 0.5554895997047424, "learning_rate": 8.048486475391901e-05, "loss": 0.2946, "step": 7275 }, { "epoch": 0.8744744744744745, "grad_norm": 0.4631296992301941, "learning_rate": 8.045993244736271e-05, "loss": 0.2915, "step": 7280 }, { "epoch": 0.8750750750750751, "grad_norm": 0.47270381450653076, "learning_rate": 8.043498809161152e-05, "loss": 0.2789, "step": 7285 }, { "epoch": 0.8756756756756757, "grad_norm": 0.5386652946472168, "learning_rate": 8.041003169653278e-05, "loss": 0.2964, "step": 7290 }, { "epoch": 0.8762762762762762, "grad_norm": 0.589094340801239, "learning_rate": 8.038506327199864e-05, "loss": 0.2792, "step": 7295 }, { "epoch": 0.8768768768768769, "grad_norm": 0.32584628462791443, "learning_rate": 8.036008282788599e-05, "loss": 0.256, "step": 7300 }, { "epoch": 0.8774774774774775, "grad_norm": 0.49077874422073364, "learning_rate": 8.033509037407646e-05, "loss": 0.281, "step": 7305 }, { "epoch": 0.878078078078078, "grad_norm": 0.45855236053466797, "learning_rate": 8.031008592045645e-05, "loss": 0.2673, "step": 7310 }, { "epoch": 0.8786786786786787, "grad_norm": 0.5225518345832825, "learning_rate": 8.028506947691712e-05, "loss": 0.2563, "step": 7315 }, { "epoch": 0.8792792792792793, "grad_norm": 0.4622911512851715, "learning_rate": 8.026004105335434e-05, "loss": 0.2897, "step": 7320 }, { "epoch": 0.8798798798798799, "grad_norm": 0.5143116116523743, "learning_rate": 8.023500065966874e-05, "loss": 0.3078, "step": 7325 }, { "epoch": 0.8804804804804804, "grad_norm": 0.39429354667663574, "learning_rate": 8.020994830576566e-05, "loss": 0.2681, "step": 7330 }, { "epoch": 0.8810810810810811, "grad_norm": 0.47837233543395996, "learning_rate": 8.018488400155524e-05, "loss": 0.2708, "step": 7335 }, { "epoch": 0.8816816816816817, "grad_norm": 0.5050178170204163, "learning_rate": 8.015980775695223e-05, "loss": 0.2509, "step": 7340 }, { "epoch": 0.8822822822822823, "grad_norm": 0.6450315713882446, "learning_rate": 8.013471958187624e-05, "loss": 0.3039, "step": 7345 }, { "epoch": 0.8828828828828829, "grad_norm": 0.5351130366325378, "learning_rate": 8.010961948625147e-05, "loss": 0.2596, "step": 7350 }, { "epoch": 0.8834834834834835, "grad_norm": 0.45827925205230713, "learning_rate": 8.008450748000694e-05, "loss": 0.2794, "step": 7355 }, { "epoch": 0.8840840840840841, "grad_norm": 0.46393102407455444, "learning_rate": 8.005938357307632e-05, "loss": 0.2778, "step": 7360 }, { "epoch": 0.8846846846846846, "grad_norm": 0.3539421856403351, "learning_rate": 8.003424777539799e-05, "loss": 0.2509, "step": 7365 }, { "epoch": 0.8852852852852853, "grad_norm": 0.5830817222595215, "learning_rate": 8.000910009691509e-05, "loss": 0.2671, "step": 7370 }, { "epoch": 0.8858858858858859, "grad_norm": 0.4311886429786682, "learning_rate": 7.998394054757538e-05, "loss": 0.2585, "step": 7375 }, { "epoch": 0.8864864864864865, "grad_norm": 0.49385130405426025, "learning_rate": 7.995876913733138e-05, "loss": 0.2938, "step": 7380 }, { "epoch": 0.8870870870870871, "grad_norm": 0.5206505060195923, "learning_rate": 7.993358587614025e-05, "loss": 0.2431, "step": 7385 }, { "epoch": 0.8876876876876877, "grad_norm": 0.42388594150543213, "learning_rate": 7.990839077396391e-05, "loss": 0.2426, "step": 7390 }, { "epoch": 0.8882882882882883, "grad_norm": 0.5296729207038879, "learning_rate": 7.988318384076886e-05, "loss": 0.2884, "step": 7395 }, { "epoch": 0.8888888888888888, "grad_norm": 0.464364230632782, "learning_rate": 7.985796508652638e-05, "loss": 0.3027, "step": 7400 }, { "epoch": 0.8894894894894895, "grad_norm": 0.49531444907188416, "learning_rate": 7.983273452121237e-05, "loss": 0.2494, "step": 7405 }, { "epoch": 0.8900900900900901, "grad_norm": 0.5268791317939758, "learning_rate": 7.98074921548074e-05, "loss": 0.291, "step": 7410 }, { "epoch": 0.8906906906906907, "grad_norm": 0.5387738943099976, "learning_rate": 7.978223799729679e-05, "loss": 0.2952, "step": 7415 }, { "epoch": 0.8912912912912913, "grad_norm": 0.5476765036582947, "learning_rate": 7.975697205867037e-05, "loss": 0.2857, "step": 7420 }, { "epoch": 0.8918918918918919, "grad_norm": 0.45993947982788086, "learning_rate": 7.973169434892279e-05, "loss": 0.2922, "step": 7425 }, { "epoch": 0.8924924924924925, "grad_norm": 0.511158287525177, "learning_rate": 7.970640487805324e-05, "loss": 0.2863, "step": 7430 }, { "epoch": 0.893093093093093, "grad_norm": 0.505344808101654, "learning_rate": 7.968110365606564e-05, "loss": 0.2788, "step": 7435 }, { "epoch": 0.8936936936936937, "grad_norm": 0.4371821880340576, "learning_rate": 7.96557906929685e-05, "loss": 0.2868, "step": 7440 }, { "epoch": 0.8942942942942943, "grad_norm": 0.523827850818634, "learning_rate": 7.963046599877504e-05, "loss": 0.3103, "step": 7445 }, { "epoch": 0.8948948948948949, "grad_norm": 0.4492994546890259, "learning_rate": 7.960512958350303e-05, "loss": 0.2487, "step": 7450 }, { "epoch": 0.8954954954954955, "grad_norm": 0.40934139490127563, "learning_rate": 7.957978145717498e-05, "loss": 0.2574, "step": 7455 }, { "epoch": 0.8960960960960961, "grad_norm": 0.5381701588630676, "learning_rate": 7.955442162981794e-05, "loss": 0.2916, "step": 7460 }, { "epoch": 0.8966966966966967, "grad_norm": 0.5402195453643799, "learning_rate": 7.952905011146365e-05, "loss": 0.2736, "step": 7465 }, { "epoch": 0.8972972972972973, "grad_norm": 0.3956668972969055, "learning_rate": 7.950366691214843e-05, "loss": 0.2751, "step": 7470 }, { "epoch": 0.8978978978978979, "grad_norm": 0.5187951326370239, "learning_rate": 7.947827204191329e-05, "loss": 0.2976, "step": 7475 }, { "epoch": 0.8984984984984985, "grad_norm": 0.46738743782043457, "learning_rate": 7.945286551080379e-05, "loss": 0.2781, "step": 7480 }, { "epoch": 0.8990990990990991, "grad_norm": 0.5121240019798279, "learning_rate": 7.942744732887011e-05, "loss": 0.2903, "step": 7485 }, { "epoch": 0.8996996996996997, "grad_norm": 0.45248308777809143, "learning_rate": 7.940201750616707e-05, "loss": 0.2908, "step": 7490 }, { "epoch": 0.9003003003003003, "grad_norm": 0.5397672653198242, "learning_rate": 7.937657605275408e-05, "loss": 0.2565, "step": 7495 }, { "epoch": 0.9009009009009009, "grad_norm": 0.506698727607727, "learning_rate": 7.935112297869513e-05, "loss": 0.2824, "step": 7500 }, { "epoch": 0.9009009009009009, "eval_loss": 0.24328570067882538, "eval_runtime": 35.6281, "eval_samples_per_second": 22.454, "eval_steps_per_second": 5.614, "step": 7500 }, { "epoch": 0.9015015015015015, "grad_norm": 0.5071007609367371, "learning_rate": 7.932565829405883e-05, "loss": 0.2777, "step": 7505 }, { "epoch": 0.9021021021021021, "grad_norm": 0.41715240478515625, "learning_rate": 7.93001820089184e-05, "loss": 0.2832, "step": 7510 }, { "epoch": 0.9027027027027027, "grad_norm": 0.4370177090167999, "learning_rate": 7.927469413335163e-05, "loss": 0.2977, "step": 7515 }, { "epoch": 0.9033033033033033, "grad_norm": 0.5694493055343628, "learning_rate": 7.924919467744085e-05, "loss": 0.269, "step": 7520 }, { "epoch": 0.9039039039039038, "grad_norm": 0.4566720426082611, "learning_rate": 7.922368365127304e-05, "loss": 0.2566, "step": 7525 }, { "epoch": 0.9045045045045045, "grad_norm": 0.5011866092681885, "learning_rate": 7.919816106493973e-05, "loss": 0.2673, "step": 7530 }, { "epoch": 0.9051051051051051, "grad_norm": 0.5244460701942444, "learning_rate": 7.917262692853703e-05, "loss": 0.2872, "step": 7535 }, { "epoch": 0.9057057057057057, "grad_norm": 0.5451821684837341, "learning_rate": 7.91470812521656e-05, "loss": 0.2669, "step": 7540 }, { "epoch": 0.9063063063063063, "grad_norm": 0.4076676368713379, "learning_rate": 7.912152404593069e-05, "loss": 0.2551, "step": 7545 }, { "epoch": 0.9069069069069069, "grad_norm": 0.48945051431655884, "learning_rate": 7.909595531994208e-05, "loss": 0.2791, "step": 7550 }, { "epoch": 0.9075075075075075, "grad_norm": 0.444084107875824, "learning_rate": 7.907037508431414e-05, "loss": 0.2687, "step": 7555 }, { "epoch": 0.9081081081081082, "grad_norm": 0.48731303215026855, "learning_rate": 7.904478334916576e-05, "loss": 0.2764, "step": 7560 }, { "epoch": 0.9087087087087087, "grad_norm": 0.48154494166374207, "learning_rate": 7.901918012462042e-05, "loss": 0.2636, "step": 7565 }, { "epoch": 0.9093093093093093, "grad_norm": 0.6483919024467468, "learning_rate": 7.899356542080612e-05, "loss": 0.2545, "step": 7570 }, { "epoch": 0.9099099099099099, "grad_norm": 0.4964236915111542, "learning_rate": 7.896793924785537e-05, "loss": 0.2716, "step": 7575 }, { "epoch": 0.9105105105105105, "grad_norm": 0.5206024646759033, "learning_rate": 7.894230161590528e-05, "loss": 0.3074, "step": 7580 }, { "epoch": 0.9111111111111111, "grad_norm": 0.5688350796699524, "learning_rate": 7.891665253509746e-05, "loss": 0.3095, "step": 7585 }, { "epoch": 0.9117117117117117, "grad_norm": 0.5272214412689209, "learning_rate": 7.889099201557804e-05, "loss": 0.2776, "step": 7590 }, { "epoch": 0.9123123123123124, "grad_norm": 0.40869587659835815, "learning_rate": 7.886532006749768e-05, "loss": 0.2553, "step": 7595 }, { "epoch": 0.9129129129129129, "grad_norm": 0.41617196798324585, "learning_rate": 7.883963670101158e-05, "loss": 0.3021, "step": 7600 }, { "epoch": 0.9135135135135135, "grad_norm": 0.41561898589134216, "learning_rate": 7.881394192627947e-05, "loss": 0.2634, "step": 7605 }, { "epoch": 0.9141141141141141, "grad_norm": 0.5376163125038147, "learning_rate": 7.878823575346552e-05, "loss": 0.2937, "step": 7610 }, { "epoch": 0.9147147147147147, "grad_norm": 0.5017083287239075, "learning_rate": 7.876251819273846e-05, "loss": 0.2803, "step": 7615 }, { "epoch": 0.9153153153153153, "grad_norm": 0.49231407046318054, "learning_rate": 7.873678925427154e-05, "loss": 0.2636, "step": 7620 }, { "epoch": 0.9159159159159159, "grad_norm": 0.4994038939476013, "learning_rate": 7.87110489482425e-05, "loss": 0.2685, "step": 7625 }, { "epoch": 0.9165165165165166, "grad_norm": 0.4108923375606537, "learning_rate": 7.868529728483353e-05, "loss": 0.2594, "step": 7630 }, { "epoch": 0.9171171171171171, "grad_norm": 0.45832404494285583, "learning_rate": 7.865953427423139e-05, "loss": 0.2559, "step": 7635 }, { "epoch": 0.9177177177177177, "grad_norm": 0.41517266631126404, "learning_rate": 7.863375992662727e-05, "loss": 0.2434, "step": 7640 }, { "epoch": 0.9183183183183183, "grad_norm": 0.5447494387626648, "learning_rate": 7.860797425221685e-05, "loss": 0.2728, "step": 7645 }, { "epoch": 0.918918918918919, "grad_norm": 0.44468387961387634, "learning_rate": 7.858217726120033e-05, "loss": 0.2426, "step": 7650 }, { "epoch": 0.9195195195195195, "grad_norm": 0.5114275217056274, "learning_rate": 7.855636896378236e-05, "loss": 0.282, "step": 7655 }, { "epoch": 0.9201201201201201, "grad_norm": 0.4041288197040558, "learning_rate": 7.853054937017203e-05, "loss": 0.2664, "step": 7660 }, { "epoch": 0.9207207207207208, "grad_norm": 0.5267107486724854, "learning_rate": 7.850471849058297e-05, "loss": 0.2849, "step": 7665 }, { "epoch": 0.9213213213213213, "grad_norm": 0.5837280750274658, "learning_rate": 7.847887633523321e-05, "loss": 0.3062, "step": 7670 }, { "epoch": 0.9219219219219219, "grad_norm": 0.4394720494747162, "learning_rate": 7.845302291434528e-05, "loss": 0.2756, "step": 7675 }, { "epoch": 0.9225225225225225, "grad_norm": 0.4582143723964691, "learning_rate": 7.842715823814616e-05, "loss": 0.263, "step": 7680 }, { "epoch": 0.9231231231231232, "grad_norm": 0.5353251695632935, "learning_rate": 7.840128231686727e-05, "loss": 0.2802, "step": 7685 }, { "epoch": 0.9237237237237237, "grad_norm": 0.5931753516197205, "learning_rate": 7.837539516074448e-05, "loss": 0.2698, "step": 7690 }, { "epoch": 0.9243243243243243, "grad_norm": 0.4375219941139221, "learning_rate": 7.83494967800181e-05, "loss": 0.2491, "step": 7695 }, { "epoch": 0.924924924924925, "grad_norm": 0.4502767324447632, "learning_rate": 7.83235871849329e-05, "loss": 0.2608, "step": 7700 }, { "epoch": 0.9255255255255255, "grad_norm": 0.47270476818084717, "learning_rate": 7.829766638573805e-05, "loss": 0.2882, "step": 7705 }, { "epoch": 0.9261261261261261, "grad_norm": 0.47821709513664246, "learning_rate": 7.827173439268723e-05, "loss": 0.3019, "step": 7710 }, { "epoch": 0.9267267267267267, "grad_norm": 0.543599009513855, "learning_rate": 7.824579121603843e-05, "loss": 0.29, "step": 7715 }, { "epoch": 0.9273273273273274, "grad_norm": 0.5124878883361816, "learning_rate": 7.821983686605416e-05, "loss": 0.2691, "step": 7720 }, { "epoch": 0.9279279279279279, "grad_norm": 0.4807772934436798, "learning_rate": 7.819387135300134e-05, "loss": 0.2558, "step": 7725 }, { "epoch": 0.9285285285285285, "grad_norm": 0.4797044098377228, "learning_rate": 7.816789468715124e-05, "loss": 0.2526, "step": 7730 }, { "epoch": 0.9291291291291291, "grad_norm": 0.4510939419269562, "learning_rate": 7.81419068787796e-05, "loss": 0.283, "step": 7735 }, { "epoch": 0.9297297297297298, "grad_norm": 0.4945512115955353, "learning_rate": 7.811590793816658e-05, "loss": 0.2376, "step": 7740 }, { "epoch": 0.9303303303303303, "grad_norm": 0.5905566215515137, "learning_rate": 7.808989787559668e-05, "loss": 0.2849, "step": 7745 }, { "epoch": 0.9309309309309309, "grad_norm": 0.578352689743042, "learning_rate": 7.806387670135886e-05, "loss": 0.2847, "step": 7750 }, { "epoch": 0.9309309309309309, "eval_loss": 0.23058752715587616, "eval_runtime": 35.5677, "eval_samples_per_second": 22.492, "eval_steps_per_second": 5.623, "step": 7750 }, { "epoch": 0.9315315315315316, "grad_norm": 0.47081324458122253, "learning_rate": 7.803784442574646e-05, "loss": 0.2275, "step": 7755 }, { "epoch": 0.9321321321321321, "grad_norm": 0.4713752567768097, "learning_rate": 7.801180105905716e-05, "loss": 0.2742, "step": 7760 }, { "epoch": 0.9327327327327327, "grad_norm": 0.5127248764038086, "learning_rate": 7.798574661159313e-05, "loss": 0.2729, "step": 7765 }, { "epoch": 0.9333333333333333, "grad_norm": 0.48777830600738525, "learning_rate": 7.79596810936608e-05, "loss": 0.2683, "step": 7770 }, { "epoch": 0.933933933933934, "grad_norm": 0.5385095477104187, "learning_rate": 7.79336045155711e-05, "loss": 0.2452, "step": 7775 }, { "epoch": 0.9345345345345345, "grad_norm": 0.4768882691860199, "learning_rate": 7.790751688763926e-05, "loss": 0.2588, "step": 7780 }, { "epoch": 0.9351351351351351, "grad_norm": 0.451249361038208, "learning_rate": 7.788141822018488e-05, "loss": 0.2491, "step": 7785 }, { "epoch": 0.9357357357357358, "grad_norm": 0.5339096784591675, "learning_rate": 7.785530852353196e-05, "loss": 0.2273, "step": 7790 }, { "epoch": 0.9363363363363363, "grad_norm": 0.5050491690635681, "learning_rate": 7.782918780800885e-05, "loss": 0.2476, "step": 7795 }, { "epoch": 0.9369369369369369, "grad_norm": 0.626227080821991, "learning_rate": 7.780305608394828e-05, "loss": 0.2683, "step": 7800 }, { "epoch": 0.9375375375375375, "grad_norm": 0.5088838338851929, "learning_rate": 7.777691336168728e-05, "loss": 0.2779, "step": 7805 }, { "epoch": 0.9381381381381382, "grad_norm": 0.4789465367794037, "learning_rate": 7.775075965156726e-05, "loss": 0.2472, "step": 7810 }, { "epoch": 0.9387387387387387, "grad_norm": 0.478678822517395, "learning_rate": 7.772459496393401e-05, "loss": 0.2946, "step": 7815 }, { "epoch": 0.9393393393393393, "grad_norm": 0.4863022565841675, "learning_rate": 7.769841930913761e-05, "loss": 0.2454, "step": 7820 }, { "epoch": 0.93993993993994, "grad_norm": 0.5988958477973938, "learning_rate": 7.767223269753253e-05, "loss": 0.2678, "step": 7825 }, { "epoch": 0.9405405405405406, "grad_norm": 0.6609881520271301, "learning_rate": 7.76460351394775e-05, "loss": 0.2362, "step": 7830 }, { "epoch": 0.9411411411411411, "grad_norm": 0.4918898344039917, "learning_rate": 7.761982664533567e-05, "loss": 0.252, "step": 7835 }, { "epoch": 0.9417417417417417, "grad_norm": 0.43102115392684937, "learning_rate": 7.759360722547443e-05, "loss": 0.2719, "step": 7840 }, { "epoch": 0.9423423423423424, "grad_norm": 0.4987165331840515, "learning_rate": 7.756737689026556e-05, "loss": 0.2938, "step": 7845 }, { "epoch": 0.9429429429429429, "grad_norm": 0.4932297170162201, "learning_rate": 7.754113565008513e-05, "loss": 0.2675, "step": 7850 }, { "epoch": 0.9435435435435435, "grad_norm": 0.6924968957901001, "learning_rate": 7.751488351531351e-05, "loss": 0.2544, "step": 7855 }, { "epoch": 0.9441441441441442, "grad_norm": 0.5172476172447205, "learning_rate": 7.748862049633541e-05, "loss": 0.2303, "step": 7860 }, { "epoch": 0.9447447447447448, "grad_norm": 0.3794347047805786, "learning_rate": 7.746234660353981e-05, "loss": 0.2187, "step": 7865 }, { "epoch": 0.9453453453453453, "grad_norm": 0.513824462890625, "learning_rate": 7.743606184732004e-05, "loss": 0.2549, "step": 7870 }, { "epoch": 0.9459459459459459, "grad_norm": 0.4459366798400879, "learning_rate": 7.740976623807367e-05, "loss": 0.2298, "step": 7875 }, { "epoch": 0.9465465465465466, "grad_norm": 0.49473559856414795, "learning_rate": 7.73834597862026e-05, "loss": 0.2406, "step": 7880 }, { "epoch": 0.9471471471471471, "grad_norm": 0.4191824197769165, "learning_rate": 7.735714250211304e-05, "loss": 0.2586, "step": 7885 }, { "epoch": 0.9477477477477477, "grad_norm": 0.47258394956588745, "learning_rate": 7.733081439621542e-05, "loss": 0.2461, "step": 7890 }, { "epoch": 0.9483483483483484, "grad_norm": 0.5496533513069153, "learning_rate": 7.73044754789245e-05, "loss": 0.2664, "step": 7895 }, { "epoch": 0.948948948948949, "grad_norm": 0.48711472749710083, "learning_rate": 7.727812576065929e-05, "loss": 0.2677, "step": 7900 }, { "epoch": 0.9495495495495495, "grad_norm": 0.46601200103759766, "learning_rate": 7.72517652518431e-05, "loss": 0.275, "step": 7905 }, { "epoch": 0.9501501501501501, "grad_norm": 0.43569445610046387, "learning_rate": 7.722539396290349e-05, "loss": 0.2428, "step": 7910 }, { "epoch": 0.9507507507507508, "grad_norm": 0.5569596886634827, "learning_rate": 7.71990119042723e-05, "loss": 0.2315, "step": 7915 }, { "epoch": 0.9513513513513514, "grad_norm": 0.4711536169052124, "learning_rate": 7.717261908638562e-05, "loss": 0.238, "step": 7920 }, { "epoch": 0.9519519519519519, "grad_norm": 0.5207387804985046, "learning_rate": 7.714621551968375e-05, "loss": 0.2586, "step": 7925 }, { "epoch": 0.9525525525525526, "grad_norm": 0.419005423784256, "learning_rate": 7.711980121461136e-05, "loss": 0.2645, "step": 7930 }, { "epoch": 0.9531531531531532, "grad_norm": 0.5372774600982666, "learning_rate": 7.709337618161723e-05, "loss": 0.239, "step": 7935 }, { "epoch": 0.9537537537537537, "grad_norm": 0.470059335231781, "learning_rate": 7.706694043115448e-05, "loss": 0.2689, "step": 7940 }, { "epoch": 0.9543543543543543, "grad_norm": 0.4631572365760803, "learning_rate": 7.704049397368045e-05, "loss": 0.2763, "step": 7945 }, { "epoch": 0.954954954954955, "grad_norm": 0.4009503126144409, "learning_rate": 7.701403681965666e-05, "loss": 0.25, "step": 7950 }, { "epoch": 0.9555555555555556, "grad_norm": 0.6539427638053894, "learning_rate": 7.698756897954897e-05, "loss": 0.2336, "step": 7955 }, { "epoch": 0.9561561561561561, "grad_norm": 0.5389593839645386, "learning_rate": 7.696109046382733e-05, "loss": 0.2372, "step": 7960 }, { "epoch": 0.9567567567567568, "grad_norm": 0.5966012477874756, "learning_rate": 7.693460128296604e-05, "loss": 0.2446, "step": 7965 }, { "epoch": 0.9573573573573574, "grad_norm": 0.5658740401268005, "learning_rate": 7.690810144744352e-05, "loss": 0.2367, "step": 7970 }, { "epoch": 0.9579579579579579, "grad_norm": 0.4769929051399231, "learning_rate": 7.688159096774247e-05, "loss": 0.2536, "step": 7975 }, { "epoch": 0.9585585585585585, "grad_norm": 0.5098903179168701, "learning_rate": 7.685506985434981e-05, "loss": 0.2823, "step": 7980 }, { "epoch": 0.9591591591591592, "grad_norm": 0.4759390354156494, "learning_rate": 7.682853811775659e-05, "loss": 0.2797, "step": 7985 }, { "epoch": 0.9597597597597598, "grad_norm": 0.5201223492622375, "learning_rate": 7.680199576845813e-05, "loss": 0.2312, "step": 7990 }, { "epoch": 0.9603603603603603, "grad_norm": 0.4938308000564575, "learning_rate": 7.677544281695392e-05, "loss": 0.2334, "step": 7995 }, { "epoch": 0.960960960960961, "grad_norm": 0.5370311141014099, "learning_rate": 7.674887927374765e-05, "loss": 0.2612, "step": 8000 }, { "epoch": 0.960960960960961, "eval_loss": 0.2181350737810135, "eval_runtime": 35.7024, "eval_samples_per_second": 22.407, "eval_steps_per_second": 5.602, "step": 8000 }, { "epoch": 0.9615615615615616, "grad_norm": 0.5911093950271606, "learning_rate": 7.67223051493472e-05, "loss": 0.2559, "step": 8005 }, { "epoch": 0.9621621621621622, "grad_norm": 0.5361072421073914, "learning_rate": 7.669572045426463e-05, "loss": 0.2747, "step": 8010 }, { "epoch": 0.9627627627627627, "grad_norm": 0.4973791241645813, "learning_rate": 7.66691251990162e-05, "loss": 0.2293, "step": 8015 }, { "epoch": 0.9633633633633634, "grad_norm": 0.5274470448493958, "learning_rate": 7.664251939412232e-05, "loss": 0.2258, "step": 8020 }, { "epoch": 0.963963963963964, "grad_norm": 0.5574861168861389, "learning_rate": 7.661590305010759e-05, "loss": 0.2704, "step": 8025 }, { "epoch": 0.9645645645645645, "grad_norm": 0.5257890224456787, "learning_rate": 7.65892761775008e-05, "loss": 0.2243, "step": 8030 }, { "epoch": 0.9651651651651652, "grad_norm": 0.5335781574249268, "learning_rate": 7.656263878683485e-05, "loss": 0.2834, "step": 8035 }, { "epoch": 0.9657657657657658, "grad_norm": 0.5913284420967102, "learning_rate": 7.653599088864685e-05, "loss": 0.2434, "step": 8040 }, { "epoch": 0.9663663663663664, "grad_norm": 0.4447031617164612, "learning_rate": 7.650933249347803e-05, "loss": 0.2221, "step": 8045 }, { "epoch": 0.9669669669669669, "grad_norm": 0.4803239703178406, "learning_rate": 7.648266361187382e-05, "loss": 0.2511, "step": 8050 }, { "epoch": 0.9675675675675676, "grad_norm": 0.4980545938014984, "learning_rate": 7.645598425438374e-05, "loss": 0.2269, "step": 8055 }, { "epoch": 0.9681681681681682, "grad_norm": 0.6442884206771851, "learning_rate": 7.64292944315615e-05, "loss": 0.2651, "step": 8060 }, { "epoch": 0.9687687687687687, "grad_norm": 0.5565935969352722, "learning_rate": 7.640259415396494e-05, "loss": 0.2496, "step": 8065 }, { "epoch": 0.9693693693693693, "grad_norm": 0.5555434823036194, "learning_rate": 7.6375883432156e-05, "loss": 0.2471, "step": 8070 }, { "epoch": 0.96996996996997, "grad_norm": 0.5130464434623718, "learning_rate": 7.634916227670081e-05, "loss": 0.2215, "step": 8075 }, { "epoch": 0.9705705705705706, "grad_norm": 0.4054581820964813, "learning_rate": 7.632243069816957e-05, "loss": 0.2236, "step": 8080 }, { "epoch": 0.9711711711711711, "grad_norm": 0.628891110420227, "learning_rate": 7.629568870713668e-05, "loss": 0.2275, "step": 8085 }, { "epoch": 0.9717717717717718, "grad_norm": 0.5326948761940002, "learning_rate": 7.626893631418055e-05, "loss": 0.2417, "step": 8090 }, { "epoch": 0.9723723723723724, "grad_norm": 0.5320354700088501, "learning_rate": 7.624217352988379e-05, "loss": 0.2291, "step": 8095 }, { "epoch": 0.972972972972973, "grad_norm": 0.49078038334846497, "learning_rate": 7.621540036483308e-05, "loss": 0.2395, "step": 8100 }, { "epoch": 0.9735735735735735, "grad_norm": 0.5511507391929626, "learning_rate": 7.618861682961927e-05, "loss": 0.2246, "step": 8105 }, { "epoch": 0.9741741741741742, "grad_norm": 0.5163986086845398, "learning_rate": 7.61618229348372e-05, "loss": 0.2337, "step": 8110 }, { "epoch": 0.9747747747747748, "grad_norm": 0.5130642056465149, "learning_rate": 7.613501869108589e-05, "loss": 0.2465, "step": 8115 }, { "epoch": 0.9753753753753753, "grad_norm": 0.5894377827644348, "learning_rate": 7.610820410896847e-05, "loss": 0.2954, "step": 8120 }, { "epoch": 0.975975975975976, "grad_norm": 0.543550968170166, "learning_rate": 7.608137919909208e-05, "loss": 0.2444, "step": 8125 }, { "epoch": 0.9765765765765766, "grad_norm": 0.48070836067199707, "learning_rate": 7.605454397206802e-05, "loss": 0.2234, "step": 8130 }, { "epoch": 0.9771771771771772, "grad_norm": 0.6094707250595093, "learning_rate": 7.602769843851163e-05, "loss": 0.2473, "step": 8135 }, { "epoch": 0.9777777777777777, "grad_norm": 0.5057632327079773, "learning_rate": 7.600084260904232e-05, "loss": 0.2432, "step": 8140 }, { "epoch": 0.9783783783783784, "grad_norm": 0.6155203580856323, "learning_rate": 7.597397649428364e-05, "loss": 0.2365, "step": 8145 }, { "epoch": 0.978978978978979, "grad_norm": 0.4686107635498047, "learning_rate": 7.59471001048631e-05, "loss": 0.2335, "step": 8150 }, { "epoch": 0.9795795795795795, "grad_norm": 0.5839318633079529, "learning_rate": 7.592021345141238e-05, "loss": 0.2315, "step": 8155 }, { "epoch": 0.9801801801801802, "grad_norm": 0.45957908034324646, "learning_rate": 7.589331654456716e-05, "loss": 0.216, "step": 8160 }, { "epoch": 0.9807807807807808, "grad_norm": 0.4842189848423004, "learning_rate": 7.586640939496717e-05, "loss": 0.2067, "step": 8165 }, { "epoch": 0.9813813813813814, "grad_norm": 0.5004525184631348, "learning_rate": 7.583949201325623e-05, "loss": 0.2249, "step": 8170 }, { "epoch": 0.9819819819819819, "grad_norm": 0.5188431143760681, "learning_rate": 7.58125644100822e-05, "loss": 0.2414, "step": 8175 }, { "epoch": 0.9825825825825826, "grad_norm": 0.5917747020721436, "learning_rate": 7.578562659609696e-05, "loss": 0.2738, "step": 8180 }, { "epoch": 0.9831831831831832, "grad_norm": 0.48418867588043213, "learning_rate": 7.575867858195644e-05, "loss": 0.2446, "step": 8185 }, { "epoch": 0.9837837837837838, "grad_norm": 0.5367460250854492, "learning_rate": 7.573172037832062e-05, "loss": 0.2584, "step": 8190 }, { "epoch": 0.9843843843843844, "grad_norm": 0.5259852409362793, "learning_rate": 7.570475199585348e-05, "loss": 0.247, "step": 8195 }, { "epoch": 0.984984984984985, "grad_norm": 0.6347830295562744, "learning_rate": 7.567777344522304e-05, "loss": 0.228, "step": 8200 }, { "epoch": 0.9855855855855856, "grad_norm": 0.40549105405807495, "learning_rate": 7.565078473710137e-05, "loss": 0.2263, "step": 8205 }, { "epoch": 0.9861861861861861, "grad_norm": 0.48162418603897095, "learning_rate": 7.562378588216454e-05, "loss": 0.2353, "step": 8210 }, { "epoch": 0.9867867867867868, "grad_norm": 0.5109835863113403, "learning_rate": 7.55967768910926e-05, "loss": 0.2349, "step": 8215 }, { "epoch": 0.9873873873873874, "grad_norm": 0.37698864936828613, "learning_rate": 7.556975777456962e-05, "loss": 0.2165, "step": 8220 }, { "epoch": 0.987987987987988, "grad_norm": 0.5112248063087463, "learning_rate": 7.554272854328377e-05, "loss": 0.2562, "step": 8225 }, { "epoch": 0.9885885885885886, "grad_norm": 0.44548270106315613, "learning_rate": 7.551568920792708e-05, "loss": 0.2314, "step": 8230 }, { "epoch": 0.9891891891891892, "grad_norm": 0.44484102725982666, "learning_rate": 7.548863977919567e-05, "loss": 0.213, "step": 8235 }, { "epoch": 0.9897897897897898, "grad_norm": 0.48750653862953186, "learning_rate": 7.54615802677896e-05, "loss": 0.2283, "step": 8240 }, { "epoch": 0.9903903903903903, "grad_norm": 0.5573891997337341, "learning_rate": 7.543451068441297e-05, "loss": 0.27, "step": 8245 }, { "epoch": 0.990990990990991, "grad_norm": 0.5790622234344482, "learning_rate": 7.540743103977378e-05, "loss": 0.2519, "step": 8250 }, { "epoch": 0.990990990990991, "eval_loss": 0.20414112508296967, "eval_runtime": 35.4459, "eval_samples_per_second": 22.57, "eval_steps_per_second": 5.642, "step": 8250 }, { "epoch": 0.9915915915915916, "grad_norm": 0.5384604930877686, "learning_rate": 7.538034134458414e-05, "loss": 0.2538, "step": 8255 }, { "epoch": 0.9921921921921922, "grad_norm": 0.47747641801834106, "learning_rate": 7.535324160956003e-05, "loss": 0.2567, "step": 8260 }, { "epoch": 0.9927927927927928, "grad_norm": 0.576480507850647, "learning_rate": 7.532613184542144e-05, "loss": 0.2501, "step": 8265 }, { "epoch": 0.9933933933933934, "grad_norm": 0.693172812461853, "learning_rate": 7.529901206289231e-05, "loss": 0.2771, "step": 8270 }, { "epoch": 0.993993993993994, "grad_norm": 0.5924530625343323, "learning_rate": 7.527188227270057e-05, "loss": 0.2371, "step": 8275 }, { "epoch": 0.9945945945945946, "grad_norm": 0.5856573581695557, "learning_rate": 7.524474248557809e-05, "loss": 0.2362, "step": 8280 }, { "epoch": 0.9951951951951952, "grad_norm": 0.5145263671875, "learning_rate": 7.521759271226068e-05, "loss": 0.2495, "step": 8285 }, { "epoch": 0.9957957957957958, "grad_norm": 0.6087602972984314, "learning_rate": 7.519043296348813e-05, "loss": 0.2678, "step": 8290 }, { "epoch": 0.9963963963963964, "grad_norm": 0.45782071352005005, "learning_rate": 7.516326325000418e-05, "loss": 0.2515, "step": 8295 }, { "epoch": 0.996996996996997, "grad_norm": 0.5238723158836365, "learning_rate": 7.513608358255646e-05, "loss": 0.2662, "step": 8300 }, { "epoch": 0.9975975975975976, "grad_norm": 0.5170171856880188, "learning_rate": 7.51088939718966e-05, "loss": 0.2762, "step": 8305 }, { "epoch": 0.9981981981981982, "grad_norm": 0.5645842552185059, "learning_rate": 7.508169442878013e-05, "loss": 0.2756, "step": 8310 }, { "epoch": 0.9987987987987988, "grad_norm": 0.47060051560401917, "learning_rate": 7.505448496396652e-05, "loss": 0.2583, "step": 8315 }, { "epoch": 0.9993993993993994, "grad_norm": 0.5723556876182556, "learning_rate": 7.502726558821915e-05, "loss": 0.2484, "step": 8320 }, { "epoch": 1.0, "grad_norm": 0.5563798546791077, "learning_rate": 7.500003631230534e-05, "loss": 0.2431, "step": 8325 }, { "epoch": 1.0006006006006005, "grad_norm": 0.48347780108451843, "learning_rate": 7.497279714699632e-05, "loss": 0.1432, "step": 8330 }, { "epoch": 1.0012012012012013, "grad_norm": 0.4860369861125946, "learning_rate": 7.494554810306721e-05, "loss": 0.1562, "step": 8335 }, { "epoch": 1.0018018018018018, "grad_norm": 0.5238087773323059, "learning_rate": 7.491828919129709e-05, "loss": 0.1624, "step": 8340 }, { "epoch": 1.0024024024024023, "grad_norm": 0.5139604806900024, "learning_rate": 7.489102042246888e-05, "loss": 0.157, "step": 8345 }, { "epoch": 1.003003003003003, "grad_norm": 0.5299177169799805, "learning_rate": 7.486374180736944e-05, "loss": 0.1704, "step": 8350 }, { "epoch": 1.0036036036036036, "grad_norm": 0.4978356957435608, "learning_rate": 7.48364533567895e-05, "loss": 0.1538, "step": 8355 }, { "epoch": 1.0042042042042043, "grad_norm": 0.4236242175102234, "learning_rate": 7.480915508152372e-05, "loss": 0.1607, "step": 8360 }, { "epoch": 1.0048048048048048, "grad_norm": 0.3946767747402191, "learning_rate": 7.478184699237061e-05, "loss": 0.1365, "step": 8365 }, { "epoch": 1.0054054054054054, "grad_norm": 0.47756099700927734, "learning_rate": 7.475452910013259e-05, "loss": 0.1566, "step": 8370 }, { "epoch": 1.006006006006006, "grad_norm": 0.4343335032463074, "learning_rate": 7.472720141561591e-05, "loss": 0.1509, "step": 8375 }, { "epoch": 1.0066066066066066, "grad_norm": 0.4657896161079407, "learning_rate": 7.469986394963076e-05, "loss": 0.1595, "step": 8380 }, { "epoch": 1.0072072072072071, "grad_norm": 0.5368801355361938, "learning_rate": 7.467251671299113e-05, "loss": 0.1648, "step": 8385 }, { "epoch": 1.0078078078078079, "grad_norm": 0.5660285353660583, "learning_rate": 7.464515971651493e-05, "loss": 0.1529, "step": 8390 }, { "epoch": 1.0084084084084084, "grad_norm": 0.4559033513069153, "learning_rate": 7.461779297102391e-05, "loss": 0.1359, "step": 8395 }, { "epoch": 1.009009009009009, "grad_norm": 0.48404526710510254, "learning_rate": 7.459041648734368e-05, "loss": 0.1593, "step": 8400 }, { "epoch": 1.0096096096096097, "grad_norm": 0.4860776662826538, "learning_rate": 7.456303027630366e-05, "loss": 0.1693, "step": 8405 }, { "epoch": 1.0102102102102102, "grad_norm": 0.5458460450172424, "learning_rate": 7.453563434873722e-05, "loss": 0.1479, "step": 8410 }, { "epoch": 1.0108108108108107, "grad_norm": 0.5686392188072205, "learning_rate": 7.450822871548148e-05, "loss": 0.1694, "step": 8415 }, { "epoch": 1.0114114114114114, "grad_norm": 0.5729146003723145, "learning_rate": 7.448081338737742e-05, "loss": 0.165, "step": 8420 }, { "epoch": 1.012012012012012, "grad_norm": 0.5049359798431396, "learning_rate": 7.445338837526988e-05, "loss": 0.1461, "step": 8425 }, { "epoch": 1.0126126126126127, "grad_norm": 0.4519275724887848, "learning_rate": 7.442595369000749e-05, "loss": 0.1336, "step": 8430 }, { "epoch": 1.0132132132132132, "grad_norm": 0.48683297634124756, "learning_rate": 7.439850934244279e-05, "loss": 0.1429, "step": 8435 }, { "epoch": 1.0138138138138137, "grad_norm": 0.4096581041812897, "learning_rate": 7.437105534343202e-05, "loss": 0.1646, "step": 8440 }, { "epoch": 1.0144144144144145, "grad_norm": 0.4254092872142792, "learning_rate": 7.434359170383533e-05, "loss": 0.1389, "step": 8445 }, { "epoch": 1.015015015015015, "grad_norm": 0.4360242784023285, "learning_rate": 7.431611843451664e-05, "loss": 0.1376, "step": 8450 }, { "epoch": 1.0156156156156155, "grad_norm": 0.5856005549430847, "learning_rate": 7.428863554634373e-05, "loss": 0.1516, "step": 8455 }, { "epoch": 1.0162162162162163, "grad_norm": 0.507146954536438, "learning_rate": 7.426114305018812e-05, "loss": 0.1512, "step": 8460 }, { "epoch": 1.0168168168168168, "grad_norm": 0.5587776303291321, "learning_rate": 7.423364095692518e-05, "loss": 0.1592, "step": 8465 }, { "epoch": 1.0174174174174173, "grad_norm": 0.5522329807281494, "learning_rate": 7.420612927743404e-05, "loss": 0.1397, "step": 8470 }, { "epoch": 1.018018018018018, "grad_norm": 0.5756586790084839, "learning_rate": 7.417860802259764e-05, "loss": 0.1651, "step": 8475 }, { "epoch": 1.0186186186186186, "grad_norm": 0.5764936804771423, "learning_rate": 7.415107720330273e-05, "loss": 0.1661, "step": 8480 }, { "epoch": 1.0192192192192193, "grad_norm": 0.5376463532447815, "learning_rate": 7.412353683043978e-05, "loss": 0.1595, "step": 8485 }, { "epoch": 1.0198198198198198, "grad_norm": 0.4412538409233093, "learning_rate": 7.40959869149031e-05, "loss": 0.1412, "step": 8490 }, { "epoch": 1.0204204204204204, "grad_norm": 0.502080500125885, "learning_rate": 7.406842746759077e-05, "loss": 0.16, "step": 8495 }, { "epoch": 1.021021021021021, "grad_norm": 0.594376802444458, "learning_rate": 7.404085849940461e-05, "loss": 0.159, "step": 8500 }, { "epoch": 1.021021021021021, "eval_loss": 0.19814305007457733, "eval_runtime": 35.6951, "eval_samples_per_second": 22.412, "eval_steps_per_second": 5.603, "step": 8500 }, { "epoch": 1.0216216216216216, "grad_norm": 0.44608205556869507, "learning_rate": 7.40132800212502e-05, "loss": 0.152, "step": 8505 }, { "epoch": 1.0222222222222221, "grad_norm": 0.4485919177532196, "learning_rate": 7.398569204403696e-05, "loss": 0.1393, "step": 8510 }, { "epoch": 1.0228228228228229, "grad_norm": 0.4802648425102234, "learning_rate": 7.395809457867795e-05, "loss": 0.1436, "step": 8515 }, { "epoch": 1.0234234234234234, "grad_norm": 0.45354655385017395, "learning_rate": 7.39304876360901e-05, "loss": 0.1389, "step": 8520 }, { "epoch": 1.024024024024024, "grad_norm": 0.5193431973457336, "learning_rate": 7.390287122719397e-05, "loss": 0.1538, "step": 8525 }, { "epoch": 1.0246246246246247, "grad_norm": 0.5162495374679565, "learning_rate": 7.387524536291397e-05, "loss": 0.1535, "step": 8530 }, { "epoch": 1.0252252252252252, "grad_norm": 0.4876277446746826, "learning_rate": 7.38476100541782e-05, "loss": 0.1423, "step": 8535 }, { "epoch": 1.025825825825826, "grad_norm": 0.5237493515014648, "learning_rate": 7.38199653119185e-05, "loss": 0.1544, "step": 8540 }, { "epoch": 1.0264264264264265, "grad_norm": 0.49636128544807434, "learning_rate": 7.379231114707043e-05, "loss": 0.1597, "step": 8545 }, { "epoch": 1.027027027027027, "grad_norm": 0.4116309881210327, "learning_rate": 7.376464757057333e-05, "loss": 0.1372, "step": 8550 }, { "epoch": 1.0276276276276277, "grad_norm": 0.5808112025260925, "learning_rate": 7.373697459337019e-05, "loss": 0.1293, "step": 8555 }, { "epoch": 1.0282282282282282, "grad_norm": 0.4400934875011444, "learning_rate": 7.37092922264078e-05, "loss": 0.1223, "step": 8560 }, { "epoch": 1.0288288288288288, "grad_norm": 0.449886292219162, "learning_rate": 7.368160048063654e-05, "loss": 0.1318, "step": 8565 }, { "epoch": 1.0294294294294295, "grad_norm": 0.5196372270584106, "learning_rate": 7.365389936701066e-05, "loss": 0.1558, "step": 8570 }, { "epoch": 1.03003003003003, "grad_norm": 0.4798905551433563, "learning_rate": 7.3626188896488e-05, "loss": 0.1477, "step": 8575 }, { "epoch": 1.0306306306306305, "grad_norm": 0.3740388751029968, "learning_rate": 7.359846908003012e-05, "loss": 0.1304, "step": 8580 }, { "epoch": 1.0312312312312313, "grad_norm": 0.43643224239349365, "learning_rate": 7.357073992860233e-05, "loss": 0.1254, "step": 8585 }, { "epoch": 1.0318318318318318, "grad_norm": 0.6022786498069763, "learning_rate": 7.354300145317356e-05, "loss": 0.1496, "step": 8590 }, { "epoch": 1.0324324324324325, "grad_norm": 0.5783417820930481, "learning_rate": 7.351525366471647e-05, "loss": 0.1605, "step": 8595 }, { "epoch": 1.033033033033033, "grad_norm": 0.3740118741989136, "learning_rate": 7.348749657420744e-05, "loss": 0.1435, "step": 8600 }, { "epoch": 1.0336336336336336, "grad_norm": 0.5092105865478516, "learning_rate": 7.345973019262645e-05, "loss": 0.156, "step": 8605 }, { "epoch": 1.0342342342342343, "grad_norm": 0.465665727853775, "learning_rate": 7.343195453095719e-05, "loss": 0.1636, "step": 8610 }, { "epoch": 1.0348348348348348, "grad_norm": 0.6344625949859619, "learning_rate": 7.340416960018701e-05, "loss": 0.1466, "step": 8615 }, { "epoch": 1.0354354354354354, "grad_norm": 0.5152485370635986, "learning_rate": 7.337637541130699e-05, "loss": 0.1453, "step": 8620 }, { "epoch": 1.0360360360360361, "grad_norm": 0.4373214840888977, "learning_rate": 7.334857197531178e-05, "loss": 0.1586, "step": 8625 }, { "epoch": 1.0366366366366366, "grad_norm": 0.5414717197418213, "learning_rate": 7.332075930319974e-05, "loss": 0.1503, "step": 8630 }, { "epoch": 1.0372372372372372, "grad_norm": 0.4952808618545532, "learning_rate": 7.329293740597289e-05, "loss": 0.1505, "step": 8635 }, { "epoch": 1.037837837837838, "grad_norm": 0.549498975276947, "learning_rate": 7.326510629463688e-05, "loss": 0.1407, "step": 8640 }, { "epoch": 1.0384384384384384, "grad_norm": 0.5551578402519226, "learning_rate": 7.3237265980201e-05, "loss": 0.1615, "step": 8645 }, { "epoch": 1.039039039039039, "grad_norm": 0.43350839614868164, "learning_rate": 7.320941647367819e-05, "loss": 0.1452, "step": 8650 }, { "epoch": 1.0396396396396397, "grad_norm": 0.5714775323867798, "learning_rate": 7.318155778608502e-05, "loss": 0.1659, "step": 8655 }, { "epoch": 1.0402402402402402, "grad_norm": 0.49592894315719604, "learning_rate": 7.31536899284417e-05, "loss": 0.142, "step": 8660 }, { "epoch": 1.040840840840841, "grad_norm": 0.491641640663147, "learning_rate": 7.312581291177208e-05, "loss": 0.1373, "step": 8665 }, { "epoch": 1.0414414414414415, "grad_norm": 0.5000977516174316, "learning_rate": 7.30979267471036e-05, "loss": 0.1453, "step": 8670 }, { "epoch": 1.042042042042042, "grad_norm": 0.5157973766326904, "learning_rate": 7.307003144546733e-05, "loss": 0.1369, "step": 8675 }, { "epoch": 1.0426426426426427, "grad_norm": 0.38589340448379517, "learning_rate": 7.304212701789797e-05, "loss": 0.1372, "step": 8680 }, { "epoch": 1.0432432432432432, "grad_norm": 0.4763352572917938, "learning_rate": 7.301421347543384e-05, "loss": 0.1456, "step": 8685 }, { "epoch": 1.0438438438438438, "grad_norm": 0.424207478761673, "learning_rate": 7.298629082911682e-05, "loss": 0.1434, "step": 8690 }, { "epoch": 1.0444444444444445, "grad_norm": 0.47599339485168457, "learning_rate": 7.295835908999242e-05, "loss": 0.1487, "step": 8695 }, { "epoch": 1.045045045045045, "grad_norm": 0.42823123931884766, "learning_rate": 7.293041826910976e-05, "loss": 0.1452, "step": 8700 }, { "epoch": 1.0456456456456455, "grad_norm": 0.6307981014251709, "learning_rate": 7.290246837752152e-05, "loss": 0.1358, "step": 8705 }, { "epoch": 1.0462462462462463, "grad_norm": 0.4481140375137329, "learning_rate": 7.287450942628397e-05, "loss": 0.1451, "step": 8710 }, { "epoch": 1.0468468468468468, "grad_norm": 0.5314701795578003, "learning_rate": 7.284654142645704e-05, "loss": 0.1526, "step": 8715 }, { "epoch": 1.0474474474474476, "grad_norm": 0.44227954745292664, "learning_rate": 7.28185643891041e-05, "loss": 0.1455, "step": 8720 }, { "epoch": 1.048048048048048, "grad_norm": 0.502838134765625, "learning_rate": 7.279057832529224e-05, "loss": 0.1398, "step": 8725 }, { "epoch": 1.0486486486486486, "grad_norm": 0.6226637363433838, "learning_rate": 7.2762583246092e-05, "loss": 0.1531, "step": 8730 }, { "epoch": 1.0492492492492493, "grad_norm": 0.452466756105423, "learning_rate": 7.273457916257758e-05, "loss": 0.139, "step": 8735 }, { "epoch": 1.0498498498498499, "grad_norm": 0.4796338379383087, "learning_rate": 7.270656608582668e-05, "loss": 0.1551, "step": 8740 }, { "epoch": 1.0504504504504504, "grad_norm": 0.47294482588768005, "learning_rate": 7.26785440269206e-05, "loss": 0.1325, "step": 8745 }, { "epoch": 1.0510510510510511, "grad_norm": 0.5365278124809265, "learning_rate": 7.265051299694414e-05, "loss": 0.1455, "step": 8750 }, { "epoch": 1.0510510510510511, "eval_loss": 0.1942126303911209, "eval_runtime": 35.5728, "eval_samples_per_second": 22.489, "eval_steps_per_second": 5.622, "step": 8750 }, { "epoch": 1.0516516516516516, "grad_norm": 0.5005031228065491, "learning_rate": 7.262247300698571e-05, "loss": 0.1333, "step": 8755 }, { "epoch": 1.0522522522522522, "grad_norm": 0.49393969774246216, "learning_rate": 7.259442406813722e-05, "loss": 0.1492, "step": 8760 }, { "epoch": 1.052852852852853, "grad_norm": 0.6257902979850769, "learning_rate": 7.256636619149413e-05, "loss": 0.1778, "step": 8765 }, { "epoch": 1.0534534534534534, "grad_norm": 0.46658793091773987, "learning_rate": 7.253829938815546e-05, "loss": 0.1312, "step": 8770 }, { "epoch": 1.054054054054054, "grad_norm": 0.41227564215660095, "learning_rate": 7.251022366922375e-05, "loss": 0.1387, "step": 8775 }, { "epoch": 1.0546546546546547, "grad_norm": 0.4338676929473877, "learning_rate": 7.248213904580502e-05, "loss": 0.1606, "step": 8780 }, { "epoch": 1.0552552552552552, "grad_norm": 0.5233752131462097, "learning_rate": 7.245404552900889e-05, "loss": 0.1468, "step": 8785 }, { "epoch": 1.055855855855856, "grad_norm": 0.5996452569961548, "learning_rate": 7.242594312994843e-05, "loss": 0.1646, "step": 8790 }, { "epoch": 1.0564564564564565, "grad_norm": 0.5455158352851868, "learning_rate": 7.239783185974029e-05, "loss": 0.151, "step": 8795 }, { "epoch": 1.057057057057057, "grad_norm": 0.46771520376205444, "learning_rate": 7.236971172950455e-05, "loss": 0.1611, "step": 8800 }, { "epoch": 1.0576576576576577, "grad_norm": 0.5181732773780823, "learning_rate": 7.234158275036487e-05, "loss": 0.1218, "step": 8805 }, { "epoch": 1.0582582582582583, "grad_norm": 0.5358600616455078, "learning_rate": 7.231344493344834e-05, "loss": 0.1763, "step": 8810 }, { "epoch": 1.0588588588588588, "grad_norm": 0.4052846431732178, "learning_rate": 7.228529828988563e-05, "loss": 0.1369, "step": 8815 }, { "epoch": 1.0594594594594595, "grad_norm": 0.4466336667537689, "learning_rate": 7.225714283081083e-05, "loss": 0.1711, "step": 8820 }, { "epoch": 1.06006006006006, "grad_norm": 0.4244237542152405, "learning_rate": 7.222897856736154e-05, "loss": 0.1499, "step": 8825 }, { "epoch": 1.0606606606606606, "grad_norm": 0.49825313687324524, "learning_rate": 7.220080551067886e-05, "loss": 0.139, "step": 8830 }, { "epoch": 1.0612612612612613, "grad_norm": 0.5628939867019653, "learning_rate": 7.217262367190733e-05, "loss": 0.1466, "step": 8835 }, { "epoch": 1.0618618618618618, "grad_norm": 0.5116236805915833, "learning_rate": 7.214443306219501e-05, "loss": 0.1537, "step": 8840 }, { "epoch": 1.0624624624624626, "grad_norm": 0.4122700095176697, "learning_rate": 7.21162336926934e-05, "loss": 0.1237, "step": 8845 }, { "epoch": 1.063063063063063, "grad_norm": 0.5503990650177002, "learning_rate": 7.208802557455746e-05, "loss": 0.1608, "step": 8850 }, { "epoch": 1.0636636636636636, "grad_norm": 0.3909919559955597, "learning_rate": 7.205980871894565e-05, "loss": 0.1462, "step": 8855 }, { "epoch": 1.0642642642642643, "grad_norm": 0.45528560876846313, "learning_rate": 7.203158313701982e-05, "loss": 0.1405, "step": 8860 }, { "epoch": 1.0648648648648649, "grad_norm": 0.46630609035491943, "learning_rate": 7.200334883994534e-05, "loss": 0.1489, "step": 8865 }, { "epoch": 1.0654654654654654, "grad_norm": 0.42296266555786133, "learning_rate": 7.1975105838891e-05, "loss": 0.1566, "step": 8870 }, { "epoch": 1.0660660660660661, "grad_norm": 0.5201703906059265, "learning_rate": 7.194685414502902e-05, "loss": 0.1449, "step": 8875 }, { "epoch": 1.0666666666666667, "grad_norm": 0.5390458106994629, "learning_rate": 7.191859376953508e-05, "loss": 0.1408, "step": 8880 }, { "epoch": 1.0672672672672672, "grad_norm": 0.5914261937141418, "learning_rate": 7.189032472358826e-05, "loss": 0.1514, "step": 8885 }, { "epoch": 1.067867867867868, "grad_norm": 0.5696099996566772, "learning_rate": 7.186204701837114e-05, "loss": 0.1555, "step": 8890 }, { "epoch": 1.0684684684684684, "grad_norm": 0.45480287075042725, "learning_rate": 7.183376066506964e-05, "loss": 0.1434, "step": 8895 }, { "epoch": 1.069069069069069, "grad_norm": 0.4600028693675995, "learning_rate": 7.180546567487317e-05, "loss": 0.1374, "step": 8900 }, { "epoch": 1.0696696696696697, "grad_norm": 0.8249819278717041, "learning_rate": 7.177716205897449e-05, "loss": 0.1533, "step": 8905 }, { "epoch": 1.0702702702702702, "grad_norm": 0.3818807899951935, "learning_rate": 7.174884982856984e-05, "loss": 0.1327, "step": 8910 }, { "epoch": 1.070870870870871, "grad_norm": 0.5017231702804565, "learning_rate": 7.172052899485883e-05, "loss": 0.1359, "step": 8915 }, { "epoch": 1.0714714714714715, "grad_norm": 0.534890353679657, "learning_rate": 7.169219956904447e-05, "loss": 0.1364, "step": 8920 }, { "epoch": 1.072072072072072, "grad_norm": 0.4843946695327759, "learning_rate": 7.16638615623332e-05, "loss": 0.1476, "step": 8925 }, { "epoch": 1.0726726726726727, "grad_norm": 0.515217661857605, "learning_rate": 7.163551498593481e-05, "loss": 0.1443, "step": 8930 }, { "epoch": 1.0732732732732733, "grad_norm": 0.33950290083885193, "learning_rate": 7.160715985106253e-05, "loss": 0.1423, "step": 8935 }, { "epoch": 1.0738738738738738, "grad_norm": 0.5174293518066406, "learning_rate": 7.157879616893294e-05, "loss": 0.1527, "step": 8940 }, { "epoch": 1.0744744744744745, "grad_norm": 0.4591221511363983, "learning_rate": 7.155042395076598e-05, "loss": 0.168, "step": 8945 }, { "epoch": 1.075075075075075, "grad_norm": 0.49486660957336426, "learning_rate": 7.152204320778504e-05, "loss": 0.1564, "step": 8950 }, { "epoch": 1.0756756756756758, "grad_norm": 0.5360234379768372, "learning_rate": 7.149365395121681e-05, "loss": 0.1442, "step": 8955 }, { "epoch": 1.0762762762762763, "grad_norm": 0.5851607918739319, "learning_rate": 7.146525619229139e-05, "loss": 0.1332, "step": 8960 }, { "epoch": 1.0768768768768768, "grad_norm": 0.5005301833152771, "learning_rate": 7.143684994224222e-05, "loss": 0.1368, "step": 8965 }, { "epoch": 1.0774774774774776, "grad_norm": 0.4584340751171112, "learning_rate": 7.140843521230613e-05, "loss": 0.1499, "step": 8970 }, { "epoch": 1.078078078078078, "grad_norm": 0.45061230659484863, "learning_rate": 7.138001201372327e-05, "loss": 0.1291, "step": 8975 }, { "epoch": 1.0786786786786786, "grad_norm": 0.4673006534576416, "learning_rate": 7.135158035773714e-05, "loss": 0.1383, "step": 8980 }, { "epoch": 1.0792792792792794, "grad_norm": 0.36622899770736694, "learning_rate": 7.132314025559464e-05, "loss": 0.1409, "step": 8985 }, { "epoch": 1.0798798798798799, "grad_norm": 0.4832969903945923, "learning_rate": 7.12946917185459e-05, "loss": 0.1417, "step": 8990 }, { "epoch": 1.0804804804804804, "grad_norm": 0.4738680124282837, "learning_rate": 7.126623475784453e-05, "loss": 0.1373, "step": 8995 }, { "epoch": 1.0810810810810811, "grad_norm": 0.5347143411636353, "learning_rate": 7.123776938474735e-05, "loss": 0.167, "step": 9000 }, { "epoch": 1.0810810810810811, "eval_loss": 0.19050884246826172, "eval_runtime": 36.0676, "eval_samples_per_second": 22.181, "eval_steps_per_second": 5.545, "step": 9000 }, { "epoch": 1.0816816816816817, "grad_norm": 0.4649895131587982, "learning_rate": 7.120929561051458e-05, "loss": 0.1376, "step": 9005 }, { "epoch": 1.0822822822822822, "grad_norm": 0.5208431482315063, "learning_rate": 7.118081344640974e-05, "loss": 0.1543, "step": 9010 }, { "epoch": 1.082882882882883, "grad_norm": 0.4275619685649872, "learning_rate": 7.115232290369967e-05, "loss": 0.132, "step": 9015 }, { "epoch": 1.0834834834834834, "grad_norm": 0.4782681465148926, "learning_rate": 7.112382399365451e-05, "loss": 0.132, "step": 9020 }, { "epoch": 1.0840840840840842, "grad_norm": 0.508679986000061, "learning_rate": 7.109531672754772e-05, "loss": 0.1552, "step": 9025 }, { "epoch": 1.0846846846846847, "grad_norm": 0.42764633893966675, "learning_rate": 7.106680111665609e-05, "loss": 0.1186, "step": 9030 }, { "epoch": 1.0852852852852852, "grad_norm": 0.4004163444042206, "learning_rate": 7.103827717225968e-05, "loss": 0.1469, "step": 9035 }, { "epoch": 1.085885885885886, "grad_norm": 0.48991858959198, "learning_rate": 7.100974490564185e-05, "loss": 0.1525, "step": 9040 }, { "epoch": 1.0864864864864865, "grad_norm": 0.47079792618751526, "learning_rate": 7.098120432808924e-05, "loss": 0.116, "step": 9045 }, { "epoch": 1.087087087087087, "grad_norm": 0.5363724231719971, "learning_rate": 7.095265545089184e-05, "loss": 0.1172, "step": 9050 }, { "epoch": 1.0876876876876878, "grad_norm": 0.5028483867645264, "learning_rate": 7.092409828534285e-05, "loss": 0.1407, "step": 9055 }, { "epoch": 1.0882882882882883, "grad_norm": 0.4808637201786041, "learning_rate": 7.089553284273878e-05, "loss": 0.1293, "step": 9060 }, { "epoch": 1.0888888888888888, "grad_norm": 0.5302677154541016, "learning_rate": 7.086695913437942e-05, "loss": 0.142, "step": 9065 }, { "epoch": 1.0894894894894895, "grad_norm": 0.48279350996017456, "learning_rate": 7.083837717156781e-05, "loss": 0.1477, "step": 9070 }, { "epoch": 1.09009009009009, "grad_norm": 0.492736279964447, "learning_rate": 7.080978696561028e-05, "loss": 0.1542, "step": 9075 }, { "epoch": 1.0906906906906908, "grad_norm": 0.41189101338386536, "learning_rate": 7.07811885278164e-05, "loss": 0.1292, "step": 9080 }, { "epoch": 1.0912912912912913, "grad_norm": 0.5568517446517944, "learning_rate": 7.0752581869499e-05, "loss": 0.1356, "step": 9085 }, { "epoch": 1.0918918918918918, "grad_norm": 0.48629894852638245, "learning_rate": 7.072396700197416e-05, "loss": 0.12, "step": 9090 }, { "epoch": 1.0924924924924926, "grad_norm": 0.4972403645515442, "learning_rate": 7.069534393656124e-05, "loss": 0.1364, "step": 9095 }, { "epoch": 1.093093093093093, "grad_norm": 0.5167415738105774, "learning_rate": 7.06667126845828e-05, "loss": 0.1402, "step": 9100 }, { "epoch": 1.0936936936936936, "grad_norm": 0.466062068939209, "learning_rate": 7.063807325736464e-05, "loss": 0.1376, "step": 9105 }, { "epoch": 1.0942942942942944, "grad_norm": 0.4689285457134247, "learning_rate": 7.060942566623581e-05, "loss": 0.1593, "step": 9110 }, { "epoch": 1.0948948948948949, "grad_norm": 0.4900115430355072, "learning_rate": 7.058076992252861e-05, "loss": 0.1386, "step": 9115 }, { "epoch": 1.0954954954954954, "grad_norm": 0.5853942036628723, "learning_rate": 7.055210603757854e-05, "loss": 0.124, "step": 9120 }, { "epoch": 1.0960960960960962, "grad_norm": 0.46623408794403076, "learning_rate": 7.052343402272431e-05, "loss": 0.1322, "step": 9125 }, { "epoch": 1.0966966966966967, "grad_norm": 0.365323930978775, "learning_rate": 7.049475388930787e-05, "loss": 0.1427, "step": 9130 }, { "epoch": 1.0972972972972972, "grad_norm": 0.4380837678909302, "learning_rate": 7.046606564867433e-05, "loss": 0.1414, "step": 9135 }, { "epoch": 1.097897897897898, "grad_norm": 0.476395845413208, "learning_rate": 7.04373693121721e-05, "loss": 0.1447, "step": 9140 }, { "epoch": 1.0984984984984985, "grad_norm": 0.5355069637298584, "learning_rate": 7.040866489115272e-05, "loss": 0.1701, "step": 9145 }, { "epoch": 1.0990990990990992, "grad_norm": 0.5215357542037964, "learning_rate": 7.037995239697096e-05, "loss": 0.1544, "step": 9150 }, { "epoch": 1.0996996996996997, "grad_norm": 0.5702700018882751, "learning_rate": 7.035123184098476e-05, "loss": 0.1423, "step": 9155 }, { "epoch": 1.1003003003003002, "grad_norm": 0.5936440825462341, "learning_rate": 7.032250323455525e-05, "loss": 0.1545, "step": 9160 }, { "epoch": 1.100900900900901, "grad_norm": 0.4610235095024109, "learning_rate": 7.029376658904676e-05, "loss": 0.1269, "step": 9165 }, { "epoch": 1.1015015015015015, "grad_norm": 0.5780545473098755, "learning_rate": 7.026502191582683e-05, "loss": 0.1342, "step": 9170 }, { "epoch": 1.102102102102102, "grad_norm": 0.6412636637687683, "learning_rate": 7.023626922626609e-05, "loss": 0.1395, "step": 9175 }, { "epoch": 1.1027027027027028, "grad_norm": 0.4099220037460327, "learning_rate": 7.020750853173841e-05, "loss": 0.1295, "step": 9180 }, { "epoch": 1.1033033033033033, "grad_norm": 0.5327261090278625, "learning_rate": 7.017873984362082e-05, "loss": 0.1325, "step": 9185 }, { "epoch": 1.1039039039039038, "grad_norm": 0.620705783367157, "learning_rate": 7.014996317329349e-05, "loss": 0.1448, "step": 9190 }, { "epoch": 1.1045045045045045, "grad_norm": 0.5146176815032959, "learning_rate": 7.012117853213977e-05, "loss": 0.1461, "step": 9195 }, { "epoch": 1.105105105105105, "grad_norm": 0.4332140386104584, "learning_rate": 7.009238593154613e-05, "loss": 0.1325, "step": 9200 }, { "epoch": 1.1057057057057058, "grad_norm": 0.4083593189716339, "learning_rate": 7.006358538290224e-05, "loss": 0.1521, "step": 9205 }, { "epoch": 1.1063063063063063, "grad_norm": 0.574815034866333, "learning_rate": 7.003477689760084e-05, "loss": 0.1311, "step": 9210 }, { "epoch": 1.1069069069069069, "grad_norm": 0.492655873298645, "learning_rate": 7.00059604870379e-05, "loss": 0.125, "step": 9215 }, { "epoch": 1.1075075075075076, "grad_norm": 0.4560936689376831, "learning_rate": 6.997713616261246e-05, "loss": 0.1349, "step": 9220 }, { "epoch": 1.1081081081081081, "grad_norm": 0.5473215579986572, "learning_rate": 6.994830393572669e-05, "loss": 0.1242, "step": 9225 }, { "epoch": 1.1087087087087086, "grad_norm": 0.4096076190471649, "learning_rate": 6.991946381778593e-05, "loss": 0.1293, "step": 9230 }, { "epoch": 1.1093093093093094, "grad_norm": 0.6029496192932129, "learning_rate": 6.98906158201986e-05, "loss": 0.1312, "step": 9235 }, { "epoch": 1.10990990990991, "grad_norm": 0.44497260451316833, "learning_rate": 6.986175995437628e-05, "loss": 0.1254, "step": 9240 }, { "epoch": 1.1105105105105104, "grad_norm": 0.5495989918708801, "learning_rate": 6.983289623173361e-05, "loss": 0.1158, "step": 9245 }, { "epoch": 1.1111111111111112, "grad_norm": 0.4961594045162201, "learning_rate": 6.980402466368835e-05, "loss": 0.1548, "step": 9250 }, { "epoch": 1.1111111111111112, "eval_loss": 0.18500061333179474, "eval_runtime": 35.9632, "eval_samples_per_second": 22.245, "eval_steps_per_second": 5.561, "step": 9250 }, { "epoch": 1.1117117117117117, "grad_norm": 0.3899019956588745, "learning_rate": 6.977514526166143e-05, "loss": 0.1306, "step": 9255 }, { "epoch": 1.1123123123123122, "grad_norm": 0.49679896235466003, "learning_rate": 6.974625803707677e-05, "loss": 0.1348, "step": 9260 }, { "epoch": 1.112912912912913, "grad_norm": 0.4366336166858673, "learning_rate": 6.971736300136147e-05, "loss": 0.1452, "step": 9265 }, { "epoch": 1.1135135135135135, "grad_norm": 0.5401057600975037, "learning_rate": 6.96884601659457e-05, "loss": 0.1508, "step": 9270 }, { "epoch": 1.1141141141141142, "grad_norm": 0.5785194039344788, "learning_rate": 6.965954954226268e-05, "loss": 0.135, "step": 9275 }, { "epoch": 1.1147147147147147, "grad_norm": 0.5360791087150574, "learning_rate": 6.963063114174875e-05, "loss": 0.1385, "step": 9280 }, { "epoch": 1.1153153153153152, "grad_norm": 0.4878944754600525, "learning_rate": 6.960170497584331e-05, "loss": 0.1254, "step": 9285 }, { "epoch": 1.115915915915916, "grad_norm": 0.4216470718383789, "learning_rate": 6.957277105598884e-05, "loss": 0.1329, "step": 9290 }, { "epoch": 1.1165165165165165, "grad_norm": 0.4470789432525635, "learning_rate": 6.954382939363086e-05, "loss": 0.122, "step": 9295 }, { "epoch": 1.117117117117117, "grad_norm": 0.54117751121521, "learning_rate": 6.951488000021803e-05, "loss": 0.1188, "step": 9300 }, { "epoch": 1.1177177177177178, "grad_norm": 0.5519031882286072, "learning_rate": 6.948592288720194e-05, "loss": 0.1232, "step": 9305 }, { "epoch": 1.1183183183183183, "grad_norm": 0.5523632168769836, "learning_rate": 6.945695806603736e-05, "loss": 0.148, "step": 9310 }, { "epoch": 1.118918918918919, "grad_norm": 0.5505694150924683, "learning_rate": 6.942798554818204e-05, "loss": 0.148, "step": 9315 }, { "epoch": 1.1195195195195196, "grad_norm": 0.4322364032268524, "learning_rate": 6.939900534509678e-05, "loss": 0.1319, "step": 9320 }, { "epoch": 1.12012012012012, "grad_norm": 0.3765978217124939, "learning_rate": 6.937001746824545e-05, "loss": 0.1169, "step": 9325 }, { "epoch": 1.1207207207207208, "grad_norm": 0.5289835333824158, "learning_rate": 6.934102192909492e-05, "loss": 0.1318, "step": 9330 }, { "epoch": 1.1213213213213213, "grad_norm": 0.4689262807369232, "learning_rate": 6.931201873911511e-05, "loss": 0.1236, "step": 9335 }, { "epoch": 1.1219219219219219, "grad_norm": 0.5069834589958191, "learning_rate": 6.928300790977897e-05, "loss": 0.1444, "step": 9340 }, { "epoch": 1.1225225225225226, "grad_norm": 0.46477627754211426, "learning_rate": 6.925398945256247e-05, "loss": 0.1366, "step": 9345 }, { "epoch": 1.1231231231231231, "grad_norm": 0.4537050127983093, "learning_rate": 6.922496337894457e-05, "loss": 0.1385, "step": 9350 }, { "epoch": 1.1237237237237236, "grad_norm": 0.5252853035926819, "learning_rate": 6.919592970040731e-05, "loss": 0.1438, "step": 9355 }, { "epoch": 1.1243243243243244, "grad_norm": 0.46386396884918213, "learning_rate": 6.916688842843565e-05, "loss": 0.1298, "step": 9360 }, { "epoch": 1.124924924924925, "grad_norm": 0.5454824566841125, "learning_rate": 6.913783957451759e-05, "loss": 0.1404, "step": 9365 }, { "epoch": 1.1255255255255254, "grad_norm": 0.4597674012184143, "learning_rate": 6.91087831501442e-05, "loss": 0.1334, "step": 9370 }, { "epoch": 1.1261261261261262, "grad_norm": 0.48023849725723267, "learning_rate": 6.90797191668094e-05, "loss": 0.1337, "step": 9375 }, { "epoch": 1.1267267267267267, "grad_norm": 0.48772919178009033, "learning_rate": 6.905064763601026e-05, "loss": 0.1167, "step": 9380 }, { "epoch": 1.1273273273273272, "grad_norm": 0.4505400061607361, "learning_rate": 6.902156856924671e-05, "loss": 0.1514, "step": 9385 }, { "epoch": 1.127927927927928, "grad_norm": 0.528593122959137, "learning_rate": 6.899248197802172e-05, "loss": 0.1328, "step": 9390 }, { "epoch": 1.1285285285285285, "grad_norm": 0.6304023265838623, "learning_rate": 6.896338787384124e-05, "loss": 0.1505, "step": 9395 }, { "epoch": 1.1291291291291292, "grad_norm": 0.49781641364097595, "learning_rate": 6.893428626821413e-05, "loss": 0.1306, "step": 9400 }, { "epoch": 1.1297297297297297, "grad_norm": 0.5521763563156128, "learning_rate": 6.890517717265233e-05, "loss": 0.1448, "step": 9405 }, { "epoch": 1.1303303303303303, "grad_norm": 0.5749943852424622, "learning_rate": 6.887606059867065e-05, "loss": 0.1438, "step": 9410 }, { "epoch": 1.130930930930931, "grad_norm": 0.45946699380874634, "learning_rate": 6.884693655778685e-05, "loss": 0.1301, "step": 9415 }, { "epoch": 1.1315315315315315, "grad_norm": 0.5034966468811035, "learning_rate": 6.881780506152172e-05, "loss": 0.1471, "step": 9420 }, { "epoch": 1.132132132132132, "grad_norm": 0.5998401045799255, "learning_rate": 6.878866612139895e-05, "loss": 0.1561, "step": 9425 }, { "epoch": 1.1327327327327328, "grad_norm": 0.4555962085723877, "learning_rate": 6.875951974894519e-05, "loss": 0.1342, "step": 9430 }, { "epoch": 1.1333333333333333, "grad_norm": 0.6748508810997009, "learning_rate": 6.873036595569e-05, "loss": 0.1412, "step": 9435 }, { "epoch": 1.133933933933934, "grad_norm": 0.4627913534641266, "learning_rate": 6.870120475316592e-05, "loss": 0.1393, "step": 9440 }, { "epoch": 1.1345345345345346, "grad_norm": 0.4778697192668915, "learning_rate": 6.86720361529084e-05, "loss": 0.15, "step": 9445 }, { "epoch": 1.135135135135135, "grad_norm": 0.4317777454853058, "learning_rate": 6.86428601664558e-05, "loss": 0.1138, "step": 9450 }, { "epoch": 1.1357357357357358, "grad_norm": 0.5334888696670532, "learning_rate": 6.861367680534942e-05, "loss": 0.1275, "step": 9455 }, { "epoch": 1.1363363363363364, "grad_norm": 0.44648465514183044, "learning_rate": 6.85844860811335e-05, "loss": 0.1269, "step": 9460 }, { "epoch": 1.1369369369369369, "grad_norm": 0.5643259286880493, "learning_rate": 6.855528800535513e-05, "loss": 0.1256, "step": 9465 }, { "epoch": 1.1375375375375376, "grad_norm": 0.5081361532211304, "learning_rate": 6.852608258956437e-05, "loss": 0.1561, "step": 9470 }, { "epoch": 1.1381381381381381, "grad_norm": 0.5348806977272034, "learning_rate": 6.849686984531416e-05, "loss": 0.1282, "step": 9475 }, { "epoch": 1.1387387387387387, "grad_norm": 0.4955986738204956, "learning_rate": 6.846764978416031e-05, "loss": 0.1355, "step": 9480 }, { "epoch": 1.1393393393393394, "grad_norm": 0.6122444868087769, "learning_rate": 6.843842241766158e-05, "loss": 0.1671, "step": 9485 }, { "epoch": 1.13993993993994, "grad_norm": 0.502400279045105, "learning_rate": 6.84091877573796e-05, "loss": 0.13, "step": 9490 }, { "epoch": 1.1405405405405404, "grad_norm": 0.5369451642036438, "learning_rate": 6.837994581487888e-05, "loss": 0.1318, "step": 9495 }, { "epoch": 1.1411411411411412, "grad_norm": 0.5451071858406067, "learning_rate": 6.835069660172678e-05, "loss": 0.1388, "step": 9500 }, { "epoch": 1.1411411411411412, "eval_loss": 0.18366886675357819, "eval_runtime": 35.8223, "eval_samples_per_second": 22.332, "eval_steps_per_second": 5.583, "step": 9500 }, { "epoch": 1.1417417417417417, "grad_norm": 0.49755939841270447, "learning_rate": 6.832144012949356e-05, "loss": 0.1286, "step": 9505 }, { "epoch": 1.1423423423423422, "grad_norm": 0.5574833154678345, "learning_rate": 6.829217640975238e-05, "loss": 0.1277, "step": 9510 }, { "epoch": 1.142942942942943, "grad_norm": 0.4342218041419983, "learning_rate": 6.826290545407923e-05, "loss": 0.1269, "step": 9515 }, { "epoch": 1.1435435435435435, "grad_norm": 0.4537164270877838, "learning_rate": 6.823362727405298e-05, "loss": 0.1365, "step": 9520 }, { "epoch": 1.1441441441441442, "grad_norm": 0.5578528642654419, "learning_rate": 6.820434188125536e-05, "loss": 0.1343, "step": 9525 }, { "epoch": 1.1447447447447447, "grad_norm": 0.48114368319511414, "learning_rate": 6.817504928727094e-05, "loss": 0.129, "step": 9530 }, { "epoch": 1.1453453453453453, "grad_norm": 0.46689116954803467, "learning_rate": 6.814574950368715e-05, "loss": 0.1249, "step": 9535 }, { "epoch": 1.145945945945946, "grad_norm": 0.5292294025421143, "learning_rate": 6.811644254209423e-05, "loss": 0.1503, "step": 9540 }, { "epoch": 1.1465465465465465, "grad_norm": 0.45598042011260986, "learning_rate": 6.808712841408533e-05, "loss": 0.1284, "step": 9545 }, { "epoch": 1.147147147147147, "grad_norm": 0.6221840977668762, "learning_rate": 6.805780713125638e-05, "loss": 0.1312, "step": 9550 }, { "epoch": 1.1477477477477478, "grad_norm": 0.4730389714241028, "learning_rate": 6.802847870520614e-05, "loss": 0.1311, "step": 9555 }, { "epoch": 1.1483483483483483, "grad_norm": 0.563447117805481, "learning_rate": 6.799914314753622e-05, "loss": 0.1382, "step": 9560 }, { "epoch": 1.148948948948949, "grad_norm": 0.48742344975471497, "learning_rate": 6.796980046985102e-05, "loss": 0.1403, "step": 9565 }, { "epoch": 1.1495495495495496, "grad_norm": 0.5710808038711548, "learning_rate": 6.79404506837578e-05, "loss": 0.1459, "step": 9570 }, { "epoch": 1.15015015015015, "grad_norm": 0.6052168011665344, "learning_rate": 6.79110938008666e-05, "loss": 0.1464, "step": 9575 }, { "epoch": 1.1507507507507508, "grad_norm": 0.6504958868026733, "learning_rate": 6.788172983279028e-05, "loss": 0.1512, "step": 9580 }, { "epoch": 1.1513513513513514, "grad_norm": 0.49774986505508423, "learning_rate": 6.78523587911445e-05, "loss": 0.1284, "step": 9585 }, { "epoch": 1.1519519519519519, "grad_norm": 0.5204494595527649, "learning_rate": 6.782298068754772e-05, "loss": 0.1356, "step": 9590 }, { "epoch": 1.1525525525525526, "grad_norm": 0.42971011996269226, "learning_rate": 6.779359553362118e-05, "loss": 0.1282, "step": 9595 }, { "epoch": 1.1531531531531531, "grad_norm": 0.6139044165611267, "learning_rate": 6.776420334098891e-05, "loss": 0.1272, "step": 9600 }, { "epoch": 1.1537537537537537, "grad_norm": 0.439885675907135, "learning_rate": 6.773480412127776e-05, "loss": 0.1202, "step": 9605 }, { "epoch": 1.1543543543543544, "grad_norm": 0.5982673764228821, "learning_rate": 6.77053978861173e-05, "loss": 0.153, "step": 9610 }, { "epoch": 1.154954954954955, "grad_norm": 0.4638057351112366, "learning_rate": 6.767598464713994e-05, "loss": 0.1324, "step": 9615 }, { "epoch": 1.1555555555555554, "grad_norm": 0.5485991835594177, "learning_rate": 6.764656441598081e-05, "loss": 0.1349, "step": 9620 }, { "epoch": 1.1561561561561562, "grad_norm": 0.6574414968490601, "learning_rate": 6.761713720427782e-05, "loss": 0.1252, "step": 9625 }, { "epoch": 1.1567567567567567, "grad_norm": 0.533554196357727, "learning_rate": 6.758770302367168e-05, "loss": 0.1275, "step": 9630 }, { "epoch": 1.1573573573573575, "grad_norm": 0.4326445460319519, "learning_rate": 6.755826188580579e-05, "loss": 0.1056, "step": 9635 }, { "epoch": 1.157957957957958, "grad_norm": 0.47175487875938416, "learning_rate": 6.752881380232634e-05, "loss": 0.1284, "step": 9640 }, { "epoch": 1.1585585585585585, "grad_norm": 0.5621086359024048, "learning_rate": 6.749935878488227e-05, "loss": 0.1409, "step": 9645 }, { "epoch": 1.1591591591591592, "grad_norm": 0.47167137265205383, "learning_rate": 6.746989684512525e-05, "loss": 0.1348, "step": 9650 }, { "epoch": 1.1597597597597598, "grad_norm": 0.38340097665786743, "learning_rate": 6.74404279947097e-05, "loss": 0.1096, "step": 9655 }, { "epoch": 1.1603603603603603, "grad_norm": 0.4628503918647766, "learning_rate": 6.741095224529277e-05, "loss": 0.1325, "step": 9660 }, { "epoch": 1.160960960960961, "grad_norm": 0.47487297654151917, "learning_rate": 6.738146960853433e-05, "loss": 0.1251, "step": 9665 }, { "epoch": 1.1615615615615615, "grad_norm": 0.5550730228424072, "learning_rate": 6.735198009609697e-05, "loss": 0.1366, "step": 9670 }, { "epoch": 1.1621621621621623, "grad_norm": 0.45188426971435547, "learning_rate": 6.732248371964602e-05, "loss": 0.1318, "step": 9675 }, { "epoch": 1.1627627627627628, "grad_norm": 0.38361048698425293, "learning_rate": 6.729298049084953e-05, "loss": 0.1336, "step": 9680 }, { "epoch": 1.1633633633633633, "grad_norm": 0.5466015338897705, "learning_rate": 6.726347042137824e-05, "loss": 0.1468, "step": 9685 }, { "epoch": 1.163963963963964, "grad_norm": 0.5949849486351013, "learning_rate": 6.723395352290558e-05, "loss": 0.1526, "step": 9690 }, { "epoch": 1.1645645645645646, "grad_norm": 0.551245391368866, "learning_rate": 6.720442980710773e-05, "loss": 0.1302, "step": 9695 }, { "epoch": 1.165165165165165, "grad_norm": 0.6451214551925659, "learning_rate": 6.717489928566355e-05, "loss": 0.1336, "step": 9700 }, { "epoch": 1.1657657657657658, "grad_norm": 0.4822372496128082, "learning_rate": 6.714536197025452e-05, "loss": 0.133, "step": 9705 }, { "epoch": 1.1663663663663664, "grad_norm": 0.5520108342170715, "learning_rate": 6.711581787256494e-05, "loss": 0.1284, "step": 9710 }, { "epoch": 1.166966966966967, "grad_norm": 0.6228134632110596, "learning_rate": 6.70862670042817e-05, "loss": 0.1428, "step": 9715 }, { "epoch": 1.1675675675675676, "grad_norm": 0.4633306860923767, "learning_rate": 6.70567093770944e-05, "loss": 0.1209, "step": 9720 }, { "epoch": 1.1681681681681682, "grad_norm": 0.6069526672363281, "learning_rate": 6.702714500269528e-05, "loss": 0.133, "step": 9725 }, { "epoch": 1.1687687687687687, "grad_norm": 0.6277711987495422, "learning_rate": 6.69975738927793e-05, "loss": 0.1327, "step": 9730 }, { "epoch": 1.1693693693693694, "grad_norm": 0.4462870657444, "learning_rate": 6.696799605904405e-05, "loss": 0.1161, "step": 9735 }, { "epoch": 1.16996996996997, "grad_norm": 0.5560117363929749, "learning_rate": 6.693841151318978e-05, "loss": 0.1532, "step": 9740 }, { "epoch": 1.1705705705705705, "grad_norm": 0.5032148957252502, "learning_rate": 6.690882026691941e-05, "loss": 0.1297, "step": 9745 }, { "epoch": 1.1711711711711712, "grad_norm": 0.5112046003341675, "learning_rate": 6.687922233193851e-05, "loss": 0.1243, "step": 9750 }, { "epoch": 1.1711711711711712, "eval_loss": 0.17193254828453064, "eval_runtime": 35.8517, "eval_samples_per_second": 22.314, "eval_steps_per_second": 5.579, "step": 9750 }, { "epoch": 1.1717717717717717, "grad_norm": 0.46110421419143677, "learning_rate": 6.684961771995529e-05, "loss": 0.1238, "step": 9755 }, { "epoch": 1.1723723723723725, "grad_norm": 0.6364412307739258, "learning_rate": 6.682000644268058e-05, "loss": 0.114, "step": 9760 }, { "epoch": 1.172972972972973, "grad_norm": 0.5057504773139954, "learning_rate": 6.67903885118279e-05, "loss": 0.1165, "step": 9765 }, { "epoch": 1.1735735735735735, "grad_norm": 0.5055462718009949, "learning_rate": 6.676076393911335e-05, "loss": 0.13, "step": 9770 }, { "epoch": 1.1741741741741742, "grad_norm": 0.4475913941860199, "learning_rate": 6.673113273625566e-05, "loss": 0.1297, "step": 9775 }, { "epoch": 1.1747747747747748, "grad_norm": 0.6086392998695374, "learning_rate": 6.670149491497625e-05, "loss": 0.1488, "step": 9780 }, { "epoch": 1.1753753753753753, "grad_norm": 0.5319909453392029, "learning_rate": 6.667185048699909e-05, "loss": 0.1419, "step": 9785 }, { "epoch": 1.175975975975976, "grad_norm": 0.4748077094554901, "learning_rate": 6.664219946405075e-05, "loss": 0.1344, "step": 9790 }, { "epoch": 1.1765765765765765, "grad_norm": 0.4510291814804077, "learning_rate": 6.661254185786047e-05, "loss": 0.0885, "step": 9795 }, { "epoch": 1.1771771771771773, "grad_norm": 0.4001372754573822, "learning_rate": 6.658287768016004e-05, "loss": 0.1206, "step": 9800 }, { "epoch": 1.1777777777777778, "grad_norm": 0.45367351174354553, "learning_rate": 6.65532069426839e-05, "loss": 0.1346, "step": 9805 }, { "epoch": 1.1783783783783783, "grad_norm": 0.47267067432403564, "learning_rate": 6.652352965716905e-05, "loss": 0.1159, "step": 9810 }, { "epoch": 1.178978978978979, "grad_norm": 0.5264884829521179, "learning_rate": 6.64938458353551e-05, "loss": 0.127, "step": 9815 }, { "epoch": 1.1795795795795796, "grad_norm": 0.592536449432373, "learning_rate": 6.646415548898419e-05, "loss": 0.1401, "step": 9820 }, { "epoch": 1.1801801801801801, "grad_norm": 0.481799453496933, "learning_rate": 6.643445862980115e-05, "loss": 0.114, "step": 9825 }, { "epoch": 1.1807807807807809, "grad_norm": 0.4007432758808136, "learning_rate": 6.640475526955329e-05, "loss": 0.1041, "step": 9830 }, { "epoch": 1.1813813813813814, "grad_norm": 0.44925785064697266, "learning_rate": 6.637504541999051e-05, "loss": 0.1322, "step": 9835 }, { "epoch": 1.181981981981982, "grad_norm": 0.5656154155731201, "learning_rate": 6.634532909286532e-05, "loss": 0.1241, "step": 9840 }, { "epoch": 1.1825825825825826, "grad_norm": 0.4835910201072693, "learning_rate": 6.631560629993274e-05, "loss": 0.1364, "step": 9845 }, { "epoch": 1.1831831831831832, "grad_norm": 0.4915154278278351, "learning_rate": 6.628587705295037e-05, "loss": 0.1215, "step": 9850 }, { "epoch": 1.1837837837837837, "grad_norm": 0.4441297650337219, "learning_rate": 6.625614136367838e-05, "loss": 0.1261, "step": 9855 }, { "epoch": 1.1843843843843844, "grad_norm": 0.3880355954170227, "learning_rate": 6.622639924387945e-05, "loss": 0.1264, "step": 9860 }, { "epoch": 1.184984984984985, "grad_norm": 0.6732304096221924, "learning_rate": 6.619665070531887e-05, "loss": 0.149, "step": 9865 }, { "epoch": 1.1855855855855855, "grad_norm": 0.4434427320957184, "learning_rate": 6.61668957597644e-05, "loss": 0.108, "step": 9870 }, { "epoch": 1.1861861861861862, "grad_norm": 0.4159413278102875, "learning_rate": 6.613713441898634e-05, "loss": 0.1095, "step": 9875 }, { "epoch": 1.1867867867867867, "grad_norm": 0.4756458103656769, "learning_rate": 6.610736669475755e-05, "loss": 0.1441, "step": 9880 }, { "epoch": 1.1873873873873875, "grad_norm": 0.5100711584091187, "learning_rate": 6.607759259885341e-05, "loss": 0.1341, "step": 9885 }, { "epoch": 1.187987987987988, "grad_norm": 0.4814146161079407, "learning_rate": 6.604781214305181e-05, "loss": 0.1338, "step": 9890 }, { "epoch": 1.1885885885885885, "grad_norm": 0.5011911988258362, "learning_rate": 6.601802533913317e-05, "loss": 0.1277, "step": 9895 }, { "epoch": 1.1891891891891893, "grad_norm": 0.4508677124977112, "learning_rate": 6.598823219888042e-05, "loss": 0.1226, "step": 9900 }, { "epoch": 1.1897897897897898, "grad_norm": 0.49636122584342957, "learning_rate": 6.595843273407895e-05, "loss": 0.1251, "step": 9905 }, { "epoch": 1.1903903903903903, "grad_norm": 0.5991763472557068, "learning_rate": 6.592862695651674e-05, "loss": 0.1259, "step": 9910 }, { "epoch": 1.190990990990991, "grad_norm": 0.6228018403053284, "learning_rate": 6.589881487798418e-05, "loss": 0.1402, "step": 9915 }, { "epoch": 1.1915915915915916, "grad_norm": 0.5502091646194458, "learning_rate": 6.586899651027421e-05, "loss": 0.1151, "step": 9920 }, { "epoch": 1.1921921921921923, "grad_norm": 0.47402524948120117, "learning_rate": 6.583917186518223e-05, "loss": 0.1217, "step": 9925 }, { "epoch": 1.1927927927927928, "grad_norm": 0.5979349613189697, "learning_rate": 6.580934095450613e-05, "loss": 0.1065, "step": 9930 }, { "epoch": 1.1933933933933933, "grad_norm": 0.41192102432250977, "learning_rate": 6.57795037900463e-05, "loss": 0.1135, "step": 9935 }, { "epoch": 1.193993993993994, "grad_norm": 0.4513644278049469, "learning_rate": 6.574966038360553e-05, "loss": 0.1224, "step": 9940 }, { "epoch": 1.1945945945945946, "grad_norm": 0.47603142261505127, "learning_rate": 6.57198107469892e-05, "loss": 0.129, "step": 9945 }, { "epoch": 1.1951951951951951, "grad_norm": 0.40763887763023376, "learning_rate": 6.568995489200503e-05, "loss": 0.1185, "step": 9950 }, { "epoch": 1.1957957957957959, "grad_norm": 0.5000661015510559, "learning_rate": 6.566009283046329e-05, "loss": 0.1214, "step": 9955 }, { "epoch": 1.1963963963963964, "grad_norm": 0.451659619808197, "learning_rate": 6.563022457417666e-05, "loss": 0.1212, "step": 9960 }, { "epoch": 1.196996996996997, "grad_norm": 0.5542309880256653, "learning_rate": 6.56003501349603e-05, "loss": 0.1351, "step": 9965 }, { "epoch": 1.1975975975975977, "grad_norm": 0.634504497051239, "learning_rate": 6.557046952463178e-05, "loss": 0.1228, "step": 9970 }, { "epoch": 1.1981981981981982, "grad_norm": 0.44421622157096863, "learning_rate": 6.554058275501112e-05, "loss": 0.1066, "step": 9975 }, { "epoch": 1.1987987987987987, "grad_norm": 0.43437567353248596, "learning_rate": 6.55106898379208e-05, "loss": 0.1176, "step": 9980 }, { "epoch": 1.1993993993993994, "grad_norm": 0.4679704010486603, "learning_rate": 6.548079078518572e-05, "loss": 0.109, "step": 9985 }, { "epoch": 1.2, "grad_norm": 0.47817036509513855, "learning_rate": 6.54508856086332e-05, "loss": 0.1204, "step": 9990 }, { "epoch": 1.2006006006006005, "grad_norm": 0.46039843559265137, "learning_rate": 6.5420974320093e-05, "loss": 0.1223, "step": 9995 }, { "epoch": 1.2012012012012012, "grad_norm": 0.6263976693153381, "learning_rate": 6.539105693139726e-05, "loss": 0.1211, "step": 10000 }, { "epoch": 1.2012012012012012, "eval_loss": 0.16594858467578888, "eval_runtime": 35.7813, "eval_samples_per_second": 22.358, "eval_steps_per_second": 5.59, "step": 10000 }, { "epoch": 1.2018018018018017, "grad_norm": 0.5672759413719177, "learning_rate": 6.536113345438058e-05, "loss": 0.1308, "step": 10005 }, { "epoch": 1.2024024024024025, "grad_norm": 0.561998188495636, "learning_rate": 6.533120390087995e-05, "loss": 0.1633, "step": 10010 }, { "epoch": 1.203003003003003, "grad_norm": 0.4906715750694275, "learning_rate": 6.530126828273472e-05, "loss": 0.1206, "step": 10015 }, { "epoch": 1.2036036036036035, "grad_norm": 0.554155707359314, "learning_rate": 6.527132661178673e-05, "loss": 0.1155, "step": 10020 }, { "epoch": 1.2042042042042043, "grad_norm": 0.5317109823226929, "learning_rate": 6.524137889988013e-05, "loss": 0.1276, "step": 10025 }, { "epoch": 1.2048048048048048, "grad_norm": 0.602896511554718, "learning_rate": 6.521142515886151e-05, "loss": 0.1309, "step": 10030 }, { "epoch": 1.2054054054054055, "grad_norm": 0.4094832241535187, "learning_rate": 6.518146540057981e-05, "loss": 0.1236, "step": 10035 }, { "epoch": 1.206006006006006, "grad_norm": 0.5748536586761475, "learning_rate": 6.51514996368864e-05, "loss": 0.1356, "step": 10040 }, { "epoch": 1.2066066066066066, "grad_norm": 0.4584883153438568, "learning_rate": 6.512152787963496e-05, "loss": 0.1343, "step": 10045 }, { "epoch": 1.2072072072072073, "grad_norm": 0.4517095685005188, "learning_rate": 6.50915501406816e-05, "loss": 0.1015, "step": 10050 }, { "epoch": 1.2078078078078078, "grad_norm": 0.5176427364349365, "learning_rate": 6.506156643188475e-05, "loss": 0.1451, "step": 10055 }, { "epoch": 1.2084084084084084, "grad_norm": 0.5709592700004578, "learning_rate": 6.503157676510523e-05, "loss": 0.1186, "step": 10060 }, { "epoch": 1.209009009009009, "grad_norm": 0.5626363754272461, "learning_rate": 6.500158115220624e-05, "loss": 0.1119, "step": 10065 }, { "epoch": 1.2096096096096096, "grad_norm": 0.4673994779586792, "learning_rate": 6.497157960505324e-05, "loss": 0.1179, "step": 10070 }, { "epoch": 1.2102102102102101, "grad_norm": 0.41876932978630066, "learning_rate": 6.494157213551416e-05, "loss": 0.1314, "step": 10075 }, { "epoch": 1.2108108108108109, "grad_norm": 0.4630737006664276, "learning_rate": 6.491155875545918e-05, "loss": 0.1158, "step": 10080 }, { "epoch": 1.2114114114114114, "grad_norm": 0.6827098727226257, "learning_rate": 6.488153947676085e-05, "loss": 0.1418, "step": 10085 }, { "epoch": 1.212012012012012, "grad_norm": 0.5200510025024414, "learning_rate": 6.485151431129405e-05, "loss": 0.1162, "step": 10090 }, { "epoch": 1.2126126126126127, "grad_norm": 0.5516908168792725, "learning_rate": 6.482148327093601e-05, "loss": 0.1177, "step": 10095 }, { "epoch": 1.2132132132132132, "grad_norm": 0.44765716791152954, "learning_rate": 6.479144636756624e-05, "loss": 0.11, "step": 10100 }, { "epoch": 1.2138138138138137, "grad_norm": 0.46599870920181274, "learning_rate": 6.476140361306665e-05, "loss": 0.1136, "step": 10105 }, { "epoch": 1.2144144144144144, "grad_norm": 0.5948307514190674, "learning_rate": 6.473135501932134e-05, "loss": 0.1267, "step": 10110 }, { "epoch": 1.215015015015015, "grad_norm": 0.47781988978385925, "learning_rate": 6.470130059821681e-05, "loss": 0.0974, "step": 10115 }, { "epoch": 1.2156156156156157, "grad_norm": 0.42919135093688965, "learning_rate": 6.467124036164188e-05, "loss": 0.1225, "step": 10120 }, { "epoch": 1.2162162162162162, "grad_norm": 0.5844857096672058, "learning_rate": 6.464117432148759e-05, "loss": 0.1407, "step": 10125 }, { "epoch": 1.2168168168168167, "grad_norm": 0.5320519804954529, "learning_rate": 6.461110248964737e-05, "loss": 0.1305, "step": 10130 }, { "epoch": 1.2174174174174175, "grad_norm": 0.4706374406814575, "learning_rate": 6.458102487801684e-05, "loss": 0.1116, "step": 10135 }, { "epoch": 1.218018018018018, "grad_norm": 0.6610530614852905, "learning_rate": 6.455094149849398e-05, "loss": 0.1264, "step": 10140 }, { "epoch": 1.2186186186186185, "grad_norm": 0.3942696750164032, "learning_rate": 6.452085236297904e-05, "loss": 0.1115, "step": 10145 }, { "epoch": 1.2192192192192193, "grad_norm": 0.6189076900482178, "learning_rate": 6.449075748337451e-05, "loss": 0.1303, "step": 10150 }, { "epoch": 1.2198198198198198, "grad_norm": 0.5192115306854248, "learning_rate": 6.446065687158522e-05, "loss": 0.1206, "step": 10155 }, { "epoch": 1.2204204204204205, "grad_norm": 0.4605419933795929, "learning_rate": 6.44305505395182e-05, "loss": 0.1194, "step": 10160 }, { "epoch": 1.221021021021021, "grad_norm": 0.5050801038742065, "learning_rate": 6.440043849908277e-05, "loss": 0.1121, "step": 10165 }, { "epoch": 1.2216216216216216, "grad_norm": 0.5062825679779053, "learning_rate": 6.43703207621905e-05, "loss": 0.1175, "step": 10170 }, { "epoch": 1.2222222222222223, "grad_norm": 0.6098731756210327, "learning_rate": 6.434019734075523e-05, "loss": 0.1326, "step": 10175 }, { "epoch": 1.2228228228228228, "grad_norm": 0.6831001043319702, "learning_rate": 6.431006824669305e-05, "loss": 0.1289, "step": 10180 }, { "epoch": 1.2234234234234234, "grad_norm": 0.5312969088554382, "learning_rate": 6.427993349192226e-05, "loss": 0.1089, "step": 10185 }, { "epoch": 1.224024024024024, "grad_norm": 0.5393247604370117, "learning_rate": 6.424979308836346e-05, "loss": 0.1352, "step": 10190 }, { "epoch": 1.2246246246246246, "grad_norm": 0.6124199032783508, "learning_rate": 6.42196470479394e-05, "loss": 0.1273, "step": 10195 }, { "epoch": 1.2252252252252251, "grad_norm": 0.45156264305114746, "learning_rate": 6.418949538257515e-05, "loss": 0.1155, "step": 10200 }, { "epoch": 1.2258258258258259, "grad_norm": 0.47056496143341064, "learning_rate": 6.415933810419794e-05, "loss": 0.1354, "step": 10205 }, { "epoch": 1.2264264264264264, "grad_norm": 0.3897019028663635, "learning_rate": 6.412917522473722e-05, "loss": 0.1378, "step": 10210 }, { "epoch": 1.227027027027027, "grad_norm": 0.530168354511261, "learning_rate": 6.409900675612475e-05, "loss": 0.1124, "step": 10215 }, { "epoch": 1.2276276276276277, "grad_norm": 0.5184093117713928, "learning_rate": 6.406883271029434e-05, "loss": 0.1349, "step": 10220 }, { "epoch": 1.2282282282282282, "grad_norm": 0.3887346386909485, "learning_rate": 6.403865309918216e-05, "loss": 0.1099, "step": 10225 }, { "epoch": 1.2288288288288287, "grad_norm": 0.5268977880477905, "learning_rate": 6.400846793472648e-05, "loss": 0.1156, "step": 10230 }, { "epoch": 1.2294294294294295, "grad_norm": 0.4158872663974762, "learning_rate": 6.397827722886782e-05, "loss": 0.105, "step": 10235 }, { "epoch": 1.23003003003003, "grad_norm": 0.4172183573246002, "learning_rate": 6.394808099354888e-05, "loss": 0.1115, "step": 10240 }, { "epoch": 1.2306306306306307, "grad_norm": 0.5329809188842773, "learning_rate": 6.391787924071454e-05, "loss": 0.1058, "step": 10245 }, { "epoch": 1.2312312312312312, "grad_norm": 0.5237940549850464, "learning_rate": 6.388767198231187e-05, "loss": 0.1364, "step": 10250 }, { "epoch": 1.2312312312312312, "eval_loss": 0.1568148285150528, "eval_runtime": 35.9579, "eval_samples_per_second": 22.248, "eval_steps_per_second": 5.562, "step": 10250 }, { "epoch": 1.2318318318318318, "grad_norm": 0.433456152677536, "learning_rate": 6.385745923029008e-05, "loss": 0.1168, "step": 10255 }, { "epoch": 1.2324324324324325, "grad_norm": 0.4698168635368347, "learning_rate": 6.382724099660063e-05, "loss": 0.1221, "step": 10260 }, { "epoch": 1.233033033033033, "grad_norm": 0.3991837501525879, "learning_rate": 6.379701729319707e-05, "loss": 0.1029, "step": 10265 }, { "epoch": 1.2336336336336335, "grad_norm": 0.42745494842529297, "learning_rate": 6.376678813203517e-05, "loss": 0.1037, "step": 10270 }, { "epoch": 1.2342342342342343, "grad_norm": 0.5544251799583435, "learning_rate": 6.373655352507284e-05, "loss": 0.1207, "step": 10275 }, { "epoch": 1.2348348348348348, "grad_norm": 0.5787578821182251, "learning_rate": 6.370631348427012e-05, "loss": 0.1276, "step": 10280 }, { "epoch": 1.2354354354354355, "grad_norm": 0.4974392056465149, "learning_rate": 6.367606802158925e-05, "loss": 0.1074, "step": 10285 }, { "epoch": 1.236036036036036, "grad_norm": 0.630325198173523, "learning_rate": 6.364581714899457e-05, "loss": 0.1348, "step": 10290 }, { "epoch": 1.2366366366366366, "grad_norm": 0.5606110692024231, "learning_rate": 6.361556087845258e-05, "loss": 0.1237, "step": 10295 }, { "epoch": 1.2372372372372373, "grad_norm": 0.4720557928085327, "learning_rate": 6.358529922193191e-05, "loss": 0.0998, "step": 10300 }, { "epoch": 1.2378378378378379, "grad_norm": 0.5140436291694641, "learning_rate": 6.35550321914033e-05, "loss": 0.1122, "step": 10305 }, { "epoch": 1.2384384384384384, "grad_norm": 0.45371970534324646, "learning_rate": 6.352475979883967e-05, "loss": 0.1095, "step": 10310 }, { "epoch": 1.2390390390390391, "grad_norm": 0.6673846244812012, "learning_rate": 6.349448205621602e-05, "loss": 0.1274, "step": 10315 }, { "epoch": 1.2396396396396396, "grad_norm": 0.4930076599121094, "learning_rate": 6.346419897550946e-05, "loss": 0.1263, "step": 10320 }, { "epoch": 1.2402402402402402, "grad_norm": 0.5467943549156189, "learning_rate": 6.343391056869925e-05, "loss": 0.1112, "step": 10325 }, { "epoch": 1.240840840840841, "grad_norm": 0.5361719131469727, "learning_rate": 6.340361684776673e-05, "loss": 0.11, "step": 10330 }, { "epoch": 1.2414414414414414, "grad_norm": 0.539135754108429, "learning_rate": 6.337331782469532e-05, "loss": 0.1243, "step": 10335 }, { "epoch": 1.242042042042042, "grad_norm": 0.4833470582962036, "learning_rate": 6.334301351147061e-05, "loss": 0.1153, "step": 10340 }, { "epoch": 1.2426426426426427, "grad_norm": 0.49930423498153687, "learning_rate": 6.331270392008019e-05, "loss": 0.1257, "step": 10345 }, { "epoch": 1.2432432432432432, "grad_norm": 0.47665318846702576, "learning_rate": 6.328238906251378e-05, "loss": 0.1163, "step": 10350 }, { "epoch": 1.2438438438438437, "grad_norm": 0.5809736847877502, "learning_rate": 6.325206895076323e-05, "loss": 0.1286, "step": 10355 }, { "epoch": 1.2444444444444445, "grad_norm": 0.5782884955406189, "learning_rate": 6.32217435968224e-05, "loss": 0.1287, "step": 10360 }, { "epoch": 1.245045045045045, "grad_norm": 0.6237002611160278, "learning_rate": 6.319141301268725e-05, "loss": 0.1359, "step": 10365 }, { "epoch": 1.2456456456456457, "grad_norm": 0.5011314153671265, "learning_rate": 6.316107721035581e-05, "loss": 0.1022, "step": 10370 }, { "epoch": 1.2462462462462462, "grad_norm": 0.4131128191947937, "learning_rate": 6.313073620182816e-05, "loss": 0.1088, "step": 10375 }, { "epoch": 1.2468468468468468, "grad_norm": 0.5163730978965759, "learning_rate": 6.310038999910648e-05, "loss": 0.1046, "step": 10380 }, { "epoch": 1.2474474474474475, "grad_norm": 0.4977834224700928, "learning_rate": 6.307003861419498e-05, "loss": 0.1283, "step": 10385 }, { "epoch": 1.248048048048048, "grad_norm": 0.6062964797019958, "learning_rate": 6.303968205909985e-05, "loss": 0.131, "step": 10390 }, { "epoch": 1.2486486486486488, "grad_norm": 0.472942978143692, "learning_rate": 6.300932034582947e-05, "loss": 0.1219, "step": 10395 }, { "epoch": 1.2492492492492493, "grad_norm": 0.4516295790672302, "learning_rate": 6.297895348639415e-05, "loss": 0.1202, "step": 10400 }, { "epoch": 1.2498498498498498, "grad_norm": 0.5245740413665771, "learning_rate": 6.294858149280625e-05, "loss": 0.1097, "step": 10405 }, { "epoch": 1.2504504504504506, "grad_norm": 0.5840051770210266, "learning_rate": 6.291820437708018e-05, "loss": 0.1359, "step": 10410 }, { "epoch": 1.251051051051051, "grad_norm": 0.704037606716156, "learning_rate": 6.288782215123242e-05, "loss": 0.1243, "step": 10415 }, { "epoch": 1.2516516516516516, "grad_norm": 0.4552038311958313, "learning_rate": 6.285743482728138e-05, "loss": 0.139, "step": 10420 }, { "epoch": 1.2522522522522523, "grad_norm": 0.5076271891593933, "learning_rate": 6.282704241724756e-05, "loss": 0.1306, "step": 10425 }, { "epoch": 1.2528528528528529, "grad_norm": 0.514893651008606, "learning_rate": 6.279664493315343e-05, "loss": 0.1206, "step": 10430 }, { "epoch": 1.2534534534534534, "grad_norm": 0.4690745770931244, "learning_rate": 6.276624238702347e-05, "loss": 0.1217, "step": 10435 }, { "epoch": 1.2540540540540541, "grad_norm": 0.47772398591041565, "learning_rate": 6.273583479088422e-05, "loss": 0.1172, "step": 10440 }, { "epoch": 1.2546546546546546, "grad_norm": 0.49891212582588196, "learning_rate": 6.270542215676411e-05, "loss": 0.126, "step": 10445 }, { "epoch": 1.2552552552552552, "grad_norm": 0.40780380368232727, "learning_rate": 6.267500449669367e-05, "loss": 0.12, "step": 10450 }, { "epoch": 1.255855855855856, "grad_norm": 0.5549760460853577, "learning_rate": 6.264458182270536e-05, "loss": 0.1375, "step": 10455 }, { "epoch": 1.2564564564564564, "grad_norm": 0.5111986994743347, "learning_rate": 6.261415414683365e-05, "loss": 0.1206, "step": 10460 }, { "epoch": 1.257057057057057, "grad_norm": 0.5683580040931702, "learning_rate": 6.258372148111493e-05, "loss": 0.127, "step": 10465 }, { "epoch": 1.2576576576576577, "grad_norm": 0.4563046991825104, "learning_rate": 6.255328383758768e-05, "loss": 0.0997, "step": 10470 }, { "epoch": 1.2582582582582582, "grad_norm": 0.5216641426086426, "learning_rate": 6.25228412282922e-05, "loss": 0.1185, "step": 10475 }, { "epoch": 1.2588588588588587, "grad_norm": 0.45824185013771057, "learning_rate": 6.249239366527088e-05, "loss": 0.1145, "step": 10480 }, { "epoch": 1.2594594594594595, "grad_norm": 0.5047664046287537, "learning_rate": 6.246194116056803e-05, "loss": 0.1249, "step": 10485 }, { "epoch": 1.26006006006006, "grad_norm": 0.4454682767391205, "learning_rate": 6.243148372622986e-05, "loss": 0.1012, "step": 10490 }, { "epoch": 1.2606606606606607, "grad_norm": 0.5449995994567871, "learning_rate": 6.240102137430463e-05, "loss": 0.1177, "step": 10495 }, { "epoch": 1.2612612612612613, "grad_norm": 0.4220269024372101, "learning_rate": 6.237055411684245e-05, "loss": 0.1177, "step": 10500 }, { "epoch": 1.2612612612612613, "eval_loss": 0.13997453451156616, "eval_runtime": 35.894, "eval_samples_per_second": 22.288, "eval_steps_per_second": 5.572, "step": 10500 }, { "epoch": 1.261861861861862, "grad_norm": 0.7029174566268921, "learning_rate": 6.234008196589545e-05, "loss": 0.1257, "step": 10505 }, { "epoch": 1.2624624624624625, "grad_norm": 0.4518882632255554, "learning_rate": 6.230960493351761e-05, "loss": 0.1183, "step": 10510 }, { "epoch": 1.263063063063063, "grad_norm": 0.46593165397644043, "learning_rate": 6.227912303176493e-05, "loss": 0.1118, "step": 10515 }, { "epoch": 1.2636636636636638, "grad_norm": 0.5268282294273376, "learning_rate": 6.224863627269529e-05, "loss": 0.129, "step": 10520 }, { "epoch": 1.2642642642642643, "grad_norm": 0.4126332700252533, "learning_rate": 6.221814466836848e-05, "loss": 0.0942, "step": 10525 }, { "epoch": 1.2648648648648648, "grad_norm": 0.4374845325946808, "learning_rate": 6.218764823084624e-05, "loss": 0.1007, "step": 10530 }, { "epoch": 1.2654654654654656, "grad_norm": 0.48468881845474243, "learning_rate": 6.21571469721922e-05, "loss": 0.1248, "step": 10535 }, { "epoch": 1.266066066066066, "grad_norm": 0.5242211818695068, "learning_rate": 6.21266409044719e-05, "loss": 0.1234, "step": 10540 }, { "epoch": 1.2666666666666666, "grad_norm": 0.48668068647384644, "learning_rate": 6.209613003975278e-05, "loss": 0.111, "step": 10545 }, { "epoch": 1.2672672672672673, "grad_norm": 0.5469307899475098, "learning_rate": 6.206561439010418e-05, "loss": 0.0973, "step": 10550 }, { "epoch": 1.2678678678678679, "grad_norm": 0.5339605212211609, "learning_rate": 6.203509396759736e-05, "loss": 0.1105, "step": 10555 }, { "epoch": 1.2684684684684684, "grad_norm": 0.49797797203063965, "learning_rate": 6.200456878430542e-05, "loss": 0.1043, "step": 10560 }, { "epoch": 1.2690690690690691, "grad_norm": 0.442689448595047, "learning_rate": 6.197403885230337e-05, "loss": 0.0849, "step": 10565 }, { "epoch": 1.2696696696696697, "grad_norm": 0.4138694703578949, "learning_rate": 6.19435041836681e-05, "loss": 0.0989, "step": 10570 }, { "epoch": 1.2702702702702702, "grad_norm": 0.45859405398368835, "learning_rate": 6.191296479047837e-05, "loss": 0.1008, "step": 10575 }, { "epoch": 1.270870870870871, "grad_norm": 0.4636090397834778, "learning_rate": 6.188242068481481e-05, "loss": 0.116, "step": 10580 }, { "epoch": 1.2714714714714714, "grad_norm": 0.48628488183021545, "learning_rate": 6.185187187875989e-05, "loss": 0.1138, "step": 10585 }, { "epoch": 1.272072072072072, "grad_norm": 0.4805254638195038, "learning_rate": 6.182131838439799e-05, "loss": 0.1153, "step": 10590 }, { "epoch": 1.2726726726726727, "grad_norm": 0.4614923298358917, "learning_rate": 6.17907602138153e-05, "loss": 0.1061, "step": 10595 }, { "epoch": 1.2732732732732732, "grad_norm": 0.523812472820282, "learning_rate": 6.176019737909989e-05, "loss": 0.1067, "step": 10600 }, { "epoch": 1.2738738738738737, "grad_norm": 0.4345147907733917, "learning_rate": 6.172962989234162e-05, "loss": 0.1158, "step": 10605 }, { "epoch": 1.2744744744744745, "grad_norm": 0.5423539876937866, "learning_rate": 6.169905776563229e-05, "loss": 0.1207, "step": 10610 }, { "epoch": 1.275075075075075, "grad_norm": 0.5352399945259094, "learning_rate": 6.166848101106543e-05, "loss": 0.1206, "step": 10615 }, { "epoch": 1.2756756756756757, "grad_norm": 0.47472965717315674, "learning_rate": 6.163789964073647e-05, "loss": 0.1311, "step": 10620 }, { "epoch": 1.2762762762762763, "grad_norm": 0.48953425884246826, "learning_rate": 6.160731366674264e-05, "loss": 0.1231, "step": 10625 }, { "epoch": 1.276876876876877, "grad_norm": 0.565130889415741, "learning_rate": 6.157672310118297e-05, "loss": 0.1138, "step": 10630 }, { "epoch": 1.2774774774774775, "grad_norm": 0.42137813568115234, "learning_rate": 6.154612795615836e-05, "loss": 0.0997, "step": 10635 }, { "epoch": 1.278078078078078, "grad_norm": 0.5966997146606445, "learning_rate": 6.151552824377148e-05, "loss": 0.1034, "step": 10640 }, { "epoch": 1.2786786786786788, "grad_norm": 0.49393218755722046, "learning_rate": 6.148492397612683e-05, "loss": 0.1254, "step": 10645 }, { "epoch": 1.2792792792792793, "grad_norm": 0.4611142873764038, "learning_rate": 6.145431516533068e-05, "loss": 0.1067, "step": 10650 }, { "epoch": 1.2798798798798798, "grad_norm": 0.5526403188705444, "learning_rate": 6.142370182349113e-05, "loss": 0.1147, "step": 10655 }, { "epoch": 1.2804804804804806, "grad_norm": 0.454584002494812, "learning_rate": 6.139308396271804e-05, "loss": 0.111, "step": 10660 }, { "epoch": 1.281081081081081, "grad_norm": 0.48215189576148987, "learning_rate": 6.136246159512311e-05, "loss": 0.1082, "step": 10665 }, { "epoch": 1.2816816816816816, "grad_norm": 0.49625515937805176, "learning_rate": 6.133183473281978e-05, "loss": 0.0919, "step": 10670 }, { "epoch": 1.2822822822822824, "grad_norm": 0.557956337928772, "learning_rate": 6.130120338792327e-05, "loss": 0.129, "step": 10675 }, { "epoch": 1.2828828828828829, "grad_norm": 0.46650010347366333, "learning_rate": 6.127056757255059e-05, "loss": 0.1067, "step": 10680 }, { "epoch": 1.2834834834834834, "grad_norm": 0.4905858635902405, "learning_rate": 6.12399272988205e-05, "loss": 0.112, "step": 10685 }, { "epoch": 1.2840840840840841, "grad_norm": 0.5483079552650452, "learning_rate": 6.120928257885354e-05, "loss": 0.0999, "step": 10690 }, { "epoch": 1.2846846846846847, "grad_norm": 0.5385778546333313, "learning_rate": 6.117863342477199e-05, "loss": 0.1065, "step": 10695 }, { "epoch": 1.2852852852852852, "grad_norm": 0.40807682275772095, "learning_rate": 6.114797984869992e-05, "loss": 0.1099, "step": 10700 }, { "epoch": 1.285885885885886, "grad_norm": 0.5240715742111206, "learning_rate": 6.11173218627631e-05, "loss": 0.107, "step": 10705 }, { "epoch": 1.2864864864864864, "grad_norm": 0.6276156306266785, "learning_rate": 6.108665947908909e-05, "loss": 0.112, "step": 10710 }, { "epoch": 1.287087087087087, "grad_norm": 0.5692849159240723, "learning_rate": 6.105599270980716e-05, "loss": 0.1166, "step": 10715 }, { "epoch": 1.2876876876876877, "grad_norm": 0.5226588249206543, "learning_rate": 6.102532156704832e-05, "loss": 0.0986, "step": 10720 }, { "epoch": 1.2882882882882882, "grad_norm": 0.4636688828468323, "learning_rate": 6.099464606294533e-05, "loss": 0.104, "step": 10725 }, { "epoch": 1.2888888888888888, "grad_norm": 0.5031043887138367, "learning_rate": 6.096396620963264e-05, "loss": 0.1117, "step": 10730 }, { "epoch": 1.2894894894894895, "grad_norm": 0.5021368861198425, "learning_rate": 6.093328201924645e-05, "loss": 0.0998, "step": 10735 }, { "epoch": 1.29009009009009, "grad_norm": 0.4384394586086273, "learning_rate": 6.090259350392468e-05, "loss": 0.1088, "step": 10740 }, { "epoch": 1.2906906906906908, "grad_norm": 0.3991621732711792, "learning_rate": 6.087190067580691e-05, "loss": 0.0943, "step": 10745 }, { "epoch": 1.2912912912912913, "grad_norm": 0.4799728989601135, "learning_rate": 6.0841203547034495e-05, "loss": 0.0976, "step": 10750 }, { "epoch": 1.2912912912912913, "eval_loss": 0.12925225496292114, "eval_runtime": 35.8181, "eval_samples_per_second": 22.335, "eval_steps_per_second": 5.584, "step": 10750 }, { "epoch": 1.291891891891892, "grad_norm": 0.5584620237350464, "learning_rate": 6.081050212975047e-05, "loss": 0.1053, "step": 10755 }, { "epoch": 1.2924924924924925, "grad_norm": 0.5148786902427673, "learning_rate": 6.077979643609952e-05, "loss": 0.1092, "step": 10760 }, { "epoch": 1.293093093093093, "grad_norm": 0.5329381823539734, "learning_rate": 6.0749086478228066e-05, "loss": 0.1068, "step": 10765 }, { "epoch": 1.2936936936936938, "grad_norm": 0.5270235538482666, "learning_rate": 6.0718372268284216e-05, "loss": 0.1021, "step": 10770 }, { "epoch": 1.2942942942942943, "grad_norm": 0.41905245184898376, "learning_rate": 6.068765381841776e-05, "loss": 0.1232, "step": 10775 }, { "epoch": 1.2948948948948948, "grad_norm": 0.4731054902076721, "learning_rate": 6.065693114078012e-05, "loss": 0.1049, "step": 10780 }, { "epoch": 1.2954954954954956, "grad_norm": 0.4915459454059601, "learning_rate": 6.062620424752446e-05, "loss": 0.1081, "step": 10785 }, { "epoch": 1.296096096096096, "grad_norm": 0.46217599511146545, "learning_rate": 6.059547315080557e-05, "loss": 0.1113, "step": 10790 }, { "epoch": 1.2966966966966966, "grad_norm": 0.5027230381965637, "learning_rate": 6.0564737862779894e-05, "loss": 0.1014, "step": 10795 }, { "epoch": 1.2972972972972974, "grad_norm": 0.49862319231033325, "learning_rate": 6.053399839560559e-05, "loss": 0.1044, "step": 10800 }, { "epoch": 1.297897897897898, "grad_norm": 0.44932979345321655, "learning_rate": 6.0503254761442384e-05, "loss": 0.0887, "step": 10805 }, { "epoch": 1.2984984984984984, "grad_norm": 0.5156846046447754, "learning_rate": 6.0472506972451724e-05, "loss": 0.1093, "step": 10810 }, { "epoch": 1.2990990990990992, "grad_norm": 0.5869203209877014, "learning_rate": 6.0441755040796676e-05, "loss": 0.1189, "step": 10815 }, { "epoch": 1.2996996996996997, "grad_norm": 0.5359819531440735, "learning_rate": 6.041099897864192e-05, "loss": 0.0929, "step": 10820 }, { "epoch": 1.3003003003003002, "grad_norm": 0.42127725481987, "learning_rate": 6.038023879815382e-05, "loss": 0.1256, "step": 10825 }, { "epoch": 1.300900900900901, "grad_norm": 0.4706318974494934, "learning_rate": 6.034947451150032e-05, "loss": 0.1094, "step": 10830 }, { "epoch": 1.3015015015015015, "grad_norm": 0.4898918867111206, "learning_rate": 6.0318706130851024e-05, "loss": 0.1089, "step": 10835 }, { "epoch": 1.302102102102102, "grad_norm": 0.4475489854812622, "learning_rate": 6.028793366837712e-05, "loss": 0.0878, "step": 10840 }, { "epoch": 1.3027027027027027, "grad_norm": 0.5071420669555664, "learning_rate": 6.025715713625146e-05, "loss": 0.1083, "step": 10845 }, { "epoch": 1.3033033033033032, "grad_norm": 0.4866783618927002, "learning_rate": 6.022637654664846e-05, "loss": 0.1085, "step": 10850 }, { "epoch": 1.303903903903904, "grad_norm": 0.5027065277099609, "learning_rate": 6.019559191174416e-05, "loss": 0.1021, "step": 10855 }, { "epoch": 1.3045045045045045, "grad_norm": 0.5954436659812927, "learning_rate": 6.016480324371622e-05, "loss": 0.1194, "step": 10860 }, { "epoch": 1.305105105105105, "grad_norm": 0.5378061532974243, "learning_rate": 6.013401055474384e-05, "loss": 0.1081, "step": 10865 }, { "epoch": 1.3057057057057058, "grad_norm": 0.4428832530975342, "learning_rate": 6.0103213857007864e-05, "loss": 0.1146, "step": 10870 }, { "epoch": 1.3063063063063063, "grad_norm": 0.6072458028793335, "learning_rate": 6.00724131626907e-05, "loss": 0.1112, "step": 10875 }, { "epoch": 1.306906906906907, "grad_norm": 0.3962419033050537, "learning_rate": 6.004160848397635e-05, "loss": 0.1133, "step": 10880 }, { "epoch": 1.3075075075075075, "grad_norm": 0.49185970425605774, "learning_rate": 6.001079983305037e-05, "loss": 0.108, "step": 10885 }, { "epoch": 1.308108108108108, "grad_norm": 0.5775038003921509, "learning_rate": 5.9979987222099895e-05, "loss": 0.1059, "step": 10890 }, { "epoch": 1.3087087087087088, "grad_norm": 0.49251505732536316, "learning_rate": 5.994917066331366e-05, "loss": 0.108, "step": 10895 }, { "epoch": 1.3093093093093093, "grad_norm": 0.5165398716926575, "learning_rate": 5.9918350168881885e-05, "loss": 0.1058, "step": 10900 }, { "epoch": 1.3099099099099099, "grad_norm": 0.571881890296936, "learning_rate": 5.988752575099644e-05, "loss": 0.1248, "step": 10905 }, { "epoch": 1.3105105105105106, "grad_norm": 0.4895869791507721, "learning_rate": 5.985669742185068e-05, "loss": 0.1187, "step": 10910 }, { "epoch": 1.3111111111111111, "grad_norm": 0.467013418674469, "learning_rate": 5.982586519363954e-05, "loss": 0.1275, "step": 10915 }, { "epoch": 1.3117117117117116, "grad_norm": 0.4548030495643616, "learning_rate": 5.979502907855945e-05, "loss": 0.116, "step": 10920 }, { "epoch": 1.3123123123123124, "grad_norm": 0.419702410697937, "learning_rate": 5.976418908880845e-05, "loss": 0.0964, "step": 10925 }, { "epoch": 1.312912912912913, "grad_norm": 0.531001627445221, "learning_rate": 5.973334523658607e-05, "loss": 0.0963, "step": 10930 }, { "epoch": 1.3135135135135134, "grad_norm": 0.5297698974609375, "learning_rate": 5.970249753409336e-05, "loss": 0.1035, "step": 10935 }, { "epoch": 1.3141141141141142, "grad_norm": 0.40090298652648926, "learning_rate": 5.967164599353293e-05, "loss": 0.0923, "step": 10940 }, { "epoch": 1.3147147147147147, "grad_norm": 0.5055508017539978, "learning_rate": 5.9640790627108865e-05, "loss": 0.1082, "step": 10945 }, { "epoch": 1.3153153153153152, "grad_norm": 0.5552372932434082, "learning_rate": 5.96099314470268e-05, "loss": 0.1268, "step": 10950 }, { "epoch": 1.315915915915916, "grad_norm": 0.43672850728034973, "learning_rate": 5.957906846549385e-05, "loss": 0.0996, "step": 10955 }, { "epoch": 1.3165165165165165, "grad_norm": 0.4018603265285492, "learning_rate": 5.954820169471864e-05, "loss": 0.0981, "step": 10960 }, { "epoch": 1.317117117117117, "grad_norm": 0.5597862005233765, "learning_rate": 5.951733114691132e-05, "loss": 0.109, "step": 10965 }, { "epoch": 1.3177177177177177, "grad_norm": 0.6404910683631897, "learning_rate": 5.948645683428349e-05, "loss": 0.1129, "step": 10970 }, { "epoch": 1.3183183183183182, "grad_norm": 0.3674793243408203, "learning_rate": 5.945557876904829e-05, "loss": 0.091, "step": 10975 }, { "epoch": 1.318918918918919, "grad_norm": 0.6048736572265625, "learning_rate": 5.94246969634203e-05, "loss": 0.1173, "step": 10980 }, { "epoch": 1.3195195195195195, "grad_norm": 0.38514524698257446, "learning_rate": 5.939381142961561e-05, "loss": 0.0887, "step": 10985 }, { "epoch": 1.3201201201201203, "grad_norm": 0.5462285280227661, "learning_rate": 5.936292217985175e-05, "loss": 0.0979, "step": 10990 }, { "epoch": 1.3207207207207208, "grad_norm": 0.4344751536846161, "learning_rate": 5.9332029226347776e-05, "loss": 0.0829, "step": 10995 }, { "epoch": 1.3213213213213213, "grad_norm": 0.5517582893371582, "learning_rate": 5.930113258132415e-05, "loss": 0.1085, "step": 11000 }, { "epoch": 1.3213213213213213, "eval_loss": 0.12636400759220123, "eval_runtime": 35.8467, "eval_samples_per_second": 22.317, "eval_steps_per_second": 5.579, "step": 11000 }, { "epoch": 1.321921921921922, "grad_norm": 0.622289776802063, "learning_rate": 5.927023225700282e-05, "loss": 0.1156, "step": 11005 }, { "epoch": 1.3225225225225226, "grad_norm": 0.544995129108429, "learning_rate": 5.9239328265607195e-05, "loss": 0.1059, "step": 11010 }, { "epoch": 1.323123123123123, "grad_norm": 0.4979136288166046, "learning_rate": 5.9208420619362135e-05, "loss": 0.1104, "step": 11015 }, { "epoch": 1.3237237237237238, "grad_norm": 0.5703704357147217, "learning_rate": 5.917750933049393e-05, "loss": 0.0891, "step": 11020 }, { "epoch": 1.3243243243243243, "grad_norm": 0.4206981062889099, "learning_rate": 5.914659441123032e-05, "loss": 0.1, "step": 11025 }, { "epoch": 1.3249249249249249, "grad_norm": 0.5549303293228149, "learning_rate": 5.911567587380048e-05, "loss": 0.1016, "step": 11030 }, { "epoch": 1.3255255255255256, "grad_norm": 0.45054033398628235, "learning_rate": 5.908475373043504e-05, "loss": 0.0935, "step": 11035 }, { "epoch": 1.3261261261261261, "grad_norm": 0.46710655093193054, "learning_rate": 5.905382799336601e-05, "loss": 0.1064, "step": 11040 }, { "epoch": 1.3267267267267266, "grad_norm": 0.4755312204360962, "learning_rate": 5.902289867482684e-05, "loss": 0.0951, "step": 11045 }, { "epoch": 1.3273273273273274, "grad_norm": 0.5991599559783936, "learning_rate": 5.899196578705244e-05, "loss": 0.1126, "step": 11050 }, { "epoch": 1.327927927927928, "grad_norm": 0.4284519553184509, "learning_rate": 5.896102934227905e-05, "loss": 0.1034, "step": 11055 }, { "epoch": 1.3285285285285284, "grad_norm": 0.5215682983398438, "learning_rate": 5.893008935274438e-05, "loss": 0.1146, "step": 11060 }, { "epoch": 1.3291291291291292, "grad_norm": 0.38606658577919006, "learning_rate": 5.889914583068752e-05, "loss": 0.1045, "step": 11065 }, { "epoch": 1.3297297297297297, "grad_norm": 0.5296543836593628, "learning_rate": 5.886819878834898e-05, "loss": 0.097, "step": 11070 }, { "epoch": 1.3303303303303302, "grad_norm": 0.4166487455368042, "learning_rate": 5.8837248237970624e-05, "loss": 0.1028, "step": 11075 }, { "epoch": 1.330930930930931, "grad_norm": 0.43563294410705566, "learning_rate": 5.880629419179573e-05, "loss": 0.1033, "step": 11080 }, { "epoch": 1.3315315315315315, "grad_norm": 0.5101214051246643, "learning_rate": 5.8775336662068936e-05, "loss": 0.1046, "step": 11085 }, { "epoch": 1.332132132132132, "grad_norm": 0.6170099973678589, "learning_rate": 5.87443756610363e-05, "loss": 0.0988, "step": 11090 }, { "epoch": 1.3327327327327327, "grad_norm": 0.5087183117866516, "learning_rate": 5.871341120094522e-05, "loss": 0.1191, "step": 11095 }, { "epoch": 1.3333333333333333, "grad_norm": 0.5283128619194031, "learning_rate": 5.8682443294044455e-05, "loss": 0.0971, "step": 11100 }, { "epoch": 1.333933933933934, "grad_norm": 0.4020695984363556, "learning_rate": 5.8651471952584155e-05, "loss": 0.1006, "step": 11105 }, { "epoch": 1.3345345345345345, "grad_norm": 0.5804185271263123, "learning_rate": 5.8620497188815805e-05, "loss": 0.1019, "step": 11110 }, { "epoch": 1.3351351351351353, "grad_norm": 0.5557799935340881, "learning_rate": 5.858951901499228e-05, "loss": 0.0986, "step": 11115 }, { "epoch": 1.3357357357357358, "grad_norm": 0.5129476189613342, "learning_rate": 5.8558537443367734e-05, "loss": 0.096, "step": 11120 }, { "epoch": 1.3363363363363363, "grad_norm": 0.47615566849708557, "learning_rate": 5.8527552486197746e-05, "loss": 0.1057, "step": 11125 }, { "epoch": 1.336936936936937, "grad_norm": 0.4600653052330017, "learning_rate": 5.84965641557392e-05, "loss": 0.0856, "step": 11130 }, { "epoch": 1.3375375375375376, "grad_norm": 0.3756698668003082, "learning_rate": 5.846557246425028e-05, "loss": 0.085, "step": 11135 }, { "epoch": 1.338138138138138, "grad_norm": 0.40878963470458984, "learning_rate": 5.8434577423990577e-05, "loss": 0.09, "step": 11140 }, { "epoch": 1.3387387387387388, "grad_norm": 0.5921834111213684, "learning_rate": 5.8403579047220915e-05, "loss": 0.0947, "step": 11145 }, { "epoch": 1.3393393393393394, "grad_norm": 0.4594131112098694, "learning_rate": 5.8372577346203515e-05, "loss": 0.0899, "step": 11150 }, { "epoch": 1.3399399399399399, "grad_norm": 0.44041335582733154, "learning_rate": 5.834157233320186e-05, "loss": 0.0919, "step": 11155 }, { "epoch": 1.3405405405405406, "grad_norm": 0.5850462317466736, "learning_rate": 5.8310564020480774e-05, "loss": 0.0771, "step": 11160 }, { "epoch": 1.3411411411411411, "grad_norm": 0.545671284198761, "learning_rate": 5.8279552420306394e-05, "loss": 0.0919, "step": 11165 }, { "epoch": 1.3417417417417417, "grad_norm": 0.44084644317626953, "learning_rate": 5.824853754494611e-05, "loss": 0.0894, "step": 11170 }, { "epoch": 1.3423423423423424, "grad_norm": 0.49509671330451965, "learning_rate": 5.821751940666867e-05, "loss": 0.1111, "step": 11175 }, { "epoch": 1.342942942942943, "grad_norm": 0.5806544423103333, "learning_rate": 5.8186498017744063e-05, "loss": 0.116, "step": 11180 }, { "epoch": 1.3435435435435434, "grad_norm": 0.46228882670402527, "learning_rate": 5.815547339044359e-05, "loss": 0.0919, "step": 11185 }, { "epoch": 1.3441441441441442, "grad_norm": 0.4910290539264679, "learning_rate": 5.8124445537039826e-05, "loss": 0.1098, "step": 11190 }, { "epoch": 1.3447447447447447, "grad_norm": 0.5441747903823853, "learning_rate": 5.809341446980661e-05, "loss": 0.101, "step": 11195 }, { "epoch": 1.3453453453453452, "grad_norm": 0.4423178434371948, "learning_rate": 5.8062380201019086e-05, "loss": 0.1156, "step": 11200 }, { "epoch": 1.345945945945946, "grad_norm": 0.5379671454429626, "learning_rate": 5.80313427429536e-05, "loss": 0.1028, "step": 11205 }, { "epoch": 1.3465465465465465, "grad_norm": 0.572998046875, "learning_rate": 5.800030210788785e-05, "loss": 0.0994, "step": 11210 }, { "epoch": 1.3471471471471472, "grad_norm": 0.5262696146965027, "learning_rate": 5.7969258308100705e-05, "loss": 0.0943, "step": 11215 }, { "epoch": 1.3477477477477477, "grad_norm": 0.5349236130714417, "learning_rate": 5.793821135587235e-05, "loss": 0.1126, "step": 11220 }, { "epoch": 1.3483483483483483, "grad_norm": 0.6178399920463562, "learning_rate": 5.790716126348417e-05, "loss": 0.099, "step": 11225 }, { "epoch": 1.348948948948949, "grad_norm": 0.5026535987854004, "learning_rate": 5.7876108043218835e-05, "loss": 0.1036, "step": 11230 }, { "epoch": 1.3495495495495495, "grad_norm": 0.45836958289146423, "learning_rate": 5.784505170736022e-05, "loss": 0.0998, "step": 11235 }, { "epoch": 1.3501501501501503, "grad_norm": 0.534871518611908, "learning_rate": 5.781399226819342e-05, "loss": 0.1069, "step": 11240 }, { "epoch": 1.3507507507507508, "grad_norm": 0.33303356170654297, "learning_rate": 5.778292973800482e-05, "loss": 0.0877, "step": 11245 }, { "epoch": 1.3513513513513513, "grad_norm": 0.5539909601211548, "learning_rate": 5.7751864129081946e-05, "loss": 0.0913, "step": 11250 }, { "epoch": 1.3513513513513513, "eval_loss": 0.11916637420654297, "eval_runtime": 35.8503, "eval_samples_per_second": 22.315, "eval_steps_per_second": 5.579, "step": 11250 }, { "epoch": 1.351951951951952, "grad_norm": 0.4414505362510681, "learning_rate": 5.772079545371363e-05, "loss": 0.0952, "step": 11255 }, { "epoch": 1.3525525525525526, "grad_norm": 0.4687088429927826, "learning_rate": 5.768972372418981e-05, "loss": 0.0853, "step": 11260 }, { "epoch": 1.353153153153153, "grad_norm": 0.5335027575492859, "learning_rate": 5.765864895280175e-05, "loss": 0.0925, "step": 11265 }, { "epoch": 1.3537537537537538, "grad_norm": 0.46961474418640137, "learning_rate": 5.7627571151841855e-05, "loss": 0.0987, "step": 11270 }, { "epoch": 1.3543543543543544, "grad_norm": 0.43510282039642334, "learning_rate": 5.759649033360369e-05, "loss": 0.1019, "step": 11275 }, { "epoch": 1.3549549549549549, "grad_norm": 0.4986173212528229, "learning_rate": 5.7565406510382094e-05, "loss": 0.0981, "step": 11280 }, { "epoch": 1.3555555555555556, "grad_norm": 0.610945999622345, "learning_rate": 5.753431969447305e-05, "loss": 0.1075, "step": 11285 }, { "epoch": 1.3561561561561561, "grad_norm": 0.4827207326889038, "learning_rate": 5.750322989817373e-05, "loss": 0.0872, "step": 11290 }, { "epoch": 1.3567567567567567, "grad_norm": 0.5450891852378845, "learning_rate": 5.747213713378248e-05, "loss": 0.1096, "step": 11295 }, { "epoch": 1.3573573573573574, "grad_norm": 0.5981200337409973, "learning_rate": 5.7441041413598815e-05, "loss": 0.0983, "step": 11300 }, { "epoch": 1.357957957957958, "grad_norm": 0.40616247057914734, "learning_rate": 5.740994274992348e-05, "loss": 0.089, "step": 11305 }, { "epoch": 1.3585585585585584, "grad_norm": 0.45179131627082825, "learning_rate": 5.737884115505829e-05, "loss": 0.0843, "step": 11310 }, { "epoch": 1.3591591591591592, "grad_norm": 0.6250925064086914, "learning_rate": 5.73477366413063e-05, "loss": 0.1029, "step": 11315 }, { "epoch": 1.3597597597597597, "grad_norm": 0.4998907744884491, "learning_rate": 5.731662922097165e-05, "loss": 0.093, "step": 11320 }, { "epoch": 1.3603603603603602, "grad_norm": 0.47104600071907043, "learning_rate": 5.7285518906359706e-05, "loss": 0.0964, "step": 11325 }, { "epoch": 1.360960960960961, "grad_norm": 0.439411461353302, "learning_rate": 5.72544057097769e-05, "loss": 0.0895, "step": 11330 }, { "epoch": 1.3615615615615615, "grad_norm": 0.6331053972244263, "learning_rate": 5.722328964353085e-05, "loss": 0.1038, "step": 11335 }, { "epoch": 1.3621621621621622, "grad_norm": 0.47725340723991394, "learning_rate": 5.719217071993033e-05, "loss": 0.0896, "step": 11340 }, { "epoch": 1.3627627627627628, "grad_norm": 0.44108104705810547, "learning_rate": 5.716104895128518e-05, "loss": 0.081, "step": 11345 }, { "epoch": 1.3633633633633635, "grad_norm": 0.4921906888484955, "learning_rate": 5.712992434990642e-05, "loss": 0.0881, "step": 11350 }, { "epoch": 1.363963963963964, "grad_norm": 0.5186919569969177, "learning_rate": 5.7098796928106156e-05, "loss": 0.1097, "step": 11355 }, { "epoch": 1.3645645645645645, "grad_norm": 0.49276605248451233, "learning_rate": 5.7067666698197654e-05, "loss": 0.0965, "step": 11360 }, { "epoch": 1.3651651651651653, "grad_norm": 0.4950788915157318, "learning_rate": 5.703653367249522e-05, "loss": 0.0912, "step": 11365 }, { "epoch": 1.3657657657657658, "grad_norm": 0.588308572769165, "learning_rate": 5.700539786331436e-05, "loss": 0.1028, "step": 11370 }, { "epoch": 1.3663663663663663, "grad_norm": 0.4782957434654236, "learning_rate": 5.6974259282971585e-05, "loss": 0.0753, "step": 11375 }, { "epoch": 1.366966966966967, "grad_norm": 0.5269047617912292, "learning_rate": 5.6943117943784554e-05, "loss": 0.0916, "step": 11380 }, { "epoch": 1.3675675675675676, "grad_norm": 0.4562937915325165, "learning_rate": 5.691197385807203e-05, "loss": 0.0996, "step": 11385 }, { "epoch": 1.368168168168168, "grad_norm": 0.45184147357940674, "learning_rate": 5.688082703815382e-05, "loss": 0.0858, "step": 11390 }, { "epoch": 1.3687687687687689, "grad_norm": 0.5662463307380676, "learning_rate": 5.684967749635085e-05, "loss": 0.0923, "step": 11395 }, { "epoch": 1.3693693693693694, "grad_norm": 0.5363159775733948, "learning_rate": 5.6818525244985096e-05, "loss": 0.0989, "step": 11400 }, { "epoch": 1.36996996996997, "grad_norm": 0.47409459948539734, "learning_rate": 5.6787370296379616e-05, "loss": 0.0816, "step": 11405 }, { "epoch": 1.3705705705705706, "grad_norm": 0.4212060868740082, "learning_rate": 5.675621266285855e-05, "loss": 0.0909, "step": 11410 }, { "epoch": 1.3711711711711712, "grad_norm": 0.5455527305603027, "learning_rate": 5.6725052356747074e-05, "loss": 0.0888, "step": 11415 }, { "epoch": 1.3717717717717717, "grad_norm": 0.5372433662414551, "learning_rate": 5.669388939037146e-05, "loss": 0.0899, "step": 11420 }, { "epoch": 1.3723723723723724, "grad_norm": 0.4447285830974579, "learning_rate": 5.666272377605897e-05, "loss": 0.0709, "step": 11425 }, { "epoch": 1.372972972972973, "grad_norm": 0.46429330110549927, "learning_rate": 5.663155552613797e-05, "loss": 0.0814, "step": 11430 }, { "epoch": 1.3735735735735735, "grad_norm": 0.5037614107131958, "learning_rate": 5.660038465293782e-05, "loss": 0.0894, "step": 11435 }, { "epoch": 1.3741741741741742, "grad_norm": 0.5420588850975037, "learning_rate": 5.656921116878897e-05, "loss": 0.0943, "step": 11440 }, { "epoch": 1.3747747747747747, "grad_norm": 0.4849311113357544, "learning_rate": 5.6538035086022886e-05, "loss": 0.0971, "step": 11445 }, { "epoch": 1.3753753753753752, "grad_norm": 0.5062885284423828, "learning_rate": 5.650685641697203e-05, "loss": 0.089, "step": 11450 }, { "epoch": 1.375975975975976, "grad_norm": 0.5417320132255554, "learning_rate": 5.647567517396993e-05, "loss": 0.1063, "step": 11455 }, { "epoch": 1.3765765765765765, "grad_norm": 0.5244467258453369, "learning_rate": 5.64444913693511e-05, "loss": 0.0966, "step": 11460 }, { "epoch": 1.3771771771771772, "grad_norm": 0.5427935719490051, "learning_rate": 5.641330501545111e-05, "loss": 0.1022, "step": 11465 }, { "epoch": 1.3777777777777778, "grad_norm": 0.4481619596481323, "learning_rate": 5.6382116124606475e-05, "loss": 0.0845, "step": 11470 }, { "epoch": 1.3783783783783785, "grad_norm": 0.5331611037254333, "learning_rate": 5.635092470915476e-05, "loss": 0.0923, "step": 11475 }, { "epoch": 1.378978978978979, "grad_norm": 0.44660672545433044, "learning_rate": 5.631973078143452e-05, "loss": 0.0995, "step": 11480 }, { "epoch": 1.3795795795795796, "grad_norm": 0.40920501947402954, "learning_rate": 5.628853435378528e-05, "loss": 0.0844, "step": 11485 }, { "epoch": 1.3801801801801803, "grad_norm": 0.4937179386615753, "learning_rate": 5.625733543854762e-05, "loss": 0.1036, "step": 11490 }, { "epoch": 1.3807807807807808, "grad_norm": 0.4166986048221588, "learning_rate": 5.622613404806301e-05, "loss": 0.0855, "step": 11495 }, { "epoch": 1.3813813813813813, "grad_norm": 0.45914360880851746, "learning_rate": 5.619493019467397e-05, "loss": 0.0791, "step": 11500 }, { "epoch": 1.3813813813813813, "eval_loss": 0.11311966925859451, "eval_runtime": 35.9989, "eval_samples_per_second": 22.223, "eval_steps_per_second": 5.556, "step": 11500 }, { "epoch": 1.381981981981982, "grad_norm": 0.5590953826904297, "learning_rate": 5.6163723890723966e-05, "loss": 0.0941, "step": 11505 }, { "epoch": 1.3825825825825826, "grad_norm": 0.5889363884925842, "learning_rate": 5.613251514855744e-05, "loss": 0.1024, "step": 11510 }, { "epoch": 1.3831831831831831, "grad_norm": 0.4771200120449066, "learning_rate": 5.61013039805198e-05, "loss": 0.0995, "step": 11515 }, { "epoch": 1.3837837837837839, "grad_norm": 0.5556389689445496, "learning_rate": 5.607009039895742e-05, "loss": 0.097, "step": 11520 }, { "epoch": 1.3843843843843844, "grad_norm": 0.468735009431839, "learning_rate": 5.60388744162176e-05, "loss": 0.0982, "step": 11525 }, { "epoch": 1.384984984984985, "grad_norm": 0.548349142074585, "learning_rate": 5.600765604464861e-05, "loss": 0.1022, "step": 11530 }, { "epoch": 1.3855855855855856, "grad_norm": 0.470237135887146, "learning_rate": 5.59764352965997e-05, "loss": 0.0873, "step": 11535 }, { "epoch": 1.3861861861861862, "grad_norm": 0.47610995173454285, "learning_rate": 5.594521218442097e-05, "loss": 0.0953, "step": 11540 }, { "epoch": 1.3867867867867867, "grad_norm": 0.5809658169746399, "learning_rate": 5.5913986720463554e-05, "loss": 0.1012, "step": 11545 }, { "epoch": 1.3873873873873874, "grad_norm": 0.5556275248527527, "learning_rate": 5.588275891707946e-05, "loss": 0.0864, "step": 11550 }, { "epoch": 1.387987987987988, "grad_norm": 0.4226778447628021, "learning_rate": 5.585152878662161e-05, "loss": 0.0802, "step": 11555 }, { "epoch": 1.3885885885885885, "grad_norm": 0.5059261322021484, "learning_rate": 5.5820296341443915e-05, "loss": 0.0972, "step": 11560 }, { "epoch": 1.3891891891891892, "grad_norm": 0.5257276892662048, "learning_rate": 5.5789061593901126e-05, "loss": 0.0818, "step": 11565 }, { "epoch": 1.3897897897897897, "grad_norm": 0.4813799560070038, "learning_rate": 5.575782455634895e-05, "loss": 0.0833, "step": 11570 }, { "epoch": 1.3903903903903903, "grad_norm": 0.4413239061832428, "learning_rate": 5.572658524114396e-05, "loss": 0.0911, "step": 11575 }, { "epoch": 1.390990990990991, "grad_norm": 0.5474624633789062, "learning_rate": 5.569534366064367e-05, "loss": 0.0936, "step": 11580 }, { "epoch": 1.3915915915915915, "grad_norm": 0.4176657795906067, "learning_rate": 5.566409982720649e-05, "loss": 0.0978, "step": 11585 }, { "epoch": 1.3921921921921923, "grad_norm": 0.40932703018188477, "learning_rate": 5.56328537531917e-05, "loss": 0.1017, "step": 11590 }, { "epoch": 1.3927927927927928, "grad_norm": 0.5456036925315857, "learning_rate": 5.560160545095945e-05, "loss": 0.0892, "step": 11595 }, { "epoch": 1.3933933933933935, "grad_norm": 0.44474583864212036, "learning_rate": 5.557035493287082e-05, "loss": 0.1029, "step": 11600 }, { "epoch": 1.393993993993994, "grad_norm": 0.5171622633934021, "learning_rate": 5.5539102211287744e-05, "loss": 0.0941, "step": 11605 }, { "epoch": 1.3945945945945946, "grad_norm": 0.4478524923324585, "learning_rate": 5.5507847298573015e-05, "loss": 0.085, "step": 11610 }, { "epoch": 1.3951951951951953, "grad_norm": 0.3781989514827728, "learning_rate": 5.547659020709028e-05, "loss": 0.0875, "step": 11615 }, { "epoch": 1.3957957957957958, "grad_norm": 0.5028838515281677, "learning_rate": 5.544533094920411e-05, "loss": 0.0769, "step": 11620 }, { "epoch": 1.3963963963963963, "grad_norm": 0.4439200758934021, "learning_rate": 5.541406953727987e-05, "loss": 0.1135, "step": 11625 }, { "epoch": 1.396996996996997, "grad_norm": 0.43673768639564514, "learning_rate": 5.538280598368382e-05, "loss": 0.0711, "step": 11630 }, { "epoch": 1.3975975975975976, "grad_norm": 0.4030311703681946, "learning_rate": 5.5351540300783e-05, "loss": 0.1047, "step": 11635 }, { "epoch": 1.3981981981981981, "grad_norm": 0.36829933524131775, "learning_rate": 5.532027250094539e-05, "loss": 0.0877, "step": 11640 }, { "epoch": 1.3987987987987989, "grad_norm": 0.5318426489830017, "learning_rate": 5.528900259653975e-05, "loss": 0.0978, "step": 11645 }, { "epoch": 1.3993993993993994, "grad_norm": 0.45535722374916077, "learning_rate": 5.525773059993566e-05, "loss": 0.0831, "step": 11650 }, { "epoch": 1.4, "grad_norm": 0.4648362398147583, "learning_rate": 5.522645652350357e-05, "loss": 0.0882, "step": 11655 }, { "epoch": 1.4006006006006007, "grad_norm": 0.4772687256336212, "learning_rate": 5.519518037961471e-05, "loss": 0.0894, "step": 11660 }, { "epoch": 1.4012012012012012, "grad_norm": 0.40186789631843567, "learning_rate": 5.516390218064115e-05, "loss": 0.0895, "step": 11665 }, { "epoch": 1.4018018018018017, "grad_norm": 0.630961000919342, "learning_rate": 5.5132621938955774e-05, "loss": 0.1021, "step": 11670 }, { "epoch": 1.4024024024024024, "grad_norm": 0.5480983853340149, "learning_rate": 5.510133966693227e-05, "loss": 0.0921, "step": 11675 }, { "epoch": 1.403003003003003, "grad_norm": 0.395860493183136, "learning_rate": 5.507005537694515e-05, "loss": 0.0875, "step": 11680 }, { "epoch": 1.4036036036036035, "grad_norm": 0.454228013753891, "learning_rate": 5.5038769081369665e-05, "loss": 0.0784, "step": 11685 }, { "epoch": 1.4042042042042042, "grad_norm": 0.42922738194465637, "learning_rate": 5.5007480792581946e-05, "loss": 0.088, "step": 11690 }, { "epoch": 1.4048048048048047, "grad_norm": 0.5611411929130554, "learning_rate": 5.497619052295882e-05, "loss": 0.0848, "step": 11695 }, { "epoch": 1.4054054054054055, "grad_norm": 0.5752083659172058, "learning_rate": 5.4944898284877974e-05, "loss": 0.08, "step": 11700 }, { "epoch": 1.406006006006006, "grad_norm": 0.6316119432449341, "learning_rate": 5.491360409071784e-05, "loss": 0.0837, "step": 11705 }, { "epoch": 1.4066066066066065, "grad_norm": 0.527634859085083, "learning_rate": 5.4882307952857605e-05, "loss": 0.0815, "step": 11710 }, { "epoch": 1.4072072072072073, "grad_norm": 0.5850387811660767, "learning_rate": 5.4851009883677265e-05, "loss": 0.0875, "step": 11715 }, { "epoch": 1.4078078078078078, "grad_norm": 0.4735187888145447, "learning_rate": 5.4819709895557545e-05, "loss": 0.0892, "step": 11720 }, { "epoch": 1.4084084084084085, "grad_norm": 0.4472479820251465, "learning_rate": 5.4788408000879966e-05, "loss": 0.0829, "step": 11725 }, { "epoch": 1.409009009009009, "grad_norm": 0.49625205993652344, "learning_rate": 5.4757104212026755e-05, "loss": 0.0825, "step": 11730 }, { "epoch": 1.4096096096096096, "grad_norm": 0.515315592288971, "learning_rate": 5.472579854138096e-05, "loss": 0.0985, "step": 11735 }, { "epoch": 1.4102102102102103, "grad_norm": 0.4145263135433197, "learning_rate": 5.4694491001326276e-05, "loss": 0.0919, "step": 11740 }, { "epoch": 1.4108108108108108, "grad_norm": 0.4036807715892792, "learning_rate": 5.4663181604247226e-05, "loss": 0.0929, "step": 11745 }, { "epoch": 1.4114114114114114, "grad_norm": 0.4930163621902466, "learning_rate": 5.463187036252902e-05, "loss": 0.0922, "step": 11750 }, { "epoch": 1.4114114114114114, "eval_loss": 0.10833635926246643, "eval_runtime": 35.806, "eval_samples_per_second": 22.343, "eval_steps_per_second": 5.586, "step": 11750 }, { "epoch": 1.412012012012012, "grad_norm": 0.5043427348136902, "learning_rate": 5.4600557288557606e-05, "loss": 0.0835, "step": 11755 }, { "epoch": 1.4126126126126126, "grad_norm": 0.35307490825653076, "learning_rate": 5.456924239471968e-05, "loss": 0.0712, "step": 11760 }, { "epoch": 1.4132132132132131, "grad_norm": 0.4506548047065735, "learning_rate": 5.4537925693402604e-05, "loss": 0.0996, "step": 11765 }, { "epoch": 1.4138138138138139, "grad_norm": 0.5748133063316345, "learning_rate": 5.450660719699452e-05, "loss": 0.1143, "step": 11770 }, { "epoch": 1.4144144144144144, "grad_norm": 0.4180302917957306, "learning_rate": 5.4475286917884236e-05, "loss": 0.0895, "step": 11775 }, { "epoch": 1.415015015015015, "grad_norm": 0.43286964297294617, "learning_rate": 5.4443964868461286e-05, "loss": 0.087, "step": 11780 }, { "epoch": 1.4156156156156157, "grad_norm": 0.46737971901893616, "learning_rate": 5.441264106111589e-05, "loss": 0.0906, "step": 11785 }, { "epoch": 1.4162162162162162, "grad_norm": 0.3804052174091339, "learning_rate": 5.4381315508238974e-05, "loss": 0.0865, "step": 11790 }, { "epoch": 1.4168168168168167, "grad_norm": 0.5925003886222839, "learning_rate": 5.434998822222215e-05, "loss": 0.0968, "step": 11795 }, { "epoch": 1.4174174174174174, "grad_norm": 0.41046711802482605, "learning_rate": 5.4318659215457724e-05, "loss": 0.0826, "step": 11800 }, { "epoch": 1.418018018018018, "grad_norm": 0.4627430737018585, "learning_rate": 5.428732850033866e-05, "loss": 0.0902, "step": 11805 }, { "epoch": 1.4186186186186185, "grad_norm": 0.4553092420101166, "learning_rate": 5.4255996089258624e-05, "loss": 0.0904, "step": 11810 }, { "epoch": 1.4192192192192192, "grad_norm": 0.36503249406814575, "learning_rate": 5.4224661994611934e-05, "loss": 0.0669, "step": 11815 }, { "epoch": 1.4198198198198198, "grad_norm": 0.4996648132801056, "learning_rate": 5.4193326228793593e-05, "loss": 0.0908, "step": 11820 }, { "epoch": 1.4204204204204205, "grad_norm": 0.4459191858768463, "learning_rate": 5.416198880419924e-05, "loss": 0.0777, "step": 11825 }, { "epoch": 1.421021021021021, "grad_norm": 0.4893633723258972, "learning_rate": 5.41306497332252e-05, "loss": 0.1203, "step": 11830 }, { "epoch": 1.4216216216216218, "grad_norm": 0.4804662764072418, "learning_rate": 5.409930902826842e-05, "loss": 0.0859, "step": 11835 }, { "epoch": 1.4222222222222223, "grad_norm": 0.46695834398269653, "learning_rate": 5.406796670172651e-05, "loss": 0.0825, "step": 11840 }, { "epoch": 1.4228228228228228, "grad_norm": 0.5017839074134827, "learning_rate": 5.4036622765997736e-05, "loss": 0.0821, "step": 11845 }, { "epoch": 1.4234234234234235, "grad_norm": 0.4748544692993164, "learning_rate": 5.4005277233480945e-05, "loss": 0.0933, "step": 11850 }, { "epoch": 1.424024024024024, "grad_norm": 0.5358333587646484, "learning_rate": 5.397393011657569e-05, "loss": 0.0891, "step": 11855 }, { "epoch": 1.4246246246246246, "grad_norm": 0.616308331489563, "learning_rate": 5.394258142768208e-05, "loss": 0.1044, "step": 11860 }, { "epoch": 1.4252252252252253, "grad_norm": 0.4904516637325287, "learning_rate": 5.3911231179200924e-05, "loss": 0.082, "step": 11865 }, { "epoch": 1.4258258258258258, "grad_norm": 0.4936327040195465, "learning_rate": 5.387987938353356e-05, "loss": 0.0889, "step": 11870 }, { "epoch": 1.4264264264264264, "grad_norm": 0.5046936869621277, "learning_rate": 5.384852605308202e-05, "loss": 0.0868, "step": 11875 }, { "epoch": 1.427027027027027, "grad_norm": 0.3366491198539734, "learning_rate": 5.381717120024886e-05, "loss": 0.0926, "step": 11880 }, { "epoch": 1.4276276276276276, "grad_norm": 0.5158485770225525, "learning_rate": 5.378581483743732e-05, "loss": 0.0872, "step": 11885 }, { "epoch": 1.4282282282282281, "grad_norm": 0.4886568784713745, "learning_rate": 5.3754456977051205e-05, "loss": 0.0812, "step": 11890 }, { "epoch": 1.428828828828829, "grad_norm": 0.500196099281311, "learning_rate": 5.372309763149487e-05, "loss": 0.0782, "step": 11895 }, { "epoch": 1.4294294294294294, "grad_norm": 0.588158905506134, "learning_rate": 5.369173681317333e-05, "loss": 0.0908, "step": 11900 }, { "epoch": 1.43003003003003, "grad_norm": 0.3685852587223053, "learning_rate": 5.366037453449213e-05, "loss": 0.0759, "step": 11905 }, { "epoch": 1.4306306306306307, "grad_norm": 0.45046189427375793, "learning_rate": 5.3629010807857414e-05, "loss": 0.076, "step": 11910 }, { "epoch": 1.4312312312312312, "grad_norm": 0.4529851973056793, "learning_rate": 5.359764564567591e-05, "loss": 0.0815, "step": 11915 }, { "epoch": 1.4318318318318317, "grad_norm": 0.42264431715011597, "learning_rate": 5.356627906035488e-05, "loss": 0.0893, "step": 11920 }, { "epoch": 1.4324324324324325, "grad_norm": 0.4683610498905182, "learning_rate": 5.353491106430217e-05, "loss": 0.0842, "step": 11925 }, { "epoch": 1.433033033033033, "grad_norm": 0.40563976764678955, "learning_rate": 5.350354166992619e-05, "loss": 0.0844, "step": 11930 }, { "epoch": 1.4336336336336335, "grad_norm": 0.430910587310791, "learning_rate": 5.347217088963591e-05, "loss": 0.0873, "step": 11935 }, { "epoch": 1.4342342342342342, "grad_norm": 0.46596115827560425, "learning_rate": 5.3440798735840804e-05, "loss": 0.0853, "step": 11940 }, { "epoch": 1.4348348348348348, "grad_norm": 0.44316524267196655, "learning_rate": 5.340942522095095e-05, "loss": 0.082, "step": 11945 }, { "epoch": 1.4354354354354355, "grad_norm": 0.5083211660385132, "learning_rate": 5.337805035737689e-05, "loss": 0.1029, "step": 11950 }, { "epoch": 1.436036036036036, "grad_norm": 0.449158251285553, "learning_rate": 5.3346674157529776e-05, "loss": 0.0944, "step": 11955 }, { "epoch": 1.4366366366366368, "grad_norm": 0.47196316719055176, "learning_rate": 5.331529663382125e-05, "loss": 0.0943, "step": 11960 }, { "epoch": 1.4372372372372373, "grad_norm": 0.4012751579284668, "learning_rate": 5.328391779866348e-05, "loss": 0.0901, "step": 11965 }, { "epoch": 1.4378378378378378, "grad_norm": 0.5194813013076782, "learning_rate": 5.3252537664469185e-05, "loss": 0.0924, "step": 11970 }, { "epoch": 1.4384384384384385, "grad_norm": 0.5404078364372253, "learning_rate": 5.3221156243651505e-05, "loss": 0.0873, "step": 11975 }, { "epoch": 1.439039039039039, "grad_norm": 0.38077086210250854, "learning_rate": 5.318977354862421e-05, "loss": 0.0727, "step": 11980 }, { "epoch": 1.4396396396396396, "grad_norm": 0.4743851125240326, "learning_rate": 5.31583895918015e-05, "loss": 0.0927, "step": 11985 }, { "epoch": 1.4402402402402403, "grad_norm": 0.46732330322265625, "learning_rate": 5.312700438559808e-05, "loss": 0.096, "step": 11990 }, { "epoch": 1.4408408408408409, "grad_norm": 0.537147045135498, "learning_rate": 5.309561794242918e-05, "loss": 0.0771, "step": 11995 }, { "epoch": 1.4414414414414414, "grad_norm": 0.4657417833805084, "learning_rate": 5.306423027471046e-05, "loss": 0.0765, "step": 12000 }, { "epoch": 1.4414414414414414, "eval_loss": 0.10528457909822464, "eval_runtime": 36.0279, "eval_samples_per_second": 22.205, "eval_steps_per_second": 5.551, "step": 12000 }, { "epoch": 1.4420420420420421, "grad_norm": 0.5093770623207092, "learning_rate": 5.3032841394858154e-05, "loss": 0.0864, "step": 12005 }, { "epoch": 1.4426426426426426, "grad_norm": 0.4265924096107483, "learning_rate": 5.3001451315288895e-05, "loss": 0.0909, "step": 12010 }, { "epoch": 1.4432432432432432, "grad_norm": 0.47986164689064026, "learning_rate": 5.297006004841983e-05, "loss": 0.0814, "step": 12015 }, { "epoch": 1.443843843843844, "grad_norm": 0.5935015082359314, "learning_rate": 5.293866760666857e-05, "loss": 0.0855, "step": 12020 }, { "epoch": 1.4444444444444444, "grad_norm": 0.4596727192401886, "learning_rate": 5.290727400245319e-05, "loss": 0.0787, "step": 12025 }, { "epoch": 1.445045045045045, "grad_norm": 0.5177743434906006, "learning_rate": 5.2875879248192196e-05, "loss": 0.0867, "step": 12030 }, { "epoch": 1.4456456456456457, "grad_norm": 0.4075930118560791, "learning_rate": 5.284448335630462e-05, "loss": 0.0868, "step": 12035 }, { "epoch": 1.4462462462462462, "grad_norm": 0.4848853647708893, "learning_rate": 5.281308633920986e-05, "loss": 0.0647, "step": 12040 }, { "epoch": 1.4468468468468467, "grad_norm": 0.374552458524704, "learning_rate": 5.278168820932782e-05, "loss": 0.0723, "step": 12045 }, { "epoch": 1.4474474474474475, "grad_norm": 0.4907662570476532, "learning_rate": 5.27502889790788e-05, "loss": 0.0873, "step": 12050 }, { "epoch": 1.448048048048048, "grad_norm": 0.5037271976470947, "learning_rate": 5.2718888660883594e-05, "loss": 0.0884, "step": 12055 }, { "epoch": 1.4486486486486487, "grad_norm": 0.5089353919029236, "learning_rate": 5.268748726716335e-05, "loss": 0.0953, "step": 12060 }, { "epoch": 1.4492492492492492, "grad_norm": 0.4179510176181793, "learning_rate": 5.265608481033971e-05, "loss": 0.0768, "step": 12065 }, { "epoch": 1.4498498498498498, "grad_norm": 0.5133365988731384, "learning_rate": 5.26246813028347e-05, "loss": 0.0915, "step": 12070 }, { "epoch": 1.4504504504504505, "grad_norm": 0.5832177996635437, "learning_rate": 5.2593276757070775e-05, "loss": 0.0984, "step": 12075 }, { "epoch": 1.451051051051051, "grad_norm": 0.424835205078125, "learning_rate": 5.256187118547079e-05, "loss": 0.0844, "step": 12080 }, { "epoch": 1.4516516516516518, "grad_norm": 0.40258777141571045, "learning_rate": 5.253046460045799e-05, "loss": 0.0816, "step": 12085 }, { "epoch": 1.4522522522522523, "grad_norm": 0.4845524728298187, "learning_rate": 5.249905701445609e-05, "loss": 0.0761, "step": 12090 }, { "epoch": 1.4528528528528528, "grad_norm": 0.44638118147850037, "learning_rate": 5.24676484398891e-05, "loss": 0.0815, "step": 12095 }, { "epoch": 1.4534534534534536, "grad_norm": 0.5267859697341919, "learning_rate": 5.243623888918153e-05, "loss": 0.0742, "step": 12100 }, { "epoch": 1.454054054054054, "grad_norm": 0.5129780769348145, "learning_rate": 5.2404828374758174e-05, "loss": 0.0718, "step": 12105 }, { "epoch": 1.4546546546546546, "grad_norm": 0.4829353094100952, "learning_rate": 5.237341690904428e-05, "loss": 0.08, "step": 12110 }, { "epoch": 1.4552552552552553, "grad_norm": 0.4809311032295227, "learning_rate": 5.2342004504465426e-05, "loss": 0.0719, "step": 12115 }, { "epoch": 1.4558558558558559, "grad_norm": 0.5456272959709167, "learning_rate": 5.2310591173447596e-05, "loss": 0.074, "step": 12120 }, { "epoch": 1.4564564564564564, "grad_norm": 0.4419856071472168, "learning_rate": 5.2279176928417127e-05, "loss": 0.063, "step": 12125 }, { "epoch": 1.4570570570570571, "grad_norm": 0.41237685084342957, "learning_rate": 5.224776178180071e-05, "loss": 0.0824, "step": 12130 }, { "epoch": 1.4576576576576576, "grad_norm": 0.5364236235618591, "learning_rate": 5.22163457460254e-05, "loss": 0.0999, "step": 12135 }, { "epoch": 1.4582582582582582, "grad_norm": 0.4944758415222168, "learning_rate": 5.218492883351859e-05, "loss": 0.0775, "step": 12140 }, { "epoch": 1.458858858858859, "grad_norm": 0.4452052414417267, "learning_rate": 5.215351105670806e-05, "loss": 0.0739, "step": 12145 }, { "epoch": 1.4594594594594594, "grad_norm": 0.41491255164146423, "learning_rate": 5.2122092428021874e-05, "loss": 0.0785, "step": 12150 }, { "epoch": 1.46006006006006, "grad_norm": 0.39245888590812683, "learning_rate": 5.209067295988849e-05, "loss": 0.0763, "step": 12155 }, { "epoch": 1.4606606606606607, "grad_norm": 0.4065698981285095, "learning_rate": 5.205925266473666e-05, "loss": 0.0959, "step": 12160 }, { "epoch": 1.4612612612612612, "grad_norm": 0.5166289806365967, "learning_rate": 5.2027831554995464e-05, "loss": 0.0966, "step": 12165 }, { "epoch": 1.4618618618618617, "grad_norm": 0.4691830575466156, "learning_rate": 5.199640964309434e-05, "loss": 0.094, "step": 12170 }, { "epoch": 1.4624624624624625, "grad_norm": 0.5310338735580444, "learning_rate": 5.196498694146301e-05, "loss": 0.0825, "step": 12175 }, { "epoch": 1.463063063063063, "grad_norm": 0.41804417967796326, "learning_rate": 5.193356346253151e-05, "loss": 0.0696, "step": 12180 }, { "epoch": 1.4636636636636637, "grad_norm": 0.5980838537216187, "learning_rate": 5.190213921873017e-05, "loss": 0.0741, "step": 12185 }, { "epoch": 1.4642642642642643, "grad_norm": 0.4227926731109619, "learning_rate": 5.187071422248968e-05, "loss": 0.0805, "step": 12190 }, { "epoch": 1.464864864864865, "grad_norm": 0.48485782742500305, "learning_rate": 5.1839288486240975e-05, "loss": 0.0666, "step": 12195 }, { "epoch": 1.4654654654654655, "grad_norm": 0.3551216423511505, "learning_rate": 5.1807862022415275e-05, "loss": 0.0899, "step": 12200 }, { "epoch": 1.466066066066066, "grad_norm": 0.5003598928451538, "learning_rate": 5.1776434843444164e-05, "loss": 0.0685, "step": 12205 }, { "epoch": 1.4666666666666668, "grad_norm": 0.5011917948722839, "learning_rate": 5.17450069617594e-05, "loss": 0.076, "step": 12210 }, { "epoch": 1.4672672672672673, "grad_norm": 0.48780590295791626, "learning_rate": 5.1713578389793116e-05, "loss": 0.0751, "step": 12215 }, { "epoch": 1.4678678678678678, "grad_norm": 0.48917055130004883, "learning_rate": 5.1682149139977655e-05, "loss": 0.0926, "step": 12220 }, { "epoch": 1.4684684684684686, "grad_norm": 0.4546566307544708, "learning_rate": 5.165071922474564e-05, "loss": 0.0723, "step": 12225 }, { "epoch": 1.469069069069069, "grad_norm": 0.4893244504928589, "learning_rate": 5.1619288656529995e-05, "loss": 0.0798, "step": 12230 }, { "epoch": 1.4696696696696696, "grad_norm": 0.4245280921459198, "learning_rate": 5.158785744776385e-05, "loss": 0.0831, "step": 12235 }, { "epoch": 1.4702702702702704, "grad_norm": 0.4717468023300171, "learning_rate": 5.155642561088063e-05, "loss": 0.0937, "step": 12240 }, { "epoch": 1.4708708708708709, "grad_norm": 0.4630936086177826, "learning_rate": 5.152499315831398e-05, "loss": 0.0755, "step": 12245 }, { "epoch": 1.4714714714714714, "grad_norm": 0.5492383241653442, "learning_rate": 5.149356010249782e-05, "loss": 0.0997, "step": 12250 }, { "epoch": 1.4714714714714714, "eval_loss": 0.10047008842229843, "eval_runtime": 35.8826, "eval_samples_per_second": 22.295, "eval_steps_per_second": 5.574, "step": 12250 }, { "epoch": 1.4720720720720721, "grad_norm": 0.5058827996253967, "learning_rate": 5.1462126455866255e-05, "loss": 0.0765, "step": 12255 }, { "epoch": 1.4726726726726727, "grad_norm": 0.4335672855377197, "learning_rate": 5.143069223085368e-05, "loss": 0.0781, "step": 12260 }, { "epoch": 1.4732732732732732, "grad_norm": 0.5172309875488281, "learning_rate": 5.139925743989471e-05, "loss": 0.1014, "step": 12265 }, { "epoch": 1.473873873873874, "grad_norm": 0.45344531536102295, "learning_rate": 5.136782209542412e-05, "loss": 0.0718, "step": 12270 }, { "epoch": 1.4744744744744744, "grad_norm": 0.4879536032676697, "learning_rate": 5.133638620987701e-05, "loss": 0.0715, "step": 12275 }, { "epoch": 1.475075075075075, "grad_norm": 0.45065709948539734, "learning_rate": 5.130494979568859e-05, "loss": 0.0793, "step": 12280 }, { "epoch": 1.4756756756756757, "grad_norm": 0.5382446646690369, "learning_rate": 5.127351286529436e-05, "loss": 0.0925, "step": 12285 }, { "epoch": 1.4762762762762762, "grad_norm": 0.4322677552700043, "learning_rate": 5.124207543112998e-05, "loss": 0.084, "step": 12290 }, { "epoch": 1.4768768768768767, "grad_norm": 0.4754551649093628, "learning_rate": 5.121063750563131e-05, "loss": 0.1005, "step": 12295 }, { "epoch": 1.4774774774774775, "grad_norm": 0.4164341986179352, "learning_rate": 5.117919910123444e-05, "loss": 0.0896, "step": 12300 }, { "epoch": 1.478078078078078, "grad_norm": 0.44691160321235657, "learning_rate": 5.114776023037561e-05, "loss": 0.075, "step": 12305 }, { "epoch": 1.4786786786786787, "grad_norm": 0.41045352816581726, "learning_rate": 5.111632090549126e-05, "loss": 0.071, "step": 12310 }, { "epoch": 1.4792792792792793, "grad_norm": 0.48388731479644775, "learning_rate": 5.108488113901799e-05, "loss": 0.0802, "step": 12315 }, { "epoch": 1.47987987987988, "grad_norm": 0.5359501242637634, "learning_rate": 5.1053440943392626e-05, "loss": 0.1017, "step": 12320 }, { "epoch": 1.4804804804804805, "grad_norm": 0.5539635419845581, "learning_rate": 5.102200033105211e-05, "loss": 0.0962, "step": 12325 }, { "epoch": 1.481081081081081, "grad_norm": 0.4830450117588043, "learning_rate": 5.099055931443356e-05, "loss": 0.0804, "step": 12330 }, { "epoch": 1.4816816816816818, "grad_norm": 0.5440949201583862, "learning_rate": 5.0959117905974295e-05, "loss": 0.0932, "step": 12335 }, { "epoch": 1.4822822822822823, "grad_norm": 0.41771113872528076, "learning_rate": 5.092767611811172e-05, "loss": 0.0792, "step": 12340 }, { "epoch": 1.4828828828828828, "grad_norm": 0.4817197918891907, "learning_rate": 5.089623396328347e-05, "loss": 0.0794, "step": 12345 }, { "epoch": 1.4834834834834836, "grad_norm": 0.4798753261566162, "learning_rate": 5.086479145392725e-05, "loss": 0.0917, "step": 12350 }, { "epoch": 1.484084084084084, "grad_norm": 0.6235416531562805, "learning_rate": 5.0833348602480954e-05, "loss": 0.0787, "step": 12355 }, { "epoch": 1.4846846846846846, "grad_norm": 0.5768137574195862, "learning_rate": 5.080190542138259e-05, "loss": 0.0827, "step": 12360 }, { "epoch": 1.4852852852852854, "grad_norm": 0.4930762052536011, "learning_rate": 5.0770461923070286e-05, "loss": 0.0716, "step": 12365 }, { "epoch": 1.4858858858858859, "grad_norm": 0.4362534284591675, "learning_rate": 5.073901811998234e-05, "loss": 0.0683, "step": 12370 }, { "epoch": 1.4864864864864864, "grad_norm": 0.48355865478515625, "learning_rate": 5.070757402455712e-05, "loss": 0.0635, "step": 12375 }, { "epoch": 1.4870870870870871, "grad_norm": 0.5248156785964966, "learning_rate": 5.067612964923315e-05, "loss": 0.0669, "step": 12380 }, { "epoch": 1.4876876876876877, "grad_norm": 0.5497010350227356, "learning_rate": 5.064468500644903e-05, "loss": 0.076, "step": 12385 }, { "epoch": 1.4882882882882882, "grad_norm": 0.4656374454498291, "learning_rate": 5.061324010864349e-05, "loss": 0.0598, "step": 12390 }, { "epoch": 1.488888888888889, "grad_norm": 0.5101540088653564, "learning_rate": 5.058179496825535e-05, "loss": 0.0773, "step": 12395 }, { "epoch": 1.4894894894894894, "grad_norm": 0.4140486717224121, "learning_rate": 5.055034959772352e-05, "loss": 0.0633, "step": 12400 }, { "epoch": 1.49009009009009, "grad_norm": 0.39580681920051575, "learning_rate": 5.051890400948703e-05, "loss": 0.0754, "step": 12405 }, { "epoch": 1.4906906906906907, "grad_norm": 0.36044490337371826, "learning_rate": 5.0487458215984964e-05, "loss": 0.0733, "step": 12410 }, { "epoch": 1.4912912912912912, "grad_norm": 0.3795586824417114, "learning_rate": 5.04560122296565e-05, "loss": 0.0607, "step": 12415 }, { "epoch": 1.491891891891892, "grad_norm": 0.49337899684906006, "learning_rate": 5.042456606294088e-05, "loss": 0.0691, "step": 12420 }, { "epoch": 1.4924924924924925, "grad_norm": 0.4806540012359619, "learning_rate": 5.039311972827746e-05, "loss": 0.0736, "step": 12425 }, { "epoch": 1.493093093093093, "grad_norm": 0.4080536365509033, "learning_rate": 5.03616732381056e-05, "loss": 0.0681, "step": 12430 }, { "epoch": 1.4936936936936938, "grad_norm": 0.4222785532474518, "learning_rate": 5.033022660486475e-05, "loss": 0.0698, "step": 12435 }, { "epoch": 1.4942942942942943, "grad_norm": 0.4748011529445648, "learning_rate": 5.029877984099446e-05, "loss": 0.0726, "step": 12440 }, { "epoch": 1.494894894894895, "grad_norm": 0.3876466751098633, "learning_rate": 5.0267332958934246e-05, "loss": 0.0768, "step": 12445 }, { "epoch": 1.4954954954954955, "grad_norm": 0.4406418800354004, "learning_rate": 5.023588597112374e-05, "loss": 0.0794, "step": 12450 }, { "epoch": 1.496096096096096, "grad_norm": 0.5697823166847229, "learning_rate": 5.020443889000259e-05, "loss": 0.0802, "step": 12455 }, { "epoch": 1.4966966966966968, "grad_norm": 0.42151203751564026, "learning_rate": 5.017299172801049e-05, "loss": 0.0846, "step": 12460 }, { "epoch": 1.4972972972972973, "grad_norm": 0.39392387866973877, "learning_rate": 5.014154449758712e-05, "loss": 0.078, "step": 12465 }, { "epoch": 1.4978978978978978, "grad_norm": 0.5100020170211792, "learning_rate": 5.011009721117226e-05, "loss": 0.0641, "step": 12470 }, { "epoch": 1.4984984984984986, "grad_norm": 0.5471884608268738, "learning_rate": 5.0078649881205684e-05, "loss": 0.0765, "step": 12475 }, { "epoch": 1.499099099099099, "grad_norm": 0.42572712898254395, "learning_rate": 5.0047202520127144e-05, "loss": 0.0848, "step": 12480 }, { "epoch": 1.4996996996996996, "grad_norm": 0.4461537003517151, "learning_rate": 5.001575514037647e-05, "loss": 0.0732, "step": 12485 }, { "epoch": 1.5003003003003004, "grad_norm": 0.40868768095970154, "learning_rate": 4.9984307754393456e-05, "loss": 0.0807, "step": 12490 }, { "epoch": 1.500900900900901, "grad_norm": 0.4080524742603302, "learning_rate": 4.995286037461789e-05, "loss": 0.0494, "step": 12495 }, { "epoch": 1.5015015015015014, "grad_norm": 0.4617525637149811, "learning_rate": 4.9921413013489604e-05, "loss": 0.082, "step": 12500 }, { "epoch": 1.5015015015015014, "eval_loss": 0.09793703258037567, "eval_runtime": 36.0537, "eval_samples_per_second": 22.189, "eval_steps_per_second": 5.547, "step": 12500 }, { "epoch": 1.5021021021021022, "grad_norm": 0.47754546999931335, "learning_rate": 4.988996568344838e-05, "loss": 0.071, "step": 12505 }, { "epoch": 1.5027027027027027, "grad_norm": 0.5164263248443604, "learning_rate": 4.9858518396934e-05, "loss": 0.0709, "step": 12510 }, { "epoch": 1.5033033033033032, "grad_norm": 0.3986726999282837, "learning_rate": 4.982707116638625e-05, "loss": 0.0621, "step": 12515 }, { "epoch": 1.503903903903904, "grad_norm": 0.4838113486766815, "learning_rate": 4.9795624004244855e-05, "loss": 0.0867, "step": 12520 }, { "epoch": 1.5045045045045045, "grad_norm": 0.5557221174240112, "learning_rate": 4.976417692294954e-05, "loss": 0.0788, "step": 12525 }, { "epoch": 1.505105105105105, "grad_norm": 0.45177948474884033, "learning_rate": 4.973272993493999e-05, "loss": 0.069, "step": 12530 }, { "epoch": 1.5057057057057057, "grad_norm": 0.4850604832172394, "learning_rate": 4.9701283052655876e-05, "loss": 0.0876, "step": 12535 }, { "epoch": 1.5063063063063065, "grad_norm": 0.4056200087070465, "learning_rate": 4.966983628853679e-05, "loss": 0.0745, "step": 12540 }, { "epoch": 1.5069069069069068, "grad_norm": 0.45170727372169495, "learning_rate": 4.963838965502227e-05, "loss": 0.0801, "step": 12545 }, { "epoch": 1.5075075075075075, "grad_norm": 0.6197972297668457, "learning_rate": 4.960694316455187e-05, "loss": 0.076, "step": 12550 }, { "epoch": 1.5081081081081082, "grad_norm": 0.5713878870010376, "learning_rate": 4.9575496829564996e-05, "loss": 0.0849, "step": 12555 }, { "epoch": 1.5087087087087085, "grad_norm": 0.4119352102279663, "learning_rate": 4.954405066250109e-05, "loss": 0.0614, "step": 12560 }, { "epoch": 1.5093093093093093, "grad_norm": 0.35041505098342896, "learning_rate": 4.951260467579943e-05, "loss": 0.0585, "step": 12565 }, { "epoch": 1.50990990990991, "grad_norm": 0.7188326120376587, "learning_rate": 4.948115888189929e-05, "loss": 0.0865, "step": 12570 }, { "epoch": 1.5105105105105106, "grad_norm": 0.5880985260009766, "learning_rate": 4.944971329323985e-05, "loss": 0.0846, "step": 12575 }, { "epoch": 1.511111111111111, "grad_norm": 0.6127240657806396, "learning_rate": 4.941826792226019e-05, "loss": 0.0846, "step": 12580 }, { "epoch": 1.5117117117117118, "grad_norm": 0.4787910580635071, "learning_rate": 4.9386822781399366e-05, "loss": 0.087, "step": 12585 }, { "epoch": 1.5123123123123123, "grad_norm": 0.35946008563041687, "learning_rate": 4.935537788309624e-05, "loss": 0.0624, "step": 12590 }, { "epoch": 1.5129129129129129, "grad_norm": 0.4871029555797577, "learning_rate": 4.932393323978967e-05, "loss": 0.09, "step": 12595 }, { "epoch": 1.5135135135135136, "grad_norm": 0.5066032409667969, "learning_rate": 4.929248886391835e-05, "loss": 0.0708, "step": 12600 }, { "epoch": 1.5141141141141141, "grad_norm": 0.470005065202713, "learning_rate": 4.926104476792092e-05, "loss": 0.0727, "step": 12605 }, { "epoch": 1.5147147147147146, "grad_norm": 0.5710893869400024, "learning_rate": 4.92296009642359e-05, "loss": 0.0747, "step": 12610 }, { "epoch": 1.5153153153153154, "grad_norm": 0.42586013674736023, "learning_rate": 4.9198157465301634e-05, "loss": 0.07, "step": 12615 }, { "epoch": 1.515915915915916, "grad_norm": 0.412024587392807, "learning_rate": 4.916671428355641e-05, "loss": 0.0739, "step": 12620 }, { "epoch": 1.5165165165165164, "grad_norm": 0.46768057346343994, "learning_rate": 4.91352714314384e-05, "loss": 0.0683, "step": 12625 }, { "epoch": 1.5171171171171172, "grad_norm": 0.45962801575660706, "learning_rate": 4.91038289213856e-05, "loss": 0.0791, "step": 12630 }, { "epoch": 1.5177177177177177, "grad_norm": 0.494783490896225, "learning_rate": 4.9072386765835864e-05, "loss": 0.0786, "step": 12635 }, { "epoch": 1.5183183183183182, "grad_norm": 0.43444254994392395, "learning_rate": 4.904094497722696e-05, "loss": 0.0796, "step": 12640 }, { "epoch": 1.518918918918919, "grad_norm": 0.3887892961502075, "learning_rate": 4.900950356799647e-05, "loss": 0.066, "step": 12645 }, { "epoch": 1.5195195195195195, "grad_norm": 0.4380321800708771, "learning_rate": 4.8978062550581825e-05, "loss": 0.0813, "step": 12650 }, { "epoch": 1.52012012012012, "grad_norm": 0.43813252449035645, "learning_rate": 4.8946621937420356e-05, "loss": 0.0791, "step": 12655 }, { "epoch": 1.5207207207207207, "grad_norm": 0.5307855606079102, "learning_rate": 4.891518174094914e-05, "loss": 0.0898, "step": 12660 }, { "epoch": 1.5213213213213215, "grad_norm": 0.39943432807922363, "learning_rate": 4.8883741973605155e-05, "loss": 0.0809, "step": 12665 }, { "epoch": 1.5219219219219218, "grad_norm": 0.41812124848365784, "learning_rate": 4.88523026478252e-05, "loss": 0.072, "step": 12670 }, { "epoch": 1.5225225225225225, "grad_norm": 0.44765257835388184, "learning_rate": 4.88208637760459e-05, "loss": 0.0709, "step": 12675 }, { "epoch": 1.5231231231231233, "grad_norm": 0.4284890294075012, "learning_rate": 4.8789425370703704e-05, "loss": 0.0681, "step": 12680 }, { "epoch": 1.5237237237237238, "grad_norm": 0.5334815382957458, "learning_rate": 4.875798744423483e-05, "loss": 0.0914, "step": 12685 }, { "epoch": 1.5243243243243243, "grad_norm": 0.4032062292098999, "learning_rate": 4.872655000907538e-05, "loss": 0.0882, "step": 12690 }, { "epoch": 1.524924924924925, "grad_norm": 0.5700933933258057, "learning_rate": 4.8695113077661195e-05, "loss": 0.0743, "step": 12695 }, { "epoch": 1.5255255255255256, "grad_norm": 0.42844948172569275, "learning_rate": 4.866367666242798e-05, "loss": 0.0768, "step": 12700 }, { "epoch": 1.526126126126126, "grad_norm": 0.5078549385070801, "learning_rate": 4.863224077581115e-05, "loss": 0.09, "step": 12705 }, { "epoch": 1.5267267267267268, "grad_norm": 0.4741128087043762, "learning_rate": 4.860080543024601e-05, "loss": 0.064, "step": 12710 }, { "epoch": 1.5273273273273273, "grad_norm": 0.5249758362770081, "learning_rate": 4.856937063816758e-05, "loss": 0.0949, "step": 12715 }, { "epoch": 1.5279279279279279, "grad_norm": 0.43285176157951355, "learning_rate": 4.85379364120107e-05, "loss": 0.0872, "step": 12720 }, { "epoch": 1.5285285285285286, "grad_norm": 0.4995947480201721, "learning_rate": 4.850650276420999e-05, "loss": 0.0961, "step": 12725 }, { "epoch": 1.5291291291291291, "grad_norm": 0.3818022310733795, "learning_rate": 4.847506970719977e-05, "loss": 0.0653, "step": 12730 }, { "epoch": 1.5297297297297296, "grad_norm": 0.39495208859443665, "learning_rate": 4.844363725341422e-05, "loss": 0.0686, "step": 12735 }, { "epoch": 1.5303303303303304, "grad_norm": 0.4333951771259308, "learning_rate": 4.841220541528722e-05, "loss": 0.0658, "step": 12740 }, { "epoch": 1.530930930930931, "grad_norm": 0.5166015625, "learning_rate": 4.838077420525243e-05, "loss": 0.0712, "step": 12745 }, { "epoch": 1.5315315315315314, "grad_norm": 0.40091192722320557, "learning_rate": 4.834934363574329e-05, "loss": 0.07, "step": 12750 }, { "epoch": 1.5315315315315314, "eval_loss": 0.08978892117738724, "eval_runtime": 35.9101, "eval_samples_per_second": 22.278, "eval_steps_per_second": 5.569, "step": 12750 }, { "epoch": 1.5321321321321322, "grad_norm": 0.43245700001716614, "learning_rate": 4.83179137191929e-05, "loss": 0.0847, "step": 12755 }, { "epoch": 1.5327327327327327, "grad_norm": 0.42760345339775085, "learning_rate": 4.828648446803419e-05, "loss": 0.069, "step": 12760 }, { "epoch": 1.5333333333333332, "grad_norm": 0.39846494793891907, "learning_rate": 4.825505589469978e-05, "loss": 0.0689, "step": 12765 }, { "epoch": 1.533933933933934, "grad_norm": 0.3754206597805023, "learning_rate": 4.8223628011622065e-05, "loss": 0.0635, "step": 12770 }, { "epoch": 1.5345345345345347, "grad_norm": 0.48671194911003113, "learning_rate": 4.819220083123311e-05, "loss": 0.0628, "step": 12775 }, { "epoch": 1.535135135135135, "grad_norm": 0.49007099866867065, "learning_rate": 4.8160774365964736e-05, "loss": 0.0774, "step": 12780 }, { "epoch": 1.5357357357357357, "grad_norm": 0.332457035779953, "learning_rate": 4.8129348628248455e-05, "loss": 0.0556, "step": 12785 }, { "epoch": 1.5363363363363365, "grad_norm": 0.4247899651527405, "learning_rate": 4.809792363051553e-05, "loss": 0.0746, "step": 12790 }, { "epoch": 1.5369369369369368, "grad_norm": 0.38403886556625366, "learning_rate": 4.806649938519694e-05, "loss": 0.073, "step": 12795 }, { "epoch": 1.5375375375375375, "grad_norm": 0.460376501083374, "learning_rate": 4.803507590472328e-05, "loss": 0.0722, "step": 12800 }, { "epoch": 1.5381381381381383, "grad_norm": 0.38304775953292847, "learning_rate": 4.800365320152493e-05, "loss": 0.0696, "step": 12805 }, { "epoch": 1.5387387387387388, "grad_norm": 0.41683074831962585, "learning_rate": 4.797223128803193e-05, "loss": 0.0595, "step": 12810 }, { "epoch": 1.5393393393393393, "grad_norm": 0.2956644296646118, "learning_rate": 4.794081017667401e-05, "loss": 0.0602, "step": 12815 }, { "epoch": 1.53993993993994, "grad_norm": 0.47854262590408325, "learning_rate": 4.7909389879880616e-05, "loss": 0.0694, "step": 12820 }, { "epoch": 1.5405405405405406, "grad_norm": 0.4116211533546448, "learning_rate": 4.7877970410080785e-05, "loss": 0.0743, "step": 12825 }, { "epoch": 1.541141141141141, "grad_norm": 0.4726904332637787, "learning_rate": 4.784655177970332e-05, "loss": 0.076, "step": 12830 }, { "epoch": 1.5417417417417418, "grad_norm": 0.33450406789779663, "learning_rate": 4.781513400117662e-05, "loss": 0.0484, "step": 12835 }, { "epoch": 1.5423423423423424, "grad_norm": 0.4221593737602234, "learning_rate": 4.7783717086928804e-05, "loss": 0.0601, "step": 12840 }, { "epoch": 1.5429429429429429, "grad_norm": 0.5398896932601929, "learning_rate": 4.775230104938764e-05, "loss": 0.0695, "step": 12845 }, { "epoch": 1.5435435435435436, "grad_norm": 0.49371886253356934, "learning_rate": 4.7720885900980494e-05, "loss": 0.0627, "step": 12850 }, { "epoch": 1.5441441441441441, "grad_norm": 0.4454942047595978, "learning_rate": 4.7689471654134447e-05, "loss": 0.074, "step": 12855 }, { "epoch": 1.5447447447447447, "grad_norm": 0.4327937364578247, "learning_rate": 4.765805832127618e-05, "loss": 0.0751, "step": 12860 }, { "epoch": 1.5453453453453454, "grad_norm": 0.3502213656902313, "learning_rate": 4.762664591483207e-05, "loss": 0.0719, "step": 12865 }, { "epoch": 1.545945945945946, "grad_norm": 0.42783692479133606, "learning_rate": 4.759523444722803e-05, "loss": 0.0676, "step": 12870 }, { "epoch": 1.5465465465465464, "grad_norm": 0.4517103135585785, "learning_rate": 4.75638239308897e-05, "loss": 0.077, "step": 12875 }, { "epoch": 1.5471471471471472, "grad_norm": 0.42299434542655945, "learning_rate": 4.753241437824228e-05, "loss": 0.0779, "step": 12880 }, { "epoch": 1.5477477477477477, "grad_norm": 0.5330052375793457, "learning_rate": 4.750100580171062e-05, "loss": 0.0673, "step": 12885 }, { "epoch": 1.5483483483483482, "grad_norm": 0.5104110836982727, "learning_rate": 4.74695982137192e-05, "loss": 0.0661, "step": 12890 }, { "epoch": 1.548948948948949, "grad_norm": 0.507620632648468, "learning_rate": 4.743819162669202e-05, "loss": 0.0619, "step": 12895 }, { "epoch": 1.5495495495495497, "grad_norm": 0.5362961888313293, "learning_rate": 4.740678605305281e-05, "loss": 0.0699, "step": 12900 }, { "epoch": 1.55015015015015, "grad_norm": 0.42854073643684387, "learning_rate": 4.73753815052248e-05, "loss": 0.0564, "step": 12905 }, { "epoch": 1.5507507507507508, "grad_norm": 0.47177237272262573, "learning_rate": 4.734397799563088e-05, "loss": 0.0686, "step": 12910 }, { "epoch": 1.5513513513513515, "grad_norm": 0.4933551251888275, "learning_rate": 4.731257553669348e-05, "loss": 0.07, "step": 12915 }, { "epoch": 1.5519519519519518, "grad_norm": 0.5049030184745789, "learning_rate": 4.7281174140834636e-05, "loss": 0.0729, "step": 12920 }, { "epoch": 1.5525525525525525, "grad_norm": 0.4711320996284485, "learning_rate": 4.7249773820475987e-05, "loss": 0.0658, "step": 12925 }, { "epoch": 1.5531531531531533, "grad_norm": 0.5016087889671326, "learning_rate": 4.7218374588038675e-05, "loss": 0.075, "step": 12930 }, { "epoch": 1.5537537537537538, "grad_norm": 0.5302407145500183, "learning_rate": 4.718697645594352e-05, "loss": 0.0592, "step": 12935 }, { "epoch": 1.5543543543543543, "grad_norm": 0.5613186359405518, "learning_rate": 4.7155579436610785e-05, "loss": 0.075, "step": 12940 }, { "epoch": 1.554954954954955, "grad_norm": 0.44849929213523865, "learning_rate": 4.712418354246038e-05, "loss": 0.0603, "step": 12945 }, { "epoch": 1.5555555555555556, "grad_norm": 0.442667156457901, "learning_rate": 4.7092788785911746e-05, "loss": 0.0596, "step": 12950 }, { "epoch": 1.556156156156156, "grad_norm": 0.6306538581848145, "learning_rate": 4.7061395179383875e-05, "loss": 0.071, "step": 12955 }, { "epoch": 1.5567567567567568, "grad_norm": 0.401419073343277, "learning_rate": 4.70300027352953e-05, "loss": 0.0638, "step": 12960 }, { "epoch": 1.5573573573573574, "grad_norm": 0.4501631259918213, "learning_rate": 4.699861146606408e-05, "loss": 0.0808, "step": 12965 }, { "epoch": 1.5579579579579579, "grad_norm": 0.37132251262664795, "learning_rate": 4.6967221384107836e-05, "loss": 0.0613, "step": 12970 }, { "epoch": 1.5585585585585586, "grad_norm": 0.42588886618614197, "learning_rate": 4.693583250184369e-05, "loss": 0.0649, "step": 12975 }, { "epoch": 1.5591591591591591, "grad_norm": 0.41766226291656494, "learning_rate": 4.690444483168833e-05, "loss": 0.0632, "step": 12980 }, { "epoch": 1.5597597597597597, "grad_norm": 0.39264434576034546, "learning_rate": 4.687305838605794e-05, "loss": 0.0656, "step": 12985 }, { "epoch": 1.5603603603603604, "grad_norm": 0.42573830485343933, "learning_rate": 4.684167317736819e-05, "loss": 0.0614, "step": 12990 }, { "epoch": 1.560960960960961, "grad_norm": 0.4736945629119873, "learning_rate": 4.681028921803432e-05, "loss": 0.0691, "step": 12995 }, { "epoch": 1.5615615615615615, "grad_norm": 0.4946117699146271, "learning_rate": 4.677890652047103e-05, "loss": 0.063, "step": 13000 }, { "epoch": 1.5615615615615615, "eval_loss": 0.08609585464000702, "eval_runtime": 35.6392, "eval_samples_per_second": 22.447, "eval_steps_per_second": 5.612, "step": 13000 }, { "epoch": 1.5621621621621622, "grad_norm": 0.4591105580329895, "learning_rate": 4.6747525097092576e-05, "loss": 0.0857, "step": 13005 }, { "epoch": 1.5627627627627627, "grad_norm": 0.3890763223171234, "learning_rate": 4.671614496031262e-05, "loss": 0.0667, "step": 13010 }, { "epoch": 1.5633633633633632, "grad_norm": 0.3825789988040924, "learning_rate": 4.66847661225444e-05, "loss": 0.0656, "step": 13015 }, { "epoch": 1.563963963963964, "grad_norm": 0.4688428044319153, "learning_rate": 4.665338859620059e-05, "loss": 0.0715, "step": 13020 }, { "epoch": 1.5645645645645647, "grad_norm": 0.5024927854537964, "learning_rate": 4.662201239369336e-05, "loss": 0.0813, "step": 13025 }, { "epoch": 1.565165165165165, "grad_norm": 0.37907472252845764, "learning_rate": 4.6590637527434394e-05, "loss": 0.0783, "step": 13030 }, { "epoch": 1.5657657657657658, "grad_norm": 0.48286497592926025, "learning_rate": 4.6559264009834765e-05, "loss": 0.0677, "step": 13035 }, { "epoch": 1.5663663663663665, "grad_norm": 0.3524858355522156, "learning_rate": 4.6527891853305085e-05, "loss": 0.0568, "step": 13040 }, { "epoch": 1.5669669669669668, "grad_norm": 0.5434703826904297, "learning_rate": 4.6496521070255395e-05, "loss": 0.0685, "step": 13045 }, { "epoch": 1.5675675675675675, "grad_norm": 0.36601707339286804, "learning_rate": 4.6465151673095195e-05, "loss": 0.0659, "step": 13050 }, { "epoch": 1.5681681681681683, "grad_norm": 0.33329319953918457, "learning_rate": 4.6433783674233485e-05, "loss": 0.0607, "step": 13055 }, { "epoch": 1.5687687687687688, "grad_norm": 0.4235140383243561, "learning_rate": 4.64024170860786e-05, "loss": 0.0594, "step": 13060 }, { "epoch": 1.5693693693693693, "grad_norm": 0.4310402572154999, "learning_rate": 4.637105192103843e-05, "loss": 0.0678, "step": 13065 }, { "epoch": 1.56996996996997, "grad_norm": 0.45694392919540405, "learning_rate": 4.633968819152024e-05, "loss": 0.0875, "step": 13070 }, { "epoch": 1.5705705705705706, "grad_norm": 0.4142746329307556, "learning_rate": 4.6308325909930775e-05, "loss": 0.0581, "step": 13075 }, { "epoch": 1.571171171171171, "grad_norm": 0.5073320865631104, "learning_rate": 4.6276965088676125e-05, "loss": 0.0664, "step": 13080 }, { "epoch": 1.5717717717717719, "grad_norm": 0.4360414743423462, "learning_rate": 4.624560574016188e-05, "loss": 0.0641, "step": 13085 }, { "epoch": 1.5723723723723724, "grad_norm": 0.4940846860408783, "learning_rate": 4.621424787679303e-05, "loss": 0.0697, "step": 13090 }, { "epoch": 1.572972972972973, "grad_norm": 0.4336908459663391, "learning_rate": 4.618289151097395e-05, "loss": 0.0718, "step": 13095 }, { "epoch": 1.5735735735735736, "grad_norm": 0.42787450551986694, "learning_rate": 4.615153665510849e-05, "loss": 0.0683, "step": 13100 }, { "epoch": 1.5741741741741742, "grad_norm": 0.509724497795105, "learning_rate": 4.612018332159979e-05, "loss": 0.064, "step": 13105 }, { "epoch": 1.5747747747747747, "grad_norm": 0.4025382399559021, "learning_rate": 4.6088831522850483e-05, "loss": 0.0632, "step": 13110 }, { "epoch": 1.5753753753753754, "grad_norm": 0.45108699798583984, "learning_rate": 4.605748127126256e-05, "loss": 0.0724, "step": 13115 }, { "epoch": 1.575975975975976, "grad_norm": 0.36071160435676575, "learning_rate": 4.6026132579237407e-05, "loss": 0.056, "step": 13120 }, { "epoch": 1.5765765765765765, "grad_norm": 0.3282938301563263, "learning_rate": 4.599478545917581e-05, "loss": 0.0519, "step": 13125 }, { "epoch": 1.5771771771771772, "grad_norm": 0.3939158320426941, "learning_rate": 4.596343992347787e-05, "loss": 0.0582, "step": 13130 }, { "epoch": 1.5777777777777777, "grad_norm": 0.3940871059894562, "learning_rate": 4.593209598454313e-05, "loss": 0.0726, "step": 13135 }, { "epoch": 1.5783783783783782, "grad_norm": 0.4171847105026245, "learning_rate": 4.5900753654770465e-05, "loss": 0.0695, "step": 13140 }, { "epoch": 1.578978978978979, "grad_norm": 0.4544351100921631, "learning_rate": 4.586941294655816e-05, "loss": 0.0688, "step": 13145 }, { "epoch": 1.5795795795795797, "grad_norm": 0.3604528307914734, "learning_rate": 4.583807387230377e-05, "loss": 0.0534, "step": 13150 }, { "epoch": 1.58018018018018, "grad_norm": 0.4082534611225128, "learning_rate": 4.5806736444404294e-05, "loss": 0.0808, "step": 13155 }, { "epoch": 1.5807807807807808, "grad_norm": 0.3927527070045471, "learning_rate": 4.577540067525602e-05, "loss": 0.0636, "step": 13160 }, { "epoch": 1.5813813813813815, "grad_norm": 0.4562763571739197, "learning_rate": 4.5744066577254615e-05, "loss": 0.0791, "step": 13165 }, { "epoch": 1.581981981981982, "grad_norm": 0.4318278431892395, "learning_rate": 4.57127341627951e-05, "loss": 0.0648, "step": 13170 }, { "epoch": 1.5825825825825826, "grad_norm": 0.3479551374912262, "learning_rate": 4.5681403444271736e-05, "loss": 0.0519, "step": 13175 }, { "epoch": 1.5831831831831833, "grad_norm": 0.5226278305053711, "learning_rate": 4.565007443407822e-05, "loss": 0.0629, "step": 13180 }, { "epoch": 1.5837837837837838, "grad_norm": 0.34603849053382874, "learning_rate": 4.561874714460753e-05, "loss": 0.0503, "step": 13185 }, { "epoch": 1.5843843843843843, "grad_norm": 0.46688374876976013, "learning_rate": 4.558742158825197e-05, "loss": 0.0591, "step": 13190 }, { "epoch": 1.584984984984985, "grad_norm": 0.43887630105018616, "learning_rate": 4.5556097777403154e-05, "loss": 0.0722, "step": 13195 }, { "epoch": 1.5855855855855856, "grad_norm": 0.42593011260032654, "learning_rate": 4.552477572445199e-05, "loss": 0.0594, "step": 13200 }, { "epoch": 1.5861861861861861, "grad_norm": 0.4000687897205353, "learning_rate": 4.549345544178873e-05, "loss": 0.0729, "step": 13205 }, { "epoch": 1.5867867867867869, "grad_norm": 0.4771914482116699, "learning_rate": 4.546213694180286e-05, "loss": 0.0646, "step": 13210 }, { "epoch": 1.5873873873873874, "grad_norm": 0.4418584108352661, "learning_rate": 4.543082023688324e-05, "loss": 0.0657, "step": 13215 }, { "epoch": 1.587987987987988, "grad_norm": 0.40844422578811646, "learning_rate": 4.5399505339418e-05, "loss": 0.0598, "step": 13220 }, { "epoch": 1.5885885885885886, "grad_norm": 0.4462772011756897, "learning_rate": 4.536819226179449e-05, "loss": 0.0741, "step": 13225 }, { "epoch": 1.5891891891891892, "grad_norm": 0.45399248600006104, "learning_rate": 4.5336881016399416e-05, "loss": 0.0626, "step": 13230 }, { "epoch": 1.5897897897897897, "grad_norm": 0.5026511549949646, "learning_rate": 4.530557161561871e-05, "loss": 0.0714, "step": 13235 }, { "epoch": 1.5903903903903904, "grad_norm": 0.4819416105747223, "learning_rate": 4.5274264071837646e-05, "loss": 0.0764, "step": 13240 }, { "epoch": 1.590990990990991, "grad_norm": 0.37962061166763306, "learning_rate": 4.524295839744065e-05, "loss": 0.0627, "step": 13245 }, { "epoch": 1.5915915915915915, "grad_norm": 0.5724044442176819, "learning_rate": 4.521165460481151e-05, "loss": 0.0727, "step": 13250 }, { "epoch": 1.5915915915915915, "eval_loss": 0.07784755527973175, "eval_runtime": 35.7222, "eval_samples_per_second": 22.395, "eval_steps_per_second": 5.599, "step": 13250 }, { "epoch": 1.5921921921921922, "grad_norm": 0.40502458810806274, "learning_rate": 4.518035270633321e-05, "loss": 0.0611, "step": 13255 }, { "epoch": 1.592792792792793, "grad_norm": 0.3639732599258423, "learning_rate": 4.514905271438802e-05, "loss": 0.0559, "step": 13260 }, { "epoch": 1.5933933933933933, "grad_norm": 0.5118473172187805, "learning_rate": 4.5117754641357455e-05, "loss": 0.066, "step": 13265 }, { "epoch": 1.593993993993994, "grad_norm": 0.46344873309135437, "learning_rate": 4.508645849962222e-05, "loss": 0.0582, "step": 13270 }, { "epoch": 1.5945945945945947, "grad_norm": 0.5766790509223938, "learning_rate": 4.505516430156232e-05, "loss": 0.0626, "step": 13275 }, { "epoch": 1.595195195195195, "grad_norm": 0.37287622690200806, "learning_rate": 4.502387205955695e-05, "loss": 0.0707, "step": 13280 }, { "epoch": 1.5957957957957958, "grad_norm": 0.4307993948459625, "learning_rate": 4.4992581785984574e-05, "loss": 0.063, "step": 13285 }, { "epoch": 1.5963963963963965, "grad_norm": 0.42525312304496765, "learning_rate": 4.496129349322282e-05, "loss": 0.0716, "step": 13290 }, { "epoch": 1.596996996996997, "grad_norm": 0.5460354685783386, "learning_rate": 4.493000719364857e-05, "loss": 0.0709, "step": 13295 }, { "epoch": 1.5975975975975976, "grad_norm": 0.3980024755001068, "learning_rate": 4.489872289963792e-05, "loss": 0.0592, "step": 13300 }, { "epoch": 1.5981981981981983, "grad_norm": 0.42124074697494507, "learning_rate": 4.486744062356614e-05, "loss": 0.076, "step": 13305 }, { "epoch": 1.5987987987987988, "grad_norm": 0.5677942037582397, "learning_rate": 4.483616037780776e-05, "loss": 0.0667, "step": 13310 }, { "epoch": 1.5993993993993993, "grad_norm": 0.4418148398399353, "learning_rate": 4.4804882174736425e-05, "loss": 0.0667, "step": 13315 }, { "epoch": 1.6, "grad_norm": 0.5681725740432739, "learning_rate": 4.477360602672504e-05, "loss": 0.0699, "step": 13320 }, { "epoch": 1.6006006006006006, "grad_norm": 0.43010297417640686, "learning_rate": 4.4742331946145673e-05, "loss": 0.0674, "step": 13325 }, { "epoch": 1.6012012012012011, "grad_norm": 0.5609509348869324, "learning_rate": 4.471105994536958e-05, "loss": 0.071, "step": 13330 }, { "epoch": 1.6018018018018019, "grad_norm": 0.4860755205154419, "learning_rate": 4.4679790036767205e-05, "loss": 0.0648, "step": 13335 }, { "epoch": 1.6024024024024024, "grad_norm": 0.487942099571228, "learning_rate": 4.464852223270811e-05, "loss": 0.0681, "step": 13340 }, { "epoch": 1.603003003003003, "grad_norm": 0.48040616512298584, "learning_rate": 4.46172565455611e-05, "loss": 0.0573, "step": 13345 }, { "epoch": 1.6036036036036037, "grad_norm": 0.36271554231643677, "learning_rate": 4.458599298769407e-05, "loss": 0.0625, "step": 13350 }, { "epoch": 1.6042042042042042, "grad_norm": 0.474427193403244, "learning_rate": 4.455473157147414e-05, "loss": 0.0513, "step": 13355 }, { "epoch": 1.6048048048048047, "grad_norm": 0.34347042441368103, "learning_rate": 4.452347230926757e-05, "loss": 0.0555, "step": 13360 }, { "epoch": 1.6054054054054054, "grad_norm": 0.4109949767589569, "learning_rate": 4.44922152134397e-05, "loss": 0.0632, "step": 13365 }, { "epoch": 1.606006006006006, "grad_norm": 0.35330331325531006, "learning_rate": 4.4460960296355074e-05, "loss": 0.0563, "step": 13370 }, { "epoch": 1.6066066066066065, "grad_norm": 0.24096760153770447, "learning_rate": 4.442970757037739e-05, "loss": 0.0563, "step": 13375 }, { "epoch": 1.6072072072072072, "grad_norm": 0.3883642256259918, "learning_rate": 4.439845704786945e-05, "loss": 0.0624, "step": 13380 }, { "epoch": 1.607807807807808, "grad_norm": 0.5604926943778992, "learning_rate": 4.436720874119316e-05, "loss": 0.07, "step": 13385 }, { "epoch": 1.6084084084084083, "grad_norm": 0.4338735342025757, "learning_rate": 4.433596266270959e-05, "loss": 0.0548, "step": 13390 }, { "epoch": 1.609009009009009, "grad_norm": 0.4170035421848297, "learning_rate": 4.430471882477891e-05, "loss": 0.066, "step": 13395 }, { "epoch": 1.6096096096096097, "grad_norm": 0.39704200625419617, "learning_rate": 4.427347723976042e-05, "loss": 0.0713, "step": 13400 }, { "epoch": 1.61021021021021, "grad_norm": 0.382910817861557, "learning_rate": 4.424223792001253e-05, "loss": 0.0567, "step": 13405 }, { "epoch": 1.6108108108108108, "grad_norm": 0.4715207815170288, "learning_rate": 4.42110008778927e-05, "loss": 0.054, "step": 13410 }, { "epoch": 1.6114114114114115, "grad_norm": 0.3665212392807007, "learning_rate": 4.417976612575755e-05, "loss": 0.0554, "step": 13415 }, { "epoch": 1.612012012012012, "grad_norm": 0.4208891689777374, "learning_rate": 4.4148533675962774e-05, "loss": 0.0689, "step": 13420 }, { "epoch": 1.6126126126126126, "grad_norm": 0.4575747847557068, "learning_rate": 4.411730354086318e-05, "loss": 0.0714, "step": 13425 }, { "epoch": 1.6132132132132133, "grad_norm": 0.42173752188682556, "learning_rate": 4.408607573281261e-05, "loss": 0.0634, "step": 13430 }, { "epoch": 1.6138138138138138, "grad_norm": 0.45820704102516174, "learning_rate": 4.4054850264164e-05, "loss": 0.0697, "step": 13435 }, { "epoch": 1.6144144144144144, "grad_norm": 0.4981316030025482, "learning_rate": 4.402362714726941e-05, "loss": 0.0629, "step": 13440 }, { "epoch": 1.615015015015015, "grad_norm": 0.4847612679004669, "learning_rate": 4.399240639447989e-05, "loss": 0.0536, "step": 13445 }, { "epoch": 1.6156156156156156, "grad_norm": 0.38429567217826843, "learning_rate": 4.3961188018145644e-05, "loss": 0.0674, "step": 13450 }, { "epoch": 1.6162162162162161, "grad_norm": 0.45922940969467163, "learning_rate": 4.3929972030615834e-05, "loss": 0.0707, "step": 13455 }, { "epoch": 1.6168168168168169, "grad_norm": 0.4565389156341553, "learning_rate": 4.389875844423876e-05, "loss": 0.0674, "step": 13460 }, { "epoch": 1.6174174174174174, "grad_norm": 0.46716755628585815, "learning_rate": 4.3867547271361745e-05, "loss": 0.0632, "step": 13465 }, { "epoch": 1.618018018018018, "grad_norm": 0.4541369676589966, "learning_rate": 4.383633852433116e-05, "loss": 0.0819, "step": 13470 }, { "epoch": 1.6186186186186187, "grad_norm": 0.427727073431015, "learning_rate": 4.380513221549242e-05, "loss": 0.0653, "step": 13475 }, { "epoch": 1.6192192192192192, "grad_norm": 0.4517422020435333, "learning_rate": 4.377392835718993e-05, "loss": 0.0643, "step": 13480 }, { "epoch": 1.6198198198198197, "grad_norm": 0.41785985231399536, "learning_rate": 4.37427269617672e-05, "loss": 0.0772, "step": 13485 }, { "epoch": 1.6204204204204204, "grad_norm": 0.5061773657798767, "learning_rate": 4.3711528041566705e-05, "loss": 0.0597, "step": 13490 }, { "epoch": 1.621021021021021, "grad_norm": 0.37822848558425903, "learning_rate": 4.368033160892998e-05, "loss": 0.0533, "step": 13495 }, { "epoch": 1.6216216216216215, "grad_norm": 0.4786316156387329, "learning_rate": 4.364913767619758e-05, "loss": 0.0591, "step": 13500 }, { "epoch": 1.6216216216216215, "eval_loss": 0.07384883612394333, "eval_runtime": 35.7195, "eval_samples_per_second": 22.397, "eval_steps_per_second": 5.599, "step": 13500 }, { "epoch": 1.6222222222222222, "grad_norm": 0.5200320482254028, "learning_rate": 4.361794625570901e-05, "loss": 0.064, "step": 13505 }, { "epoch": 1.622822822822823, "grad_norm": 0.4395144581794739, "learning_rate": 4.3586757359802835e-05, "loss": 0.0669, "step": 13510 }, { "epoch": 1.6234234234234233, "grad_norm": 0.3707331120967865, "learning_rate": 4.355557100081663e-05, "loss": 0.0507, "step": 13515 }, { "epoch": 1.624024024024024, "grad_norm": 0.4052436053752899, "learning_rate": 4.352438719108695e-05, "loss": 0.0515, "step": 13520 }, { "epoch": 1.6246246246246248, "grad_norm": 0.549052894115448, "learning_rate": 4.34932059429493e-05, "loss": 0.0661, "step": 13525 }, { "epoch": 1.6252252252252253, "grad_norm": 0.4563562572002411, "learning_rate": 4.346202726873825e-05, "loss": 0.0545, "step": 13530 }, { "epoch": 1.6258258258258258, "grad_norm": 0.5008108615875244, "learning_rate": 4.3430851180787274e-05, "loss": 0.0569, "step": 13535 }, { "epoch": 1.6264264264264265, "grad_norm": 0.5426075458526611, "learning_rate": 4.339967769142889e-05, "loss": 0.0797, "step": 13540 }, { "epoch": 1.627027027027027, "grad_norm": 0.4764677584171295, "learning_rate": 4.3368506812994555e-05, "loss": 0.0726, "step": 13545 }, { "epoch": 1.6276276276276276, "grad_norm": 0.463705450296402, "learning_rate": 4.333733855781468e-05, "loss": 0.0614, "step": 13550 }, { "epoch": 1.6282282282282283, "grad_norm": 0.43991366028785706, "learning_rate": 4.330617293821866e-05, "loss": 0.0702, "step": 13555 }, { "epoch": 1.6288288288288288, "grad_norm": 0.5054963231086731, "learning_rate": 4.327500996653485e-05, "loss": 0.0444, "step": 13560 }, { "epoch": 1.6294294294294294, "grad_norm": 0.4601389467716217, "learning_rate": 4.324384965509054e-05, "loss": 0.0626, "step": 13565 }, { "epoch": 1.63003003003003, "grad_norm": 0.4446878135204315, "learning_rate": 4.3212692016212006e-05, "loss": 0.0577, "step": 13570 }, { "epoch": 1.6306306306306306, "grad_norm": 0.3773181736469269, "learning_rate": 4.31815370622244e-05, "loss": 0.0589, "step": 13575 }, { "epoch": 1.6312312312312311, "grad_norm": 0.5154123306274414, "learning_rate": 4.315038480545188e-05, "loss": 0.0561, "step": 13580 }, { "epoch": 1.631831831831832, "grad_norm": 0.46419796347618103, "learning_rate": 4.3119235258217473e-05, "loss": 0.0561, "step": 13585 }, { "epoch": 1.6324324324324324, "grad_norm": 0.47348397970199585, "learning_rate": 4.308808843284322e-05, "loss": 0.0639, "step": 13590 }, { "epoch": 1.633033033033033, "grad_norm": 0.5072610378265381, "learning_rate": 4.305694434164997e-05, "loss": 0.0637, "step": 13595 }, { "epoch": 1.6336336336336337, "grad_norm": 0.4275527894496918, "learning_rate": 4.30258029969576e-05, "loss": 0.0658, "step": 13600 }, { "epoch": 1.6342342342342342, "grad_norm": 0.36067524552345276, "learning_rate": 4.299466441108484e-05, "loss": 0.0512, "step": 13605 }, { "epoch": 1.6348348348348347, "grad_norm": 0.45838814973831177, "learning_rate": 4.296352859634934e-05, "loss": 0.0711, "step": 13610 }, { "epoch": 1.6354354354354355, "grad_norm": 0.4847932755947113, "learning_rate": 4.293239556506768e-05, "loss": 0.0587, "step": 13615 }, { "epoch": 1.6360360360360362, "grad_norm": 0.48203244805336, "learning_rate": 4.290126532955529e-05, "loss": 0.0654, "step": 13620 }, { "epoch": 1.6366366366366365, "grad_norm": 0.4438318908214569, "learning_rate": 4.2870137902126534e-05, "loss": 0.0582, "step": 13625 }, { "epoch": 1.6372372372372372, "grad_norm": 0.4480768144130707, "learning_rate": 4.2839013295094634e-05, "loss": 0.0521, "step": 13630 }, { "epoch": 1.637837837837838, "grad_norm": 0.4005804657936096, "learning_rate": 4.280789152077173e-05, "loss": 0.0543, "step": 13635 }, { "epoch": 1.6384384384384383, "grad_norm": 0.3628470003604889, "learning_rate": 4.277677259146884e-05, "loss": 0.0559, "step": 13640 }, { "epoch": 1.639039039039039, "grad_norm": 0.5323807001113892, "learning_rate": 4.2745656519495796e-05, "loss": 0.0631, "step": 13645 }, { "epoch": 1.6396396396396398, "grad_norm": 0.5428612232208252, "learning_rate": 4.2714543317161374e-05, "loss": 0.0609, "step": 13650 }, { "epoch": 1.6402402402402403, "grad_norm": 0.5070378184318542, "learning_rate": 4.268343299677319e-05, "loss": 0.0522, "step": 13655 }, { "epoch": 1.6408408408408408, "grad_norm": 0.40902385115623474, "learning_rate": 4.265232557063772e-05, "loss": 0.0589, "step": 13660 }, { "epoch": 1.6414414414414416, "grad_norm": 0.412062406539917, "learning_rate": 4.262122105106028e-05, "loss": 0.0638, "step": 13665 }, { "epoch": 1.642042042042042, "grad_norm": 0.445654958486557, "learning_rate": 4.259011945034504e-05, "loss": 0.0729, "step": 13670 }, { "epoch": 1.6426426426426426, "grad_norm": 0.4355279803276062, "learning_rate": 4.2559020780795044e-05, "loss": 0.0561, "step": 13675 }, { "epoch": 1.6432432432432433, "grad_norm": 0.45573675632476807, "learning_rate": 4.252792505471214e-05, "loss": 0.0554, "step": 13680 }, { "epoch": 1.6438438438438439, "grad_norm": 0.49397581815719604, "learning_rate": 4.249683228439704e-05, "loss": 0.0579, "step": 13685 }, { "epoch": 1.6444444444444444, "grad_norm": 0.5173845291137695, "learning_rate": 4.246574248214925e-05, "loss": 0.0522, "step": 13690 }, { "epoch": 1.6450450450450451, "grad_norm": 0.45187321305274963, "learning_rate": 4.243465566026714e-05, "loss": 0.0629, "step": 13695 }, { "epoch": 1.6456456456456456, "grad_norm": 0.5049217939376831, "learning_rate": 4.240357183104789e-05, "loss": 0.0548, "step": 13700 }, { "epoch": 1.6462462462462462, "grad_norm": 0.3371676504611969, "learning_rate": 4.2372491006787495e-05, "loss": 0.0686, "step": 13705 }, { "epoch": 1.646846846846847, "grad_norm": 0.4380626678466797, "learning_rate": 4.2341413199780774e-05, "loss": 0.0609, "step": 13710 }, { "epoch": 1.6474474474474474, "grad_norm": 0.48923227190971375, "learning_rate": 4.2310338422321294e-05, "loss": 0.056, "step": 13715 }, { "epoch": 1.648048048048048, "grad_norm": 0.48743122816085815, "learning_rate": 4.22792666867015e-05, "loss": 0.0558, "step": 13720 }, { "epoch": 1.6486486486486487, "grad_norm": 0.4722169041633606, "learning_rate": 4.2248198005212594e-05, "loss": 0.0545, "step": 13725 }, { "epoch": 1.6492492492492492, "grad_norm": 0.5330705046653748, "learning_rate": 4.221713239014456e-05, "loss": 0.0596, "step": 13730 }, { "epoch": 1.6498498498498497, "grad_norm": 0.39164403080940247, "learning_rate": 4.218606985378624e-05, "loss": 0.0527, "step": 13735 }, { "epoch": 1.6504504504504505, "grad_norm": 0.5172113180160522, "learning_rate": 4.2155010408425145e-05, "loss": 0.0546, "step": 13740 }, { "epoch": 1.6510510510510512, "grad_norm": 0.5502013564109802, "learning_rate": 4.2123954066347636e-05, "loss": 0.0554, "step": 13745 }, { "epoch": 1.6516516516516515, "grad_norm": 0.38821086287498474, "learning_rate": 4.2092900839838844e-05, "loss": 0.0543, "step": 13750 }, { "epoch": 1.6516516516516515, "eval_loss": 0.06733085960149765, "eval_runtime": 35.9799, "eval_samples_per_second": 22.235, "eval_steps_per_second": 5.559, "step": 13750 }, { "epoch": 1.6522522522522523, "grad_norm": 0.43382570147514343, "learning_rate": 4.2061850741182677e-05, "loss": 0.0619, "step": 13755 }, { "epoch": 1.652852852852853, "grad_norm": 0.33621084690093994, "learning_rate": 4.203080378266173e-05, "loss": 0.0629, "step": 13760 }, { "epoch": 1.6534534534534533, "grad_norm": 0.34766513109207153, "learning_rate": 4.199975997655746e-05, "loss": 0.0477, "step": 13765 }, { "epoch": 1.654054054054054, "grad_norm": 0.45685961842536926, "learning_rate": 4.196871933515e-05, "loss": 0.0626, "step": 13770 }, { "epoch": 1.6546546546546548, "grad_norm": 0.40836599469184875, "learning_rate": 4.193768187071826e-05, "loss": 0.0474, "step": 13775 }, { "epoch": 1.6552552552552553, "grad_norm": 0.49722006916999817, "learning_rate": 4.190664759553993e-05, "loss": 0.0562, "step": 13780 }, { "epoch": 1.6558558558558558, "grad_norm": 0.4253612458705902, "learning_rate": 4.1875616521891354e-05, "loss": 0.0479, "step": 13785 }, { "epoch": 1.6564564564564566, "grad_norm": 0.4923230707645416, "learning_rate": 4.184458866204767e-05, "loss": 0.0603, "step": 13790 }, { "epoch": 1.657057057057057, "grad_norm": 0.4575973153114319, "learning_rate": 4.181356402828274e-05, "loss": 0.0563, "step": 13795 }, { "epoch": 1.6576576576576576, "grad_norm": 0.467523455619812, "learning_rate": 4.178254263286914e-05, "loss": 0.057, "step": 13800 }, { "epoch": 1.6582582582582583, "grad_norm": 0.4888710379600525, "learning_rate": 4.175152448807816e-05, "loss": 0.0565, "step": 13805 }, { "epoch": 1.6588588588588589, "grad_norm": 0.4854770004749298, "learning_rate": 4.1720509606179806e-05, "loss": 0.0607, "step": 13810 }, { "epoch": 1.6594594594594594, "grad_norm": 0.47110292315483093, "learning_rate": 4.1689497999442804e-05, "loss": 0.0509, "step": 13815 }, { "epoch": 1.6600600600600601, "grad_norm": 0.35635825991630554, "learning_rate": 4.165848968013457e-05, "loss": 0.0622, "step": 13820 }, { "epoch": 1.6606606606606606, "grad_norm": 0.3194577395915985, "learning_rate": 4.162748466052126e-05, "loss": 0.0562, "step": 13825 }, { "epoch": 1.6612612612612612, "grad_norm": 0.4404977262020111, "learning_rate": 4.1596482952867645e-05, "loss": 0.0429, "step": 13830 }, { "epoch": 1.661861861861862, "grad_norm": 0.45366171002388, "learning_rate": 4.156548456943724e-05, "loss": 0.0594, "step": 13835 }, { "epoch": 1.6624624624624624, "grad_norm": 0.3973150849342346, "learning_rate": 4.1534489522492263e-05, "loss": 0.0479, "step": 13840 }, { "epoch": 1.663063063063063, "grad_norm": 0.4103251099586487, "learning_rate": 4.150349782429357e-05, "loss": 0.0499, "step": 13845 }, { "epoch": 1.6636636636636637, "grad_norm": 0.5832968354225159, "learning_rate": 4.1472509487100734e-05, "loss": 0.044, "step": 13850 }, { "epoch": 1.6642642642642642, "grad_norm": 0.3851698935031891, "learning_rate": 4.144152452317194e-05, "loss": 0.0533, "step": 13855 }, { "epoch": 1.6648648648648647, "grad_norm": 0.5267958045005798, "learning_rate": 4.1410542944764084e-05, "loss": 0.0743, "step": 13860 }, { "epoch": 1.6654654654654655, "grad_norm": 0.36634647846221924, "learning_rate": 4.137956476413271e-05, "loss": 0.0599, "step": 13865 }, { "epoch": 1.6660660660660662, "grad_norm": 0.4430139362812042, "learning_rate": 4.134858999353202e-05, "loss": 0.0642, "step": 13870 }, { "epoch": 1.6666666666666665, "grad_norm": 0.5938952565193176, "learning_rate": 4.1317618645214894e-05, "loss": 0.0616, "step": 13875 }, { "epoch": 1.6672672672672673, "grad_norm": 0.3854408860206604, "learning_rate": 4.128665073143278e-05, "loss": 0.0534, "step": 13880 }, { "epoch": 1.667867867867868, "grad_norm": 0.4133352041244507, "learning_rate": 4.1255686264435846e-05, "loss": 0.0561, "step": 13885 }, { "epoch": 1.6684684684684683, "grad_norm": 0.49693113565444946, "learning_rate": 4.1224725256472856e-05, "loss": 0.0647, "step": 13890 }, { "epoch": 1.669069069069069, "grad_norm": 0.3022277057170868, "learning_rate": 4.119376771979125e-05, "loss": 0.0626, "step": 13895 }, { "epoch": 1.6696696696696698, "grad_norm": 0.3540533185005188, "learning_rate": 4.116281366663702e-05, "loss": 0.0561, "step": 13900 }, { "epoch": 1.6702702702702703, "grad_norm": 0.4460397958755493, "learning_rate": 4.113186310925487e-05, "loss": 0.0531, "step": 13905 }, { "epoch": 1.6708708708708708, "grad_norm": 0.40761151909828186, "learning_rate": 4.110091605988801e-05, "loss": 0.049, "step": 13910 }, { "epoch": 1.6714714714714716, "grad_norm": 0.4298311769962311, "learning_rate": 4.106997253077837e-05, "loss": 0.0654, "step": 13915 }, { "epoch": 1.672072072072072, "grad_norm": 0.3780181407928467, "learning_rate": 4.103903253416647e-05, "loss": 0.0598, "step": 13920 }, { "epoch": 1.6726726726726726, "grad_norm": 0.42668402194976807, "learning_rate": 4.100809608229134e-05, "loss": 0.0531, "step": 13925 }, { "epoch": 1.6732732732732734, "grad_norm": 0.4466029107570648, "learning_rate": 4.09771631873907e-05, "loss": 0.0599, "step": 13930 }, { "epoch": 1.6738738738738739, "grad_norm": 0.3829571604728699, "learning_rate": 4.0946233861700856e-05, "loss": 0.0553, "step": 13935 }, { "epoch": 1.6744744744744744, "grad_norm": 0.36364227533340454, "learning_rate": 4.091530811745667e-05, "loss": 0.065, "step": 13940 }, { "epoch": 1.6750750750750751, "grad_norm": 0.43205782771110535, "learning_rate": 4.088438596689162e-05, "loss": 0.0586, "step": 13945 }, { "epoch": 1.6756756756756757, "grad_norm": 0.44587403535842896, "learning_rate": 4.0853467422237705e-05, "loss": 0.0506, "step": 13950 }, { "epoch": 1.6762762762762762, "grad_norm": 0.36174246668815613, "learning_rate": 4.082255249572557e-05, "loss": 0.0494, "step": 13955 }, { "epoch": 1.676876876876877, "grad_norm": 0.4244382977485657, "learning_rate": 4.0791641199584364e-05, "loss": 0.0497, "step": 13960 }, { "epoch": 1.6774774774774774, "grad_norm": 0.5707062482833862, "learning_rate": 4.0760733546041864e-05, "loss": 0.0536, "step": 13965 }, { "epoch": 1.678078078078078, "grad_norm": 0.3987087309360504, "learning_rate": 4.0729829547324335e-05, "loss": 0.043, "step": 13970 }, { "epoch": 1.6786786786786787, "grad_norm": 0.4837443232536316, "learning_rate": 4.069892921565666e-05, "loss": 0.0582, "step": 13975 }, { "epoch": 1.6792792792792792, "grad_norm": 0.41558149456977844, "learning_rate": 4.066803256326223e-05, "loss": 0.0588, "step": 13980 }, { "epoch": 1.6798798798798797, "grad_norm": 0.6192619800567627, "learning_rate": 4.0637139602363006e-05, "loss": 0.0582, "step": 13985 }, { "epoch": 1.6804804804804805, "grad_norm": 0.45900431275367737, "learning_rate": 4.06062503451795e-05, "loss": 0.0616, "step": 13990 }, { "epoch": 1.6810810810810812, "grad_norm": 0.5000198483467102, "learning_rate": 4.057536480393068e-05, "loss": 0.0608, "step": 13995 }, { "epoch": 1.6816816816816815, "grad_norm": 0.48604995012283325, "learning_rate": 4.054448299083415e-05, "loss": 0.0533, "step": 14000 }, { "epoch": 1.6816816816816815, "eval_loss": 0.06777658313512802, "eval_runtime": 35.9853, "eval_samples_per_second": 22.231, "eval_steps_per_second": 5.558, "step": 14000 }, { "epoch": 1.6822822822822823, "grad_norm": 0.46982526779174805, "learning_rate": 4.0513604918105966e-05, "loss": 0.0569, "step": 14005 }, { "epoch": 1.682882882882883, "grad_norm": 0.29595673084259033, "learning_rate": 4.048273059796074e-05, "loss": 0.0429, "step": 14010 }, { "epoch": 1.6834834834834835, "grad_norm": 0.5032521486282349, "learning_rate": 4.04518600426116e-05, "loss": 0.0484, "step": 14015 }, { "epoch": 1.684084084084084, "grad_norm": 0.4738074839115143, "learning_rate": 4.042099326427014e-05, "loss": 0.0511, "step": 14020 }, { "epoch": 1.6846846846846848, "grad_norm": 0.5076733231544495, "learning_rate": 4.039013027514651e-05, "loss": 0.0563, "step": 14025 }, { "epoch": 1.6852852852852853, "grad_norm": 0.45880991220474243, "learning_rate": 4.035927108744935e-05, "loss": 0.0474, "step": 14030 }, { "epoch": 1.6858858858858858, "grad_norm": 0.3994792699813843, "learning_rate": 4.0328415713385805e-05, "loss": 0.0481, "step": 14035 }, { "epoch": 1.6864864864864866, "grad_norm": 0.5211197137832642, "learning_rate": 4.029756416516145e-05, "loss": 0.0543, "step": 14040 }, { "epoch": 1.687087087087087, "grad_norm": 0.3836585283279419, "learning_rate": 4.026671645498044e-05, "loss": 0.0536, "step": 14045 }, { "epoch": 1.6876876876876876, "grad_norm": 0.45789793133735657, "learning_rate": 4.023587259504533e-05, "loss": 0.0495, "step": 14050 }, { "epoch": 1.6882882882882884, "grad_norm": 0.4536382853984833, "learning_rate": 4.0205032597557214e-05, "loss": 0.0537, "step": 14055 }, { "epoch": 1.6888888888888889, "grad_norm": 0.45672139525413513, "learning_rate": 4.017419647471564e-05, "loss": 0.0475, "step": 14060 }, { "epoch": 1.6894894894894894, "grad_norm": 0.4610535502433777, "learning_rate": 4.0143364238718564e-05, "loss": 0.0578, "step": 14065 }, { "epoch": 1.6900900900900901, "grad_norm": 0.3575139045715332, "learning_rate": 4.011253590176249e-05, "loss": 0.0474, "step": 14070 }, { "epoch": 1.6906906906906907, "grad_norm": 0.4532896876335144, "learning_rate": 4.0081711476042345e-05, "loss": 0.0518, "step": 14075 }, { "epoch": 1.6912912912912912, "grad_norm": 0.41786789894104004, "learning_rate": 4.00508909737515e-05, "loss": 0.0579, "step": 14080 }, { "epoch": 1.691891891891892, "grad_norm": 0.3306769132614136, "learning_rate": 4.0020074407081814e-05, "loss": 0.0541, "step": 14085 }, { "epoch": 1.6924924924924925, "grad_norm": 0.38540539145469666, "learning_rate": 3.998926178822351e-05, "loss": 0.0446, "step": 14090 }, { "epoch": 1.693093093093093, "grad_norm": 0.4451224207878113, "learning_rate": 3.995845312936532e-05, "loss": 0.0501, "step": 14095 }, { "epoch": 1.6936936936936937, "grad_norm": 0.49187320470809937, "learning_rate": 3.9927648442694375e-05, "loss": 0.0673, "step": 14100 }, { "epoch": 1.6942942942942945, "grad_norm": 0.3813941478729248, "learning_rate": 3.989684774039626e-05, "loss": 0.0519, "step": 14105 }, { "epoch": 1.6948948948948948, "grad_norm": 0.34133368730545044, "learning_rate": 3.9866051034654985e-05, "loss": 0.0622, "step": 14110 }, { "epoch": 1.6954954954954955, "grad_norm": 0.4624498784542084, "learning_rate": 3.983525833765292e-05, "loss": 0.0495, "step": 14115 }, { "epoch": 1.6960960960960962, "grad_norm": 0.39160802960395813, "learning_rate": 3.980446966157091e-05, "loss": 0.0554, "step": 14120 }, { "epoch": 1.6966966966966965, "grad_norm": 0.3328275680541992, "learning_rate": 3.977368501858821e-05, "loss": 0.0529, "step": 14125 }, { "epoch": 1.6972972972972973, "grad_norm": 0.4307607114315033, "learning_rate": 3.974290442088248e-05, "loss": 0.0498, "step": 14130 }, { "epoch": 1.697897897897898, "grad_norm": 0.41966062784194946, "learning_rate": 3.97121278806297e-05, "loss": 0.0501, "step": 14135 }, { "epoch": 1.6984984984984985, "grad_norm": 0.4566977918148041, "learning_rate": 3.968135541000435e-05, "loss": 0.0525, "step": 14140 }, { "epoch": 1.699099099099099, "grad_norm": 0.36930787563323975, "learning_rate": 3.965058702117925e-05, "loss": 0.044, "step": 14145 }, { "epoch": 1.6996996996996998, "grad_norm": 0.46633774042129517, "learning_rate": 3.961982272632561e-05, "loss": 0.0537, "step": 14150 }, { "epoch": 1.7003003003003003, "grad_norm": 0.5075660347938538, "learning_rate": 3.9589062537613044e-05, "loss": 0.0535, "step": 14155 }, { "epoch": 1.7009009009009008, "grad_norm": 0.4399413764476776, "learning_rate": 3.955830646720948e-05, "loss": 0.0443, "step": 14160 }, { "epoch": 1.7015015015015016, "grad_norm": 0.33186614513397217, "learning_rate": 3.952755452728128e-05, "loss": 0.0537, "step": 14165 }, { "epoch": 1.702102102102102, "grad_norm": 0.4482559263706207, "learning_rate": 3.949680672999315e-05, "loss": 0.0401, "step": 14170 }, { "epoch": 1.7027027027027026, "grad_norm": 0.34379854798316956, "learning_rate": 3.9466063087508156e-05, "loss": 0.0497, "step": 14175 }, { "epoch": 1.7033033033033034, "grad_norm": 0.2629309892654419, "learning_rate": 3.9435323611987736e-05, "loss": 0.0488, "step": 14180 }, { "epoch": 1.703903903903904, "grad_norm": 0.48224538564682007, "learning_rate": 3.9404588315591634e-05, "loss": 0.0467, "step": 14185 }, { "epoch": 1.7045045045045044, "grad_norm": 0.6717689037322998, "learning_rate": 3.9373857210478e-05, "loss": 0.0719, "step": 14190 }, { "epoch": 1.7051051051051052, "grad_norm": 0.48857033252716064, "learning_rate": 3.934313030880328e-05, "loss": 0.0556, "step": 14195 }, { "epoch": 1.7057057057057057, "grad_norm": 0.38591575622558594, "learning_rate": 3.93124076227223e-05, "loss": 0.0458, "step": 14200 }, { "epoch": 1.7063063063063062, "grad_norm": 0.34425076842308044, "learning_rate": 3.928168916438815e-05, "loss": 0.0492, "step": 14205 }, { "epoch": 1.706906906906907, "grad_norm": 0.427339106798172, "learning_rate": 3.925097494595233e-05, "loss": 0.0556, "step": 14210 }, { "epoch": 1.7075075075075075, "grad_norm": 0.4965413510799408, "learning_rate": 3.9220264979564604e-05, "loss": 0.054, "step": 14215 }, { "epoch": 1.708108108108108, "grad_norm": 0.5718177556991577, "learning_rate": 3.918955927737308e-05, "loss": 0.0571, "step": 14220 }, { "epoch": 1.7087087087087087, "grad_norm": 0.45786628127098083, "learning_rate": 3.91588578515242e-05, "loss": 0.0498, "step": 14225 }, { "epoch": 1.7093093093093095, "grad_norm": 0.35944420099258423, "learning_rate": 3.912816071416264e-05, "loss": 0.0546, "step": 14230 }, { "epoch": 1.7099099099099098, "grad_norm": 0.39434975385665894, "learning_rate": 3.909746787743147e-05, "loss": 0.0507, "step": 14235 }, { "epoch": 1.7105105105105105, "grad_norm": 0.4029919505119324, "learning_rate": 3.906677935347197e-05, "loss": 0.0489, "step": 14240 }, { "epoch": 1.7111111111111112, "grad_norm": 0.4398517310619354, "learning_rate": 3.903609515442379e-05, "loss": 0.0551, "step": 14245 }, { "epoch": 1.7117117117117115, "grad_norm": 0.47671040892601013, "learning_rate": 3.900541529242487e-05, "loss": 0.0511, "step": 14250 }, { "epoch": 1.7117117117117115, "eval_loss": 0.06355729699134827, "eval_runtime": 35.8789, "eval_samples_per_second": 22.297, "eval_steps_per_second": 5.574, "step": 14250 }, { "epoch": 1.7123123123123123, "grad_norm": 0.30654531717300415, "learning_rate": 3.897473977961134e-05, "loss": 0.0506, "step": 14255 }, { "epoch": 1.712912912912913, "grad_norm": 0.49727270007133484, "learning_rate": 3.89440686281177e-05, "loss": 0.0593, "step": 14260 }, { "epoch": 1.7135135135135136, "grad_norm": 0.40689897537231445, "learning_rate": 3.89134018500767e-05, "loss": 0.05, "step": 14265 }, { "epoch": 1.714114114114114, "grad_norm": 0.38134849071502686, "learning_rate": 3.8882739457619375e-05, "loss": 0.0519, "step": 14270 }, { "epoch": 1.7147147147147148, "grad_norm": 0.45295876264572144, "learning_rate": 3.885208146287498e-05, "loss": 0.0519, "step": 14275 }, { "epoch": 1.7153153153153153, "grad_norm": 0.3134695887565613, "learning_rate": 3.882142787797108e-05, "loss": 0.0438, "step": 14280 }, { "epoch": 1.7159159159159159, "grad_norm": 0.4081864655017853, "learning_rate": 3.879077871503344e-05, "loss": 0.0499, "step": 14285 }, { "epoch": 1.7165165165165166, "grad_norm": 0.3634887933731079, "learning_rate": 3.876013398618615e-05, "loss": 0.0386, "step": 14290 }, { "epoch": 1.7171171171171171, "grad_norm": 0.5056455135345459, "learning_rate": 3.87294937035515e-05, "loss": 0.0562, "step": 14295 }, { "epoch": 1.7177177177177176, "grad_norm": 0.39153170585632324, "learning_rate": 3.869885787925e-05, "loss": 0.0466, "step": 14300 }, { "epoch": 1.7183183183183184, "grad_norm": 0.5252231359481812, "learning_rate": 3.866822652540044e-05, "loss": 0.0501, "step": 14305 }, { "epoch": 1.718918918918919, "grad_norm": 0.6176850199699402, "learning_rate": 3.863759965411981e-05, "loss": 0.0549, "step": 14310 }, { "epoch": 1.7195195195195194, "grad_norm": 0.9495644569396973, "learning_rate": 3.8606977277523374e-05, "loss": 0.0515, "step": 14315 }, { "epoch": 1.7201201201201202, "grad_norm": 0.5057100653648376, "learning_rate": 3.857635940772459e-05, "loss": 0.044, "step": 14320 }, { "epoch": 1.7207207207207207, "grad_norm": 0.4010032117366791, "learning_rate": 3.854574605683508e-05, "loss": 0.0542, "step": 14325 }, { "epoch": 1.7213213213213212, "grad_norm": 0.3846031725406647, "learning_rate": 3.8515137236964766e-05, "loss": 0.054, "step": 14330 }, { "epoch": 1.721921921921922, "grad_norm": 0.3945229649543762, "learning_rate": 3.848453296022172e-05, "loss": 0.0583, "step": 14335 }, { "epoch": 1.7225225225225225, "grad_norm": 0.3763466477394104, "learning_rate": 3.845393323871226e-05, "loss": 0.061, "step": 14340 }, { "epoch": 1.723123123123123, "grad_norm": 0.3965177536010742, "learning_rate": 3.842333808454084e-05, "loss": 0.0441, "step": 14345 }, { "epoch": 1.7237237237237237, "grad_norm": 0.4570268988609314, "learning_rate": 3.839274750981017e-05, "loss": 0.0512, "step": 14350 }, { "epoch": 1.7243243243243245, "grad_norm": 0.4227088987827301, "learning_rate": 3.8362161526621115e-05, "loss": 0.0453, "step": 14355 }, { "epoch": 1.7249249249249248, "grad_norm": 0.40480488538742065, "learning_rate": 3.8331580147072746e-05, "loss": 0.0454, "step": 14360 }, { "epoch": 1.7255255255255255, "grad_norm": 0.37806281447410583, "learning_rate": 3.830100338326231e-05, "loss": 0.0516, "step": 14365 }, { "epoch": 1.7261261261261263, "grad_norm": 0.3402007520198822, "learning_rate": 3.8270431247285174e-05, "loss": 0.0398, "step": 14370 }, { "epoch": 1.7267267267267268, "grad_norm": 0.4493679702281952, "learning_rate": 3.8239863751234956e-05, "loss": 0.0539, "step": 14375 }, { "epoch": 1.7273273273273273, "grad_norm": 0.4124431312084198, "learning_rate": 3.8209300907203376e-05, "loss": 0.043, "step": 14380 }, { "epoch": 1.727927927927928, "grad_norm": 0.41435253620147705, "learning_rate": 3.817874272728035e-05, "loss": 0.0432, "step": 14385 }, { "epoch": 1.7285285285285286, "grad_norm": 0.43934547901153564, "learning_rate": 3.814818922355396e-05, "loss": 0.0524, "step": 14390 }, { "epoch": 1.729129129129129, "grad_norm": 0.30731597542762756, "learning_rate": 3.8117640408110366e-05, "loss": 0.0405, "step": 14395 }, { "epoch": 1.7297297297297298, "grad_norm": 0.5254538655281067, "learning_rate": 3.8087096293033954e-05, "loss": 0.0502, "step": 14400 }, { "epoch": 1.7303303303303303, "grad_norm": 0.4425089359283447, "learning_rate": 3.805655689040721e-05, "loss": 0.0506, "step": 14405 }, { "epoch": 1.7309309309309309, "grad_norm": 0.305950790643692, "learning_rate": 3.80260222123108e-05, "loss": 0.0504, "step": 14410 }, { "epoch": 1.7315315315315316, "grad_norm": 0.32326582074165344, "learning_rate": 3.799549227082343e-05, "loss": 0.0387, "step": 14415 }, { "epoch": 1.7321321321321321, "grad_norm": 0.37440550327301025, "learning_rate": 3.796496707802202e-05, "loss": 0.041, "step": 14420 }, { "epoch": 1.7327327327327327, "grad_norm": 0.5314804315567017, "learning_rate": 3.7934446645981566e-05, "loss": 0.0556, "step": 14425 }, { "epoch": 1.7333333333333334, "grad_norm": 0.3879922330379486, "learning_rate": 3.7903930986775206e-05, "loss": 0.0486, "step": 14430 }, { "epoch": 1.733933933933934, "grad_norm": 0.45070913434028625, "learning_rate": 3.78734201124742e-05, "loss": 0.0492, "step": 14435 }, { "epoch": 1.7345345345345344, "grad_norm": 0.4279508590698242, "learning_rate": 3.784291403514785e-05, "loss": 0.0446, "step": 14440 }, { "epoch": 1.7351351351351352, "grad_norm": 0.3520037829875946, "learning_rate": 3.781241276686362e-05, "loss": 0.0504, "step": 14445 }, { "epoch": 1.7357357357357357, "grad_norm": 0.4254855811595917, "learning_rate": 3.778191631968707e-05, "loss": 0.0467, "step": 14450 }, { "epoch": 1.7363363363363362, "grad_norm": 0.4110926687717438, "learning_rate": 3.775142470568183e-05, "loss": 0.044, "step": 14455 }, { "epoch": 1.736936936936937, "grad_norm": 0.4648517072200775, "learning_rate": 3.772093793690964e-05, "loss": 0.0519, "step": 14460 }, { "epoch": 1.7375375375375377, "grad_norm": 0.3212833106517792, "learning_rate": 3.7690456025430295e-05, "loss": 0.0561, "step": 14465 }, { "epoch": 1.738138138138138, "grad_norm": 0.4225824177265167, "learning_rate": 3.765997898330169e-05, "loss": 0.0435, "step": 14470 }, { "epoch": 1.7387387387387387, "grad_norm": 0.3474774956703186, "learning_rate": 3.762950682257978e-05, "loss": 0.0505, "step": 14475 }, { "epoch": 1.7393393393393395, "grad_norm": 0.4199853837490082, "learning_rate": 3.7599039555318626e-05, "loss": 0.0493, "step": 14480 }, { "epoch": 1.7399399399399398, "grad_norm": 0.44869092106819153, "learning_rate": 3.756857719357027e-05, "loss": 0.0411, "step": 14485 }, { "epoch": 1.7405405405405405, "grad_norm": 0.45615318417549133, "learning_rate": 3.753811974938491e-05, "loss": 0.0507, "step": 14490 }, { "epoch": 1.7411411411411413, "grad_norm": 0.43175235390663147, "learning_rate": 3.750766723481076e-05, "loss": 0.0599, "step": 14495 }, { "epoch": 1.7417417417417418, "grad_norm": 0.42501696944236755, "learning_rate": 3.747721966189405e-05, "loss": 0.0462, "step": 14500 }, { "epoch": 1.7417417417417418, "eval_loss": 0.060369253158569336, "eval_runtime": 35.8652, "eval_samples_per_second": 22.306, "eval_steps_per_second": 5.576, "step": 14500 }, { "epoch": 1.7423423423423423, "grad_norm": 0.4761236011981964, "learning_rate": 3.744677704267913e-05, "loss": 0.0526, "step": 14505 }, { "epoch": 1.742942942942943, "grad_norm": 0.4406774640083313, "learning_rate": 3.74163393892083e-05, "loss": 0.0474, "step": 14510 }, { "epoch": 1.7435435435435436, "grad_norm": 0.5392323732376099, "learning_rate": 3.738590671352197e-05, "loss": 0.0529, "step": 14515 }, { "epoch": 1.744144144144144, "grad_norm": 0.37301626801490784, "learning_rate": 3.735547902765853e-05, "loss": 0.0469, "step": 14520 }, { "epoch": 1.7447447447447448, "grad_norm": 0.5395182967185974, "learning_rate": 3.732505634365443e-05, "loss": 0.0435, "step": 14525 }, { "epoch": 1.7453453453453454, "grad_norm": 0.44104713201522827, "learning_rate": 3.729463867354417e-05, "loss": 0.0456, "step": 14530 }, { "epoch": 1.7459459459459459, "grad_norm": 0.3940773606300354, "learning_rate": 3.726422602936016e-05, "loss": 0.0501, "step": 14535 }, { "epoch": 1.7465465465465466, "grad_norm": 0.3776192367076874, "learning_rate": 3.7233818423132915e-05, "loss": 0.0433, "step": 14540 }, { "epoch": 1.7471471471471471, "grad_norm": 0.43645375967025757, "learning_rate": 3.720341586689095e-05, "loss": 0.0536, "step": 14545 }, { "epoch": 1.7477477477477477, "grad_norm": 0.4166800081729889, "learning_rate": 3.7173018372660745e-05, "loss": 0.0438, "step": 14550 }, { "epoch": 1.7483483483483484, "grad_norm": 0.32177260518074036, "learning_rate": 3.7142625952466805e-05, "loss": 0.0443, "step": 14555 }, { "epoch": 1.748948948948949, "grad_norm": 0.5205382108688354, "learning_rate": 3.71122386183316e-05, "loss": 0.0444, "step": 14560 }, { "epoch": 1.7495495495495494, "grad_norm": 0.37180835008621216, "learning_rate": 3.708185638227564e-05, "loss": 0.0368, "step": 14565 }, { "epoch": 1.7501501501501502, "grad_norm": 0.39908167719841003, "learning_rate": 3.7051479256317345e-05, "loss": 0.054, "step": 14570 }, { "epoch": 1.7507507507507507, "grad_norm": 0.39011678099632263, "learning_rate": 3.7021107252473196e-05, "loss": 0.0476, "step": 14575 }, { "epoch": 1.7513513513513512, "grad_norm": 0.29879021644592285, "learning_rate": 3.699074038275756e-05, "loss": 0.04, "step": 14580 }, { "epoch": 1.751951951951952, "grad_norm": 0.42431557178497314, "learning_rate": 3.696037865918285e-05, "loss": 0.0525, "step": 14585 }, { "epoch": 1.7525525525525527, "grad_norm": 0.384691059589386, "learning_rate": 3.69300220937594e-05, "loss": 0.0459, "step": 14590 }, { "epoch": 1.753153153153153, "grad_norm": 0.4510609805583954, "learning_rate": 3.689967069849552e-05, "loss": 0.0571, "step": 14595 }, { "epoch": 1.7537537537537538, "grad_norm": 0.3073751628398895, "learning_rate": 3.686932448539749e-05, "loss": 0.0455, "step": 14600 }, { "epoch": 1.7543543543543545, "grad_norm": 0.443437397480011, "learning_rate": 3.683898346646948e-05, "loss": 0.0393, "step": 14605 }, { "epoch": 1.7549549549549548, "grad_norm": 0.36913490295410156, "learning_rate": 3.6808647653713676e-05, "loss": 0.0387, "step": 14610 }, { "epoch": 1.7555555555555555, "grad_norm": 0.3835843801498413, "learning_rate": 3.677831705913016e-05, "loss": 0.0448, "step": 14615 }, { "epoch": 1.7561561561561563, "grad_norm": 0.5466217994689941, "learning_rate": 3.674799169471695e-05, "loss": 0.0567, "step": 14620 }, { "epoch": 1.7567567567567568, "grad_norm": 0.4442767798900604, "learning_rate": 3.671767157247007e-05, "loss": 0.0536, "step": 14625 }, { "epoch": 1.7573573573573573, "grad_norm": 0.405453622341156, "learning_rate": 3.668735670438333e-05, "loss": 0.0459, "step": 14630 }, { "epoch": 1.757957957957958, "grad_norm": 0.35475534200668335, "learning_rate": 3.665704710244859e-05, "loss": 0.0406, "step": 14635 }, { "epoch": 1.7585585585585586, "grad_norm": 0.38387325406074524, "learning_rate": 3.6626742778655556e-05, "loss": 0.0416, "step": 14640 }, { "epoch": 1.759159159159159, "grad_norm": 0.5198370814323425, "learning_rate": 3.65964437449919e-05, "loss": 0.0497, "step": 14645 }, { "epoch": 1.7597597597597598, "grad_norm": 0.3503769636154175, "learning_rate": 3.656615001344313e-05, "loss": 0.0485, "step": 14650 }, { "epoch": 1.7603603603603604, "grad_norm": 0.42356932163238525, "learning_rate": 3.653586159599272e-05, "loss": 0.0528, "step": 14655 }, { "epoch": 1.7609609609609609, "grad_norm": 0.33685001730918884, "learning_rate": 3.6505578504622004e-05, "loss": 0.0425, "step": 14660 }, { "epoch": 1.7615615615615616, "grad_norm": 0.36791616678237915, "learning_rate": 3.647530075131023e-05, "loss": 0.0528, "step": 14665 }, { "epoch": 1.7621621621621621, "grad_norm": 0.4632832407951355, "learning_rate": 3.644502834803454e-05, "loss": 0.0399, "step": 14670 }, { "epoch": 1.7627627627627627, "grad_norm": 0.5057429075241089, "learning_rate": 3.641476130676992e-05, "loss": 0.046, "step": 14675 }, { "epoch": 1.7633633633633634, "grad_norm": 0.41748347878456116, "learning_rate": 3.638449963948927e-05, "loss": 0.0415, "step": 14680 }, { "epoch": 1.763963963963964, "grad_norm": 0.3554884195327759, "learning_rate": 3.6354243358163365e-05, "loss": 0.0393, "step": 14685 }, { "epoch": 1.7645645645645645, "grad_norm": 0.4177655279636383, "learning_rate": 3.632399247476084e-05, "loss": 0.0424, "step": 14690 }, { "epoch": 1.7651651651651652, "grad_norm": 0.3438865542411804, "learning_rate": 3.629374700124821e-05, "loss": 0.0404, "step": 14695 }, { "epoch": 1.7657657657657657, "grad_norm": 0.4452444314956665, "learning_rate": 3.62635069495898e-05, "loss": 0.0458, "step": 14700 }, { "epoch": 1.7663663663663662, "grad_norm": 0.4158708453178406, "learning_rate": 3.6233272331747854e-05, "loss": 0.0473, "step": 14705 }, { "epoch": 1.766966966966967, "grad_norm": 0.33571508526802063, "learning_rate": 3.620304315968242e-05, "loss": 0.0445, "step": 14710 }, { "epoch": 1.7675675675675677, "grad_norm": 0.44375666975975037, "learning_rate": 3.617281944535144e-05, "loss": 0.0561, "step": 14715 }, { "epoch": 1.768168168168168, "grad_norm": 0.4546729326248169, "learning_rate": 3.6142601200710614e-05, "loss": 0.0489, "step": 14720 }, { "epoch": 1.7687687687687688, "grad_norm": 0.3842746615409851, "learning_rate": 3.611238843771356e-05, "loss": 0.043, "step": 14725 }, { "epoch": 1.7693693693693695, "grad_norm": 0.41635391116142273, "learning_rate": 3.608218116831171e-05, "loss": 0.0422, "step": 14730 }, { "epoch": 1.76996996996997, "grad_norm": 0.39130958914756775, "learning_rate": 3.60519794044543e-05, "loss": 0.0506, "step": 14735 }, { "epoch": 1.7705705705705705, "grad_norm": 0.3856130838394165, "learning_rate": 3.6021783158088415e-05, "loss": 0.0439, "step": 14740 }, { "epoch": 1.7711711711711713, "grad_norm": 0.28414976596832275, "learning_rate": 3.599159244115892e-05, "loss": 0.0502, "step": 14745 }, { "epoch": 1.7717717717717718, "grad_norm": 0.42334309220314026, "learning_rate": 3.596140726560853e-05, "loss": 0.0494, "step": 14750 }, { "epoch": 1.7717717717717718, "eval_loss": 0.057185668498277664, "eval_runtime": 35.8904, "eval_samples_per_second": 22.29, "eval_steps_per_second": 5.573, "step": 14750 }, { "epoch": 1.7723723723723723, "grad_norm": 0.42764347791671753, "learning_rate": 3.593122764337773e-05, "loss": 0.0419, "step": 14755 }, { "epoch": 1.772972972972973, "grad_norm": 0.4755389988422394, "learning_rate": 3.590105358640485e-05, "loss": 0.0529, "step": 14760 }, { "epoch": 1.7735735735735736, "grad_norm": 0.34451594948768616, "learning_rate": 3.587088510662603e-05, "loss": 0.0352, "step": 14765 }, { "epoch": 1.7741741741741741, "grad_norm": 0.3233650326728821, "learning_rate": 3.584072221597511e-05, "loss": 0.0477, "step": 14770 }, { "epoch": 1.7747747747747749, "grad_norm": 0.44847461581230164, "learning_rate": 3.5810564926383814e-05, "loss": 0.0457, "step": 14775 }, { "epoch": 1.7753753753753754, "grad_norm": 0.3650940954685211, "learning_rate": 3.5780413249781616e-05, "loss": 0.0406, "step": 14780 }, { "epoch": 1.775975975975976, "grad_norm": 0.4203718304634094, "learning_rate": 3.5750267198095804e-05, "loss": 0.0463, "step": 14785 }, { "epoch": 1.7765765765765766, "grad_norm": 0.4171884059906006, "learning_rate": 3.5720126783251354e-05, "loss": 0.0455, "step": 14790 }, { "epoch": 1.7771771771771772, "grad_norm": 0.2850398123264313, "learning_rate": 3.568999201717111e-05, "loss": 0.0452, "step": 14795 }, { "epoch": 1.7777777777777777, "grad_norm": 0.4151475727558136, "learning_rate": 3.565986291177561e-05, "loss": 0.0461, "step": 14800 }, { "epoch": 1.7783783783783784, "grad_norm": 0.40226131677627563, "learning_rate": 3.5629739478983195e-05, "loss": 0.0386, "step": 14805 }, { "epoch": 1.778978978978979, "grad_norm": 0.4193638861179352, "learning_rate": 3.559962173070996e-05, "loss": 0.0512, "step": 14810 }, { "epoch": 1.7795795795795795, "grad_norm": 0.2949560284614563, "learning_rate": 3.556950967886973e-05, "loss": 0.0327, "step": 14815 }, { "epoch": 1.7801801801801802, "grad_norm": 0.452722430229187, "learning_rate": 3.5539403335374065e-05, "loss": 0.0437, "step": 14820 }, { "epoch": 1.7807807807807807, "grad_norm": 0.335397332906723, "learning_rate": 3.5509302712132326e-05, "loss": 0.0414, "step": 14825 }, { "epoch": 1.7813813813813812, "grad_norm": 0.3416590690612793, "learning_rate": 3.547920782105155e-05, "loss": 0.0336, "step": 14830 }, { "epoch": 1.781981981981982, "grad_norm": 0.40698111057281494, "learning_rate": 3.5449118674036566e-05, "loss": 0.0442, "step": 14835 }, { "epoch": 1.7825825825825827, "grad_norm": 0.4348471760749817, "learning_rate": 3.541903528298984e-05, "loss": 0.0452, "step": 14840 }, { "epoch": 1.783183183183183, "grad_norm": 0.41390636563301086, "learning_rate": 3.538895765981166e-05, "loss": 0.0541, "step": 14845 }, { "epoch": 1.7837837837837838, "grad_norm": 0.36194291710853577, "learning_rate": 3.5358885816399964e-05, "loss": 0.0357, "step": 14850 }, { "epoch": 1.7843843843843845, "grad_norm": 0.4427974820137024, "learning_rate": 3.532881976465045e-05, "loss": 0.0396, "step": 14855 }, { "epoch": 1.784984984984985, "grad_norm": 0.3613729774951935, "learning_rate": 3.529875951645648e-05, "loss": 0.0413, "step": 14860 }, { "epoch": 1.7855855855855856, "grad_norm": 0.48508042097091675, "learning_rate": 3.526870508370915e-05, "loss": 0.0431, "step": 14865 }, { "epoch": 1.7861861861861863, "grad_norm": 0.34934136271476746, "learning_rate": 3.5238656478297264e-05, "loss": 0.0492, "step": 14870 }, { "epoch": 1.7867867867867868, "grad_norm": 0.4539346992969513, "learning_rate": 3.520861371210729e-05, "loss": 0.043, "step": 14875 }, { "epoch": 1.7873873873873873, "grad_norm": 0.532275915145874, "learning_rate": 3.5178576797023434e-05, "loss": 0.0405, "step": 14880 }, { "epoch": 1.787987987987988, "grad_norm": 0.34528690576553345, "learning_rate": 3.51485457449275e-05, "loss": 0.0366, "step": 14885 }, { "epoch": 1.7885885885885886, "grad_norm": 0.35474127531051636, "learning_rate": 3.511852056769907e-05, "loss": 0.0384, "step": 14890 }, { "epoch": 1.7891891891891891, "grad_norm": 0.4059201180934906, "learning_rate": 3.5088501277215355e-05, "loss": 0.0397, "step": 14895 }, { "epoch": 1.7897897897897899, "grad_norm": 0.430829793214798, "learning_rate": 3.505848788535123e-05, "loss": 0.0408, "step": 14900 }, { "epoch": 1.7903903903903904, "grad_norm": 0.426645427942276, "learning_rate": 3.5028480403979276e-05, "loss": 0.039, "step": 14905 }, { "epoch": 1.790990990990991, "grad_norm": 0.5083160400390625, "learning_rate": 3.499847884496967e-05, "loss": 0.0521, "step": 14910 }, { "epoch": 1.7915915915915916, "grad_norm": 0.4045082926750183, "learning_rate": 3.496848322019031e-05, "loss": 0.0361, "step": 14915 }, { "epoch": 1.7921921921921922, "grad_norm": 0.38742151856422424, "learning_rate": 3.4938493541506705e-05, "loss": 0.043, "step": 14920 }, { "epoch": 1.7927927927927927, "grad_norm": 0.4921604096889496, "learning_rate": 3.490850982078208e-05, "loss": 0.0427, "step": 14925 }, { "epoch": 1.7933933933933934, "grad_norm": 0.40055954456329346, "learning_rate": 3.487853206987718e-05, "loss": 0.0439, "step": 14930 }, { "epoch": 1.793993993993994, "grad_norm": 0.4105166792869568, "learning_rate": 3.4848560300650503e-05, "loss": 0.0433, "step": 14935 }, { "epoch": 1.7945945945945945, "grad_norm": 0.34285271167755127, "learning_rate": 3.4818594524958145e-05, "loss": 0.043, "step": 14940 }, { "epoch": 1.7951951951951952, "grad_norm": 0.4157925844192505, "learning_rate": 3.478863475465379e-05, "loss": 0.0393, "step": 14945 }, { "epoch": 1.795795795795796, "grad_norm": 0.4255988895893097, "learning_rate": 3.4758681001588835e-05, "loss": 0.0424, "step": 14950 }, { "epoch": 1.7963963963963963, "grad_norm": 0.40526896715164185, "learning_rate": 3.472873327761219e-05, "loss": 0.0488, "step": 14955 }, { "epoch": 1.796996996996997, "grad_norm": 0.44084569811820984, "learning_rate": 3.469879159457044e-05, "loss": 0.0493, "step": 14960 }, { "epoch": 1.7975975975975977, "grad_norm": 0.5121871829032898, "learning_rate": 3.46688559643078e-05, "loss": 0.0443, "step": 14965 }, { "epoch": 1.798198198198198, "grad_norm": 0.40672311186790466, "learning_rate": 3.463892639866605e-05, "loss": 0.0385, "step": 14970 }, { "epoch": 1.7987987987987988, "grad_norm": 0.3255296051502228, "learning_rate": 3.46090029094846e-05, "loss": 0.0372, "step": 14975 }, { "epoch": 1.7993993993993995, "grad_norm": 0.30151480436325073, "learning_rate": 3.4579085508600406e-05, "loss": 0.0419, "step": 14980 }, { "epoch": 1.8, "grad_norm": 0.4027726352214813, "learning_rate": 3.4549174207848075e-05, "loss": 0.0423, "step": 14985 }, { "epoch": 1.8006006006006006, "grad_norm": 0.48146966099739075, "learning_rate": 3.4519269019059775e-05, "loss": 0.0461, "step": 14990 }, { "epoch": 1.8012012012012013, "grad_norm": 0.46603134274482727, "learning_rate": 3.4489369954065275e-05, "loss": 0.0403, "step": 14995 }, { "epoch": 1.8018018018018018, "grad_norm": 0.3438381850719452, "learning_rate": 3.4459477024691866e-05, "loss": 0.0371, "step": 15000 }, { "epoch": 1.8018018018018018, "eval_loss": 0.057668328285217285, "eval_runtime": 35.8535, "eval_samples_per_second": 22.313, "eval_steps_per_second": 5.578, "step": 15000 }, { "epoch": 1.8024024024024023, "grad_norm": 0.47141313552856445, "learning_rate": 3.4429590242764467e-05, "loss": 0.0374, "step": 15005 }, { "epoch": 1.803003003003003, "grad_norm": 0.37017688155174255, "learning_rate": 3.439970962010555e-05, "loss": 0.033, "step": 15010 }, { "epoch": 1.8036036036036036, "grad_norm": 0.453005850315094, "learning_rate": 3.4369835168535155e-05, "loss": 0.0486, "step": 15015 }, { "epoch": 1.8042042042042041, "grad_norm": 0.4695863425731659, "learning_rate": 3.433996689987089e-05, "loss": 0.0418, "step": 15020 }, { "epoch": 1.8048048048048049, "grad_norm": 0.316760390996933, "learning_rate": 3.431010482592787e-05, "loss": 0.0357, "step": 15025 }, { "epoch": 1.8054054054054054, "grad_norm": 0.41289475560188293, "learning_rate": 3.428024895851881e-05, "loss": 0.0432, "step": 15030 }, { "epoch": 1.806006006006006, "grad_norm": 0.32462531328201294, "learning_rate": 3.425039930945394e-05, "loss": 0.0389, "step": 15035 }, { "epoch": 1.8066066066066067, "grad_norm": 0.38858461380004883, "learning_rate": 3.422055589054105e-05, "loss": 0.0466, "step": 15040 }, { "epoch": 1.8072072072072072, "grad_norm": 0.3275511562824249, "learning_rate": 3.4190718713585475e-05, "loss": 0.0453, "step": 15045 }, { "epoch": 1.8078078078078077, "grad_norm": 0.44248396158218384, "learning_rate": 3.416088779039003e-05, "loss": 0.0335, "step": 15050 }, { "epoch": 1.8084084084084084, "grad_norm": 0.39229512214660645, "learning_rate": 3.413106313275509e-05, "loss": 0.0435, "step": 15055 }, { "epoch": 1.809009009009009, "grad_norm": 0.33925944566726685, "learning_rate": 3.4101244752478576e-05, "loss": 0.0375, "step": 15060 }, { "epoch": 1.8096096096096095, "grad_norm": 0.306045800447464, "learning_rate": 3.4071432661355907e-05, "loss": 0.0448, "step": 15065 }, { "epoch": 1.8102102102102102, "grad_norm": 0.3433867394924164, "learning_rate": 3.404162687117999e-05, "loss": 0.033, "step": 15070 }, { "epoch": 1.810810810810811, "grad_norm": 0.3375025689601898, "learning_rate": 3.401182739374124e-05, "loss": 0.0406, "step": 15075 }, { "epoch": 1.8114114114114113, "grad_norm": 0.3570587635040283, "learning_rate": 3.398203424082762e-05, "loss": 0.0494, "step": 15080 }, { "epoch": 1.812012012012012, "grad_norm": 0.3204568028450012, "learning_rate": 3.395224742422455e-05, "loss": 0.0355, "step": 15085 }, { "epoch": 1.8126126126126128, "grad_norm": 0.39254069328308105, "learning_rate": 3.3922466955714985e-05, "loss": 0.0365, "step": 15090 }, { "epoch": 1.813213213213213, "grad_norm": 0.29781073331832886, "learning_rate": 3.389269284707929e-05, "loss": 0.0421, "step": 15095 }, { "epoch": 1.8138138138138138, "grad_norm": 0.431906521320343, "learning_rate": 3.3862925110095403e-05, "loss": 0.04, "step": 15100 }, { "epoch": 1.8144144144144145, "grad_norm": 0.44331666827201843, "learning_rate": 3.383316375653868e-05, "loss": 0.0434, "step": 15105 }, { "epoch": 1.815015015015015, "grad_norm": 0.3456747531890869, "learning_rate": 3.380340879818199e-05, "loss": 0.0389, "step": 15110 }, { "epoch": 1.8156156156156156, "grad_norm": 0.3462885320186615, "learning_rate": 3.377366024679568e-05, "loss": 0.0355, "step": 15115 }, { "epoch": 1.8162162162162163, "grad_norm": 0.3487900197505951, "learning_rate": 3.374391811414749e-05, "loss": 0.0431, "step": 15120 }, { "epoch": 1.8168168168168168, "grad_norm": 0.3203073740005493, "learning_rate": 3.3714182412002715e-05, "loss": 0.0348, "step": 15125 }, { "epoch": 1.8174174174174174, "grad_norm": 0.4262768626213074, "learning_rate": 3.368445315212403e-05, "loss": 0.0399, "step": 15130 }, { "epoch": 1.818018018018018, "grad_norm": 0.3522956073284149, "learning_rate": 3.365473034627161e-05, "loss": 0.046, "step": 15135 }, { "epoch": 1.8186186186186186, "grad_norm": 0.40157803893089294, "learning_rate": 3.362501400620308e-05, "loss": 0.0316, "step": 15140 }, { "epoch": 1.8192192192192191, "grad_norm": 0.46878618001937866, "learning_rate": 3.359530414367345e-05, "loss": 0.0451, "step": 15145 }, { "epoch": 1.8198198198198199, "grad_norm": 0.34287646412849426, "learning_rate": 3.356560077043521e-05, "loss": 0.0391, "step": 15150 }, { "epoch": 1.8204204204204204, "grad_norm": 0.4138559103012085, "learning_rate": 3.3535903898238305e-05, "loss": 0.0496, "step": 15155 }, { "epoch": 1.821021021021021, "grad_norm": 0.3784990608692169, "learning_rate": 3.350621353883009e-05, "loss": 0.0461, "step": 15160 }, { "epoch": 1.8216216216216217, "grad_norm": 0.44947463274002075, "learning_rate": 3.347652970395528e-05, "loss": 0.0434, "step": 15165 }, { "epoch": 1.8222222222222222, "grad_norm": 0.340086430311203, "learning_rate": 3.344685240535612e-05, "loss": 0.0359, "step": 15170 }, { "epoch": 1.8228228228228227, "grad_norm": 0.3417467772960663, "learning_rate": 3.3417181654772186e-05, "loss": 0.0486, "step": 15175 }, { "epoch": 1.8234234234234235, "grad_norm": 0.37504029273986816, "learning_rate": 3.3387517463940496e-05, "loss": 0.0411, "step": 15180 }, { "epoch": 1.824024024024024, "grad_norm": 0.2572895884513855, "learning_rate": 3.33578598445955e-05, "loss": 0.0369, "step": 15185 }, { "epoch": 1.8246246246246245, "grad_norm": 0.3781915307044983, "learning_rate": 3.3328208808468966e-05, "loss": 0.0415, "step": 15190 }, { "epoch": 1.8252252252252252, "grad_norm": 0.3484686613082886, "learning_rate": 3.3298564367290144e-05, "loss": 0.0393, "step": 15195 }, { "epoch": 1.825825825825826, "grad_norm": 0.42965856194496155, "learning_rate": 3.3268926532785615e-05, "loss": 0.0428, "step": 15200 }, { "epoch": 1.8264264264264263, "grad_norm": 0.3399992287158966, "learning_rate": 3.32392953166794e-05, "loss": 0.0446, "step": 15205 }, { "epoch": 1.827027027027027, "grad_norm": 0.3631638288497925, "learning_rate": 3.320967073069289e-05, "loss": 0.0368, "step": 15210 }, { "epoch": 1.8276276276276278, "grad_norm": 0.4014107286930084, "learning_rate": 3.318005278654478e-05, "loss": 0.0438, "step": 15215 }, { "epoch": 1.8282282282282283, "grad_norm": 0.3967078924179077, "learning_rate": 3.3150441495951234e-05, "loss": 0.0368, "step": 15220 }, { "epoch": 1.8288288288288288, "grad_norm": 0.5075540542602539, "learning_rate": 3.312083687062573e-05, "loss": 0.0358, "step": 15225 }, { "epoch": 1.8294294294294295, "grad_norm": 0.5178868174552917, "learning_rate": 3.3091238922279156e-05, "loss": 0.0439, "step": 15230 }, { "epoch": 1.83003003003003, "grad_norm": 0.4869144558906555, "learning_rate": 3.3061647662619676e-05, "loss": 0.0452, "step": 15235 }, { "epoch": 1.8306306306306306, "grad_norm": 0.32434096932411194, "learning_rate": 3.30320631033529e-05, "loss": 0.0407, "step": 15240 }, { "epoch": 1.8312312312312313, "grad_norm": 0.4024626612663269, "learning_rate": 3.3002485256181713e-05, "loss": 0.0398, "step": 15245 }, { "epoch": 1.8318318318318318, "grad_norm": 0.4433274567127228, "learning_rate": 3.297291413280641e-05, "loss": 0.0404, "step": 15250 }, { "epoch": 1.8318318318318318, "eval_loss": 0.052654825150966644, "eval_runtime": 35.9691, "eval_samples_per_second": 22.241, "eval_steps_per_second": 5.56, "step": 15250 }, { "epoch": 1.8324324324324324, "grad_norm": 0.4474799335002899, "learning_rate": 3.294334974492461e-05, "loss": 0.0413, "step": 15255 }, { "epoch": 1.833033033033033, "grad_norm": 0.35214540362358093, "learning_rate": 3.29137921042312e-05, "loss": 0.0427, "step": 15260 }, { "epoch": 1.8336336336336336, "grad_norm": 0.36843839287757874, "learning_rate": 3.288424122241849e-05, "loss": 0.042, "step": 15265 }, { "epoch": 1.8342342342342342, "grad_norm": 0.4027945399284363, "learning_rate": 3.285469711117606e-05, "loss": 0.04, "step": 15270 }, { "epoch": 1.834834834834835, "grad_norm": 0.4439586102962494, "learning_rate": 3.282515978219082e-05, "loss": 0.0452, "step": 15275 }, { "epoch": 1.8354354354354354, "grad_norm": 0.32045507431030273, "learning_rate": 3.279562924714705e-05, "loss": 0.0363, "step": 15280 }, { "epoch": 1.836036036036036, "grad_norm": 0.29067376255989075, "learning_rate": 3.276610551772624e-05, "loss": 0.0318, "step": 15285 }, { "epoch": 1.8366366366366367, "grad_norm": 0.46161335706710815, "learning_rate": 3.273658860560728e-05, "loss": 0.043, "step": 15290 }, { "epoch": 1.8372372372372372, "grad_norm": 0.3770528733730316, "learning_rate": 3.2707078522466324e-05, "loss": 0.0394, "step": 15295 }, { "epoch": 1.8378378378378377, "grad_norm": 0.3454394042491913, "learning_rate": 3.2677575279976846e-05, "loss": 0.0361, "step": 15300 }, { "epoch": 1.8384384384384385, "grad_norm": 0.383668452501297, "learning_rate": 3.2648078889809564e-05, "loss": 0.0412, "step": 15305 }, { "epoch": 1.8390390390390392, "grad_norm": 0.33942195773124695, "learning_rate": 3.261858936363254e-05, "loss": 0.0384, "step": 15310 }, { "epoch": 1.8396396396396395, "grad_norm": 0.3384036421775818, "learning_rate": 3.2589106713111095e-05, "loss": 0.0384, "step": 15315 }, { "epoch": 1.8402402402402402, "grad_norm": 0.4031851887702942, "learning_rate": 3.2559630949907824e-05, "loss": 0.0376, "step": 15320 }, { "epoch": 1.840840840840841, "grad_norm": 0.36432915925979614, "learning_rate": 3.253016208568266e-05, "loss": 0.0451, "step": 15325 }, { "epoch": 1.8414414414414413, "grad_norm": 0.4108644425868988, "learning_rate": 3.2500700132092686e-05, "loss": 0.0364, "step": 15330 }, { "epoch": 1.842042042042042, "grad_norm": 0.35947027802467346, "learning_rate": 3.247124510079236e-05, "loss": 0.0344, "step": 15335 }, { "epoch": 1.8426426426426428, "grad_norm": 0.3900996744632721, "learning_rate": 3.2441797003433347e-05, "loss": 0.0336, "step": 15340 }, { "epoch": 1.8432432432432433, "grad_norm": 0.35430777072906494, "learning_rate": 3.2412355851664596e-05, "loss": 0.036, "step": 15345 }, { "epoch": 1.8438438438438438, "grad_norm": 0.3509856164455414, "learning_rate": 3.238292165713232e-05, "loss": 0.0363, "step": 15350 }, { "epoch": 1.8444444444444446, "grad_norm": 0.3149247169494629, "learning_rate": 3.2353494431479916e-05, "loss": 0.0344, "step": 15355 }, { "epoch": 1.845045045045045, "grad_norm": 0.39669427275657654, "learning_rate": 3.2324074186348095e-05, "loss": 0.0381, "step": 15360 }, { "epoch": 1.8456456456456456, "grad_norm": 0.36199426651000977, "learning_rate": 3.229466093337474e-05, "loss": 0.0405, "step": 15365 }, { "epoch": 1.8462462462462463, "grad_norm": 0.4065885543823242, "learning_rate": 3.226525468419507e-05, "loss": 0.0468, "step": 15370 }, { "epoch": 1.8468468468468469, "grad_norm": 0.3283202052116394, "learning_rate": 3.22358554504414e-05, "loss": 0.034, "step": 15375 }, { "epoch": 1.8474474474474474, "grad_norm": 0.4245908558368683, "learning_rate": 3.220646324374337e-05, "loss": 0.038, "step": 15380 }, { "epoch": 1.8480480480480481, "grad_norm": 0.42148643732070923, "learning_rate": 3.2177078075727795e-05, "loss": 0.0349, "step": 15385 }, { "epoch": 1.8486486486486486, "grad_norm": 0.543740451335907, "learning_rate": 3.214769995801875e-05, "loss": 0.0465, "step": 15390 }, { "epoch": 1.8492492492492492, "grad_norm": 0.3897549510002136, "learning_rate": 3.211832890223748e-05, "loss": 0.0503, "step": 15395 }, { "epoch": 1.84984984984985, "grad_norm": 0.47316253185272217, "learning_rate": 3.208896492000243e-05, "loss": 0.0401, "step": 15400 }, { "epoch": 1.8504504504504504, "grad_norm": 0.36027249693870544, "learning_rate": 3.205960802292928e-05, "loss": 0.0318, "step": 15405 }, { "epoch": 1.851051051051051, "grad_norm": 0.3004674017429352, "learning_rate": 3.203025822263087e-05, "loss": 0.0371, "step": 15410 }, { "epoch": 1.8516516516516517, "grad_norm": 0.31336236000061035, "learning_rate": 3.200091553071727e-05, "loss": 0.0344, "step": 15415 }, { "epoch": 1.8522522522522522, "grad_norm": 0.3068157732486725, "learning_rate": 3.197157995879575e-05, "loss": 0.0353, "step": 15420 }, { "epoch": 1.8528528528528527, "grad_norm": 0.5400185585021973, "learning_rate": 3.1942251518470676e-05, "loss": 0.0409, "step": 15425 }, { "epoch": 1.8534534534534535, "grad_norm": 0.4230722188949585, "learning_rate": 3.1912930221343695e-05, "loss": 0.0383, "step": 15430 }, { "epoch": 1.8540540540540542, "grad_norm": 0.3303622901439667, "learning_rate": 3.1883616079013576e-05, "loss": 0.0315, "step": 15435 }, { "epoch": 1.8546546546546545, "grad_norm": 0.35778260231018066, "learning_rate": 3.185430910307628e-05, "loss": 0.0363, "step": 15440 }, { "epoch": 1.8552552552552553, "grad_norm": 0.31309306621551514, "learning_rate": 3.1825009305124895e-05, "loss": 0.0316, "step": 15445 }, { "epoch": 1.855855855855856, "grad_norm": 0.40511417388916016, "learning_rate": 3.17957166967497e-05, "loss": 0.043, "step": 15450 }, { "epoch": 1.8564564564564563, "grad_norm": 0.3628949820995331, "learning_rate": 3.176643128953815e-05, "loss": 0.0336, "step": 15455 }, { "epoch": 1.857057057057057, "grad_norm": 0.3685472011566162, "learning_rate": 3.1737153095074796e-05, "loss": 0.0452, "step": 15460 }, { "epoch": 1.8576576576576578, "grad_norm": 0.4899274706840515, "learning_rate": 3.1707882124941404e-05, "loss": 0.0393, "step": 15465 }, { "epoch": 1.8582582582582583, "grad_norm": 0.3528103530406952, "learning_rate": 3.1678618390716804e-05, "loss": 0.0329, "step": 15470 }, { "epoch": 1.8588588588588588, "grad_norm": 0.38148975372314453, "learning_rate": 3.1649361903977016e-05, "loss": 0.0408, "step": 15475 }, { "epoch": 1.8594594594594596, "grad_norm": 0.338446706533432, "learning_rate": 3.162011267629519e-05, "loss": 0.032, "step": 15480 }, { "epoch": 1.86006006006006, "grad_norm": 0.3764857351779938, "learning_rate": 3.15908707192416e-05, "loss": 0.038, "step": 15485 }, { "epoch": 1.8606606606606606, "grad_norm": 0.43757164478302, "learning_rate": 3.1561636044383643e-05, "loss": 0.0407, "step": 15490 }, { "epoch": 1.8612612612612613, "grad_norm": 0.4748579263687134, "learning_rate": 3.153240866328582e-05, "loss": 0.04, "step": 15495 }, { "epoch": 1.8618618618618619, "grad_norm": 0.35937410593032837, "learning_rate": 3.150318858750976e-05, "loss": 0.0403, "step": 15500 }, { "epoch": 1.8618618618618619, "eval_loss": 0.05141396448016167, "eval_runtime": 36.0092, "eval_samples_per_second": 22.217, "eval_steps_per_second": 5.554, "step": 15500 }, { "epoch": 1.8624624624624624, "grad_norm": 0.34568673372268677, "learning_rate": 3.14739758286142e-05, "loss": 0.0375, "step": 15505 }, { "epoch": 1.8630630630630631, "grad_norm": 0.49455657601356506, "learning_rate": 3.1444770398154985e-05, "loss": 0.038, "step": 15510 }, { "epoch": 1.8636636636636636, "grad_norm": 0.4506688117980957, "learning_rate": 3.141557230768508e-05, "loss": 0.0437, "step": 15515 }, { "epoch": 1.8642642642642642, "grad_norm": 0.4464740753173828, "learning_rate": 3.13863815687545e-05, "loss": 0.0428, "step": 15520 }, { "epoch": 1.864864864864865, "grad_norm": 0.297534316778183, "learning_rate": 3.135719819291038e-05, "loss": 0.0382, "step": 15525 }, { "epoch": 1.8654654654654654, "grad_norm": 0.33353927731513977, "learning_rate": 3.132802219169695e-05, "loss": 0.038, "step": 15530 }, { "epoch": 1.866066066066066, "grad_norm": 0.24684344232082367, "learning_rate": 3.129885357665553e-05, "loss": 0.0344, "step": 15535 }, { "epoch": 1.8666666666666667, "grad_norm": 0.32198721170425415, "learning_rate": 3.126969235932446e-05, "loss": 0.035, "step": 15540 }, { "epoch": 1.8672672672672672, "grad_norm": 0.4634726047515869, "learning_rate": 3.1240538551239234e-05, "loss": 0.0383, "step": 15545 }, { "epoch": 1.8678678678678677, "grad_norm": 0.4412508010864258, "learning_rate": 3.121139216393235e-05, "loss": 0.0352, "step": 15550 }, { "epoch": 1.8684684684684685, "grad_norm": 0.4202616512775421, "learning_rate": 3.118225320893341e-05, "loss": 0.0382, "step": 15555 }, { "epoch": 1.8690690690690692, "grad_norm": 0.3001733422279358, "learning_rate": 3.115312169776908e-05, "loss": 0.0397, "step": 15560 }, { "epoch": 1.8696696696696695, "grad_norm": 0.3958297669887543, "learning_rate": 3.112399764196303e-05, "loss": 0.0354, "step": 15565 }, { "epoch": 1.8702702702702703, "grad_norm": 0.3859178423881531, "learning_rate": 3.109488105303603e-05, "loss": 0.0429, "step": 15570 }, { "epoch": 1.870870870870871, "grad_norm": 0.4340052604675293, "learning_rate": 3.1065771942505884e-05, "loss": 0.0362, "step": 15575 }, { "epoch": 1.8714714714714715, "grad_norm": 0.2764926254749298, "learning_rate": 3.1036670321887426e-05, "loss": 0.0379, "step": 15580 }, { "epoch": 1.872072072072072, "grad_norm": 0.3525054454803467, "learning_rate": 3.100757620269257e-05, "loss": 0.0367, "step": 15585 }, { "epoch": 1.8726726726726728, "grad_norm": 0.3726922869682312, "learning_rate": 3.0978489596430184e-05, "loss": 0.0366, "step": 15590 }, { "epoch": 1.8732732732732733, "grad_norm": 0.3272388279438019, "learning_rate": 3.0949410514606234e-05, "loss": 0.0312, "step": 15595 }, { "epoch": 1.8738738738738738, "grad_norm": 0.34497085213661194, "learning_rate": 3.092033896872367e-05, "loss": 0.0319, "step": 15600 }, { "epoch": 1.8744744744744746, "grad_norm": 0.36654847860336304, "learning_rate": 3.0891274970282505e-05, "loss": 0.0366, "step": 15605 }, { "epoch": 1.875075075075075, "grad_norm": 0.4943583011627197, "learning_rate": 3.08622185307797e-05, "loss": 0.0369, "step": 15610 }, { "epoch": 1.8756756756756756, "grad_norm": 0.3475680649280548, "learning_rate": 3.083316966170927e-05, "loss": 0.0336, "step": 15615 }, { "epoch": 1.8762762762762764, "grad_norm": 0.3712926506996155, "learning_rate": 3.080412837456225e-05, "loss": 0.0359, "step": 15620 }, { "epoch": 1.8768768768768769, "grad_norm": 0.35502180457115173, "learning_rate": 3.0775094680826624e-05, "loss": 0.0302, "step": 15625 }, { "epoch": 1.8774774774774774, "grad_norm": 0.3283778429031372, "learning_rate": 3.074606859198746e-05, "loss": 0.0352, "step": 15630 }, { "epoch": 1.8780780780780781, "grad_norm": 0.3945302665233612, "learning_rate": 3.071705011952668e-05, "loss": 0.0254, "step": 15635 }, { "epoch": 1.8786786786786787, "grad_norm": 0.3465094268321991, "learning_rate": 3.068803927492333e-05, "loss": 0.0314, "step": 15640 }, { "epoch": 1.8792792792792792, "grad_norm": 0.3634204864501953, "learning_rate": 3.065903606965336e-05, "loss": 0.0358, "step": 15645 }, { "epoch": 1.87987987987988, "grad_norm": 0.3066357970237732, "learning_rate": 3.063004051518972e-05, "loss": 0.0381, "step": 15650 }, { "epoch": 1.8804804804804804, "grad_norm": 0.4703167974948883, "learning_rate": 3.060105262300236e-05, "loss": 0.0388, "step": 15655 }, { "epoch": 1.881081081081081, "grad_norm": 0.4352303147315979, "learning_rate": 3.057207240455812e-05, "loss": 0.0349, "step": 15660 }, { "epoch": 1.8816816816816817, "grad_norm": 0.3594943881034851, "learning_rate": 3.054309987132089e-05, "loss": 0.0411, "step": 15665 }, { "epoch": 1.8822822822822824, "grad_norm": 0.34663334488868713, "learning_rate": 3.051413503475149e-05, "loss": 0.0321, "step": 15670 }, { "epoch": 1.8828828828828827, "grad_norm": 0.508261501789093, "learning_rate": 3.0485177906307694e-05, "loss": 0.0384, "step": 15675 }, { "epoch": 1.8834834834834835, "grad_norm": 0.46104949712753296, "learning_rate": 3.0456228497444206e-05, "loss": 0.0375, "step": 15680 }, { "epoch": 1.8840840840840842, "grad_norm": 0.33268865942955017, "learning_rate": 3.042728681961271e-05, "loss": 0.0306, "step": 15685 }, { "epoch": 1.8846846846846845, "grad_norm": 0.3267901837825775, "learning_rate": 3.0398352884261792e-05, "loss": 0.0319, "step": 15690 }, { "epoch": 1.8852852852852853, "grad_norm": 0.3606499135494232, "learning_rate": 3.0369426702837035e-05, "loss": 0.035, "step": 15695 }, { "epoch": 1.885885885885886, "grad_norm": 0.39046889543533325, "learning_rate": 3.034050828678092e-05, "loss": 0.031, "step": 15700 }, { "epoch": 1.8864864864864865, "grad_norm": 0.30550241470336914, "learning_rate": 3.031159764753282e-05, "loss": 0.036, "step": 15705 }, { "epoch": 1.887087087087087, "grad_norm": 0.4246421158313751, "learning_rate": 3.0282694796529086e-05, "loss": 0.0379, "step": 15710 }, { "epoch": 1.8876876876876878, "grad_norm": 0.49292629957199097, "learning_rate": 3.0253799745202977e-05, "loss": 0.04, "step": 15715 }, { "epoch": 1.8882882882882883, "grad_norm": 0.24794501066207886, "learning_rate": 3.0224912504984653e-05, "loss": 0.0371, "step": 15720 }, { "epoch": 1.8888888888888888, "grad_norm": 0.3245203495025635, "learning_rate": 3.0196033087301213e-05, "loss": 0.0414, "step": 15725 }, { "epoch": 1.8894894894894896, "grad_norm": 0.5247988104820251, "learning_rate": 3.0167161503576603e-05, "loss": 0.0376, "step": 15730 }, { "epoch": 1.89009009009009, "grad_norm": 0.34899789094924927, "learning_rate": 3.013829776523173e-05, "loss": 0.0324, "step": 15735 }, { "epoch": 1.8906906906906906, "grad_norm": 0.34728893637657166, "learning_rate": 3.0109441883684363e-05, "loss": 0.031, "step": 15740 }, { "epoch": 1.8912912912912914, "grad_norm": 0.33809661865234375, "learning_rate": 3.0080593870349195e-05, "loss": 0.0323, "step": 15745 }, { "epoch": 1.8918918918918919, "grad_norm": 0.4066483676433563, "learning_rate": 3.005175373663775e-05, "loss": 0.0358, "step": 15750 }, { "epoch": 1.8918918918918919, "eval_loss": 0.05024780333042145, "eval_runtime": 35.6766, "eval_samples_per_second": 22.424, "eval_steps_per_second": 5.606, "step": 15750 }, { "epoch": 1.8924924924924924, "grad_norm": 0.4036324918270111, "learning_rate": 3.002292149395849e-05, "loss": 0.0339, "step": 15755 }, { "epoch": 1.8930930930930931, "grad_norm": 0.3037404716014862, "learning_rate": 2.9994097153716737e-05, "loss": 0.0341, "step": 15760 }, { "epoch": 1.8936936936936937, "grad_norm": 0.3263702392578125, "learning_rate": 2.996528072731468e-05, "loss": 0.0316, "step": 15765 }, { "epoch": 1.8942942942942942, "grad_norm": 0.3972446620464325, "learning_rate": 2.9936472226151414e-05, "loss": 0.0376, "step": 15770 }, { "epoch": 1.894894894894895, "grad_norm": 0.3838525116443634, "learning_rate": 2.990767166162282e-05, "loss": 0.0332, "step": 15775 }, { "epoch": 1.8954954954954955, "grad_norm": 0.3224433660507202, "learning_rate": 2.987887904512172e-05, "loss": 0.0316, "step": 15780 }, { "epoch": 1.896096096096096, "grad_norm": 0.29355913400650024, "learning_rate": 2.9850094388037747e-05, "loss": 0.0349, "step": 15785 }, { "epoch": 1.8966966966966967, "grad_norm": 0.4153788089752197, "learning_rate": 2.98213177017574e-05, "loss": 0.0314, "step": 15790 }, { "epoch": 1.8972972972972975, "grad_norm": 0.44168180227279663, "learning_rate": 2.979254899766405e-05, "loss": 0.0356, "step": 15795 }, { "epoch": 1.8978978978978978, "grad_norm": 0.3369816243648529, "learning_rate": 2.9763788287137835e-05, "loss": 0.0406, "step": 15800 }, { "epoch": 1.8984984984984985, "grad_norm": 0.2655574083328247, "learning_rate": 2.9735035581555805e-05, "loss": 0.0335, "step": 15805 }, { "epoch": 1.8990990990990992, "grad_norm": 0.44046923518180847, "learning_rate": 2.970629089229182e-05, "loss": 0.0379, "step": 15810 }, { "epoch": 1.8996996996996995, "grad_norm": 0.38456791639328003, "learning_rate": 2.9677554230716585e-05, "loss": 0.0349, "step": 15815 }, { "epoch": 1.9003003003003003, "grad_norm": 0.4505840241909027, "learning_rate": 2.9648825608197572e-05, "loss": 0.0323, "step": 15820 }, { "epoch": 1.900900900900901, "grad_norm": 0.383492112159729, "learning_rate": 2.9620105036099133e-05, "loss": 0.0322, "step": 15825 }, { "epoch": 1.9015015015015015, "grad_norm": 0.37796080112457275, "learning_rate": 2.9591392525782425e-05, "loss": 0.0347, "step": 15830 }, { "epoch": 1.902102102102102, "grad_norm": 0.3788483440876007, "learning_rate": 2.9562688088605384e-05, "loss": 0.0279, "step": 15835 }, { "epoch": 1.9027027027027028, "grad_norm": 0.36863160133361816, "learning_rate": 2.9533991735922805e-05, "loss": 0.0316, "step": 15840 }, { "epoch": 1.9033033033033033, "grad_norm": 0.27120909094810486, "learning_rate": 2.950530347908622e-05, "loss": 0.0292, "step": 15845 }, { "epoch": 1.9039039039039038, "grad_norm": 0.34686270356178284, "learning_rate": 2.947662332944401e-05, "loss": 0.0357, "step": 15850 }, { "epoch": 1.9045045045045046, "grad_norm": 0.3707510232925415, "learning_rate": 2.944795129834132e-05, "loss": 0.033, "step": 15855 }, { "epoch": 1.9051051051051051, "grad_norm": 0.2813546061515808, "learning_rate": 2.941928739712011e-05, "loss": 0.0283, "step": 15860 }, { "epoch": 1.9057057057057056, "grad_norm": 0.30208873748779297, "learning_rate": 2.9390631637119126e-05, "loss": 0.0315, "step": 15865 }, { "epoch": 1.9063063063063064, "grad_norm": 0.4704820513725281, "learning_rate": 2.9361984029673838e-05, "loss": 0.0319, "step": 15870 }, { "epoch": 1.906906906906907, "grad_norm": 0.3833897113800049, "learning_rate": 2.9333344586116563e-05, "loss": 0.0362, "step": 15875 }, { "epoch": 1.9075075075075074, "grad_norm": 0.3160555362701416, "learning_rate": 2.9304713317776323e-05, "loss": 0.0347, "step": 15880 }, { "epoch": 1.9081081081081082, "grad_norm": 0.3219717741012573, "learning_rate": 2.9276090235978976e-05, "loss": 0.034, "step": 15885 }, { "epoch": 1.9087087087087087, "grad_norm": 0.2876552939414978, "learning_rate": 2.9247475352047065e-05, "loss": 0.0327, "step": 15890 }, { "epoch": 1.9093093093093092, "grad_norm": 0.33561497926712036, "learning_rate": 2.921886867729995e-05, "loss": 0.0345, "step": 15895 }, { "epoch": 1.90990990990991, "grad_norm": 0.3135334849357605, "learning_rate": 2.9190270223053728e-05, "loss": 0.0376, "step": 15900 }, { "epoch": 1.9105105105105105, "grad_norm": 0.3284721374511719, "learning_rate": 2.916168000062123e-05, "loss": 0.0287, "step": 15905 }, { "epoch": 1.911111111111111, "grad_norm": 0.3291323184967041, "learning_rate": 2.9133098021312056e-05, "loss": 0.0341, "step": 15910 }, { "epoch": 1.9117117117117117, "grad_norm": 0.3968559503555298, "learning_rate": 2.91045242964325e-05, "loss": 0.0354, "step": 15915 }, { "epoch": 1.9123123123123125, "grad_norm": 0.3673100769519806, "learning_rate": 2.9075958837285644e-05, "loss": 0.0289, "step": 15920 }, { "epoch": 1.9129129129129128, "grad_norm": 0.32350149750709534, "learning_rate": 2.904740165517126e-05, "loss": 0.0356, "step": 15925 }, { "epoch": 1.9135135135135135, "grad_norm": 0.42222660779953003, "learning_rate": 2.9018852761385874e-05, "loss": 0.0309, "step": 15930 }, { "epoch": 1.9141141141141143, "grad_norm": 0.4391559362411499, "learning_rate": 2.8990312167222737e-05, "loss": 0.029, "step": 15935 }, { "epoch": 1.9147147147147145, "grad_norm": 0.28281211853027344, "learning_rate": 2.8961779883971763e-05, "loss": 0.0313, "step": 15940 }, { "epoch": 1.9153153153153153, "grad_norm": 0.3404557704925537, "learning_rate": 2.8933255922919655e-05, "loss": 0.0314, "step": 15945 }, { "epoch": 1.915915915915916, "grad_norm": 0.326612263917923, "learning_rate": 2.8904740295349747e-05, "loss": 0.0291, "step": 15950 }, { "epoch": 1.9165165165165166, "grad_norm": 0.3266064524650574, "learning_rate": 2.8876233012542132e-05, "loss": 0.0326, "step": 15955 }, { "epoch": 1.917117117117117, "grad_norm": 0.43275049328804016, "learning_rate": 2.8847734085773614e-05, "loss": 0.0359, "step": 15960 }, { "epoch": 1.9177177177177178, "grad_norm": 0.37904635071754456, "learning_rate": 2.8819243526317608e-05, "loss": 0.0292, "step": 15965 }, { "epoch": 1.9183183183183183, "grad_norm": 0.4266570210456848, "learning_rate": 2.8790761345444307e-05, "loss": 0.0294, "step": 15970 }, { "epoch": 1.9189189189189189, "grad_norm": 0.4261925518512726, "learning_rate": 2.8762287554420552e-05, "loss": 0.0312, "step": 15975 }, { "epoch": 1.9195195195195196, "grad_norm": 0.41322338581085205, "learning_rate": 2.873382216450988e-05, "loss": 0.0389, "step": 15980 }, { "epoch": 1.9201201201201201, "grad_norm": 0.31788089871406555, "learning_rate": 2.8705365186972473e-05, "loss": 0.0299, "step": 15985 }, { "epoch": 1.9207207207207206, "grad_norm": 0.33438605070114136, "learning_rate": 2.867691663306521e-05, "loss": 0.0336, "step": 15990 }, { "epoch": 1.9213213213213214, "grad_norm": 0.3594795763492584, "learning_rate": 2.8648476514041646e-05, "loss": 0.0354, "step": 15995 }, { "epoch": 1.921921921921922, "grad_norm": 0.2548877000808716, "learning_rate": 2.862004484115198e-05, "loss": 0.0284, "step": 16000 }, { "epoch": 1.921921921921922, "eval_loss": 0.04871018975973129, "eval_runtime": 36.0509, "eval_samples_per_second": 22.191, "eval_steps_per_second": 5.548, "step": 16000 }, { "epoch": 1.9225225225225224, "grad_norm": 0.5414465069770813, "learning_rate": 2.85916216256431e-05, "loss": 0.0398, "step": 16005 }, { "epoch": 1.9231231231231232, "grad_norm": 0.2570936679840088, "learning_rate": 2.8563206878758486e-05, "loss": 0.0322, "step": 16010 }, { "epoch": 1.9237237237237237, "grad_norm": 0.31128424406051636, "learning_rate": 2.853480061173833e-05, "loss": 0.0323, "step": 16015 }, { "epoch": 1.9243243243243242, "grad_norm": 0.36437293887138367, "learning_rate": 2.8506402835819445e-05, "loss": 0.0285, "step": 16020 }, { "epoch": 1.924924924924925, "grad_norm": 0.44023168087005615, "learning_rate": 2.847801356223529e-05, "loss": 0.0314, "step": 16025 }, { "epoch": 1.9255255255255255, "grad_norm": 0.3434520661830902, "learning_rate": 2.8449632802215974e-05, "loss": 0.0329, "step": 16030 }, { "epoch": 1.926126126126126, "grad_norm": 0.5105292201042175, "learning_rate": 2.8421260566988194e-05, "loss": 0.0315, "step": 16035 }, { "epoch": 1.9267267267267267, "grad_norm": 0.38238635659217834, "learning_rate": 2.839289686777533e-05, "loss": 0.0335, "step": 16040 }, { "epoch": 1.9273273273273275, "grad_norm": 0.36520659923553467, "learning_rate": 2.8364541715797333e-05, "loss": 0.0299, "step": 16045 }, { "epoch": 1.9279279279279278, "grad_norm": 0.3082834482192993, "learning_rate": 2.833619512227082e-05, "loss": 0.0312, "step": 16050 }, { "epoch": 1.9285285285285285, "grad_norm": 0.3804808557033539, "learning_rate": 2.8307857098408975e-05, "loss": 0.0292, "step": 16055 }, { "epoch": 1.9291291291291293, "grad_norm": 0.33361175656318665, "learning_rate": 2.827952765542164e-05, "loss": 0.032, "step": 16060 }, { "epoch": 1.9297297297297298, "grad_norm": 0.5027971863746643, "learning_rate": 2.8251206804515235e-05, "loss": 0.0339, "step": 16065 }, { "epoch": 1.9303303303303303, "grad_norm": 0.3659258186817169, "learning_rate": 2.8222894556892786e-05, "loss": 0.03, "step": 16070 }, { "epoch": 1.930930930930931, "grad_norm": 0.3499879240989685, "learning_rate": 2.8194590923753944e-05, "loss": 0.0316, "step": 16075 }, { "epoch": 1.9315315315315316, "grad_norm": 0.38677674531936646, "learning_rate": 2.8166295916294884e-05, "loss": 0.0334, "step": 16080 }, { "epoch": 1.932132132132132, "grad_norm": 0.3069120943546295, "learning_rate": 2.8138009545708422e-05, "loss": 0.0278, "step": 16085 }, { "epoch": 1.9327327327327328, "grad_norm": 0.5202799439430237, "learning_rate": 2.810973182318395e-05, "loss": 0.0325, "step": 16090 }, { "epoch": 1.9333333333333333, "grad_norm": 0.345246285200119, "learning_rate": 2.808146275990744e-05, "loss": 0.0284, "step": 16095 }, { "epoch": 1.9339339339339339, "grad_norm": 0.3261059522628784, "learning_rate": 2.805320236706145e-05, "loss": 0.0322, "step": 16100 }, { "epoch": 1.9345345345345346, "grad_norm": 0.3136518895626068, "learning_rate": 2.8024950655825044e-05, "loss": 0.0314, "step": 16105 }, { "epoch": 1.9351351351351351, "grad_norm": 0.4654875695705414, "learning_rate": 2.799670763737393e-05, "loss": 0.0388, "step": 16110 }, { "epoch": 1.9357357357357357, "grad_norm": 0.3657342791557312, "learning_rate": 2.796847332288034e-05, "loss": 0.0321, "step": 16115 }, { "epoch": 1.9363363363363364, "grad_norm": 0.347788006067276, "learning_rate": 2.7940247723513096e-05, "loss": 0.0312, "step": 16120 }, { "epoch": 1.936936936936937, "grad_norm": 0.35710352659225464, "learning_rate": 2.79120308504375e-05, "loss": 0.0338, "step": 16125 }, { "epoch": 1.9375375375375374, "grad_norm": 0.2987470030784607, "learning_rate": 2.7883822714815494e-05, "loss": 0.0267, "step": 16130 }, { "epoch": 1.9381381381381382, "grad_norm": 0.42691853642463684, "learning_rate": 2.785562332780547e-05, "loss": 0.0326, "step": 16135 }, { "epoch": 1.9387387387387387, "grad_norm": 0.4340433180332184, "learning_rate": 2.7827432700562433e-05, "loss": 0.033, "step": 16140 }, { "epoch": 1.9393393393393392, "grad_norm": 0.4037269949913025, "learning_rate": 2.779925084423791e-05, "loss": 0.0281, "step": 16145 }, { "epoch": 1.93993993993994, "grad_norm": 0.33447209000587463, "learning_rate": 2.7771077769979925e-05, "loss": 0.0344, "step": 16150 }, { "epoch": 1.9405405405405407, "grad_norm": 0.43541401624679565, "learning_rate": 2.7742913488933042e-05, "loss": 0.0333, "step": 16155 }, { "epoch": 1.941141141141141, "grad_norm": 0.2964218258857727, "learning_rate": 2.771475801223837e-05, "loss": 0.0258, "step": 16160 }, { "epoch": 1.9417417417417417, "grad_norm": 0.3433111608028412, "learning_rate": 2.768661135103351e-05, "loss": 0.0274, "step": 16165 }, { "epoch": 1.9423423423423425, "grad_norm": 0.37273839116096497, "learning_rate": 2.765847351645261e-05, "loss": 0.0283, "step": 16170 }, { "epoch": 1.9429429429429428, "grad_norm": 0.3430827260017395, "learning_rate": 2.7630344519626255e-05, "loss": 0.0296, "step": 16175 }, { "epoch": 1.9435435435435435, "grad_norm": 0.5047633051872253, "learning_rate": 2.7602224371681605e-05, "loss": 0.0299, "step": 16180 }, { "epoch": 1.9441441441441443, "grad_norm": 0.3661489486694336, "learning_rate": 2.757411308374229e-05, "loss": 0.0277, "step": 16185 }, { "epoch": 1.9447447447447448, "grad_norm": 0.32280826568603516, "learning_rate": 2.7546010666928468e-05, "loss": 0.0357, "step": 16190 }, { "epoch": 1.9453453453453453, "grad_norm": 0.4482559859752655, "learning_rate": 2.7517917132356707e-05, "loss": 0.0333, "step": 16195 }, { "epoch": 1.945945945945946, "grad_norm": 0.3581385910511017, "learning_rate": 2.7489832491140138e-05, "loss": 0.0309, "step": 16200 }, { "epoch": 1.9465465465465466, "grad_norm": 0.3367556631565094, "learning_rate": 2.746175675438835e-05, "loss": 0.0302, "step": 16205 }, { "epoch": 1.947147147147147, "grad_norm": 0.4131978154182434, "learning_rate": 2.7433689933207407e-05, "loss": 0.0393, "step": 16210 }, { "epoch": 1.9477477477477478, "grad_norm": 0.4304249882698059, "learning_rate": 2.740563203869988e-05, "loss": 0.0301, "step": 16215 }, { "epoch": 1.9483483483483484, "grad_norm": 0.4164542257785797, "learning_rate": 2.737758308196472e-05, "loss": 0.0366, "step": 16220 }, { "epoch": 1.9489489489489489, "grad_norm": 0.6016157865524292, "learning_rate": 2.734954307409745e-05, "loss": 0.0341, "step": 16225 }, { "epoch": 1.9495495495495496, "grad_norm": 0.25138306617736816, "learning_rate": 2.7321512026189956e-05, "loss": 0.027, "step": 16230 }, { "epoch": 1.9501501501501501, "grad_norm": 0.28494128584861755, "learning_rate": 2.7293489949330653e-05, "loss": 0.0239, "step": 16235 }, { "epoch": 1.9507507507507507, "grad_norm": 0.3030911982059479, "learning_rate": 2.7265476854604398e-05, "loss": 0.0294, "step": 16240 }, { "epoch": 1.9513513513513514, "grad_norm": 0.3640327751636505, "learning_rate": 2.723747275309244e-05, "loss": 0.0315, "step": 16245 }, { "epoch": 1.951951951951952, "grad_norm": 0.3330000638961792, "learning_rate": 2.7209477655872527e-05, "loss": 0.0389, "step": 16250 }, { "epoch": 1.951951951951952, "eval_loss": 0.04595092684030533, "eval_runtime": 36.109, "eval_samples_per_second": 22.155, "eval_steps_per_second": 5.539, "step": 16250 }, { "epoch": 1.9525525525525524, "grad_norm": 0.3315175175666809, "learning_rate": 2.7181491574018825e-05, "loss": 0.041, "step": 16255 }, { "epoch": 1.9531531531531532, "grad_norm": 0.3240010440349579, "learning_rate": 2.715351451860195e-05, "loss": 0.0305, "step": 16260 }, { "epoch": 1.9537537537537537, "grad_norm": 0.32649537920951843, "learning_rate": 2.71255465006889e-05, "loss": 0.0316, "step": 16265 }, { "epoch": 1.9543543543543542, "grad_norm": 0.34833481907844543, "learning_rate": 2.7097587531343145e-05, "loss": 0.0373, "step": 16270 }, { "epoch": 1.954954954954955, "grad_norm": 0.28214502334594727, "learning_rate": 2.7069637621624565e-05, "loss": 0.0258, "step": 16275 }, { "epoch": 1.9555555555555557, "grad_norm": 0.3369249403476715, "learning_rate": 2.7041696782589442e-05, "loss": 0.0345, "step": 16280 }, { "epoch": 1.956156156156156, "grad_norm": 0.511701226234436, "learning_rate": 2.7013765025290516e-05, "loss": 0.0252, "step": 16285 }, { "epoch": 1.9567567567567568, "grad_norm": 0.31603947281837463, "learning_rate": 2.698584236077685e-05, "loss": 0.0311, "step": 16290 }, { "epoch": 1.9573573573573575, "grad_norm": 0.338832825422287, "learning_rate": 2.6957928800093978e-05, "loss": 0.0313, "step": 16295 }, { "epoch": 1.9579579579579578, "grad_norm": 0.31404685974121094, "learning_rate": 2.6930024354283816e-05, "loss": 0.0258, "step": 16300 }, { "epoch": 1.9585585585585585, "grad_norm": 0.4568083584308624, "learning_rate": 2.690212903438467e-05, "loss": 0.033, "step": 16305 }, { "epoch": 1.9591591591591593, "grad_norm": 0.40261292457580566, "learning_rate": 2.6874242851431253e-05, "loss": 0.0338, "step": 16310 }, { "epoch": 1.9597597597597598, "grad_norm": 0.30654647946357727, "learning_rate": 2.6846365816454623e-05, "loss": 0.0297, "step": 16315 }, { "epoch": 1.9603603603603603, "grad_norm": 0.32817405462265015, "learning_rate": 2.6818497940482266e-05, "loss": 0.0325, "step": 16320 }, { "epoch": 1.960960960960961, "grad_norm": 0.32023903727531433, "learning_rate": 2.6790639234537996e-05, "loss": 0.0272, "step": 16325 }, { "epoch": 1.9615615615615616, "grad_norm": 0.33003783226013184, "learning_rate": 2.6762789709642056e-05, "loss": 0.0277, "step": 16330 }, { "epoch": 1.962162162162162, "grad_norm": 0.2898213267326355, "learning_rate": 2.6734949376811004e-05, "loss": 0.0273, "step": 16335 }, { "epoch": 1.9627627627627628, "grad_norm": 0.3364682197570801, "learning_rate": 2.6707118247057793e-05, "loss": 0.0293, "step": 16340 }, { "epoch": 1.9633633633633634, "grad_norm": 0.3233550488948822, "learning_rate": 2.6679296331391733e-05, "loss": 0.0292, "step": 16345 }, { "epoch": 1.9639639639639639, "grad_norm": 0.27092820405960083, "learning_rate": 2.6651483640818488e-05, "loss": 0.0251, "step": 16350 }, { "epoch": 1.9645645645645646, "grad_norm": 0.3484068512916565, "learning_rate": 2.662368018634009e-05, "loss": 0.0302, "step": 16355 }, { "epoch": 1.9651651651651652, "grad_norm": 0.2783617675304413, "learning_rate": 2.659588597895485e-05, "loss": 0.0285, "step": 16360 }, { "epoch": 1.9657657657657657, "grad_norm": 0.32954466342926025, "learning_rate": 2.656810102965749e-05, "loss": 0.0298, "step": 16365 }, { "epoch": 1.9663663663663664, "grad_norm": 0.3614228069782257, "learning_rate": 2.6540325349439054e-05, "loss": 0.0335, "step": 16370 }, { "epoch": 1.966966966966967, "grad_norm": 0.3480527400970459, "learning_rate": 2.6512558949286903e-05, "loss": 0.0342, "step": 16375 }, { "epoch": 1.9675675675675675, "grad_norm": 0.3006151020526886, "learning_rate": 2.648480184018477e-05, "loss": 0.0299, "step": 16380 }, { "epoch": 1.9681681681681682, "grad_norm": 0.3213295638561249, "learning_rate": 2.645705403311261e-05, "loss": 0.0287, "step": 16385 }, { "epoch": 1.9687687687687687, "grad_norm": 0.3097591698169708, "learning_rate": 2.642931553904685e-05, "loss": 0.0268, "step": 16390 }, { "epoch": 1.9693693693693692, "grad_norm": 0.3181003928184509, "learning_rate": 2.6401586368960098e-05, "loss": 0.0301, "step": 16395 }, { "epoch": 1.96996996996997, "grad_norm": 0.2848129868507385, "learning_rate": 2.637386653382134e-05, "loss": 0.0246, "step": 16400 }, { "epoch": 1.9705705705705707, "grad_norm": 0.4496921896934509, "learning_rate": 2.634615604459587e-05, "loss": 0.0294, "step": 16405 }, { "epoch": 1.971171171171171, "grad_norm": 0.2925359308719635, "learning_rate": 2.6318454912245248e-05, "loss": 0.0222, "step": 16410 }, { "epoch": 1.9717717717717718, "grad_norm": 0.37273648381233215, "learning_rate": 2.6290763147727372e-05, "loss": 0.0302, "step": 16415 }, { "epoch": 1.9723723723723725, "grad_norm": 0.3598352074623108, "learning_rate": 2.626308076199642e-05, "loss": 0.0315, "step": 16420 }, { "epoch": 1.972972972972973, "grad_norm": 0.3557840883731842, "learning_rate": 2.623540776600284e-05, "loss": 0.026, "step": 16425 }, { "epoch": 1.9735735735735735, "grad_norm": 0.35025104880332947, "learning_rate": 2.6207744170693392e-05, "loss": 0.0288, "step": 16430 }, { "epoch": 1.9741741741741743, "grad_norm": 0.35492363572120667, "learning_rate": 2.6180089987011115e-05, "loss": 0.0268, "step": 16435 }, { "epoch": 1.9747747747747748, "grad_norm": 0.3412788510322571, "learning_rate": 2.615244522589534e-05, "loss": 0.0321, "step": 16440 }, { "epoch": 1.9753753753753753, "grad_norm": 0.3852511942386627, "learning_rate": 2.61248098982816e-05, "loss": 0.0299, "step": 16445 }, { "epoch": 1.975975975975976, "grad_norm": 0.4844917953014374, "learning_rate": 2.6097184015101772e-05, "loss": 0.0284, "step": 16450 }, { "epoch": 1.9765765765765766, "grad_norm": 0.2313963621854782, "learning_rate": 2.6069567587283977e-05, "loss": 0.0245, "step": 16455 }, { "epoch": 1.9771771771771771, "grad_norm": 0.29417532682418823, "learning_rate": 2.6041960625752582e-05, "loss": 0.0261, "step": 16460 }, { "epoch": 1.9777777777777779, "grad_norm": 0.2930136024951935, "learning_rate": 2.6014363141428242e-05, "loss": 0.0287, "step": 16465 }, { "epoch": 1.9783783783783784, "grad_norm": 0.3059934377670288, "learning_rate": 2.598677514522779e-05, "loss": 0.0293, "step": 16470 }, { "epoch": 1.978978978978979, "grad_norm": 0.39566734433174133, "learning_rate": 2.5959196648064392e-05, "loss": 0.031, "step": 16475 }, { "epoch": 1.9795795795795796, "grad_norm": 0.32728102803230286, "learning_rate": 2.5931627660847402e-05, "loss": 0.025, "step": 16480 }, { "epoch": 1.9801801801801802, "grad_norm": 0.35917210578918457, "learning_rate": 2.5904068194482455e-05, "loss": 0.0292, "step": 16485 }, { "epoch": 1.9807807807807807, "grad_norm": 0.32126685976982117, "learning_rate": 2.5876518259871353e-05, "loss": 0.027, "step": 16490 }, { "epoch": 1.9813813813813814, "grad_norm": 0.27140477299690247, "learning_rate": 2.58489778679122e-05, "loss": 0.0279, "step": 16495 }, { "epoch": 1.981981981981982, "grad_norm": 0.3503856658935547, "learning_rate": 2.582144702949927e-05, "loss": 0.0279, "step": 16500 }, { "epoch": 1.981981981981982, "eval_loss": 0.044447824358940125, "eval_runtime": 35.9264, "eval_samples_per_second": 22.268, "eval_steps_per_second": 5.567, "step": 16500 }, { "epoch": 1.9825825825825825, "grad_norm": 0.38946840167045593, "learning_rate": 2.579392575552308e-05, "loss": 0.0249, "step": 16505 }, { "epoch": 1.9831831831831832, "grad_norm": 0.3265872001647949, "learning_rate": 2.57664140568704e-05, "loss": 0.0276, "step": 16510 }, { "epoch": 1.983783783783784, "grad_norm": 0.3320499062538147, "learning_rate": 2.5738911944424133e-05, "loss": 0.0317, "step": 16515 }, { "epoch": 1.9843843843843842, "grad_norm": 0.27032506465911865, "learning_rate": 2.571141942906345e-05, "loss": 0.0295, "step": 16520 }, { "epoch": 1.984984984984985, "grad_norm": 0.3871995210647583, "learning_rate": 2.568393652166371e-05, "loss": 0.0286, "step": 16525 }, { "epoch": 1.9855855855855857, "grad_norm": 0.3653221130371094, "learning_rate": 2.5656463233096474e-05, "loss": 0.0258, "step": 16530 }, { "epoch": 1.986186186186186, "grad_norm": 0.3507516086101532, "learning_rate": 2.562899957422952e-05, "loss": 0.0281, "step": 16535 }, { "epoch": 1.9867867867867868, "grad_norm": 0.32332906126976013, "learning_rate": 2.5601545555926733e-05, "loss": 0.0273, "step": 16540 }, { "epoch": 1.9873873873873875, "grad_norm": 0.41225871443748474, "learning_rate": 2.5574101189048276e-05, "loss": 0.0265, "step": 16545 }, { "epoch": 1.987987987987988, "grad_norm": 0.33340364694595337, "learning_rate": 2.554666648445046e-05, "loss": 0.033, "step": 16550 }, { "epoch": 1.9885885885885886, "grad_norm": 0.4418809413909912, "learning_rate": 2.5519241452985777e-05, "loss": 0.0248, "step": 16555 }, { "epoch": 1.9891891891891893, "grad_norm": 0.30932164192199707, "learning_rate": 2.5491826105502897e-05, "loss": 0.0297, "step": 16560 }, { "epoch": 1.9897897897897898, "grad_norm": 0.33863547444343567, "learning_rate": 2.5464420452846627e-05, "loss": 0.0302, "step": 16565 }, { "epoch": 1.9903903903903903, "grad_norm": 0.3218494653701782, "learning_rate": 2.5437024505857983e-05, "loss": 0.0309, "step": 16570 }, { "epoch": 1.990990990990991, "grad_norm": 0.3258943259716034, "learning_rate": 2.5409638275374113e-05, "loss": 0.0286, "step": 16575 }, { "epoch": 1.9915915915915916, "grad_norm": 0.34837913513183594, "learning_rate": 2.538226177222836e-05, "loss": 0.0251, "step": 16580 }, { "epoch": 1.9921921921921921, "grad_norm": 0.37401098012924194, "learning_rate": 2.535489500725015e-05, "loss": 0.0306, "step": 16585 }, { "epoch": 1.9927927927927929, "grad_norm": 0.2671530246734619, "learning_rate": 2.5327537991265137e-05, "loss": 0.0261, "step": 16590 }, { "epoch": 1.9933933933933934, "grad_norm": 0.3131420910358429, "learning_rate": 2.5300190735095038e-05, "loss": 0.0252, "step": 16595 }, { "epoch": 1.993993993993994, "grad_norm": 0.28659263253211975, "learning_rate": 2.527285324955777e-05, "loss": 0.0333, "step": 16600 }, { "epoch": 1.9945945945945946, "grad_norm": 0.37140652537345886, "learning_rate": 2.524552554546738e-05, "loss": 0.0271, "step": 16605 }, { "epoch": 1.9951951951951952, "grad_norm": 0.3571425974369049, "learning_rate": 2.5218207633634005e-05, "loss": 0.0267, "step": 16610 }, { "epoch": 1.9957957957957957, "grad_norm": 0.2770904004573822, "learning_rate": 2.5190899524863942e-05, "loss": 0.0253, "step": 16615 }, { "epoch": 1.9963963963963964, "grad_norm": 0.33116596937179565, "learning_rate": 2.5163601229959606e-05, "loss": 0.0285, "step": 16620 }, { "epoch": 1.996996996996997, "grad_norm": 0.4822072684764862, "learning_rate": 2.5136312759719525e-05, "loss": 0.0308, "step": 16625 }, { "epoch": 1.9975975975975975, "grad_norm": 0.3068985641002655, "learning_rate": 2.510903412493837e-05, "loss": 0.0302, "step": 16630 }, { "epoch": 1.9981981981981982, "grad_norm": 0.267890989780426, "learning_rate": 2.5081765336406838e-05, "loss": 0.025, "step": 16635 }, { "epoch": 1.998798798798799, "grad_norm": 0.3101384937763214, "learning_rate": 2.5054506404911827e-05, "loss": 0.0257, "step": 16640 }, { "epoch": 1.9993993993993993, "grad_norm": 0.3918618857860565, "learning_rate": 2.5027257341236275e-05, "loss": 0.0303, "step": 16645 }, { "epoch": 2.0, "grad_norm": 0.43106088042259216, "learning_rate": 2.5000018156159266e-05, "loss": 0.0318, "step": 16650 }, { "epoch": 2.0006006006006007, "grad_norm": 0.3042386472225189, "learning_rate": 2.497278886045591e-05, "loss": 0.0189, "step": 16655 }, { "epoch": 2.001201201201201, "grad_norm": 0.2652837932109833, "learning_rate": 2.4945569464897458e-05, "loss": 0.02, "step": 16660 }, { "epoch": 2.001801801801802, "grad_norm": 0.21576140820980072, "learning_rate": 2.4918359980251226e-05, "loss": 0.018, "step": 16665 }, { "epoch": 2.0024024024024025, "grad_norm": 0.2946082055568695, "learning_rate": 2.4891160417280617e-05, "loss": 0.018, "step": 16670 }, { "epoch": 2.003003003003003, "grad_norm": 0.2486870139837265, "learning_rate": 2.486397078674513e-05, "loss": 0.0184, "step": 16675 }, { "epoch": 2.0036036036036036, "grad_norm": 0.21376429498195648, "learning_rate": 2.4836791099400253e-05, "loss": 0.0185, "step": 16680 }, { "epoch": 2.0042042042042043, "grad_norm": 0.4368433356285095, "learning_rate": 2.480962136599765e-05, "loss": 0.021, "step": 16685 }, { "epoch": 2.0048048048048046, "grad_norm": 0.22392675280570984, "learning_rate": 2.478246159728495e-05, "loss": 0.0178, "step": 16690 }, { "epoch": 2.0054054054054054, "grad_norm": 0.3195738196372986, "learning_rate": 2.475531180400591e-05, "loss": 0.0177, "step": 16695 }, { "epoch": 2.006006006006006, "grad_norm": 0.27009859681129456, "learning_rate": 2.472817199690033e-05, "loss": 0.0153, "step": 16700 }, { "epoch": 2.0066066066066064, "grad_norm": 0.26511505246162415, "learning_rate": 2.470104218670401e-05, "loss": 0.02, "step": 16705 }, { "epoch": 2.007207207207207, "grad_norm": 0.2634662389755249, "learning_rate": 2.4673922384148847e-05, "loss": 0.0211, "step": 16710 }, { "epoch": 2.007807807807808, "grad_norm": 0.2865537106990814, "learning_rate": 2.4646812599962766e-05, "loss": 0.019, "step": 16715 }, { "epoch": 2.0084084084084086, "grad_norm": 0.38468220829963684, "learning_rate": 2.461971284486974e-05, "loss": 0.0171, "step": 16720 }, { "epoch": 2.009009009009009, "grad_norm": 0.20631487667560577, "learning_rate": 2.459262312958973e-05, "loss": 0.0165, "step": 16725 }, { "epoch": 2.0096096096096097, "grad_norm": 0.2423275113105774, "learning_rate": 2.456554346483877e-05, "loss": 0.0169, "step": 16730 }, { "epoch": 2.0102102102102104, "grad_norm": 0.3133910000324249, "learning_rate": 2.453847386132891e-05, "loss": 0.0179, "step": 16735 }, { "epoch": 2.0108108108108107, "grad_norm": 0.24423719942569733, "learning_rate": 2.451141432976821e-05, "loss": 0.0178, "step": 16740 }, { "epoch": 2.0114114114114114, "grad_norm": 0.37259042263031006, "learning_rate": 2.4484364880860777e-05, "loss": 0.0183, "step": 16745 }, { "epoch": 2.012012012012012, "grad_norm": 0.2786225378513336, "learning_rate": 2.445732552530665e-05, "loss": 0.0175, "step": 16750 }, { "epoch": 2.012012012012012, "eval_loss": 0.04507996514439583, "eval_runtime": 35.9044, "eval_samples_per_second": 22.281, "eval_steps_per_second": 5.57, "step": 16750 }, { "epoch": 2.0126126126126125, "grad_norm": 0.3350470960140228, "learning_rate": 2.4430296273801968e-05, "loss": 0.0201, "step": 16755 }, { "epoch": 2.0132132132132132, "grad_norm": 0.1945425271987915, "learning_rate": 2.4403277137038815e-05, "loss": 0.0197, "step": 16760 }, { "epoch": 2.013813813813814, "grad_norm": 0.29737523198127747, "learning_rate": 2.4376268125705322e-05, "loss": 0.0177, "step": 16765 }, { "epoch": 2.0144144144144143, "grad_norm": 0.3143198788166046, "learning_rate": 2.434926925048554e-05, "loss": 0.0165, "step": 16770 }, { "epoch": 2.015015015015015, "grad_norm": 0.2871094048023224, "learning_rate": 2.4322280522059583e-05, "loss": 0.0153, "step": 16775 }, { "epoch": 2.0156156156156158, "grad_norm": 0.3184792399406433, "learning_rate": 2.4295301951103532e-05, "loss": 0.0195, "step": 16780 }, { "epoch": 2.016216216216216, "grad_norm": 0.18864959478378296, "learning_rate": 2.4268333548289417e-05, "loss": 0.0198, "step": 16785 }, { "epoch": 2.016816816816817, "grad_norm": 0.25299298763275146, "learning_rate": 2.4241375324285276e-05, "loss": 0.0175, "step": 16790 }, { "epoch": 2.0174174174174175, "grad_norm": 0.31384697556495667, "learning_rate": 2.4214427289755142e-05, "loss": 0.0177, "step": 16795 }, { "epoch": 2.018018018018018, "grad_norm": 0.24413210153579712, "learning_rate": 2.4187489455358948e-05, "loss": 0.0175, "step": 16800 }, { "epoch": 2.0186186186186186, "grad_norm": 0.23016425967216492, "learning_rate": 2.4160561831752653e-05, "loss": 0.0156, "step": 16805 }, { "epoch": 2.0192192192192193, "grad_norm": 0.35870787501335144, "learning_rate": 2.4133644429588164e-05, "loss": 0.0166, "step": 16810 }, { "epoch": 2.0198198198198196, "grad_norm": 0.22207961976528168, "learning_rate": 2.410673725951335e-05, "loss": 0.0173, "step": 16815 }, { "epoch": 2.0204204204204204, "grad_norm": 0.2038796991109848, "learning_rate": 2.407984033217199e-05, "loss": 0.0167, "step": 16820 }, { "epoch": 2.021021021021021, "grad_norm": 0.22484976053237915, "learning_rate": 2.405295365820385e-05, "loss": 0.0174, "step": 16825 }, { "epoch": 2.0216216216216214, "grad_norm": 0.20852796733379364, "learning_rate": 2.4026077248244642e-05, "loss": 0.0174, "step": 16830 }, { "epoch": 2.022222222222222, "grad_norm": 0.23906341195106506, "learning_rate": 2.399921111292601e-05, "loss": 0.0161, "step": 16835 }, { "epoch": 2.022822822822823, "grad_norm": 0.21950270235538483, "learning_rate": 2.3972355262875545e-05, "loss": 0.0167, "step": 16840 }, { "epoch": 2.0234234234234236, "grad_norm": 0.23244847357273102, "learning_rate": 2.3945509708716723e-05, "loss": 0.0173, "step": 16845 }, { "epoch": 2.024024024024024, "grad_norm": 0.3130114674568176, "learning_rate": 2.391867446106899e-05, "loss": 0.0186, "step": 16850 }, { "epoch": 2.0246246246246247, "grad_norm": 0.30832698941230774, "learning_rate": 2.389184953054772e-05, "loss": 0.015, "step": 16855 }, { "epoch": 2.0252252252252254, "grad_norm": 0.2931411564350128, "learning_rate": 2.3865034927764195e-05, "loss": 0.0172, "step": 16860 }, { "epoch": 2.0258258258258257, "grad_norm": 0.2423478215932846, "learning_rate": 2.3838230663325582e-05, "loss": 0.0184, "step": 16865 }, { "epoch": 2.0264264264264265, "grad_norm": 0.23603129386901855, "learning_rate": 2.3811436747835014e-05, "loss": 0.0181, "step": 16870 }, { "epoch": 2.027027027027027, "grad_norm": 0.23850959539413452, "learning_rate": 2.3784653191891466e-05, "loss": 0.0147, "step": 16875 }, { "epoch": 2.0276276276276275, "grad_norm": 0.19226299226284027, "learning_rate": 2.3757880006089866e-05, "loss": 0.0179, "step": 16880 }, { "epoch": 2.0282282282282282, "grad_norm": 0.1841306984424591, "learning_rate": 2.3731117201021048e-05, "loss": 0.0159, "step": 16885 }, { "epoch": 2.028828828828829, "grad_norm": 0.2256769835948944, "learning_rate": 2.370436478727167e-05, "loss": 0.016, "step": 16890 }, { "epoch": 2.0294294294294293, "grad_norm": 0.1400245726108551, "learning_rate": 2.3677622775424347e-05, "loss": 0.0113, "step": 16895 }, { "epoch": 2.03003003003003, "grad_norm": 0.30644339323043823, "learning_rate": 2.3650891176057556e-05, "loss": 0.0197, "step": 16900 }, { "epoch": 2.0306306306306308, "grad_norm": 0.2984524071216583, "learning_rate": 2.3624169999745654e-05, "loss": 0.0161, "step": 16905 }, { "epoch": 2.031231231231231, "grad_norm": 0.30133652687072754, "learning_rate": 2.3597459257058897e-05, "loss": 0.0179, "step": 16910 }, { "epoch": 2.031831831831832, "grad_norm": 0.3699958622455597, "learning_rate": 2.3570758958563354e-05, "loss": 0.0203, "step": 16915 }, { "epoch": 2.0324324324324325, "grad_norm": 0.22517237067222595, "learning_rate": 2.3544069114821027e-05, "loss": 0.0166, "step": 16920 }, { "epoch": 2.033033033033033, "grad_norm": 0.24874772131443024, "learning_rate": 2.351738973638975e-05, "loss": 0.0198, "step": 16925 }, { "epoch": 2.0336336336336336, "grad_norm": 0.26324090361595154, "learning_rate": 2.3490720833823228e-05, "loss": 0.0152, "step": 16930 }, { "epoch": 2.0342342342342343, "grad_norm": 0.22640137374401093, "learning_rate": 2.3464062417671035e-05, "loss": 0.016, "step": 16935 }, { "epoch": 2.0348348348348346, "grad_norm": 0.19408166408538818, "learning_rate": 2.3437414498478542e-05, "loss": 0.0179, "step": 16940 }, { "epoch": 2.0354354354354354, "grad_norm": 0.25801095366477966, "learning_rate": 2.341077708678703e-05, "loss": 0.0179, "step": 16945 }, { "epoch": 2.036036036036036, "grad_norm": 0.1984955370426178, "learning_rate": 2.3384150193133587e-05, "loss": 0.0196, "step": 16950 }, { "epoch": 2.036636636636637, "grad_norm": 0.3280339241027832, "learning_rate": 2.3357533828051188e-05, "loss": 0.0172, "step": 16955 }, { "epoch": 2.037237237237237, "grad_norm": 0.4044642150402069, "learning_rate": 2.333092800206856e-05, "loss": 0.0177, "step": 16960 }, { "epoch": 2.037837837837838, "grad_norm": 0.3203864097595215, "learning_rate": 2.330433272571035e-05, "loss": 0.0176, "step": 16965 }, { "epoch": 2.0384384384384386, "grad_norm": 0.19249285757541656, "learning_rate": 2.3277748009496947e-05, "loss": 0.0172, "step": 16970 }, { "epoch": 2.039039039039039, "grad_norm": 0.3457205891609192, "learning_rate": 2.3251173863944636e-05, "loss": 0.017, "step": 16975 }, { "epoch": 2.0396396396396397, "grad_norm": 0.24442507326602936, "learning_rate": 2.3224610299565503e-05, "loss": 0.0149, "step": 16980 }, { "epoch": 2.0402402402402404, "grad_norm": 0.2592158317565918, "learning_rate": 2.3198057326867395e-05, "loss": 0.0189, "step": 16985 }, { "epoch": 2.0408408408408407, "grad_norm": 0.3123731315135956, "learning_rate": 2.3171514956354033e-05, "loss": 0.0167, "step": 16990 }, { "epoch": 2.0414414414414415, "grad_norm": 0.28269627690315247, "learning_rate": 2.314498319852493e-05, "loss": 0.0174, "step": 16995 }, { "epoch": 2.042042042042042, "grad_norm": 0.3151399493217468, "learning_rate": 2.3118462063875373e-05, "loss": 0.0156, "step": 17000 }, { "epoch": 2.042042042042042, "eval_loss": 0.04410775005817413, "eval_runtime": 35.8994, "eval_samples_per_second": 22.284, "eval_steps_per_second": 5.571, "step": 17000 }, { "epoch": 2.0426426426426425, "grad_norm": 0.22928915917873383, "learning_rate": 2.3091951562896502e-05, "loss": 0.0169, "step": 17005 }, { "epoch": 2.0432432432432432, "grad_norm": 0.22066111862659454, "learning_rate": 2.306545170607517e-05, "loss": 0.0166, "step": 17010 }, { "epoch": 2.043843843843844, "grad_norm": 0.16600383818149567, "learning_rate": 2.3038962503894086e-05, "loss": 0.0153, "step": 17015 }, { "epoch": 2.0444444444444443, "grad_norm": 0.21326720714569092, "learning_rate": 2.3012483966831718e-05, "loss": 0.0159, "step": 17020 }, { "epoch": 2.045045045045045, "grad_norm": 0.3878456652164459, "learning_rate": 2.298601610536234e-05, "loss": 0.017, "step": 17025 }, { "epoch": 2.0456456456456458, "grad_norm": 0.23753908276557922, "learning_rate": 2.2959558929955943e-05, "loss": 0.0151, "step": 17030 }, { "epoch": 2.046246246246246, "grad_norm": 0.25556668639183044, "learning_rate": 2.293311245107836e-05, "loss": 0.0162, "step": 17035 }, { "epoch": 2.046846846846847, "grad_norm": 0.29425516724586487, "learning_rate": 2.2906676679191146e-05, "loss": 0.0191, "step": 17040 }, { "epoch": 2.0474474474474476, "grad_norm": 0.22631238400936127, "learning_rate": 2.288025162475165e-05, "loss": 0.0164, "step": 17045 }, { "epoch": 2.048048048048048, "grad_norm": 0.2662695348262787, "learning_rate": 2.285383729821298e-05, "loss": 0.0155, "step": 17050 }, { "epoch": 2.0486486486486486, "grad_norm": 0.31905195116996765, "learning_rate": 2.2827433710023967e-05, "loss": 0.0171, "step": 17055 }, { "epoch": 2.0492492492492493, "grad_norm": 0.27785614132881165, "learning_rate": 2.2801040870629232e-05, "loss": 0.0163, "step": 17060 }, { "epoch": 2.0498498498498496, "grad_norm": 0.19194890558719635, "learning_rate": 2.2774658790469106e-05, "loss": 0.0152, "step": 17065 }, { "epoch": 2.0504504504504504, "grad_norm": 0.32209452986717224, "learning_rate": 2.27482874799797e-05, "loss": 0.0193, "step": 17070 }, { "epoch": 2.051051051051051, "grad_norm": 0.2572319507598877, "learning_rate": 2.2721926949592877e-05, "loss": 0.0146, "step": 17075 }, { "epoch": 2.051651651651652, "grad_norm": 0.3050822913646698, "learning_rate": 2.2695577209736163e-05, "loss": 0.0177, "step": 17080 }, { "epoch": 2.052252252252252, "grad_norm": 0.1752171516418457, "learning_rate": 2.2669238270832883e-05, "loss": 0.0138, "step": 17085 }, { "epoch": 2.052852852852853, "grad_norm": 0.32753434777259827, "learning_rate": 2.264291014330207e-05, "loss": 0.0202, "step": 17090 }, { "epoch": 2.0534534534534536, "grad_norm": 0.2691376209259033, "learning_rate": 2.2616592837558502e-05, "loss": 0.018, "step": 17095 }, { "epoch": 2.054054054054054, "grad_norm": 0.27995845675468445, "learning_rate": 2.259028636401262e-05, "loss": 0.0186, "step": 17100 }, { "epoch": 2.0546546546546547, "grad_norm": 0.20545314252376556, "learning_rate": 2.2563990733070616e-05, "loss": 0.0143, "step": 17105 }, { "epoch": 2.0552552552552554, "grad_norm": 0.24307379126548767, "learning_rate": 2.2537705955134402e-05, "loss": 0.0156, "step": 17110 }, { "epoch": 2.0558558558558557, "grad_norm": 0.2568596303462982, "learning_rate": 2.251143204060159e-05, "loss": 0.0161, "step": 17115 }, { "epoch": 2.0564564564564565, "grad_norm": 0.3136351704597473, "learning_rate": 2.2485168999865493e-05, "loss": 0.0154, "step": 17120 }, { "epoch": 2.057057057057057, "grad_norm": 0.2579204738140106, "learning_rate": 2.2458916843315092e-05, "loss": 0.0171, "step": 17125 }, { "epoch": 2.0576576576576575, "grad_norm": 0.2734680771827698, "learning_rate": 2.2432675581335105e-05, "loss": 0.0187, "step": 17130 }, { "epoch": 2.0582582582582583, "grad_norm": 0.2730713486671448, "learning_rate": 2.2406445224305928e-05, "loss": 0.0164, "step": 17135 }, { "epoch": 2.058858858858859, "grad_norm": 0.29078954458236694, "learning_rate": 2.2380225782603665e-05, "loss": 0.0169, "step": 17140 }, { "epoch": 2.0594594594594593, "grad_norm": 0.21432556211948395, "learning_rate": 2.2354017266600032e-05, "loss": 0.0184, "step": 17145 }, { "epoch": 2.06006006006006, "grad_norm": 0.26164016127586365, "learning_rate": 2.2327819686662504e-05, "loss": 0.0157, "step": 17150 }, { "epoch": 2.060660660660661, "grad_norm": 0.27027854323387146, "learning_rate": 2.2301633053154164e-05, "loss": 0.0165, "step": 17155 }, { "epoch": 2.061261261261261, "grad_norm": 0.269004762172699, "learning_rate": 2.227545737643381e-05, "loss": 0.0159, "step": 17160 }, { "epoch": 2.061861861861862, "grad_norm": 0.2210037261247635, "learning_rate": 2.2249292666855916e-05, "loss": 0.0153, "step": 17165 }, { "epoch": 2.0624624624624626, "grad_norm": 0.3236193358898163, "learning_rate": 2.222313893477055e-05, "loss": 0.0172, "step": 17170 }, { "epoch": 2.063063063063063, "grad_norm": 0.25057151913642883, "learning_rate": 2.2196996190523507e-05, "loss": 0.0166, "step": 17175 }, { "epoch": 2.0636636636636636, "grad_norm": 0.27561861276626587, "learning_rate": 2.2170864444456195e-05, "loss": 0.0151, "step": 17180 }, { "epoch": 2.0642642642642643, "grad_norm": 0.2706261873245239, "learning_rate": 2.21447437069057e-05, "loss": 0.0189, "step": 17185 }, { "epoch": 2.064864864864865, "grad_norm": 0.3415422737598419, "learning_rate": 2.2118633988204753e-05, "loss": 0.0158, "step": 17190 }, { "epoch": 2.0654654654654654, "grad_norm": 0.2783031761646271, "learning_rate": 2.2092535298681667e-05, "loss": 0.0182, "step": 17195 }, { "epoch": 2.066066066066066, "grad_norm": 0.2432776540517807, "learning_rate": 2.2066447648660465e-05, "loss": 0.0205, "step": 17200 }, { "epoch": 2.066666666666667, "grad_norm": 0.236969456076622, "learning_rate": 2.2040371048460778e-05, "loss": 0.0173, "step": 17205 }, { "epoch": 2.067267267267267, "grad_norm": 0.23664186894893646, "learning_rate": 2.2014305508397853e-05, "loss": 0.0164, "step": 17210 }, { "epoch": 2.067867867867868, "grad_norm": 0.31544530391693115, "learning_rate": 2.1988251038782593e-05, "loss": 0.0168, "step": 17215 }, { "epoch": 2.0684684684684687, "grad_norm": 0.18375684320926666, "learning_rate": 2.1962207649921472e-05, "loss": 0.0166, "step": 17220 }, { "epoch": 2.069069069069069, "grad_norm": 0.24297571182250977, "learning_rate": 2.193617535211662e-05, "loss": 0.0164, "step": 17225 }, { "epoch": 2.0696696696696697, "grad_norm": 0.3058975040912628, "learning_rate": 2.1910154155665774e-05, "loss": 0.0182, "step": 17230 }, { "epoch": 2.0702702702702704, "grad_norm": 0.3959430158138275, "learning_rate": 2.1884144070862288e-05, "loss": 0.0188, "step": 17235 }, { "epoch": 2.0708708708708707, "grad_norm": 0.31866326928138733, "learning_rate": 2.1858145107995078e-05, "loss": 0.0191, "step": 17240 }, { "epoch": 2.0714714714714715, "grad_norm": 0.2213527113199234, "learning_rate": 2.183215727734872e-05, "loss": 0.0163, "step": 17245 }, { "epoch": 2.0720720720720722, "grad_norm": 0.30789023637771606, "learning_rate": 2.1806180589203318e-05, "loss": 0.0191, "step": 17250 }, { "epoch": 2.0720720720720722, "eval_loss": 0.04393164813518524, "eval_runtime": 35.9507, "eval_samples_per_second": 22.253, "eval_steps_per_second": 5.563, "step": 17250 }, { "epoch": 2.0726726726726725, "grad_norm": 0.37385308742523193, "learning_rate": 2.1780215053834635e-05, "loss": 0.0165, "step": 17255 }, { "epoch": 2.0732732732732733, "grad_norm": 0.2261199951171875, "learning_rate": 2.1754260681514e-05, "loss": 0.0168, "step": 17260 }, { "epoch": 2.073873873873874, "grad_norm": 0.2935144901275635, "learning_rate": 2.1728317482508293e-05, "loss": 0.0171, "step": 17265 }, { "epoch": 2.0744744744744743, "grad_norm": 0.25347235798835754, "learning_rate": 2.170238546708001e-05, "loss": 0.0172, "step": 17270 }, { "epoch": 2.075075075075075, "grad_norm": 0.30140021443367004, "learning_rate": 2.1676464645487227e-05, "loss": 0.0142, "step": 17275 }, { "epoch": 2.075675675675676, "grad_norm": 0.26232364773750305, "learning_rate": 2.1650555027983566e-05, "loss": 0.0169, "step": 17280 }, { "epoch": 2.076276276276276, "grad_norm": 0.2933792471885681, "learning_rate": 2.1624656624818258e-05, "loss": 0.0152, "step": 17285 }, { "epoch": 2.076876876876877, "grad_norm": 0.23124483227729797, "learning_rate": 2.159876944623602e-05, "loss": 0.0175, "step": 17290 }, { "epoch": 2.0774774774774776, "grad_norm": 0.18243825435638428, "learning_rate": 2.1572893502477216e-05, "loss": 0.0142, "step": 17295 }, { "epoch": 2.078078078078078, "grad_norm": 0.2552858293056488, "learning_rate": 2.1547028803777718e-05, "loss": 0.0152, "step": 17300 }, { "epoch": 2.0786786786786786, "grad_norm": 0.19179783761501312, "learning_rate": 2.1521175360368956e-05, "loss": 0.0148, "step": 17305 }, { "epoch": 2.0792792792792794, "grad_norm": 0.2637120187282562, "learning_rate": 2.149533318247794e-05, "loss": 0.0159, "step": 17310 }, { "epoch": 2.07987987987988, "grad_norm": 0.34350207448005676, "learning_rate": 2.1469502280327147e-05, "loss": 0.02, "step": 17315 }, { "epoch": 2.0804804804804804, "grad_norm": 0.32478630542755127, "learning_rate": 2.1443682664134675e-05, "loss": 0.0219, "step": 17320 }, { "epoch": 2.081081081081081, "grad_norm": 0.25575026869773865, "learning_rate": 2.1417874344114114e-05, "loss": 0.0172, "step": 17325 }, { "epoch": 2.081681681681682, "grad_norm": 0.23104359209537506, "learning_rate": 2.139207733047462e-05, "loss": 0.0223, "step": 17330 }, { "epoch": 2.082282282282282, "grad_norm": 0.2576219439506531, "learning_rate": 2.136629163342081e-05, "loss": 0.0164, "step": 17335 }, { "epoch": 2.082882882882883, "grad_norm": 0.259834885597229, "learning_rate": 2.134051726315291e-05, "loss": 0.0147, "step": 17340 }, { "epoch": 2.0834834834834837, "grad_norm": 0.26843321323394775, "learning_rate": 2.131475422986658e-05, "loss": 0.0172, "step": 17345 }, { "epoch": 2.084084084084084, "grad_norm": 0.2459273487329483, "learning_rate": 2.1289002543753062e-05, "loss": 0.0174, "step": 17350 }, { "epoch": 2.0846846846846847, "grad_norm": 0.29039624333381653, "learning_rate": 2.1263262214999103e-05, "loss": 0.0167, "step": 17355 }, { "epoch": 2.0852852852852855, "grad_norm": 0.31065475940704346, "learning_rate": 2.1237533253786902e-05, "loss": 0.0172, "step": 17360 }, { "epoch": 2.0858858858858857, "grad_norm": 0.2677178680896759, "learning_rate": 2.121181567029421e-05, "loss": 0.0166, "step": 17365 }, { "epoch": 2.0864864864864865, "grad_norm": 0.22189515829086304, "learning_rate": 2.1186109474694277e-05, "loss": 0.0176, "step": 17370 }, { "epoch": 2.0870870870870872, "grad_norm": 0.27094221115112305, "learning_rate": 2.116041467715583e-05, "loss": 0.0186, "step": 17375 }, { "epoch": 2.0876876876876875, "grad_norm": 0.2697623074054718, "learning_rate": 2.1134731287843125e-05, "loss": 0.0194, "step": 17380 }, { "epoch": 2.0882882882882883, "grad_norm": 0.32814136147499084, "learning_rate": 2.1109059316915817e-05, "loss": 0.0207, "step": 17385 }, { "epoch": 2.088888888888889, "grad_norm": 0.32178542017936707, "learning_rate": 2.108339877452914e-05, "loss": 0.017, "step": 17390 }, { "epoch": 2.0894894894894893, "grad_norm": 0.29471707344055176, "learning_rate": 2.105774967083376e-05, "loss": 0.0168, "step": 17395 }, { "epoch": 2.09009009009009, "grad_norm": 0.252353698015213, "learning_rate": 2.1032112015975845e-05, "loss": 0.0166, "step": 17400 }, { "epoch": 2.090690690690691, "grad_norm": 0.24682864546775818, "learning_rate": 2.1006485820096983e-05, "loss": 0.0163, "step": 17405 }, { "epoch": 2.091291291291291, "grad_norm": 0.2174302339553833, "learning_rate": 2.0980871093334274e-05, "loss": 0.0166, "step": 17410 }, { "epoch": 2.091891891891892, "grad_norm": 0.2508426010608673, "learning_rate": 2.095526784582028e-05, "loss": 0.0176, "step": 17415 }, { "epoch": 2.0924924924924926, "grad_norm": 0.19831790030002594, "learning_rate": 2.092967608768301e-05, "loss": 0.0141, "step": 17420 }, { "epoch": 2.093093093093093, "grad_norm": 0.2547295391559601, "learning_rate": 2.0904095829045933e-05, "loss": 0.0178, "step": 17425 }, { "epoch": 2.0936936936936936, "grad_norm": 0.3118404746055603, "learning_rate": 2.0878527080027948e-05, "loss": 0.0157, "step": 17430 }, { "epoch": 2.0942942942942944, "grad_norm": 0.18359223008155823, "learning_rate": 2.0852969850743447e-05, "loss": 0.0169, "step": 17435 }, { "epoch": 2.094894894894895, "grad_norm": 0.2618004083633423, "learning_rate": 2.08274241513022e-05, "loss": 0.0181, "step": 17440 }, { "epoch": 2.0954954954954954, "grad_norm": 0.33545413613319397, "learning_rate": 2.0801889991809477e-05, "loss": 0.0181, "step": 17445 }, { "epoch": 2.096096096096096, "grad_norm": 0.1954948455095291, "learning_rate": 2.077636738236597e-05, "loss": 0.0124, "step": 17450 }, { "epoch": 2.096696696696697, "grad_norm": 0.2551083564758301, "learning_rate": 2.075085633306776e-05, "loss": 0.0128, "step": 17455 }, { "epoch": 2.097297297297297, "grad_norm": 0.1947358250617981, "learning_rate": 2.0725356854006396e-05, "loss": 0.0163, "step": 17460 }, { "epoch": 2.097897897897898, "grad_norm": 0.24798135459423065, "learning_rate": 2.0699868955268854e-05, "loss": 0.0182, "step": 17465 }, { "epoch": 2.0984984984984987, "grad_norm": 0.12923765182495117, "learning_rate": 2.067439264693752e-05, "loss": 0.0156, "step": 17470 }, { "epoch": 2.099099099099099, "grad_norm": 0.21395176649093628, "learning_rate": 2.0648927939090164e-05, "loss": 0.0142, "step": 17475 }, { "epoch": 2.0996996996996997, "grad_norm": 0.284847229719162, "learning_rate": 2.0623474841800007e-05, "loss": 0.0164, "step": 17480 }, { "epoch": 2.1003003003003005, "grad_norm": 0.38137951493263245, "learning_rate": 2.0598033365135665e-05, "loss": 0.0157, "step": 17485 }, { "epoch": 2.1009009009009008, "grad_norm": 0.3931626081466675, "learning_rate": 2.057260351916116e-05, "loss": 0.0167, "step": 17490 }, { "epoch": 2.1015015015015015, "grad_norm": 0.2608947157859802, "learning_rate": 2.0547185313935924e-05, "loss": 0.0152, "step": 17495 }, { "epoch": 2.1021021021021022, "grad_norm": 0.21299488842487335, "learning_rate": 2.0521778759514732e-05, "loss": 0.0153, "step": 17500 }, { "epoch": 2.1021021021021022, "eval_loss": 0.04359756410121918, "eval_runtime": 35.8185, "eval_samples_per_second": 22.335, "eval_steps_per_second": 5.584, "step": 17500 }, { "epoch": 2.1027027027027025, "grad_norm": 0.2490372657775879, "learning_rate": 2.0496383865947806e-05, "loss": 0.0143, "step": 17505 }, { "epoch": 2.1033033033033033, "grad_norm": 0.2208997756242752, "learning_rate": 2.0471000643280735e-05, "loss": 0.018, "step": 17510 }, { "epoch": 2.103903903903904, "grad_norm": 0.2556515634059906, "learning_rate": 2.044562910155452e-05, "loss": 0.0154, "step": 17515 }, { "epoch": 2.1045045045045043, "grad_norm": 0.23904521763324738, "learning_rate": 2.042026925080547e-05, "loss": 0.0178, "step": 17520 }, { "epoch": 2.105105105105105, "grad_norm": 0.3773913085460663, "learning_rate": 2.039492110106535e-05, "loss": 0.0186, "step": 17525 }, { "epoch": 2.105705705705706, "grad_norm": 0.284322053194046, "learning_rate": 2.0369584662361234e-05, "loss": 0.0155, "step": 17530 }, { "epoch": 2.106306306306306, "grad_norm": 0.19863803684711456, "learning_rate": 2.0344259944715594e-05, "loss": 0.0181, "step": 17535 }, { "epoch": 2.106906906906907, "grad_norm": 0.3278902471065521, "learning_rate": 2.031894695814629e-05, "loss": 0.015, "step": 17540 }, { "epoch": 2.1075075075075076, "grad_norm": 0.25113287568092346, "learning_rate": 2.029364571266647e-05, "loss": 0.0138, "step": 17545 }, { "epoch": 2.108108108108108, "grad_norm": 0.3565726578235626, "learning_rate": 2.02683562182847e-05, "loss": 0.0173, "step": 17550 }, { "epoch": 2.1087087087087086, "grad_norm": 0.2991584539413452, "learning_rate": 2.0243078485004885e-05, "loss": 0.0157, "step": 17555 }, { "epoch": 2.1093093093093094, "grad_norm": 0.27290019392967224, "learning_rate": 2.0217812522826256e-05, "loss": 0.0154, "step": 17560 }, { "epoch": 2.10990990990991, "grad_norm": 0.29226356744766235, "learning_rate": 2.0192558341743427e-05, "loss": 0.0167, "step": 17565 }, { "epoch": 2.1105105105105104, "grad_norm": 0.21249055862426758, "learning_rate": 2.0167315951746298e-05, "loss": 0.0135, "step": 17570 }, { "epoch": 2.111111111111111, "grad_norm": 0.2446049600839615, "learning_rate": 2.014208536282014e-05, "loss": 0.015, "step": 17575 }, { "epoch": 2.111711711711712, "grad_norm": 0.21438179910182953, "learning_rate": 2.011686658494555e-05, "loss": 0.0172, "step": 17580 }, { "epoch": 2.112312312312312, "grad_norm": 0.3306219279766083, "learning_rate": 2.0091659628098458e-05, "loss": 0.0184, "step": 17585 }, { "epoch": 2.112912912912913, "grad_norm": 0.20943187177181244, "learning_rate": 2.0066464502250127e-05, "loss": 0.0118, "step": 17590 }, { "epoch": 2.1135135135135137, "grad_norm": 0.3500710129737854, "learning_rate": 2.004128121736709e-05, "loss": 0.0159, "step": 17595 }, { "epoch": 2.114114114114114, "grad_norm": 0.24584412574768066, "learning_rate": 2.0016109783411246e-05, "loss": 0.015, "step": 17600 }, { "epoch": 2.1147147147147147, "grad_norm": 0.25247928500175476, "learning_rate": 1.9990950210339794e-05, "loss": 0.0166, "step": 17605 }, { "epoch": 2.1153153153153155, "grad_norm": 0.2562161982059479, "learning_rate": 1.9965802508105253e-05, "loss": 0.0134, "step": 17610 }, { "epoch": 2.1159159159159158, "grad_norm": 0.1872304528951645, "learning_rate": 1.99406666866554e-05, "loss": 0.018, "step": 17615 }, { "epoch": 2.1165165165165165, "grad_norm": 0.2885187566280365, "learning_rate": 1.9915542755933376e-05, "loss": 0.0169, "step": 17620 }, { "epoch": 2.1171171171171173, "grad_norm": 0.3037269711494446, "learning_rate": 1.9890430725877546e-05, "loss": 0.0148, "step": 17625 }, { "epoch": 2.1177177177177176, "grad_norm": 0.35903996229171753, "learning_rate": 1.9865330606421634e-05, "loss": 0.0165, "step": 17630 }, { "epoch": 2.1183183183183183, "grad_norm": 0.41558587551116943, "learning_rate": 1.9840242407494637e-05, "loss": 0.0185, "step": 17635 }, { "epoch": 2.118918918918919, "grad_norm": 0.2805681526660919, "learning_rate": 1.981516613902079e-05, "loss": 0.0168, "step": 17640 }, { "epoch": 2.1195195195195193, "grad_norm": 0.31591591238975525, "learning_rate": 1.9790101810919665e-05, "loss": 0.0161, "step": 17645 }, { "epoch": 2.12012012012012, "grad_norm": 0.29931020736694336, "learning_rate": 1.976504943310608e-05, "loss": 0.0181, "step": 17650 }, { "epoch": 2.120720720720721, "grad_norm": 0.22215649485588074, "learning_rate": 1.974000901549015e-05, "loss": 0.016, "step": 17655 }, { "epoch": 2.121321321321321, "grad_norm": 0.19030337035655975, "learning_rate": 1.9714980567977254e-05, "loss": 0.0164, "step": 17660 }, { "epoch": 2.121921921921922, "grad_norm": 0.30670931935310364, "learning_rate": 1.968996410046799e-05, "loss": 0.0157, "step": 17665 }, { "epoch": 2.1225225225225226, "grad_norm": 0.3222202658653259, "learning_rate": 1.966495962285827e-05, "loss": 0.0173, "step": 17670 }, { "epoch": 2.123123123123123, "grad_norm": 0.23883286118507385, "learning_rate": 1.9639967145039252e-05, "loss": 0.0166, "step": 17675 }, { "epoch": 2.1237237237237236, "grad_norm": 0.23918716609477997, "learning_rate": 1.961498667689733e-05, "loss": 0.0182, "step": 17680 }, { "epoch": 2.1243243243243244, "grad_norm": 0.26056933403015137, "learning_rate": 1.959001822831419e-05, "loss": 0.0178, "step": 17685 }, { "epoch": 2.124924924924925, "grad_norm": 0.2223052978515625, "learning_rate": 1.9565061809166685e-05, "loss": 0.0157, "step": 17690 }, { "epoch": 2.1255255255255254, "grad_norm": 0.30344024300575256, "learning_rate": 1.9540117429326977e-05, "loss": 0.0191, "step": 17695 }, { "epoch": 2.126126126126126, "grad_norm": 0.38834190368652344, "learning_rate": 1.9515185098662447e-05, "loss": 0.0174, "step": 17700 }, { "epoch": 2.126726726726727, "grad_norm": 0.26158246397972107, "learning_rate": 1.9490264827035733e-05, "loss": 0.0181, "step": 17705 }, { "epoch": 2.127327327327327, "grad_norm": 0.1912645399570465, "learning_rate": 1.9465356624304625e-05, "loss": 0.02, "step": 17710 }, { "epoch": 2.127927927927928, "grad_norm": 0.20378795266151428, "learning_rate": 1.944046050032224e-05, "loss": 0.0177, "step": 17715 }, { "epoch": 2.1285285285285287, "grad_norm": 0.2622855603694916, "learning_rate": 1.9415576464936824e-05, "loss": 0.0172, "step": 17720 }, { "epoch": 2.129129129129129, "grad_norm": 0.2557198107242584, "learning_rate": 1.9390704527991904e-05, "loss": 0.019, "step": 17725 }, { "epoch": 2.1297297297297297, "grad_norm": 0.22268472611904144, "learning_rate": 1.936584469932623e-05, "loss": 0.0163, "step": 17730 }, { "epoch": 2.1303303303303305, "grad_norm": 0.28545132279396057, "learning_rate": 1.9340996988773685e-05, "loss": 0.0144, "step": 17735 }, { "epoch": 2.1309309309309308, "grad_norm": 0.2868815064430237, "learning_rate": 1.931616140616344e-05, "loss": 0.0172, "step": 17740 }, { "epoch": 2.1315315315315315, "grad_norm": 0.26733162999153137, "learning_rate": 1.929133796131982e-05, "loss": 0.0173, "step": 17745 }, { "epoch": 2.1321321321321323, "grad_norm": 0.33977895975112915, "learning_rate": 1.9266526664062386e-05, "loss": 0.0159, "step": 17750 }, { "epoch": 2.1321321321321323, "eval_loss": 0.04252379387617111, "eval_runtime": 35.7755, "eval_samples_per_second": 22.362, "eval_steps_per_second": 5.59, "step": 17750 }, { "epoch": 2.1327327327327326, "grad_norm": 0.2979249954223633, "learning_rate": 1.9241727524205865e-05, "loss": 0.0149, "step": 17755 }, { "epoch": 2.1333333333333333, "grad_norm": 0.36992010474205017, "learning_rate": 1.921694055156017e-05, "loss": 0.0162, "step": 17760 }, { "epoch": 2.133933933933934, "grad_norm": 0.34459370374679565, "learning_rate": 1.919216575593042e-05, "loss": 0.0154, "step": 17765 }, { "epoch": 2.1345345345345343, "grad_norm": 0.3337080180644989, "learning_rate": 1.9167403147116908e-05, "loss": 0.0151, "step": 17770 }, { "epoch": 2.135135135135135, "grad_norm": 0.2594672739505768, "learning_rate": 1.9142652734915134e-05, "loss": 0.0163, "step": 17775 }, { "epoch": 2.135735735735736, "grad_norm": 0.21162456274032593, "learning_rate": 1.9117914529115706e-05, "loss": 0.0167, "step": 17780 }, { "epoch": 2.1363363363363366, "grad_norm": 0.304496169090271, "learning_rate": 1.909318853950447e-05, "loss": 0.0155, "step": 17785 }, { "epoch": 2.136936936936937, "grad_norm": 0.30738282203674316, "learning_rate": 1.906847477586241e-05, "loss": 0.015, "step": 17790 }, { "epoch": 2.1375375375375376, "grad_norm": 0.21666869521141052, "learning_rate": 1.9043773247965678e-05, "loss": 0.0169, "step": 17795 }, { "epoch": 2.138138138138138, "grad_norm": 0.29714494943618774, "learning_rate": 1.901908396558561e-05, "loss": 0.016, "step": 17800 }, { "epoch": 2.1387387387387387, "grad_norm": 0.22285744547843933, "learning_rate": 1.899440693848864e-05, "loss": 0.0145, "step": 17805 }, { "epoch": 2.1393393393393394, "grad_norm": 0.24469813704490662, "learning_rate": 1.896974217643641e-05, "loss": 0.017, "step": 17810 }, { "epoch": 2.13993993993994, "grad_norm": 0.3498464822769165, "learning_rate": 1.8945089689185673e-05, "loss": 0.0153, "step": 17815 }, { "epoch": 2.1405405405405404, "grad_norm": 0.22742371261119843, "learning_rate": 1.8920449486488352e-05, "loss": 0.0176, "step": 17820 }, { "epoch": 2.141141141141141, "grad_norm": 0.3167477250099182, "learning_rate": 1.889582157809151e-05, "loss": 0.0147, "step": 17825 }, { "epoch": 2.141741741741742, "grad_norm": 0.2402694821357727, "learning_rate": 1.8871205973737316e-05, "loss": 0.0158, "step": 17830 }, { "epoch": 2.142342342342342, "grad_norm": 0.34336015582084656, "learning_rate": 1.8846602683163106e-05, "loss": 0.0179, "step": 17835 }, { "epoch": 2.142942942942943, "grad_norm": 0.36309754848480225, "learning_rate": 1.882201171610133e-05, "loss": 0.0165, "step": 17840 }, { "epoch": 2.1435435435435437, "grad_norm": 0.299583375453949, "learning_rate": 1.8797433082279582e-05, "loss": 0.0146, "step": 17845 }, { "epoch": 2.144144144144144, "grad_norm": 0.17966710031032562, "learning_rate": 1.877286679142053e-05, "loss": 0.015, "step": 17850 }, { "epoch": 2.1447447447447447, "grad_norm": 0.26901188492774963, "learning_rate": 1.8748312853242005e-05, "loss": 0.0171, "step": 17855 }, { "epoch": 2.1453453453453455, "grad_norm": 0.2321656048297882, "learning_rate": 1.872377127745694e-05, "loss": 0.0165, "step": 17860 }, { "epoch": 2.145945945945946, "grad_norm": 0.2894320487976074, "learning_rate": 1.869924207377337e-05, "loss": 0.0163, "step": 17865 }, { "epoch": 2.1465465465465465, "grad_norm": 0.22987785935401917, "learning_rate": 1.8674725251894464e-05, "loss": 0.0166, "step": 17870 }, { "epoch": 2.1471471471471473, "grad_norm": 0.2710099518299103, "learning_rate": 1.8650220821518432e-05, "loss": 0.0167, "step": 17875 }, { "epoch": 2.1477477477477476, "grad_norm": 0.2632339596748352, "learning_rate": 1.862572879233863e-05, "loss": 0.0155, "step": 17880 }, { "epoch": 2.1483483483483483, "grad_norm": 0.17593629658222198, "learning_rate": 1.860124917404351e-05, "loss": 0.0129, "step": 17885 }, { "epoch": 2.148948948948949, "grad_norm": 0.3508222997188568, "learning_rate": 1.8576781976316615e-05, "loss": 0.0174, "step": 17890 }, { "epoch": 2.1495495495495494, "grad_norm": 0.28124260902404785, "learning_rate": 1.8552327208836528e-05, "loss": 0.0167, "step": 17895 }, { "epoch": 2.15015015015015, "grad_norm": 0.2841458320617676, "learning_rate": 1.852788488127698e-05, "loss": 0.0172, "step": 17900 }, { "epoch": 2.150750750750751, "grad_norm": 0.19693568348884583, "learning_rate": 1.850345500330672e-05, "loss": 0.0163, "step": 17905 }, { "epoch": 2.1513513513513516, "grad_norm": 0.2247840017080307, "learning_rate": 1.847903758458962e-05, "loss": 0.0137, "step": 17910 }, { "epoch": 2.151951951951952, "grad_norm": 0.2549600899219513, "learning_rate": 1.8454632634784626e-05, "loss": 0.0159, "step": 17915 }, { "epoch": 2.1525525525525526, "grad_norm": 0.2734972834587097, "learning_rate": 1.8430240163545685e-05, "loss": 0.0155, "step": 17920 }, { "epoch": 2.153153153153153, "grad_norm": 0.31436336040496826, "learning_rate": 1.8405860180521888e-05, "loss": 0.0172, "step": 17925 }, { "epoch": 2.1537537537537537, "grad_norm": 0.18977700173854828, "learning_rate": 1.8381492695357344e-05, "loss": 0.0168, "step": 17930 }, { "epoch": 2.1543543543543544, "grad_norm": 0.25732213258743286, "learning_rate": 1.8357137717691232e-05, "loss": 0.0177, "step": 17935 }, { "epoch": 2.154954954954955, "grad_norm": 0.18424879014492035, "learning_rate": 1.8332795257157788e-05, "loss": 0.0147, "step": 17940 }, { "epoch": 2.1555555555555554, "grad_norm": 0.23075073957443237, "learning_rate": 1.8308465323386253e-05, "loss": 0.0157, "step": 17945 }, { "epoch": 2.156156156156156, "grad_norm": 0.2986072897911072, "learning_rate": 1.8284147926000967e-05, "loss": 0.0168, "step": 17950 }, { "epoch": 2.156756756756757, "grad_norm": 0.2990579307079315, "learning_rate": 1.8259843074621287e-05, "loss": 0.0153, "step": 17955 }, { "epoch": 2.1573573573573572, "grad_norm": 0.31677985191345215, "learning_rate": 1.8235550778861617e-05, "loss": 0.015, "step": 17960 }, { "epoch": 2.157957957957958, "grad_norm": 0.30569136142730713, "learning_rate": 1.8211271048331392e-05, "loss": 0.0164, "step": 17965 }, { "epoch": 2.1585585585585587, "grad_norm": 0.20969319343566895, "learning_rate": 1.8187003892635052e-05, "loss": 0.0125, "step": 17970 }, { "epoch": 2.159159159159159, "grad_norm": 0.2510526180267334, "learning_rate": 1.816274932137209e-05, "loss": 0.0166, "step": 17975 }, { "epoch": 2.1597597597597598, "grad_norm": 0.2927291691303253, "learning_rate": 1.8138507344137016e-05, "loss": 0.0158, "step": 17980 }, { "epoch": 2.1603603603603605, "grad_norm": 0.20787876844406128, "learning_rate": 1.8114277970519378e-05, "loss": 0.0165, "step": 17985 }, { "epoch": 2.160960960960961, "grad_norm": 0.17879043519496918, "learning_rate": 1.8090061210103675e-05, "loss": 0.0176, "step": 17990 }, { "epoch": 2.1615615615615615, "grad_norm": 0.24847956001758575, "learning_rate": 1.80658570724695e-05, "loss": 0.0137, "step": 17995 }, { "epoch": 2.1621621621621623, "grad_norm": 0.20084017515182495, "learning_rate": 1.804166556719137e-05, "loss": 0.0147, "step": 18000 }, { "epoch": 2.1621621621621623, "eval_loss": 0.04187976196408272, "eval_runtime": 35.7108, "eval_samples_per_second": 22.402, "eval_steps_per_second": 5.601, "step": 18000 }, { "epoch": 2.1627627627627626, "grad_norm": 0.24123363196849823, "learning_rate": 1.801748670383886e-05, "loss": 0.0142, "step": 18005 }, { "epoch": 2.1633633633633633, "grad_norm": 0.24969927966594696, "learning_rate": 1.799332049197655e-05, "loss": 0.0142, "step": 18010 }, { "epoch": 2.163963963963964, "grad_norm": 0.24504432082176208, "learning_rate": 1.796916694116396e-05, "loss": 0.0159, "step": 18015 }, { "epoch": 2.1645645645645644, "grad_norm": 0.22552812099456787, "learning_rate": 1.7945026060955662e-05, "loss": 0.0139, "step": 18020 }, { "epoch": 2.165165165165165, "grad_norm": 0.26121315360069275, "learning_rate": 1.7920897860901175e-05, "loss": 0.0138, "step": 18025 }, { "epoch": 2.165765765765766, "grad_norm": 0.3007679879665375, "learning_rate": 1.7896782350545034e-05, "loss": 0.0154, "step": 18030 }, { "epoch": 2.1663663663663666, "grad_norm": 0.208225816488266, "learning_rate": 1.787267953942674e-05, "loss": 0.0145, "step": 18035 }, { "epoch": 2.166966966966967, "grad_norm": 0.24646946787834167, "learning_rate": 1.7848589437080738e-05, "loss": 0.0163, "step": 18040 }, { "epoch": 2.1675675675675676, "grad_norm": 0.2667646110057831, "learning_rate": 1.7824512053036495e-05, "loss": 0.0168, "step": 18045 }, { "epoch": 2.1681681681681684, "grad_norm": 0.3075443506240845, "learning_rate": 1.780044739681843e-05, "loss": 0.0172, "step": 18050 }, { "epoch": 2.1687687687687687, "grad_norm": 0.2193213403224945, "learning_rate": 1.7776395477945945e-05, "loss": 0.0119, "step": 18055 }, { "epoch": 2.1693693693693694, "grad_norm": 0.34656471014022827, "learning_rate": 1.7752356305933338e-05, "loss": 0.0168, "step": 18060 }, { "epoch": 2.16996996996997, "grad_norm": 0.2753620743751526, "learning_rate": 1.7728329890289934e-05, "loss": 0.0173, "step": 18065 }, { "epoch": 2.1705705705705705, "grad_norm": 0.2601529359817505, "learning_rate": 1.7704316240519992e-05, "loss": 0.0152, "step": 18070 }, { "epoch": 2.171171171171171, "grad_norm": 0.22908732295036316, "learning_rate": 1.768031536612271e-05, "loss": 0.0148, "step": 18075 }, { "epoch": 2.171771771771772, "grad_norm": 0.31950539350509644, "learning_rate": 1.7656327276592256e-05, "loss": 0.0171, "step": 18080 }, { "epoch": 2.1723723723723722, "grad_norm": 0.16539399325847626, "learning_rate": 1.7632351981417692e-05, "loss": 0.0135, "step": 18085 }, { "epoch": 2.172972972972973, "grad_norm": 0.2653179168701172, "learning_rate": 1.7608389490083088e-05, "loss": 0.0166, "step": 18090 }, { "epoch": 2.1735735735735737, "grad_norm": 0.28123563528060913, "learning_rate": 1.7584439812067384e-05, "loss": 0.0151, "step": 18095 }, { "epoch": 2.174174174174174, "grad_norm": 0.20468199253082275, "learning_rate": 1.7560502956844478e-05, "loss": 0.012, "step": 18100 }, { "epoch": 2.1747747747747748, "grad_norm": 0.168495774269104, "learning_rate": 1.753657893388323e-05, "loss": 0.0147, "step": 18105 }, { "epoch": 2.1753753753753755, "grad_norm": 0.2905060946941376, "learning_rate": 1.751266775264735e-05, "loss": 0.0131, "step": 18110 }, { "epoch": 2.175975975975976, "grad_norm": 0.26909762620925903, "learning_rate": 1.748876942259553e-05, "loss": 0.0175, "step": 18115 }, { "epoch": 2.1765765765765765, "grad_norm": 0.19784219563007355, "learning_rate": 1.7464883953181354e-05, "loss": 0.0161, "step": 18120 }, { "epoch": 2.1771771771771773, "grad_norm": 0.3387581706047058, "learning_rate": 1.744101135385332e-05, "loss": 0.0177, "step": 18125 }, { "epoch": 2.1777777777777776, "grad_norm": 0.21490976214408875, "learning_rate": 1.7417151634054864e-05, "loss": 0.0146, "step": 18130 }, { "epoch": 2.1783783783783783, "grad_norm": 0.27226021885871887, "learning_rate": 1.739330480322426e-05, "loss": 0.0178, "step": 18135 }, { "epoch": 2.178978978978979, "grad_norm": 0.3881310522556305, "learning_rate": 1.7369470870794734e-05, "loss": 0.019, "step": 18140 }, { "epoch": 2.1795795795795794, "grad_norm": 0.249447301030159, "learning_rate": 1.734564984619441e-05, "loss": 0.0176, "step": 18145 }, { "epoch": 2.18018018018018, "grad_norm": 0.18503446877002716, "learning_rate": 1.7321841738846307e-05, "loss": 0.016, "step": 18150 }, { "epoch": 2.180780780780781, "grad_norm": 0.19159843027591705, "learning_rate": 1.729804655816829e-05, "loss": 0.0157, "step": 18155 }, { "epoch": 2.1813813813813816, "grad_norm": 0.25416600704193115, "learning_rate": 1.7274264313573162e-05, "loss": 0.0149, "step": 18160 }, { "epoch": 2.181981981981982, "grad_norm": 0.36102357506752014, "learning_rate": 1.7250495014468586e-05, "loss": 0.0204, "step": 18165 }, { "epoch": 2.1825825825825826, "grad_norm": 0.25126516819000244, "learning_rate": 1.7226738670257113e-05, "loss": 0.0154, "step": 18170 }, { "epoch": 2.1831831831831834, "grad_norm": 0.2282605916261673, "learning_rate": 1.7202995290336176e-05, "loss": 0.0148, "step": 18175 }, { "epoch": 2.1837837837837837, "grad_norm": 0.22766336798667908, "learning_rate": 1.717926488409804e-05, "loss": 0.015, "step": 18180 }, { "epoch": 2.1843843843843844, "grad_norm": 0.23042090237140656, "learning_rate": 1.7155547460929892e-05, "loss": 0.016, "step": 18185 }, { "epoch": 2.184984984984985, "grad_norm": 0.30710074305534363, "learning_rate": 1.7131843030213735e-05, "loss": 0.0134, "step": 18190 }, { "epoch": 2.1855855855855855, "grad_norm": 0.24370187520980835, "learning_rate": 1.710815160132646e-05, "loss": 0.0132, "step": 18195 }, { "epoch": 2.186186186186186, "grad_norm": 0.37445372343063354, "learning_rate": 1.708447318363983e-05, "loss": 0.0208, "step": 18200 }, { "epoch": 2.186786786786787, "grad_norm": 0.2919495701789856, "learning_rate": 1.7060807786520412e-05, "loss": 0.0158, "step": 18205 }, { "epoch": 2.1873873873873872, "grad_norm": 0.38046374917030334, "learning_rate": 1.7037155419329658e-05, "loss": 0.0173, "step": 18210 }, { "epoch": 2.187987987987988, "grad_norm": 0.194807767868042, "learning_rate": 1.701351609142387e-05, "loss": 0.0149, "step": 18215 }, { "epoch": 2.1885885885885887, "grad_norm": 0.2758733928203583, "learning_rate": 1.6989889812154192e-05, "loss": 0.0141, "step": 18220 }, { "epoch": 2.189189189189189, "grad_norm": 0.27009743452072144, "learning_rate": 1.6966276590866553e-05, "loss": 0.0149, "step": 18225 }, { "epoch": 2.1897897897897898, "grad_norm": 0.39949873089790344, "learning_rate": 1.6942676436901794e-05, "loss": 0.0195, "step": 18230 }, { "epoch": 2.1903903903903905, "grad_norm": 0.23725907504558563, "learning_rate": 1.6919089359595537e-05, "loss": 0.0164, "step": 18235 }, { "epoch": 2.190990990990991, "grad_norm": 0.22770358622074127, "learning_rate": 1.689551536827825e-05, "loss": 0.0139, "step": 18240 }, { "epoch": 2.1915915915915916, "grad_norm": 0.2741831839084625, "learning_rate": 1.6871954472275232e-05, "loss": 0.0163, "step": 18245 }, { "epoch": 2.1921921921921923, "grad_norm": 0.2730298638343811, "learning_rate": 1.6848406680906563e-05, "loss": 0.0177, "step": 18250 }, { "epoch": 2.1921921921921923, "eval_loss": 0.040937528014183044, "eval_runtime": 35.8008, "eval_samples_per_second": 22.346, "eval_steps_per_second": 5.586, "step": 18250 }, { "epoch": 2.1927927927927926, "grad_norm": 0.2571689486503601, "learning_rate": 1.6824872003487186e-05, "loss": 0.0184, "step": 18255 }, { "epoch": 2.1933933933933933, "grad_norm": 0.29271450638771057, "learning_rate": 1.680135044932683e-05, "loss": 0.015, "step": 18260 }, { "epoch": 2.193993993993994, "grad_norm": 0.1757679283618927, "learning_rate": 1.6777842027730055e-05, "loss": 0.0145, "step": 18265 }, { "epoch": 2.1945945945945944, "grad_norm": 0.22230972349643707, "learning_rate": 1.675434674799618e-05, "loss": 0.017, "step": 18270 }, { "epoch": 2.195195195195195, "grad_norm": 0.2796711325645447, "learning_rate": 1.673086461941939e-05, "loss": 0.0158, "step": 18275 }, { "epoch": 2.195795795795796, "grad_norm": 0.28163281083106995, "learning_rate": 1.670739565128859e-05, "loss": 0.0171, "step": 18280 }, { "epoch": 2.1963963963963966, "grad_norm": 0.2757713794708252, "learning_rate": 1.668393985288756e-05, "loss": 0.0132, "step": 18285 }, { "epoch": 2.196996996996997, "grad_norm": 0.19740013778209686, "learning_rate": 1.6660497233494833e-05, "loss": 0.015, "step": 18290 }, { "epoch": 2.1975975975975977, "grad_norm": 0.3601270914077759, "learning_rate": 1.66370678023837e-05, "loss": 0.0178, "step": 18295 }, { "epoch": 2.1981981981981984, "grad_norm": 0.20853619277477264, "learning_rate": 1.661365156882228e-05, "loss": 0.0143, "step": 18300 }, { "epoch": 2.1987987987987987, "grad_norm": 0.18104422092437744, "learning_rate": 1.6590248542073457e-05, "loss": 0.014, "step": 18305 }, { "epoch": 2.1993993993993994, "grad_norm": 0.16723549365997314, "learning_rate": 1.656685873139488e-05, "loss": 0.0151, "step": 18310 }, { "epoch": 2.2, "grad_norm": 0.2669627070426941, "learning_rate": 1.6543482146039006e-05, "loss": 0.0143, "step": 18315 }, { "epoch": 2.2006006006006005, "grad_norm": 0.19926106929779053, "learning_rate": 1.6520118795253004e-05, "loss": 0.0183, "step": 18320 }, { "epoch": 2.201201201201201, "grad_norm": 0.2758762538433075, "learning_rate": 1.6496768688278834e-05, "loss": 0.0175, "step": 18325 }, { "epoch": 2.201801801801802, "grad_norm": 0.25652170181274414, "learning_rate": 1.6473431834353242e-05, "loss": 0.0143, "step": 18330 }, { "epoch": 2.2024024024024023, "grad_norm": 0.25897717475891113, "learning_rate": 1.6450108242707695e-05, "loss": 0.0164, "step": 18335 }, { "epoch": 2.203003003003003, "grad_norm": 0.34619587659835815, "learning_rate": 1.6426797922568447e-05, "loss": 0.0186, "step": 18340 }, { "epoch": 2.2036036036036037, "grad_norm": 0.2068956345319748, "learning_rate": 1.640350088315646e-05, "loss": 0.0127, "step": 18345 }, { "epoch": 2.204204204204204, "grad_norm": 0.2720538377761841, "learning_rate": 1.638021713368747e-05, "loss": 0.0145, "step": 18350 }, { "epoch": 2.204804804804805, "grad_norm": 0.26894304156303406, "learning_rate": 1.635694668337196e-05, "loss": 0.014, "step": 18355 }, { "epoch": 2.2054054054054055, "grad_norm": 0.29101064801216125, "learning_rate": 1.6333689541415155e-05, "loss": 0.0165, "step": 18360 }, { "epoch": 2.206006006006006, "grad_norm": 0.3419967293739319, "learning_rate": 1.631044571701697e-05, "loss": 0.0187, "step": 18365 }, { "epoch": 2.2066066066066066, "grad_norm": 0.27701854705810547, "learning_rate": 1.6287215219372128e-05, "loss": 0.0164, "step": 18370 }, { "epoch": 2.2072072072072073, "grad_norm": 0.20764677226543427, "learning_rate": 1.6263998057669994e-05, "loss": 0.0158, "step": 18375 }, { "epoch": 2.2078078078078076, "grad_norm": 0.14065375924110413, "learning_rate": 1.6240794241094727e-05, "loss": 0.0116, "step": 18380 }, { "epoch": 2.2084084084084084, "grad_norm": 0.2389654666185379, "learning_rate": 1.6217603778825196e-05, "loss": 0.0147, "step": 18385 }, { "epoch": 2.209009009009009, "grad_norm": 0.1625911295413971, "learning_rate": 1.6194426680034947e-05, "loss": 0.0148, "step": 18390 }, { "epoch": 2.2096096096096094, "grad_norm": 0.2317337989807129, "learning_rate": 1.6171262953892267e-05, "loss": 0.0178, "step": 18395 }, { "epoch": 2.21021021021021, "grad_norm": 0.23551051318645477, "learning_rate": 1.6148112609560174e-05, "loss": 0.0137, "step": 18400 }, { "epoch": 2.210810810810811, "grad_norm": 0.2810523211956024, "learning_rate": 1.612497565619635e-05, "loss": 0.0162, "step": 18405 }, { "epoch": 2.2114114114114116, "grad_norm": 0.2298928052186966, "learning_rate": 1.610185210295323e-05, "loss": 0.0163, "step": 18410 }, { "epoch": 2.212012012012012, "grad_norm": 0.30341780185699463, "learning_rate": 1.6078741958977877e-05, "loss": 0.0157, "step": 18415 }, { "epoch": 2.2126126126126127, "grad_norm": 0.28883466124534607, "learning_rate": 1.605564523341212e-05, "loss": 0.0125, "step": 18420 }, { "epoch": 2.2132132132132134, "grad_norm": 0.29520153999328613, "learning_rate": 1.6032561935392442e-05, "loss": 0.0156, "step": 18425 }, { "epoch": 2.2138138138138137, "grad_norm": 0.2968877851963043, "learning_rate": 1.600949207405004e-05, "loss": 0.0177, "step": 18430 }, { "epoch": 2.2144144144144144, "grad_norm": 0.230467289686203, "learning_rate": 1.5986435658510758e-05, "loss": 0.0156, "step": 18435 }, { "epoch": 2.215015015015015, "grad_norm": 0.30984342098236084, "learning_rate": 1.5963392697895147e-05, "loss": 0.016, "step": 18440 }, { "epoch": 2.2156156156156155, "grad_norm": 0.20223285257816315, "learning_rate": 1.5940363201318443e-05, "loss": 0.0158, "step": 18445 }, { "epoch": 2.2162162162162162, "grad_norm": 0.23069490492343903, "learning_rate": 1.591734717789053e-05, "loss": 0.0161, "step": 18450 }, { "epoch": 2.216816816816817, "grad_norm": 0.20524130761623383, "learning_rate": 1.5894344636716003e-05, "loss": 0.0139, "step": 18455 }, { "epoch": 2.2174174174174173, "grad_norm": 0.28721746802330017, "learning_rate": 1.5871355586894064e-05, "loss": 0.0164, "step": 18460 }, { "epoch": 2.218018018018018, "grad_norm": 0.2776319682598114, "learning_rate": 1.584838003751864e-05, "loss": 0.0153, "step": 18465 }, { "epoch": 2.2186186186186188, "grad_norm": 0.2575279176235199, "learning_rate": 1.5825417997678264e-05, "loss": 0.0152, "step": 18470 }, { "epoch": 2.219219219219219, "grad_norm": 0.1764586865901947, "learning_rate": 1.580246947645616e-05, "loss": 0.0179, "step": 18475 }, { "epoch": 2.21981981981982, "grad_norm": 0.2602247893810272, "learning_rate": 1.577953448293022e-05, "loss": 0.0145, "step": 18480 }, { "epoch": 2.2204204204204205, "grad_norm": 0.28490474820137024, "learning_rate": 1.575661302617291e-05, "loss": 0.0138, "step": 18485 }, { "epoch": 2.221021021021021, "grad_norm": 0.2959938943386078, "learning_rate": 1.5733705115251428e-05, "loss": 0.0152, "step": 18490 }, { "epoch": 2.2216216216216216, "grad_norm": 0.14737527072429657, "learning_rate": 1.5710810759227563e-05, "loss": 0.0111, "step": 18495 }, { "epoch": 2.2222222222222223, "grad_norm": 0.37371930480003357, "learning_rate": 1.5687929967157766e-05, "loss": 0.0162, "step": 18500 }, { "epoch": 2.2222222222222223, "eval_loss": 0.0416293703019619, "eval_runtime": 35.9144, "eval_samples_per_second": 22.275, "eval_steps_per_second": 5.569, "step": 18500 }, { "epoch": 2.2228228228228226, "grad_norm": 0.17344534397125244, "learning_rate": 1.5665062748093095e-05, "loss": 0.0131, "step": 18505 }, { "epoch": 2.2234234234234234, "grad_norm": 0.29084810614585876, "learning_rate": 1.5642209111079266e-05, "loss": 0.0151, "step": 18510 }, { "epoch": 2.224024024024024, "grad_norm": 0.20893439650535583, "learning_rate": 1.5619369065156604e-05, "loss": 0.0138, "step": 18515 }, { "epoch": 2.2246246246246244, "grad_norm": 0.307782918214798, "learning_rate": 1.5596542619360073e-05, "loss": 0.0177, "step": 18520 }, { "epoch": 2.225225225225225, "grad_norm": 0.3421696126461029, "learning_rate": 1.5573729782719266e-05, "loss": 0.0143, "step": 18525 }, { "epoch": 2.225825825825826, "grad_norm": 0.23515865206718445, "learning_rate": 1.5550930564258336e-05, "loss": 0.019, "step": 18530 }, { "epoch": 2.2264264264264266, "grad_norm": 0.27856892347335815, "learning_rate": 1.5528144972996116e-05, "loss": 0.0161, "step": 18535 }, { "epoch": 2.227027027027027, "grad_norm": 0.3409879505634308, "learning_rate": 1.5505373017946024e-05, "loss": 0.0125, "step": 18540 }, { "epoch": 2.2276276276276277, "grad_norm": 0.2655802369117737, "learning_rate": 1.5482614708116083e-05, "loss": 0.0156, "step": 18545 }, { "epoch": 2.2282282282282284, "grad_norm": 0.22531571984291077, "learning_rate": 1.545987005250889e-05, "loss": 0.0173, "step": 18550 }, { "epoch": 2.2288288288288287, "grad_norm": 0.24592383205890656, "learning_rate": 1.5437139060121692e-05, "loss": 0.0155, "step": 18555 }, { "epoch": 2.2294294294294295, "grad_norm": 0.3500378429889679, "learning_rate": 1.5414421739946312e-05, "loss": 0.0153, "step": 18560 }, { "epoch": 2.23003003003003, "grad_norm": 0.2400636225938797, "learning_rate": 1.5391718100969132e-05, "loss": 0.0153, "step": 18565 }, { "epoch": 2.2306306306306305, "grad_norm": 0.32631024718284607, "learning_rate": 1.5369028152171178e-05, "loss": 0.0129, "step": 18570 }, { "epoch": 2.2312312312312312, "grad_norm": 0.2156463861465454, "learning_rate": 1.5346351902528007e-05, "loss": 0.0182, "step": 18575 }, { "epoch": 2.231831831831832, "grad_norm": 0.22314335405826569, "learning_rate": 1.532368936100979e-05, "loss": 0.0205, "step": 18580 }, { "epoch": 2.2324324324324323, "grad_norm": 0.3144841194152832, "learning_rate": 1.5301040536581275e-05, "loss": 0.0139, "step": 18585 }, { "epoch": 2.233033033033033, "grad_norm": 0.27216628193855286, "learning_rate": 1.527840543820177e-05, "loss": 0.0173, "step": 18590 }, { "epoch": 2.2336336336336338, "grad_norm": 0.24075335264205933, "learning_rate": 1.5255784074825175e-05, "loss": 0.016, "step": 18595 }, { "epoch": 2.234234234234234, "grad_norm": 0.18056322634220123, "learning_rate": 1.5233176455399916e-05, "loss": 0.0148, "step": 18600 }, { "epoch": 2.234834834834835, "grad_norm": 0.18213163316249847, "learning_rate": 1.5210582588869016e-05, "loss": 0.0156, "step": 18605 }, { "epoch": 2.2354354354354355, "grad_norm": 0.23092873394489288, "learning_rate": 1.518800248417005e-05, "loss": 0.0174, "step": 18610 }, { "epoch": 2.236036036036036, "grad_norm": 0.2707630693912506, "learning_rate": 1.5165436150235146e-05, "loss": 0.0122, "step": 18615 }, { "epoch": 2.2366366366366366, "grad_norm": 0.20673693716526031, "learning_rate": 1.5142883595991014e-05, "loss": 0.0129, "step": 18620 }, { "epoch": 2.2372372372372373, "grad_norm": 0.22654296457767487, "learning_rate": 1.512034483035884e-05, "loss": 0.0139, "step": 18625 }, { "epoch": 2.237837837837838, "grad_norm": 0.24357669055461884, "learning_rate": 1.5097819862254426e-05, "loss": 0.0149, "step": 18630 }, { "epoch": 2.2384384384384384, "grad_norm": 0.2146761119365692, "learning_rate": 1.5075308700588093e-05, "loss": 0.0139, "step": 18635 }, { "epoch": 2.239039039039039, "grad_norm": 0.2448071390390396, "learning_rate": 1.5052811354264706e-05, "loss": 0.0127, "step": 18640 }, { "epoch": 2.2396396396396394, "grad_norm": 0.22823013365268707, "learning_rate": 1.5030327832183633e-05, "loss": 0.0176, "step": 18645 }, { "epoch": 2.24024024024024, "grad_norm": 0.3258076012134552, "learning_rate": 1.5007858143238834e-05, "loss": 0.0144, "step": 18650 }, { "epoch": 2.240840840840841, "grad_norm": 0.27186718583106995, "learning_rate": 1.4985402296318718e-05, "loss": 0.0132, "step": 18655 }, { "epoch": 2.2414414414414416, "grad_norm": 0.2503277063369751, "learning_rate": 1.496296030030629e-05, "loss": 0.0139, "step": 18660 }, { "epoch": 2.242042042042042, "grad_norm": 0.19911149144172668, "learning_rate": 1.4940532164079052e-05, "loss": 0.0153, "step": 18665 }, { "epoch": 2.2426426426426427, "grad_norm": 0.28745782375335693, "learning_rate": 1.4918117896508999e-05, "loss": 0.0139, "step": 18670 }, { "epoch": 2.2432432432432434, "grad_norm": 0.21540391445159912, "learning_rate": 1.4895717506462665e-05, "loss": 0.0137, "step": 18675 }, { "epoch": 2.2438438438438437, "grad_norm": 0.23623935878276825, "learning_rate": 1.48733310028011e-05, "loss": 0.0133, "step": 18680 }, { "epoch": 2.2444444444444445, "grad_norm": 0.29339754581451416, "learning_rate": 1.4850958394379844e-05, "loss": 0.0128, "step": 18685 }, { "epoch": 2.245045045045045, "grad_norm": 0.3102782666683197, "learning_rate": 1.4828599690048961e-05, "loss": 0.0136, "step": 18690 }, { "epoch": 2.2456456456456455, "grad_norm": 0.31585463881492615, "learning_rate": 1.4806254898652977e-05, "loss": 0.0153, "step": 18695 }, { "epoch": 2.2462462462462462, "grad_norm": 0.2358711212873459, "learning_rate": 1.478392402903095e-05, "loss": 0.0139, "step": 18700 }, { "epoch": 2.246846846846847, "grad_norm": 0.24576008319854736, "learning_rate": 1.476160709001641e-05, "loss": 0.0139, "step": 18705 }, { "epoch": 2.2474474474474473, "grad_norm": 0.2061036229133606, "learning_rate": 1.47393040904374e-05, "loss": 0.0177, "step": 18710 }, { "epoch": 2.248048048048048, "grad_norm": 0.25496989488601685, "learning_rate": 1.4717015039116445e-05, "loss": 0.0146, "step": 18715 }, { "epoch": 2.2486486486486488, "grad_norm": 0.21531526744365692, "learning_rate": 1.4694739944870506e-05, "loss": 0.0147, "step": 18720 }, { "epoch": 2.249249249249249, "grad_norm": 0.22462278604507446, "learning_rate": 1.4672478816511076e-05, "loss": 0.0129, "step": 18725 }, { "epoch": 2.24984984984985, "grad_norm": 0.20219901204109192, "learning_rate": 1.465023166284411e-05, "loss": 0.0135, "step": 18730 }, { "epoch": 2.2504504504504506, "grad_norm": 0.23641879856586456, "learning_rate": 1.4627998492670042e-05, "loss": 0.0131, "step": 18735 }, { "epoch": 2.251051051051051, "grad_norm": 0.17973272502422333, "learning_rate": 1.4605779314783736e-05, "loss": 0.0153, "step": 18740 }, { "epoch": 2.2516516516516516, "grad_norm": 0.2191532552242279, "learning_rate": 1.458357413797457e-05, "loss": 0.0149, "step": 18745 }, { "epoch": 2.2522522522522523, "grad_norm": 0.21080400049686432, "learning_rate": 1.456138297102635e-05, "loss": 0.0141, "step": 18750 }, { "epoch": 2.2522522522522523, "eval_loss": 0.04001760855317116, "eval_runtime": 35.8843, "eval_samples_per_second": 22.294, "eval_steps_per_second": 5.573, "step": 18750 }, { "epoch": 2.252852852852853, "grad_norm": 0.26723742485046387, "learning_rate": 1.453920582271735e-05, "loss": 0.0157, "step": 18755 }, { "epoch": 2.2534534534534534, "grad_norm": 0.25108709931373596, "learning_rate": 1.451704270182032e-05, "loss": 0.0129, "step": 18760 }, { "epoch": 2.254054054054054, "grad_norm": 0.19916701316833496, "learning_rate": 1.4494893617102418e-05, "loss": 0.0126, "step": 18765 }, { "epoch": 2.2546546546546544, "grad_norm": 0.19446638226509094, "learning_rate": 1.447275857732528e-05, "loss": 0.0137, "step": 18770 }, { "epoch": 2.255255255255255, "grad_norm": 0.21993963420391083, "learning_rate": 1.4450637591244987e-05, "loss": 0.0127, "step": 18775 }, { "epoch": 2.255855855855856, "grad_norm": 0.23572473227977753, "learning_rate": 1.442853066761205e-05, "loss": 0.0134, "step": 18780 }, { "epoch": 2.2564564564564566, "grad_norm": 0.2728061378002167, "learning_rate": 1.4406437815171431e-05, "loss": 0.0128, "step": 18785 }, { "epoch": 2.257057057057057, "grad_norm": 0.18432195484638214, "learning_rate": 1.4384359042662493e-05, "loss": 0.0129, "step": 18790 }, { "epoch": 2.2576576576576577, "grad_norm": 0.26017066836357117, "learning_rate": 1.4362294358819062e-05, "loss": 0.0157, "step": 18795 }, { "epoch": 2.2582582582582584, "grad_norm": 0.25586754083633423, "learning_rate": 1.4340243772369383e-05, "loss": 0.0149, "step": 18800 }, { "epoch": 2.2588588588588587, "grad_norm": 0.1822763830423355, "learning_rate": 1.4318207292036134e-05, "loss": 0.0144, "step": 18805 }, { "epoch": 2.2594594594594595, "grad_norm": 0.3942986726760864, "learning_rate": 1.4296184926536371e-05, "loss": 0.0132, "step": 18810 }, { "epoch": 2.26006006006006, "grad_norm": 0.23238366842269897, "learning_rate": 1.4274176684581619e-05, "loss": 0.0128, "step": 18815 }, { "epoch": 2.2606606606606605, "grad_norm": 0.1807553619146347, "learning_rate": 1.4252182574877781e-05, "loss": 0.0134, "step": 18820 }, { "epoch": 2.2612612612612613, "grad_norm": 0.22502589225769043, "learning_rate": 1.4230202606125186e-05, "loss": 0.013, "step": 18825 }, { "epoch": 2.261861861861862, "grad_norm": 0.15234749019145966, "learning_rate": 1.4208236787018592e-05, "loss": 0.0135, "step": 18830 }, { "epoch": 2.2624624624624623, "grad_norm": 0.17176970839500427, "learning_rate": 1.4186285126247083e-05, "loss": 0.0156, "step": 18835 }, { "epoch": 2.263063063063063, "grad_norm": 0.2273995280265808, "learning_rate": 1.416434763249424e-05, "loss": 0.0139, "step": 18840 }, { "epoch": 2.263663663663664, "grad_norm": 0.26800450682640076, "learning_rate": 1.4142424314437957e-05, "loss": 0.0147, "step": 18845 }, { "epoch": 2.264264264264264, "grad_norm": 0.2593317925930023, "learning_rate": 1.4120515180750565e-05, "loss": 0.0127, "step": 18850 }, { "epoch": 2.264864864864865, "grad_norm": 0.2639774978160858, "learning_rate": 1.4098620240098793e-05, "loss": 0.0149, "step": 18855 }, { "epoch": 2.2654654654654656, "grad_norm": 0.15085655450820923, "learning_rate": 1.407673950114371e-05, "loss": 0.0142, "step": 18860 }, { "epoch": 2.266066066066066, "grad_norm": 0.3097408413887024, "learning_rate": 1.4054872972540806e-05, "loss": 0.0182, "step": 18865 }, { "epoch": 2.2666666666666666, "grad_norm": 0.27929165959358215, "learning_rate": 1.4033020662939939e-05, "loss": 0.0174, "step": 18870 }, { "epoch": 2.2672672672672673, "grad_norm": 0.17149385809898376, "learning_rate": 1.4011182580985355e-05, "loss": 0.0135, "step": 18875 }, { "epoch": 2.267867867867868, "grad_norm": 0.4180721938610077, "learning_rate": 1.3989358735315633e-05, "loss": 0.0153, "step": 18880 }, { "epoch": 2.2684684684684684, "grad_norm": 0.18978695571422577, "learning_rate": 1.3967549134563757e-05, "loss": 0.0127, "step": 18885 }, { "epoch": 2.269069069069069, "grad_norm": 0.2437458336353302, "learning_rate": 1.3945753787357068e-05, "loss": 0.0125, "step": 18890 }, { "epoch": 2.2696696696696694, "grad_norm": 0.21342216432094574, "learning_rate": 1.3923972702317262e-05, "loss": 0.0142, "step": 18895 }, { "epoch": 2.27027027027027, "grad_norm": 0.23508882522583008, "learning_rate": 1.3902205888060415e-05, "loss": 0.0143, "step": 18900 }, { "epoch": 2.270870870870871, "grad_norm": 0.3441837430000305, "learning_rate": 1.3880453353196905e-05, "loss": 0.0154, "step": 18905 }, { "epoch": 2.2714714714714717, "grad_norm": 0.2490258663892746, "learning_rate": 1.3858715106331516e-05, "loss": 0.0147, "step": 18910 }, { "epoch": 2.272072072072072, "grad_norm": 0.20941585302352905, "learning_rate": 1.3836991156063361e-05, "loss": 0.0121, "step": 18915 }, { "epoch": 2.2726726726726727, "grad_norm": 0.26372185349464417, "learning_rate": 1.3815281510985906e-05, "loss": 0.0136, "step": 18920 }, { "epoch": 2.2732732732732734, "grad_norm": 0.22036220133304596, "learning_rate": 1.3793586179686923e-05, "loss": 0.0139, "step": 18925 }, { "epoch": 2.2738738738738737, "grad_norm": 0.21494606137275696, "learning_rate": 1.3771905170748562e-05, "loss": 0.0152, "step": 18930 }, { "epoch": 2.2744744744744745, "grad_norm": 0.22034406661987305, "learning_rate": 1.3750238492747302e-05, "loss": 0.013, "step": 18935 }, { "epoch": 2.2750750750750752, "grad_norm": 0.2613002061843872, "learning_rate": 1.3728586154253925e-05, "loss": 0.0154, "step": 18940 }, { "epoch": 2.2756756756756755, "grad_norm": 0.2192060500383377, "learning_rate": 1.370694816383359e-05, "loss": 0.016, "step": 18945 }, { "epoch": 2.2762762762762763, "grad_norm": 0.26106199622154236, "learning_rate": 1.3685324530045707e-05, "loss": 0.0133, "step": 18950 }, { "epoch": 2.276876876876877, "grad_norm": 0.2007218450307846, "learning_rate": 1.3663715261444077e-05, "loss": 0.0149, "step": 18955 }, { "epoch": 2.2774774774774773, "grad_norm": 0.20750047266483307, "learning_rate": 1.3642120366576789e-05, "loss": 0.0126, "step": 18960 }, { "epoch": 2.278078078078078, "grad_norm": 0.39288845658302307, "learning_rate": 1.3620539853986242e-05, "loss": 0.0154, "step": 18965 }, { "epoch": 2.278678678678679, "grad_norm": 0.3260713517665863, "learning_rate": 1.3598973732209175e-05, "loss": 0.0139, "step": 18970 }, { "epoch": 2.279279279279279, "grad_norm": 0.22316108644008636, "learning_rate": 1.357742200977658e-05, "loss": 0.0149, "step": 18975 }, { "epoch": 2.27987987987988, "grad_norm": 0.3133355677127838, "learning_rate": 1.3555884695213799e-05, "loss": 0.0141, "step": 18980 }, { "epoch": 2.2804804804804806, "grad_norm": 0.2897303104400635, "learning_rate": 1.3534361797040457e-05, "loss": 0.0154, "step": 18985 }, { "epoch": 2.281081081081081, "grad_norm": 0.3254048228263855, "learning_rate": 1.3512853323770486e-05, "loss": 0.0152, "step": 18990 }, { "epoch": 2.2816816816816816, "grad_norm": 0.26747003197669983, "learning_rate": 1.3491359283912114e-05, "loss": 0.0185, "step": 18995 }, { "epoch": 2.2822822822822824, "grad_norm": 0.14857745170593262, "learning_rate": 1.3469879685967824e-05, "loss": 0.0144, "step": 19000 }, { "epoch": 2.2822822822822824, "eval_loss": 0.03958373889327049, "eval_runtime": 35.7399, "eval_samples_per_second": 22.384, "eval_steps_per_second": 5.596, "step": 19000 }, { "epoch": 2.282882882882883, "grad_norm": 0.25740018486976624, "learning_rate": 1.3448414538434428e-05, "loss": 0.0119, "step": 19005 }, { "epoch": 2.2834834834834834, "grad_norm": 0.23623026907444, "learning_rate": 1.342696384980301e-05, "loss": 0.0131, "step": 19010 }, { "epoch": 2.284084084084084, "grad_norm": 0.25876763463020325, "learning_rate": 1.340552762855894e-05, "loss": 0.0145, "step": 19015 }, { "epoch": 2.2846846846846844, "grad_norm": 0.2639022469520569, "learning_rate": 1.338410588318183e-05, "loss": 0.0155, "step": 19020 }, { "epoch": 2.285285285285285, "grad_norm": 0.23563387989997864, "learning_rate": 1.336269862214562e-05, "loss": 0.0142, "step": 19025 }, { "epoch": 2.285885885885886, "grad_norm": 0.32455793023109436, "learning_rate": 1.3341305853918462e-05, "loss": 0.015, "step": 19030 }, { "epoch": 2.2864864864864867, "grad_norm": 0.22415214776992798, "learning_rate": 1.331992758696282e-05, "loss": 0.0135, "step": 19035 }, { "epoch": 2.287087087087087, "grad_norm": 0.2912454307079315, "learning_rate": 1.3298563829735427e-05, "loss": 0.0146, "step": 19040 }, { "epoch": 2.2876876876876877, "grad_norm": 0.18687337636947632, "learning_rate": 1.327721459068722e-05, "loss": 0.0138, "step": 19045 }, { "epoch": 2.2882882882882885, "grad_norm": 0.23039725422859192, "learning_rate": 1.3255879878263449e-05, "loss": 0.0126, "step": 19050 }, { "epoch": 2.2888888888888888, "grad_norm": 0.3004606366157532, "learning_rate": 1.3234559700903592e-05, "loss": 0.0134, "step": 19055 }, { "epoch": 2.2894894894894895, "grad_norm": 0.22218632698059082, "learning_rate": 1.3213254067041392e-05, "loss": 0.0149, "step": 19060 }, { "epoch": 2.2900900900900902, "grad_norm": 0.16605770587921143, "learning_rate": 1.3191962985104838e-05, "loss": 0.0112, "step": 19065 }, { "epoch": 2.2906906906906905, "grad_norm": 0.3117055892944336, "learning_rate": 1.3170686463516125e-05, "loss": 0.0148, "step": 19070 }, { "epoch": 2.2912912912912913, "grad_norm": 0.3259162902832031, "learning_rate": 1.3149424510691738e-05, "loss": 0.015, "step": 19075 }, { "epoch": 2.291891891891892, "grad_norm": 0.2495785802602768, "learning_rate": 1.3128177135042374e-05, "loss": 0.0141, "step": 19080 }, { "epoch": 2.2924924924924923, "grad_norm": 0.30315738916397095, "learning_rate": 1.3106944344972965e-05, "loss": 0.0158, "step": 19085 }, { "epoch": 2.293093093093093, "grad_norm": 0.19319814443588257, "learning_rate": 1.3085726148882704e-05, "loss": 0.016, "step": 19090 }, { "epoch": 2.293693693693694, "grad_norm": 0.2703433036804199, "learning_rate": 1.3064522555164948e-05, "loss": 0.0148, "step": 19095 }, { "epoch": 2.294294294294294, "grad_norm": 0.2594946324825287, "learning_rate": 1.3043333572207322e-05, "loss": 0.0177, "step": 19100 }, { "epoch": 2.294894894894895, "grad_norm": 0.251132071018219, "learning_rate": 1.302215920839167e-05, "loss": 0.0128, "step": 19105 }, { "epoch": 2.2954954954954956, "grad_norm": 0.18883265554904938, "learning_rate": 1.300099947209406e-05, "loss": 0.0139, "step": 19110 }, { "epoch": 2.296096096096096, "grad_norm": 0.30045944452285767, "learning_rate": 1.297985437168473e-05, "loss": 0.0129, "step": 19115 }, { "epoch": 2.2966966966966966, "grad_norm": 0.22515608370304108, "learning_rate": 1.2958723915528187e-05, "loss": 0.0141, "step": 19120 }, { "epoch": 2.2972972972972974, "grad_norm": 0.19037845730781555, "learning_rate": 1.2937608111983085e-05, "loss": 0.0124, "step": 19125 }, { "epoch": 2.297897897897898, "grad_norm": 0.3264663815498352, "learning_rate": 1.291650696940233e-05, "loss": 0.0135, "step": 19130 }, { "epoch": 2.2984984984984984, "grad_norm": 0.34269478917121887, "learning_rate": 1.2895420496133027e-05, "loss": 0.014, "step": 19135 }, { "epoch": 2.299099099099099, "grad_norm": 0.24214765429496765, "learning_rate": 1.2874348700516432e-05, "loss": 0.0141, "step": 19140 }, { "epoch": 2.2996996996996995, "grad_norm": 0.18193790316581726, "learning_rate": 1.2853291590888034e-05, "loss": 0.0136, "step": 19145 }, { "epoch": 2.3003003003003, "grad_norm": 0.19907261431217194, "learning_rate": 1.2832249175577515e-05, "loss": 0.0122, "step": 19150 }, { "epoch": 2.300900900900901, "grad_norm": 0.17116400599479675, "learning_rate": 1.2811221462908723e-05, "loss": 0.0126, "step": 19155 }, { "epoch": 2.3015015015015017, "grad_norm": 0.2657437026500702, "learning_rate": 1.2790208461199726e-05, "loss": 0.0146, "step": 19160 }, { "epoch": 2.302102102102102, "grad_norm": 0.2539885938167572, "learning_rate": 1.2769210178762709e-05, "loss": 0.0117, "step": 19165 }, { "epoch": 2.3027027027027027, "grad_norm": 0.22538946568965912, "learning_rate": 1.2748226623904092e-05, "loss": 0.0142, "step": 19170 }, { "epoch": 2.3033033033033035, "grad_norm": 0.1703753024339676, "learning_rate": 1.2727257804924447e-05, "loss": 0.0126, "step": 19175 }, { "epoch": 2.3039039039039038, "grad_norm": 0.19249996542930603, "learning_rate": 1.270630373011853e-05, "loss": 0.0145, "step": 19180 }, { "epoch": 2.3045045045045045, "grad_norm": 0.33727169036865234, "learning_rate": 1.2685364407775236e-05, "loss": 0.0149, "step": 19185 }, { "epoch": 2.3051051051051052, "grad_norm": 0.2722015380859375, "learning_rate": 1.2664439846177644e-05, "loss": 0.0128, "step": 19190 }, { "epoch": 2.3057057057057055, "grad_norm": 0.21558161079883575, "learning_rate": 1.2643530053603003e-05, "loss": 0.0166, "step": 19195 }, { "epoch": 2.3063063063063063, "grad_norm": 0.16690169274806976, "learning_rate": 1.2622635038322705e-05, "loss": 0.0111, "step": 19200 }, { "epoch": 2.306906906906907, "grad_norm": 0.26086804270744324, "learning_rate": 1.2601754808602318e-05, "loss": 0.0166, "step": 19205 }, { "epoch": 2.3075075075075073, "grad_norm": 0.19430798292160034, "learning_rate": 1.2580889372701503e-05, "loss": 0.013, "step": 19210 }, { "epoch": 2.308108108108108, "grad_norm": 0.22891777753829956, "learning_rate": 1.2560038738874157e-05, "loss": 0.014, "step": 19215 }, { "epoch": 2.308708708708709, "grad_norm": 0.20051150023937225, "learning_rate": 1.253920291536823e-05, "loss": 0.013, "step": 19220 }, { "epoch": 2.3093093093093096, "grad_norm": 0.3059994876384735, "learning_rate": 1.251838191042588e-05, "loss": 0.0144, "step": 19225 }, { "epoch": 2.30990990990991, "grad_norm": 0.24633249640464783, "learning_rate": 1.249757573228339e-05, "loss": 0.0114, "step": 19230 }, { "epoch": 2.3105105105105106, "grad_norm": 0.3297385275363922, "learning_rate": 1.2476784389171148e-05, "loss": 0.0171, "step": 19235 }, { "epoch": 2.311111111111111, "grad_norm": 0.2548968195915222, "learning_rate": 1.2456007889313703e-05, "loss": 0.0138, "step": 19240 }, { "epoch": 2.3117117117117116, "grad_norm": 0.26288342475891113, "learning_rate": 1.2435246240929726e-05, "loss": 0.0137, "step": 19245 }, { "epoch": 2.3123123123123124, "grad_norm": 0.2402072250843048, "learning_rate": 1.241449945223202e-05, "loss": 0.0152, "step": 19250 }, { "epoch": 2.3123123123123124, "eval_loss": 0.04008075222373009, "eval_runtime": 35.795, "eval_samples_per_second": 22.35, "eval_steps_per_second": 5.587, "step": 19250 }, { "epoch": 2.312912912912913, "grad_norm": 0.19226953387260437, "learning_rate": 1.239376753142748e-05, "loss": 0.0141, "step": 19255 }, { "epoch": 2.3135135135135134, "grad_norm": 0.2099335491657257, "learning_rate": 1.2373050486717153e-05, "loss": 0.0134, "step": 19260 }, { "epoch": 2.314114114114114, "grad_norm": 0.24056895077228546, "learning_rate": 1.2352348326296182e-05, "loss": 0.0133, "step": 19265 }, { "epoch": 2.314714714714715, "grad_norm": 0.3099709451198578, "learning_rate": 1.2331661058353834e-05, "loss": 0.0129, "step": 19270 }, { "epoch": 2.315315315315315, "grad_norm": 0.20563752949237823, "learning_rate": 1.2310988691073494e-05, "loss": 0.0123, "step": 19275 }, { "epoch": 2.315915915915916, "grad_norm": 0.33363813161849976, "learning_rate": 1.2290331232632613e-05, "loss": 0.0156, "step": 19280 }, { "epoch": 2.3165165165165167, "grad_norm": 0.27251991629600525, "learning_rate": 1.2269688691202779e-05, "loss": 0.014, "step": 19285 }, { "epoch": 2.317117117117117, "grad_norm": 0.33110061287879944, "learning_rate": 1.2249061074949674e-05, "loss": 0.0137, "step": 19290 }, { "epoch": 2.3177177177177177, "grad_norm": 0.22270050644874573, "learning_rate": 1.2228448392033087e-05, "loss": 0.0136, "step": 19295 }, { "epoch": 2.3183183183183185, "grad_norm": 0.26671379804611206, "learning_rate": 1.220785065060685e-05, "loss": 0.0137, "step": 19300 }, { "epoch": 2.3189189189189188, "grad_norm": 0.25220632553100586, "learning_rate": 1.218726785881894e-05, "loss": 0.0133, "step": 19305 }, { "epoch": 2.3195195195195195, "grad_norm": 0.23740239441394806, "learning_rate": 1.2166700024811411e-05, "loss": 0.0142, "step": 19310 }, { "epoch": 2.3201201201201203, "grad_norm": 0.26378750801086426, "learning_rate": 1.2146147156720361e-05, "loss": 0.0123, "step": 19315 }, { "epoch": 2.3207207207207206, "grad_norm": 0.18735115230083466, "learning_rate": 1.2125609262676024e-05, "loss": 0.0126, "step": 19320 }, { "epoch": 2.3213213213213213, "grad_norm": 0.25290191173553467, "learning_rate": 1.2105086350802653e-05, "loss": 0.0117, "step": 19325 }, { "epoch": 2.321921921921922, "grad_norm": 0.17708788812160492, "learning_rate": 1.2084578429218617e-05, "loss": 0.012, "step": 19330 }, { "epoch": 2.3225225225225223, "grad_norm": 0.23459002375602722, "learning_rate": 1.2064085506036349e-05, "loss": 0.0138, "step": 19335 }, { "epoch": 2.323123123123123, "grad_norm": 0.28581830859184265, "learning_rate": 1.2043607589362332e-05, "loss": 0.018, "step": 19340 }, { "epoch": 2.323723723723724, "grad_norm": 0.2849411368370056, "learning_rate": 1.2023144687297144e-05, "loss": 0.0124, "step": 19345 }, { "epoch": 2.3243243243243246, "grad_norm": 0.1840084195137024, "learning_rate": 1.2002696807935365e-05, "loss": 0.0135, "step": 19350 }, { "epoch": 2.324924924924925, "grad_norm": 0.3442290425300598, "learning_rate": 1.1982263959365697e-05, "loss": 0.0142, "step": 19355 }, { "epoch": 2.3255255255255256, "grad_norm": 0.28359463810920715, "learning_rate": 1.1961846149670858e-05, "loss": 0.0145, "step": 19360 }, { "epoch": 2.326126126126126, "grad_norm": 0.16738919913768768, "learning_rate": 1.1941443386927637e-05, "loss": 0.0135, "step": 19365 }, { "epoch": 2.3267267267267266, "grad_norm": 0.22982092201709747, "learning_rate": 1.1921055679206866e-05, "loss": 0.0139, "step": 19370 }, { "epoch": 2.3273273273273274, "grad_norm": 0.1813298910856247, "learning_rate": 1.1900683034573396e-05, "loss": 0.016, "step": 19375 }, { "epoch": 2.327927927927928, "grad_norm": 0.21618647873401642, "learning_rate": 1.188032546108615e-05, "loss": 0.0138, "step": 19380 }, { "epoch": 2.3285285285285284, "grad_norm": 0.22631651163101196, "learning_rate": 1.1859982966798084e-05, "loss": 0.0153, "step": 19385 }, { "epoch": 2.329129129129129, "grad_norm": 0.26831868290901184, "learning_rate": 1.1839655559756197e-05, "loss": 0.0123, "step": 19390 }, { "epoch": 2.32972972972973, "grad_norm": 0.224435955286026, "learning_rate": 1.181934324800148e-05, "loss": 0.0127, "step": 19395 }, { "epoch": 2.33033033033033, "grad_norm": 0.27236878871917725, "learning_rate": 1.1799046039569006e-05, "loss": 0.0115, "step": 19400 }, { "epoch": 2.330930930930931, "grad_norm": 0.2713559865951538, "learning_rate": 1.1778763942487825e-05, "loss": 0.0143, "step": 19405 }, { "epoch": 2.3315315315315317, "grad_norm": 0.21134965121746063, "learning_rate": 1.1758496964781045e-05, "loss": 0.0137, "step": 19410 }, { "epoch": 2.332132132132132, "grad_norm": 0.20588813722133636, "learning_rate": 1.173824511446579e-05, "loss": 0.0127, "step": 19415 }, { "epoch": 2.3327327327327327, "grad_norm": 0.2838841378688812, "learning_rate": 1.1718008399553165e-05, "loss": 0.0146, "step": 19420 }, { "epoch": 2.3333333333333335, "grad_norm": 0.25593286752700806, "learning_rate": 1.1697786828048335e-05, "loss": 0.012, "step": 19425 }, { "epoch": 2.333933933933934, "grad_norm": 0.1968752145767212, "learning_rate": 1.1677580407950439e-05, "loss": 0.0141, "step": 19430 }, { "epoch": 2.3345345345345345, "grad_norm": 0.21549277007579803, "learning_rate": 1.1657389147252645e-05, "loss": 0.0144, "step": 19435 }, { "epoch": 2.3351351351351353, "grad_norm": 0.34446537494659424, "learning_rate": 1.1637213053942126e-05, "loss": 0.0128, "step": 19440 }, { "epoch": 2.3357357357357356, "grad_norm": 0.17235496640205383, "learning_rate": 1.1617052136000023e-05, "loss": 0.0117, "step": 19445 }, { "epoch": 2.3363363363363363, "grad_norm": 0.27614420652389526, "learning_rate": 1.1596906401401503e-05, "loss": 0.0124, "step": 19450 }, { "epoch": 2.336936936936937, "grad_norm": 0.17631419003009796, "learning_rate": 1.1576775858115718e-05, "loss": 0.0136, "step": 19455 }, { "epoch": 2.3375375375375373, "grad_norm": 0.24650456011295319, "learning_rate": 1.1556660514105839e-05, "loss": 0.0112, "step": 19460 }, { "epoch": 2.338138138138138, "grad_norm": 0.2779387831687927, "learning_rate": 1.1536560377328952e-05, "loss": 0.0132, "step": 19465 }, { "epoch": 2.338738738738739, "grad_norm": 0.22595803439617157, "learning_rate": 1.1516475455736203e-05, "loss": 0.0132, "step": 19470 }, { "epoch": 2.3393393393393396, "grad_norm": 0.21261551976203918, "learning_rate": 1.1496405757272682e-05, "loss": 0.015, "step": 19475 }, { "epoch": 2.33993993993994, "grad_norm": 0.2531772553920746, "learning_rate": 1.1476351289877468e-05, "loss": 0.0141, "step": 19480 }, { "epoch": 2.3405405405405406, "grad_norm": 0.21187564730644226, "learning_rate": 1.145631206148362e-05, "loss": 0.0136, "step": 19485 }, { "epoch": 2.341141141141141, "grad_norm": 0.2373325675725937, "learning_rate": 1.1436288080018137e-05, "loss": 0.0116, "step": 19490 }, { "epoch": 2.3417417417417417, "grad_norm": 0.2928980886936188, "learning_rate": 1.1416279353402038e-05, "loss": 0.0142, "step": 19495 }, { "epoch": 2.3423423423423424, "grad_norm": 0.21609313786029816, "learning_rate": 1.139628588955025e-05, "loss": 0.0124, "step": 19500 }, { "epoch": 2.3423423423423424, "eval_loss": 0.03953830152750015, "eval_runtime": 35.8563, "eval_samples_per_second": 22.311, "eval_steps_per_second": 5.578, "step": 19500 }, { "epoch": 2.342942942942943, "grad_norm": 0.2801402509212494, "learning_rate": 1.1376307696371707e-05, "loss": 0.0136, "step": 19505 }, { "epoch": 2.3435435435435434, "grad_norm": 0.22694651782512665, "learning_rate": 1.1356344781769301e-05, "loss": 0.0142, "step": 19510 }, { "epoch": 2.344144144144144, "grad_norm": 0.27109283208847046, "learning_rate": 1.1336397153639844e-05, "loss": 0.0138, "step": 19515 }, { "epoch": 2.344744744744745, "grad_norm": 0.20073270797729492, "learning_rate": 1.1316464819874129e-05, "loss": 0.0143, "step": 19520 }, { "epoch": 2.3453453453453452, "grad_norm": 0.3076915442943573, "learning_rate": 1.1296547788356898e-05, "loss": 0.0133, "step": 19525 }, { "epoch": 2.345945945945946, "grad_norm": 0.2712023854255676, "learning_rate": 1.1276646066966834e-05, "loss": 0.0144, "step": 19530 }, { "epoch": 2.3465465465465467, "grad_norm": 0.21880017220973969, "learning_rate": 1.1256759663576576e-05, "loss": 0.0126, "step": 19535 }, { "epoch": 2.347147147147147, "grad_norm": 0.16714221239089966, "learning_rate": 1.1236888586052673e-05, "loss": 0.0128, "step": 19540 }, { "epoch": 2.3477477477477477, "grad_norm": 0.284549742937088, "learning_rate": 1.1217032842255643e-05, "loss": 0.015, "step": 19545 }, { "epoch": 2.3483483483483485, "grad_norm": 0.15571853518486023, "learning_rate": 1.1197192440039921e-05, "loss": 0.0118, "step": 19550 }, { "epoch": 2.348948948948949, "grad_norm": 0.23313234746456146, "learning_rate": 1.1177367387253896e-05, "loss": 0.0133, "step": 19555 }, { "epoch": 2.3495495495495495, "grad_norm": 0.19322821497917175, "learning_rate": 1.115755769173984e-05, "loss": 0.0127, "step": 19560 }, { "epoch": 2.3501501501501503, "grad_norm": 0.26968666911125183, "learning_rate": 1.1137763361333992e-05, "loss": 0.0123, "step": 19565 }, { "epoch": 2.3507507507507506, "grad_norm": 0.22539116442203522, "learning_rate": 1.1117984403866499e-05, "loss": 0.0129, "step": 19570 }, { "epoch": 2.3513513513513513, "grad_norm": 0.2742008566856384, "learning_rate": 1.1098220827161427e-05, "loss": 0.0164, "step": 19575 }, { "epoch": 2.351951951951952, "grad_norm": 0.21559664607048035, "learning_rate": 1.1078472639036769e-05, "loss": 0.013, "step": 19580 }, { "epoch": 2.3525525525525524, "grad_norm": 0.2503921091556549, "learning_rate": 1.1058739847304394e-05, "loss": 0.0125, "step": 19585 }, { "epoch": 2.353153153153153, "grad_norm": 0.16386358439922333, "learning_rate": 1.1039022459770132e-05, "loss": 0.0119, "step": 19590 }, { "epoch": 2.353753753753754, "grad_norm": 0.22208142280578613, "learning_rate": 1.101932048423367e-05, "loss": 0.0142, "step": 19595 }, { "epoch": 2.3543543543543546, "grad_norm": 0.23939143121242523, "learning_rate": 1.0999633928488629e-05, "loss": 0.0121, "step": 19600 }, { "epoch": 2.354954954954955, "grad_norm": 0.1505102664232254, "learning_rate": 1.0979962800322535e-05, "loss": 0.0123, "step": 19605 }, { "epoch": 2.3555555555555556, "grad_norm": 0.3065943419933319, "learning_rate": 1.0960307107516782e-05, "loss": 0.0131, "step": 19610 }, { "epoch": 2.356156156156156, "grad_norm": 0.19549353420734406, "learning_rate": 1.0940666857846682e-05, "loss": 0.0134, "step": 19615 }, { "epoch": 2.3567567567567567, "grad_norm": 0.23805366456508636, "learning_rate": 1.0921042059081426e-05, "loss": 0.0129, "step": 19620 }, { "epoch": 2.3573573573573574, "grad_norm": 0.2263217568397522, "learning_rate": 1.0901432718984128e-05, "loss": 0.0142, "step": 19625 }, { "epoch": 2.357957957957958, "grad_norm": 0.24713027477264404, "learning_rate": 1.0881838845311714e-05, "loss": 0.0134, "step": 19630 }, { "epoch": 2.3585585585585584, "grad_norm": 0.22670410573482513, "learning_rate": 1.0862260445815053e-05, "loss": 0.0122, "step": 19635 }, { "epoch": 2.359159159159159, "grad_norm": 0.27957212924957275, "learning_rate": 1.0842697528238883e-05, "loss": 0.0126, "step": 19640 }, { "epoch": 2.35975975975976, "grad_norm": 0.25959333777427673, "learning_rate": 1.08231501003218e-05, "loss": 0.0117, "step": 19645 }, { "epoch": 2.3603603603603602, "grad_norm": 0.16630196571350098, "learning_rate": 1.0803618169796298e-05, "loss": 0.013, "step": 19650 }, { "epoch": 2.360960960960961, "grad_norm": 0.24647246301174164, "learning_rate": 1.0784101744388702e-05, "loss": 0.0142, "step": 19655 }, { "epoch": 2.3615615615615617, "grad_norm": 0.3010789453983307, "learning_rate": 1.0764600831819238e-05, "loss": 0.0135, "step": 19660 }, { "epoch": 2.362162162162162, "grad_norm": 0.2706170976161957, "learning_rate": 1.0745115439801984e-05, "loss": 0.0139, "step": 19665 }, { "epoch": 2.3627627627627628, "grad_norm": 0.17888972163200378, "learning_rate": 1.072564557604489e-05, "loss": 0.0118, "step": 19670 }, { "epoch": 2.3633633633633635, "grad_norm": 0.21044613420963287, "learning_rate": 1.0706191248249725e-05, "loss": 0.012, "step": 19675 }, { "epoch": 2.363963963963964, "grad_norm": 0.3188132345676422, "learning_rate": 1.0686752464112153e-05, "loss": 0.0129, "step": 19680 }, { "epoch": 2.3645645645645645, "grad_norm": 0.2343267947435379, "learning_rate": 1.0667329231321699e-05, "loss": 0.0131, "step": 19685 }, { "epoch": 2.3651651651651653, "grad_norm": 0.15735098719596863, "learning_rate": 1.0647921557561668e-05, "loss": 0.0118, "step": 19690 }, { "epoch": 2.3657657657657656, "grad_norm": 0.221736878156662, "learning_rate": 1.06285294505093e-05, "loss": 0.0133, "step": 19695 }, { "epoch": 2.3663663663663663, "grad_norm": 0.19816137850284576, "learning_rate": 1.0609152917835591e-05, "loss": 0.0121, "step": 19700 }, { "epoch": 2.366966966966967, "grad_norm": 0.17485511302947998, "learning_rate": 1.0589791967205437e-05, "loss": 0.0129, "step": 19705 }, { "epoch": 2.3675675675675674, "grad_norm": 0.27220287919044495, "learning_rate": 1.0570446606277551e-05, "loss": 0.0142, "step": 19710 }, { "epoch": 2.368168168168168, "grad_norm": 0.20521901547908783, "learning_rate": 1.0551116842704479e-05, "loss": 0.0152, "step": 19715 }, { "epoch": 2.368768768768769, "grad_norm": 0.3141981363296509, "learning_rate": 1.0531802684132608e-05, "loss": 0.0133, "step": 19720 }, { "epoch": 2.3693693693693696, "grad_norm": 0.2389078289270401, "learning_rate": 1.0512504138202112e-05, "loss": 0.0141, "step": 19725 }, { "epoch": 2.36996996996997, "grad_norm": 0.19536085426807404, "learning_rate": 1.0493221212547038e-05, "loss": 0.012, "step": 19730 }, { "epoch": 2.3705705705705706, "grad_norm": 0.22260215878486633, "learning_rate": 1.0473953914795225e-05, "loss": 0.0124, "step": 19735 }, { "epoch": 2.371171171171171, "grad_norm": 0.20631587505340576, "learning_rate": 1.0454702252568349e-05, "loss": 0.0134, "step": 19740 }, { "epoch": 2.3717717717717717, "grad_norm": 0.1789875328540802, "learning_rate": 1.04354662334819e-05, "loss": 0.0133, "step": 19745 }, { "epoch": 2.3723723723723724, "grad_norm": 0.27481332421302795, "learning_rate": 1.0416245865145141e-05, "loss": 0.0143, "step": 19750 }, { "epoch": 2.3723723723723724, "eval_loss": 0.03786711394786835, "eval_runtime": 35.9806, "eval_samples_per_second": 22.234, "eval_steps_per_second": 5.559, "step": 19750 }, { "epoch": 2.372972972972973, "grad_norm": 0.19248706102371216, "learning_rate": 1.0397041155161185e-05, "loss": 0.012, "step": 19755 }, { "epoch": 2.3735735735735735, "grad_norm": 0.25902336835861206, "learning_rate": 1.0377852111126951e-05, "loss": 0.0131, "step": 19760 }, { "epoch": 2.374174174174174, "grad_norm": 0.16944050788879395, "learning_rate": 1.0358678740633154e-05, "loss": 0.0104, "step": 19765 }, { "epoch": 2.374774774774775, "grad_norm": 0.30373936891555786, "learning_rate": 1.0339521051264278e-05, "loss": 0.013, "step": 19770 }, { "epoch": 2.3753753753753752, "grad_norm": 0.3525901138782501, "learning_rate": 1.0320379050598654e-05, "loss": 0.0135, "step": 19775 }, { "epoch": 2.375975975975976, "grad_norm": 0.1787666529417038, "learning_rate": 1.0301252746208367e-05, "loss": 0.0106, "step": 19780 }, { "epoch": 2.3765765765765767, "grad_norm": 0.17433393001556396, "learning_rate": 1.0282142145659319e-05, "loss": 0.0139, "step": 19785 }, { "epoch": 2.377177177177177, "grad_norm": 0.21298392117023468, "learning_rate": 1.0263047256511199e-05, "loss": 0.0123, "step": 19790 }, { "epoch": 2.3777777777777778, "grad_norm": 0.27403366565704346, "learning_rate": 1.0243968086317446e-05, "loss": 0.012, "step": 19795 }, { "epoch": 2.3783783783783785, "grad_norm": 0.224418506026268, "learning_rate": 1.0224904642625327e-05, "loss": 0.0122, "step": 19800 }, { "epoch": 2.378978978978979, "grad_norm": 0.19469419121742249, "learning_rate": 1.020585693297586e-05, "loss": 0.0138, "step": 19805 }, { "epoch": 2.3795795795795796, "grad_norm": 0.25854915380477905, "learning_rate": 1.0186824964903851e-05, "loss": 0.0129, "step": 19810 }, { "epoch": 2.3801801801801803, "grad_norm": 0.3246530294418335, "learning_rate": 1.0167808745937891e-05, "loss": 0.0154, "step": 19815 }, { "epoch": 2.3807807807807806, "grad_norm": 0.2533751428127289, "learning_rate": 1.0148808283600297e-05, "loss": 0.0125, "step": 19820 }, { "epoch": 2.3813813813813813, "grad_norm": 0.22221092879772186, "learning_rate": 1.0129823585407194e-05, "loss": 0.0143, "step": 19825 }, { "epoch": 2.381981981981982, "grad_norm": 0.266520619392395, "learning_rate": 1.0110854658868457e-05, "loss": 0.0148, "step": 19830 }, { "epoch": 2.3825825825825824, "grad_norm": 0.3407699167728424, "learning_rate": 1.0091901511487738e-05, "loss": 0.0123, "step": 19835 }, { "epoch": 2.383183183183183, "grad_norm": 0.19929823279380798, "learning_rate": 1.007296415076241e-05, "loss": 0.0132, "step": 19840 }, { "epoch": 2.383783783783784, "grad_norm": 0.24646946787834167, "learning_rate": 1.0054042584183632e-05, "loss": 0.0116, "step": 19845 }, { "epoch": 2.3843843843843846, "grad_norm": 0.20933978259563446, "learning_rate": 1.0035136819236307e-05, "loss": 0.0112, "step": 19850 }, { "epoch": 2.384984984984985, "grad_norm": 0.15630047023296356, "learning_rate": 1.0016246863399087e-05, "loss": 0.0139, "step": 19855 }, { "epoch": 2.3855855855855856, "grad_norm": 0.1657629907131195, "learning_rate": 9.997372724144388e-06, "loss": 0.0139, "step": 19860 }, { "epoch": 2.386186186186186, "grad_norm": 0.20281122624874115, "learning_rate": 9.978514408938328e-06, "loss": 0.0124, "step": 19865 }, { "epoch": 2.3867867867867867, "grad_norm": 0.17628662288188934, "learning_rate": 9.95967192524081e-06, "loss": 0.0125, "step": 19870 }, { "epoch": 2.3873873873873874, "grad_norm": 0.21708153188228607, "learning_rate": 9.940845280505423e-06, "loss": 0.0122, "step": 19875 }, { "epoch": 2.387987987987988, "grad_norm": 0.22435113787651062, "learning_rate": 9.922034482179549e-06, "loss": 0.0132, "step": 19880 }, { "epoch": 2.3885885885885885, "grad_norm": 0.28971534967422485, "learning_rate": 9.903239537704272e-06, "loss": 0.0124, "step": 19885 }, { "epoch": 2.389189189189189, "grad_norm": 0.17425502836704254, "learning_rate": 9.884460454514389e-06, "loss": 0.013, "step": 19890 }, { "epoch": 2.38978978978979, "grad_norm": 0.12470649927854538, "learning_rate": 9.865697240038452e-06, "loss": 0.0123, "step": 19895 }, { "epoch": 2.3903903903903903, "grad_norm": 0.1936112493276596, "learning_rate": 9.846949901698727e-06, "loss": 0.0135, "step": 19900 }, { "epoch": 2.390990990990991, "grad_norm": 0.15805506706237793, "learning_rate": 9.828218446911203e-06, "loss": 0.0118, "step": 19905 }, { "epoch": 2.3915915915915917, "grad_norm": 0.32184869050979614, "learning_rate": 9.809502883085553e-06, "loss": 0.0134, "step": 19910 }, { "epoch": 2.392192192192192, "grad_norm": 0.21912485361099243, "learning_rate": 9.79080321762521e-06, "loss": 0.0134, "step": 19915 }, { "epoch": 2.3927927927927928, "grad_norm": 0.24116133153438568, "learning_rate": 9.772119457927298e-06, "loss": 0.0122, "step": 19920 }, { "epoch": 2.3933933933933935, "grad_norm": 0.20128588378429413, "learning_rate": 9.753451611382647e-06, "loss": 0.0107, "step": 19925 }, { "epoch": 2.393993993993994, "grad_norm": 0.24893133342266083, "learning_rate": 9.734799685375806e-06, "loss": 0.0127, "step": 19930 }, { "epoch": 2.3945945945945946, "grad_norm": 0.16727831959724426, "learning_rate": 9.71616368728499e-06, "loss": 0.0119, "step": 19935 }, { "epoch": 2.3951951951951953, "grad_norm": 0.3045255243778229, "learning_rate": 9.697543624482158e-06, "loss": 0.0131, "step": 19940 }, { "epoch": 2.3957957957957956, "grad_norm": 0.23958060145378113, "learning_rate": 9.678939504332934e-06, "loss": 0.0135, "step": 19945 }, { "epoch": 2.3963963963963963, "grad_norm": 0.18694347143173218, "learning_rate": 9.66035133419666e-06, "loss": 0.0138, "step": 19950 }, { "epoch": 2.396996996996997, "grad_norm": 0.1764531284570694, "learning_rate": 9.641779121426358e-06, "loss": 0.0126, "step": 19955 }, { "epoch": 2.3975975975975974, "grad_norm": 0.21081571280956268, "learning_rate": 9.623222873368714e-06, "loss": 0.0125, "step": 19960 }, { "epoch": 2.398198198198198, "grad_norm": 0.21348458528518677, "learning_rate": 9.604682597364145e-06, "loss": 0.0125, "step": 19965 }, { "epoch": 2.398798798798799, "grad_norm": 0.1940726935863495, "learning_rate": 9.5861583007467e-06, "loss": 0.013, "step": 19970 }, { "epoch": 2.3993993993993996, "grad_norm": 0.1993122398853302, "learning_rate": 9.567649990844146e-06, "loss": 0.0125, "step": 19975 }, { "epoch": 2.4, "grad_norm": 0.3673401176929474, "learning_rate": 9.54915767497792e-06, "loss": 0.0142, "step": 19980 }, { "epoch": 2.4006006006006007, "grad_norm": 0.23906080424785614, "learning_rate": 9.530681360463107e-06, "loss": 0.0127, "step": 19985 }, { "epoch": 2.401201201201201, "grad_norm": 0.15805503726005554, "learning_rate": 9.512221054608483e-06, "loss": 0.0131, "step": 19990 }, { "epoch": 2.4018018018018017, "grad_norm": 0.1870601326227188, "learning_rate": 9.493776764716495e-06, "loss": 0.0116, "step": 19995 }, { "epoch": 2.4024024024024024, "grad_norm": 0.2301345020532608, "learning_rate": 9.47534849808326e-06, "loss": 0.0128, "step": 20000 }, { "epoch": 2.4024024024024024, "eval_loss": 0.038246750831604004, "eval_runtime": 35.8886, "eval_samples_per_second": 22.291, "eval_steps_per_second": 5.573, "step": 20000 } ], "logging_steps": 5, "max_steps": 24975, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.0321032618337056e+20, "train_batch_size": 24, "trial_name": null, "trial_params": null }