{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 100, "global_step": 2853, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0010515247108307045, "grad_norm": 21.846562454867367, "learning_rate": 6.993006993006993e-08, "loss": 1.3669, "step": 1 }, { "epoch": 0.005257623554153523, "grad_norm": 21.132009448461105, "learning_rate": 3.496503496503497e-07, "loss": 1.3522, "step": 5 }, { "epoch": 0.010515247108307046, "grad_norm": 17.494352717419737, "learning_rate": 6.993006993006994e-07, "loss": 1.354, "step": 10 }, { "epoch": 0.015772870662460567, "grad_norm": 3.3387986746360583, "learning_rate": 1.0489510489510491e-06, "loss": 1.2867, "step": 15 }, { "epoch": 0.02103049421661409, "grad_norm": 2.205920355996413, "learning_rate": 1.3986013986013987e-06, "loss": 1.2275, "step": 20 }, { "epoch": 0.026288117770767613, "grad_norm": 1.369597839526372, "learning_rate": 1.7482517482517483e-06, "loss": 1.1889, "step": 25 }, { "epoch": 0.031545741324921134, "grad_norm": 1.1230719730143253, "learning_rate": 2.0979020979020983e-06, "loss": 1.1654, "step": 30 }, { "epoch": 0.03680336487907466, "grad_norm": 0.8609236106610554, "learning_rate": 2.4475524475524477e-06, "loss": 1.1648, "step": 35 }, { "epoch": 0.04206098843322818, "grad_norm": 0.797862698606503, "learning_rate": 2.7972027972027974e-06, "loss": 1.1519, "step": 40 }, { "epoch": 0.0473186119873817, "grad_norm": 0.8072361631632317, "learning_rate": 3.1468531468531472e-06, "loss": 1.1237, "step": 45 }, { "epoch": 0.052576235541535225, "grad_norm": 0.7629783040030311, "learning_rate": 3.4965034965034966e-06, "loss": 1.1325, "step": 50 }, { "epoch": 0.05783385909568875, "grad_norm": 0.6873017397880803, "learning_rate": 3.846153846153847e-06, "loss": 1.1026, "step": 55 }, { "epoch": 0.06309148264984227, "grad_norm": 0.696611266506068, "learning_rate": 4.195804195804197e-06, "loss": 1.1039, "step": 60 }, { "epoch": 0.0683491062039958, "grad_norm": 0.7029981157873147, "learning_rate": 4.5454545454545455e-06, "loss": 1.0926, "step": 65 }, { "epoch": 0.07360672975814932, "grad_norm": 0.7133610172269549, "learning_rate": 4.895104895104895e-06, "loss": 1.1062, "step": 70 }, { "epoch": 0.07886435331230283, "grad_norm": 0.7217968834872122, "learning_rate": 5.244755244755245e-06, "loss": 1.1003, "step": 75 }, { "epoch": 0.08412197686645637, "grad_norm": 0.7014523604685313, "learning_rate": 5.594405594405595e-06, "loss": 1.0944, "step": 80 }, { "epoch": 0.08937960042060988, "grad_norm": 0.7206581604903497, "learning_rate": 5.944055944055944e-06, "loss": 1.1056, "step": 85 }, { "epoch": 0.0946372239747634, "grad_norm": 0.7457066334039347, "learning_rate": 6.2937062937062944e-06, "loss": 1.1065, "step": 90 }, { "epoch": 0.09989484752891693, "grad_norm": 0.7208104956061856, "learning_rate": 6.643356643356644e-06, "loss": 1.0892, "step": 95 }, { "epoch": 0.10515247108307045, "grad_norm": 0.7717192179121782, "learning_rate": 6.993006993006993e-06, "loss": 1.103, "step": 100 }, { "epoch": 0.10515247108307045, "eval_loss": 1.0989242792129517, "eval_runtime": 734.3008, "eval_samples_per_second": 18.326, "eval_steps_per_second": 0.144, "step": 100 }, { "epoch": 0.11041009463722397, "grad_norm": 0.8259894569490014, "learning_rate": 7.342657342657343e-06, "loss": 1.0814, "step": 105 }, { "epoch": 0.1156677181913775, "grad_norm": 1.1085848778320089, "learning_rate": 7.692307692307694e-06, "loss": 1.1025, "step": 110 }, { "epoch": 0.12092534174553102, "grad_norm": 0.7455910381865771, "learning_rate": 8.041958041958042e-06, "loss": 1.1079, "step": 115 }, { "epoch": 0.12618296529968454, "grad_norm": 0.9674079050397694, "learning_rate": 8.391608391608393e-06, "loss": 1.0784, "step": 120 }, { "epoch": 0.13144058885383805, "grad_norm": 0.9988378537350968, "learning_rate": 8.741258741258743e-06, "loss": 1.0759, "step": 125 }, { "epoch": 0.1366982124079916, "grad_norm": 0.8719562477062238, "learning_rate": 9.090909090909091e-06, "loss": 1.1161, "step": 130 }, { "epoch": 0.14195583596214512, "grad_norm": 0.7675881427867184, "learning_rate": 9.44055944055944e-06, "loss": 1.0935, "step": 135 }, { "epoch": 0.14721345951629863, "grad_norm": 0.724214660257682, "learning_rate": 9.79020979020979e-06, "loss": 1.0905, "step": 140 }, { "epoch": 0.15247108307045215, "grad_norm": 0.7816188167257716, "learning_rate": 1.013986013986014e-05, "loss": 1.1015, "step": 145 }, { "epoch": 0.15772870662460567, "grad_norm": 0.8160659875512388, "learning_rate": 1.048951048951049e-05, "loss": 1.0841, "step": 150 }, { "epoch": 0.16298633017875921, "grad_norm": 0.7024355651373789, "learning_rate": 1.083916083916084e-05, "loss": 1.0944, "step": 155 }, { "epoch": 0.16824395373291273, "grad_norm": 0.8350948417258764, "learning_rate": 1.118881118881119e-05, "loss": 1.0865, "step": 160 }, { "epoch": 0.17350157728706625, "grad_norm": 0.7759430616830292, "learning_rate": 1.1538461538461538e-05, "loss": 1.0957, "step": 165 }, { "epoch": 0.17875920084121977, "grad_norm": 0.8253932883708276, "learning_rate": 1.1888111888111888e-05, "loss": 1.0862, "step": 170 }, { "epoch": 0.18401682439537329, "grad_norm": 0.727381322987075, "learning_rate": 1.2237762237762239e-05, "loss": 1.0531, "step": 175 }, { "epoch": 0.1892744479495268, "grad_norm": 0.7093901420339217, "learning_rate": 1.2587412587412589e-05, "loss": 1.0983, "step": 180 }, { "epoch": 0.19453207150368035, "grad_norm": 0.953147430950059, "learning_rate": 1.2937062937062939e-05, "loss": 1.0971, "step": 185 }, { "epoch": 0.19978969505783387, "grad_norm": 0.7716908173558169, "learning_rate": 1.3286713286713288e-05, "loss": 1.075, "step": 190 }, { "epoch": 0.20504731861198738, "grad_norm": 0.8672736054906722, "learning_rate": 1.3636363636363637e-05, "loss": 1.0793, "step": 195 }, { "epoch": 0.2103049421661409, "grad_norm": 0.8675965486083684, "learning_rate": 1.3986013986013986e-05, "loss": 1.0867, "step": 200 }, { "epoch": 0.2103049421661409, "eval_loss": 1.0965888500213623, "eval_runtime": 649.556, "eval_samples_per_second": 20.717, "eval_steps_per_second": 0.163, "step": 200 }, { "epoch": 0.21556256572029442, "grad_norm": 0.8977881972264273, "learning_rate": 1.4335664335664336e-05, "loss": 1.0954, "step": 205 }, { "epoch": 0.22082018927444794, "grad_norm": 0.7235719037878356, "learning_rate": 1.4685314685314686e-05, "loss": 1.0983, "step": 210 }, { "epoch": 0.22607781282860148, "grad_norm": 0.7296340934381736, "learning_rate": 1.5034965034965037e-05, "loss": 1.0782, "step": 215 }, { "epoch": 0.231335436382755, "grad_norm": 0.8547235635957527, "learning_rate": 1.5384615384615387e-05, "loss": 1.0924, "step": 220 }, { "epoch": 0.23659305993690852, "grad_norm": 0.7564410266828079, "learning_rate": 1.5734265734265734e-05, "loss": 1.0912, "step": 225 }, { "epoch": 0.24185068349106204, "grad_norm": 0.8525133534517718, "learning_rate": 1.6083916083916083e-05, "loss": 1.0954, "step": 230 }, { "epoch": 0.24710830704521555, "grad_norm": 0.805401759340136, "learning_rate": 1.6433566433566433e-05, "loss": 1.0749, "step": 235 }, { "epoch": 0.25236593059936907, "grad_norm": 0.7402154168479581, "learning_rate": 1.6783216783216786e-05, "loss": 1.0986, "step": 240 }, { "epoch": 0.2576235541535226, "grad_norm": 0.7474174775706688, "learning_rate": 1.7132867132867136e-05, "loss": 1.0869, "step": 245 }, { "epoch": 0.2628811777076761, "grad_norm": 0.7369490806417859, "learning_rate": 1.7482517482517486e-05, "loss": 1.0776, "step": 250 }, { "epoch": 0.26813880126182965, "grad_norm": 0.9287606428487797, "learning_rate": 1.7832167832167832e-05, "loss": 1.1021, "step": 255 }, { "epoch": 0.2733964248159832, "grad_norm": 0.6947297961768544, "learning_rate": 1.8181818181818182e-05, "loss": 1.1012, "step": 260 }, { "epoch": 0.2786540483701367, "grad_norm": 0.8123153065370199, "learning_rate": 1.8531468531468532e-05, "loss": 1.1074, "step": 265 }, { "epoch": 0.28391167192429023, "grad_norm": 0.8302401027015646, "learning_rate": 1.888111888111888e-05, "loss": 1.1083, "step": 270 }, { "epoch": 0.2891692954784437, "grad_norm": 0.6881962349423542, "learning_rate": 1.923076923076923e-05, "loss": 1.0937, "step": 275 }, { "epoch": 0.29442691903259727, "grad_norm": 0.7279189773202484, "learning_rate": 1.958041958041958e-05, "loss": 1.0913, "step": 280 }, { "epoch": 0.2996845425867508, "grad_norm": 1.0229788845896652, "learning_rate": 1.993006993006993e-05, "loss": 1.062, "step": 285 }, { "epoch": 0.3049421661409043, "grad_norm": 0.8087865464750971, "learning_rate": 1.9999880177844552e-05, "loss": 1.0943, "step": 290 }, { "epoch": 0.31019978969505785, "grad_norm": 0.7888576974082969, "learning_rate": 1.9999393405259354e-05, "loss": 1.0814, "step": 295 }, { "epoch": 0.31545741324921134, "grad_norm": 0.7387794672867187, "learning_rate": 1.9998532211572566e-05, "loss": 1.111, "step": 300 }, { "epoch": 0.31545741324921134, "eval_loss": 1.101216435432434, "eval_runtime": 582.6045, "eval_samples_per_second": 23.098, "eval_steps_per_second": 0.182, "step": 300 }, { "epoch": 0.3207150368033649, "grad_norm": 0.7908623885539283, "learning_rate": 1.999729662903106e-05, "loss": 1.0945, "step": 305 }, { "epoch": 0.32597266035751843, "grad_norm": 0.6771503993700702, "learning_rate": 1.999568670390045e-05, "loss": 1.0926, "step": 310 }, { "epoch": 0.3312302839116719, "grad_norm": 0.6841445829487095, "learning_rate": 1.9993702496463395e-05, "loss": 1.1157, "step": 315 }, { "epoch": 0.33648790746582546, "grad_norm": 0.6751678361124496, "learning_rate": 1.9991344081017312e-05, "loss": 1.1029, "step": 320 }, { "epoch": 0.34174553101997895, "grad_norm": 0.6430661618178782, "learning_rate": 1.9988611545871606e-05, "loss": 1.0914, "step": 325 }, { "epoch": 0.3470031545741325, "grad_norm": 0.6415970890637294, "learning_rate": 1.9985504993344375e-05, "loss": 1.095, "step": 330 }, { "epoch": 0.352260778128286, "grad_norm": 0.7730277501959658, "learning_rate": 1.9982024539758547e-05, "loss": 1.1047, "step": 335 }, { "epoch": 0.35751840168243953, "grad_norm": 0.697788892685463, "learning_rate": 1.997817031543756e-05, "loss": 1.0943, "step": 340 }, { "epoch": 0.3627760252365931, "grad_norm": 0.7222719849480133, "learning_rate": 1.9973942464700456e-05, "loss": 1.0723, "step": 345 }, { "epoch": 0.36803364879074657, "grad_norm": 0.7260613938093592, "learning_rate": 1.9969341145856493e-05, "loss": 1.0839, "step": 350 }, { "epoch": 0.3732912723449001, "grad_norm": 0.733047206796414, "learning_rate": 1.9964366531199205e-05, "loss": 1.1031, "step": 355 }, { "epoch": 0.3785488958990536, "grad_norm": 0.7026038822669134, "learning_rate": 1.995901880699997e-05, "loss": 1.0921, "step": 360 }, { "epoch": 0.38380651945320715, "grad_norm": 0.7130129636017671, "learning_rate": 1.9953298173501007e-05, "loss": 1.1082, "step": 365 }, { "epoch": 0.3890641430073607, "grad_norm": 0.6640121507704535, "learning_rate": 1.9947204844907903e-05, "loss": 1.0865, "step": 370 }, { "epoch": 0.3943217665615142, "grad_norm": 0.6489539943403665, "learning_rate": 1.994073904938157e-05, "loss": 1.1005, "step": 375 }, { "epoch": 0.39957939011566773, "grad_norm": 0.6442461845826825, "learning_rate": 1.9933901029029732e-05, "loss": 1.0723, "step": 380 }, { "epoch": 0.4048370136698212, "grad_norm": 0.7261445745563544, "learning_rate": 1.992669103989783e-05, "loss": 1.1011, "step": 385 }, { "epoch": 0.41009463722397477, "grad_norm": 0.7222317305626339, "learning_rate": 1.9919109351959444e-05, "loss": 1.0908, "step": 390 }, { "epoch": 0.4153522607781283, "grad_norm": 0.6323590176184729, "learning_rate": 1.9911156249106186e-05, "loss": 1.089, "step": 395 }, { "epoch": 0.4206098843322818, "grad_norm": 0.6782790061464099, "learning_rate": 1.9902832029137086e-05, "loss": 1.0974, "step": 400 }, { "epoch": 0.4206098843322818, "eval_loss": 1.0965957641601562, "eval_runtime": 611.6915, "eval_samples_per_second": 22.0, "eval_steps_per_second": 0.173, "step": 400 }, { "epoch": 0.42586750788643535, "grad_norm": 0.7130643351963079, "learning_rate": 1.9894137003747404e-05, "loss": 1.0863, "step": 405 }, { "epoch": 0.43112513144058884, "grad_norm": 0.7035673344638229, "learning_rate": 1.988507149851699e-05, "loss": 1.0928, "step": 410 }, { "epoch": 0.4363827549947424, "grad_norm": 0.6366880839024881, "learning_rate": 1.987563585289808e-05, "loss": 1.0876, "step": 415 }, { "epoch": 0.4416403785488959, "grad_norm": 0.6724589905602705, "learning_rate": 1.9865830420202587e-05, "loss": 1.0814, "step": 420 }, { "epoch": 0.4468980021030494, "grad_norm": 0.7296112006903912, "learning_rate": 1.9855655567588877e-05, "loss": 1.0849, "step": 425 }, { "epoch": 0.45215562565720296, "grad_norm": 0.7301202009190912, "learning_rate": 1.984511167604801e-05, "loss": 1.0943, "step": 430 }, { "epoch": 0.45741324921135645, "grad_norm": 0.6728038801425467, "learning_rate": 1.9834199140389485e-05, "loss": 1.0958, "step": 435 }, { "epoch": 0.46267087276551, "grad_norm": 0.6461291574762016, "learning_rate": 1.982291836922645e-05, "loss": 1.0814, "step": 440 }, { "epoch": 0.4679284963196635, "grad_norm": 0.7354488392025322, "learning_rate": 1.9811269784960404e-05, "loss": 1.1019, "step": 445 }, { "epoch": 0.47318611987381703, "grad_norm": 0.871639557338332, "learning_rate": 1.9799253823765383e-05, "loss": 1.1006, "step": 450 }, { "epoch": 0.4784437434279706, "grad_norm": 0.7250167929947016, "learning_rate": 1.9786870935571617e-05, "loss": 1.0976, "step": 455 }, { "epoch": 0.48370136698212407, "grad_norm": 0.7624377086650501, "learning_rate": 1.97741215840487e-05, "loss": 1.073, "step": 460 }, { "epoch": 0.4889589905362776, "grad_norm": 0.7335919595002304, "learning_rate": 1.9761006246588217e-05, "loss": 1.0928, "step": 465 }, { "epoch": 0.4942166140904311, "grad_norm": 0.6382852192610631, "learning_rate": 1.9747525414285863e-05, "loss": 1.0945, "step": 470 }, { "epoch": 0.49947423764458465, "grad_norm": 0.718180529210079, "learning_rate": 1.9733679591923062e-05, "loss": 1.0749, "step": 475 }, { "epoch": 0.5047318611987381, "grad_norm": 0.6600718457016724, "learning_rate": 1.9719469297948076e-05, "loss": 1.1181, "step": 480 }, { "epoch": 0.5099894847528917, "grad_norm": 0.6689062165685349, "learning_rate": 1.9704895064456573e-05, "loss": 1.0952, "step": 485 }, { "epoch": 0.5152471083070452, "grad_norm": 0.7143276328895771, "learning_rate": 1.968995743717171e-05, "loss": 1.0896, "step": 490 }, { "epoch": 0.5205047318611987, "grad_norm": 0.6221294359823765, "learning_rate": 1.9674656975423704e-05, "loss": 1.0742, "step": 495 }, { "epoch": 0.5257623554153522, "grad_norm": 0.7268351101096144, "learning_rate": 1.9658994252128884e-05, "loss": 1.0898, "step": 500 }, { "epoch": 0.5257623554153522, "eval_loss": 1.091992974281311, "eval_runtime": 577.2656, "eval_samples_per_second": 23.312, "eval_steps_per_second": 0.184, "step": 500 }, { "epoch": 0.5310199789695058, "grad_norm": 0.7409815849447423, "learning_rate": 1.964296985376823e-05, "loss": 1.0785, "step": 505 }, { "epoch": 0.5362776025236593, "grad_norm": 0.7136236155581998, "learning_rate": 1.962658438036543e-05, "loss": 1.0983, "step": 510 }, { "epoch": 0.5415352260778128, "grad_norm": 0.7215624141555339, "learning_rate": 1.9609838445464406e-05, "loss": 1.1007, "step": 515 }, { "epoch": 0.5467928496319664, "grad_norm": 0.6979369948772214, "learning_rate": 1.959273267610633e-05, "loss": 1.0806, "step": 520 }, { "epoch": 0.5520504731861199, "grad_norm": 0.7255670203711404, "learning_rate": 1.9575267712806152e-05, "loss": 1.0753, "step": 525 }, { "epoch": 0.5573080967402734, "grad_norm": 0.6378781651024482, "learning_rate": 1.955744420952863e-05, "loss": 1.1001, "step": 530 }, { "epoch": 0.562565720294427, "grad_norm": 0.6440842622036477, "learning_rate": 1.9539262833663813e-05, "loss": 1.0867, "step": 535 }, { "epoch": 0.5678233438485805, "grad_norm": 0.650711077304966, "learning_rate": 1.9520724266002078e-05, "loss": 1.0861, "step": 540 }, { "epoch": 0.573080967402734, "grad_norm": 0.9412839294952584, "learning_rate": 1.9501829200708627e-05, "loss": 1.066, "step": 545 }, { "epoch": 0.5783385909568874, "grad_norm": 0.7997373349509072, "learning_rate": 1.948257834529749e-05, "loss": 1.0804, "step": 550 }, { "epoch": 0.583596214511041, "grad_norm": 0.6632970321629863, "learning_rate": 1.9462972420605045e-05, "loss": 1.0796, "step": 555 }, { "epoch": 0.5888538380651945, "grad_norm": 0.6907348547616222, "learning_rate": 1.9443012160763014e-05, "loss": 1.0914, "step": 560 }, { "epoch": 0.594111461619348, "grad_norm": 0.7602392699866063, "learning_rate": 1.9422698313170982e-05, "loss": 1.0782, "step": 565 }, { "epoch": 0.5993690851735016, "grad_norm": 0.7425506195668518, "learning_rate": 1.9402031638468407e-05, "loss": 1.0728, "step": 570 }, { "epoch": 0.6046267087276551, "grad_norm": 0.6057134136385478, "learning_rate": 1.9381012910506146e-05, "loss": 1.0944, "step": 575 }, { "epoch": 0.6098843322818086, "grad_norm": 0.611926050399381, "learning_rate": 1.935964291631746e-05, "loss": 1.0887, "step": 580 }, { "epoch": 0.6151419558359621, "grad_norm": 0.6044521957797464, "learning_rate": 1.933792245608857e-05, "loss": 1.0653, "step": 585 }, { "epoch": 0.6203995793901157, "grad_norm": 0.6160859598416025, "learning_rate": 1.9315852343128677e-05, "loss": 1.0697, "step": 590 }, { "epoch": 0.6256572029442692, "grad_norm": 0.6454926848454089, "learning_rate": 1.9293433403839506e-05, "loss": 1.0835, "step": 595 }, { "epoch": 0.6309148264984227, "grad_norm": 0.6271287719549755, "learning_rate": 1.9270666477684375e-05, "loss": 1.0749, "step": 600 }, { "epoch": 0.6309148264984227, "eval_loss": 1.0876203775405884, "eval_runtime": 619.1152, "eval_samples_per_second": 21.736, "eval_steps_per_second": 0.171, "step": 600 }, { "epoch": 0.6361724500525763, "grad_norm": 0.634393838535348, "learning_rate": 1.9247552417156758e-05, "loss": 1.0729, "step": 605 }, { "epoch": 0.6414300736067298, "grad_norm": 0.6594690945271786, "learning_rate": 1.9224092087748344e-05, "loss": 1.0827, "step": 610 }, { "epoch": 0.6466876971608833, "grad_norm": 0.611714575208264, "learning_rate": 1.920028636791667e-05, "loss": 1.0882, "step": 615 }, { "epoch": 0.6519453207150369, "grad_norm": 0.7463577820820205, "learning_rate": 1.9176136149052184e-05, "loss": 1.0756, "step": 620 }, { "epoch": 0.6572029442691903, "grad_norm": 0.5943822071057456, "learning_rate": 1.9151642335444894e-05, "loss": 1.0781, "step": 625 }, { "epoch": 0.6624605678233438, "grad_norm": 0.6478466639224281, "learning_rate": 1.9126805844250507e-05, "loss": 1.0799, "step": 630 }, { "epoch": 0.6677181913774973, "grad_norm": 1.125407499631879, "learning_rate": 1.910162760545607e-05, "loss": 1.0863, "step": 635 }, { "epoch": 0.6729758149316509, "grad_norm": 0.6317836803464292, "learning_rate": 1.9076108561845167e-05, "loss": 1.068, "step": 640 }, { "epoch": 0.6782334384858044, "grad_norm": 0.6782741352289255, "learning_rate": 1.90502496689626e-05, "loss": 1.0717, "step": 645 }, { "epoch": 0.6834910620399579, "grad_norm": 0.6549048073170591, "learning_rate": 1.902405189507862e-05, "loss": 1.0729, "step": 650 }, { "epoch": 0.6887486855941115, "grad_norm": 0.5944400668808439, "learning_rate": 1.899751622115267e-05, "loss": 1.073, "step": 655 }, { "epoch": 0.694006309148265, "grad_norm": 0.6344443790559094, "learning_rate": 1.8970643640796642e-05, "loss": 1.0765, "step": 660 }, { "epoch": 0.6992639327024185, "grad_norm": 0.6066328657447971, "learning_rate": 1.8943435160237693e-05, "loss": 1.068, "step": 665 }, { "epoch": 0.704521556256572, "grad_norm": 0.7935810543521484, "learning_rate": 1.8915891798280545e-05, "loss": 1.075, "step": 670 }, { "epoch": 0.7097791798107256, "grad_norm": 0.6311479883642119, "learning_rate": 1.8888014586269353e-05, "loss": 1.0605, "step": 675 }, { "epoch": 0.7150368033648791, "grad_norm": 0.6247754068444527, "learning_rate": 1.8859804568049083e-05, "loss": 1.0853, "step": 680 }, { "epoch": 0.7202944269190326, "grad_norm": 0.6133863303859032, "learning_rate": 1.8831262799926412e-05, "loss": 1.0751, "step": 685 }, { "epoch": 0.7255520504731862, "grad_norm": 0.6378281851358015, "learning_rate": 1.88023903506302e-05, "loss": 1.086, "step": 690 }, { "epoch": 0.7308096740273397, "grad_norm": 0.6695843196133265, "learning_rate": 1.8773188301271458e-05, "loss": 1.0655, "step": 695 }, { "epoch": 0.7360672975814931, "grad_norm": 0.6310578043108518, "learning_rate": 1.874365774530285e-05, "loss": 1.0847, "step": 700 }, { "epoch": 0.7360672975814931, "eval_loss": 1.083134412765503, "eval_runtime": 594.902, "eval_samples_per_second": 22.621, "eval_steps_per_second": 0.178, "step": 700 }, { "epoch": 0.7413249211356467, "grad_norm": 0.7538683907974313, "learning_rate": 1.8713799788477794e-05, "loss": 1.0691, "step": 705 }, { "epoch": 0.7465825446898002, "grad_norm": 0.706371524563473, "learning_rate": 1.8683615548809007e-05, "loss": 1.0654, "step": 710 }, { "epoch": 0.7518401682439537, "grad_norm": 0.7089836009644308, "learning_rate": 1.865310615652668e-05, "loss": 1.0732, "step": 715 }, { "epoch": 0.7570977917981072, "grad_norm": 0.6253449282146815, "learning_rate": 1.862227275403614e-05, "loss": 1.0595, "step": 720 }, { "epoch": 0.7623554153522608, "grad_norm": 0.6352792231235775, "learning_rate": 1.8591116495875065e-05, "loss": 1.0611, "step": 725 }, { "epoch": 0.7676130389064143, "grad_norm": 0.6559807547521417, "learning_rate": 1.8559638548670276e-05, "loss": 1.0772, "step": 730 }, { "epoch": 0.7728706624605678, "grad_norm": 0.660949169309788, "learning_rate": 1.8527840091094038e-05, "loss": 1.0723, "step": 735 }, { "epoch": 0.7781282860147214, "grad_norm": 0.6485292004090661, "learning_rate": 1.849572231381993e-05, "loss": 1.0756, "step": 740 }, { "epoch": 0.7833859095688749, "grad_norm": 0.5894518164357108, "learning_rate": 1.8463286419478256e-05, "loss": 1.0878, "step": 745 }, { "epoch": 0.7886435331230284, "grad_norm": 0.6373909243160687, "learning_rate": 1.843053362261102e-05, "loss": 1.0698, "step": 750 }, { "epoch": 0.7939011566771819, "grad_norm": 0.6247774742453552, "learning_rate": 1.8397465149626438e-05, "loss": 1.0689, "step": 755 }, { "epoch": 0.7991587802313355, "grad_norm": 0.6702489085237104, "learning_rate": 1.836408223875303e-05, "loss": 1.0878, "step": 760 }, { "epoch": 0.804416403785489, "grad_norm": 0.5901778445639561, "learning_rate": 1.8330386139993253e-05, "loss": 1.0615, "step": 765 }, { "epoch": 0.8096740273396424, "grad_norm": 0.5690160698641555, "learning_rate": 1.8296378115076683e-05, "loss": 1.0627, "step": 770 }, { "epoch": 0.814931650893796, "grad_norm": 0.7286612536078287, "learning_rate": 1.826205943741277e-05, "loss": 1.0599, "step": 775 }, { "epoch": 0.8201892744479495, "grad_norm": 0.6255138205467193, "learning_rate": 1.8227431392043188e-05, "loss": 1.0738, "step": 780 }, { "epoch": 0.825446898002103, "grad_norm": 0.6089376456915286, "learning_rate": 1.8192495275593667e-05, "loss": 1.0682, "step": 785 }, { "epoch": 0.8307045215562566, "grad_norm": 0.6155868150283563, "learning_rate": 1.8157252396225487e-05, "loss": 1.065, "step": 790 }, { "epoch": 0.8359621451104101, "grad_norm": 0.7289316735890606, "learning_rate": 1.812170407358647e-05, "loss": 1.0577, "step": 795 }, { "epoch": 0.8412197686645636, "grad_norm": 0.6194611530873854, "learning_rate": 1.8085851638761564e-05, "loss": 1.0749, "step": 800 }, { "epoch": 0.8412197686645636, "eval_loss": 1.0777511596679688, "eval_runtime": 578.5287, "eval_samples_per_second": 23.261, "eval_steps_per_second": 0.183, "step": 800 }, { "epoch": 0.8464773922187171, "grad_norm": 0.5897179737564566, "learning_rate": 1.8049696434223018e-05, "loss": 1.064, "step": 805 }, { "epoch": 0.8517350157728707, "grad_norm": 0.6249138645283078, "learning_rate": 1.801323981378011e-05, "loss": 1.0689, "step": 810 }, { "epoch": 0.8569926393270242, "grad_norm": 0.6094536651967496, "learning_rate": 1.797648314252844e-05, "loss": 1.0547, "step": 815 }, { "epoch": 0.8622502628811777, "grad_norm": 0.6427649229281082, "learning_rate": 1.7939427796798835e-05, "loss": 1.0709, "step": 820 }, { "epoch": 0.8675078864353313, "grad_norm": 0.625645109760211, "learning_rate": 1.790207516410579e-05, "loss": 1.0711, "step": 825 }, { "epoch": 0.8727655099894848, "grad_norm": 0.6900102876237034, "learning_rate": 1.7864426643095537e-05, "loss": 1.0551, "step": 830 }, { "epoch": 0.8780231335436383, "grad_norm": 0.6633694160119932, "learning_rate": 1.7826483643493664e-05, "loss": 1.0647, "step": 835 }, { "epoch": 0.8832807570977917, "grad_norm": 0.6706740933862908, "learning_rate": 1.7788247586052324e-05, "loss": 1.068, "step": 840 }, { "epoch": 0.8885383806519453, "grad_norm": 0.6147588746912578, "learning_rate": 1.774971990249703e-05, "loss": 1.0675, "step": 845 }, { "epoch": 0.8937960042060988, "grad_norm": 0.650347913047383, "learning_rate": 1.7710902035473075e-05, "loss": 1.0563, "step": 850 }, { "epoch": 0.8990536277602523, "grad_norm": 0.5896501069060196, "learning_rate": 1.7671795438491476e-05, "loss": 1.0549, "step": 855 }, { "epoch": 0.9043112513144059, "grad_norm": 0.5865757288759952, "learning_rate": 1.763240157587457e-05, "loss": 1.074, "step": 860 }, { "epoch": 0.9095688748685594, "grad_norm": 0.6448523425472431, "learning_rate": 1.759272192270118e-05, "loss": 1.0406, "step": 865 }, { "epoch": 0.9148264984227129, "grad_norm": 0.628930087369231, "learning_rate": 1.7552757964751375e-05, "loss": 1.0604, "step": 870 }, { "epoch": 0.9200841219768665, "grad_norm": 0.5573844980993936, "learning_rate": 1.751251119845085e-05, "loss": 1.0712, "step": 875 }, { "epoch": 0.92534174553102, "grad_norm": 0.5760631844651097, "learning_rate": 1.7471983130814872e-05, "loss": 1.0677, "step": 880 }, { "epoch": 0.9305993690851735, "grad_norm": 0.6608474625527273, "learning_rate": 1.7431175279391864e-05, "loss": 1.0564, "step": 885 }, { "epoch": 0.935856992639327, "grad_norm": 0.6158122817932856, "learning_rate": 1.7390089172206594e-05, "loss": 1.0698, "step": 890 }, { "epoch": 0.9411146161934806, "grad_norm": 0.6348226976928315, "learning_rate": 1.7348726347702922e-05, "loss": 1.0541, "step": 895 }, { "epoch": 0.9463722397476341, "grad_norm": 0.5893951119046926, "learning_rate": 1.730708835468624e-05, "loss": 1.055, "step": 900 }, { "epoch": 0.9463722397476341, "eval_loss": 1.0719902515411377, "eval_runtime": 554.5404, "eval_samples_per_second": 24.267, "eval_steps_per_second": 0.191, "step": 900 }, { "epoch": 0.9516298633017876, "grad_norm": 0.6398319094636862, "learning_rate": 1.7265176752265437e-05, "loss": 1.0606, "step": 905 }, { "epoch": 0.9568874868559412, "grad_norm": 0.6048116978972946, "learning_rate": 1.7222993109794547e-05, "loss": 1.0602, "step": 910 }, { "epoch": 0.9621451104100947, "grad_norm": 0.5840246341713026, "learning_rate": 1.7180539006813973e-05, "loss": 1.0479, "step": 915 }, { "epoch": 0.9674027339642481, "grad_norm": 0.5778229669814231, "learning_rate": 1.7137816032991338e-05, "loss": 1.0552, "step": 920 }, { "epoch": 0.9726603575184016, "grad_norm": 0.599559903007225, "learning_rate": 1.7094825788061984e-05, "loss": 1.0602, "step": 925 }, { "epoch": 0.9779179810725552, "grad_norm": 0.6085935007813816, "learning_rate": 1.7051569881769033e-05, "loss": 1.0702, "step": 930 }, { "epoch": 0.9831756046267087, "grad_norm": 0.6210127216958851, "learning_rate": 1.7008049933803153e-05, "loss": 1.0562, "step": 935 }, { "epoch": 0.9884332281808622, "grad_norm": 0.5660970609343743, "learning_rate": 1.696426757374187e-05, "loss": 1.0488, "step": 940 }, { "epoch": 0.9936908517350158, "grad_norm": 0.6052820312725565, "learning_rate": 1.6920224440988578e-05, "loss": 1.0579, "step": 945 }, { "epoch": 0.9989484752891693, "grad_norm": 0.6336659141670167, "learning_rate": 1.6875922184711152e-05, "loss": 1.0391, "step": 950 }, { "epoch": 1.0042060988433228, "grad_norm": 0.8649311407022923, "learning_rate": 1.6831362463780173e-05, "loss": 0.9427, "step": 955 }, { "epoch": 1.0094637223974763, "grad_norm": 0.7906840430230622, "learning_rate": 1.6786546946706826e-05, "loss": 0.9093, "step": 960 }, { "epoch": 1.0147213459516298, "grad_norm": 0.7615451637281871, "learning_rate": 1.6741477311580442e-05, "loss": 0.9129, "step": 965 }, { "epoch": 1.0199789695057835, "grad_norm": 0.81395189037578, "learning_rate": 1.669615524600562e-05, "loss": 0.9116, "step": 970 }, { "epoch": 1.025236593059937, "grad_norm": 0.6675565867389684, "learning_rate": 1.6650582447039087e-05, "loss": 0.897, "step": 975 }, { "epoch": 1.0304942166140905, "grad_norm": 0.6558457233521835, "learning_rate": 1.6604760621126104e-05, "loss": 0.9059, "step": 980 }, { "epoch": 1.035751840168244, "grad_norm": 0.791116301575079, "learning_rate": 1.655869148403661e-05, "loss": 0.9123, "step": 985 }, { "epoch": 1.0410094637223974, "grad_norm": 0.6281691549427542, "learning_rate": 1.6512376760800943e-05, "loss": 0.9165, "step": 990 }, { "epoch": 1.046267087276551, "grad_norm": 0.722210053446233, "learning_rate": 1.646581818564528e-05, "loss": 0.8885, "step": 995 }, { "epoch": 1.0515247108307044, "grad_norm": 0.6566766982009167, "learning_rate": 1.641901750192666e-05, "loss": 0.9184, "step": 1000 }, { "epoch": 1.0515247108307044, "eval_loss": 1.0817060470581055, "eval_runtime": 548.8481, "eval_samples_per_second": 24.519, "eval_steps_per_second": 0.193, "step": 1000 }, { "epoch": 1.0567823343848581, "grad_norm": 0.7215682123240776, "learning_rate": 1.6371976462067744e-05, "loss": 0.9048, "step": 1005 }, { "epoch": 1.0620399579390116, "grad_norm": 0.5754913559382355, "learning_rate": 1.6324696827491178e-05, "loss": 0.9062, "step": 1010 }, { "epoch": 1.0672975814931651, "grad_norm": 0.7713724891213452, "learning_rate": 1.6277180368553637e-05, "loss": 0.9003, "step": 1015 }, { "epoch": 1.0725552050473186, "grad_norm": 0.6705202466831766, "learning_rate": 1.622942886447953e-05, "loss": 0.9076, "step": 1020 }, { "epoch": 1.077812828601472, "grad_norm": 0.7709385226269342, "learning_rate": 1.6181444103294405e-05, "loss": 0.9016, "step": 1025 }, { "epoch": 1.0830704521556256, "grad_norm": 0.6618094790250554, "learning_rate": 1.613322788175796e-05, "loss": 0.9087, "step": 1030 }, { "epoch": 1.088328075709779, "grad_norm": 0.7111642531915952, "learning_rate": 1.608478200529679e-05, "loss": 0.8993, "step": 1035 }, { "epoch": 1.0935856992639328, "grad_norm": 0.9967278615618546, "learning_rate": 1.6036108287936774e-05, "loss": 0.9053, "step": 1040 }, { "epoch": 1.0988433228180863, "grad_norm": 0.7211016358920939, "learning_rate": 1.598720855223516e-05, "loss": 0.8967, "step": 1045 }, { "epoch": 1.1041009463722398, "grad_norm": 0.681965857428634, "learning_rate": 1.5938084629212308e-05, "loss": 0.9069, "step": 1050 }, { "epoch": 1.1093585699263933, "grad_norm": 0.7296745556202008, "learning_rate": 1.5888738358283125e-05, "loss": 0.8918, "step": 1055 }, { "epoch": 1.1146161934805467, "grad_norm": 0.6472282910374098, "learning_rate": 1.5839171587188213e-05, "loss": 0.8953, "step": 1060 }, { "epoch": 1.1198738170347002, "grad_norm": 0.6420578981972046, "learning_rate": 1.5789386171924656e-05, "loss": 0.9185, "step": 1065 }, { "epoch": 1.125131440588854, "grad_norm": 0.6592365438130466, "learning_rate": 1.5739383976676538e-05, "loss": 0.9338, "step": 1070 }, { "epoch": 1.1303890641430074, "grad_norm": 0.6668713420054354, "learning_rate": 1.5689166873745133e-05, "loss": 0.9071, "step": 1075 }, { "epoch": 1.135646687697161, "grad_norm": 0.6314319656757978, "learning_rate": 1.5638736743478807e-05, "loss": 0.9094, "step": 1080 }, { "epoch": 1.1409043112513144, "grad_norm": 0.6557318538936868, "learning_rate": 1.5588095474202597e-05, "loss": 0.9056, "step": 1085 }, { "epoch": 1.146161934805468, "grad_norm": 0.6988942180423913, "learning_rate": 1.55372449621475e-05, "loss": 0.9093, "step": 1090 }, { "epoch": 1.1514195583596214, "grad_norm": 0.6288925365676942, "learning_rate": 1.54861871113795e-05, "loss": 0.8931, "step": 1095 }, { "epoch": 1.1566771819137749, "grad_norm": 0.6060978130757313, "learning_rate": 1.5434923833728238e-05, "loss": 0.8955, "step": 1100 }, { "epoch": 1.1566771819137749, "eval_loss": 1.0778801441192627, "eval_runtime": 560.7689, "eval_samples_per_second": 23.997, "eval_steps_per_second": 0.189, "step": 1100 }, { "epoch": 1.1619348054679284, "grad_norm": 0.636138975576772, "learning_rate": 1.538345704871544e-05, "loss": 0.9164, "step": 1105 }, { "epoch": 1.167192429022082, "grad_norm": 0.7813214708227075, "learning_rate": 1.533178868348304e-05, "loss": 0.9123, "step": 1110 }, { "epoch": 1.1724500525762356, "grad_norm": 0.6454922302300423, "learning_rate": 1.5279920672721014e-05, "loss": 0.9096, "step": 1115 }, { "epoch": 1.177707676130389, "grad_norm": 0.6684532969652581, "learning_rate": 1.522785495859495e-05, "loss": 0.913, "step": 1120 }, { "epoch": 1.1829652996845426, "grad_norm": 0.659104192691736, "learning_rate": 1.517559349067331e-05, "loss": 0.9127, "step": 1125 }, { "epoch": 1.188222923238696, "grad_norm": 0.6327096229416864, "learning_rate": 1.5123138225854437e-05, "loss": 0.9179, "step": 1130 }, { "epoch": 1.1934805467928495, "grad_norm": 0.6821427010599724, "learning_rate": 1.507049112829328e-05, "loss": 0.916, "step": 1135 }, { "epoch": 1.1987381703470033, "grad_norm": 0.6383663706263557, "learning_rate": 1.5017654169327847e-05, "loss": 0.9205, "step": 1140 }, { "epoch": 1.2039957939011567, "grad_norm": 0.6642751432840621, "learning_rate": 1.4964629327405385e-05, "loss": 0.9064, "step": 1145 }, { "epoch": 1.2092534174553102, "grad_norm": 0.6370926988086576, "learning_rate": 1.4911418588008302e-05, "loss": 0.9009, "step": 1150 }, { "epoch": 1.2145110410094637, "grad_norm": 0.6726809074089126, "learning_rate": 1.4858023943579831e-05, "loss": 0.9177, "step": 1155 }, { "epoch": 1.2197686645636172, "grad_norm": 0.6624168311883211, "learning_rate": 1.4804447393449408e-05, "loss": 0.9008, "step": 1160 }, { "epoch": 1.2250262881177707, "grad_norm": 0.6736191492385858, "learning_rate": 1.4750690943757815e-05, "loss": 0.9177, "step": 1165 }, { "epoch": 1.2302839116719242, "grad_norm": 0.6626164162916314, "learning_rate": 1.469675660738206e-05, "loss": 0.9125, "step": 1170 }, { "epoch": 1.235541535226078, "grad_norm": 0.6561095205909978, "learning_rate": 1.4642646403860017e-05, "loss": 0.9224, "step": 1175 }, { "epoch": 1.2407991587802314, "grad_norm": 0.6404857197573285, "learning_rate": 1.4588362359314787e-05, "loss": 0.9147, "step": 1180 }, { "epoch": 1.2460567823343849, "grad_norm": 0.6247458161762777, "learning_rate": 1.453390650637884e-05, "loss": 0.9055, "step": 1185 }, { "epoch": 1.2513144058885384, "grad_norm": 0.6205798650094878, "learning_rate": 1.4479280884117919e-05, "loss": 0.9098, "step": 1190 }, { "epoch": 1.2565720294426919, "grad_norm": 0.6171085702613818, "learning_rate": 1.4424487537954658e-05, "loss": 0.9086, "step": 1195 }, { "epoch": 1.2618296529968454, "grad_norm": 0.6817002284070426, "learning_rate": 1.4369528519592016e-05, "loss": 0.914, "step": 1200 }, { "epoch": 1.2618296529968454, "eval_loss": 1.0758436918258667, "eval_runtime": 554.9555, "eval_samples_per_second": 24.249, "eval_steps_per_second": 0.191, "step": 1200 }, { "epoch": 1.267087276550999, "grad_norm": 0.6556393089241064, "learning_rate": 1.4314405886936444e-05, "loss": 0.907, "step": 1205 }, { "epoch": 1.2723449001051526, "grad_norm": 0.6564247019338768, "learning_rate": 1.425912170402083e-05, "loss": 0.8947, "step": 1210 }, { "epoch": 1.277602523659306, "grad_norm": 0.6909745550376631, "learning_rate": 1.4203678040927211e-05, "loss": 0.9015, "step": 1215 }, { "epoch": 1.2828601472134595, "grad_norm": 0.6649938010634878, "learning_rate": 1.414807697370926e-05, "loss": 0.9147, "step": 1220 }, { "epoch": 1.288117770767613, "grad_norm": 0.6827602346821062, "learning_rate": 1.4092320584314552e-05, "loss": 0.9223, "step": 1225 }, { "epoch": 1.2933753943217665, "grad_norm": 0.6891969548538285, "learning_rate": 1.4036410960506601e-05, "loss": 0.909, "step": 1230 }, { "epoch": 1.29863301787592, "grad_norm": 0.7488612526253159, "learning_rate": 1.3980350195786691e-05, "loss": 0.9063, "step": 1235 }, { "epoch": 1.3038906414300735, "grad_norm": 0.8765777386899024, "learning_rate": 1.3924140389315488e-05, "loss": 0.8949, "step": 1240 }, { "epoch": 1.3091482649842272, "grad_norm": 0.6756135072464465, "learning_rate": 1.3867783645834428e-05, "loss": 0.9173, "step": 1245 }, { "epoch": 1.3144058885383807, "grad_norm": 0.6511543641668399, "learning_rate": 1.3811282075586916e-05, "loss": 0.9075, "step": 1250 }, { "epoch": 1.3196635120925342, "grad_norm": 0.6171780710166301, "learning_rate": 1.3754637794239303e-05, "loss": 0.8977, "step": 1255 }, { "epoch": 1.3249211356466877, "grad_norm": 0.658721220404947, "learning_rate": 1.3697852922801669e-05, "loss": 0.9072, "step": 1260 }, { "epoch": 1.3301787592008412, "grad_norm": 0.6417444192429201, "learning_rate": 1.3640929587548403e-05, "loss": 0.9091, "step": 1265 }, { "epoch": 1.3354363827549949, "grad_norm": 0.6187189724748463, "learning_rate": 1.3583869919938597e-05, "loss": 0.9129, "step": 1270 }, { "epoch": 1.3406940063091484, "grad_norm": 0.5843959371785157, "learning_rate": 1.3526676056536205e-05, "loss": 0.9092, "step": 1275 }, { "epoch": 1.3459516298633019, "grad_norm": 0.6932618289744372, "learning_rate": 1.3469350138930073e-05, "loss": 0.9079, "step": 1280 }, { "epoch": 1.3512092534174553, "grad_norm": 0.6598615985676897, "learning_rate": 1.3411894313653727e-05, "loss": 0.8944, "step": 1285 }, { "epoch": 1.3564668769716088, "grad_norm": 0.6427748827555393, "learning_rate": 1.3354310732105014e-05, "loss": 0.898, "step": 1290 }, { "epoch": 1.3617245005257623, "grad_norm": 0.6121349209877303, "learning_rate": 1.3296601550465525e-05, "loss": 0.909, "step": 1295 }, { "epoch": 1.3669821240799158, "grad_norm": 0.6575524447093695, "learning_rate": 1.3238768929619874e-05, "loss": 0.9098, "step": 1300 }, { "epoch": 1.3669821240799158, "eval_loss": 1.069818139076233, "eval_runtime": 559.1797, "eval_samples_per_second": 24.066, "eval_steps_per_second": 0.19, "step": 1300 }, { "epoch": 1.3722397476340693, "grad_norm": 0.6531825341664897, "learning_rate": 1.3180815035074786e-05, "loss": 0.9171, "step": 1305 }, { "epoch": 1.3774973711882228, "grad_norm": 0.6882987706313063, "learning_rate": 1.3122742036877994e-05, "loss": 0.8888, "step": 1310 }, { "epoch": 1.3827549947423765, "grad_norm": 0.724082633852385, "learning_rate": 1.3064552109537e-05, "loss": 0.896, "step": 1315 }, { "epoch": 1.38801261829653, "grad_norm": 0.6895669186673943, "learning_rate": 1.3006247431937644e-05, "loss": 0.925, "step": 1320 }, { "epoch": 1.3932702418506835, "grad_norm": 0.6718431536804129, "learning_rate": 1.2947830187262514e-05, "loss": 0.9099, "step": 1325 }, { "epoch": 1.398527865404837, "grad_norm": 0.688445352407702, "learning_rate": 1.2889302562909214e-05, "loss": 0.8949, "step": 1330 }, { "epoch": 1.4037854889589905, "grad_norm": 0.6016293866381901, "learning_rate": 1.2830666750408434e-05, "loss": 0.9015, "step": 1335 }, { "epoch": 1.4090431125131442, "grad_norm": 0.6182893633299666, "learning_rate": 1.2771924945341906e-05, "loss": 0.9075, "step": 1340 }, { "epoch": 1.4143007360672977, "grad_norm": 0.6593893582600123, "learning_rate": 1.2713079347260198e-05, "loss": 0.8963, "step": 1345 }, { "epoch": 1.4195583596214512, "grad_norm": 0.6688143172592789, "learning_rate": 1.2654132159600327e-05, "loss": 0.9021, "step": 1350 }, { "epoch": 1.4248159831756047, "grad_norm": 0.6250269029897194, "learning_rate": 1.2595085589603281e-05, "loss": 0.9001, "step": 1355 }, { "epoch": 1.4300736067297581, "grad_norm": 0.6184329559921266, "learning_rate": 1.2535941848231352e-05, "loss": 0.8931, "step": 1360 }, { "epoch": 1.4353312302839116, "grad_norm": 0.6598155701237914, "learning_rate": 1.2476703150085356e-05, "loss": 0.9046, "step": 1365 }, { "epoch": 1.4405888538380651, "grad_norm": 0.6728059285538895, "learning_rate": 1.2417371713321713e-05, "loss": 0.9081, "step": 1370 }, { "epoch": 1.4458464773922186, "grad_norm": 0.6795053004000011, "learning_rate": 1.2357949759569372e-05, "loss": 0.8935, "step": 1375 }, { "epoch": 1.4511041009463723, "grad_norm": 0.6370835079324721, "learning_rate": 1.2298439513846634e-05, "loss": 0.9134, "step": 1380 }, { "epoch": 1.4563617245005258, "grad_norm": 0.6511674325575209, "learning_rate": 1.2238843204477855e-05, "loss": 0.9025, "step": 1385 }, { "epoch": 1.4616193480546793, "grad_norm": 0.6486276822993603, "learning_rate": 1.2179163063009974e-05, "loss": 0.9084, "step": 1390 }, { "epoch": 1.4668769716088328, "grad_norm": 0.6375900541444521, "learning_rate": 1.2119401324128976e-05, "loss": 0.892, "step": 1395 }, { "epoch": 1.4721345951629863, "grad_norm": 0.6636437536958206, "learning_rate": 1.2059560225576212e-05, "loss": 0.9126, "step": 1400 }, { "epoch": 1.4721345951629863, "eval_loss": 1.066650629043579, "eval_runtime": 578.6632, "eval_samples_per_second": 23.255, "eval_steps_per_second": 0.183, "step": 1400 }, { "epoch": 1.4773922187171398, "grad_norm": 0.6880602268392096, "learning_rate": 1.1999642008064612e-05, "loss": 0.9133, "step": 1405 }, { "epoch": 1.4826498422712935, "grad_norm": 0.6439745800900593, "learning_rate": 1.1939648915194766e-05, "loss": 0.8956, "step": 1410 }, { "epoch": 1.487907465825447, "grad_norm": 0.6333947925789535, "learning_rate": 1.1879583193370934e-05, "loss": 0.8967, "step": 1415 }, { "epoch": 1.4931650893796005, "grad_norm": 0.6887095313857406, "learning_rate": 1.1819447091716918e-05, "loss": 0.8953, "step": 1420 }, { "epoch": 1.498422712933754, "grad_norm": 0.8314454423988585, "learning_rate": 1.1759242861991855e-05, "loss": 0.9061, "step": 1425 }, { "epoch": 1.5036803364879074, "grad_norm": 0.6207340757493971, "learning_rate": 1.1698972758505891e-05, "loss": 0.884, "step": 1430 }, { "epoch": 1.508937960042061, "grad_norm": 0.6356005817235517, "learning_rate": 1.1638639038035771e-05, "loss": 0.9056, "step": 1435 }, { "epoch": 1.5141955835962144, "grad_norm": 0.6341731273814719, "learning_rate": 1.1578243959740345e-05, "loss": 0.8926, "step": 1440 }, { "epoch": 1.519453207150368, "grad_norm": 0.6524260051325438, "learning_rate": 1.1517789785075965e-05, "loss": 0.8925, "step": 1445 }, { "epoch": 1.5247108307045214, "grad_norm": 0.6390976768866661, "learning_rate": 1.1457278777711816e-05, "loss": 0.896, "step": 1450 }, { "epoch": 1.5299684542586751, "grad_norm": 0.672745789784435, "learning_rate": 1.139671320344514e-05, "loss": 0.8919, "step": 1455 }, { "epoch": 1.5352260778128286, "grad_norm": 0.6849640495250097, "learning_rate": 1.1336095330116406e-05, "loss": 0.8908, "step": 1460 }, { "epoch": 1.540483701366982, "grad_norm": 0.6909452334309092, "learning_rate": 1.127542742752439e-05, "loss": 0.901, "step": 1465 }, { "epoch": 1.5457413249211358, "grad_norm": 0.6514347502639167, "learning_rate": 1.1214711767341184e-05, "loss": 0.8886, "step": 1470 }, { "epoch": 1.5509989484752893, "grad_norm": 0.7670979545467012, "learning_rate": 1.1153950623027127e-05, "loss": 0.8915, "step": 1475 }, { "epoch": 1.5562565720294428, "grad_norm": 0.7094429002966973, "learning_rate": 1.1093146269745694e-05, "loss": 0.8986, "step": 1480 }, { "epoch": 1.5615141955835963, "grad_norm": 0.7040092519773771, "learning_rate": 1.1032300984278286e-05, "loss": 0.8995, "step": 1485 }, { "epoch": 1.5667718191377498, "grad_norm": 0.6717747776159033, "learning_rate": 1.0971417044938984e-05, "loss": 0.8894, "step": 1490 }, { "epoch": 1.5720294426919033, "grad_norm": 0.6111734491076107, "learning_rate": 1.091049673148924e-05, "loss": 0.8903, "step": 1495 }, { "epoch": 1.5772870662460567, "grad_norm": 0.6339144886316356, "learning_rate": 1.0849542325052514e-05, "loss": 0.9032, "step": 1500 }, { "epoch": 1.5772870662460567, "eval_loss": 1.060400366783142, "eval_runtime": 553.3344, "eval_samples_per_second": 24.32, "eval_steps_per_second": 0.192, "step": 1500 }, { "epoch": 1.5825446898002102, "grad_norm": 0.6119889525138412, "learning_rate": 1.0788556108028854e-05, "loss": 0.9059, "step": 1505 }, { "epoch": 1.5878023133543637, "grad_norm": 0.6610719745391888, "learning_rate": 1.072754036400944e-05, "loss": 0.8845, "step": 1510 }, { "epoch": 1.5930599369085172, "grad_norm": 0.6334246363490683, "learning_rate": 1.0666497377691067e-05, "loss": 0.909, "step": 1515 }, { "epoch": 1.598317560462671, "grad_norm": 0.6600607162051635, "learning_rate": 1.0605429434790607e-05, "loss": 0.9101, "step": 1520 }, { "epoch": 1.6035751840168244, "grad_norm": 0.6624807422048473, "learning_rate": 1.0544338821959407e-05, "loss": 0.8918, "step": 1525 }, { "epoch": 1.608832807570978, "grad_norm": 0.6540415860179337, "learning_rate": 1.0483227826697686e-05, "loss": 0.902, "step": 1530 }, { "epoch": 1.6140904311251314, "grad_norm": 0.6339684794581751, "learning_rate": 1.0422098737268862e-05, "loss": 0.9047, "step": 1535 }, { "epoch": 1.619348054679285, "grad_norm": 0.63411282308358, "learning_rate": 1.0360953842613886e-05, "loss": 0.9106, "step": 1540 }, { "epoch": 1.6246056782334386, "grad_norm": 0.6246624939138397, "learning_rate": 1.0299795432265516e-05, "loss": 0.8941, "step": 1545 }, { "epoch": 1.629863301787592, "grad_norm": 0.6422075365217625, "learning_rate": 1.0238625796262604e-05, "loss": 0.8969, "step": 1550 }, { "epoch": 1.6351209253417456, "grad_norm": 0.641718675847965, "learning_rate": 1.0177447225064334e-05, "loss": 0.8932, "step": 1555 }, { "epoch": 1.640378548895899, "grad_norm": 0.6996379461819543, "learning_rate": 1.0116262009464475e-05, "loss": 0.8988, "step": 1560 }, { "epoch": 1.6456361724500526, "grad_norm": 0.6496660294162664, "learning_rate": 1.0055072440505576e-05, "loss": 0.8857, "step": 1565 }, { "epoch": 1.650893796004206, "grad_norm": 0.6913136358312865, "learning_rate": 9.993880809393203e-06, "loss": 0.8953, "step": 1570 }, { "epoch": 1.6561514195583595, "grad_norm": 0.6323428927883549, "learning_rate": 9.932689407410136e-06, "loss": 0.894, "step": 1575 }, { "epoch": 1.661409043112513, "grad_norm": 0.7165826659774039, "learning_rate": 9.871500525830581e-06, "loss": 0.8946, "step": 1580 }, { "epoch": 1.6666666666666665, "grad_norm": 0.6630355223364007, "learning_rate": 9.810316455834359e-06, "loss": 0.8907, "step": 1585 }, { "epoch": 1.6719242902208202, "grad_norm": 0.6096362135364939, "learning_rate": 9.749139488421133e-06, "loss": 0.893, "step": 1590 }, { "epoch": 1.6771819137749737, "grad_norm": 0.6414609478289887, "learning_rate": 9.687971914324607e-06, "loss": 0.897, "step": 1595 }, { "epoch": 1.6824395373291272, "grad_norm": 0.6909628111495161, "learning_rate": 9.626816023926771e-06, "loss": 0.8882, "step": 1600 }, { "epoch": 1.6824395373291272, "eval_loss": 1.054638385772705, "eval_runtime": 554.7908, "eval_samples_per_second": 24.256, "eval_steps_per_second": 0.191, "step": 1600 }, { "epoch": 1.687697160883281, "grad_norm": 0.658967287448874, "learning_rate": 9.565674107172109e-06, "loss": 0.8963, "step": 1605 }, { "epoch": 1.6929547844374344, "grad_norm": 0.671113099618244, "learning_rate": 9.504548453481875e-06, "loss": 0.9006, "step": 1610 }, { "epoch": 1.698212407991588, "grad_norm": 0.6356681511467472, "learning_rate": 9.443441351668375e-06, "loss": 0.8855, "step": 1615 }, { "epoch": 1.7034700315457414, "grad_norm": 0.6733155642148883, "learning_rate": 9.382355089849235e-06, "loss": 0.8918, "step": 1620 }, { "epoch": 1.7087276550998949, "grad_norm": 0.6923042640634246, "learning_rate": 9.321291955361756e-06, "loss": 0.8933, "step": 1625 }, { "epoch": 1.7139852786540484, "grad_norm": 0.6424747471753014, "learning_rate": 9.260254234677235e-06, "loss": 0.8816, "step": 1630 }, { "epoch": 1.7192429022082019, "grad_norm": 0.6144029954554266, "learning_rate": 9.199244213315377e-06, "loss": 0.8905, "step": 1635 }, { "epoch": 1.7245005257623554, "grad_norm": 0.6170077707358106, "learning_rate": 9.138264175758693e-06, "loss": 0.8863, "step": 1640 }, { "epoch": 1.7297581493165088, "grad_norm": 0.6197301026220858, "learning_rate": 9.07731640536698e-06, "loss": 0.8796, "step": 1645 }, { "epoch": 1.7350157728706623, "grad_norm": 0.6643068163348533, "learning_rate": 9.016403184291805e-06, "loss": 0.8908, "step": 1650 }, { "epoch": 1.7402733964248158, "grad_norm": 0.5938702426426252, "learning_rate": 8.955526793391049e-06, "loss": 0.8902, "step": 1655 }, { "epoch": 1.7455310199789695, "grad_norm": 0.6474013520993763, "learning_rate": 8.894689512143528e-06, "loss": 0.8862, "step": 1660 }, { "epoch": 1.750788643533123, "grad_norm": 0.6350300886593221, "learning_rate": 8.833893618563604e-06, "loss": 0.8847, "step": 1665 }, { "epoch": 1.7560462670872765, "grad_norm": 0.6377181777254709, "learning_rate": 8.773141389115914e-06, "loss": 0.8865, "step": 1670 }, { "epoch": 1.7613038906414302, "grad_norm": 0.6115861001350186, "learning_rate": 8.712435098630116e-06, "loss": 0.8863, "step": 1675 }, { "epoch": 1.7665615141955837, "grad_norm": 0.6631610912721477, "learning_rate": 8.651777020215713e-06, "loss": 0.8959, "step": 1680 }, { "epoch": 1.7718191377497372, "grad_norm": 0.6241016927327407, "learning_rate": 8.591169425176931e-06, "loss": 0.8726, "step": 1685 }, { "epoch": 1.7770767613038907, "grad_norm": 0.6207578864949994, "learning_rate": 8.53061458292768e-06, "loss": 0.8892, "step": 1690 }, { "epoch": 1.7823343848580442, "grad_norm": 0.6848519519116634, "learning_rate": 8.470114760906583e-06, "loss": 0.8943, "step": 1695 }, { "epoch": 1.7875920084121977, "grad_norm": 0.6571666376626863, "learning_rate": 8.409672224492051e-06, "loss": 0.8847, "step": 1700 }, { "epoch": 1.7875920084121977, "eval_loss": 1.049035906791687, "eval_runtime": 554.1715, "eval_samples_per_second": 24.283, "eval_steps_per_second": 0.191, "step": 1700 }, { "epoch": 1.7928496319663512, "grad_norm": 0.645616472902103, "learning_rate": 8.349289236917482e-06, "loss": 0.8816, "step": 1705 }, { "epoch": 1.7981072555205047, "grad_norm": 0.6574673506951342, "learning_rate": 8.28896805918649e-06, "loss": 0.8648, "step": 1710 }, { "epoch": 1.8033648790746581, "grad_norm": 0.6469048695832662, "learning_rate": 8.228710949988283e-06, "loss": 0.8844, "step": 1715 }, { "epoch": 1.8086225026288116, "grad_norm": 0.6503850752063266, "learning_rate": 8.168520165613035e-06, "loss": 0.8927, "step": 1720 }, { "epoch": 1.8138801261829653, "grad_norm": 0.6478121209226875, "learning_rate": 8.108397959867445e-06, "loss": 0.8973, "step": 1725 }, { "epoch": 1.8191377497371188, "grad_norm": 0.6456428948521569, "learning_rate": 8.04834658399032e-06, "loss": 0.8829, "step": 1730 }, { "epoch": 1.8243953732912723, "grad_norm": 0.6790517960706193, "learning_rate": 7.988368286568287e-06, "loss": 0.8756, "step": 1735 }, { "epoch": 1.8296529968454258, "grad_norm": 0.7206882773594423, "learning_rate": 7.928465313451603e-06, "loss": 0.9051, "step": 1740 }, { "epoch": 1.8349106203995795, "grad_norm": 0.6598808403648849, "learning_rate": 7.868639907670042e-06, "loss": 0.9019, "step": 1745 }, { "epoch": 1.840168243953733, "grad_norm": 0.8136122353035425, "learning_rate": 7.808894309348925e-06, "loss": 0.8814, "step": 1750 }, { "epoch": 1.8454258675078865, "grad_norm": 0.6383354015679575, "learning_rate": 7.749230755625228e-06, "loss": 0.8775, "step": 1755 }, { "epoch": 1.85068349106204, "grad_norm": 0.6270079438127367, "learning_rate": 7.689651480563824e-06, "loss": 0.8959, "step": 1760 }, { "epoch": 1.8559411146161935, "grad_norm": 0.6235081226194247, "learning_rate": 7.630158715073813e-06, "loss": 0.8871, "step": 1765 }, { "epoch": 1.861198738170347, "grad_norm": 0.6526009528156013, "learning_rate": 7.570754686825004e-06, "loss": 0.8867, "step": 1770 }, { "epoch": 1.8664563617245005, "grad_norm": 0.6321251834720393, "learning_rate": 7.511441620164499e-06, "loss": 0.9111, "step": 1775 }, { "epoch": 1.871713985278654, "grad_norm": 0.6579757228675541, "learning_rate": 7.452221736033387e-06, "loss": 0.8758, "step": 1780 }, { "epoch": 1.8769716088328074, "grad_norm": 0.6288476910531294, "learning_rate": 7.393097251883609e-06, "loss": 0.8848, "step": 1785 }, { "epoch": 1.882229232386961, "grad_norm": 0.6671896908639643, "learning_rate": 7.334070381594904e-06, "loss": 0.8879, "step": 1790 }, { "epoch": 1.8874868559411146, "grad_norm": 0.6375714540658346, "learning_rate": 7.275143335391927e-06, "loss": 0.8871, "step": 1795 }, { "epoch": 1.8927444794952681, "grad_norm": 0.6461378473926269, "learning_rate": 7.21631831976147e-06, "loss": 0.8831, "step": 1800 }, { "epoch": 1.8927444794952681, "eval_loss": 1.0454537868499756, "eval_runtime": 554.7002, "eval_samples_per_second": 24.26, "eval_steps_per_second": 0.191, "step": 1800 }, { "epoch": 1.8980021030494216, "grad_norm": 0.6266360175385085, "learning_rate": 7.157597537369866e-06, "loss": 0.8836, "step": 1805 }, { "epoch": 1.9032597266035753, "grad_norm": 0.7106762429735706, "learning_rate": 7.098983186980495e-06, "loss": 0.8894, "step": 1810 }, { "epoch": 1.9085173501577288, "grad_norm": 0.6449309860617594, "learning_rate": 7.040477463371449e-06, "loss": 0.8961, "step": 1815 }, { "epoch": 1.9137749737118823, "grad_norm": 0.6118460786718801, "learning_rate": 6.982082557253371e-06, "loss": 0.8898, "step": 1820 }, { "epoch": 1.9190325972660358, "grad_norm": 0.6200070078112132, "learning_rate": 6.9238006551873985e-06, "loss": 0.8993, "step": 1825 }, { "epoch": 1.9242902208201893, "grad_norm": 0.62946195709294, "learning_rate": 6.86563393950331e-06, "loss": 0.8746, "step": 1830 }, { "epoch": 1.9295478443743428, "grad_norm": 0.6894329752058552, "learning_rate": 6.807584588217798e-06, "loss": 0.8768, "step": 1835 }, { "epoch": 1.9348054679284963, "grad_norm": 0.6337025218810814, "learning_rate": 6.749654774952925e-06, "loss": 0.8774, "step": 1840 }, { "epoch": 1.9400630914826498, "grad_norm": 0.6061458342443647, "learning_rate": 6.691846668854709e-06, "loss": 0.8925, "step": 1845 }, { "epoch": 1.9453207150368033, "grad_norm": 0.6323722322620482, "learning_rate": 6.634162434511939e-06, "loss": 0.8878, "step": 1850 }, { "epoch": 1.9505783385909568, "grad_norm": 0.6295608770739457, "learning_rate": 6.57660423187509e-06, "loss": 0.8894, "step": 1855 }, { "epoch": 1.9558359621451105, "grad_norm": 0.7448236764255614, "learning_rate": 6.519174216175458e-06, "loss": 0.884, "step": 1860 }, { "epoch": 1.961093585699264, "grad_norm": 0.6643005564433259, "learning_rate": 6.461874537844465e-06, "loss": 0.8712, "step": 1865 }, { "epoch": 1.9663512092534174, "grad_norm": 0.6460805751831616, "learning_rate": 6.404707342433123e-06, "loss": 0.8794, "step": 1870 }, { "epoch": 1.971608832807571, "grad_norm": 0.6437260367816269, "learning_rate": 6.347674770531716e-06, "loss": 0.8913, "step": 1875 }, { "epoch": 1.9768664563617246, "grad_norm": 0.6422567155892785, "learning_rate": 6.2907789576896125e-06, "loss": 0.8722, "step": 1880 }, { "epoch": 1.9821240799158781, "grad_norm": 0.6631332611742206, "learning_rate": 6.2340220343353455e-06, "loss": 0.8747, "step": 1885 }, { "epoch": 1.9873817034700316, "grad_norm": 0.5923326352879508, "learning_rate": 6.177406125696804e-06, "loss": 0.8863, "step": 1890 }, { "epoch": 1.9926393270241851, "grad_norm": 0.6040038053093328, "learning_rate": 6.120933351721665e-06, "loss": 0.8822, "step": 1895 }, { "epoch": 1.9978969505783386, "grad_norm": 0.6155901401028533, "learning_rate": 6.064605826998031e-06, "loss": 0.8781, "step": 1900 }, { "epoch": 1.9978969505783386, "eval_loss": 1.0413092374801636, "eval_runtime": 562.2304, "eval_samples_per_second": 23.935, "eval_steps_per_second": 0.189, "step": 1900 }, { "epoch": 2.003154574132492, "grad_norm": 0.9856009462574625, "learning_rate": 6.00842566067522e-06, "loss": 0.7565, "step": 1905 }, { "epoch": 2.0084121976866456, "grad_norm": 1.1010436371290768, "learning_rate": 5.952394956384823e-06, "loss": 0.7157, "step": 1910 }, { "epoch": 2.013669821240799, "grad_norm": 0.7976498381871772, "learning_rate": 5.896515812161896e-06, "loss": 0.7125, "step": 1915 }, { "epoch": 2.0189274447949526, "grad_norm": 0.7307266469267819, "learning_rate": 5.840790320366444e-06, "loss": 0.7208, "step": 1920 }, { "epoch": 2.024185068349106, "grad_norm": 0.7601898382424687, "learning_rate": 5.7852205676050355e-06, "loss": 0.7079, "step": 1925 }, { "epoch": 2.0294426919032595, "grad_norm": 0.7636705961643997, "learning_rate": 5.7298086346527e-06, "loss": 0.7021, "step": 1930 }, { "epoch": 2.034700315457413, "grad_norm": 0.7600945150765135, "learning_rate": 5.674556596374993e-06, "loss": 0.698, "step": 1935 }, { "epoch": 2.039957939011567, "grad_norm": 0.7283661435999434, "learning_rate": 5.619466521650309e-06, "loss": 0.7135, "step": 1940 }, { "epoch": 2.0452155625657205, "grad_norm": 0.673772367415323, "learning_rate": 5.564540473292433e-06, "loss": 0.712, "step": 1945 }, { "epoch": 2.050473186119874, "grad_norm": 0.7274211486508272, "learning_rate": 5.509780507973266e-06, "loss": 0.7316, "step": 1950 }, { "epoch": 2.0557308096740274, "grad_norm": 0.7019707530514135, "learning_rate": 5.455188676145846e-06, "loss": 0.7178, "step": 1955 }, { "epoch": 2.060988433228181, "grad_norm": 0.727356607819457, "learning_rate": 5.40076702196755e-06, "loss": 0.6901, "step": 1960 }, { "epoch": 2.0662460567823344, "grad_norm": 0.7393497514045044, "learning_rate": 5.346517583223567e-06, "loss": 0.7091, "step": 1965 }, { "epoch": 2.071503680336488, "grad_norm": 0.6909005753061759, "learning_rate": 5.292442391250567e-06, "loss": 0.7103, "step": 1970 }, { "epoch": 2.0767613038906414, "grad_norm": 0.7199779190451211, "learning_rate": 5.238543470860677e-06, "loss": 0.7142, "step": 1975 }, { "epoch": 2.082018927444795, "grad_norm": 0.6986050924763797, "learning_rate": 5.184822840265635e-06, "loss": 0.719, "step": 1980 }, { "epoch": 2.0872765509989484, "grad_norm": 0.6873483374112779, "learning_rate": 5.131282511001221e-06, "loss": 0.7188, "step": 1985 }, { "epoch": 2.092534174553102, "grad_norm": 0.704017833699201, "learning_rate": 5.077924487851954e-06, "loss": 0.7206, "step": 1990 }, { "epoch": 2.0977917981072554, "grad_norm": 0.6869215244017003, "learning_rate": 5.024750768776011e-06, "loss": 0.7197, "step": 1995 }, { "epoch": 2.103049421661409, "grad_norm": 0.6616206251205331, "learning_rate": 4.971763344830419e-06, "loss": 0.7197, "step": 2000 }, { "epoch": 2.103049421661409, "eval_loss": 1.0822256803512573, "eval_runtime": 566.9236, "eval_samples_per_second": 23.737, "eval_steps_per_second": 0.187, "step": 2000 }, { "epoch": 2.108307045215563, "grad_norm": 0.747519024431639, "learning_rate": 4.91896420009649e-06, "loss": 0.7115, "step": 2005 }, { "epoch": 2.1135646687697163, "grad_norm": 0.7095283324919017, "learning_rate": 4.866355311605547e-06, "loss": 0.7215, "step": 2010 }, { "epoch": 2.1188222923238698, "grad_norm": 0.7245597363837365, "learning_rate": 4.813938649264881e-06, "loss": 0.7038, "step": 2015 }, { "epoch": 2.1240799158780233, "grad_norm": 0.7212203821120433, "learning_rate": 4.7617161757839895e-06, "loss": 0.715, "step": 2020 }, { "epoch": 2.1293375394321767, "grad_norm": 0.7027132940392441, "learning_rate": 4.7096898466010976e-06, "loss": 0.716, "step": 2025 }, { "epoch": 2.1345951629863302, "grad_norm": 0.6920491890608464, "learning_rate": 4.657861609809923e-06, "loss": 0.7027, "step": 2030 }, { "epoch": 2.1398527865404837, "grad_norm": 0.7246862757367895, "learning_rate": 4.6062334060867416e-06, "loss": 0.7211, "step": 2035 }, { "epoch": 2.145110410094637, "grad_norm": 0.6816731320053306, "learning_rate": 4.554807168617703e-06, "loss": 0.7127, "step": 2040 }, { "epoch": 2.1503680336487907, "grad_norm": 0.7013385203267727, "learning_rate": 4.5035848230264715e-06, "loss": 0.7158, "step": 2045 }, { "epoch": 2.155625657202944, "grad_norm": 0.7169543079018775, "learning_rate": 4.452568287302088e-06, "loss": 0.7071, "step": 2050 }, { "epoch": 2.1608832807570977, "grad_norm": 0.713248407044651, "learning_rate": 4.40175947172719e-06, "loss": 0.7068, "step": 2055 }, { "epoch": 2.166140904311251, "grad_norm": 0.6698951380098755, "learning_rate": 4.351160278806444e-06, "loss": 0.7169, "step": 2060 }, { "epoch": 2.1713985278654047, "grad_norm": 0.6926886822542322, "learning_rate": 4.300772603195335e-06, "loss": 0.7097, "step": 2065 }, { "epoch": 2.176656151419558, "grad_norm": 0.7101604887955768, "learning_rate": 4.250598331629215e-06, "loss": 0.7199, "step": 2070 }, { "epoch": 2.181913774973712, "grad_norm": 0.6817786841786956, "learning_rate": 4.200639342852648e-06, "loss": 0.709, "step": 2075 }, { "epoch": 2.1871713985278656, "grad_norm": 0.670024634466742, "learning_rate": 4.150897507549076e-06, "loss": 0.7031, "step": 2080 }, { "epoch": 2.192429022082019, "grad_norm": 0.704511383930273, "learning_rate": 4.101374688270751e-06, "loss": 0.716, "step": 2085 }, { "epoch": 2.1976866456361726, "grad_norm": 0.6737857814580686, "learning_rate": 4.052072739369015e-06, "loss": 0.7151, "step": 2090 }, { "epoch": 2.202944269190326, "grad_norm": 0.7004818342552892, "learning_rate": 4.0029935069248494e-06, "loss": 0.7084, "step": 2095 }, { "epoch": 2.2082018927444795, "grad_norm": 0.6938485406548258, "learning_rate": 3.954138828679762e-06, "loss": 0.7137, "step": 2100 }, { "epoch": 2.2082018927444795, "eval_loss": 1.0840835571289062, "eval_runtime": 554.526, "eval_samples_per_second": 24.268, "eval_steps_per_second": 0.191, "step": 2100 }, { "epoch": 2.213459516298633, "grad_norm": 0.6902078976776752, "learning_rate": 3.905510533966959e-06, "loss": 0.7096, "step": 2105 }, { "epoch": 2.2187171398527865, "grad_norm": 0.7110522716973304, "learning_rate": 3.857110443642864e-06, "loss": 0.6949, "step": 2110 }, { "epoch": 2.22397476340694, "grad_norm": 0.7247408104466715, "learning_rate": 3.8089403700189254e-06, "loss": 0.7187, "step": 2115 }, { "epoch": 2.2292323869610935, "grad_norm": 0.7097288878868501, "learning_rate": 3.7610021167937526e-06, "loss": 0.7036, "step": 2120 }, { "epoch": 2.234490010515247, "grad_norm": 0.7612906599424331, "learning_rate": 3.713297478985595e-06, "loss": 0.7205, "step": 2125 }, { "epoch": 2.2397476340694005, "grad_norm": 0.7985865232124967, "learning_rate": 3.6658282428651026e-06, "loss": 0.7018, "step": 2130 }, { "epoch": 2.245005257623554, "grad_norm": 0.6445514804150951, "learning_rate": 3.618596185888471e-06, "loss": 0.6983, "step": 2135 }, { "epoch": 2.250262881177708, "grad_norm": 0.6788252376343907, "learning_rate": 3.5716030766308553e-06, "loss": 0.6963, "step": 2140 }, { "epoch": 2.2555205047318614, "grad_norm": 0.6558652902911214, "learning_rate": 3.5248506747201694e-06, "loss": 0.6988, "step": 2145 }, { "epoch": 2.260778128286015, "grad_norm": 0.727190238646923, "learning_rate": 3.4783407307711913e-06, "loss": 0.701, "step": 2150 }, { "epoch": 2.2660357518401684, "grad_norm": 0.7053251271830925, "learning_rate": 3.4320749863199987e-06, "loss": 0.7038, "step": 2155 }, { "epoch": 2.271293375394322, "grad_norm": 0.691685408706534, "learning_rate": 3.3860551737587857e-06, "loss": 0.7068, "step": 2160 }, { "epoch": 2.2765509989484753, "grad_norm": 0.6897266118308167, "learning_rate": 3.3402830162709644e-06, "loss": 0.703, "step": 2165 }, { "epoch": 2.281808622502629, "grad_norm": 0.6917521598477109, "learning_rate": 3.2947602277666678e-06, "loss": 0.7136, "step": 2170 }, { "epoch": 2.2870662460567823, "grad_norm": 0.6899343095386444, "learning_rate": 3.2494885128185517e-06, "loss": 0.6984, "step": 2175 }, { "epoch": 2.292323869610936, "grad_norm": 0.6869089208872174, "learning_rate": 3.2044695665979865e-06, "loss": 0.724, "step": 2180 }, { "epoch": 2.2975814931650893, "grad_norm": 0.7005346292608602, "learning_rate": 3.1597050748115655e-06, "loss": 0.7035, "step": 2185 }, { "epoch": 2.302839116719243, "grad_norm": 0.7061499912056902, "learning_rate": 3.115196713638e-06, "loss": 0.6865, "step": 2190 }, { "epoch": 2.3080967402733963, "grad_norm": 0.6815319705079519, "learning_rate": 3.0709461496653504e-06, "loss": 0.7156, "step": 2195 }, { "epoch": 2.3133543638275498, "grad_norm": 0.7049825225126681, "learning_rate": 3.0269550398286096e-06, "loss": 0.7115, "step": 2200 }, { "epoch": 2.3133543638275498, "eval_loss": 1.0800352096557617, "eval_runtime": 568.5479, "eval_samples_per_second": 23.669, "eval_steps_per_second": 0.186, "step": 2200 }, { "epoch": 2.3186119873817033, "grad_norm": 0.6675183707377966, "learning_rate": 2.983225031347683e-06, "loss": 0.7087, "step": 2205 }, { "epoch": 2.3238696109358568, "grad_norm": 0.7114348169331429, "learning_rate": 2.939757761665686e-06, "loss": 0.7077, "step": 2210 }, { "epoch": 2.3291272344900107, "grad_norm": 0.7191874914216904, "learning_rate": 2.8965548583876534e-06, "loss": 0.7201, "step": 2215 }, { "epoch": 2.334384858044164, "grad_norm": 0.6766258501238187, "learning_rate": 2.853617939219574e-06, "loss": 0.7072, "step": 2220 }, { "epoch": 2.3396424815983177, "grad_norm": 0.7028752741574394, "learning_rate": 2.810948611907832e-06, "loss": 0.6955, "step": 2225 }, { "epoch": 2.344900105152471, "grad_norm": 0.7210493538085075, "learning_rate": 2.7685484741790023e-06, "loss": 0.7129, "step": 2230 }, { "epoch": 2.3501577287066246, "grad_norm": 0.6928964162595481, "learning_rate": 2.7264191136800112e-06, "loss": 0.6873, "step": 2235 }, { "epoch": 2.355415352260778, "grad_norm": 0.6949752358383088, "learning_rate": 2.6845621079187122e-06, "loss": 0.7207, "step": 2240 }, { "epoch": 2.3606729758149316, "grad_norm": 0.7000497878298911, "learning_rate": 2.6429790242047927e-06, "loss": 0.7019, "step": 2245 }, { "epoch": 2.365930599369085, "grad_norm": 0.6655488986940491, "learning_rate": 2.6016714195911085e-06, "loss": 0.6909, "step": 2250 }, { "epoch": 2.3711882229232386, "grad_norm": 0.6946100724369102, "learning_rate": 2.560640840815363e-06, "loss": 0.703, "step": 2255 }, { "epoch": 2.376445846477392, "grad_norm": 0.6799665527381428, "learning_rate": 2.5198888242422014e-06, "loss": 0.7029, "step": 2260 }, { "epoch": 2.3817034700315456, "grad_norm": 0.698092499847167, "learning_rate": 2.4794168958056854e-06, "loss": 0.706, "step": 2265 }, { "epoch": 2.386961093585699, "grad_norm": 0.6725956864860293, "learning_rate": 2.439226570952137e-06, "loss": 0.7087, "step": 2270 }, { "epoch": 2.392218717139853, "grad_norm": 0.7109494323803826, "learning_rate": 2.3993193545834182e-06, "loss": 0.7125, "step": 2275 }, { "epoch": 2.3974763406940065, "grad_norm": 0.7088160313512611, "learning_rate": 2.35969674100056e-06, "loss": 0.6979, "step": 2280 }, { "epoch": 2.40273396424816, "grad_norm": 0.6826523489540324, "learning_rate": 2.3203602138478264e-06, "loss": 0.7055, "step": 2285 }, { "epoch": 2.4079915878023135, "grad_norm": 0.6930882874841964, "learning_rate": 2.281311246057143e-06, "loss": 0.7201, "step": 2290 }, { "epoch": 2.413249211356467, "grad_norm": 0.6782194389254947, "learning_rate": 2.242551299792962e-06, "loss": 0.7278, "step": 2295 }, { "epoch": 2.4185068349106205, "grad_norm": 0.6611886260527141, "learning_rate": 2.204081826397494e-06, "loss": 0.7178, "step": 2300 }, { "epoch": 2.4185068349106205, "eval_loss": 1.0789012908935547, "eval_runtime": 548.9059, "eval_samples_per_second": 24.516, "eval_steps_per_second": 0.193, "step": 2300 }, { "epoch": 2.423764458464774, "grad_norm": 0.6913748928617807, "learning_rate": 2.1659042663363795e-06, "loss": 0.7031, "step": 2305 }, { "epoch": 2.4290220820189274, "grad_norm": 0.68971986235768, "learning_rate": 2.1280200491447465e-06, "loss": 0.6902, "step": 2310 }, { "epoch": 2.434279705573081, "grad_norm": 0.7068453091320502, "learning_rate": 2.0904305933736714e-06, "loss": 0.7064, "step": 2315 }, { "epoch": 2.4395373291272344, "grad_norm": 0.7009937280786678, "learning_rate": 2.053137306537082e-06, "loss": 0.702, "step": 2320 }, { "epoch": 2.444794952681388, "grad_norm": 0.7009541498050648, "learning_rate": 2.0161415850590327e-06, "loss": 0.7072, "step": 2325 }, { "epoch": 2.4500525762355414, "grad_norm": 0.6679413662712783, "learning_rate": 1.9794448142214396e-06, "loss": 0.7121, "step": 2330 }, { "epoch": 2.455310199789695, "grad_norm": 0.6929272185822167, "learning_rate": 1.9430483681121836e-06, "loss": 0.7164, "step": 2335 }, { "epoch": 2.4605678233438484, "grad_norm": 0.7778000958451866, "learning_rate": 1.9069536095736817e-06, "loss": 0.7091, "step": 2340 }, { "epoch": 2.465825446898002, "grad_norm": 0.6672776696135466, "learning_rate": 1.8711618901518446e-06, "loss": 0.7132, "step": 2345 }, { "epoch": 2.471083070452156, "grad_norm": 0.6949140160619673, "learning_rate": 1.8356745500454699e-06, "loss": 0.6974, "step": 2350 }, { "epoch": 2.4763406940063093, "grad_norm": 0.6950911698278153, "learning_rate": 1.8004929180560582e-06, "loss": 0.6894, "step": 2355 }, { "epoch": 2.481598317560463, "grad_norm": 0.6826148060946653, "learning_rate": 1.7656183115380577e-06, "loss": 0.7043, "step": 2360 }, { "epoch": 2.4868559411146163, "grad_norm": 0.7310354415413428, "learning_rate": 1.7310520363495454e-06, "loss": 0.7021, "step": 2365 }, { "epoch": 2.4921135646687698, "grad_norm": 0.6754671470342107, "learning_rate": 1.6967953868033104e-06, "loss": 0.7043, "step": 2370 }, { "epoch": 2.4973711882229233, "grad_norm": 0.6935442287350769, "learning_rate": 1.6628496456184107e-06, "loss": 0.6994, "step": 2375 }, { "epoch": 2.5026288117770767, "grad_norm": 0.690259266155438, "learning_rate": 1.6292160838721316e-06, "loss": 0.6946, "step": 2380 }, { "epoch": 2.5078864353312302, "grad_norm": 0.6934285014568452, "learning_rate": 1.5958959609523905e-06, "loss": 0.719, "step": 2385 }, { "epoch": 2.5131440588853837, "grad_norm": 0.706595235609839, "learning_rate": 1.562890524510583e-06, "loss": 0.699, "step": 2390 }, { "epoch": 2.518401682439537, "grad_norm": 0.7031045404384867, "learning_rate": 1.530201010414859e-06, "loss": 0.7019, "step": 2395 }, { "epoch": 2.5236593059936907, "grad_norm": 0.6611225731580428, "learning_rate": 1.4978286427038602e-06, "loss": 0.7063, "step": 2400 }, { "epoch": 2.5236593059936907, "eval_loss": 1.0776675939559937, "eval_runtime": 549.0786, "eval_samples_per_second": 24.508, "eval_steps_per_second": 0.193, "step": 2400 }, { "epoch": 2.5289169295478446, "grad_norm": 0.6876289627741422, "learning_rate": 1.4657746335408695e-06, "loss": 0.7068, "step": 2405 }, { "epoch": 2.534174553101998, "grad_norm": 0.680233555417602, "learning_rate": 1.4340401831684413e-06, "loss": 0.6807, "step": 2410 }, { "epoch": 2.5394321766561516, "grad_norm": 0.6654932547762412, "learning_rate": 1.4026264798634359e-06, "loss": 0.7179, "step": 2415 }, { "epoch": 2.544689800210305, "grad_norm": 0.6945732690751362, "learning_rate": 1.371534699892547e-06, "loss": 0.7086, "step": 2420 }, { "epoch": 2.5499474237644586, "grad_norm": 0.6862420273962914, "learning_rate": 1.3407660074682472e-06, "loss": 0.7028, "step": 2425 }, { "epoch": 2.555205047318612, "grad_norm": 0.651460129300283, "learning_rate": 1.3103215547051962e-06, "loss": 0.6975, "step": 2430 }, { "epoch": 2.5604626708727656, "grad_norm": 0.6970590762896678, "learning_rate": 1.2802024815770942e-06, "loss": 0.7115, "step": 2435 }, { "epoch": 2.565720294426919, "grad_norm": 0.6744240212503375, "learning_rate": 1.250409915874007e-06, "loss": 0.7057, "step": 2440 }, { "epoch": 2.5709779179810726, "grad_norm": 0.6699733139877856, "learning_rate": 1.220944973160133e-06, "loss": 0.6884, "step": 2445 }, { "epoch": 2.576235541535226, "grad_norm": 0.6915305368046275, "learning_rate": 1.1918087567320257e-06, "loss": 0.7026, "step": 2450 }, { "epoch": 2.5814931650893795, "grad_norm": 0.6755768658668228, "learning_rate": 1.1630023575772908e-06, "loss": 0.6966, "step": 2455 }, { "epoch": 2.586750788643533, "grad_norm": 0.705779731938613, "learning_rate": 1.1345268543337283e-06, "loss": 0.6988, "step": 2460 }, { "epoch": 2.5920084121976865, "grad_norm": 0.7111985726538933, "learning_rate": 1.1063833132489477e-06, "loss": 0.696, "step": 2465 }, { "epoch": 2.59726603575184, "grad_norm": 0.6539744158999056, "learning_rate": 1.0785727881404329e-06, "loss": 0.6961, "step": 2470 }, { "epoch": 2.6025236593059935, "grad_norm": 0.6848492944946433, "learning_rate": 1.051096320356103e-06, "loss": 0.7046, "step": 2475 }, { "epoch": 2.607781282860147, "grad_norm": 0.7032823101149783, "learning_rate": 1.0239549387352954e-06, "loss": 0.7201, "step": 2480 }, { "epoch": 2.6130389064143005, "grad_norm": 0.6762173164818084, "learning_rate": 9.97149659570259e-07, "loss": 0.7116, "step": 2485 }, { "epoch": 2.6182965299684544, "grad_norm": 0.6806035208648271, "learning_rate": 9.706814865680957e-07, "loss": 0.7045, "step": 2490 }, { "epoch": 2.623554153522608, "grad_norm": 0.6776503088053696, "learning_rate": 9.445514108131693e-07, "loss": 0.6888, "step": 2495 }, { "epoch": 2.6288117770767614, "grad_norm": 0.6836339268439919, "learning_rate": 9.187604107300107e-07, "loss": 0.6964, "step": 2500 }, { "epoch": 2.6288117770767614, "eval_loss": 1.0754879713058472, "eval_runtime": 544.4972, "eval_samples_per_second": 24.715, "eval_steps_per_second": 0.195, "step": 2500 }, { "epoch": 2.634069400630915, "grad_norm": 0.6761130619047382, "learning_rate": 8.933094520466634e-07, "loss": 0.7058, "step": 2505 }, { "epoch": 2.6393270241850684, "grad_norm": 0.6672694366752451, "learning_rate": 8.681994877585365e-07, "loss": 0.7054, "step": 2510 }, { "epoch": 2.644584647739222, "grad_norm": 0.7017173692899314, "learning_rate": 8.434314580927105e-07, "loss": 0.7003, "step": 2515 }, { "epoch": 2.6498422712933754, "grad_norm": 0.6828167224204641, "learning_rate": 8.19006290472737e-07, "loss": 0.7134, "step": 2520 }, { "epoch": 2.655099894847529, "grad_norm": 0.6887161892823586, "learning_rate": 7.949248994839131e-07, "loss": 0.7107, "step": 2525 }, { "epoch": 2.6603575184016823, "grad_norm": 0.6858305599284509, "learning_rate": 7.711881868390292e-07, "loss": 0.7185, "step": 2530 }, { "epoch": 2.665615141955836, "grad_norm": 0.6919951634850794, "learning_rate": 7.477970413446089e-07, "loss": 0.7038, "step": 2535 }, { "epoch": 2.6708727655099898, "grad_norm": 0.7059421711173827, "learning_rate": 7.247523388676292e-07, "loss": 0.6934, "step": 2540 }, { "epoch": 2.6761303890641432, "grad_norm": 0.697370543891664, "learning_rate": 7.020549423027223e-07, "loss": 0.6874, "step": 2545 }, { "epoch": 2.6813880126182967, "grad_norm": 0.6851210955122395, "learning_rate": 6.797057015398634e-07, "loss": 0.7091, "step": 2550 }, { "epoch": 2.6866456361724502, "grad_norm": 0.6810814971271851, "learning_rate": 6.577054534325511e-07, "loss": 0.6935, "step": 2555 }, { "epoch": 2.6919032597266037, "grad_norm": 0.6676833725760639, "learning_rate": 6.360550217664685e-07, "loss": 0.7088, "step": 2560 }, { "epoch": 2.697160883280757, "grad_norm": 0.7148977742599517, "learning_rate": 6.147552172286375e-07, "loss": 0.6987, "step": 2565 }, { "epoch": 2.7024185068349107, "grad_norm": 0.6475197510665502, "learning_rate": 5.938068373770667e-07, "loss": 0.6864, "step": 2570 }, { "epoch": 2.707676130389064, "grad_norm": 0.685110898697612, "learning_rate": 5.732106666108827e-07, "loss": 0.6937, "step": 2575 }, { "epoch": 2.7129337539432177, "grad_norm": 0.6850644373487722, "learning_rate": 5.529674761409643e-07, "loss": 0.701, "step": 2580 }, { "epoch": 2.718191377497371, "grad_norm": 0.6619622645326332, "learning_rate": 5.330780239610534e-07, "loss": 0.705, "step": 2585 }, { "epoch": 2.7234490010515247, "grad_norm": 0.6779887305496379, "learning_rate": 5.135430548193909e-07, "loss": 0.6912, "step": 2590 }, { "epoch": 2.728706624605678, "grad_norm": 0.6695357873979283, "learning_rate": 4.943633001908111e-07, "loss": 0.7007, "step": 2595 }, { "epoch": 2.7339642481598316, "grad_norm": 0.6851094475471325, "learning_rate": 4.7553947824936496e-07, "loss": 0.7121, "step": 2600 }, { "epoch": 2.7339642481598316, "eval_loss": 1.0742169618606567, "eval_runtime": 543.8651, "eval_samples_per_second": 24.743, "eval_steps_per_second": 0.195, "step": 2600 }, { "epoch": 2.739221871713985, "grad_norm": 0.6798881286754066, "learning_rate": 4.5707229384142184e-07, "loss": 0.7043, "step": 2605 }, { "epoch": 2.7444794952681386, "grad_norm": 0.6627199879579073, "learning_rate": 4.3896243845927943e-07, "loss": 0.7083, "step": 2610 }, { "epoch": 2.749737118822292, "grad_norm": 0.6911107462785068, "learning_rate": 4.21210590215273e-07, "loss": 0.7062, "step": 2615 }, { "epoch": 2.7549947423764456, "grad_norm": 0.6538298159253733, "learning_rate": 4.0381741381638085e-07, "loss": 0.6919, "step": 2620 }, { "epoch": 2.7602523659305995, "grad_norm": 0.6913261772512153, "learning_rate": 3.8678356053933666e-07, "loss": 0.6899, "step": 2625 }, { "epoch": 2.765509989484753, "grad_norm": 0.6731586319154937, "learning_rate": 3.7010966820623996e-07, "loss": 0.7115, "step": 2630 }, { "epoch": 2.7707676130389065, "grad_norm": 0.6739111157184594, "learning_rate": 3.5379636116067764e-07, "loss": 0.6938, "step": 2635 }, { "epoch": 2.77602523659306, "grad_norm": 0.6775894239204638, "learning_rate": 3.378442502443424e-07, "loss": 0.7018, "step": 2640 }, { "epoch": 2.7812828601472135, "grad_norm": 0.6630535974515509, "learning_rate": 3.222539327741592e-07, "loss": 0.7108, "step": 2645 }, { "epoch": 2.786540483701367, "grad_norm": 0.6476313251006354, "learning_rate": 3.070259925199248e-07, "loss": 0.7064, "step": 2650 }, { "epoch": 2.7917981072555205, "grad_norm": 0.6793550821713811, "learning_rate": 2.921609996824437e-07, "loss": 0.686, "step": 2655 }, { "epoch": 2.797055730809674, "grad_norm": 0.6950659181503308, "learning_rate": 2.7765951087218134e-07, "loss": 0.6922, "step": 2660 }, { "epoch": 2.8023133543638274, "grad_norm": 0.6759277309855073, "learning_rate": 2.6352206908841325e-07, "loss": 0.7123, "step": 2665 }, { "epoch": 2.807570977917981, "grad_norm": 0.6871290912583685, "learning_rate": 2.497492036989058e-07, "loss": 0.7071, "step": 2670 }, { "epoch": 2.812828601472135, "grad_norm": 0.6672178424750838, "learning_rate": 2.3634143042008396e-07, "loss": 0.7055, "step": 2675 }, { "epoch": 2.8180862250262884, "grad_norm": 0.6871427641549465, "learning_rate": 2.2329925129772613e-07, "loss": 0.7162, "step": 2680 }, { "epoch": 2.823343848580442, "grad_norm": 0.6996639531083144, "learning_rate": 2.1062315468816318e-07, "loss": 0.7116, "step": 2685 }, { "epoch": 2.8286014721345953, "grad_norm": 0.7057461914462779, "learning_rate": 1.9831361523999227e-07, "loss": 0.6978, "step": 2690 }, { "epoch": 2.833859095688749, "grad_norm": 0.6606180852855636, "learning_rate": 1.8637109387630637e-07, "loss": 0.6872, "step": 2695 }, { "epoch": 2.8391167192429023, "grad_norm": 0.6603518954437334, "learning_rate": 1.7479603777742937e-07, "loss": 0.7049, "step": 2700 }, { "epoch": 2.8391167192429023, "eval_loss": 1.074755311012268, "eval_runtime": 548.3041, "eval_samples_per_second": 24.543, "eval_steps_per_second": 0.193, "step": 2700 }, { "epoch": 2.844374342797056, "grad_norm": 0.7039186631952389, "learning_rate": 1.6358888036418053e-07, "loss": 0.7076, "step": 2705 }, { "epoch": 2.8496319663512093, "grad_norm": 0.6613941861667958, "learning_rate": 1.5275004128163407e-07, "loss": 0.7022, "step": 2710 }, { "epoch": 2.854889589905363, "grad_norm": 0.6784432805911156, "learning_rate": 1.422799263834196e-07, "loss": 0.7018, "step": 2715 }, { "epoch": 2.8601472134595163, "grad_norm": 0.662880920108081, "learning_rate": 1.3217892771651087e-07, "loss": 0.7039, "step": 2720 }, { "epoch": 2.8654048370136698, "grad_norm": 0.674177068306156, "learning_rate": 1.224474235065587e-07, "loss": 0.6948, "step": 2725 }, { "epoch": 2.8706624605678233, "grad_norm": 0.6576941034750949, "learning_rate": 1.1308577814371669e-07, "loss": 0.6959, "step": 2730 }, { "epoch": 2.8759200841219767, "grad_norm": 0.6877738227702634, "learning_rate": 1.040943421690055e-07, "loss": 0.7016, "step": 2735 }, { "epoch": 2.8811777076761302, "grad_norm": 0.6570796449184478, "learning_rate": 9.547345226118666e-08, "loss": 0.7008, "step": 2740 }, { "epoch": 2.8864353312302837, "grad_norm": 0.6556870027002477, "learning_rate": 8.722343122414823e-08, "loss": 0.7114, "step": 2745 }, { "epoch": 2.891692954784437, "grad_norm": 0.6525356309193387, "learning_rate": 7.93445879748267e-08, "loss": 0.705, "step": 2750 }, { "epoch": 2.8969505783385907, "grad_norm": 0.6979809421888648, "learning_rate": 7.183721753163508e-08, "loss": 0.705, "step": 2755 }, { "epoch": 2.9022082018927446, "grad_norm": 0.6680505376816218, "learning_rate": 6.470160100341516e-08, "loss": 0.7028, "step": 2760 }, { "epoch": 2.907465825446898, "grad_norm": 0.6754425700333265, "learning_rate": 5.793800557891471e-08, "loss": 0.6969, "step": 2765 }, { "epoch": 2.9127234490010516, "grad_norm": 0.6770770823855421, "learning_rate": 5.154668451678224e-08, "loss": 0.709, "step": 2770 }, { "epoch": 2.917981072555205, "grad_norm": 0.6880130710385723, "learning_rate": 4.552787713608231e-08, "loss": 0.69, "step": 2775 }, { "epoch": 2.9232386961093586, "grad_norm": 0.6625414510833385, "learning_rate": 3.988180880733161e-08, "loss": 0.6962, "step": 2780 }, { "epoch": 2.928496319663512, "grad_norm": 0.6643252155800653, "learning_rate": 3.460869094407127e-08, "loss": 0.7037, "step": 2785 }, { "epoch": 2.9337539432176656, "grad_norm": 0.6897645676504198, "learning_rate": 2.9708720994934272e-08, "loss": 0.6896, "step": 2790 }, { "epoch": 2.939011566771819, "grad_norm": 0.7113672933129457, "learning_rate": 2.5182082436266963e-08, "loss": 0.7165, "step": 2795 }, { "epoch": 2.9442691903259726, "grad_norm": 0.6781710312687059, "learning_rate": 2.1028944765251193e-08, "loss": 0.7024, "step": 2800 }, { "epoch": 2.9442691903259726, "eval_loss": 1.074735403060913, "eval_runtime": 544.9092, "eval_samples_per_second": 24.696, "eval_steps_per_second": 0.195, "step": 2800 }, { "epoch": 2.949526813880126, "grad_norm": 0.7502190973801118, "learning_rate": 1.724946349355605e-08, "loss": 0.6952, "step": 2805 }, { "epoch": 2.9547844374342795, "grad_norm": 0.6554060805074167, "learning_rate": 1.3843780141521435e-08, "loss": 0.7095, "step": 2810 }, { "epoch": 2.9600420609884335, "grad_norm": 0.6884790361695539, "learning_rate": 1.081202223285449e-08, "loss": 0.7096, "step": 2815 }, { "epoch": 2.965299684542587, "grad_norm": 0.6687316519292371, "learning_rate": 8.154303289854559e-09, "loss": 0.7071, "step": 2820 }, { "epoch": 2.9705573080967405, "grad_norm": 0.6719077380861403, "learning_rate": 5.870722829164344e-09, "loss": 0.6954, "step": 2825 }, { "epoch": 2.975814931650894, "grad_norm": 0.6445219670997994, "learning_rate": 3.9613663580406745e-09, "loss": 0.6844, "step": 2830 }, { "epoch": 2.9810725552050474, "grad_norm": 0.6702818163839258, "learning_rate": 2.426305371155957e-09, "loss": 0.6924, "step": 2835 }, { "epoch": 2.986330178759201, "grad_norm": 0.6546313538456479, "learning_rate": 1.265597347920311e-09, "loss": 0.7013, "step": 2840 }, { "epoch": 2.9915878023133544, "grad_norm": 0.6790610179426215, "learning_rate": 4.792857503266301e-10, "loss": 0.7013, "step": 2845 }, { "epoch": 2.996845425867508, "grad_norm": 0.6610872038208641, "learning_rate": 6.740002132743506e-11, "loss": 0.708, "step": 2850 }, { "epoch": 3.0, "step": 2853, "total_flos": 1194720315310080.0, "train_loss": 0.8973418972260736, "train_runtime": 76133.7056, "train_samples_per_second": 4.793, "train_steps_per_second": 0.037 } ], "logging_steps": 5, "max_steps": 2853, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1194720315310080.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }