{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999242787551428, "eval_steps": 2000, "global_step": 5571, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0017948739521722182, "grad_norm": 70.875, "learning_rate": 9.999964943868121e-07, "loss": 113.0671, "step": 10 }, { "epoch": 0.0035897479043444365, "grad_norm": 62.15625, "learning_rate": 9.999929887736243e-07, "loss": 109.1635, "step": 20 }, { "epoch": 0.0053846218565166545, "grad_norm": 53.625, "learning_rate": 9.999894831604363e-07, "loss": 109.8683, "step": 30 }, { "epoch": 0.007179495808688873, "grad_norm": 65.6875, "learning_rate": 9.999859775472485e-07, "loss": 108.1756, "step": 40 }, { "epoch": 0.00897436976086109, "grad_norm": 53.0625, "learning_rate": 9.99982471934061e-07, "loss": 106.7566, "step": 50 }, { "epoch": 0.010769243713033309, "grad_norm": 59.84375, "learning_rate": 9.99978966320873e-07, "loss": 107.5178, "step": 60 }, { "epoch": 0.012564117665205527, "grad_norm": 59.46875, "learning_rate": 9.99975460707685e-07, "loss": 106.8143, "step": 70 }, { "epoch": 0.014358991617377746, "grad_norm": 58.96875, "learning_rate": 9.999719550944973e-07, "loss": 106.2242, "step": 80 }, { "epoch": 0.016153865569549963, "grad_norm": 57.46875, "learning_rate": 9.999684494813095e-07, "loss": 105.5488, "step": 90 }, { "epoch": 0.01794873952172218, "grad_norm": 54.65625, "learning_rate": 9.999649438681217e-07, "loss": 105.1425, "step": 100 }, { "epoch": 0.0197436134738944, "grad_norm": 55.78125, "learning_rate": 9.999614382549337e-07, "loss": 105.8918, "step": 110 }, { "epoch": 0.021538487426066618, "grad_norm": 54.53125, "learning_rate": 9.999579326417459e-07, "loss": 105.4892, "step": 120 }, { "epoch": 0.023333361378238836, "grad_norm": 57.6875, "learning_rate": 9.99954427028558e-07, "loss": 105.5813, "step": 130 }, { "epoch": 0.025128235330411055, "grad_norm": 55.3125, "learning_rate": 9.999509214153702e-07, "loss": 105.1202, "step": 140 }, { "epoch": 0.026923109282583273, "grad_norm": 54.21875, "learning_rate": 9.999474158021824e-07, "loss": 105.9377, "step": 150 }, { "epoch": 0.028717983234755492, "grad_norm": 52.5625, "learning_rate": 9.999439101889946e-07, "loss": 104.7342, "step": 160 }, { "epoch": 0.03051285718692771, "grad_norm": 59.9375, "learning_rate": 9.999404045758068e-07, "loss": 104.9382, "step": 170 }, { "epoch": 0.032307731139099925, "grad_norm": 55.65625, "learning_rate": 9.999368989626188e-07, "loss": 106.4299, "step": 180 }, { "epoch": 0.034102605091272144, "grad_norm": 53.6875, "learning_rate": 9.99933393349431e-07, "loss": 104.4395, "step": 190 }, { "epoch": 0.03589747904344436, "grad_norm": 54.8125, "learning_rate": 9.999298877362432e-07, "loss": 104.3946, "step": 200 }, { "epoch": 0.03769235299561658, "grad_norm": 55.9375, "learning_rate": 9.999263821230554e-07, "loss": 105.1693, "step": 210 }, { "epoch": 0.0394872269477888, "grad_norm": 55.03125, "learning_rate": 9.999228765098676e-07, "loss": 104.7132, "step": 220 }, { "epoch": 0.04128210089996102, "grad_norm": 71.0625, "learning_rate": 9.999193708966798e-07, "loss": 104.7171, "step": 230 }, { "epoch": 0.043076974852133236, "grad_norm": 59.40625, "learning_rate": 9.99915865283492e-07, "loss": 104.5917, "step": 240 }, { "epoch": 0.044871848804305454, "grad_norm": 53.84375, "learning_rate": 9.999123596703042e-07, "loss": 104.3239, "step": 250 }, { "epoch": 0.04666672275647767, "grad_norm": 59.375, "learning_rate": 9.999088540571162e-07, "loss": 104.0901, "step": 260 }, { "epoch": 0.04846159670864989, "grad_norm": 56.0, "learning_rate": 9.999053484439284e-07, "loss": 104.5291, "step": 270 }, { "epoch": 0.05025647066082211, "grad_norm": 64.4375, "learning_rate": 9.999018428307405e-07, "loss": 104.5856, "step": 280 }, { "epoch": 0.05205134461299433, "grad_norm": 79.25, "learning_rate": 9.998983372175527e-07, "loss": 104.3207, "step": 290 }, { "epoch": 0.05384621856516655, "grad_norm": 64.375, "learning_rate": 9.99894831604365e-07, "loss": 104.9624, "step": 300 }, { "epoch": 0.055641092517338765, "grad_norm": 58.84375, "learning_rate": 9.99891325991177e-07, "loss": 104.5753, "step": 310 }, { "epoch": 0.057435966469510984, "grad_norm": 61.21875, "learning_rate": 9.998878203779893e-07, "loss": 104.478, "step": 320 }, { "epoch": 0.0592308404216832, "grad_norm": 62.5, "learning_rate": 9.998843147648013e-07, "loss": 103.2508, "step": 330 }, { "epoch": 0.06102571437385542, "grad_norm": 56.8125, "learning_rate": 9.998808091516135e-07, "loss": 104.3786, "step": 340 }, { "epoch": 0.06282058832602763, "grad_norm": 55.21875, "learning_rate": 9.998773035384257e-07, "loss": 103.974, "step": 350 }, { "epoch": 0.06461546227819985, "grad_norm": 61.71875, "learning_rate": 9.998737979252379e-07, "loss": 104.481, "step": 360 }, { "epoch": 0.06641033623037207, "grad_norm": 53.78125, "learning_rate": 9.9987029231205e-07, "loss": 104.1711, "step": 370 }, { "epoch": 0.06820521018254429, "grad_norm": 54.375, "learning_rate": 9.99866786698862e-07, "loss": 103.1215, "step": 380 }, { "epoch": 0.0700000841347165, "grad_norm": 65.6875, "learning_rate": 9.998632810856743e-07, "loss": 105.1524, "step": 390 }, { "epoch": 0.07179495808688872, "grad_norm": 58.46875, "learning_rate": 9.998597754724865e-07, "loss": 104.1496, "step": 400 }, { "epoch": 0.07358983203906094, "grad_norm": 56.28125, "learning_rate": 9.998562698592986e-07, "loss": 103.0084, "step": 410 }, { "epoch": 0.07538470599123316, "grad_norm": 58.9375, "learning_rate": 9.998527642461108e-07, "loss": 102.782, "step": 420 }, { "epoch": 0.07717957994340538, "grad_norm": 53.28125, "learning_rate": 9.99849258632923e-07, "loss": 104.664, "step": 430 }, { "epoch": 0.0789744538955776, "grad_norm": 55.53125, "learning_rate": 9.998457530197352e-07, "loss": 103.6033, "step": 440 }, { "epoch": 0.08076932784774982, "grad_norm": 54.6875, "learning_rate": 9.998422474065474e-07, "loss": 103.2984, "step": 450 }, { "epoch": 0.08256420179992204, "grad_norm": 58.75, "learning_rate": 9.998387417933594e-07, "loss": 103.1222, "step": 460 }, { "epoch": 0.08435907575209425, "grad_norm": 54.65625, "learning_rate": 9.998352361801716e-07, "loss": 104.3975, "step": 470 }, { "epoch": 0.08615394970426647, "grad_norm": 62.0625, "learning_rate": 9.998317305669838e-07, "loss": 103.6788, "step": 480 }, { "epoch": 0.08794882365643869, "grad_norm": 54.75, "learning_rate": 9.99828224953796e-07, "loss": 103.4202, "step": 490 }, { "epoch": 0.08974369760861091, "grad_norm": 58.3125, "learning_rate": 9.998247193406082e-07, "loss": 102.8853, "step": 500 }, { "epoch": 0.09153857156078313, "grad_norm": 56.21875, "learning_rate": 9.998212137274204e-07, "loss": 103.1342, "step": 510 }, { "epoch": 0.09333344551295535, "grad_norm": 57.71875, "learning_rate": 9.998177081142326e-07, "loss": 103.0709, "step": 520 }, { "epoch": 0.09512831946512756, "grad_norm": 60.4375, "learning_rate": 9.998142025010446e-07, "loss": 104.1919, "step": 530 }, { "epoch": 0.09692319341729978, "grad_norm": 58.34375, "learning_rate": 9.998106968878567e-07, "loss": 104.5302, "step": 540 }, { "epoch": 0.098718067369472, "grad_norm": 57.875, "learning_rate": 9.99807191274669e-07, "loss": 103.7714, "step": 550 }, { "epoch": 0.10051294132164422, "grad_norm": 62.46875, "learning_rate": 9.998036856614811e-07, "loss": 104.633, "step": 560 }, { "epoch": 0.10230781527381644, "grad_norm": 53.8125, "learning_rate": 9.998001800482933e-07, "loss": 102.6445, "step": 570 }, { "epoch": 0.10410268922598866, "grad_norm": 63.84375, "learning_rate": 9.997966744351053e-07, "loss": 104.0129, "step": 580 }, { "epoch": 0.10589756317816088, "grad_norm": 57.34375, "learning_rate": 9.997931688219175e-07, "loss": 103.0677, "step": 590 }, { "epoch": 0.1076924371303331, "grad_norm": 57.25, "learning_rate": 9.9978966320873e-07, "loss": 103.8832, "step": 600 }, { "epoch": 0.10948731108250531, "grad_norm": 57.90625, "learning_rate": 9.997861575955419e-07, "loss": 103.4752, "step": 610 }, { "epoch": 0.11128218503467753, "grad_norm": 62.46875, "learning_rate": 9.99782651982354e-07, "loss": 103.0433, "step": 620 }, { "epoch": 0.11307705898684975, "grad_norm": 58.21875, "learning_rate": 9.997791463691663e-07, "loss": 102.7304, "step": 630 }, { "epoch": 0.11487193293902197, "grad_norm": 54.53125, "learning_rate": 9.997756407559785e-07, "loss": 103.3318, "step": 640 }, { "epoch": 0.11666680689119419, "grad_norm": 60.125, "learning_rate": 9.997721351427907e-07, "loss": 103.5509, "step": 650 }, { "epoch": 0.1184616808433664, "grad_norm": 58.59375, "learning_rate": 9.997686295296027e-07, "loss": 102.8469, "step": 660 }, { "epoch": 0.12025655479553862, "grad_norm": 58.125, "learning_rate": 9.997651239164148e-07, "loss": 104.6024, "step": 670 }, { "epoch": 0.12205142874771084, "grad_norm": 57.78125, "learning_rate": 9.99761618303227e-07, "loss": 102.2974, "step": 680 }, { "epoch": 0.12384630269988306, "grad_norm": 53.65625, "learning_rate": 9.997581126900392e-07, "loss": 103.0835, "step": 690 }, { "epoch": 0.12564117665205526, "grad_norm": 53.75, "learning_rate": 9.997546070768514e-07, "loss": 104.0657, "step": 700 }, { "epoch": 0.1274360506042275, "grad_norm": 57.84375, "learning_rate": 9.997511014636636e-07, "loss": 103.865, "step": 710 }, { "epoch": 0.1292309245563997, "grad_norm": 55.1875, "learning_rate": 9.997475958504758e-07, "loss": 102.8469, "step": 720 }, { "epoch": 0.13102579850857193, "grad_norm": 54.375, "learning_rate": 9.997440902372878e-07, "loss": 103.573, "step": 730 }, { "epoch": 0.13282067246074414, "grad_norm": 55.34375, "learning_rate": 9.997405846241e-07, "loss": 102.4788, "step": 740 }, { "epoch": 0.13461554641291637, "grad_norm": 74.1875, "learning_rate": 9.997370790109122e-07, "loss": 104.8757, "step": 750 }, { "epoch": 0.13641042036508857, "grad_norm": 67.25, "learning_rate": 9.997335733977244e-07, "loss": 103.3008, "step": 760 }, { "epoch": 0.1382052943172608, "grad_norm": 50.625, "learning_rate": 9.997300677845366e-07, "loss": 102.5183, "step": 770 }, { "epoch": 0.140000168269433, "grad_norm": 57.34375, "learning_rate": 9.997265621713488e-07, "loss": 103.6885, "step": 780 }, { "epoch": 0.14179504222160524, "grad_norm": 57.15625, "learning_rate": 9.99723056558161e-07, "loss": 103.321, "step": 790 }, { "epoch": 0.14358991617377745, "grad_norm": 64.0625, "learning_rate": 9.997195509449732e-07, "loss": 102.9387, "step": 800 }, { "epoch": 0.14538479012594968, "grad_norm": 61.46875, "learning_rate": 9.997160453317851e-07, "loss": 102.8938, "step": 810 }, { "epoch": 0.14717966407812189, "grad_norm": 54.53125, "learning_rate": 9.997125397185973e-07, "loss": 102.8753, "step": 820 }, { "epoch": 0.14897453803029412, "grad_norm": 59.34375, "learning_rate": 9.997090341054095e-07, "loss": 102.9597, "step": 830 }, { "epoch": 0.15076941198246632, "grad_norm": 55.65625, "learning_rate": 9.997055284922217e-07, "loss": 102.3567, "step": 840 }, { "epoch": 0.15256428593463855, "grad_norm": 67.25, "learning_rate": 9.99702022879034e-07, "loss": 102.8148, "step": 850 }, { "epoch": 0.15435915988681076, "grad_norm": 53.15625, "learning_rate": 9.99698517265846e-07, "loss": 104.3594, "step": 860 }, { "epoch": 0.156154033838983, "grad_norm": 54.8125, "learning_rate": 9.996950116526583e-07, "loss": 102.0147, "step": 870 }, { "epoch": 0.1579489077911552, "grad_norm": 58.53125, "learning_rate": 9.996915060394703e-07, "loss": 103.0337, "step": 880 }, { "epoch": 0.15974378174332743, "grad_norm": 57.9375, "learning_rate": 9.996880004262825e-07, "loss": 103.2189, "step": 890 }, { "epoch": 0.16153865569549963, "grad_norm": 56.3125, "learning_rate": 9.996844948130947e-07, "loss": 102.6131, "step": 900 }, { "epoch": 0.16333352964767187, "grad_norm": 56.0, "learning_rate": 9.996809891999069e-07, "loss": 102.7586, "step": 910 }, { "epoch": 0.16512840359984407, "grad_norm": 53.0, "learning_rate": 9.99677483586719e-07, "loss": 101.981, "step": 920 }, { "epoch": 0.16692327755201627, "grad_norm": 56.4375, "learning_rate": 9.99673977973531e-07, "loss": 103.099, "step": 930 }, { "epoch": 0.1687181515041885, "grad_norm": 57.09375, "learning_rate": 9.996704723603432e-07, "loss": 103.5702, "step": 940 }, { "epoch": 0.1705130254563607, "grad_norm": 54.46875, "learning_rate": 9.996669667471554e-07, "loss": 102.8591, "step": 950 }, { "epoch": 0.17230789940853294, "grad_norm": 56.625, "learning_rate": 9.996634611339676e-07, "loss": 102.1516, "step": 960 }, { "epoch": 0.17410277336070515, "grad_norm": 55.0, "learning_rate": 9.996599555207798e-07, "loss": 103.4924, "step": 970 }, { "epoch": 0.17589764731287738, "grad_norm": 54.71875, "learning_rate": 9.99656449907592e-07, "loss": 102.3699, "step": 980 }, { "epoch": 0.17769252126504959, "grad_norm": 58.625, "learning_rate": 9.996529442944042e-07, "loss": 101.9164, "step": 990 }, { "epoch": 0.17948739521722182, "grad_norm": 53.1875, "learning_rate": 9.996494386812164e-07, "loss": 102.4251, "step": 1000 }, { "epoch": 0.18128226916939402, "grad_norm": 61.34375, "learning_rate": 9.996459330680284e-07, "loss": 101.8229, "step": 1010 }, { "epoch": 0.18307714312156625, "grad_norm": 56.8125, "learning_rate": 9.996424274548406e-07, "loss": 102.8062, "step": 1020 }, { "epoch": 0.18487201707373846, "grad_norm": 57.5625, "learning_rate": 9.996389218416528e-07, "loss": 102.8688, "step": 1030 }, { "epoch": 0.1866668910259107, "grad_norm": 53.75, "learning_rate": 9.99635416228465e-07, "loss": 102.0441, "step": 1040 }, { "epoch": 0.1884617649780829, "grad_norm": 55.3125, "learning_rate": 9.996319106152772e-07, "loss": 101.1412, "step": 1050 }, { "epoch": 0.19025663893025513, "grad_norm": 57.1875, "learning_rate": 9.996284050020894e-07, "loss": 102.8017, "step": 1060 }, { "epoch": 0.19205151288242733, "grad_norm": 55.09375, "learning_rate": 9.996248993889016e-07, "loss": 102.8312, "step": 1070 }, { "epoch": 0.19384638683459957, "grad_norm": 58.8125, "learning_rate": 9.996213937757135e-07, "loss": 100.9258, "step": 1080 }, { "epoch": 0.19564126078677177, "grad_norm": 53.09375, "learning_rate": 9.996178881625257e-07, "loss": 101.9567, "step": 1090 }, { "epoch": 0.197436134738944, "grad_norm": 55.3125, "learning_rate": 9.99614382549338e-07, "loss": 102.2161, "step": 1100 }, { "epoch": 0.1992310086911162, "grad_norm": 60.71875, "learning_rate": 9.996108769361501e-07, "loss": 102.0313, "step": 1110 }, { "epoch": 0.20102588264328844, "grad_norm": 54.28125, "learning_rate": 9.996073713229623e-07, "loss": 101.9621, "step": 1120 }, { "epoch": 0.20282075659546064, "grad_norm": 56.75, "learning_rate": 9.996038657097743e-07, "loss": 102.8201, "step": 1130 }, { "epoch": 0.20461563054763288, "grad_norm": 60.0, "learning_rate": 9.996003600965867e-07, "loss": 101.1011, "step": 1140 }, { "epoch": 0.20641050449980508, "grad_norm": 56.59375, "learning_rate": 9.99596854483399e-07, "loss": 102.2005, "step": 1150 }, { "epoch": 0.2082053784519773, "grad_norm": 60.875, "learning_rate": 9.995933488702109e-07, "loss": 102.4762, "step": 1160 }, { "epoch": 0.21000025240414952, "grad_norm": 61.65625, "learning_rate": 9.99589843257023e-07, "loss": 101.5841, "step": 1170 }, { "epoch": 0.21179512635632175, "grad_norm": 51.375, "learning_rate": 9.995863376438353e-07, "loss": 101.5104, "step": 1180 }, { "epoch": 0.21359000030849395, "grad_norm": 62.5625, "learning_rate": 9.995828320306475e-07, "loss": 102.6215, "step": 1190 }, { "epoch": 0.2153848742606662, "grad_norm": 57.25, "learning_rate": 9.995793264174597e-07, "loss": 100.9618, "step": 1200 }, { "epoch": 0.2171797482128384, "grad_norm": 53.40625, "learning_rate": 9.995758208042716e-07, "loss": 102.1445, "step": 1210 }, { "epoch": 0.21897462216501062, "grad_norm": 61.5, "learning_rate": 9.995723151910838e-07, "loss": 102.3304, "step": 1220 }, { "epoch": 0.22076949611718283, "grad_norm": 58.09375, "learning_rate": 9.99568809577896e-07, "loss": 103.4687, "step": 1230 }, { "epoch": 0.22256437006935506, "grad_norm": 56.5625, "learning_rate": 9.995653039647082e-07, "loss": 102.9005, "step": 1240 }, { "epoch": 0.22435924402152727, "grad_norm": 53.53125, "learning_rate": 9.995617983515204e-07, "loss": 101.6564, "step": 1250 }, { "epoch": 0.2261541179736995, "grad_norm": 55.03125, "learning_rate": 9.995582927383326e-07, "loss": 102.7027, "step": 1260 }, { "epoch": 0.2279489919258717, "grad_norm": 56.09375, "learning_rate": 9.995547871251448e-07, "loss": 102.3057, "step": 1270 }, { "epoch": 0.22974386587804393, "grad_norm": 111.0, "learning_rate": 9.995512815119568e-07, "loss": 102.2795, "step": 1280 }, { "epoch": 0.23153873983021614, "grad_norm": 53.25, "learning_rate": 9.99547775898769e-07, "loss": 102.1083, "step": 1290 }, { "epoch": 0.23333361378238837, "grad_norm": 56.4375, "learning_rate": 9.995442702855812e-07, "loss": 101.5716, "step": 1300 }, { "epoch": 0.23512848773456058, "grad_norm": 56.21875, "learning_rate": 9.995407646723934e-07, "loss": 102.0678, "step": 1310 }, { "epoch": 0.2369233616867328, "grad_norm": 56.4375, "learning_rate": 9.995372590592056e-07, "loss": 101.759, "step": 1320 }, { "epoch": 0.238718235638905, "grad_norm": 57.5625, "learning_rate": 9.995337534460178e-07, "loss": 101.938, "step": 1330 }, { "epoch": 0.24051310959107725, "grad_norm": 52.96875, "learning_rate": 9.9953024783283e-07, "loss": 100.721, "step": 1340 }, { "epoch": 0.24230798354324945, "grad_norm": 55.09375, "learning_rate": 9.995267422196421e-07, "loss": 101.3094, "step": 1350 }, { "epoch": 0.24410285749542168, "grad_norm": 59.125, "learning_rate": 9.995232366064541e-07, "loss": 101.9257, "step": 1360 }, { "epoch": 0.2458977314475939, "grad_norm": 54.5625, "learning_rate": 9.995197309932663e-07, "loss": 102.262, "step": 1370 }, { "epoch": 0.24769260539976612, "grad_norm": 59.0625, "learning_rate": 9.995162253800785e-07, "loss": 101.121, "step": 1380 }, { "epoch": 0.24948747935193832, "grad_norm": 56.90625, "learning_rate": 9.995127197668907e-07, "loss": 103.3174, "step": 1390 }, { "epoch": 0.25128235330411053, "grad_norm": 61.15625, "learning_rate": 9.99509214153703e-07, "loss": 101.7487, "step": 1400 }, { "epoch": 0.25307722725628273, "grad_norm": 56.84375, "learning_rate": 9.99505708540515e-07, "loss": 102.6137, "step": 1410 }, { "epoch": 0.254872101208455, "grad_norm": 56.1875, "learning_rate": 9.995022029273273e-07, "loss": 101.4798, "step": 1420 }, { "epoch": 0.2566669751606272, "grad_norm": 53.3125, "learning_rate": 9.994986973141395e-07, "loss": 102.564, "step": 1430 }, { "epoch": 0.2584618491127994, "grad_norm": 60.71875, "learning_rate": 9.994951917009515e-07, "loss": 101.3267, "step": 1440 }, { "epoch": 0.2602567230649716, "grad_norm": 63.96875, "learning_rate": 9.994916860877637e-07, "loss": 101.9713, "step": 1450 }, { "epoch": 0.26205159701714387, "grad_norm": 56.0625, "learning_rate": 9.994881804745759e-07, "loss": 102.2705, "step": 1460 }, { "epoch": 0.26384647096931607, "grad_norm": 59.4375, "learning_rate": 9.99484674861388e-07, "loss": 101.6555, "step": 1470 }, { "epoch": 0.2656413449214883, "grad_norm": 57.3125, "learning_rate": 9.994811692482e-07, "loss": 100.9821, "step": 1480 }, { "epoch": 0.2674362188736605, "grad_norm": 57.25, "learning_rate": 9.994776636350122e-07, "loss": 100.6823, "step": 1490 }, { "epoch": 0.26923109282583274, "grad_norm": 52.0, "learning_rate": 9.994741580218246e-07, "loss": 101.6088, "step": 1500 }, { "epoch": 0.27102596677800495, "grad_norm": 57.9375, "learning_rate": 9.994706524086366e-07, "loss": 102.9636, "step": 1510 }, { "epoch": 0.27282084073017715, "grad_norm": 52.0625, "learning_rate": 9.994671467954488e-07, "loss": 100.9283, "step": 1520 }, { "epoch": 0.27461571468234935, "grad_norm": 60.125, "learning_rate": 9.99463641182261e-07, "loss": 102.0222, "step": 1530 }, { "epoch": 0.2764105886345216, "grad_norm": 56.1875, "learning_rate": 9.994601355690732e-07, "loss": 100.9025, "step": 1540 }, { "epoch": 0.2782054625866938, "grad_norm": 53.375, "learning_rate": 9.994566299558854e-07, "loss": 100.8715, "step": 1550 }, { "epoch": 0.280000336538866, "grad_norm": 55.59375, "learning_rate": 9.994531243426974e-07, "loss": 102.0768, "step": 1560 }, { "epoch": 0.28179521049103823, "grad_norm": 67.75, "learning_rate": 9.994496187295096e-07, "loss": 102.5791, "step": 1570 }, { "epoch": 0.2835900844432105, "grad_norm": 57.21875, "learning_rate": 9.994461131163218e-07, "loss": 102.531, "step": 1580 }, { "epoch": 0.2853849583953827, "grad_norm": 60.40625, "learning_rate": 9.99442607503134e-07, "loss": 102.0928, "step": 1590 }, { "epoch": 0.2871798323475549, "grad_norm": 53.8125, "learning_rate": 9.994391018899462e-07, "loss": 101.2671, "step": 1600 }, { "epoch": 0.2889747062997271, "grad_norm": 55.4375, "learning_rate": 9.994355962767583e-07, "loss": 102.0377, "step": 1610 }, { "epoch": 0.29076958025189936, "grad_norm": 57.15625, "learning_rate": 9.994320906635705e-07, "loss": 102.7935, "step": 1620 }, { "epoch": 0.29256445420407157, "grad_norm": 59.1875, "learning_rate": 9.994285850503827e-07, "loss": 101.0506, "step": 1630 }, { "epoch": 0.29435932815624377, "grad_norm": 59.75, "learning_rate": 9.994250794371947e-07, "loss": 101.6031, "step": 1640 }, { "epoch": 0.296154202108416, "grad_norm": 58.28125, "learning_rate": 9.99421573824007e-07, "loss": 102.0068, "step": 1650 }, { "epoch": 0.29794907606058824, "grad_norm": 60.875, "learning_rate": 9.994180682108191e-07, "loss": 101.5614, "step": 1660 }, { "epoch": 0.29974395001276044, "grad_norm": 52.03125, "learning_rate": 9.994145625976313e-07, "loss": 101.9509, "step": 1670 }, { "epoch": 0.30153882396493265, "grad_norm": 51.125, "learning_rate": 9.994110569844435e-07, "loss": 100.5649, "step": 1680 }, { "epoch": 0.30333369791710485, "grad_norm": 55.8125, "learning_rate": 9.994075513712557e-07, "loss": 102.3571, "step": 1690 }, { "epoch": 0.3051285718692771, "grad_norm": 55.75, "learning_rate": 9.994040457580679e-07, "loss": 102.2168, "step": 1700 }, { "epoch": 0.3069234458214493, "grad_norm": 58.46875, "learning_rate": 9.994005401448799e-07, "loss": 101.87, "step": 1710 }, { "epoch": 0.3087183197736215, "grad_norm": 52.96875, "learning_rate": 9.99397034531692e-07, "loss": 102.0437, "step": 1720 }, { "epoch": 0.3105131937257937, "grad_norm": 57.09375, "learning_rate": 9.993935289185043e-07, "loss": 100.8123, "step": 1730 }, { "epoch": 0.312308067677966, "grad_norm": 57.78125, "learning_rate": 9.993900233053164e-07, "loss": 101.3681, "step": 1740 }, { "epoch": 0.3141029416301382, "grad_norm": 54.78125, "learning_rate": 9.993865176921286e-07, "loss": 102.2902, "step": 1750 }, { "epoch": 0.3158978155823104, "grad_norm": 62.1875, "learning_rate": 9.993830120789406e-07, "loss": 100.0665, "step": 1760 }, { "epoch": 0.3176926895344826, "grad_norm": 54.84375, "learning_rate": 9.993795064657528e-07, "loss": 101.723, "step": 1770 }, { "epoch": 0.31948756348665486, "grad_norm": 56.65625, "learning_rate": 9.993760008525652e-07, "loss": 100.8694, "step": 1780 }, { "epoch": 0.32128243743882706, "grad_norm": 55.96875, "learning_rate": 9.993724952393772e-07, "loss": 100.4624, "step": 1790 }, { "epoch": 0.32307731139099927, "grad_norm": 53.96875, "learning_rate": 9.993689896261894e-07, "loss": 101.9399, "step": 1800 }, { "epoch": 0.32487218534317147, "grad_norm": 54.375, "learning_rate": 9.993654840130016e-07, "loss": 100.8476, "step": 1810 }, { "epoch": 0.32666705929534373, "grad_norm": 56.34375, "learning_rate": 9.993619783998138e-07, "loss": 100.3165, "step": 1820 }, { "epoch": 0.32846193324751594, "grad_norm": 56.9375, "learning_rate": 9.99358472786626e-07, "loss": 101.3776, "step": 1830 }, { "epoch": 0.33025680719968814, "grad_norm": 55.375, "learning_rate": 9.99354967173438e-07, "loss": 101.8946, "step": 1840 }, { "epoch": 0.33205168115186035, "grad_norm": 57.5, "learning_rate": 9.993514615602502e-07, "loss": 100.4206, "step": 1850 }, { "epoch": 0.33384655510403255, "grad_norm": 54.25, "learning_rate": 9.993479559470624e-07, "loss": 101.984, "step": 1860 }, { "epoch": 0.3356414290562048, "grad_norm": 57.40625, "learning_rate": 9.993444503338745e-07, "loss": 101.568, "step": 1870 }, { "epoch": 0.337436303008377, "grad_norm": 52.40625, "learning_rate": 9.993409447206867e-07, "loss": 101.8589, "step": 1880 }, { "epoch": 0.3392311769605492, "grad_norm": 53.65625, "learning_rate": 9.99337439107499e-07, "loss": 101.0918, "step": 1890 }, { "epoch": 0.3410260509127214, "grad_norm": 57.46875, "learning_rate": 9.993339334943111e-07, "loss": 100.8577, "step": 1900 }, { "epoch": 0.3428209248648937, "grad_norm": 55.03125, "learning_rate": 9.993304278811231e-07, "loss": 101.4831, "step": 1910 }, { "epoch": 0.3446157988170659, "grad_norm": 58.5625, "learning_rate": 9.993269222679353e-07, "loss": 100.9822, "step": 1920 }, { "epoch": 0.3464106727692381, "grad_norm": 53.3125, "learning_rate": 9.993234166547475e-07, "loss": 101.5429, "step": 1930 }, { "epoch": 0.3482055467214103, "grad_norm": 55.4375, "learning_rate": 9.993199110415597e-07, "loss": 100.7439, "step": 1940 }, { "epoch": 0.35000042067358256, "grad_norm": 54.59375, "learning_rate": 9.993164054283719e-07, "loss": 101.6689, "step": 1950 }, { "epoch": 0.35179529462575476, "grad_norm": 52.5625, "learning_rate": 9.99312899815184e-07, "loss": 101.7466, "step": 1960 }, { "epoch": 0.35359016857792697, "grad_norm": 56.625, "learning_rate": 9.993093942019963e-07, "loss": 101.7688, "step": 1970 }, { "epoch": 0.35538504253009917, "grad_norm": 59.78125, "learning_rate": 9.993058885888085e-07, "loss": 103.273, "step": 1980 }, { "epoch": 0.35717991648227143, "grad_norm": 54.53125, "learning_rate": 9.993023829756205e-07, "loss": 100.4256, "step": 1990 }, { "epoch": 0.35897479043444364, "grad_norm": 57.40625, "learning_rate": 9.992988773624327e-07, "loss": 101.7477, "step": 2000 }, { "epoch": 0.35897479043444364, "eval_loss": 1.5870920419692993, "eval_runtime": 199.616, "eval_samples_per_second": 1465.674, "eval_steps_per_second": 45.803, "step": 2000 }, { "epoch": 0.36076966438661584, "grad_norm": 57.59375, "learning_rate": 9.992953717492448e-07, "loss": 102.2176, "step": 2010 }, { "epoch": 0.36256453833878804, "grad_norm": 62.09375, "learning_rate": 9.99291866136057e-07, "loss": 100.658, "step": 2020 }, { "epoch": 0.3643594122909603, "grad_norm": 57.59375, "learning_rate": 9.992883605228692e-07, "loss": 101.9662, "step": 2030 }, { "epoch": 0.3661542862431325, "grad_norm": 55.3125, "learning_rate": 9.992848549096812e-07, "loss": 101.5475, "step": 2040 }, { "epoch": 0.3679491601953047, "grad_norm": 55.15625, "learning_rate": 9.992813492964936e-07, "loss": 101.7496, "step": 2050 }, { "epoch": 0.3697440341474769, "grad_norm": 53.78125, "learning_rate": 9.992778436833056e-07, "loss": 101.7626, "step": 2060 }, { "epoch": 0.3715389080996492, "grad_norm": 55.53125, "learning_rate": 9.992743380701178e-07, "loss": 102.3083, "step": 2070 }, { "epoch": 0.3733337820518214, "grad_norm": 56.65625, "learning_rate": 9.9927083245693e-07, "loss": 101.3427, "step": 2080 }, { "epoch": 0.3751286560039936, "grad_norm": 61.375, "learning_rate": 9.992673268437422e-07, "loss": 100.4862, "step": 2090 }, { "epoch": 0.3769235299561658, "grad_norm": 84.5, "learning_rate": 9.992638212305544e-07, "loss": 101.2625, "step": 2100 }, { "epoch": 0.37871840390833805, "grad_norm": 60.75, "learning_rate": 9.992603156173664e-07, "loss": 101.8961, "step": 2110 }, { "epoch": 0.38051327786051026, "grad_norm": 56.8125, "learning_rate": 9.992568100041786e-07, "loss": 100.7833, "step": 2120 }, { "epoch": 0.38230815181268246, "grad_norm": 56.0, "learning_rate": 9.992533043909908e-07, "loss": 101.3713, "step": 2130 }, { "epoch": 0.38410302576485467, "grad_norm": 54.625, "learning_rate": 9.99249798777803e-07, "loss": 101.6192, "step": 2140 }, { "epoch": 0.3858978997170269, "grad_norm": 61.1875, "learning_rate": 9.992462931646151e-07, "loss": 101.346, "step": 2150 }, { "epoch": 0.38769277366919913, "grad_norm": 58.40625, "learning_rate": 9.992427875514273e-07, "loss": 101.0517, "step": 2160 }, { "epoch": 0.38948764762137134, "grad_norm": 58.78125, "learning_rate": 9.992392819382395e-07, "loss": 101.618, "step": 2170 }, { "epoch": 0.39128252157354354, "grad_norm": 55.03125, "learning_rate": 9.992357763250517e-07, "loss": 101.7176, "step": 2180 }, { "epoch": 0.3930773955257158, "grad_norm": 58.40625, "learning_rate": 9.992322707118637e-07, "loss": 99.9457, "step": 2190 }, { "epoch": 0.394872269477888, "grad_norm": 53.0, "learning_rate": 9.99228765098676e-07, "loss": 101.6802, "step": 2200 }, { "epoch": 0.3966671434300602, "grad_norm": 53.40625, "learning_rate": 9.99225259485488e-07, "loss": 100.4554, "step": 2210 }, { "epoch": 0.3984620173822324, "grad_norm": 57.6875, "learning_rate": 9.992217538723003e-07, "loss": 99.7695, "step": 2220 }, { "epoch": 0.4002568913344047, "grad_norm": 57.1875, "learning_rate": 9.992182482591125e-07, "loss": 100.7215, "step": 2230 }, { "epoch": 0.4020517652865769, "grad_norm": 55.21875, "learning_rate": 9.992147426459247e-07, "loss": 102.5693, "step": 2240 }, { "epoch": 0.4038466392387491, "grad_norm": 52.4375, "learning_rate": 9.992112370327369e-07, "loss": 101.3943, "step": 2250 }, { "epoch": 0.4056415131909213, "grad_norm": 56.9375, "learning_rate": 9.992077314195489e-07, "loss": 102.1974, "step": 2260 }, { "epoch": 0.40743638714309355, "grad_norm": 61.71875, "learning_rate": 9.99204225806361e-07, "loss": 100.8492, "step": 2270 }, { "epoch": 0.40923126109526575, "grad_norm": 56.9375, "learning_rate": 9.992007201931732e-07, "loss": 100.8497, "step": 2280 }, { "epoch": 0.41102613504743796, "grad_norm": 57.71875, "learning_rate": 9.991972145799854e-07, "loss": 101.1414, "step": 2290 }, { "epoch": 0.41282100899961016, "grad_norm": 52.625, "learning_rate": 9.991937089667976e-07, "loss": 101.3806, "step": 2300 }, { "epoch": 0.41461588295178237, "grad_norm": 54.25, "learning_rate": 9.991902033536096e-07, "loss": 101.2679, "step": 2310 }, { "epoch": 0.4164107569039546, "grad_norm": 55.59375, "learning_rate": 9.99186697740422e-07, "loss": 100.5695, "step": 2320 }, { "epoch": 0.41820563085612683, "grad_norm": 64.75, "learning_rate": 9.991831921272342e-07, "loss": 100.9625, "step": 2330 }, { "epoch": 0.42000050480829904, "grad_norm": 59.78125, "learning_rate": 9.991796865140462e-07, "loss": 100.6983, "step": 2340 }, { "epoch": 0.42179537876047124, "grad_norm": 54.46875, "learning_rate": 9.991761809008584e-07, "loss": 100.4711, "step": 2350 }, { "epoch": 0.4235902527126435, "grad_norm": 50.9375, "learning_rate": 9.991726752876706e-07, "loss": 101.1268, "step": 2360 }, { "epoch": 0.4253851266648157, "grad_norm": 54.40625, "learning_rate": 9.991691696744828e-07, "loss": 100.953, "step": 2370 }, { "epoch": 0.4271800006169879, "grad_norm": 60.71875, "learning_rate": 9.99165664061295e-07, "loss": 101.4937, "step": 2380 }, { "epoch": 0.4289748745691601, "grad_norm": 59.15625, "learning_rate": 9.99162158448107e-07, "loss": 98.9683, "step": 2390 }, { "epoch": 0.4307697485213324, "grad_norm": 57.4375, "learning_rate": 9.991586528349191e-07, "loss": 101.705, "step": 2400 }, { "epoch": 0.4325646224735046, "grad_norm": 59.15625, "learning_rate": 9.991551472217313e-07, "loss": 100.5508, "step": 2410 }, { "epoch": 0.4343594964256768, "grad_norm": 54.5, "learning_rate": 9.991516416085435e-07, "loss": 101.0069, "step": 2420 }, { "epoch": 0.436154370377849, "grad_norm": 56.125, "learning_rate": 9.991481359953557e-07, "loss": 101.6237, "step": 2430 }, { "epoch": 0.43794924433002125, "grad_norm": 52.78125, "learning_rate": 9.99144630382168e-07, "loss": 100.7629, "step": 2440 }, { "epoch": 0.43974411828219345, "grad_norm": 57.28125, "learning_rate": 9.991411247689801e-07, "loss": 99.8311, "step": 2450 }, { "epoch": 0.44153899223436566, "grad_norm": 61.125, "learning_rate": 9.99137619155792e-07, "loss": 100.6114, "step": 2460 }, { "epoch": 0.44333386618653786, "grad_norm": 57.1875, "learning_rate": 9.991341135426043e-07, "loss": 101.98, "step": 2470 }, { "epoch": 0.4451287401387101, "grad_norm": 59.90625, "learning_rate": 9.991306079294165e-07, "loss": 101.4968, "step": 2480 }, { "epoch": 0.4469236140908823, "grad_norm": 63.375, "learning_rate": 9.991271023162287e-07, "loss": 101.6943, "step": 2490 }, { "epoch": 0.44871848804305453, "grad_norm": 59.21875, "learning_rate": 9.991235967030409e-07, "loss": 100.9488, "step": 2500 }, { "epoch": 0.45051336199522674, "grad_norm": 58.5625, "learning_rate": 9.99120091089853e-07, "loss": 100.7646, "step": 2510 }, { "epoch": 0.452308235947399, "grad_norm": 62.40625, "learning_rate": 9.991165854766653e-07, "loss": 101.5989, "step": 2520 }, { "epoch": 0.4541031098995712, "grad_norm": 57.3125, "learning_rate": 9.991130798634775e-07, "loss": 101.8999, "step": 2530 }, { "epoch": 0.4558979838517434, "grad_norm": 52.6875, "learning_rate": 9.991095742502894e-07, "loss": 99.8278, "step": 2540 }, { "epoch": 0.4576928578039156, "grad_norm": 53.46875, "learning_rate": 9.991060686371016e-07, "loss": 100.9303, "step": 2550 }, { "epoch": 0.45948773175608787, "grad_norm": 55.21875, "learning_rate": 9.991025630239138e-07, "loss": 99.6612, "step": 2560 }, { "epoch": 0.4612826057082601, "grad_norm": 57.4375, "learning_rate": 9.99099057410726e-07, "loss": 102.1943, "step": 2570 }, { "epoch": 0.4630774796604323, "grad_norm": 55.15625, "learning_rate": 9.990955517975382e-07, "loss": 101.1615, "step": 2580 }, { "epoch": 0.4648723536126045, "grad_norm": 56.90625, "learning_rate": 9.990920461843504e-07, "loss": 100.7708, "step": 2590 }, { "epoch": 0.46666722756477674, "grad_norm": 59.8125, "learning_rate": 9.990885405711626e-07, "loss": 99.9972, "step": 2600 }, { "epoch": 0.46846210151694895, "grad_norm": 58.53125, "learning_rate": 9.990850349579746e-07, "loss": 100.8372, "step": 2610 }, { "epoch": 0.47025697546912115, "grad_norm": 62.8125, "learning_rate": 9.990815293447868e-07, "loss": 99.2942, "step": 2620 }, { "epoch": 0.47205184942129336, "grad_norm": 58.0, "learning_rate": 9.99078023731599e-07, "loss": 100.7167, "step": 2630 }, { "epoch": 0.4738467233734656, "grad_norm": 56.28125, "learning_rate": 9.990745181184112e-07, "loss": 101.028, "step": 2640 }, { "epoch": 0.4756415973256378, "grad_norm": 56.0, "learning_rate": 9.990710125052234e-07, "loss": 100.8192, "step": 2650 }, { "epoch": 0.47743647127781, "grad_norm": 55.625, "learning_rate": 9.990675068920353e-07, "loss": 100.6052, "step": 2660 }, { "epoch": 0.47923134522998223, "grad_norm": 55.03125, "learning_rate": 9.990640012788475e-07, "loss": 100.9563, "step": 2670 }, { "epoch": 0.4810262191821545, "grad_norm": 58.625, "learning_rate": 9.9906049566566e-07, "loss": 100.1024, "step": 2680 }, { "epoch": 0.4828210931343267, "grad_norm": 60.96875, "learning_rate": 9.99056990052472e-07, "loss": 101.6276, "step": 2690 }, { "epoch": 0.4846159670864989, "grad_norm": 53.53125, "learning_rate": 9.990534844392841e-07, "loss": 100.4488, "step": 2700 }, { "epoch": 0.4864108410386711, "grad_norm": 57.78125, "learning_rate": 9.990499788260963e-07, "loss": 100.9927, "step": 2710 }, { "epoch": 0.48820571499084336, "grad_norm": 56.125, "learning_rate": 9.990464732129085e-07, "loss": 101.7882, "step": 2720 }, { "epoch": 0.49000058894301557, "grad_norm": 55.71875, "learning_rate": 9.990429675997207e-07, "loss": 99.6567, "step": 2730 }, { "epoch": 0.4917954628951878, "grad_norm": 57.53125, "learning_rate": 9.990394619865327e-07, "loss": 101.4762, "step": 2740 }, { "epoch": 0.49359033684736, "grad_norm": 53.96875, "learning_rate": 9.990359563733449e-07, "loss": 100.3731, "step": 2750 }, { "epoch": 0.49538521079953224, "grad_norm": 61.0, "learning_rate": 9.99032450760157e-07, "loss": 99.6628, "step": 2760 }, { "epoch": 0.49718008475170444, "grad_norm": 55.875, "learning_rate": 9.990289451469693e-07, "loss": 100.9941, "step": 2770 }, { "epoch": 0.49897495870387665, "grad_norm": 53.40625, "learning_rate": 9.990254395337815e-07, "loss": 101.2516, "step": 2780 }, { "epoch": 0.5007698326560489, "grad_norm": 62.5625, "learning_rate": 9.990219339205937e-07, "loss": 100.9763, "step": 2790 }, { "epoch": 0.5025647066082211, "grad_norm": 54.78125, "learning_rate": 9.990184283074059e-07, "loss": 99.8499, "step": 2800 }, { "epoch": 0.5043595805603933, "grad_norm": 59.125, "learning_rate": 9.990149226942178e-07, "loss": 100.2553, "step": 2810 }, { "epoch": 0.5061544545125655, "grad_norm": 54.625, "learning_rate": 9.9901141708103e-07, "loss": 100.0908, "step": 2820 }, { "epoch": 0.5079493284647377, "grad_norm": 53.875, "learning_rate": 9.990079114678422e-07, "loss": 99.8332, "step": 2830 }, { "epoch": 0.50974420241691, "grad_norm": 67.3125, "learning_rate": 9.990044058546544e-07, "loss": 101.4204, "step": 2840 }, { "epoch": 0.5115390763690821, "grad_norm": 57.8125, "learning_rate": 9.990009002414666e-07, "loss": 100.5515, "step": 2850 }, { "epoch": 0.5133339503212544, "grad_norm": 55.96875, "learning_rate": 9.989973946282786e-07, "loss": 100.3896, "step": 2860 }, { "epoch": 0.5151288242734267, "grad_norm": 57.8125, "learning_rate": 9.98993889015091e-07, "loss": 101.921, "step": 2870 }, { "epoch": 0.5169236982255988, "grad_norm": 51.40625, "learning_rate": 9.989903834019032e-07, "loss": 100.1915, "step": 2880 }, { "epoch": 0.5187185721777711, "grad_norm": 57.25, "learning_rate": 9.989868777887152e-07, "loss": 100.3448, "step": 2890 }, { "epoch": 0.5205134461299432, "grad_norm": 52.40625, "learning_rate": 9.989833721755274e-07, "loss": 100.6079, "step": 2900 }, { "epoch": 0.5223083200821155, "grad_norm": 53.28125, "learning_rate": 9.989798665623396e-07, "loss": 101.4967, "step": 2910 }, { "epoch": 0.5241031940342877, "grad_norm": 54.40625, "learning_rate": 9.989763609491518e-07, "loss": 100.2131, "step": 2920 }, { "epoch": 0.5258980679864599, "grad_norm": 60.59375, "learning_rate": 9.98972855335964e-07, "loss": 100.2743, "step": 2930 }, { "epoch": 0.5276929419386321, "grad_norm": 55.5625, "learning_rate": 9.98969349722776e-07, "loss": 100.4026, "step": 2940 }, { "epoch": 0.5294878158908044, "grad_norm": 60.125, "learning_rate": 9.989658441095881e-07, "loss": 101.1321, "step": 2950 }, { "epoch": 0.5312826898429766, "grad_norm": 56.125, "learning_rate": 9.989623384964005e-07, "loss": 100.6127, "step": 2960 }, { "epoch": 0.5330775637951488, "grad_norm": 56.0, "learning_rate": 9.989588328832125e-07, "loss": 100.822, "step": 2970 }, { "epoch": 0.534872437747321, "grad_norm": 58.65625, "learning_rate": 9.989553272700247e-07, "loss": 100.8803, "step": 2980 }, { "epoch": 0.5366673116994932, "grad_norm": 52.875, "learning_rate": 9.98951821656837e-07, "loss": 100.7196, "step": 2990 }, { "epoch": 0.5384621856516655, "grad_norm": 57.3125, "learning_rate": 9.98948316043649e-07, "loss": 100.049, "step": 3000 }, { "epoch": 0.5402570596038376, "grad_norm": 54.96875, "learning_rate": 9.989448104304613e-07, "loss": 99.7241, "step": 3010 }, { "epoch": 0.5420519335560099, "grad_norm": 57.15625, "learning_rate": 9.989413048172733e-07, "loss": 100.1751, "step": 3020 }, { "epoch": 0.5438468075081822, "grad_norm": 56.375, "learning_rate": 9.989377992040855e-07, "loss": 100.9655, "step": 3030 }, { "epoch": 0.5456416814603543, "grad_norm": 52.09375, "learning_rate": 9.989342935908977e-07, "loss": 101.2898, "step": 3040 }, { "epoch": 0.5474365554125266, "grad_norm": 54.625, "learning_rate": 9.989307879777099e-07, "loss": 100.3283, "step": 3050 }, { "epoch": 0.5492314293646987, "grad_norm": 55.21875, "learning_rate": 9.98927282364522e-07, "loss": 99.0049, "step": 3060 }, { "epoch": 0.551026303316871, "grad_norm": 55.4375, "learning_rate": 9.989237767513343e-07, "loss": 100.6654, "step": 3070 }, { "epoch": 0.5528211772690432, "grad_norm": 55.96875, "learning_rate": 9.989202711381464e-07, "loss": 100.3892, "step": 3080 }, { "epoch": 0.5546160512212154, "grad_norm": 61.875, "learning_rate": 9.989167655249584e-07, "loss": 99.8484, "step": 3090 }, { "epoch": 0.5564109251733876, "grad_norm": 49.375, "learning_rate": 9.989132599117706e-07, "loss": 100.2157, "step": 3100 }, { "epoch": 0.5582057991255599, "grad_norm": 53.5625, "learning_rate": 9.989097542985828e-07, "loss": 99.1748, "step": 3110 }, { "epoch": 0.560000673077732, "grad_norm": 55.25, "learning_rate": 9.98906248685395e-07, "loss": 100.8326, "step": 3120 }, { "epoch": 0.5617955470299043, "grad_norm": 57.71875, "learning_rate": 9.989027430722072e-07, "loss": 100.5236, "step": 3130 }, { "epoch": 0.5635904209820765, "grad_norm": 56.6875, "learning_rate": 9.988992374590194e-07, "loss": 99.4909, "step": 3140 }, { "epoch": 0.5653852949342487, "grad_norm": 56.90625, "learning_rate": 9.988957318458316e-07, "loss": 99.4755, "step": 3150 }, { "epoch": 0.567180168886421, "grad_norm": 56.375, "learning_rate": 9.988922262326438e-07, "loss": 99.6661, "step": 3160 }, { "epoch": 0.5689750428385931, "grad_norm": 56.875, "learning_rate": 9.988887206194558e-07, "loss": 100.5558, "step": 3170 }, { "epoch": 0.5707699167907654, "grad_norm": 64.125, "learning_rate": 9.98885215006268e-07, "loss": 100.403, "step": 3180 }, { "epoch": 0.5725647907429376, "grad_norm": 55.53125, "learning_rate": 9.988817093930802e-07, "loss": 101.0716, "step": 3190 }, { "epoch": 0.5743596646951098, "grad_norm": 54.625, "learning_rate": 9.988782037798924e-07, "loss": 99.6999, "step": 3200 }, { "epoch": 0.576154538647282, "grad_norm": 63.03125, "learning_rate": 9.988746981667045e-07, "loss": 100.4395, "step": 3210 }, { "epoch": 0.5779494125994542, "grad_norm": 59.0, "learning_rate": 9.988711925535165e-07, "loss": 101.8542, "step": 3220 }, { "epoch": 0.5797442865516265, "grad_norm": 58.46875, "learning_rate": 9.98867686940329e-07, "loss": 99.4703, "step": 3230 }, { "epoch": 0.5815391605037987, "grad_norm": 54.3125, "learning_rate": 9.98864181327141e-07, "loss": 101.0656, "step": 3240 }, { "epoch": 0.5833340344559709, "grad_norm": 54.03125, "learning_rate": 9.988606757139531e-07, "loss": 100.0247, "step": 3250 }, { "epoch": 0.5851289084081431, "grad_norm": 57.0625, "learning_rate": 9.988571701007653e-07, "loss": 100.5174, "step": 3260 }, { "epoch": 0.5869237823603153, "grad_norm": 56.1875, "learning_rate": 9.988536644875775e-07, "loss": 100.5157, "step": 3270 }, { "epoch": 0.5887186563124875, "grad_norm": 51.6875, "learning_rate": 9.988501588743897e-07, "loss": 99.3698, "step": 3280 }, { "epoch": 0.5905135302646598, "grad_norm": 60.5625, "learning_rate": 9.988466532612017e-07, "loss": 100.1518, "step": 3290 }, { "epoch": 0.592308404216832, "grad_norm": 59.40625, "learning_rate": 9.988431476480139e-07, "loss": 100.5745, "step": 3300 }, { "epoch": 0.5941032781690042, "grad_norm": 54.5, "learning_rate": 9.98839642034826e-07, "loss": 101.8106, "step": 3310 }, { "epoch": 0.5958981521211765, "grad_norm": 61.03125, "learning_rate": 9.988361364216383e-07, "loss": 100.4201, "step": 3320 }, { "epoch": 0.5976930260733486, "grad_norm": 57.15625, "learning_rate": 9.988326308084505e-07, "loss": 100.6724, "step": 3330 }, { "epoch": 0.5994879000255209, "grad_norm": 51.03125, "learning_rate": 9.988291251952626e-07, "loss": 100.1729, "step": 3340 }, { "epoch": 0.601282773977693, "grad_norm": 57.71875, "learning_rate": 9.988256195820748e-07, "loss": 100.661, "step": 3350 }, { "epoch": 0.6030776479298653, "grad_norm": 57.25, "learning_rate": 9.98822113968887e-07, "loss": 101.6807, "step": 3360 }, { "epoch": 0.6048725218820376, "grad_norm": 56.625, "learning_rate": 9.98818608355699e-07, "loss": 101.2, "step": 3370 }, { "epoch": 0.6066673958342097, "grad_norm": 60.5, "learning_rate": 9.988151027425112e-07, "loss": 99.7801, "step": 3380 }, { "epoch": 0.608462269786382, "grad_norm": 51.375, "learning_rate": 9.988115971293234e-07, "loss": 100.2877, "step": 3390 }, { "epoch": 0.6102571437385542, "grad_norm": 53.40625, "learning_rate": 9.988080915161356e-07, "loss": 99.2708, "step": 3400 }, { "epoch": 0.6120520176907264, "grad_norm": 51.84375, "learning_rate": 9.988045859029478e-07, "loss": 100.1983, "step": 3410 }, { "epoch": 0.6138468916428986, "grad_norm": 57.28125, "learning_rate": 9.9880108028976e-07, "loss": 99.6957, "step": 3420 }, { "epoch": 0.6156417655950708, "grad_norm": 60.71875, "learning_rate": 9.987975746765722e-07, "loss": 100.4678, "step": 3430 }, { "epoch": 0.617436639547243, "grad_norm": 56.84375, "learning_rate": 9.987940690633842e-07, "loss": 101.1054, "step": 3440 }, { "epoch": 0.6192315134994153, "grad_norm": 55.1875, "learning_rate": 9.987905634501964e-07, "loss": 100.2326, "step": 3450 }, { "epoch": 0.6210263874515874, "grad_norm": 59.375, "learning_rate": 9.987870578370086e-07, "loss": 101.0535, "step": 3460 }, { "epoch": 0.6228212614037597, "grad_norm": 58.40625, "learning_rate": 9.987835522238207e-07, "loss": 100.6926, "step": 3470 }, { "epoch": 0.624616135355932, "grad_norm": 60.25, "learning_rate": 9.98780046610633e-07, "loss": 101.5863, "step": 3480 }, { "epoch": 0.6264110093081041, "grad_norm": 61.15625, "learning_rate": 9.98776540997445e-07, "loss": 100.7781, "step": 3490 }, { "epoch": 0.6282058832602764, "grad_norm": 55.96875, "learning_rate": 9.987730353842573e-07, "loss": 101.0874, "step": 3500 }, { "epoch": 0.6300007572124485, "grad_norm": 59.59375, "learning_rate": 9.987695297710695e-07, "loss": 100.5733, "step": 3510 }, { "epoch": 0.6317956311646208, "grad_norm": 55.71875, "learning_rate": 9.987660241578815e-07, "loss": 100.3865, "step": 3520 }, { "epoch": 0.633590505116793, "grad_norm": 60.1875, "learning_rate": 9.987625185446937e-07, "loss": 100.3602, "step": 3530 }, { "epoch": 0.6353853790689652, "grad_norm": 56.625, "learning_rate": 9.98759012931506e-07, "loss": 99.8301, "step": 3540 }, { "epoch": 0.6371802530211375, "grad_norm": 59.84375, "learning_rate": 9.98755507318318e-07, "loss": 101.2679, "step": 3550 }, { "epoch": 0.6389751269733097, "grad_norm": 53.71875, "learning_rate": 9.987520017051303e-07, "loss": 100.4626, "step": 3560 }, { "epoch": 0.6407700009254819, "grad_norm": 65.5625, "learning_rate": 9.987484960919423e-07, "loss": 98.8636, "step": 3570 }, { "epoch": 0.6425648748776541, "grad_norm": 59.15625, "learning_rate": 9.987449904787545e-07, "loss": 99.7406, "step": 3580 }, { "epoch": 0.6443597488298263, "grad_norm": 54.8125, "learning_rate": 9.987414848655667e-07, "loss": 99.1887, "step": 3590 }, { "epoch": 0.6461546227819985, "grad_norm": 56.875, "learning_rate": 9.987379792523788e-07, "loss": 101.342, "step": 3600 }, { "epoch": 0.6479494967341708, "grad_norm": 55.6875, "learning_rate": 9.98734473639191e-07, "loss": 99.9434, "step": 3610 }, { "epoch": 0.6497443706863429, "grad_norm": 60.59375, "learning_rate": 9.987309680260032e-07, "loss": 99.4815, "step": 3620 }, { "epoch": 0.6515392446385152, "grad_norm": 51.53125, "learning_rate": 9.987274624128154e-07, "loss": 99.7587, "step": 3630 }, { "epoch": 0.6533341185906875, "grad_norm": 60.0, "learning_rate": 9.987239567996274e-07, "loss": 101.2723, "step": 3640 }, { "epoch": 0.6551289925428596, "grad_norm": 63.8125, "learning_rate": 9.987204511864396e-07, "loss": 100.9252, "step": 3650 }, { "epoch": 0.6569238664950319, "grad_norm": 56.75, "learning_rate": 9.987169455732518e-07, "loss": 100.5552, "step": 3660 }, { "epoch": 0.658718740447204, "grad_norm": 59.71875, "learning_rate": 9.98713439960064e-07, "loss": 100.8731, "step": 3670 }, { "epoch": 0.6605136143993763, "grad_norm": 51.9375, "learning_rate": 9.987099343468762e-07, "loss": 100.687, "step": 3680 }, { "epoch": 0.6623084883515485, "grad_norm": 56.96875, "learning_rate": 9.987064287336884e-07, "loss": 99.6145, "step": 3690 }, { "epoch": 0.6641033623037207, "grad_norm": 57.1875, "learning_rate": 9.987029231205006e-07, "loss": 99.7202, "step": 3700 }, { "epoch": 0.665898236255893, "grad_norm": 56.03125, "learning_rate": 9.986994175073128e-07, "loss": 100.4039, "step": 3710 }, { "epoch": 0.6676931102080651, "grad_norm": 57.875, "learning_rate": 9.986959118941248e-07, "loss": 100.2563, "step": 3720 }, { "epoch": 0.6694879841602374, "grad_norm": 56.15625, "learning_rate": 9.98692406280937e-07, "loss": 100.1503, "step": 3730 }, { "epoch": 0.6712828581124096, "grad_norm": 58.0625, "learning_rate": 9.986889006677491e-07, "loss": 100.3172, "step": 3740 }, { "epoch": 0.6730777320645818, "grad_norm": 54.53125, "learning_rate": 9.986853950545613e-07, "loss": 99.0521, "step": 3750 }, { "epoch": 0.674872606016754, "grad_norm": 67.8125, "learning_rate": 9.986818894413735e-07, "loss": 101.304, "step": 3760 }, { "epoch": 0.6766674799689263, "grad_norm": 53.90625, "learning_rate": 9.986783838281857e-07, "loss": 99.0437, "step": 3770 }, { "epoch": 0.6784623539210984, "grad_norm": 53.40625, "learning_rate": 9.98674878214998e-07, "loss": 100.0527, "step": 3780 }, { "epoch": 0.6802572278732707, "grad_norm": 55.90625, "learning_rate": 9.9867137260181e-07, "loss": 100.6432, "step": 3790 }, { "epoch": 0.6820521018254428, "grad_norm": 53.5625, "learning_rate": 9.98667866988622e-07, "loss": 100.5706, "step": 3800 }, { "epoch": 0.6838469757776151, "grad_norm": 61.46875, "learning_rate": 9.986643613754343e-07, "loss": 99.6143, "step": 3810 }, { "epoch": 0.6856418497297874, "grad_norm": 57.9375, "learning_rate": 9.986608557622465e-07, "loss": 100.0116, "step": 3820 }, { "epoch": 0.6874367236819595, "grad_norm": 54.40625, "learning_rate": 9.986573501490587e-07, "loss": 100.6161, "step": 3830 }, { "epoch": 0.6892315976341318, "grad_norm": 54.6875, "learning_rate": 9.986538445358707e-07, "loss": 100.3408, "step": 3840 }, { "epoch": 0.691026471586304, "grad_norm": 59.40625, "learning_rate": 9.986503389226829e-07, "loss": 99.8375, "step": 3850 }, { "epoch": 0.6928213455384762, "grad_norm": 55.28125, "learning_rate": 9.986468333094953e-07, "loss": 100.2615, "step": 3860 }, { "epoch": 0.6946162194906484, "grad_norm": 62.71875, "learning_rate": 9.986433276963072e-07, "loss": 101.2431, "step": 3870 }, { "epoch": 0.6964110934428206, "grad_norm": 56.25, "learning_rate": 9.986398220831194e-07, "loss": 100.8722, "step": 3880 }, { "epoch": 0.6982059673949929, "grad_norm": 56.28125, "learning_rate": 9.986363164699316e-07, "loss": 100.0757, "step": 3890 }, { "epoch": 0.7000008413471651, "grad_norm": 57.1875, "learning_rate": 9.986328108567438e-07, "loss": 99.6734, "step": 3900 }, { "epoch": 0.7017957152993373, "grad_norm": 55.15625, "learning_rate": 9.98629305243556e-07, "loss": 100.0005, "step": 3910 }, { "epoch": 0.7035905892515095, "grad_norm": 58.0, "learning_rate": 9.98625799630368e-07, "loss": 99.7119, "step": 3920 }, { "epoch": 0.7053854632036818, "grad_norm": 57.15625, "learning_rate": 9.986222940171802e-07, "loss": 99.9743, "step": 3930 }, { "epoch": 0.7071803371558539, "grad_norm": 53.21875, "learning_rate": 9.986187884039924e-07, "loss": 99.9378, "step": 3940 }, { "epoch": 0.7089752111080262, "grad_norm": 55.46875, "learning_rate": 9.986152827908046e-07, "loss": 101.0054, "step": 3950 }, { "epoch": 0.7107700850601983, "grad_norm": 53.09375, "learning_rate": 9.986117771776168e-07, "loss": 100.5901, "step": 3960 }, { "epoch": 0.7125649590123706, "grad_norm": 58.6875, "learning_rate": 9.98608271564429e-07, "loss": 100.1371, "step": 3970 }, { "epoch": 0.7143598329645429, "grad_norm": 55.59375, "learning_rate": 9.986047659512412e-07, "loss": 100.4305, "step": 3980 }, { "epoch": 0.716154706916715, "grad_norm": 55.875, "learning_rate": 9.986012603380532e-07, "loss": 99.389, "step": 3990 }, { "epoch": 0.7179495808688873, "grad_norm": 60.21875, "learning_rate": 9.985977547248653e-07, "loss": 100.2918, "step": 4000 }, { "epoch": 0.7179495808688873, "eval_loss": 1.561510443687439, "eval_runtime": 193.4272, "eval_samples_per_second": 1512.569, "eval_steps_per_second": 47.268, "step": 4000 }, { "epoch": 0.7197444548210595, "grad_norm": 60.78125, "learning_rate": 9.985942491116775e-07, "loss": 99.0898, "step": 4010 }, { "epoch": 0.7215393287732317, "grad_norm": 61.78125, "learning_rate": 9.985907434984897e-07, "loss": 99.2981, "step": 4020 }, { "epoch": 0.7233342027254039, "grad_norm": 57.75, "learning_rate": 9.98587237885302e-07, "loss": 99.7238, "step": 4030 }, { "epoch": 0.7251290766775761, "grad_norm": 53.78125, "learning_rate": 9.98583732272114e-07, "loss": 99.6891, "step": 4040 }, { "epoch": 0.7269239506297483, "grad_norm": 59.375, "learning_rate": 9.985802266589263e-07, "loss": 100.155, "step": 4050 }, { "epoch": 0.7287188245819206, "grad_norm": 57.375, "learning_rate": 9.985767210457385e-07, "loss": 98.5343, "step": 4060 }, { "epoch": 0.7305136985340928, "grad_norm": 57.9375, "learning_rate": 9.985732154325505e-07, "loss": 101.0483, "step": 4070 }, { "epoch": 0.732308572486265, "grad_norm": 61.5, "learning_rate": 9.985697098193627e-07, "loss": 99.708, "step": 4080 }, { "epoch": 0.7341034464384373, "grad_norm": 52.6875, "learning_rate": 9.985662042061749e-07, "loss": 100.2094, "step": 4090 }, { "epoch": 0.7358983203906094, "grad_norm": 54.34375, "learning_rate": 9.98562698592987e-07, "loss": 99.3119, "step": 4100 }, { "epoch": 0.7376931943427817, "grad_norm": 52.625, "learning_rate": 9.985591929797993e-07, "loss": 98.8817, "step": 4110 }, { "epoch": 0.7394880682949538, "grad_norm": 58.71875, "learning_rate": 9.985556873666113e-07, "loss": 100.0632, "step": 4120 }, { "epoch": 0.7412829422471261, "grad_norm": 54.28125, "learning_rate": 9.985521817534234e-07, "loss": 100.4009, "step": 4130 }, { "epoch": 0.7430778161992984, "grad_norm": 62.46875, "learning_rate": 9.985486761402356e-07, "loss": 99.6302, "step": 4140 }, { "epoch": 0.7448726901514705, "grad_norm": 54.90625, "learning_rate": 9.985451705270478e-07, "loss": 98.9977, "step": 4150 }, { "epoch": 0.7466675641036428, "grad_norm": 57.3125, "learning_rate": 9.9854166491386e-07, "loss": 99.6041, "step": 4160 }, { "epoch": 0.7484624380558149, "grad_norm": 55.90625, "learning_rate": 9.985381593006722e-07, "loss": 99.9616, "step": 4170 }, { "epoch": 0.7502573120079872, "grad_norm": 55.5625, "learning_rate": 9.985346536874844e-07, "loss": 99.9101, "step": 4180 }, { "epoch": 0.7520521859601594, "grad_norm": 58.375, "learning_rate": 9.985311480742964e-07, "loss": 99.3676, "step": 4190 }, { "epoch": 0.7538470599123316, "grad_norm": 62.0, "learning_rate": 9.985276424611086e-07, "loss": 100.3315, "step": 4200 }, { "epoch": 0.7556419338645038, "grad_norm": 55.375, "learning_rate": 9.985241368479208e-07, "loss": 100.3022, "step": 4210 }, { "epoch": 0.7574368078166761, "grad_norm": 56.1875, "learning_rate": 9.98520631234733e-07, "loss": 99.8273, "step": 4220 }, { "epoch": 0.7592316817688483, "grad_norm": 55.15625, "learning_rate": 9.985171256215452e-07, "loss": 99.8944, "step": 4230 }, { "epoch": 0.7610265557210205, "grad_norm": 57.25, "learning_rate": 9.985136200083574e-07, "loss": 99.9793, "step": 4240 }, { "epoch": 0.7628214296731927, "grad_norm": 57.65625, "learning_rate": 9.985101143951696e-07, "loss": 99.6139, "step": 4250 }, { "epoch": 0.7646163036253649, "grad_norm": 58.09375, "learning_rate": 9.985066087819818e-07, "loss": 99.2432, "step": 4260 }, { "epoch": 0.7664111775775372, "grad_norm": 57.84375, "learning_rate": 9.985031031687937e-07, "loss": 100.9006, "step": 4270 }, { "epoch": 0.7682060515297093, "grad_norm": 55.875, "learning_rate": 9.98499597555606e-07, "loss": 100.518, "step": 4280 }, { "epoch": 0.7700009254818816, "grad_norm": 60.15625, "learning_rate": 9.984960919424181e-07, "loss": 99.9677, "step": 4290 }, { "epoch": 0.7717957994340539, "grad_norm": 60.21875, "learning_rate": 9.984925863292303e-07, "loss": 100.5463, "step": 4300 }, { "epoch": 0.773590673386226, "grad_norm": 59.5, "learning_rate": 9.984890807160425e-07, "loss": 99.2726, "step": 4310 }, { "epoch": 0.7753855473383983, "grad_norm": 63.03125, "learning_rate": 9.984855751028547e-07, "loss": 99.198, "step": 4320 }, { "epoch": 0.7771804212905704, "grad_norm": 55.84375, "learning_rate": 9.98482069489667e-07, "loss": 99.174, "step": 4330 }, { "epoch": 0.7789752952427427, "grad_norm": 62.21875, "learning_rate": 9.98478563876479e-07, "loss": 99.7285, "step": 4340 }, { "epoch": 0.7807701691949149, "grad_norm": 54.125, "learning_rate": 9.98475058263291e-07, "loss": 97.6992, "step": 4350 }, { "epoch": 0.7825650431470871, "grad_norm": 64.3125, "learning_rate": 9.984715526501033e-07, "loss": 99.3388, "step": 4360 }, { "epoch": 0.7843599170992593, "grad_norm": 56.1875, "learning_rate": 9.984680470369155e-07, "loss": 98.7792, "step": 4370 }, { "epoch": 0.7861547910514316, "grad_norm": 57.90625, "learning_rate": 9.984645414237277e-07, "loss": 100.1827, "step": 4380 }, { "epoch": 0.7879496650036037, "grad_norm": 56.09375, "learning_rate": 9.984610358105396e-07, "loss": 99.409, "step": 4390 }, { "epoch": 0.789744538955776, "grad_norm": 54.78125, "learning_rate": 9.984575301973518e-07, "loss": 99.461, "step": 4400 }, { "epoch": 0.7915394129079482, "grad_norm": 56.3125, "learning_rate": 9.984540245841642e-07, "loss": 99.5, "step": 4410 }, { "epoch": 0.7933342868601204, "grad_norm": 58.5625, "learning_rate": 9.984505189709762e-07, "loss": 99.3468, "step": 4420 }, { "epoch": 0.7951291608122927, "grad_norm": 65.0625, "learning_rate": 9.984470133577884e-07, "loss": 99.6309, "step": 4430 }, { "epoch": 0.7969240347644648, "grad_norm": 59.8125, "learning_rate": 9.984435077446006e-07, "loss": 99.3963, "step": 4440 }, { "epoch": 0.7987189087166371, "grad_norm": 60.28125, "learning_rate": 9.984400021314128e-07, "loss": 98.8352, "step": 4450 }, { "epoch": 0.8005137826688093, "grad_norm": 57.1875, "learning_rate": 9.98436496518225e-07, "loss": 99.6084, "step": 4460 }, { "epoch": 0.8023086566209815, "grad_norm": 58.3125, "learning_rate": 9.98432990905037e-07, "loss": 99.5552, "step": 4470 }, { "epoch": 0.8041035305731538, "grad_norm": 54.28125, "learning_rate": 9.984294852918492e-07, "loss": 99.7207, "step": 4480 }, { "epoch": 0.8058984045253259, "grad_norm": 55.875, "learning_rate": 9.984259796786614e-07, "loss": 100.2163, "step": 4490 }, { "epoch": 0.8076932784774982, "grad_norm": 58.46875, "learning_rate": 9.984224740654736e-07, "loss": 100.1001, "step": 4500 }, { "epoch": 0.8094881524296704, "grad_norm": 52.09375, "learning_rate": 9.984189684522858e-07, "loss": 100.1762, "step": 4510 }, { "epoch": 0.8112830263818426, "grad_norm": 58.0, "learning_rate": 9.98415462839098e-07, "loss": 99.485, "step": 4520 }, { "epoch": 0.8130779003340148, "grad_norm": 53.34375, "learning_rate": 9.984119572259102e-07, "loss": 99.2043, "step": 4530 }, { "epoch": 0.8148727742861871, "grad_norm": 55.84375, "learning_rate": 9.984084516127223e-07, "loss": 100.9975, "step": 4540 }, { "epoch": 0.8166676482383592, "grad_norm": 53.1875, "learning_rate": 9.984049459995343e-07, "loss": 100.2588, "step": 4550 }, { "epoch": 0.8184625221905315, "grad_norm": 59.15625, "learning_rate": 9.984014403863465e-07, "loss": 101.2972, "step": 4560 }, { "epoch": 0.8202573961427037, "grad_norm": 56.09375, "learning_rate": 9.983979347731587e-07, "loss": 99.6443, "step": 4570 }, { "epoch": 0.8220522700948759, "grad_norm": 57.03125, "learning_rate": 9.98394429159971e-07, "loss": 99.9172, "step": 4580 }, { "epoch": 0.8238471440470482, "grad_norm": 57.8125, "learning_rate": 9.983909235467831e-07, "loss": 99.9183, "step": 4590 }, { "epoch": 0.8256420179992203, "grad_norm": 58.59375, "learning_rate": 9.983874179335953e-07, "loss": 101.1427, "step": 4600 }, { "epoch": 0.8274368919513926, "grad_norm": 56.375, "learning_rate": 9.983839123204075e-07, "loss": 99.5511, "step": 4610 }, { "epoch": 0.8292317659035647, "grad_norm": 54.84375, "learning_rate": 9.983804067072195e-07, "loss": 98.27, "step": 4620 }, { "epoch": 0.831026639855737, "grad_norm": 55.1875, "learning_rate": 9.983769010940317e-07, "loss": 99.897, "step": 4630 }, { "epoch": 0.8328215138079093, "grad_norm": 55.59375, "learning_rate": 9.983733954808439e-07, "loss": 100.4762, "step": 4640 }, { "epoch": 0.8346163877600814, "grad_norm": 53.53125, "learning_rate": 9.98369889867656e-07, "loss": 99.019, "step": 4650 }, { "epoch": 0.8364112617122537, "grad_norm": 55.125, "learning_rate": 9.983663842544683e-07, "loss": 99.7545, "step": 4660 }, { "epoch": 0.8382061356644259, "grad_norm": 54.75, "learning_rate": 9.983628786412802e-07, "loss": 98.6123, "step": 4670 }, { "epoch": 0.8400010096165981, "grad_norm": 52.1875, "learning_rate": 9.983593730280926e-07, "loss": 99.1614, "step": 4680 }, { "epoch": 0.8417958835687703, "grad_norm": 57.0625, "learning_rate": 9.983558674149048e-07, "loss": 101.187, "step": 4690 }, { "epoch": 0.8435907575209425, "grad_norm": 54.0, "learning_rate": 9.983523618017168e-07, "loss": 99.3741, "step": 4700 }, { "epoch": 0.8453856314731147, "grad_norm": 54.65625, "learning_rate": 9.98348856188529e-07, "loss": 98.8455, "step": 4710 }, { "epoch": 0.847180505425287, "grad_norm": 56.03125, "learning_rate": 9.983453505753412e-07, "loss": 100.3109, "step": 4720 }, { "epoch": 0.8489753793774591, "grad_norm": 56.90625, "learning_rate": 9.983418449621534e-07, "loss": 99.0733, "step": 4730 }, { "epoch": 0.8507702533296314, "grad_norm": 60.03125, "learning_rate": 9.983383393489656e-07, "loss": 99.2901, "step": 4740 }, { "epoch": 0.8525651272818037, "grad_norm": 60.375, "learning_rate": 9.983348337357776e-07, "loss": 99.2978, "step": 4750 }, { "epoch": 0.8543600012339758, "grad_norm": 52.1875, "learning_rate": 9.983313281225898e-07, "loss": 100.2887, "step": 4760 }, { "epoch": 0.8561548751861481, "grad_norm": 58.6875, "learning_rate": 9.98327822509402e-07, "loss": 99.9115, "step": 4770 }, { "epoch": 0.8579497491383202, "grad_norm": 56.03125, "learning_rate": 9.983243168962142e-07, "loss": 98.9636, "step": 4780 }, { "epoch": 0.8597446230904925, "grad_norm": 64.5625, "learning_rate": 9.983208112830264e-07, "loss": 98.904, "step": 4790 }, { "epoch": 0.8615394970426647, "grad_norm": 60.09375, "learning_rate": 9.983173056698386e-07, "loss": 99.8028, "step": 4800 }, { "epoch": 0.8633343709948369, "grad_norm": 59.9375, "learning_rate": 9.983138000566507e-07, "loss": 99.5853, "step": 4810 }, { "epoch": 0.8651292449470092, "grad_norm": 54.34375, "learning_rate": 9.983102944434627e-07, "loss": 98.7931, "step": 4820 }, { "epoch": 0.8669241188991814, "grad_norm": 53.03125, "learning_rate": 9.98306788830275e-07, "loss": 99.695, "step": 4830 }, { "epoch": 0.8687189928513536, "grad_norm": 54.34375, "learning_rate": 9.983032832170871e-07, "loss": 99.7535, "step": 4840 }, { "epoch": 0.8705138668035258, "grad_norm": 58.84375, "learning_rate": 9.982997776038993e-07, "loss": 99.5457, "step": 4850 }, { "epoch": 0.872308740755698, "grad_norm": 54.0, "learning_rate": 9.982962719907115e-07, "loss": 99.1976, "step": 4860 }, { "epoch": 0.8741036147078702, "grad_norm": 52.1875, "learning_rate": 9.982927663775237e-07, "loss": 98.987, "step": 4870 }, { "epoch": 0.8758984886600425, "grad_norm": 55.0625, "learning_rate": 9.982892607643359e-07, "loss": 99.2621, "step": 4880 }, { "epoch": 0.8776933626122146, "grad_norm": 53.4375, "learning_rate": 9.98285755151148e-07, "loss": 99.8875, "step": 4890 }, { "epoch": 0.8794882365643869, "grad_norm": 55.875, "learning_rate": 9.9828224953796e-07, "loss": 98.8774, "step": 4900 }, { "epoch": 0.8812831105165592, "grad_norm": 59.78125, "learning_rate": 9.982787439247723e-07, "loss": 100.7272, "step": 4910 }, { "epoch": 0.8830779844687313, "grad_norm": 55.21875, "learning_rate": 9.982752383115845e-07, "loss": 99.0058, "step": 4920 }, { "epoch": 0.8848728584209036, "grad_norm": 54.8125, "learning_rate": 9.982717326983967e-07, "loss": 99.3883, "step": 4930 }, { "epoch": 0.8866677323730757, "grad_norm": 55.8125, "learning_rate": 9.982682270852088e-07, "loss": 98.9662, "step": 4940 }, { "epoch": 0.888462606325248, "grad_norm": 52.90625, "learning_rate": 9.98264721472021e-07, "loss": 100.7881, "step": 4950 }, { "epoch": 0.8902574802774202, "grad_norm": 54.71875, "learning_rate": 9.982612158588332e-07, "loss": 98.7615, "step": 4960 }, { "epoch": 0.8920523542295924, "grad_norm": 53.65625, "learning_rate": 9.982577102456452e-07, "loss": 99.4831, "step": 4970 }, { "epoch": 0.8938472281817647, "grad_norm": 58.78125, "learning_rate": 9.982542046324574e-07, "loss": 98.5362, "step": 4980 }, { "epoch": 0.8956421021339369, "grad_norm": 54.0625, "learning_rate": 9.982506990192696e-07, "loss": 98.9379, "step": 4990 }, { "epoch": 0.8974369760861091, "grad_norm": 52.78125, "learning_rate": 9.982471934060818e-07, "loss": 98.3786, "step": 5000 }, { "epoch": 0.8992318500382813, "grad_norm": 57.09375, "learning_rate": 9.98243687792894e-07, "loss": 99.2205, "step": 5010 }, { "epoch": 0.9010267239904535, "grad_norm": 53.71875, "learning_rate": 9.98240182179706e-07, "loss": 99.4672, "step": 5020 }, { "epoch": 0.9028215979426257, "grad_norm": 57.40625, "learning_rate": 9.982366765665182e-07, "loss": 99.5186, "step": 5030 }, { "epoch": 0.904616471894798, "grad_norm": 58.71875, "learning_rate": 9.982331709533304e-07, "loss": 99.584, "step": 5040 }, { "epoch": 0.9064113458469701, "grad_norm": 55.0625, "learning_rate": 9.982296653401426e-07, "loss": 100.6763, "step": 5050 }, { "epoch": 0.9082062197991424, "grad_norm": 54.28125, "learning_rate": 9.982261597269548e-07, "loss": 98.7888, "step": 5060 }, { "epoch": 0.9100010937513147, "grad_norm": 54.8125, "learning_rate": 9.98222654113767e-07, "loss": 98.7003, "step": 5070 }, { "epoch": 0.9117959677034868, "grad_norm": 58.28125, "learning_rate": 9.982191485005791e-07, "loss": 99.602, "step": 5080 }, { "epoch": 0.9135908416556591, "grad_norm": 56.28125, "learning_rate": 9.982156428873913e-07, "loss": 99.339, "step": 5090 }, { "epoch": 0.9153857156078312, "grad_norm": 55.15625, "learning_rate": 9.982121372742033e-07, "loss": 99.4155, "step": 5100 }, { "epoch": 0.9171805895600035, "grad_norm": 57.03125, "learning_rate": 9.982086316610155e-07, "loss": 99.6862, "step": 5110 }, { "epoch": 0.9189754635121757, "grad_norm": 56.96875, "learning_rate": 9.982051260478277e-07, "loss": 99.5966, "step": 5120 }, { "epoch": 0.9207703374643479, "grad_norm": 55.6875, "learning_rate": 9.9820162043464e-07, "loss": 100.2231, "step": 5130 }, { "epoch": 0.9225652114165201, "grad_norm": 54.78125, "learning_rate": 9.98198114821452e-07, "loss": 99.5804, "step": 5140 }, { "epoch": 0.9243600853686923, "grad_norm": 58.1875, "learning_rate": 9.981946092082643e-07, "loss": 98.8408, "step": 5150 }, { "epoch": 0.9261549593208646, "grad_norm": 56.75, "learning_rate": 9.981911035950765e-07, "loss": 99.1592, "step": 5160 }, { "epoch": 0.9279498332730368, "grad_norm": 51.3125, "learning_rate": 9.981875979818885e-07, "loss": 99.347, "step": 5170 }, { "epoch": 0.929744707225209, "grad_norm": 56.6875, "learning_rate": 9.981840923687007e-07, "loss": 99.7062, "step": 5180 }, { "epoch": 0.9315395811773812, "grad_norm": 55.28125, "learning_rate": 9.981805867555129e-07, "loss": 100.1522, "step": 5190 }, { "epoch": 0.9333344551295535, "grad_norm": 55.25, "learning_rate": 9.98177081142325e-07, "loss": 99.1621, "step": 5200 }, { "epoch": 0.9351293290817256, "grad_norm": 58.40625, "learning_rate": 9.981735755291372e-07, "loss": 99.3177, "step": 5210 }, { "epoch": 0.9369242030338979, "grad_norm": 82.875, "learning_rate": 9.981700699159492e-07, "loss": 99.838, "step": 5220 }, { "epoch": 0.93871907698607, "grad_norm": 56.125, "learning_rate": 9.981665643027616e-07, "loss": 98.6675, "step": 5230 }, { "epoch": 0.9405139509382423, "grad_norm": 56.15625, "learning_rate": 9.981630586895738e-07, "loss": 100.7525, "step": 5240 }, { "epoch": 0.9423088248904146, "grad_norm": 54.5625, "learning_rate": 9.981595530763858e-07, "loss": 98.633, "step": 5250 }, { "epoch": 0.9441036988425867, "grad_norm": 55.875, "learning_rate": 9.98156047463198e-07, "loss": 99.6261, "step": 5260 }, { "epoch": 0.945898572794759, "grad_norm": 58.15625, "learning_rate": 9.981525418500102e-07, "loss": 99.3767, "step": 5270 }, { "epoch": 0.9476934467469312, "grad_norm": 55.96875, "learning_rate": 9.981490362368224e-07, "loss": 99.0867, "step": 5280 }, { "epoch": 0.9494883206991034, "grad_norm": 56.90625, "learning_rate": 9.981455306236346e-07, "loss": 98.7356, "step": 5290 }, { "epoch": 0.9512831946512756, "grad_norm": 56.78125, "learning_rate": 9.981420250104466e-07, "loss": 99.5022, "step": 5300 }, { "epoch": 0.9530780686034478, "grad_norm": 55.59375, "learning_rate": 9.981385193972588e-07, "loss": 99.9538, "step": 5310 }, { "epoch": 0.95487294255562, "grad_norm": 56.8125, "learning_rate": 9.98135013784071e-07, "loss": 98.6498, "step": 5320 }, { "epoch": 0.9566678165077923, "grad_norm": 57.9375, "learning_rate": 9.981315081708831e-07, "loss": 99.2055, "step": 5330 }, { "epoch": 0.9584626904599645, "grad_norm": 53.25, "learning_rate": 9.981280025576953e-07, "loss": 99.4488, "step": 5340 }, { "epoch": 0.9602575644121367, "grad_norm": 58.21875, "learning_rate": 9.981244969445075e-07, "loss": 100.3882, "step": 5350 }, { "epoch": 0.962052438364309, "grad_norm": 53.875, "learning_rate": 9.981209913313197e-07, "loss": 98.866, "step": 5360 }, { "epoch": 0.9638473123164811, "grad_norm": 56.28125, "learning_rate": 9.981174857181317e-07, "loss": 99.651, "step": 5370 }, { "epoch": 0.9656421862686534, "grad_norm": 57.4375, "learning_rate": 9.98113980104944e-07, "loss": 98.6736, "step": 5380 }, { "epoch": 0.9674370602208255, "grad_norm": 59.3125, "learning_rate": 9.98110474491756e-07, "loss": 99.5751, "step": 5390 }, { "epoch": 0.9692319341729978, "grad_norm": 57.71875, "learning_rate": 9.981069688785683e-07, "loss": 98.4359, "step": 5400 }, { "epoch": 0.9710268081251701, "grad_norm": 52.65625, "learning_rate": 9.981034632653805e-07, "loss": 99.2994, "step": 5410 }, { "epoch": 0.9728216820773422, "grad_norm": 55.90625, "learning_rate": 9.980999576521927e-07, "loss": 99.8557, "step": 5420 }, { "epoch": 0.9746165560295145, "grad_norm": 54.875, "learning_rate": 9.980964520390049e-07, "loss": 98.9348, "step": 5430 }, { "epoch": 0.9764114299816867, "grad_norm": 57.21875, "learning_rate": 9.98092946425817e-07, "loss": 99.2456, "step": 5440 }, { "epoch": 0.9782063039338589, "grad_norm": 55.84375, "learning_rate": 9.98089440812629e-07, "loss": 98.4899, "step": 5450 }, { "epoch": 0.9800011778860311, "grad_norm": 57.15625, "learning_rate": 9.980859351994412e-07, "loss": 99.0099, "step": 5460 }, { "epoch": 0.9817960518382033, "grad_norm": 52.875, "learning_rate": 9.980824295862534e-07, "loss": 98.0469, "step": 5470 }, { "epoch": 0.9835909257903755, "grad_norm": 52.84375, "learning_rate": 9.980789239730656e-07, "loss": 99.7561, "step": 5480 }, { "epoch": 0.9853857997425478, "grad_norm": 55.4375, "learning_rate": 9.980754183598778e-07, "loss": 99.6168, "step": 5490 }, { "epoch": 0.98718067369472, "grad_norm": 55.0, "learning_rate": 9.9807191274669e-07, "loss": 99.3561, "step": 5500 }, { "epoch": 0.9889755476468922, "grad_norm": 54.875, "learning_rate": 9.980684071335022e-07, "loss": 98.6927, "step": 5510 }, { "epoch": 0.9907704215990645, "grad_norm": 55.03125, "learning_rate": 9.980649015203142e-07, "loss": 98.8652, "step": 5520 }, { "epoch": 0.9925652955512366, "grad_norm": 55.375, "learning_rate": 9.980613959071264e-07, "loss": 99.0075, "step": 5530 }, { "epoch": 0.9943601695034089, "grad_norm": 56.03125, "learning_rate": 9.980578902939386e-07, "loss": 99.1172, "step": 5540 }, { "epoch": 0.996155043455581, "grad_norm": 52.59375, "learning_rate": 9.980543846807508e-07, "loss": 99.8071, "step": 5550 }, { "epoch": 0.9979499174077533, "grad_norm": 57.0625, "learning_rate": 9.98050879067563e-07, "loss": 99.4139, "step": 5560 }, { "epoch": 0.9997447913599256, "grad_norm": 60.0, "learning_rate": 9.98047373454375e-07, "loss": 98.7373, "step": 5570 } ], "logging_steps": 10, "max_steps": 5571, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.5382180454408913e+19, "train_batch_size": 4, "trial_name": null, "trial_params": null }