{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.35897479043444364, "eval_steps": 2000, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0017948739521722182, "grad_norm": 70.875, "learning_rate": 9.999964943868121e-07, "loss": 113.0671, "step": 10 }, { "epoch": 0.0035897479043444365, "grad_norm": 62.15625, "learning_rate": 9.999929887736243e-07, "loss": 109.1635, "step": 20 }, { "epoch": 0.0053846218565166545, "grad_norm": 53.625, "learning_rate": 9.999894831604363e-07, "loss": 109.8683, "step": 30 }, { "epoch": 0.007179495808688873, "grad_norm": 65.6875, "learning_rate": 9.999859775472485e-07, "loss": 108.1756, "step": 40 }, { "epoch": 0.00897436976086109, "grad_norm": 53.0625, "learning_rate": 9.99982471934061e-07, "loss": 106.7566, "step": 50 }, { "epoch": 0.010769243713033309, "grad_norm": 59.84375, "learning_rate": 9.99978966320873e-07, "loss": 107.5178, "step": 60 }, { "epoch": 0.012564117665205527, "grad_norm": 59.46875, "learning_rate": 9.99975460707685e-07, "loss": 106.8143, "step": 70 }, { "epoch": 0.014358991617377746, "grad_norm": 58.96875, "learning_rate": 9.999719550944973e-07, "loss": 106.2242, "step": 80 }, { "epoch": 0.016153865569549963, "grad_norm": 57.46875, "learning_rate": 9.999684494813095e-07, "loss": 105.5488, "step": 90 }, { "epoch": 0.01794873952172218, "grad_norm": 54.65625, "learning_rate": 9.999649438681217e-07, "loss": 105.1425, "step": 100 }, { "epoch": 0.0197436134738944, "grad_norm": 55.78125, "learning_rate": 9.999614382549337e-07, "loss": 105.8918, "step": 110 }, { "epoch": 0.021538487426066618, "grad_norm": 54.53125, "learning_rate": 9.999579326417459e-07, "loss": 105.4892, "step": 120 }, { "epoch": 0.023333361378238836, "grad_norm": 57.6875, "learning_rate": 9.99954427028558e-07, "loss": 105.5813, "step": 130 }, { "epoch": 0.025128235330411055, "grad_norm": 55.3125, "learning_rate": 9.999509214153702e-07, "loss": 105.1202, "step": 140 }, { "epoch": 0.026923109282583273, "grad_norm": 54.21875, "learning_rate": 9.999474158021824e-07, "loss": 105.9377, "step": 150 }, { "epoch": 0.028717983234755492, "grad_norm": 52.5625, "learning_rate": 9.999439101889946e-07, "loss": 104.7342, "step": 160 }, { "epoch": 0.03051285718692771, "grad_norm": 59.9375, "learning_rate": 9.999404045758068e-07, "loss": 104.9382, "step": 170 }, { "epoch": 0.032307731139099925, "grad_norm": 55.65625, "learning_rate": 9.999368989626188e-07, "loss": 106.4299, "step": 180 }, { "epoch": 0.034102605091272144, "grad_norm": 53.6875, "learning_rate": 9.99933393349431e-07, "loss": 104.4395, "step": 190 }, { "epoch": 0.03589747904344436, "grad_norm": 54.8125, "learning_rate": 9.999298877362432e-07, "loss": 104.3946, "step": 200 }, { "epoch": 0.03769235299561658, "grad_norm": 55.9375, "learning_rate": 9.999263821230554e-07, "loss": 105.1693, "step": 210 }, { "epoch": 0.0394872269477888, "grad_norm": 55.03125, "learning_rate": 9.999228765098676e-07, "loss": 104.7132, "step": 220 }, { "epoch": 0.04128210089996102, "grad_norm": 71.0625, "learning_rate": 9.999193708966798e-07, "loss": 104.7171, "step": 230 }, { "epoch": 0.043076974852133236, "grad_norm": 59.40625, "learning_rate": 9.99915865283492e-07, "loss": 104.5917, "step": 240 }, { "epoch": 0.044871848804305454, "grad_norm": 53.84375, "learning_rate": 9.999123596703042e-07, "loss": 104.3239, "step": 250 }, { "epoch": 0.04666672275647767, "grad_norm": 59.375, "learning_rate": 9.999088540571162e-07, "loss": 104.0901, "step": 260 }, { "epoch": 0.04846159670864989, "grad_norm": 56.0, "learning_rate": 9.999053484439284e-07, "loss": 104.5291, "step": 270 }, { "epoch": 0.05025647066082211, "grad_norm": 64.4375, "learning_rate": 9.999018428307405e-07, "loss": 104.5856, "step": 280 }, { "epoch": 0.05205134461299433, "grad_norm": 79.25, "learning_rate": 9.998983372175527e-07, "loss": 104.3207, "step": 290 }, { "epoch": 0.05384621856516655, "grad_norm": 64.375, "learning_rate": 9.99894831604365e-07, "loss": 104.9624, "step": 300 }, { "epoch": 0.055641092517338765, "grad_norm": 58.84375, "learning_rate": 9.99891325991177e-07, "loss": 104.5753, "step": 310 }, { "epoch": 0.057435966469510984, "grad_norm": 61.21875, "learning_rate": 9.998878203779893e-07, "loss": 104.478, "step": 320 }, { "epoch": 0.0592308404216832, "grad_norm": 62.5, "learning_rate": 9.998843147648013e-07, "loss": 103.2508, "step": 330 }, { "epoch": 0.06102571437385542, "grad_norm": 56.8125, "learning_rate": 9.998808091516135e-07, "loss": 104.3786, "step": 340 }, { "epoch": 0.06282058832602763, "grad_norm": 55.21875, "learning_rate": 9.998773035384257e-07, "loss": 103.974, "step": 350 }, { "epoch": 0.06461546227819985, "grad_norm": 61.71875, "learning_rate": 9.998737979252379e-07, "loss": 104.481, "step": 360 }, { "epoch": 0.06641033623037207, "grad_norm": 53.78125, "learning_rate": 9.9987029231205e-07, "loss": 104.1711, "step": 370 }, { "epoch": 0.06820521018254429, "grad_norm": 54.375, "learning_rate": 9.99866786698862e-07, "loss": 103.1215, "step": 380 }, { "epoch": 0.0700000841347165, "grad_norm": 65.6875, "learning_rate": 9.998632810856743e-07, "loss": 105.1524, "step": 390 }, { "epoch": 0.07179495808688872, "grad_norm": 58.46875, "learning_rate": 9.998597754724865e-07, "loss": 104.1496, "step": 400 }, { "epoch": 0.07358983203906094, "grad_norm": 56.28125, "learning_rate": 9.998562698592986e-07, "loss": 103.0084, "step": 410 }, { "epoch": 0.07538470599123316, "grad_norm": 58.9375, "learning_rate": 9.998527642461108e-07, "loss": 102.782, "step": 420 }, { "epoch": 0.07717957994340538, "grad_norm": 53.28125, "learning_rate": 9.99849258632923e-07, "loss": 104.664, "step": 430 }, { "epoch": 0.0789744538955776, "grad_norm": 55.53125, "learning_rate": 9.998457530197352e-07, "loss": 103.6033, "step": 440 }, { "epoch": 0.08076932784774982, "grad_norm": 54.6875, "learning_rate": 9.998422474065474e-07, "loss": 103.2984, "step": 450 }, { "epoch": 0.08256420179992204, "grad_norm": 58.75, "learning_rate": 9.998387417933594e-07, "loss": 103.1222, "step": 460 }, { "epoch": 0.08435907575209425, "grad_norm": 54.65625, "learning_rate": 9.998352361801716e-07, "loss": 104.3975, "step": 470 }, { "epoch": 0.08615394970426647, "grad_norm": 62.0625, "learning_rate": 9.998317305669838e-07, "loss": 103.6788, "step": 480 }, { "epoch": 0.08794882365643869, "grad_norm": 54.75, "learning_rate": 9.99828224953796e-07, "loss": 103.4202, "step": 490 }, { "epoch": 0.08974369760861091, "grad_norm": 58.3125, "learning_rate": 9.998247193406082e-07, "loss": 102.8853, "step": 500 }, { "epoch": 0.09153857156078313, "grad_norm": 56.21875, "learning_rate": 9.998212137274204e-07, "loss": 103.1342, "step": 510 }, { "epoch": 0.09333344551295535, "grad_norm": 57.71875, "learning_rate": 9.998177081142326e-07, "loss": 103.0709, "step": 520 }, { "epoch": 0.09512831946512756, "grad_norm": 60.4375, "learning_rate": 9.998142025010446e-07, "loss": 104.1919, "step": 530 }, { "epoch": 0.09692319341729978, "grad_norm": 58.34375, "learning_rate": 9.998106968878567e-07, "loss": 104.5302, "step": 540 }, { "epoch": 0.098718067369472, "grad_norm": 57.875, "learning_rate": 9.99807191274669e-07, "loss": 103.7714, "step": 550 }, { "epoch": 0.10051294132164422, "grad_norm": 62.46875, "learning_rate": 9.998036856614811e-07, "loss": 104.633, "step": 560 }, { "epoch": 0.10230781527381644, "grad_norm": 53.8125, "learning_rate": 9.998001800482933e-07, "loss": 102.6445, "step": 570 }, { "epoch": 0.10410268922598866, "grad_norm": 63.84375, "learning_rate": 9.997966744351053e-07, "loss": 104.0129, "step": 580 }, { "epoch": 0.10589756317816088, "grad_norm": 57.34375, "learning_rate": 9.997931688219175e-07, "loss": 103.0677, "step": 590 }, { "epoch": 0.1076924371303331, "grad_norm": 57.25, "learning_rate": 9.9978966320873e-07, "loss": 103.8832, "step": 600 }, { "epoch": 0.10948731108250531, "grad_norm": 57.90625, "learning_rate": 9.997861575955419e-07, "loss": 103.4752, "step": 610 }, { "epoch": 0.11128218503467753, "grad_norm": 62.46875, "learning_rate": 9.99782651982354e-07, "loss": 103.0433, "step": 620 }, { "epoch": 0.11307705898684975, "grad_norm": 58.21875, "learning_rate": 9.997791463691663e-07, "loss": 102.7304, "step": 630 }, { "epoch": 0.11487193293902197, "grad_norm": 54.53125, "learning_rate": 9.997756407559785e-07, "loss": 103.3318, "step": 640 }, { "epoch": 0.11666680689119419, "grad_norm": 60.125, "learning_rate": 9.997721351427907e-07, "loss": 103.5509, "step": 650 }, { "epoch": 0.1184616808433664, "grad_norm": 58.59375, "learning_rate": 9.997686295296027e-07, "loss": 102.8469, "step": 660 }, { "epoch": 0.12025655479553862, "grad_norm": 58.125, "learning_rate": 9.997651239164148e-07, "loss": 104.6024, "step": 670 }, { "epoch": 0.12205142874771084, "grad_norm": 57.78125, "learning_rate": 9.99761618303227e-07, "loss": 102.2974, "step": 680 }, { "epoch": 0.12384630269988306, "grad_norm": 53.65625, "learning_rate": 9.997581126900392e-07, "loss": 103.0835, "step": 690 }, { "epoch": 0.12564117665205526, "grad_norm": 53.75, "learning_rate": 9.997546070768514e-07, "loss": 104.0657, "step": 700 }, { "epoch": 0.1274360506042275, "grad_norm": 57.84375, "learning_rate": 9.997511014636636e-07, "loss": 103.865, "step": 710 }, { "epoch": 0.1292309245563997, "grad_norm": 55.1875, "learning_rate": 9.997475958504758e-07, "loss": 102.8469, "step": 720 }, { "epoch": 0.13102579850857193, "grad_norm": 54.375, "learning_rate": 9.997440902372878e-07, "loss": 103.573, "step": 730 }, { "epoch": 0.13282067246074414, "grad_norm": 55.34375, "learning_rate": 9.997405846241e-07, "loss": 102.4788, "step": 740 }, { "epoch": 0.13461554641291637, "grad_norm": 74.1875, "learning_rate": 9.997370790109122e-07, "loss": 104.8757, "step": 750 }, { "epoch": 0.13641042036508857, "grad_norm": 67.25, "learning_rate": 9.997335733977244e-07, "loss": 103.3008, "step": 760 }, { "epoch": 0.1382052943172608, "grad_norm": 50.625, "learning_rate": 9.997300677845366e-07, "loss": 102.5183, "step": 770 }, { "epoch": 0.140000168269433, "grad_norm": 57.34375, "learning_rate": 9.997265621713488e-07, "loss": 103.6885, "step": 780 }, { "epoch": 0.14179504222160524, "grad_norm": 57.15625, "learning_rate": 9.99723056558161e-07, "loss": 103.321, "step": 790 }, { "epoch": 0.14358991617377745, "grad_norm": 64.0625, "learning_rate": 9.997195509449732e-07, "loss": 102.9387, "step": 800 }, { "epoch": 0.14538479012594968, "grad_norm": 61.46875, "learning_rate": 9.997160453317851e-07, "loss": 102.8938, "step": 810 }, { "epoch": 0.14717966407812189, "grad_norm": 54.53125, "learning_rate": 9.997125397185973e-07, "loss": 102.8753, "step": 820 }, { "epoch": 0.14897453803029412, "grad_norm": 59.34375, "learning_rate": 9.997090341054095e-07, "loss": 102.9597, "step": 830 }, { "epoch": 0.15076941198246632, "grad_norm": 55.65625, "learning_rate": 9.997055284922217e-07, "loss": 102.3567, "step": 840 }, { "epoch": 0.15256428593463855, "grad_norm": 67.25, "learning_rate": 9.99702022879034e-07, "loss": 102.8148, "step": 850 }, { "epoch": 0.15435915988681076, "grad_norm": 53.15625, "learning_rate": 9.99698517265846e-07, "loss": 104.3594, "step": 860 }, { "epoch": 0.156154033838983, "grad_norm": 54.8125, "learning_rate": 9.996950116526583e-07, "loss": 102.0147, "step": 870 }, { "epoch": 0.1579489077911552, "grad_norm": 58.53125, "learning_rate": 9.996915060394703e-07, "loss": 103.0337, "step": 880 }, { "epoch": 0.15974378174332743, "grad_norm": 57.9375, "learning_rate": 9.996880004262825e-07, "loss": 103.2189, "step": 890 }, { "epoch": 0.16153865569549963, "grad_norm": 56.3125, "learning_rate": 9.996844948130947e-07, "loss": 102.6131, "step": 900 }, { "epoch": 0.16333352964767187, "grad_norm": 56.0, "learning_rate": 9.996809891999069e-07, "loss": 102.7586, "step": 910 }, { "epoch": 0.16512840359984407, "grad_norm": 53.0, "learning_rate": 9.99677483586719e-07, "loss": 101.981, "step": 920 }, { "epoch": 0.16692327755201627, "grad_norm": 56.4375, "learning_rate": 9.99673977973531e-07, "loss": 103.099, "step": 930 }, { "epoch": 0.1687181515041885, "grad_norm": 57.09375, "learning_rate": 9.996704723603432e-07, "loss": 103.5702, "step": 940 }, { "epoch": 0.1705130254563607, "grad_norm": 54.46875, "learning_rate": 9.996669667471554e-07, "loss": 102.8591, "step": 950 }, { "epoch": 0.17230789940853294, "grad_norm": 56.625, "learning_rate": 9.996634611339676e-07, "loss": 102.1516, "step": 960 }, { "epoch": 0.17410277336070515, "grad_norm": 55.0, "learning_rate": 9.996599555207798e-07, "loss": 103.4924, "step": 970 }, { "epoch": 0.17589764731287738, "grad_norm": 54.71875, "learning_rate": 9.99656449907592e-07, "loss": 102.3699, "step": 980 }, { "epoch": 0.17769252126504959, "grad_norm": 58.625, "learning_rate": 9.996529442944042e-07, "loss": 101.9164, "step": 990 }, { "epoch": 0.17948739521722182, "grad_norm": 53.1875, "learning_rate": 9.996494386812164e-07, "loss": 102.4251, "step": 1000 }, { "epoch": 0.18128226916939402, "grad_norm": 61.34375, "learning_rate": 9.996459330680284e-07, "loss": 101.8229, "step": 1010 }, { "epoch": 0.18307714312156625, "grad_norm": 56.8125, "learning_rate": 9.996424274548406e-07, "loss": 102.8062, "step": 1020 }, { "epoch": 0.18487201707373846, "grad_norm": 57.5625, "learning_rate": 9.996389218416528e-07, "loss": 102.8688, "step": 1030 }, { "epoch": 0.1866668910259107, "grad_norm": 53.75, "learning_rate": 9.99635416228465e-07, "loss": 102.0441, "step": 1040 }, { "epoch": 0.1884617649780829, "grad_norm": 55.3125, "learning_rate": 9.996319106152772e-07, "loss": 101.1412, "step": 1050 }, { "epoch": 0.19025663893025513, "grad_norm": 57.1875, "learning_rate": 9.996284050020894e-07, "loss": 102.8017, "step": 1060 }, { "epoch": 0.19205151288242733, "grad_norm": 55.09375, "learning_rate": 9.996248993889016e-07, "loss": 102.8312, "step": 1070 }, { "epoch": 0.19384638683459957, "grad_norm": 58.8125, "learning_rate": 9.996213937757135e-07, "loss": 100.9258, "step": 1080 }, { "epoch": 0.19564126078677177, "grad_norm": 53.09375, "learning_rate": 9.996178881625257e-07, "loss": 101.9567, "step": 1090 }, { "epoch": 0.197436134738944, "grad_norm": 55.3125, "learning_rate": 9.99614382549338e-07, "loss": 102.2161, "step": 1100 }, { "epoch": 0.1992310086911162, "grad_norm": 60.71875, "learning_rate": 9.996108769361501e-07, "loss": 102.0313, "step": 1110 }, { "epoch": 0.20102588264328844, "grad_norm": 54.28125, "learning_rate": 9.996073713229623e-07, "loss": 101.9621, "step": 1120 }, { "epoch": 0.20282075659546064, "grad_norm": 56.75, "learning_rate": 9.996038657097743e-07, "loss": 102.8201, "step": 1130 }, { "epoch": 0.20461563054763288, "grad_norm": 60.0, "learning_rate": 9.996003600965867e-07, "loss": 101.1011, "step": 1140 }, { "epoch": 0.20641050449980508, "grad_norm": 56.59375, "learning_rate": 9.99596854483399e-07, "loss": 102.2005, "step": 1150 }, { "epoch": 0.2082053784519773, "grad_norm": 60.875, "learning_rate": 9.995933488702109e-07, "loss": 102.4762, "step": 1160 }, { "epoch": 0.21000025240414952, "grad_norm": 61.65625, "learning_rate": 9.99589843257023e-07, "loss": 101.5841, "step": 1170 }, { "epoch": 0.21179512635632175, "grad_norm": 51.375, "learning_rate": 9.995863376438353e-07, "loss": 101.5104, "step": 1180 }, { "epoch": 0.21359000030849395, "grad_norm": 62.5625, "learning_rate": 9.995828320306475e-07, "loss": 102.6215, "step": 1190 }, { "epoch": 0.2153848742606662, "grad_norm": 57.25, "learning_rate": 9.995793264174597e-07, "loss": 100.9618, "step": 1200 }, { "epoch": 0.2171797482128384, "grad_norm": 53.40625, "learning_rate": 9.995758208042716e-07, "loss": 102.1445, "step": 1210 }, { "epoch": 0.21897462216501062, "grad_norm": 61.5, "learning_rate": 9.995723151910838e-07, "loss": 102.3304, "step": 1220 }, { "epoch": 0.22076949611718283, "grad_norm": 58.09375, "learning_rate": 9.99568809577896e-07, "loss": 103.4687, "step": 1230 }, { "epoch": 0.22256437006935506, "grad_norm": 56.5625, "learning_rate": 9.995653039647082e-07, "loss": 102.9005, "step": 1240 }, { "epoch": 0.22435924402152727, "grad_norm": 53.53125, "learning_rate": 9.995617983515204e-07, "loss": 101.6564, "step": 1250 }, { "epoch": 0.2261541179736995, "grad_norm": 55.03125, "learning_rate": 9.995582927383326e-07, "loss": 102.7027, "step": 1260 }, { "epoch": 0.2279489919258717, "grad_norm": 56.09375, "learning_rate": 9.995547871251448e-07, "loss": 102.3057, "step": 1270 }, { "epoch": 0.22974386587804393, "grad_norm": 111.0, "learning_rate": 9.995512815119568e-07, "loss": 102.2795, "step": 1280 }, { "epoch": 0.23153873983021614, "grad_norm": 53.25, "learning_rate": 9.99547775898769e-07, "loss": 102.1083, "step": 1290 }, { "epoch": 0.23333361378238837, "grad_norm": 56.4375, "learning_rate": 9.995442702855812e-07, "loss": 101.5716, "step": 1300 }, { "epoch": 0.23512848773456058, "grad_norm": 56.21875, "learning_rate": 9.995407646723934e-07, "loss": 102.0678, "step": 1310 }, { "epoch": 0.2369233616867328, "grad_norm": 56.4375, "learning_rate": 9.995372590592056e-07, "loss": 101.759, "step": 1320 }, { "epoch": 0.238718235638905, "grad_norm": 57.5625, "learning_rate": 9.995337534460178e-07, "loss": 101.938, "step": 1330 }, { "epoch": 0.24051310959107725, "grad_norm": 52.96875, "learning_rate": 9.9953024783283e-07, "loss": 100.721, "step": 1340 }, { "epoch": 0.24230798354324945, "grad_norm": 55.09375, "learning_rate": 9.995267422196421e-07, "loss": 101.3094, "step": 1350 }, { "epoch": 0.24410285749542168, "grad_norm": 59.125, "learning_rate": 9.995232366064541e-07, "loss": 101.9257, "step": 1360 }, { "epoch": 0.2458977314475939, "grad_norm": 54.5625, "learning_rate": 9.995197309932663e-07, "loss": 102.262, "step": 1370 }, { "epoch": 0.24769260539976612, "grad_norm": 59.0625, "learning_rate": 9.995162253800785e-07, "loss": 101.121, "step": 1380 }, { "epoch": 0.24948747935193832, "grad_norm": 56.90625, "learning_rate": 9.995127197668907e-07, "loss": 103.3174, "step": 1390 }, { "epoch": 0.25128235330411053, "grad_norm": 61.15625, "learning_rate": 9.99509214153703e-07, "loss": 101.7487, "step": 1400 }, { "epoch": 0.25307722725628273, "grad_norm": 56.84375, "learning_rate": 9.99505708540515e-07, "loss": 102.6137, "step": 1410 }, { "epoch": 0.254872101208455, "grad_norm": 56.1875, "learning_rate": 9.995022029273273e-07, "loss": 101.4798, "step": 1420 }, { "epoch": 0.2566669751606272, "grad_norm": 53.3125, "learning_rate": 9.994986973141395e-07, "loss": 102.564, "step": 1430 }, { "epoch": 0.2584618491127994, "grad_norm": 60.71875, "learning_rate": 9.994951917009515e-07, "loss": 101.3267, "step": 1440 }, { "epoch": 0.2602567230649716, "grad_norm": 63.96875, "learning_rate": 9.994916860877637e-07, "loss": 101.9713, "step": 1450 }, { "epoch": 0.26205159701714387, "grad_norm": 56.0625, "learning_rate": 9.994881804745759e-07, "loss": 102.2705, "step": 1460 }, { "epoch": 0.26384647096931607, "grad_norm": 59.4375, "learning_rate": 9.99484674861388e-07, "loss": 101.6555, "step": 1470 }, { "epoch": 0.2656413449214883, "grad_norm": 57.3125, "learning_rate": 9.994811692482e-07, "loss": 100.9821, "step": 1480 }, { "epoch": 0.2674362188736605, "grad_norm": 57.25, "learning_rate": 9.994776636350122e-07, "loss": 100.6823, "step": 1490 }, { "epoch": 0.26923109282583274, "grad_norm": 52.0, "learning_rate": 9.994741580218246e-07, "loss": 101.6088, "step": 1500 }, { "epoch": 0.27102596677800495, "grad_norm": 57.9375, "learning_rate": 9.994706524086366e-07, "loss": 102.9636, "step": 1510 }, { "epoch": 0.27282084073017715, "grad_norm": 52.0625, "learning_rate": 9.994671467954488e-07, "loss": 100.9283, "step": 1520 }, { "epoch": 0.27461571468234935, "grad_norm": 60.125, "learning_rate": 9.99463641182261e-07, "loss": 102.0222, "step": 1530 }, { "epoch": 0.2764105886345216, "grad_norm": 56.1875, "learning_rate": 9.994601355690732e-07, "loss": 100.9025, "step": 1540 }, { "epoch": 0.2782054625866938, "grad_norm": 53.375, "learning_rate": 9.994566299558854e-07, "loss": 100.8715, "step": 1550 }, { "epoch": 0.280000336538866, "grad_norm": 55.59375, "learning_rate": 9.994531243426974e-07, "loss": 102.0768, "step": 1560 }, { "epoch": 0.28179521049103823, "grad_norm": 67.75, "learning_rate": 9.994496187295096e-07, "loss": 102.5791, "step": 1570 }, { "epoch": 0.2835900844432105, "grad_norm": 57.21875, "learning_rate": 9.994461131163218e-07, "loss": 102.531, "step": 1580 }, { "epoch": 0.2853849583953827, "grad_norm": 60.40625, "learning_rate": 9.99442607503134e-07, "loss": 102.0928, "step": 1590 }, { "epoch": 0.2871798323475549, "grad_norm": 53.8125, "learning_rate": 9.994391018899462e-07, "loss": 101.2671, "step": 1600 }, { "epoch": 0.2889747062997271, "grad_norm": 55.4375, "learning_rate": 9.994355962767583e-07, "loss": 102.0377, "step": 1610 }, { "epoch": 0.29076958025189936, "grad_norm": 57.15625, "learning_rate": 9.994320906635705e-07, "loss": 102.7935, "step": 1620 }, { "epoch": 0.29256445420407157, "grad_norm": 59.1875, "learning_rate": 9.994285850503827e-07, "loss": 101.0506, "step": 1630 }, { "epoch": 0.29435932815624377, "grad_norm": 59.75, "learning_rate": 9.994250794371947e-07, "loss": 101.6031, "step": 1640 }, { "epoch": 0.296154202108416, "grad_norm": 58.28125, "learning_rate": 9.99421573824007e-07, "loss": 102.0068, "step": 1650 }, { "epoch": 0.29794907606058824, "grad_norm": 60.875, "learning_rate": 9.994180682108191e-07, "loss": 101.5614, "step": 1660 }, { "epoch": 0.29974395001276044, "grad_norm": 52.03125, "learning_rate": 9.994145625976313e-07, "loss": 101.9509, "step": 1670 }, { "epoch": 0.30153882396493265, "grad_norm": 51.125, "learning_rate": 9.994110569844435e-07, "loss": 100.5649, "step": 1680 }, { "epoch": 0.30333369791710485, "grad_norm": 55.8125, "learning_rate": 9.994075513712557e-07, "loss": 102.3571, "step": 1690 }, { "epoch": 0.3051285718692771, "grad_norm": 55.75, "learning_rate": 9.994040457580679e-07, "loss": 102.2168, "step": 1700 }, { "epoch": 0.3069234458214493, "grad_norm": 58.46875, "learning_rate": 9.994005401448799e-07, "loss": 101.87, "step": 1710 }, { "epoch": 0.3087183197736215, "grad_norm": 52.96875, "learning_rate": 9.99397034531692e-07, "loss": 102.0437, "step": 1720 }, { "epoch": 0.3105131937257937, "grad_norm": 57.09375, "learning_rate": 9.993935289185043e-07, "loss": 100.8123, "step": 1730 }, { "epoch": 0.312308067677966, "grad_norm": 57.78125, "learning_rate": 9.993900233053164e-07, "loss": 101.3681, "step": 1740 }, { "epoch": 0.3141029416301382, "grad_norm": 54.78125, "learning_rate": 9.993865176921286e-07, "loss": 102.2902, "step": 1750 }, { "epoch": 0.3158978155823104, "grad_norm": 62.1875, "learning_rate": 9.993830120789406e-07, "loss": 100.0665, "step": 1760 }, { "epoch": 0.3176926895344826, "grad_norm": 54.84375, "learning_rate": 9.993795064657528e-07, "loss": 101.723, "step": 1770 }, { "epoch": 0.31948756348665486, "grad_norm": 56.65625, "learning_rate": 9.993760008525652e-07, "loss": 100.8694, "step": 1780 }, { "epoch": 0.32128243743882706, "grad_norm": 55.96875, "learning_rate": 9.993724952393772e-07, "loss": 100.4624, "step": 1790 }, { "epoch": 0.32307731139099927, "grad_norm": 53.96875, "learning_rate": 9.993689896261894e-07, "loss": 101.9399, "step": 1800 }, { "epoch": 0.32487218534317147, "grad_norm": 54.375, "learning_rate": 9.993654840130016e-07, "loss": 100.8476, "step": 1810 }, { "epoch": 0.32666705929534373, "grad_norm": 56.34375, "learning_rate": 9.993619783998138e-07, "loss": 100.3165, "step": 1820 }, { "epoch": 0.32846193324751594, "grad_norm": 56.9375, "learning_rate": 9.99358472786626e-07, "loss": 101.3776, "step": 1830 }, { "epoch": 0.33025680719968814, "grad_norm": 55.375, "learning_rate": 9.99354967173438e-07, "loss": 101.8946, "step": 1840 }, { "epoch": 0.33205168115186035, "grad_norm": 57.5, "learning_rate": 9.993514615602502e-07, "loss": 100.4206, "step": 1850 }, { "epoch": 0.33384655510403255, "grad_norm": 54.25, "learning_rate": 9.993479559470624e-07, "loss": 101.984, "step": 1860 }, { "epoch": 0.3356414290562048, "grad_norm": 57.40625, "learning_rate": 9.993444503338745e-07, "loss": 101.568, "step": 1870 }, { "epoch": 0.337436303008377, "grad_norm": 52.40625, "learning_rate": 9.993409447206867e-07, "loss": 101.8589, "step": 1880 }, { "epoch": 0.3392311769605492, "grad_norm": 53.65625, "learning_rate": 9.99337439107499e-07, "loss": 101.0918, "step": 1890 }, { "epoch": 0.3410260509127214, "grad_norm": 57.46875, "learning_rate": 9.993339334943111e-07, "loss": 100.8577, "step": 1900 }, { "epoch": 0.3428209248648937, "grad_norm": 55.03125, "learning_rate": 9.993304278811231e-07, "loss": 101.4831, "step": 1910 }, { "epoch": 0.3446157988170659, "grad_norm": 58.5625, "learning_rate": 9.993269222679353e-07, "loss": 100.9822, "step": 1920 }, { "epoch": 0.3464106727692381, "grad_norm": 53.3125, "learning_rate": 9.993234166547475e-07, "loss": 101.5429, "step": 1930 }, { "epoch": 0.3482055467214103, "grad_norm": 55.4375, "learning_rate": 9.993199110415597e-07, "loss": 100.7439, "step": 1940 }, { "epoch": 0.35000042067358256, "grad_norm": 54.59375, "learning_rate": 9.993164054283719e-07, "loss": 101.6689, "step": 1950 }, { "epoch": 0.35179529462575476, "grad_norm": 52.5625, "learning_rate": 9.99312899815184e-07, "loss": 101.7466, "step": 1960 }, { "epoch": 0.35359016857792697, "grad_norm": 56.625, "learning_rate": 9.993093942019963e-07, "loss": 101.7688, "step": 1970 }, { "epoch": 0.35538504253009917, "grad_norm": 59.78125, "learning_rate": 9.993058885888085e-07, "loss": 103.273, "step": 1980 }, { "epoch": 0.35717991648227143, "grad_norm": 54.53125, "learning_rate": 9.993023829756205e-07, "loss": 100.4256, "step": 1990 }, { "epoch": 0.35897479043444364, "grad_norm": 57.40625, "learning_rate": 9.992988773624327e-07, "loss": 101.7477, "step": 2000 }, { "epoch": 0.35897479043444364, "eval_loss": 1.5870920419692993, "eval_runtime": 199.616, "eval_samples_per_second": 1465.674, "eval_steps_per_second": 45.803, "step": 2000 } ], "logging_steps": 10, "max_steps": 5571, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.52223315549225e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }