{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9999242787551428,
"eval_steps": 2000,
"global_step": 5571,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0017948739521722182,
"grad_norm": 70.875,
"learning_rate": 9.999964943868121e-07,
"loss": 113.0671,
"step": 10
},
{
"epoch": 0.0035897479043444365,
"grad_norm": 62.15625,
"learning_rate": 9.999929887736243e-07,
"loss": 109.1635,
"step": 20
},
{
"epoch": 0.0053846218565166545,
"grad_norm": 53.625,
"learning_rate": 9.999894831604363e-07,
"loss": 109.8683,
"step": 30
},
{
"epoch": 0.007179495808688873,
"grad_norm": 65.6875,
"learning_rate": 9.999859775472485e-07,
"loss": 108.1756,
"step": 40
},
{
"epoch": 0.00897436976086109,
"grad_norm": 53.0625,
"learning_rate": 9.99982471934061e-07,
"loss": 106.7566,
"step": 50
},
{
"epoch": 0.010769243713033309,
"grad_norm": 59.84375,
"learning_rate": 9.99978966320873e-07,
"loss": 107.5178,
"step": 60
},
{
"epoch": 0.012564117665205527,
"grad_norm": 59.46875,
"learning_rate": 9.99975460707685e-07,
"loss": 106.8143,
"step": 70
},
{
"epoch": 0.014358991617377746,
"grad_norm": 58.96875,
"learning_rate": 9.999719550944973e-07,
"loss": 106.2242,
"step": 80
},
{
"epoch": 0.016153865569549963,
"grad_norm": 57.46875,
"learning_rate": 9.999684494813095e-07,
"loss": 105.5488,
"step": 90
},
{
"epoch": 0.01794873952172218,
"grad_norm": 54.65625,
"learning_rate": 9.999649438681217e-07,
"loss": 105.1425,
"step": 100
},
{
"epoch": 0.0197436134738944,
"grad_norm": 55.78125,
"learning_rate": 9.999614382549337e-07,
"loss": 105.8918,
"step": 110
},
{
"epoch": 0.021538487426066618,
"grad_norm": 54.53125,
"learning_rate": 9.999579326417459e-07,
"loss": 105.4892,
"step": 120
},
{
"epoch": 0.023333361378238836,
"grad_norm": 57.6875,
"learning_rate": 9.99954427028558e-07,
"loss": 105.5813,
"step": 130
},
{
"epoch": 0.025128235330411055,
"grad_norm": 55.3125,
"learning_rate": 9.999509214153702e-07,
"loss": 105.1202,
"step": 140
},
{
"epoch": 0.026923109282583273,
"grad_norm": 54.21875,
"learning_rate": 9.999474158021824e-07,
"loss": 105.9377,
"step": 150
},
{
"epoch": 0.028717983234755492,
"grad_norm": 52.5625,
"learning_rate": 9.999439101889946e-07,
"loss": 104.7342,
"step": 160
},
{
"epoch": 0.03051285718692771,
"grad_norm": 59.9375,
"learning_rate": 9.999404045758068e-07,
"loss": 104.9382,
"step": 170
},
{
"epoch": 0.032307731139099925,
"grad_norm": 55.65625,
"learning_rate": 9.999368989626188e-07,
"loss": 106.4299,
"step": 180
},
{
"epoch": 0.034102605091272144,
"grad_norm": 53.6875,
"learning_rate": 9.99933393349431e-07,
"loss": 104.4395,
"step": 190
},
{
"epoch": 0.03589747904344436,
"grad_norm": 54.8125,
"learning_rate": 9.999298877362432e-07,
"loss": 104.3946,
"step": 200
},
{
"epoch": 0.03769235299561658,
"grad_norm": 55.9375,
"learning_rate": 9.999263821230554e-07,
"loss": 105.1693,
"step": 210
},
{
"epoch": 0.0394872269477888,
"grad_norm": 55.03125,
"learning_rate": 9.999228765098676e-07,
"loss": 104.7132,
"step": 220
},
{
"epoch": 0.04128210089996102,
"grad_norm": 71.0625,
"learning_rate": 9.999193708966798e-07,
"loss": 104.7171,
"step": 230
},
{
"epoch": 0.043076974852133236,
"grad_norm": 59.40625,
"learning_rate": 9.99915865283492e-07,
"loss": 104.5917,
"step": 240
},
{
"epoch": 0.044871848804305454,
"grad_norm": 53.84375,
"learning_rate": 9.999123596703042e-07,
"loss": 104.3239,
"step": 250
},
{
"epoch": 0.04666672275647767,
"grad_norm": 59.375,
"learning_rate": 9.999088540571162e-07,
"loss": 104.0901,
"step": 260
},
{
"epoch": 0.04846159670864989,
"grad_norm": 56.0,
"learning_rate": 9.999053484439284e-07,
"loss": 104.5291,
"step": 270
},
{
"epoch": 0.05025647066082211,
"grad_norm": 64.4375,
"learning_rate": 9.999018428307405e-07,
"loss": 104.5856,
"step": 280
},
{
"epoch": 0.05205134461299433,
"grad_norm": 79.25,
"learning_rate": 9.998983372175527e-07,
"loss": 104.3207,
"step": 290
},
{
"epoch": 0.05384621856516655,
"grad_norm": 64.375,
"learning_rate": 9.99894831604365e-07,
"loss": 104.9624,
"step": 300
},
{
"epoch": 0.055641092517338765,
"grad_norm": 58.84375,
"learning_rate": 9.99891325991177e-07,
"loss": 104.5753,
"step": 310
},
{
"epoch": 0.057435966469510984,
"grad_norm": 61.21875,
"learning_rate": 9.998878203779893e-07,
"loss": 104.478,
"step": 320
},
{
"epoch": 0.0592308404216832,
"grad_norm": 62.5,
"learning_rate": 9.998843147648013e-07,
"loss": 103.2508,
"step": 330
},
{
"epoch": 0.06102571437385542,
"grad_norm": 56.8125,
"learning_rate": 9.998808091516135e-07,
"loss": 104.3786,
"step": 340
},
{
"epoch": 0.06282058832602763,
"grad_norm": 55.21875,
"learning_rate": 9.998773035384257e-07,
"loss": 103.974,
"step": 350
},
{
"epoch": 0.06461546227819985,
"grad_norm": 61.71875,
"learning_rate": 9.998737979252379e-07,
"loss": 104.481,
"step": 360
},
{
"epoch": 0.06641033623037207,
"grad_norm": 53.78125,
"learning_rate": 9.9987029231205e-07,
"loss": 104.1711,
"step": 370
},
{
"epoch": 0.06820521018254429,
"grad_norm": 54.375,
"learning_rate": 9.99866786698862e-07,
"loss": 103.1215,
"step": 380
},
{
"epoch": 0.0700000841347165,
"grad_norm": 65.6875,
"learning_rate": 9.998632810856743e-07,
"loss": 105.1524,
"step": 390
},
{
"epoch": 0.07179495808688872,
"grad_norm": 58.46875,
"learning_rate": 9.998597754724865e-07,
"loss": 104.1496,
"step": 400
},
{
"epoch": 0.07358983203906094,
"grad_norm": 56.28125,
"learning_rate": 9.998562698592986e-07,
"loss": 103.0084,
"step": 410
},
{
"epoch": 0.07538470599123316,
"grad_norm": 58.9375,
"learning_rate": 9.998527642461108e-07,
"loss": 102.782,
"step": 420
},
{
"epoch": 0.07717957994340538,
"grad_norm": 53.28125,
"learning_rate": 9.99849258632923e-07,
"loss": 104.664,
"step": 430
},
{
"epoch": 0.0789744538955776,
"grad_norm": 55.53125,
"learning_rate": 9.998457530197352e-07,
"loss": 103.6033,
"step": 440
},
{
"epoch": 0.08076932784774982,
"grad_norm": 54.6875,
"learning_rate": 9.998422474065474e-07,
"loss": 103.2984,
"step": 450
},
{
"epoch": 0.08256420179992204,
"grad_norm": 58.75,
"learning_rate": 9.998387417933594e-07,
"loss": 103.1222,
"step": 460
},
{
"epoch": 0.08435907575209425,
"grad_norm": 54.65625,
"learning_rate": 9.998352361801716e-07,
"loss": 104.3975,
"step": 470
},
{
"epoch": 0.08615394970426647,
"grad_norm": 62.0625,
"learning_rate": 9.998317305669838e-07,
"loss": 103.6788,
"step": 480
},
{
"epoch": 0.08794882365643869,
"grad_norm": 54.75,
"learning_rate": 9.99828224953796e-07,
"loss": 103.4202,
"step": 490
},
{
"epoch": 0.08974369760861091,
"grad_norm": 58.3125,
"learning_rate": 9.998247193406082e-07,
"loss": 102.8853,
"step": 500
},
{
"epoch": 0.09153857156078313,
"grad_norm": 56.21875,
"learning_rate": 9.998212137274204e-07,
"loss": 103.1342,
"step": 510
},
{
"epoch": 0.09333344551295535,
"grad_norm": 57.71875,
"learning_rate": 9.998177081142326e-07,
"loss": 103.0709,
"step": 520
},
{
"epoch": 0.09512831946512756,
"grad_norm": 60.4375,
"learning_rate": 9.998142025010446e-07,
"loss": 104.1919,
"step": 530
},
{
"epoch": 0.09692319341729978,
"grad_norm": 58.34375,
"learning_rate": 9.998106968878567e-07,
"loss": 104.5302,
"step": 540
},
{
"epoch": 0.098718067369472,
"grad_norm": 57.875,
"learning_rate": 9.99807191274669e-07,
"loss": 103.7714,
"step": 550
},
{
"epoch": 0.10051294132164422,
"grad_norm": 62.46875,
"learning_rate": 9.998036856614811e-07,
"loss": 104.633,
"step": 560
},
{
"epoch": 0.10230781527381644,
"grad_norm": 53.8125,
"learning_rate": 9.998001800482933e-07,
"loss": 102.6445,
"step": 570
},
{
"epoch": 0.10410268922598866,
"grad_norm": 63.84375,
"learning_rate": 9.997966744351053e-07,
"loss": 104.0129,
"step": 580
},
{
"epoch": 0.10589756317816088,
"grad_norm": 57.34375,
"learning_rate": 9.997931688219175e-07,
"loss": 103.0677,
"step": 590
},
{
"epoch": 0.1076924371303331,
"grad_norm": 57.25,
"learning_rate": 9.9978966320873e-07,
"loss": 103.8832,
"step": 600
},
{
"epoch": 0.10948731108250531,
"grad_norm": 57.90625,
"learning_rate": 9.997861575955419e-07,
"loss": 103.4752,
"step": 610
},
{
"epoch": 0.11128218503467753,
"grad_norm": 62.46875,
"learning_rate": 9.99782651982354e-07,
"loss": 103.0433,
"step": 620
},
{
"epoch": 0.11307705898684975,
"grad_norm": 58.21875,
"learning_rate": 9.997791463691663e-07,
"loss": 102.7304,
"step": 630
},
{
"epoch": 0.11487193293902197,
"grad_norm": 54.53125,
"learning_rate": 9.997756407559785e-07,
"loss": 103.3318,
"step": 640
},
{
"epoch": 0.11666680689119419,
"grad_norm": 60.125,
"learning_rate": 9.997721351427907e-07,
"loss": 103.5509,
"step": 650
},
{
"epoch": 0.1184616808433664,
"grad_norm": 58.59375,
"learning_rate": 9.997686295296027e-07,
"loss": 102.8469,
"step": 660
},
{
"epoch": 0.12025655479553862,
"grad_norm": 58.125,
"learning_rate": 9.997651239164148e-07,
"loss": 104.6024,
"step": 670
},
{
"epoch": 0.12205142874771084,
"grad_norm": 57.78125,
"learning_rate": 9.99761618303227e-07,
"loss": 102.2974,
"step": 680
},
{
"epoch": 0.12384630269988306,
"grad_norm": 53.65625,
"learning_rate": 9.997581126900392e-07,
"loss": 103.0835,
"step": 690
},
{
"epoch": 0.12564117665205526,
"grad_norm": 53.75,
"learning_rate": 9.997546070768514e-07,
"loss": 104.0657,
"step": 700
},
{
"epoch": 0.1274360506042275,
"grad_norm": 57.84375,
"learning_rate": 9.997511014636636e-07,
"loss": 103.865,
"step": 710
},
{
"epoch": 0.1292309245563997,
"grad_norm": 55.1875,
"learning_rate": 9.997475958504758e-07,
"loss": 102.8469,
"step": 720
},
{
"epoch": 0.13102579850857193,
"grad_norm": 54.375,
"learning_rate": 9.997440902372878e-07,
"loss": 103.573,
"step": 730
},
{
"epoch": 0.13282067246074414,
"grad_norm": 55.34375,
"learning_rate": 9.997405846241e-07,
"loss": 102.4788,
"step": 740
},
{
"epoch": 0.13461554641291637,
"grad_norm": 74.1875,
"learning_rate": 9.997370790109122e-07,
"loss": 104.8757,
"step": 750
},
{
"epoch": 0.13641042036508857,
"grad_norm": 67.25,
"learning_rate": 9.997335733977244e-07,
"loss": 103.3008,
"step": 760
},
{
"epoch": 0.1382052943172608,
"grad_norm": 50.625,
"learning_rate": 9.997300677845366e-07,
"loss": 102.5183,
"step": 770
},
{
"epoch": 0.140000168269433,
"grad_norm": 57.34375,
"learning_rate": 9.997265621713488e-07,
"loss": 103.6885,
"step": 780
},
{
"epoch": 0.14179504222160524,
"grad_norm": 57.15625,
"learning_rate": 9.99723056558161e-07,
"loss": 103.321,
"step": 790
},
{
"epoch": 0.14358991617377745,
"grad_norm": 64.0625,
"learning_rate": 9.997195509449732e-07,
"loss": 102.9387,
"step": 800
},
{
"epoch": 0.14538479012594968,
"grad_norm": 61.46875,
"learning_rate": 9.997160453317851e-07,
"loss": 102.8938,
"step": 810
},
{
"epoch": 0.14717966407812189,
"grad_norm": 54.53125,
"learning_rate": 9.997125397185973e-07,
"loss": 102.8753,
"step": 820
},
{
"epoch": 0.14897453803029412,
"grad_norm": 59.34375,
"learning_rate": 9.997090341054095e-07,
"loss": 102.9597,
"step": 830
},
{
"epoch": 0.15076941198246632,
"grad_norm": 55.65625,
"learning_rate": 9.997055284922217e-07,
"loss": 102.3567,
"step": 840
},
{
"epoch": 0.15256428593463855,
"grad_norm": 67.25,
"learning_rate": 9.99702022879034e-07,
"loss": 102.8148,
"step": 850
},
{
"epoch": 0.15435915988681076,
"grad_norm": 53.15625,
"learning_rate": 9.99698517265846e-07,
"loss": 104.3594,
"step": 860
},
{
"epoch": 0.156154033838983,
"grad_norm": 54.8125,
"learning_rate": 9.996950116526583e-07,
"loss": 102.0147,
"step": 870
},
{
"epoch": 0.1579489077911552,
"grad_norm": 58.53125,
"learning_rate": 9.996915060394703e-07,
"loss": 103.0337,
"step": 880
},
{
"epoch": 0.15974378174332743,
"grad_norm": 57.9375,
"learning_rate": 9.996880004262825e-07,
"loss": 103.2189,
"step": 890
},
{
"epoch": 0.16153865569549963,
"grad_norm": 56.3125,
"learning_rate": 9.996844948130947e-07,
"loss": 102.6131,
"step": 900
},
{
"epoch": 0.16333352964767187,
"grad_norm": 56.0,
"learning_rate": 9.996809891999069e-07,
"loss": 102.7586,
"step": 910
},
{
"epoch": 0.16512840359984407,
"grad_norm": 53.0,
"learning_rate": 9.99677483586719e-07,
"loss": 101.981,
"step": 920
},
{
"epoch": 0.16692327755201627,
"grad_norm": 56.4375,
"learning_rate": 9.99673977973531e-07,
"loss": 103.099,
"step": 930
},
{
"epoch": 0.1687181515041885,
"grad_norm": 57.09375,
"learning_rate": 9.996704723603432e-07,
"loss": 103.5702,
"step": 940
},
{
"epoch": 0.1705130254563607,
"grad_norm": 54.46875,
"learning_rate": 9.996669667471554e-07,
"loss": 102.8591,
"step": 950
},
{
"epoch": 0.17230789940853294,
"grad_norm": 56.625,
"learning_rate": 9.996634611339676e-07,
"loss": 102.1516,
"step": 960
},
{
"epoch": 0.17410277336070515,
"grad_norm": 55.0,
"learning_rate": 9.996599555207798e-07,
"loss": 103.4924,
"step": 970
},
{
"epoch": 0.17589764731287738,
"grad_norm": 54.71875,
"learning_rate": 9.99656449907592e-07,
"loss": 102.3699,
"step": 980
},
{
"epoch": 0.17769252126504959,
"grad_norm": 58.625,
"learning_rate": 9.996529442944042e-07,
"loss": 101.9164,
"step": 990
},
{
"epoch": 0.17948739521722182,
"grad_norm": 53.1875,
"learning_rate": 9.996494386812164e-07,
"loss": 102.4251,
"step": 1000
},
{
"epoch": 0.18128226916939402,
"grad_norm": 61.34375,
"learning_rate": 9.996459330680284e-07,
"loss": 101.8229,
"step": 1010
},
{
"epoch": 0.18307714312156625,
"grad_norm": 56.8125,
"learning_rate": 9.996424274548406e-07,
"loss": 102.8062,
"step": 1020
},
{
"epoch": 0.18487201707373846,
"grad_norm": 57.5625,
"learning_rate": 9.996389218416528e-07,
"loss": 102.8688,
"step": 1030
},
{
"epoch": 0.1866668910259107,
"grad_norm": 53.75,
"learning_rate": 9.99635416228465e-07,
"loss": 102.0441,
"step": 1040
},
{
"epoch": 0.1884617649780829,
"grad_norm": 55.3125,
"learning_rate": 9.996319106152772e-07,
"loss": 101.1412,
"step": 1050
},
{
"epoch": 0.19025663893025513,
"grad_norm": 57.1875,
"learning_rate": 9.996284050020894e-07,
"loss": 102.8017,
"step": 1060
},
{
"epoch": 0.19205151288242733,
"grad_norm": 55.09375,
"learning_rate": 9.996248993889016e-07,
"loss": 102.8312,
"step": 1070
},
{
"epoch": 0.19384638683459957,
"grad_norm": 58.8125,
"learning_rate": 9.996213937757135e-07,
"loss": 100.9258,
"step": 1080
},
{
"epoch": 0.19564126078677177,
"grad_norm": 53.09375,
"learning_rate": 9.996178881625257e-07,
"loss": 101.9567,
"step": 1090
},
{
"epoch": 0.197436134738944,
"grad_norm": 55.3125,
"learning_rate": 9.99614382549338e-07,
"loss": 102.2161,
"step": 1100
},
{
"epoch": 0.1992310086911162,
"grad_norm": 60.71875,
"learning_rate": 9.996108769361501e-07,
"loss": 102.0313,
"step": 1110
},
{
"epoch": 0.20102588264328844,
"grad_norm": 54.28125,
"learning_rate": 9.996073713229623e-07,
"loss": 101.9621,
"step": 1120
},
{
"epoch": 0.20282075659546064,
"grad_norm": 56.75,
"learning_rate": 9.996038657097743e-07,
"loss": 102.8201,
"step": 1130
},
{
"epoch": 0.20461563054763288,
"grad_norm": 60.0,
"learning_rate": 9.996003600965867e-07,
"loss": 101.1011,
"step": 1140
},
{
"epoch": 0.20641050449980508,
"grad_norm": 56.59375,
"learning_rate": 9.99596854483399e-07,
"loss": 102.2005,
"step": 1150
},
{
"epoch": 0.2082053784519773,
"grad_norm": 60.875,
"learning_rate": 9.995933488702109e-07,
"loss": 102.4762,
"step": 1160
},
{
"epoch": 0.21000025240414952,
"grad_norm": 61.65625,
"learning_rate": 9.99589843257023e-07,
"loss": 101.5841,
"step": 1170
},
{
"epoch": 0.21179512635632175,
"grad_norm": 51.375,
"learning_rate": 9.995863376438353e-07,
"loss": 101.5104,
"step": 1180
},
{
"epoch": 0.21359000030849395,
"grad_norm": 62.5625,
"learning_rate": 9.995828320306475e-07,
"loss": 102.6215,
"step": 1190
},
{
"epoch": 0.2153848742606662,
"grad_norm": 57.25,
"learning_rate": 9.995793264174597e-07,
"loss": 100.9618,
"step": 1200
},
{
"epoch": 0.2171797482128384,
"grad_norm": 53.40625,
"learning_rate": 9.995758208042716e-07,
"loss": 102.1445,
"step": 1210
},
{
"epoch": 0.21897462216501062,
"grad_norm": 61.5,
"learning_rate": 9.995723151910838e-07,
"loss": 102.3304,
"step": 1220
},
{
"epoch": 0.22076949611718283,
"grad_norm": 58.09375,
"learning_rate": 9.99568809577896e-07,
"loss": 103.4687,
"step": 1230
},
{
"epoch": 0.22256437006935506,
"grad_norm": 56.5625,
"learning_rate": 9.995653039647082e-07,
"loss": 102.9005,
"step": 1240
},
{
"epoch": 0.22435924402152727,
"grad_norm": 53.53125,
"learning_rate": 9.995617983515204e-07,
"loss": 101.6564,
"step": 1250
},
{
"epoch": 0.2261541179736995,
"grad_norm": 55.03125,
"learning_rate": 9.995582927383326e-07,
"loss": 102.7027,
"step": 1260
},
{
"epoch": 0.2279489919258717,
"grad_norm": 56.09375,
"learning_rate": 9.995547871251448e-07,
"loss": 102.3057,
"step": 1270
},
{
"epoch": 0.22974386587804393,
"grad_norm": 111.0,
"learning_rate": 9.995512815119568e-07,
"loss": 102.2795,
"step": 1280
},
{
"epoch": 0.23153873983021614,
"grad_norm": 53.25,
"learning_rate": 9.99547775898769e-07,
"loss": 102.1083,
"step": 1290
},
{
"epoch": 0.23333361378238837,
"grad_norm": 56.4375,
"learning_rate": 9.995442702855812e-07,
"loss": 101.5716,
"step": 1300
},
{
"epoch": 0.23512848773456058,
"grad_norm": 56.21875,
"learning_rate": 9.995407646723934e-07,
"loss": 102.0678,
"step": 1310
},
{
"epoch": 0.2369233616867328,
"grad_norm": 56.4375,
"learning_rate": 9.995372590592056e-07,
"loss": 101.759,
"step": 1320
},
{
"epoch": 0.238718235638905,
"grad_norm": 57.5625,
"learning_rate": 9.995337534460178e-07,
"loss": 101.938,
"step": 1330
},
{
"epoch": 0.24051310959107725,
"grad_norm": 52.96875,
"learning_rate": 9.9953024783283e-07,
"loss": 100.721,
"step": 1340
},
{
"epoch": 0.24230798354324945,
"grad_norm": 55.09375,
"learning_rate": 9.995267422196421e-07,
"loss": 101.3094,
"step": 1350
},
{
"epoch": 0.24410285749542168,
"grad_norm": 59.125,
"learning_rate": 9.995232366064541e-07,
"loss": 101.9257,
"step": 1360
},
{
"epoch": 0.2458977314475939,
"grad_norm": 54.5625,
"learning_rate": 9.995197309932663e-07,
"loss": 102.262,
"step": 1370
},
{
"epoch": 0.24769260539976612,
"grad_norm": 59.0625,
"learning_rate": 9.995162253800785e-07,
"loss": 101.121,
"step": 1380
},
{
"epoch": 0.24948747935193832,
"grad_norm": 56.90625,
"learning_rate": 9.995127197668907e-07,
"loss": 103.3174,
"step": 1390
},
{
"epoch": 0.25128235330411053,
"grad_norm": 61.15625,
"learning_rate": 9.99509214153703e-07,
"loss": 101.7487,
"step": 1400
},
{
"epoch": 0.25307722725628273,
"grad_norm": 56.84375,
"learning_rate": 9.99505708540515e-07,
"loss": 102.6137,
"step": 1410
},
{
"epoch": 0.254872101208455,
"grad_norm": 56.1875,
"learning_rate": 9.995022029273273e-07,
"loss": 101.4798,
"step": 1420
},
{
"epoch": 0.2566669751606272,
"grad_norm": 53.3125,
"learning_rate": 9.994986973141395e-07,
"loss": 102.564,
"step": 1430
},
{
"epoch": 0.2584618491127994,
"grad_norm": 60.71875,
"learning_rate": 9.994951917009515e-07,
"loss": 101.3267,
"step": 1440
},
{
"epoch": 0.2602567230649716,
"grad_norm": 63.96875,
"learning_rate": 9.994916860877637e-07,
"loss": 101.9713,
"step": 1450
},
{
"epoch": 0.26205159701714387,
"grad_norm": 56.0625,
"learning_rate": 9.994881804745759e-07,
"loss": 102.2705,
"step": 1460
},
{
"epoch": 0.26384647096931607,
"grad_norm": 59.4375,
"learning_rate": 9.99484674861388e-07,
"loss": 101.6555,
"step": 1470
},
{
"epoch": 0.2656413449214883,
"grad_norm": 57.3125,
"learning_rate": 9.994811692482e-07,
"loss": 100.9821,
"step": 1480
},
{
"epoch": 0.2674362188736605,
"grad_norm": 57.25,
"learning_rate": 9.994776636350122e-07,
"loss": 100.6823,
"step": 1490
},
{
"epoch": 0.26923109282583274,
"grad_norm": 52.0,
"learning_rate": 9.994741580218246e-07,
"loss": 101.6088,
"step": 1500
},
{
"epoch": 0.27102596677800495,
"grad_norm": 57.9375,
"learning_rate": 9.994706524086366e-07,
"loss": 102.9636,
"step": 1510
},
{
"epoch": 0.27282084073017715,
"grad_norm": 52.0625,
"learning_rate": 9.994671467954488e-07,
"loss": 100.9283,
"step": 1520
},
{
"epoch": 0.27461571468234935,
"grad_norm": 60.125,
"learning_rate": 9.99463641182261e-07,
"loss": 102.0222,
"step": 1530
},
{
"epoch": 0.2764105886345216,
"grad_norm": 56.1875,
"learning_rate": 9.994601355690732e-07,
"loss": 100.9025,
"step": 1540
},
{
"epoch": 0.2782054625866938,
"grad_norm": 53.375,
"learning_rate": 9.994566299558854e-07,
"loss": 100.8715,
"step": 1550
},
{
"epoch": 0.280000336538866,
"grad_norm": 55.59375,
"learning_rate": 9.994531243426974e-07,
"loss": 102.0768,
"step": 1560
},
{
"epoch": 0.28179521049103823,
"grad_norm": 67.75,
"learning_rate": 9.994496187295096e-07,
"loss": 102.5791,
"step": 1570
},
{
"epoch": 0.2835900844432105,
"grad_norm": 57.21875,
"learning_rate": 9.994461131163218e-07,
"loss": 102.531,
"step": 1580
},
{
"epoch": 0.2853849583953827,
"grad_norm": 60.40625,
"learning_rate": 9.99442607503134e-07,
"loss": 102.0928,
"step": 1590
},
{
"epoch": 0.2871798323475549,
"grad_norm": 53.8125,
"learning_rate": 9.994391018899462e-07,
"loss": 101.2671,
"step": 1600
},
{
"epoch": 0.2889747062997271,
"grad_norm": 55.4375,
"learning_rate": 9.994355962767583e-07,
"loss": 102.0377,
"step": 1610
},
{
"epoch": 0.29076958025189936,
"grad_norm": 57.15625,
"learning_rate": 9.994320906635705e-07,
"loss": 102.7935,
"step": 1620
},
{
"epoch": 0.29256445420407157,
"grad_norm": 59.1875,
"learning_rate": 9.994285850503827e-07,
"loss": 101.0506,
"step": 1630
},
{
"epoch": 0.29435932815624377,
"grad_norm": 59.75,
"learning_rate": 9.994250794371947e-07,
"loss": 101.6031,
"step": 1640
},
{
"epoch": 0.296154202108416,
"grad_norm": 58.28125,
"learning_rate": 9.99421573824007e-07,
"loss": 102.0068,
"step": 1650
},
{
"epoch": 0.29794907606058824,
"grad_norm": 60.875,
"learning_rate": 9.994180682108191e-07,
"loss": 101.5614,
"step": 1660
},
{
"epoch": 0.29974395001276044,
"grad_norm": 52.03125,
"learning_rate": 9.994145625976313e-07,
"loss": 101.9509,
"step": 1670
},
{
"epoch": 0.30153882396493265,
"grad_norm": 51.125,
"learning_rate": 9.994110569844435e-07,
"loss": 100.5649,
"step": 1680
},
{
"epoch": 0.30333369791710485,
"grad_norm": 55.8125,
"learning_rate": 9.994075513712557e-07,
"loss": 102.3571,
"step": 1690
},
{
"epoch": 0.3051285718692771,
"grad_norm": 55.75,
"learning_rate": 9.994040457580679e-07,
"loss": 102.2168,
"step": 1700
},
{
"epoch": 0.3069234458214493,
"grad_norm": 58.46875,
"learning_rate": 9.994005401448799e-07,
"loss": 101.87,
"step": 1710
},
{
"epoch": 0.3087183197736215,
"grad_norm": 52.96875,
"learning_rate": 9.99397034531692e-07,
"loss": 102.0437,
"step": 1720
},
{
"epoch": 0.3105131937257937,
"grad_norm": 57.09375,
"learning_rate": 9.993935289185043e-07,
"loss": 100.8123,
"step": 1730
},
{
"epoch": 0.312308067677966,
"grad_norm": 57.78125,
"learning_rate": 9.993900233053164e-07,
"loss": 101.3681,
"step": 1740
},
{
"epoch": 0.3141029416301382,
"grad_norm": 54.78125,
"learning_rate": 9.993865176921286e-07,
"loss": 102.2902,
"step": 1750
},
{
"epoch": 0.3158978155823104,
"grad_norm": 62.1875,
"learning_rate": 9.993830120789406e-07,
"loss": 100.0665,
"step": 1760
},
{
"epoch": 0.3176926895344826,
"grad_norm": 54.84375,
"learning_rate": 9.993795064657528e-07,
"loss": 101.723,
"step": 1770
},
{
"epoch": 0.31948756348665486,
"grad_norm": 56.65625,
"learning_rate": 9.993760008525652e-07,
"loss": 100.8694,
"step": 1780
},
{
"epoch": 0.32128243743882706,
"grad_norm": 55.96875,
"learning_rate": 9.993724952393772e-07,
"loss": 100.4624,
"step": 1790
},
{
"epoch": 0.32307731139099927,
"grad_norm": 53.96875,
"learning_rate": 9.993689896261894e-07,
"loss": 101.9399,
"step": 1800
},
{
"epoch": 0.32487218534317147,
"grad_norm": 54.375,
"learning_rate": 9.993654840130016e-07,
"loss": 100.8476,
"step": 1810
},
{
"epoch": 0.32666705929534373,
"grad_norm": 56.34375,
"learning_rate": 9.993619783998138e-07,
"loss": 100.3165,
"step": 1820
},
{
"epoch": 0.32846193324751594,
"grad_norm": 56.9375,
"learning_rate": 9.99358472786626e-07,
"loss": 101.3776,
"step": 1830
},
{
"epoch": 0.33025680719968814,
"grad_norm": 55.375,
"learning_rate": 9.99354967173438e-07,
"loss": 101.8946,
"step": 1840
},
{
"epoch": 0.33205168115186035,
"grad_norm": 57.5,
"learning_rate": 9.993514615602502e-07,
"loss": 100.4206,
"step": 1850
},
{
"epoch": 0.33384655510403255,
"grad_norm": 54.25,
"learning_rate": 9.993479559470624e-07,
"loss": 101.984,
"step": 1860
},
{
"epoch": 0.3356414290562048,
"grad_norm": 57.40625,
"learning_rate": 9.993444503338745e-07,
"loss": 101.568,
"step": 1870
},
{
"epoch": 0.337436303008377,
"grad_norm": 52.40625,
"learning_rate": 9.993409447206867e-07,
"loss": 101.8589,
"step": 1880
},
{
"epoch": 0.3392311769605492,
"grad_norm": 53.65625,
"learning_rate": 9.99337439107499e-07,
"loss": 101.0918,
"step": 1890
},
{
"epoch": 0.3410260509127214,
"grad_norm": 57.46875,
"learning_rate": 9.993339334943111e-07,
"loss": 100.8577,
"step": 1900
},
{
"epoch": 0.3428209248648937,
"grad_norm": 55.03125,
"learning_rate": 9.993304278811231e-07,
"loss": 101.4831,
"step": 1910
},
{
"epoch": 0.3446157988170659,
"grad_norm": 58.5625,
"learning_rate": 9.993269222679353e-07,
"loss": 100.9822,
"step": 1920
},
{
"epoch": 0.3464106727692381,
"grad_norm": 53.3125,
"learning_rate": 9.993234166547475e-07,
"loss": 101.5429,
"step": 1930
},
{
"epoch": 0.3482055467214103,
"grad_norm": 55.4375,
"learning_rate": 9.993199110415597e-07,
"loss": 100.7439,
"step": 1940
},
{
"epoch": 0.35000042067358256,
"grad_norm": 54.59375,
"learning_rate": 9.993164054283719e-07,
"loss": 101.6689,
"step": 1950
},
{
"epoch": 0.35179529462575476,
"grad_norm": 52.5625,
"learning_rate": 9.99312899815184e-07,
"loss": 101.7466,
"step": 1960
},
{
"epoch": 0.35359016857792697,
"grad_norm": 56.625,
"learning_rate": 9.993093942019963e-07,
"loss": 101.7688,
"step": 1970
},
{
"epoch": 0.35538504253009917,
"grad_norm": 59.78125,
"learning_rate": 9.993058885888085e-07,
"loss": 103.273,
"step": 1980
},
{
"epoch": 0.35717991648227143,
"grad_norm": 54.53125,
"learning_rate": 9.993023829756205e-07,
"loss": 100.4256,
"step": 1990
},
{
"epoch": 0.35897479043444364,
"grad_norm": 57.40625,
"learning_rate": 9.992988773624327e-07,
"loss": 101.7477,
"step": 2000
},
{
"epoch": 0.35897479043444364,
"eval_loss": 1.5870920419692993,
"eval_runtime": 199.616,
"eval_samples_per_second": 1465.674,
"eval_steps_per_second": 45.803,
"step": 2000
},
{
"epoch": 0.36076966438661584,
"grad_norm": 57.59375,
"learning_rate": 9.992953717492448e-07,
"loss": 102.2176,
"step": 2010
},
{
"epoch": 0.36256453833878804,
"grad_norm": 62.09375,
"learning_rate": 9.99291866136057e-07,
"loss": 100.658,
"step": 2020
},
{
"epoch": 0.3643594122909603,
"grad_norm": 57.59375,
"learning_rate": 9.992883605228692e-07,
"loss": 101.9662,
"step": 2030
},
{
"epoch": 0.3661542862431325,
"grad_norm": 55.3125,
"learning_rate": 9.992848549096812e-07,
"loss": 101.5475,
"step": 2040
},
{
"epoch": 0.3679491601953047,
"grad_norm": 55.15625,
"learning_rate": 9.992813492964936e-07,
"loss": 101.7496,
"step": 2050
},
{
"epoch": 0.3697440341474769,
"grad_norm": 53.78125,
"learning_rate": 9.992778436833056e-07,
"loss": 101.7626,
"step": 2060
},
{
"epoch": 0.3715389080996492,
"grad_norm": 55.53125,
"learning_rate": 9.992743380701178e-07,
"loss": 102.3083,
"step": 2070
},
{
"epoch": 0.3733337820518214,
"grad_norm": 56.65625,
"learning_rate": 9.9927083245693e-07,
"loss": 101.3427,
"step": 2080
},
{
"epoch": 0.3751286560039936,
"grad_norm": 61.375,
"learning_rate": 9.992673268437422e-07,
"loss": 100.4862,
"step": 2090
},
{
"epoch": 0.3769235299561658,
"grad_norm": 84.5,
"learning_rate": 9.992638212305544e-07,
"loss": 101.2625,
"step": 2100
},
{
"epoch": 0.37871840390833805,
"grad_norm": 60.75,
"learning_rate": 9.992603156173664e-07,
"loss": 101.8961,
"step": 2110
},
{
"epoch": 0.38051327786051026,
"grad_norm": 56.8125,
"learning_rate": 9.992568100041786e-07,
"loss": 100.7833,
"step": 2120
},
{
"epoch": 0.38230815181268246,
"grad_norm": 56.0,
"learning_rate": 9.992533043909908e-07,
"loss": 101.3713,
"step": 2130
},
{
"epoch": 0.38410302576485467,
"grad_norm": 54.625,
"learning_rate": 9.99249798777803e-07,
"loss": 101.6192,
"step": 2140
},
{
"epoch": 0.3858978997170269,
"grad_norm": 61.1875,
"learning_rate": 9.992462931646151e-07,
"loss": 101.346,
"step": 2150
},
{
"epoch": 0.38769277366919913,
"grad_norm": 58.40625,
"learning_rate": 9.992427875514273e-07,
"loss": 101.0517,
"step": 2160
},
{
"epoch": 0.38948764762137134,
"grad_norm": 58.78125,
"learning_rate": 9.992392819382395e-07,
"loss": 101.618,
"step": 2170
},
{
"epoch": 0.39128252157354354,
"grad_norm": 55.03125,
"learning_rate": 9.992357763250517e-07,
"loss": 101.7176,
"step": 2180
},
{
"epoch": 0.3930773955257158,
"grad_norm": 58.40625,
"learning_rate": 9.992322707118637e-07,
"loss": 99.9457,
"step": 2190
},
{
"epoch": 0.394872269477888,
"grad_norm": 53.0,
"learning_rate": 9.99228765098676e-07,
"loss": 101.6802,
"step": 2200
},
{
"epoch": 0.3966671434300602,
"grad_norm": 53.40625,
"learning_rate": 9.99225259485488e-07,
"loss": 100.4554,
"step": 2210
},
{
"epoch": 0.3984620173822324,
"grad_norm": 57.6875,
"learning_rate": 9.992217538723003e-07,
"loss": 99.7695,
"step": 2220
},
{
"epoch": 0.4002568913344047,
"grad_norm": 57.1875,
"learning_rate": 9.992182482591125e-07,
"loss": 100.7215,
"step": 2230
},
{
"epoch": 0.4020517652865769,
"grad_norm": 55.21875,
"learning_rate": 9.992147426459247e-07,
"loss": 102.5693,
"step": 2240
},
{
"epoch": 0.4038466392387491,
"grad_norm": 52.4375,
"learning_rate": 9.992112370327369e-07,
"loss": 101.3943,
"step": 2250
},
{
"epoch": 0.4056415131909213,
"grad_norm": 56.9375,
"learning_rate": 9.992077314195489e-07,
"loss": 102.1974,
"step": 2260
},
{
"epoch": 0.40743638714309355,
"grad_norm": 61.71875,
"learning_rate": 9.99204225806361e-07,
"loss": 100.8492,
"step": 2270
},
{
"epoch": 0.40923126109526575,
"grad_norm": 56.9375,
"learning_rate": 9.992007201931732e-07,
"loss": 100.8497,
"step": 2280
},
{
"epoch": 0.41102613504743796,
"grad_norm": 57.71875,
"learning_rate": 9.991972145799854e-07,
"loss": 101.1414,
"step": 2290
},
{
"epoch": 0.41282100899961016,
"grad_norm": 52.625,
"learning_rate": 9.991937089667976e-07,
"loss": 101.3806,
"step": 2300
},
{
"epoch": 0.41461588295178237,
"grad_norm": 54.25,
"learning_rate": 9.991902033536096e-07,
"loss": 101.2679,
"step": 2310
},
{
"epoch": 0.4164107569039546,
"grad_norm": 55.59375,
"learning_rate": 9.99186697740422e-07,
"loss": 100.5695,
"step": 2320
},
{
"epoch": 0.41820563085612683,
"grad_norm": 64.75,
"learning_rate": 9.991831921272342e-07,
"loss": 100.9625,
"step": 2330
},
{
"epoch": 0.42000050480829904,
"grad_norm": 59.78125,
"learning_rate": 9.991796865140462e-07,
"loss": 100.6983,
"step": 2340
},
{
"epoch": 0.42179537876047124,
"grad_norm": 54.46875,
"learning_rate": 9.991761809008584e-07,
"loss": 100.4711,
"step": 2350
},
{
"epoch": 0.4235902527126435,
"grad_norm": 50.9375,
"learning_rate": 9.991726752876706e-07,
"loss": 101.1268,
"step": 2360
},
{
"epoch": 0.4253851266648157,
"grad_norm": 54.40625,
"learning_rate": 9.991691696744828e-07,
"loss": 100.953,
"step": 2370
},
{
"epoch": 0.4271800006169879,
"grad_norm": 60.71875,
"learning_rate": 9.99165664061295e-07,
"loss": 101.4937,
"step": 2380
},
{
"epoch": 0.4289748745691601,
"grad_norm": 59.15625,
"learning_rate": 9.99162158448107e-07,
"loss": 98.9683,
"step": 2390
},
{
"epoch": 0.4307697485213324,
"grad_norm": 57.4375,
"learning_rate": 9.991586528349191e-07,
"loss": 101.705,
"step": 2400
},
{
"epoch": 0.4325646224735046,
"grad_norm": 59.15625,
"learning_rate": 9.991551472217313e-07,
"loss": 100.5508,
"step": 2410
},
{
"epoch": 0.4343594964256768,
"grad_norm": 54.5,
"learning_rate": 9.991516416085435e-07,
"loss": 101.0069,
"step": 2420
},
{
"epoch": 0.436154370377849,
"grad_norm": 56.125,
"learning_rate": 9.991481359953557e-07,
"loss": 101.6237,
"step": 2430
},
{
"epoch": 0.43794924433002125,
"grad_norm": 52.78125,
"learning_rate": 9.99144630382168e-07,
"loss": 100.7629,
"step": 2440
},
{
"epoch": 0.43974411828219345,
"grad_norm": 57.28125,
"learning_rate": 9.991411247689801e-07,
"loss": 99.8311,
"step": 2450
},
{
"epoch": 0.44153899223436566,
"grad_norm": 61.125,
"learning_rate": 9.99137619155792e-07,
"loss": 100.6114,
"step": 2460
},
{
"epoch": 0.44333386618653786,
"grad_norm": 57.1875,
"learning_rate": 9.991341135426043e-07,
"loss": 101.98,
"step": 2470
},
{
"epoch": 0.4451287401387101,
"grad_norm": 59.90625,
"learning_rate": 9.991306079294165e-07,
"loss": 101.4968,
"step": 2480
},
{
"epoch": 0.4469236140908823,
"grad_norm": 63.375,
"learning_rate": 9.991271023162287e-07,
"loss": 101.6943,
"step": 2490
},
{
"epoch": 0.44871848804305453,
"grad_norm": 59.21875,
"learning_rate": 9.991235967030409e-07,
"loss": 100.9488,
"step": 2500
},
{
"epoch": 0.45051336199522674,
"grad_norm": 58.5625,
"learning_rate": 9.99120091089853e-07,
"loss": 100.7646,
"step": 2510
},
{
"epoch": 0.452308235947399,
"grad_norm": 62.40625,
"learning_rate": 9.991165854766653e-07,
"loss": 101.5989,
"step": 2520
},
{
"epoch": 0.4541031098995712,
"grad_norm": 57.3125,
"learning_rate": 9.991130798634775e-07,
"loss": 101.8999,
"step": 2530
},
{
"epoch": 0.4558979838517434,
"grad_norm": 52.6875,
"learning_rate": 9.991095742502894e-07,
"loss": 99.8278,
"step": 2540
},
{
"epoch": 0.4576928578039156,
"grad_norm": 53.46875,
"learning_rate": 9.991060686371016e-07,
"loss": 100.9303,
"step": 2550
},
{
"epoch": 0.45948773175608787,
"grad_norm": 55.21875,
"learning_rate": 9.991025630239138e-07,
"loss": 99.6612,
"step": 2560
},
{
"epoch": 0.4612826057082601,
"grad_norm": 57.4375,
"learning_rate": 9.99099057410726e-07,
"loss": 102.1943,
"step": 2570
},
{
"epoch": 0.4630774796604323,
"grad_norm": 55.15625,
"learning_rate": 9.990955517975382e-07,
"loss": 101.1615,
"step": 2580
},
{
"epoch": 0.4648723536126045,
"grad_norm": 56.90625,
"learning_rate": 9.990920461843504e-07,
"loss": 100.7708,
"step": 2590
},
{
"epoch": 0.46666722756477674,
"grad_norm": 59.8125,
"learning_rate": 9.990885405711626e-07,
"loss": 99.9972,
"step": 2600
},
{
"epoch": 0.46846210151694895,
"grad_norm": 58.53125,
"learning_rate": 9.990850349579746e-07,
"loss": 100.8372,
"step": 2610
},
{
"epoch": 0.47025697546912115,
"grad_norm": 62.8125,
"learning_rate": 9.990815293447868e-07,
"loss": 99.2942,
"step": 2620
},
{
"epoch": 0.47205184942129336,
"grad_norm": 58.0,
"learning_rate": 9.99078023731599e-07,
"loss": 100.7167,
"step": 2630
},
{
"epoch": 0.4738467233734656,
"grad_norm": 56.28125,
"learning_rate": 9.990745181184112e-07,
"loss": 101.028,
"step": 2640
},
{
"epoch": 0.4756415973256378,
"grad_norm": 56.0,
"learning_rate": 9.990710125052234e-07,
"loss": 100.8192,
"step": 2650
},
{
"epoch": 0.47743647127781,
"grad_norm": 55.625,
"learning_rate": 9.990675068920353e-07,
"loss": 100.6052,
"step": 2660
},
{
"epoch": 0.47923134522998223,
"grad_norm": 55.03125,
"learning_rate": 9.990640012788475e-07,
"loss": 100.9563,
"step": 2670
},
{
"epoch": 0.4810262191821545,
"grad_norm": 58.625,
"learning_rate": 9.9906049566566e-07,
"loss": 100.1024,
"step": 2680
},
{
"epoch": 0.4828210931343267,
"grad_norm": 60.96875,
"learning_rate": 9.99056990052472e-07,
"loss": 101.6276,
"step": 2690
},
{
"epoch": 0.4846159670864989,
"grad_norm": 53.53125,
"learning_rate": 9.990534844392841e-07,
"loss": 100.4488,
"step": 2700
},
{
"epoch": 0.4864108410386711,
"grad_norm": 57.78125,
"learning_rate": 9.990499788260963e-07,
"loss": 100.9927,
"step": 2710
},
{
"epoch": 0.48820571499084336,
"grad_norm": 56.125,
"learning_rate": 9.990464732129085e-07,
"loss": 101.7882,
"step": 2720
},
{
"epoch": 0.49000058894301557,
"grad_norm": 55.71875,
"learning_rate": 9.990429675997207e-07,
"loss": 99.6567,
"step": 2730
},
{
"epoch": 0.4917954628951878,
"grad_norm": 57.53125,
"learning_rate": 9.990394619865327e-07,
"loss": 101.4762,
"step": 2740
},
{
"epoch": 0.49359033684736,
"grad_norm": 53.96875,
"learning_rate": 9.990359563733449e-07,
"loss": 100.3731,
"step": 2750
},
{
"epoch": 0.49538521079953224,
"grad_norm": 61.0,
"learning_rate": 9.99032450760157e-07,
"loss": 99.6628,
"step": 2760
},
{
"epoch": 0.49718008475170444,
"grad_norm": 55.875,
"learning_rate": 9.990289451469693e-07,
"loss": 100.9941,
"step": 2770
},
{
"epoch": 0.49897495870387665,
"grad_norm": 53.40625,
"learning_rate": 9.990254395337815e-07,
"loss": 101.2516,
"step": 2780
},
{
"epoch": 0.5007698326560489,
"grad_norm": 62.5625,
"learning_rate": 9.990219339205937e-07,
"loss": 100.9763,
"step": 2790
},
{
"epoch": 0.5025647066082211,
"grad_norm": 54.78125,
"learning_rate": 9.990184283074059e-07,
"loss": 99.8499,
"step": 2800
},
{
"epoch": 0.5043595805603933,
"grad_norm": 59.125,
"learning_rate": 9.990149226942178e-07,
"loss": 100.2553,
"step": 2810
},
{
"epoch": 0.5061544545125655,
"grad_norm": 54.625,
"learning_rate": 9.9901141708103e-07,
"loss": 100.0908,
"step": 2820
},
{
"epoch": 0.5079493284647377,
"grad_norm": 53.875,
"learning_rate": 9.990079114678422e-07,
"loss": 99.8332,
"step": 2830
},
{
"epoch": 0.50974420241691,
"grad_norm": 67.3125,
"learning_rate": 9.990044058546544e-07,
"loss": 101.4204,
"step": 2840
},
{
"epoch": 0.5115390763690821,
"grad_norm": 57.8125,
"learning_rate": 9.990009002414666e-07,
"loss": 100.5515,
"step": 2850
},
{
"epoch": 0.5133339503212544,
"grad_norm": 55.96875,
"learning_rate": 9.989973946282786e-07,
"loss": 100.3896,
"step": 2860
},
{
"epoch": 0.5151288242734267,
"grad_norm": 57.8125,
"learning_rate": 9.98993889015091e-07,
"loss": 101.921,
"step": 2870
},
{
"epoch": 0.5169236982255988,
"grad_norm": 51.40625,
"learning_rate": 9.989903834019032e-07,
"loss": 100.1915,
"step": 2880
},
{
"epoch": 0.5187185721777711,
"grad_norm": 57.25,
"learning_rate": 9.989868777887152e-07,
"loss": 100.3448,
"step": 2890
},
{
"epoch": 0.5205134461299432,
"grad_norm": 52.40625,
"learning_rate": 9.989833721755274e-07,
"loss": 100.6079,
"step": 2900
},
{
"epoch": 0.5223083200821155,
"grad_norm": 53.28125,
"learning_rate": 9.989798665623396e-07,
"loss": 101.4967,
"step": 2910
},
{
"epoch": 0.5241031940342877,
"grad_norm": 54.40625,
"learning_rate": 9.989763609491518e-07,
"loss": 100.2131,
"step": 2920
},
{
"epoch": 0.5258980679864599,
"grad_norm": 60.59375,
"learning_rate": 9.98972855335964e-07,
"loss": 100.2743,
"step": 2930
},
{
"epoch": 0.5276929419386321,
"grad_norm": 55.5625,
"learning_rate": 9.98969349722776e-07,
"loss": 100.4026,
"step": 2940
},
{
"epoch": 0.5294878158908044,
"grad_norm": 60.125,
"learning_rate": 9.989658441095881e-07,
"loss": 101.1321,
"step": 2950
},
{
"epoch": 0.5312826898429766,
"grad_norm": 56.125,
"learning_rate": 9.989623384964005e-07,
"loss": 100.6127,
"step": 2960
},
{
"epoch": 0.5330775637951488,
"grad_norm": 56.0,
"learning_rate": 9.989588328832125e-07,
"loss": 100.822,
"step": 2970
},
{
"epoch": 0.534872437747321,
"grad_norm": 58.65625,
"learning_rate": 9.989553272700247e-07,
"loss": 100.8803,
"step": 2980
},
{
"epoch": 0.5366673116994932,
"grad_norm": 52.875,
"learning_rate": 9.98951821656837e-07,
"loss": 100.7196,
"step": 2990
},
{
"epoch": 0.5384621856516655,
"grad_norm": 57.3125,
"learning_rate": 9.98948316043649e-07,
"loss": 100.049,
"step": 3000
},
{
"epoch": 0.5402570596038376,
"grad_norm": 54.96875,
"learning_rate": 9.989448104304613e-07,
"loss": 99.7241,
"step": 3010
},
{
"epoch": 0.5420519335560099,
"grad_norm": 57.15625,
"learning_rate": 9.989413048172733e-07,
"loss": 100.1751,
"step": 3020
},
{
"epoch": 0.5438468075081822,
"grad_norm": 56.375,
"learning_rate": 9.989377992040855e-07,
"loss": 100.9655,
"step": 3030
},
{
"epoch": 0.5456416814603543,
"grad_norm": 52.09375,
"learning_rate": 9.989342935908977e-07,
"loss": 101.2898,
"step": 3040
},
{
"epoch": 0.5474365554125266,
"grad_norm": 54.625,
"learning_rate": 9.989307879777099e-07,
"loss": 100.3283,
"step": 3050
},
{
"epoch": 0.5492314293646987,
"grad_norm": 55.21875,
"learning_rate": 9.98927282364522e-07,
"loss": 99.0049,
"step": 3060
},
{
"epoch": 0.551026303316871,
"grad_norm": 55.4375,
"learning_rate": 9.989237767513343e-07,
"loss": 100.6654,
"step": 3070
},
{
"epoch": 0.5528211772690432,
"grad_norm": 55.96875,
"learning_rate": 9.989202711381464e-07,
"loss": 100.3892,
"step": 3080
},
{
"epoch": 0.5546160512212154,
"grad_norm": 61.875,
"learning_rate": 9.989167655249584e-07,
"loss": 99.8484,
"step": 3090
},
{
"epoch": 0.5564109251733876,
"grad_norm": 49.375,
"learning_rate": 9.989132599117706e-07,
"loss": 100.2157,
"step": 3100
},
{
"epoch": 0.5582057991255599,
"grad_norm": 53.5625,
"learning_rate": 9.989097542985828e-07,
"loss": 99.1748,
"step": 3110
},
{
"epoch": 0.560000673077732,
"grad_norm": 55.25,
"learning_rate": 9.98906248685395e-07,
"loss": 100.8326,
"step": 3120
},
{
"epoch": 0.5617955470299043,
"grad_norm": 57.71875,
"learning_rate": 9.989027430722072e-07,
"loss": 100.5236,
"step": 3130
},
{
"epoch": 0.5635904209820765,
"grad_norm": 56.6875,
"learning_rate": 9.988992374590194e-07,
"loss": 99.4909,
"step": 3140
},
{
"epoch": 0.5653852949342487,
"grad_norm": 56.90625,
"learning_rate": 9.988957318458316e-07,
"loss": 99.4755,
"step": 3150
},
{
"epoch": 0.567180168886421,
"grad_norm": 56.375,
"learning_rate": 9.988922262326438e-07,
"loss": 99.6661,
"step": 3160
},
{
"epoch": 0.5689750428385931,
"grad_norm": 56.875,
"learning_rate": 9.988887206194558e-07,
"loss": 100.5558,
"step": 3170
},
{
"epoch": 0.5707699167907654,
"grad_norm": 64.125,
"learning_rate": 9.98885215006268e-07,
"loss": 100.403,
"step": 3180
},
{
"epoch": 0.5725647907429376,
"grad_norm": 55.53125,
"learning_rate": 9.988817093930802e-07,
"loss": 101.0716,
"step": 3190
},
{
"epoch": 0.5743596646951098,
"grad_norm": 54.625,
"learning_rate": 9.988782037798924e-07,
"loss": 99.6999,
"step": 3200
},
{
"epoch": 0.576154538647282,
"grad_norm": 63.03125,
"learning_rate": 9.988746981667045e-07,
"loss": 100.4395,
"step": 3210
},
{
"epoch": 0.5779494125994542,
"grad_norm": 59.0,
"learning_rate": 9.988711925535165e-07,
"loss": 101.8542,
"step": 3220
},
{
"epoch": 0.5797442865516265,
"grad_norm": 58.46875,
"learning_rate": 9.98867686940329e-07,
"loss": 99.4703,
"step": 3230
},
{
"epoch": 0.5815391605037987,
"grad_norm": 54.3125,
"learning_rate": 9.98864181327141e-07,
"loss": 101.0656,
"step": 3240
},
{
"epoch": 0.5833340344559709,
"grad_norm": 54.03125,
"learning_rate": 9.988606757139531e-07,
"loss": 100.0247,
"step": 3250
},
{
"epoch": 0.5851289084081431,
"grad_norm": 57.0625,
"learning_rate": 9.988571701007653e-07,
"loss": 100.5174,
"step": 3260
},
{
"epoch": 0.5869237823603153,
"grad_norm": 56.1875,
"learning_rate": 9.988536644875775e-07,
"loss": 100.5157,
"step": 3270
},
{
"epoch": 0.5887186563124875,
"grad_norm": 51.6875,
"learning_rate": 9.988501588743897e-07,
"loss": 99.3698,
"step": 3280
},
{
"epoch": 0.5905135302646598,
"grad_norm": 60.5625,
"learning_rate": 9.988466532612017e-07,
"loss": 100.1518,
"step": 3290
},
{
"epoch": 0.592308404216832,
"grad_norm": 59.40625,
"learning_rate": 9.988431476480139e-07,
"loss": 100.5745,
"step": 3300
},
{
"epoch": 0.5941032781690042,
"grad_norm": 54.5,
"learning_rate": 9.98839642034826e-07,
"loss": 101.8106,
"step": 3310
},
{
"epoch": 0.5958981521211765,
"grad_norm": 61.03125,
"learning_rate": 9.988361364216383e-07,
"loss": 100.4201,
"step": 3320
},
{
"epoch": 0.5976930260733486,
"grad_norm": 57.15625,
"learning_rate": 9.988326308084505e-07,
"loss": 100.6724,
"step": 3330
},
{
"epoch": 0.5994879000255209,
"grad_norm": 51.03125,
"learning_rate": 9.988291251952626e-07,
"loss": 100.1729,
"step": 3340
},
{
"epoch": 0.601282773977693,
"grad_norm": 57.71875,
"learning_rate": 9.988256195820748e-07,
"loss": 100.661,
"step": 3350
},
{
"epoch": 0.6030776479298653,
"grad_norm": 57.25,
"learning_rate": 9.98822113968887e-07,
"loss": 101.6807,
"step": 3360
},
{
"epoch": 0.6048725218820376,
"grad_norm": 56.625,
"learning_rate": 9.98818608355699e-07,
"loss": 101.2,
"step": 3370
},
{
"epoch": 0.6066673958342097,
"grad_norm": 60.5,
"learning_rate": 9.988151027425112e-07,
"loss": 99.7801,
"step": 3380
},
{
"epoch": 0.608462269786382,
"grad_norm": 51.375,
"learning_rate": 9.988115971293234e-07,
"loss": 100.2877,
"step": 3390
},
{
"epoch": 0.6102571437385542,
"grad_norm": 53.40625,
"learning_rate": 9.988080915161356e-07,
"loss": 99.2708,
"step": 3400
},
{
"epoch": 0.6120520176907264,
"grad_norm": 51.84375,
"learning_rate": 9.988045859029478e-07,
"loss": 100.1983,
"step": 3410
},
{
"epoch": 0.6138468916428986,
"grad_norm": 57.28125,
"learning_rate": 9.9880108028976e-07,
"loss": 99.6957,
"step": 3420
},
{
"epoch": 0.6156417655950708,
"grad_norm": 60.71875,
"learning_rate": 9.987975746765722e-07,
"loss": 100.4678,
"step": 3430
},
{
"epoch": 0.617436639547243,
"grad_norm": 56.84375,
"learning_rate": 9.987940690633842e-07,
"loss": 101.1054,
"step": 3440
},
{
"epoch": 0.6192315134994153,
"grad_norm": 55.1875,
"learning_rate": 9.987905634501964e-07,
"loss": 100.2326,
"step": 3450
},
{
"epoch": 0.6210263874515874,
"grad_norm": 59.375,
"learning_rate": 9.987870578370086e-07,
"loss": 101.0535,
"step": 3460
},
{
"epoch": 0.6228212614037597,
"grad_norm": 58.40625,
"learning_rate": 9.987835522238207e-07,
"loss": 100.6926,
"step": 3470
},
{
"epoch": 0.624616135355932,
"grad_norm": 60.25,
"learning_rate": 9.98780046610633e-07,
"loss": 101.5863,
"step": 3480
},
{
"epoch": 0.6264110093081041,
"grad_norm": 61.15625,
"learning_rate": 9.98776540997445e-07,
"loss": 100.7781,
"step": 3490
},
{
"epoch": 0.6282058832602764,
"grad_norm": 55.96875,
"learning_rate": 9.987730353842573e-07,
"loss": 101.0874,
"step": 3500
},
{
"epoch": 0.6300007572124485,
"grad_norm": 59.59375,
"learning_rate": 9.987695297710695e-07,
"loss": 100.5733,
"step": 3510
},
{
"epoch": 0.6317956311646208,
"grad_norm": 55.71875,
"learning_rate": 9.987660241578815e-07,
"loss": 100.3865,
"step": 3520
},
{
"epoch": 0.633590505116793,
"grad_norm": 60.1875,
"learning_rate": 9.987625185446937e-07,
"loss": 100.3602,
"step": 3530
},
{
"epoch": 0.6353853790689652,
"grad_norm": 56.625,
"learning_rate": 9.98759012931506e-07,
"loss": 99.8301,
"step": 3540
},
{
"epoch": 0.6371802530211375,
"grad_norm": 59.84375,
"learning_rate": 9.98755507318318e-07,
"loss": 101.2679,
"step": 3550
},
{
"epoch": 0.6389751269733097,
"grad_norm": 53.71875,
"learning_rate": 9.987520017051303e-07,
"loss": 100.4626,
"step": 3560
},
{
"epoch": 0.6407700009254819,
"grad_norm": 65.5625,
"learning_rate": 9.987484960919423e-07,
"loss": 98.8636,
"step": 3570
},
{
"epoch": 0.6425648748776541,
"grad_norm": 59.15625,
"learning_rate": 9.987449904787545e-07,
"loss": 99.7406,
"step": 3580
},
{
"epoch": 0.6443597488298263,
"grad_norm": 54.8125,
"learning_rate": 9.987414848655667e-07,
"loss": 99.1887,
"step": 3590
},
{
"epoch": 0.6461546227819985,
"grad_norm": 56.875,
"learning_rate": 9.987379792523788e-07,
"loss": 101.342,
"step": 3600
},
{
"epoch": 0.6479494967341708,
"grad_norm": 55.6875,
"learning_rate": 9.98734473639191e-07,
"loss": 99.9434,
"step": 3610
},
{
"epoch": 0.6497443706863429,
"grad_norm": 60.59375,
"learning_rate": 9.987309680260032e-07,
"loss": 99.4815,
"step": 3620
},
{
"epoch": 0.6515392446385152,
"grad_norm": 51.53125,
"learning_rate": 9.987274624128154e-07,
"loss": 99.7587,
"step": 3630
},
{
"epoch": 0.6533341185906875,
"grad_norm": 60.0,
"learning_rate": 9.987239567996274e-07,
"loss": 101.2723,
"step": 3640
},
{
"epoch": 0.6551289925428596,
"grad_norm": 63.8125,
"learning_rate": 9.987204511864396e-07,
"loss": 100.9252,
"step": 3650
},
{
"epoch": 0.6569238664950319,
"grad_norm": 56.75,
"learning_rate": 9.987169455732518e-07,
"loss": 100.5552,
"step": 3660
},
{
"epoch": 0.658718740447204,
"grad_norm": 59.71875,
"learning_rate": 9.98713439960064e-07,
"loss": 100.8731,
"step": 3670
},
{
"epoch": 0.6605136143993763,
"grad_norm": 51.9375,
"learning_rate": 9.987099343468762e-07,
"loss": 100.687,
"step": 3680
},
{
"epoch": 0.6623084883515485,
"grad_norm": 56.96875,
"learning_rate": 9.987064287336884e-07,
"loss": 99.6145,
"step": 3690
},
{
"epoch": 0.6641033623037207,
"grad_norm": 57.1875,
"learning_rate": 9.987029231205006e-07,
"loss": 99.7202,
"step": 3700
},
{
"epoch": 0.665898236255893,
"grad_norm": 56.03125,
"learning_rate": 9.986994175073128e-07,
"loss": 100.4039,
"step": 3710
},
{
"epoch": 0.6676931102080651,
"grad_norm": 57.875,
"learning_rate": 9.986959118941248e-07,
"loss": 100.2563,
"step": 3720
},
{
"epoch": 0.6694879841602374,
"grad_norm": 56.15625,
"learning_rate": 9.98692406280937e-07,
"loss": 100.1503,
"step": 3730
},
{
"epoch": 0.6712828581124096,
"grad_norm": 58.0625,
"learning_rate": 9.986889006677491e-07,
"loss": 100.3172,
"step": 3740
},
{
"epoch": 0.6730777320645818,
"grad_norm": 54.53125,
"learning_rate": 9.986853950545613e-07,
"loss": 99.0521,
"step": 3750
},
{
"epoch": 0.674872606016754,
"grad_norm": 67.8125,
"learning_rate": 9.986818894413735e-07,
"loss": 101.304,
"step": 3760
},
{
"epoch": 0.6766674799689263,
"grad_norm": 53.90625,
"learning_rate": 9.986783838281857e-07,
"loss": 99.0437,
"step": 3770
},
{
"epoch": 0.6784623539210984,
"grad_norm": 53.40625,
"learning_rate": 9.98674878214998e-07,
"loss": 100.0527,
"step": 3780
},
{
"epoch": 0.6802572278732707,
"grad_norm": 55.90625,
"learning_rate": 9.9867137260181e-07,
"loss": 100.6432,
"step": 3790
},
{
"epoch": 0.6820521018254428,
"grad_norm": 53.5625,
"learning_rate": 9.98667866988622e-07,
"loss": 100.5706,
"step": 3800
},
{
"epoch": 0.6838469757776151,
"grad_norm": 61.46875,
"learning_rate": 9.986643613754343e-07,
"loss": 99.6143,
"step": 3810
},
{
"epoch": 0.6856418497297874,
"grad_norm": 57.9375,
"learning_rate": 9.986608557622465e-07,
"loss": 100.0116,
"step": 3820
},
{
"epoch": 0.6874367236819595,
"grad_norm": 54.40625,
"learning_rate": 9.986573501490587e-07,
"loss": 100.6161,
"step": 3830
},
{
"epoch": 0.6892315976341318,
"grad_norm": 54.6875,
"learning_rate": 9.986538445358707e-07,
"loss": 100.3408,
"step": 3840
},
{
"epoch": 0.691026471586304,
"grad_norm": 59.40625,
"learning_rate": 9.986503389226829e-07,
"loss": 99.8375,
"step": 3850
},
{
"epoch": 0.6928213455384762,
"grad_norm": 55.28125,
"learning_rate": 9.986468333094953e-07,
"loss": 100.2615,
"step": 3860
},
{
"epoch": 0.6946162194906484,
"grad_norm": 62.71875,
"learning_rate": 9.986433276963072e-07,
"loss": 101.2431,
"step": 3870
},
{
"epoch": 0.6964110934428206,
"grad_norm": 56.25,
"learning_rate": 9.986398220831194e-07,
"loss": 100.8722,
"step": 3880
},
{
"epoch": 0.6982059673949929,
"grad_norm": 56.28125,
"learning_rate": 9.986363164699316e-07,
"loss": 100.0757,
"step": 3890
},
{
"epoch": 0.7000008413471651,
"grad_norm": 57.1875,
"learning_rate": 9.986328108567438e-07,
"loss": 99.6734,
"step": 3900
},
{
"epoch": 0.7017957152993373,
"grad_norm": 55.15625,
"learning_rate": 9.98629305243556e-07,
"loss": 100.0005,
"step": 3910
},
{
"epoch": 0.7035905892515095,
"grad_norm": 58.0,
"learning_rate": 9.98625799630368e-07,
"loss": 99.7119,
"step": 3920
},
{
"epoch": 0.7053854632036818,
"grad_norm": 57.15625,
"learning_rate": 9.986222940171802e-07,
"loss": 99.9743,
"step": 3930
},
{
"epoch": 0.7071803371558539,
"grad_norm": 53.21875,
"learning_rate": 9.986187884039924e-07,
"loss": 99.9378,
"step": 3940
},
{
"epoch": 0.7089752111080262,
"grad_norm": 55.46875,
"learning_rate": 9.986152827908046e-07,
"loss": 101.0054,
"step": 3950
},
{
"epoch": 0.7107700850601983,
"grad_norm": 53.09375,
"learning_rate": 9.986117771776168e-07,
"loss": 100.5901,
"step": 3960
},
{
"epoch": 0.7125649590123706,
"grad_norm": 58.6875,
"learning_rate": 9.98608271564429e-07,
"loss": 100.1371,
"step": 3970
},
{
"epoch": 0.7143598329645429,
"grad_norm": 55.59375,
"learning_rate": 9.986047659512412e-07,
"loss": 100.4305,
"step": 3980
},
{
"epoch": 0.716154706916715,
"grad_norm": 55.875,
"learning_rate": 9.986012603380532e-07,
"loss": 99.389,
"step": 3990
},
{
"epoch": 0.7179495808688873,
"grad_norm": 60.21875,
"learning_rate": 9.985977547248653e-07,
"loss": 100.2918,
"step": 4000
},
{
"epoch": 0.7179495808688873,
"eval_loss": 1.561510443687439,
"eval_runtime": 193.4272,
"eval_samples_per_second": 1512.569,
"eval_steps_per_second": 47.268,
"step": 4000
},
{
"epoch": 0.7197444548210595,
"grad_norm": 60.78125,
"learning_rate": 9.985942491116775e-07,
"loss": 99.0898,
"step": 4010
},
{
"epoch": 0.7215393287732317,
"grad_norm": 61.78125,
"learning_rate": 9.985907434984897e-07,
"loss": 99.2981,
"step": 4020
},
{
"epoch": 0.7233342027254039,
"grad_norm": 57.75,
"learning_rate": 9.98587237885302e-07,
"loss": 99.7238,
"step": 4030
},
{
"epoch": 0.7251290766775761,
"grad_norm": 53.78125,
"learning_rate": 9.98583732272114e-07,
"loss": 99.6891,
"step": 4040
},
{
"epoch": 0.7269239506297483,
"grad_norm": 59.375,
"learning_rate": 9.985802266589263e-07,
"loss": 100.155,
"step": 4050
},
{
"epoch": 0.7287188245819206,
"grad_norm": 57.375,
"learning_rate": 9.985767210457385e-07,
"loss": 98.5343,
"step": 4060
},
{
"epoch": 0.7305136985340928,
"grad_norm": 57.9375,
"learning_rate": 9.985732154325505e-07,
"loss": 101.0483,
"step": 4070
},
{
"epoch": 0.732308572486265,
"grad_norm": 61.5,
"learning_rate": 9.985697098193627e-07,
"loss": 99.708,
"step": 4080
},
{
"epoch": 0.7341034464384373,
"grad_norm": 52.6875,
"learning_rate": 9.985662042061749e-07,
"loss": 100.2094,
"step": 4090
},
{
"epoch": 0.7358983203906094,
"grad_norm": 54.34375,
"learning_rate": 9.98562698592987e-07,
"loss": 99.3119,
"step": 4100
},
{
"epoch": 0.7376931943427817,
"grad_norm": 52.625,
"learning_rate": 9.985591929797993e-07,
"loss": 98.8817,
"step": 4110
},
{
"epoch": 0.7394880682949538,
"grad_norm": 58.71875,
"learning_rate": 9.985556873666113e-07,
"loss": 100.0632,
"step": 4120
},
{
"epoch": 0.7412829422471261,
"grad_norm": 54.28125,
"learning_rate": 9.985521817534234e-07,
"loss": 100.4009,
"step": 4130
},
{
"epoch": 0.7430778161992984,
"grad_norm": 62.46875,
"learning_rate": 9.985486761402356e-07,
"loss": 99.6302,
"step": 4140
},
{
"epoch": 0.7448726901514705,
"grad_norm": 54.90625,
"learning_rate": 9.985451705270478e-07,
"loss": 98.9977,
"step": 4150
},
{
"epoch": 0.7466675641036428,
"grad_norm": 57.3125,
"learning_rate": 9.9854166491386e-07,
"loss": 99.6041,
"step": 4160
},
{
"epoch": 0.7484624380558149,
"grad_norm": 55.90625,
"learning_rate": 9.985381593006722e-07,
"loss": 99.9616,
"step": 4170
},
{
"epoch": 0.7502573120079872,
"grad_norm": 55.5625,
"learning_rate": 9.985346536874844e-07,
"loss": 99.9101,
"step": 4180
},
{
"epoch": 0.7520521859601594,
"grad_norm": 58.375,
"learning_rate": 9.985311480742964e-07,
"loss": 99.3676,
"step": 4190
},
{
"epoch": 0.7538470599123316,
"grad_norm": 62.0,
"learning_rate": 9.985276424611086e-07,
"loss": 100.3315,
"step": 4200
},
{
"epoch": 0.7556419338645038,
"grad_norm": 55.375,
"learning_rate": 9.985241368479208e-07,
"loss": 100.3022,
"step": 4210
},
{
"epoch": 0.7574368078166761,
"grad_norm": 56.1875,
"learning_rate": 9.98520631234733e-07,
"loss": 99.8273,
"step": 4220
},
{
"epoch": 0.7592316817688483,
"grad_norm": 55.15625,
"learning_rate": 9.985171256215452e-07,
"loss": 99.8944,
"step": 4230
},
{
"epoch": 0.7610265557210205,
"grad_norm": 57.25,
"learning_rate": 9.985136200083574e-07,
"loss": 99.9793,
"step": 4240
},
{
"epoch": 0.7628214296731927,
"grad_norm": 57.65625,
"learning_rate": 9.985101143951696e-07,
"loss": 99.6139,
"step": 4250
},
{
"epoch": 0.7646163036253649,
"grad_norm": 58.09375,
"learning_rate": 9.985066087819818e-07,
"loss": 99.2432,
"step": 4260
},
{
"epoch": 0.7664111775775372,
"grad_norm": 57.84375,
"learning_rate": 9.985031031687937e-07,
"loss": 100.9006,
"step": 4270
},
{
"epoch": 0.7682060515297093,
"grad_norm": 55.875,
"learning_rate": 9.98499597555606e-07,
"loss": 100.518,
"step": 4280
},
{
"epoch": 0.7700009254818816,
"grad_norm": 60.15625,
"learning_rate": 9.984960919424181e-07,
"loss": 99.9677,
"step": 4290
},
{
"epoch": 0.7717957994340539,
"grad_norm": 60.21875,
"learning_rate": 9.984925863292303e-07,
"loss": 100.5463,
"step": 4300
},
{
"epoch": 0.773590673386226,
"grad_norm": 59.5,
"learning_rate": 9.984890807160425e-07,
"loss": 99.2726,
"step": 4310
},
{
"epoch": 0.7753855473383983,
"grad_norm": 63.03125,
"learning_rate": 9.984855751028547e-07,
"loss": 99.198,
"step": 4320
},
{
"epoch": 0.7771804212905704,
"grad_norm": 55.84375,
"learning_rate": 9.98482069489667e-07,
"loss": 99.174,
"step": 4330
},
{
"epoch": 0.7789752952427427,
"grad_norm": 62.21875,
"learning_rate": 9.98478563876479e-07,
"loss": 99.7285,
"step": 4340
},
{
"epoch": 0.7807701691949149,
"grad_norm": 54.125,
"learning_rate": 9.98475058263291e-07,
"loss": 97.6992,
"step": 4350
},
{
"epoch": 0.7825650431470871,
"grad_norm": 64.3125,
"learning_rate": 9.984715526501033e-07,
"loss": 99.3388,
"step": 4360
},
{
"epoch": 0.7843599170992593,
"grad_norm": 56.1875,
"learning_rate": 9.984680470369155e-07,
"loss": 98.7792,
"step": 4370
},
{
"epoch": 0.7861547910514316,
"grad_norm": 57.90625,
"learning_rate": 9.984645414237277e-07,
"loss": 100.1827,
"step": 4380
},
{
"epoch": 0.7879496650036037,
"grad_norm": 56.09375,
"learning_rate": 9.984610358105396e-07,
"loss": 99.409,
"step": 4390
},
{
"epoch": 0.789744538955776,
"grad_norm": 54.78125,
"learning_rate": 9.984575301973518e-07,
"loss": 99.461,
"step": 4400
},
{
"epoch": 0.7915394129079482,
"grad_norm": 56.3125,
"learning_rate": 9.984540245841642e-07,
"loss": 99.5,
"step": 4410
},
{
"epoch": 0.7933342868601204,
"grad_norm": 58.5625,
"learning_rate": 9.984505189709762e-07,
"loss": 99.3468,
"step": 4420
},
{
"epoch": 0.7951291608122927,
"grad_norm": 65.0625,
"learning_rate": 9.984470133577884e-07,
"loss": 99.6309,
"step": 4430
},
{
"epoch": 0.7969240347644648,
"grad_norm": 59.8125,
"learning_rate": 9.984435077446006e-07,
"loss": 99.3963,
"step": 4440
},
{
"epoch": 0.7987189087166371,
"grad_norm": 60.28125,
"learning_rate": 9.984400021314128e-07,
"loss": 98.8352,
"step": 4450
},
{
"epoch": 0.8005137826688093,
"grad_norm": 57.1875,
"learning_rate": 9.98436496518225e-07,
"loss": 99.6084,
"step": 4460
},
{
"epoch": 0.8023086566209815,
"grad_norm": 58.3125,
"learning_rate": 9.98432990905037e-07,
"loss": 99.5552,
"step": 4470
},
{
"epoch": 0.8041035305731538,
"grad_norm": 54.28125,
"learning_rate": 9.984294852918492e-07,
"loss": 99.7207,
"step": 4480
},
{
"epoch": 0.8058984045253259,
"grad_norm": 55.875,
"learning_rate": 9.984259796786614e-07,
"loss": 100.2163,
"step": 4490
},
{
"epoch": 0.8076932784774982,
"grad_norm": 58.46875,
"learning_rate": 9.984224740654736e-07,
"loss": 100.1001,
"step": 4500
},
{
"epoch": 0.8094881524296704,
"grad_norm": 52.09375,
"learning_rate": 9.984189684522858e-07,
"loss": 100.1762,
"step": 4510
},
{
"epoch": 0.8112830263818426,
"grad_norm": 58.0,
"learning_rate": 9.98415462839098e-07,
"loss": 99.485,
"step": 4520
},
{
"epoch": 0.8130779003340148,
"grad_norm": 53.34375,
"learning_rate": 9.984119572259102e-07,
"loss": 99.2043,
"step": 4530
},
{
"epoch": 0.8148727742861871,
"grad_norm": 55.84375,
"learning_rate": 9.984084516127223e-07,
"loss": 100.9975,
"step": 4540
},
{
"epoch": 0.8166676482383592,
"grad_norm": 53.1875,
"learning_rate": 9.984049459995343e-07,
"loss": 100.2588,
"step": 4550
},
{
"epoch": 0.8184625221905315,
"grad_norm": 59.15625,
"learning_rate": 9.984014403863465e-07,
"loss": 101.2972,
"step": 4560
},
{
"epoch": 0.8202573961427037,
"grad_norm": 56.09375,
"learning_rate": 9.983979347731587e-07,
"loss": 99.6443,
"step": 4570
},
{
"epoch": 0.8220522700948759,
"grad_norm": 57.03125,
"learning_rate": 9.98394429159971e-07,
"loss": 99.9172,
"step": 4580
},
{
"epoch": 0.8238471440470482,
"grad_norm": 57.8125,
"learning_rate": 9.983909235467831e-07,
"loss": 99.9183,
"step": 4590
},
{
"epoch": 0.8256420179992203,
"grad_norm": 58.59375,
"learning_rate": 9.983874179335953e-07,
"loss": 101.1427,
"step": 4600
},
{
"epoch": 0.8274368919513926,
"grad_norm": 56.375,
"learning_rate": 9.983839123204075e-07,
"loss": 99.5511,
"step": 4610
},
{
"epoch": 0.8292317659035647,
"grad_norm": 54.84375,
"learning_rate": 9.983804067072195e-07,
"loss": 98.27,
"step": 4620
},
{
"epoch": 0.831026639855737,
"grad_norm": 55.1875,
"learning_rate": 9.983769010940317e-07,
"loss": 99.897,
"step": 4630
},
{
"epoch": 0.8328215138079093,
"grad_norm": 55.59375,
"learning_rate": 9.983733954808439e-07,
"loss": 100.4762,
"step": 4640
},
{
"epoch": 0.8346163877600814,
"grad_norm": 53.53125,
"learning_rate": 9.98369889867656e-07,
"loss": 99.019,
"step": 4650
},
{
"epoch": 0.8364112617122537,
"grad_norm": 55.125,
"learning_rate": 9.983663842544683e-07,
"loss": 99.7545,
"step": 4660
},
{
"epoch": 0.8382061356644259,
"grad_norm": 54.75,
"learning_rate": 9.983628786412802e-07,
"loss": 98.6123,
"step": 4670
},
{
"epoch": 0.8400010096165981,
"grad_norm": 52.1875,
"learning_rate": 9.983593730280926e-07,
"loss": 99.1614,
"step": 4680
},
{
"epoch": 0.8417958835687703,
"grad_norm": 57.0625,
"learning_rate": 9.983558674149048e-07,
"loss": 101.187,
"step": 4690
},
{
"epoch": 0.8435907575209425,
"grad_norm": 54.0,
"learning_rate": 9.983523618017168e-07,
"loss": 99.3741,
"step": 4700
},
{
"epoch": 0.8453856314731147,
"grad_norm": 54.65625,
"learning_rate": 9.98348856188529e-07,
"loss": 98.8455,
"step": 4710
},
{
"epoch": 0.847180505425287,
"grad_norm": 56.03125,
"learning_rate": 9.983453505753412e-07,
"loss": 100.3109,
"step": 4720
},
{
"epoch": 0.8489753793774591,
"grad_norm": 56.90625,
"learning_rate": 9.983418449621534e-07,
"loss": 99.0733,
"step": 4730
},
{
"epoch": 0.8507702533296314,
"grad_norm": 60.03125,
"learning_rate": 9.983383393489656e-07,
"loss": 99.2901,
"step": 4740
},
{
"epoch": 0.8525651272818037,
"grad_norm": 60.375,
"learning_rate": 9.983348337357776e-07,
"loss": 99.2978,
"step": 4750
},
{
"epoch": 0.8543600012339758,
"grad_norm": 52.1875,
"learning_rate": 9.983313281225898e-07,
"loss": 100.2887,
"step": 4760
},
{
"epoch": 0.8561548751861481,
"grad_norm": 58.6875,
"learning_rate": 9.98327822509402e-07,
"loss": 99.9115,
"step": 4770
},
{
"epoch": 0.8579497491383202,
"grad_norm": 56.03125,
"learning_rate": 9.983243168962142e-07,
"loss": 98.9636,
"step": 4780
},
{
"epoch": 0.8597446230904925,
"grad_norm": 64.5625,
"learning_rate": 9.983208112830264e-07,
"loss": 98.904,
"step": 4790
},
{
"epoch": 0.8615394970426647,
"grad_norm": 60.09375,
"learning_rate": 9.983173056698386e-07,
"loss": 99.8028,
"step": 4800
},
{
"epoch": 0.8633343709948369,
"grad_norm": 59.9375,
"learning_rate": 9.983138000566507e-07,
"loss": 99.5853,
"step": 4810
},
{
"epoch": 0.8651292449470092,
"grad_norm": 54.34375,
"learning_rate": 9.983102944434627e-07,
"loss": 98.7931,
"step": 4820
},
{
"epoch": 0.8669241188991814,
"grad_norm": 53.03125,
"learning_rate": 9.98306788830275e-07,
"loss": 99.695,
"step": 4830
},
{
"epoch": 0.8687189928513536,
"grad_norm": 54.34375,
"learning_rate": 9.983032832170871e-07,
"loss": 99.7535,
"step": 4840
},
{
"epoch": 0.8705138668035258,
"grad_norm": 58.84375,
"learning_rate": 9.982997776038993e-07,
"loss": 99.5457,
"step": 4850
},
{
"epoch": 0.872308740755698,
"grad_norm": 54.0,
"learning_rate": 9.982962719907115e-07,
"loss": 99.1976,
"step": 4860
},
{
"epoch": 0.8741036147078702,
"grad_norm": 52.1875,
"learning_rate": 9.982927663775237e-07,
"loss": 98.987,
"step": 4870
},
{
"epoch": 0.8758984886600425,
"grad_norm": 55.0625,
"learning_rate": 9.982892607643359e-07,
"loss": 99.2621,
"step": 4880
},
{
"epoch": 0.8776933626122146,
"grad_norm": 53.4375,
"learning_rate": 9.98285755151148e-07,
"loss": 99.8875,
"step": 4890
},
{
"epoch": 0.8794882365643869,
"grad_norm": 55.875,
"learning_rate": 9.9828224953796e-07,
"loss": 98.8774,
"step": 4900
},
{
"epoch": 0.8812831105165592,
"grad_norm": 59.78125,
"learning_rate": 9.982787439247723e-07,
"loss": 100.7272,
"step": 4910
},
{
"epoch": 0.8830779844687313,
"grad_norm": 55.21875,
"learning_rate": 9.982752383115845e-07,
"loss": 99.0058,
"step": 4920
},
{
"epoch": 0.8848728584209036,
"grad_norm": 54.8125,
"learning_rate": 9.982717326983967e-07,
"loss": 99.3883,
"step": 4930
},
{
"epoch": 0.8866677323730757,
"grad_norm": 55.8125,
"learning_rate": 9.982682270852088e-07,
"loss": 98.9662,
"step": 4940
},
{
"epoch": 0.888462606325248,
"grad_norm": 52.90625,
"learning_rate": 9.98264721472021e-07,
"loss": 100.7881,
"step": 4950
},
{
"epoch": 0.8902574802774202,
"grad_norm": 54.71875,
"learning_rate": 9.982612158588332e-07,
"loss": 98.7615,
"step": 4960
},
{
"epoch": 0.8920523542295924,
"grad_norm": 53.65625,
"learning_rate": 9.982577102456452e-07,
"loss": 99.4831,
"step": 4970
},
{
"epoch": 0.8938472281817647,
"grad_norm": 58.78125,
"learning_rate": 9.982542046324574e-07,
"loss": 98.5362,
"step": 4980
},
{
"epoch": 0.8956421021339369,
"grad_norm": 54.0625,
"learning_rate": 9.982506990192696e-07,
"loss": 98.9379,
"step": 4990
},
{
"epoch": 0.8974369760861091,
"grad_norm": 52.78125,
"learning_rate": 9.982471934060818e-07,
"loss": 98.3786,
"step": 5000
},
{
"epoch": 0.8992318500382813,
"grad_norm": 57.09375,
"learning_rate": 9.98243687792894e-07,
"loss": 99.2205,
"step": 5010
},
{
"epoch": 0.9010267239904535,
"grad_norm": 53.71875,
"learning_rate": 9.98240182179706e-07,
"loss": 99.4672,
"step": 5020
},
{
"epoch": 0.9028215979426257,
"grad_norm": 57.40625,
"learning_rate": 9.982366765665182e-07,
"loss": 99.5186,
"step": 5030
},
{
"epoch": 0.904616471894798,
"grad_norm": 58.71875,
"learning_rate": 9.982331709533304e-07,
"loss": 99.584,
"step": 5040
},
{
"epoch": 0.9064113458469701,
"grad_norm": 55.0625,
"learning_rate": 9.982296653401426e-07,
"loss": 100.6763,
"step": 5050
},
{
"epoch": 0.9082062197991424,
"grad_norm": 54.28125,
"learning_rate": 9.982261597269548e-07,
"loss": 98.7888,
"step": 5060
},
{
"epoch": 0.9100010937513147,
"grad_norm": 54.8125,
"learning_rate": 9.98222654113767e-07,
"loss": 98.7003,
"step": 5070
},
{
"epoch": 0.9117959677034868,
"grad_norm": 58.28125,
"learning_rate": 9.982191485005791e-07,
"loss": 99.602,
"step": 5080
},
{
"epoch": 0.9135908416556591,
"grad_norm": 56.28125,
"learning_rate": 9.982156428873913e-07,
"loss": 99.339,
"step": 5090
},
{
"epoch": 0.9153857156078312,
"grad_norm": 55.15625,
"learning_rate": 9.982121372742033e-07,
"loss": 99.4155,
"step": 5100
},
{
"epoch": 0.9171805895600035,
"grad_norm": 57.03125,
"learning_rate": 9.982086316610155e-07,
"loss": 99.6862,
"step": 5110
},
{
"epoch": 0.9189754635121757,
"grad_norm": 56.96875,
"learning_rate": 9.982051260478277e-07,
"loss": 99.5966,
"step": 5120
},
{
"epoch": 0.9207703374643479,
"grad_norm": 55.6875,
"learning_rate": 9.9820162043464e-07,
"loss": 100.2231,
"step": 5130
},
{
"epoch": 0.9225652114165201,
"grad_norm": 54.78125,
"learning_rate": 9.98198114821452e-07,
"loss": 99.5804,
"step": 5140
},
{
"epoch": 0.9243600853686923,
"grad_norm": 58.1875,
"learning_rate": 9.981946092082643e-07,
"loss": 98.8408,
"step": 5150
},
{
"epoch": 0.9261549593208646,
"grad_norm": 56.75,
"learning_rate": 9.981911035950765e-07,
"loss": 99.1592,
"step": 5160
},
{
"epoch": 0.9279498332730368,
"grad_norm": 51.3125,
"learning_rate": 9.981875979818885e-07,
"loss": 99.347,
"step": 5170
},
{
"epoch": 0.929744707225209,
"grad_norm": 56.6875,
"learning_rate": 9.981840923687007e-07,
"loss": 99.7062,
"step": 5180
},
{
"epoch": 0.9315395811773812,
"grad_norm": 55.28125,
"learning_rate": 9.981805867555129e-07,
"loss": 100.1522,
"step": 5190
},
{
"epoch": 0.9333344551295535,
"grad_norm": 55.25,
"learning_rate": 9.98177081142325e-07,
"loss": 99.1621,
"step": 5200
},
{
"epoch": 0.9351293290817256,
"grad_norm": 58.40625,
"learning_rate": 9.981735755291372e-07,
"loss": 99.3177,
"step": 5210
},
{
"epoch": 0.9369242030338979,
"grad_norm": 82.875,
"learning_rate": 9.981700699159492e-07,
"loss": 99.838,
"step": 5220
},
{
"epoch": 0.93871907698607,
"grad_norm": 56.125,
"learning_rate": 9.981665643027616e-07,
"loss": 98.6675,
"step": 5230
},
{
"epoch": 0.9405139509382423,
"grad_norm": 56.15625,
"learning_rate": 9.981630586895738e-07,
"loss": 100.7525,
"step": 5240
},
{
"epoch": 0.9423088248904146,
"grad_norm": 54.5625,
"learning_rate": 9.981595530763858e-07,
"loss": 98.633,
"step": 5250
},
{
"epoch": 0.9441036988425867,
"grad_norm": 55.875,
"learning_rate": 9.98156047463198e-07,
"loss": 99.6261,
"step": 5260
},
{
"epoch": 0.945898572794759,
"grad_norm": 58.15625,
"learning_rate": 9.981525418500102e-07,
"loss": 99.3767,
"step": 5270
},
{
"epoch": 0.9476934467469312,
"grad_norm": 55.96875,
"learning_rate": 9.981490362368224e-07,
"loss": 99.0867,
"step": 5280
},
{
"epoch": 0.9494883206991034,
"grad_norm": 56.90625,
"learning_rate": 9.981455306236346e-07,
"loss": 98.7356,
"step": 5290
},
{
"epoch": 0.9512831946512756,
"grad_norm": 56.78125,
"learning_rate": 9.981420250104466e-07,
"loss": 99.5022,
"step": 5300
},
{
"epoch": 0.9530780686034478,
"grad_norm": 55.59375,
"learning_rate": 9.981385193972588e-07,
"loss": 99.9538,
"step": 5310
},
{
"epoch": 0.95487294255562,
"grad_norm": 56.8125,
"learning_rate": 9.98135013784071e-07,
"loss": 98.6498,
"step": 5320
},
{
"epoch": 0.9566678165077923,
"grad_norm": 57.9375,
"learning_rate": 9.981315081708831e-07,
"loss": 99.2055,
"step": 5330
},
{
"epoch": 0.9584626904599645,
"grad_norm": 53.25,
"learning_rate": 9.981280025576953e-07,
"loss": 99.4488,
"step": 5340
},
{
"epoch": 0.9602575644121367,
"grad_norm": 58.21875,
"learning_rate": 9.981244969445075e-07,
"loss": 100.3882,
"step": 5350
},
{
"epoch": 0.962052438364309,
"grad_norm": 53.875,
"learning_rate": 9.981209913313197e-07,
"loss": 98.866,
"step": 5360
},
{
"epoch": 0.9638473123164811,
"grad_norm": 56.28125,
"learning_rate": 9.981174857181317e-07,
"loss": 99.651,
"step": 5370
},
{
"epoch": 0.9656421862686534,
"grad_norm": 57.4375,
"learning_rate": 9.98113980104944e-07,
"loss": 98.6736,
"step": 5380
},
{
"epoch": 0.9674370602208255,
"grad_norm": 59.3125,
"learning_rate": 9.98110474491756e-07,
"loss": 99.5751,
"step": 5390
},
{
"epoch": 0.9692319341729978,
"grad_norm": 57.71875,
"learning_rate": 9.981069688785683e-07,
"loss": 98.4359,
"step": 5400
},
{
"epoch": 0.9710268081251701,
"grad_norm": 52.65625,
"learning_rate": 9.981034632653805e-07,
"loss": 99.2994,
"step": 5410
},
{
"epoch": 0.9728216820773422,
"grad_norm": 55.90625,
"learning_rate": 9.980999576521927e-07,
"loss": 99.8557,
"step": 5420
},
{
"epoch": 0.9746165560295145,
"grad_norm": 54.875,
"learning_rate": 9.980964520390049e-07,
"loss": 98.9348,
"step": 5430
},
{
"epoch": 0.9764114299816867,
"grad_norm": 57.21875,
"learning_rate": 9.98092946425817e-07,
"loss": 99.2456,
"step": 5440
},
{
"epoch": 0.9782063039338589,
"grad_norm": 55.84375,
"learning_rate": 9.98089440812629e-07,
"loss": 98.4899,
"step": 5450
},
{
"epoch": 0.9800011778860311,
"grad_norm": 57.15625,
"learning_rate": 9.980859351994412e-07,
"loss": 99.0099,
"step": 5460
},
{
"epoch": 0.9817960518382033,
"grad_norm": 52.875,
"learning_rate": 9.980824295862534e-07,
"loss": 98.0469,
"step": 5470
},
{
"epoch": 0.9835909257903755,
"grad_norm": 52.84375,
"learning_rate": 9.980789239730656e-07,
"loss": 99.7561,
"step": 5480
},
{
"epoch": 0.9853857997425478,
"grad_norm": 55.4375,
"learning_rate": 9.980754183598778e-07,
"loss": 99.6168,
"step": 5490
},
{
"epoch": 0.98718067369472,
"grad_norm": 55.0,
"learning_rate": 9.9807191274669e-07,
"loss": 99.3561,
"step": 5500
},
{
"epoch": 0.9889755476468922,
"grad_norm": 54.875,
"learning_rate": 9.980684071335022e-07,
"loss": 98.6927,
"step": 5510
},
{
"epoch": 0.9907704215990645,
"grad_norm": 55.03125,
"learning_rate": 9.980649015203142e-07,
"loss": 98.8652,
"step": 5520
},
{
"epoch": 0.9925652955512366,
"grad_norm": 55.375,
"learning_rate": 9.980613959071264e-07,
"loss": 99.0075,
"step": 5530
},
{
"epoch": 0.9943601695034089,
"grad_norm": 56.03125,
"learning_rate": 9.980578902939386e-07,
"loss": 99.1172,
"step": 5540
},
{
"epoch": 0.996155043455581,
"grad_norm": 52.59375,
"learning_rate": 9.980543846807508e-07,
"loss": 99.8071,
"step": 5550
},
{
"epoch": 0.9979499174077533,
"grad_norm": 57.0625,
"learning_rate": 9.98050879067563e-07,
"loss": 99.4139,
"step": 5560
},
{
"epoch": 0.9997447913599256,
"grad_norm": 60.0,
"learning_rate": 9.98047373454375e-07,
"loss": 98.7373,
"step": 5570
}
],
"logging_steps": 10,
"max_steps": 5571,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 2000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.5382180454408913e+19,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}