checkpoint-ai / checkpoint-69627 /trainer_state.json
ardavey's picture
Upload 26 files
d10fc6f verified
{
"best_metric": {
"accuracy": 0.9902156155185724
},
"best_model_checkpoint": "./results/checkpoint-69627",
"epoch": 1.0,
"eval_steps": 500,
"global_step": 69627,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0007181122265787698,
"grad_norm": 4.7540483474731445,
"learning_rate": 1.5e-06,
"loss": 0.7343,
"step": 50
},
{
"epoch": 0.0014362244531575395,
"grad_norm": 3.270306348800659,
"learning_rate": 2.9700000000000004e-06,
"loss": 0.6002,
"step": 100
},
{
"epoch": 0.002154336679736309,
"grad_norm": 8.934218406677246,
"learning_rate": 4.4699999999999996e-06,
"loss": 0.4248,
"step": 150
},
{
"epoch": 0.002872448906315079,
"grad_norm": 2.0825181007385254,
"learning_rate": 5.940000000000001e-06,
"loss": 0.3267,
"step": 200
},
{
"epoch": 0.0035905611328938486,
"grad_norm": 22.761402130126953,
"learning_rate": 7.44e-06,
"loss": 0.2673,
"step": 250
},
{
"epoch": 0.004308673359472618,
"grad_norm": 14.120667457580566,
"learning_rate": 8.91e-06,
"loss": 0.1968,
"step": 300
},
{
"epoch": 0.005026785586051388,
"grad_norm": 30.47674560546875,
"learning_rate": 1.0379999999999999e-05,
"loss": 0.2866,
"step": 350
},
{
"epoch": 0.005744897812630158,
"grad_norm": 4.9884772300720215,
"learning_rate": 1.1880000000000001e-05,
"loss": 0.1806,
"step": 400
},
{
"epoch": 0.006463010039208928,
"grad_norm": 7.104131698608398,
"learning_rate": 1.338e-05,
"loss": 0.1843,
"step": 450
},
{
"epoch": 0.007181122265787697,
"grad_norm": 9.267885208129883,
"learning_rate": 1.488e-05,
"loss": 0.2175,
"step": 500
},
{
"epoch": 0.007899234492366467,
"grad_norm": 0.08351179212331772,
"learning_rate": 1.6380000000000002e-05,
"loss": 0.1864,
"step": 550
},
{
"epoch": 0.008617346718945236,
"grad_norm": 4.860571384429932,
"learning_rate": 1.7879999999999998e-05,
"loss": 0.1599,
"step": 600
},
{
"epoch": 0.009335458945524006,
"grad_norm": 19.020278930664062,
"learning_rate": 1.938e-05,
"loss": 0.2149,
"step": 650
},
{
"epoch": 0.010053571172102775,
"grad_norm": 21.864152908325195,
"learning_rate": 2.088e-05,
"loss": 0.1844,
"step": 700
},
{
"epoch": 0.010771683398681547,
"grad_norm": 13.073183059692383,
"learning_rate": 2.238e-05,
"loss": 0.1342,
"step": 750
},
{
"epoch": 0.011489795625260316,
"grad_norm": 4.226386547088623,
"learning_rate": 2.3880000000000002e-05,
"loss": 0.1602,
"step": 800
},
{
"epoch": 0.012207907851839086,
"grad_norm": 6.574745178222656,
"learning_rate": 2.538e-05,
"loss": 0.1298,
"step": 850
},
{
"epoch": 0.012926020078417855,
"grad_norm": 1.3294140100479126,
"learning_rate": 2.688e-05,
"loss": 0.1627,
"step": 900
},
{
"epoch": 0.013644132304996625,
"grad_norm": 15.410859107971191,
"learning_rate": 2.838e-05,
"loss": 0.1802,
"step": 950
},
{
"epoch": 0.014362244531575395,
"grad_norm": 7.947683811187744,
"learning_rate": 2.9880000000000002e-05,
"loss": 0.1519,
"step": 1000
},
{
"epoch": 0.015080356758154164,
"grad_norm": 9.371002197265625,
"learning_rate": 2.9993361586676995e-05,
"loss": 0.1402,
"step": 1050
},
{
"epoch": 0.015798468984732934,
"grad_norm": 0.8048446774482727,
"learning_rate": 2.998614592002155e-05,
"loss": 0.1335,
"step": 1100
},
{
"epoch": 0.016516581211311703,
"grad_norm": 1.139202356338501,
"learning_rate": 2.997893025336611e-05,
"loss": 0.1234,
"step": 1150
},
{
"epoch": 0.017234693437890473,
"grad_norm": 20.09372901916504,
"learning_rate": 2.9971714586710667e-05,
"loss": 0.2036,
"step": 1200
},
{
"epoch": 0.017952805664469242,
"grad_norm": 50.06760787963867,
"learning_rate": 2.9964498920055226e-05,
"loss": 0.1759,
"step": 1250
},
{
"epoch": 0.018670917891048012,
"grad_norm": 9.23760986328125,
"learning_rate": 2.995728325339978e-05,
"loss": 0.0863,
"step": 1300
},
{
"epoch": 0.01938903011762678,
"grad_norm": 49.052547454833984,
"learning_rate": 2.995006758674434e-05,
"loss": 0.0894,
"step": 1350
},
{
"epoch": 0.02010714234420555,
"grad_norm": 29.987974166870117,
"learning_rate": 2.99428519200889e-05,
"loss": 0.0983,
"step": 1400
},
{
"epoch": 0.02082525457078432,
"grad_norm": 23.74324607849121,
"learning_rate": 2.9935636253433458e-05,
"loss": 0.1495,
"step": 1450
},
{
"epoch": 0.021543366797363094,
"grad_norm": 11.304362297058105,
"learning_rate": 2.9928420586778013e-05,
"loss": 0.1215,
"step": 1500
},
{
"epoch": 0.022261479023941863,
"grad_norm": 0.33057159185409546,
"learning_rate": 2.992120492012257e-05,
"loss": 0.1711,
"step": 1550
},
{
"epoch": 0.022979591250520633,
"grad_norm": 25.56655502319336,
"learning_rate": 2.991398925346713e-05,
"loss": 0.1592,
"step": 1600
},
{
"epoch": 0.023697703477099402,
"grad_norm": 16.568450927734375,
"learning_rate": 2.9906773586811686e-05,
"loss": 0.1175,
"step": 1650
},
{
"epoch": 0.02441581570367817,
"grad_norm": 70.81444549560547,
"learning_rate": 2.9899557920156244e-05,
"loss": 0.0847,
"step": 1700
},
{
"epoch": 0.02513392793025694,
"grad_norm": 7.089319705963135,
"learning_rate": 2.9892342253500803e-05,
"loss": 0.1262,
"step": 1750
},
{
"epoch": 0.02585204015683571,
"grad_norm": 0.050011906772851944,
"learning_rate": 2.988512658684536e-05,
"loss": 0.0937,
"step": 1800
},
{
"epoch": 0.02657015238341448,
"grad_norm": 25.49470329284668,
"learning_rate": 2.9877910920189917e-05,
"loss": 0.1614,
"step": 1850
},
{
"epoch": 0.02728826460999325,
"grad_norm": 0.3818345367908478,
"learning_rate": 2.9870695253534476e-05,
"loss": 0.1194,
"step": 1900
},
{
"epoch": 0.02800637683657202,
"grad_norm": 9.010673522949219,
"learning_rate": 2.986347958687903e-05,
"loss": 0.0894,
"step": 1950
},
{
"epoch": 0.02872448906315079,
"grad_norm": 18.90770721435547,
"learning_rate": 2.985626392022359e-05,
"loss": 0.1395,
"step": 2000
},
{
"epoch": 0.02944260128972956,
"grad_norm": 0.030203022062778473,
"learning_rate": 2.984904825356815e-05,
"loss": 0.115,
"step": 2050
},
{
"epoch": 0.030160713516308328,
"grad_norm": 0.0564361996948719,
"learning_rate": 2.9841832586912704e-05,
"loss": 0.0639,
"step": 2100
},
{
"epoch": 0.030878825742887098,
"grad_norm": 0.019318081438541412,
"learning_rate": 2.9834616920257266e-05,
"loss": 0.0859,
"step": 2150
},
{
"epoch": 0.03159693796946587,
"grad_norm": 0.49546706676483154,
"learning_rate": 2.982754556693493e-05,
"loss": 0.1413,
"step": 2200
},
{
"epoch": 0.03231505019604464,
"grad_norm": 0.03695172816514969,
"learning_rate": 2.9820329900279487e-05,
"loss": 0.0901,
"step": 2250
},
{
"epoch": 0.033033162422623406,
"grad_norm": 0.016125192865729332,
"learning_rate": 2.9813114233624046e-05,
"loss": 0.0813,
"step": 2300
},
{
"epoch": 0.033751274649202176,
"grad_norm": 57.18072509765625,
"learning_rate": 2.9805898566968605e-05,
"loss": 0.095,
"step": 2350
},
{
"epoch": 0.034469386875780945,
"grad_norm": 4.513989448547363,
"learning_rate": 2.979868290031316e-05,
"loss": 0.1273,
"step": 2400
},
{
"epoch": 0.035187499102359715,
"grad_norm": 10.933919906616211,
"learning_rate": 2.979146723365772e-05,
"loss": 0.0767,
"step": 2450
},
{
"epoch": 0.035905611328938485,
"grad_norm": 83.93428802490234,
"learning_rate": 2.9784251567002277e-05,
"loss": 0.1061,
"step": 2500
},
{
"epoch": 0.036623723555517254,
"grad_norm": 16.953140258789062,
"learning_rate": 2.9777035900346833e-05,
"loss": 0.1024,
"step": 2550
},
{
"epoch": 0.037341835782096024,
"grad_norm": 17.177425384521484,
"learning_rate": 2.976982023369139e-05,
"loss": 0.1049,
"step": 2600
},
{
"epoch": 0.03805994800867479,
"grad_norm": 0.028152592480182648,
"learning_rate": 2.976260456703595e-05,
"loss": 0.0597,
"step": 2650
},
{
"epoch": 0.03877806023525356,
"grad_norm": 22.621421813964844,
"learning_rate": 2.9755388900380506e-05,
"loss": 0.1103,
"step": 2700
},
{
"epoch": 0.03949617246183233,
"grad_norm": 0.2816521227359772,
"learning_rate": 2.9748173233725064e-05,
"loss": 0.0794,
"step": 2750
},
{
"epoch": 0.0402142846884111,
"grad_norm": 0.068883016705513,
"learning_rate": 2.9740957567069623e-05,
"loss": 0.0844,
"step": 2800
},
{
"epoch": 0.04093239691498987,
"grad_norm": 0.06445488333702087,
"learning_rate": 2.973374190041418e-05,
"loss": 0.0703,
"step": 2850
},
{
"epoch": 0.04165050914156864,
"grad_norm": 0.15849824249744415,
"learning_rate": 2.9726526233758737e-05,
"loss": 0.0717,
"step": 2900
},
{
"epoch": 0.04236862136814742,
"grad_norm": 0.08682260662317276,
"learning_rate": 2.9719310567103296e-05,
"loss": 0.0947,
"step": 2950
},
{
"epoch": 0.04308673359472619,
"grad_norm": 20.855480194091797,
"learning_rate": 2.971209490044785e-05,
"loss": 0.095,
"step": 3000
},
{
"epoch": 0.04380484582130496,
"grad_norm": 0.009179933927953243,
"learning_rate": 2.9704879233792413e-05,
"loss": 0.0681,
"step": 3050
},
{
"epoch": 0.044522958047883726,
"grad_norm": 1.1233855485916138,
"learning_rate": 2.969766356713697e-05,
"loss": 0.1191,
"step": 3100
},
{
"epoch": 0.045241070274462496,
"grad_norm": 7.719324588775635,
"learning_rate": 2.9690447900481524e-05,
"loss": 0.114,
"step": 3150
},
{
"epoch": 0.045959182501041265,
"grad_norm": 0.08133476972579956,
"learning_rate": 2.9683232233826086e-05,
"loss": 0.0883,
"step": 3200
},
{
"epoch": 0.046677294727620035,
"grad_norm": 0.15997205674648285,
"learning_rate": 2.967601656717064e-05,
"loss": 0.0624,
"step": 3250
},
{
"epoch": 0.047395406954198804,
"grad_norm": 30.50885581970215,
"learning_rate": 2.9668800900515197e-05,
"loss": 0.0792,
"step": 3300
},
{
"epoch": 0.048113519180777574,
"grad_norm": 0.03833446651697159,
"learning_rate": 2.966158523385976e-05,
"loss": 0.0655,
"step": 3350
},
{
"epoch": 0.04883163140735634,
"grad_norm": 0.020543202757835388,
"learning_rate": 2.9654369567204314e-05,
"loss": 0.0648,
"step": 3400
},
{
"epoch": 0.04954974363393511,
"grad_norm": 0.18718767166137695,
"learning_rate": 2.9647153900548873e-05,
"loss": 0.0722,
"step": 3450
},
{
"epoch": 0.05026785586051388,
"grad_norm": 0.038752757012844086,
"learning_rate": 2.963993823389343e-05,
"loss": 0.088,
"step": 3500
},
{
"epoch": 0.05098596808709265,
"grad_norm": 0.5428361296653748,
"learning_rate": 2.9632722567237987e-05,
"loss": 0.1104,
"step": 3550
},
{
"epoch": 0.05170408031367142,
"grad_norm": 29.898271560668945,
"learning_rate": 2.9625506900582546e-05,
"loss": 0.0972,
"step": 3600
},
{
"epoch": 0.05242219254025019,
"grad_norm": 8.866761207580566,
"learning_rate": 2.9618291233927104e-05,
"loss": 0.0844,
"step": 3650
},
{
"epoch": 0.05314030476682896,
"grad_norm": 0.06850434094667435,
"learning_rate": 2.961107556727166e-05,
"loss": 0.0914,
"step": 3700
},
{
"epoch": 0.05385841699340773,
"grad_norm": 33.26413345336914,
"learning_rate": 2.9603859900616222e-05,
"loss": 0.0927,
"step": 3750
},
{
"epoch": 0.0545765292199865,
"grad_norm": 0.015440431423485279,
"learning_rate": 2.9596644233960777e-05,
"loss": 0.0575,
"step": 3800
},
{
"epoch": 0.05529464144656527,
"grad_norm": 0.16492103040218353,
"learning_rate": 2.9589428567305332e-05,
"loss": 0.0696,
"step": 3850
},
{
"epoch": 0.05601275367314404,
"grad_norm": 0.020445317029953003,
"learning_rate": 2.9582212900649895e-05,
"loss": 0.052,
"step": 3900
},
{
"epoch": 0.05673086589972281,
"grad_norm": 0.12008040398359299,
"learning_rate": 2.957499723399445e-05,
"loss": 0.0864,
"step": 3950
},
{
"epoch": 0.05744897812630158,
"grad_norm": 10.394414901733398,
"learning_rate": 2.9567781567339005e-05,
"loss": 0.0877,
"step": 4000
},
{
"epoch": 0.05816709035288035,
"grad_norm": 33.755489349365234,
"learning_rate": 2.9560565900683567e-05,
"loss": 0.0833,
"step": 4050
},
{
"epoch": 0.05888520257945912,
"grad_norm": 33.599884033203125,
"learning_rate": 2.9553350234028123e-05,
"loss": 0.0758,
"step": 4100
},
{
"epoch": 0.05960331480603789,
"grad_norm": 0.05187524855136871,
"learning_rate": 2.9546134567372678e-05,
"loss": 0.052,
"step": 4150
},
{
"epoch": 0.060321427032616656,
"grad_norm": 17.136632919311523,
"learning_rate": 2.953891890071724e-05,
"loss": 0.0543,
"step": 4200
},
{
"epoch": 0.061039539259195426,
"grad_norm": 0.02133137546479702,
"learning_rate": 2.9531703234061795e-05,
"loss": 0.066,
"step": 4250
},
{
"epoch": 0.061757651485774195,
"grad_norm": 0.014079502783715725,
"learning_rate": 2.952448756740635e-05,
"loss": 0.055,
"step": 4300
},
{
"epoch": 0.062475763712352965,
"grad_norm": 0.07812543213367462,
"learning_rate": 2.9517271900750913e-05,
"loss": 0.0507,
"step": 4350
},
{
"epoch": 0.06319387593893173,
"grad_norm": 13.165999412536621,
"learning_rate": 2.9510056234095468e-05,
"loss": 0.0739,
"step": 4400
},
{
"epoch": 0.06391198816551051,
"grad_norm": 14.774604797363281,
"learning_rate": 2.9502840567440027e-05,
"loss": 0.0572,
"step": 4450
},
{
"epoch": 0.06463010039208927,
"grad_norm": 0.04285166785120964,
"learning_rate": 2.9495624900784586e-05,
"loss": 0.0737,
"step": 4500
},
{
"epoch": 0.06534821261866805,
"grad_norm": 0.0385599248111248,
"learning_rate": 2.948840923412914e-05,
"loss": 0.0651,
"step": 4550
},
{
"epoch": 0.06606632484524681,
"grad_norm": 8.97938060760498,
"learning_rate": 2.94811935674737e-05,
"loss": 0.1029,
"step": 4600
},
{
"epoch": 0.06678443707182559,
"grad_norm": 14.360940933227539,
"learning_rate": 2.947397790081826e-05,
"loss": 0.0861,
"step": 4650
},
{
"epoch": 0.06750254929840435,
"grad_norm": 0.021648872643709183,
"learning_rate": 2.9466762234162814e-05,
"loss": 0.0503,
"step": 4700
},
{
"epoch": 0.06822066152498313,
"grad_norm": 12.358551025390625,
"learning_rate": 2.9459546567507373e-05,
"loss": 0.0476,
"step": 4750
},
{
"epoch": 0.06893877375156189,
"grad_norm": 0.5937502384185791,
"learning_rate": 2.945233090085193e-05,
"loss": 0.0833,
"step": 4800
},
{
"epoch": 0.06965688597814067,
"grad_norm": 0.009575131349265575,
"learning_rate": 2.9445115234196487e-05,
"loss": 0.0488,
"step": 4850
},
{
"epoch": 0.07037499820471943,
"grad_norm": 0.027700720354914665,
"learning_rate": 2.9437899567541045e-05,
"loss": 0.0726,
"step": 4900
},
{
"epoch": 0.0710931104312982,
"grad_norm": 0.06248120963573456,
"learning_rate": 2.9430683900885604e-05,
"loss": 0.0346,
"step": 4950
},
{
"epoch": 0.07181122265787697,
"grad_norm": 0.047881875187158585,
"learning_rate": 2.942361254756327e-05,
"loss": 0.0844,
"step": 5000
},
{
"epoch": 0.07252933488445575,
"grad_norm": 0.21929340064525604,
"learning_rate": 2.9416396880907825e-05,
"loss": 0.0622,
"step": 5050
},
{
"epoch": 0.07324744711103451,
"grad_norm": 0.07342693209648132,
"learning_rate": 2.9409181214252387e-05,
"loss": 0.1072,
"step": 5100
},
{
"epoch": 0.07396555933761328,
"grad_norm": 0.028538711369037628,
"learning_rate": 2.9401965547596943e-05,
"loss": 0.0655,
"step": 5150
},
{
"epoch": 0.07468367156419205,
"grad_norm": 0.03597363457083702,
"learning_rate": 2.9394749880941498e-05,
"loss": 0.027,
"step": 5200
},
{
"epoch": 0.07540178379077082,
"grad_norm": 0.011987659148871899,
"learning_rate": 2.938753421428606e-05,
"loss": 0.1305,
"step": 5250
},
{
"epoch": 0.07611989601734959,
"grad_norm": 1.634634017944336,
"learning_rate": 2.9380318547630615e-05,
"loss": 0.0666,
"step": 5300
},
{
"epoch": 0.07683800824392836,
"grad_norm": 5.277307987213135,
"learning_rate": 2.9373102880975174e-05,
"loss": 0.0612,
"step": 5350
},
{
"epoch": 0.07755612047050713,
"grad_norm": 9.484989166259766,
"learning_rate": 2.9365887214319733e-05,
"loss": 0.111,
"step": 5400
},
{
"epoch": 0.0782742326970859,
"grad_norm": 0.1096024438738823,
"learning_rate": 2.9358671547664288e-05,
"loss": 0.0833,
"step": 5450
},
{
"epoch": 0.07899234492366466,
"grad_norm": 13.125753402709961,
"learning_rate": 2.9351455881008847e-05,
"loss": 0.1178,
"step": 5500
},
{
"epoch": 0.07971045715024344,
"grad_norm": 0.16206516325473785,
"learning_rate": 2.9344240214353406e-05,
"loss": 0.0921,
"step": 5550
},
{
"epoch": 0.0804285693768222,
"grad_norm": 0.3482252061367035,
"learning_rate": 2.933702454769796e-05,
"loss": 0.0682,
"step": 5600
},
{
"epoch": 0.08114668160340098,
"grad_norm": 5.2325944900512695,
"learning_rate": 2.932980888104252e-05,
"loss": 0.0622,
"step": 5650
},
{
"epoch": 0.08186479382997974,
"grad_norm": 0.03085625357925892,
"learning_rate": 2.932259321438708e-05,
"loss": 0.0501,
"step": 5700
},
{
"epoch": 0.08258290605655852,
"grad_norm": 10.35268783569336,
"learning_rate": 2.9315377547731634e-05,
"loss": 0.0657,
"step": 5750
},
{
"epoch": 0.08330101828313728,
"grad_norm": 0.03741024062037468,
"learning_rate": 2.9308161881076196e-05,
"loss": 0.0579,
"step": 5800
},
{
"epoch": 0.08401913050971606,
"grad_norm": 22.895612716674805,
"learning_rate": 2.930094621442075e-05,
"loss": 0.0745,
"step": 5850
},
{
"epoch": 0.08473724273629483,
"grad_norm": 0.12802597880363464,
"learning_rate": 2.9293730547765307e-05,
"loss": 0.0595,
"step": 5900
},
{
"epoch": 0.0854553549628736,
"grad_norm": 0.1966264247894287,
"learning_rate": 2.928651488110987e-05,
"loss": 0.0807,
"step": 5950
},
{
"epoch": 0.08617346718945237,
"grad_norm": 0.4157123267650604,
"learning_rate": 2.9279299214454424e-05,
"loss": 0.118,
"step": 6000
},
{
"epoch": 0.08689157941603114,
"grad_norm": 0.08162178844213486,
"learning_rate": 2.9272083547798983e-05,
"loss": 0.0726,
"step": 6050
},
{
"epoch": 0.08760969164260991,
"grad_norm": 0.13167402148246765,
"learning_rate": 2.926486788114354e-05,
"loss": 0.0753,
"step": 6100
},
{
"epoch": 0.08832780386918868,
"grad_norm": 0.12660837173461914,
"learning_rate": 2.9257652214488097e-05,
"loss": 0.0742,
"step": 6150
},
{
"epoch": 0.08904591609576745,
"grad_norm": 30.069957733154297,
"learning_rate": 2.9250436547832655e-05,
"loss": 0.0372,
"step": 6200
},
{
"epoch": 0.08976402832234621,
"grad_norm": 0.09002042561769485,
"learning_rate": 2.9243220881177214e-05,
"loss": 0.0892,
"step": 6250
},
{
"epoch": 0.09048214054892499,
"grad_norm": 2.681645631790161,
"learning_rate": 2.923600521452177e-05,
"loss": 0.044,
"step": 6300
},
{
"epoch": 0.09120025277550375,
"grad_norm": 0.08975830674171448,
"learning_rate": 2.9228789547866328e-05,
"loss": 0.0683,
"step": 6350
},
{
"epoch": 0.09191836500208253,
"grad_norm": 1.0190995931625366,
"learning_rate": 2.9221573881210887e-05,
"loss": 0.0758,
"step": 6400
},
{
"epoch": 0.0926364772286613,
"grad_norm": 13.023914337158203,
"learning_rate": 2.9214358214555442e-05,
"loss": 0.0743,
"step": 6450
},
{
"epoch": 0.09335458945524007,
"grad_norm": 0.027975155040621758,
"learning_rate": 2.92071425479e-05,
"loss": 0.0469,
"step": 6500
},
{
"epoch": 0.09407270168181883,
"grad_norm": 0.05313028395175934,
"learning_rate": 2.919992688124456e-05,
"loss": 0.029,
"step": 6550
},
{
"epoch": 0.09479081390839761,
"grad_norm": 0.10678339004516602,
"learning_rate": 2.9192711214589115e-05,
"loss": 0.0258,
"step": 6600
},
{
"epoch": 0.09550892613497637,
"grad_norm": 9.206793785095215,
"learning_rate": 2.9185495547933674e-05,
"loss": 0.1138,
"step": 6650
},
{
"epoch": 0.09622703836155515,
"grad_norm": 0.13147731125354767,
"learning_rate": 2.9178279881278233e-05,
"loss": 0.0678,
"step": 6700
},
{
"epoch": 0.09694515058813391,
"grad_norm": 0.058656755834817886,
"learning_rate": 2.917106421462279e-05,
"loss": 0.0628,
"step": 6750
},
{
"epoch": 0.09766326281471269,
"grad_norm": 0.06019889935851097,
"learning_rate": 2.9163848547967347e-05,
"loss": 0.0713,
"step": 6800
},
{
"epoch": 0.09838137504129145,
"grad_norm": 1.6211074590682983,
"learning_rate": 2.9156777194645016e-05,
"loss": 0.0518,
"step": 6850
},
{
"epoch": 0.09909948726787023,
"grad_norm": 48.71439743041992,
"learning_rate": 2.914956152798957e-05,
"loss": 0.0585,
"step": 6900
},
{
"epoch": 0.09981759949444899,
"grad_norm": 7.728841304779053,
"learning_rate": 2.914234586133413e-05,
"loss": 0.0953,
"step": 6950
},
{
"epoch": 0.10053571172102777,
"grad_norm": 0.023901065811514854,
"learning_rate": 2.913513019467869e-05,
"loss": 0.0722,
"step": 7000
},
{
"epoch": 0.10125382394760653,
"grad_norm": 0.10294115543365479,
"learning_rate": 2.9127914528023244e-05,
"loss": 0.036,
"step": 7050
},
{
"epoch": 0.1019719361741853,
"grad_norm": 0.011413712054491043,
"learning_rate": 2.9120698861367803e-05,
"loss": 0.0754,
"step": 7100
},
{
"epoch": 0.10269004840076407,
"grad_norm": 0.11528253555297852,
"learning_rate": 2.911348319471236e-05,
"loss": 0.0543,
"step": 7150
},
{
"epoch": 0.10340816062734284,
"grad_norm": 0.10068734735250473,
"learning_rate": 2.9106267528056917e-05,
"loss": 0.0873,
"step": 7200
},
{
"epoch": 0.1041262728539216,
"grad_norm": 0.0316169336438179,
"learning_rate": 2.9099051861401475e-05,
"loss": 0.0541,
"step": 7250
},
{
"epoch": 0.10484438508050038,
"grad_norm": 9.961137771606445,
"learning_rate": 2.9091836194746034e-05,
"loss": 0.0497,
"step": 7300
},
{
"epoch": 0.10556249730707915,
"grad_norm": 0.08653315156698227,
"learning_rate": 2.908462052809059e-05,
"loss": 0.065,
"step": 7350
},
{
"epoch": 0.10628060953365792,
"grad_norm": 0.13973358273506165,
"learning_rate": 2.9077404861435148e-05,
"loss": 0.094,
"step": 7400
},
{
"epoch": 0.10699872176023668,
"grad_norm": 2.6138124465942383,
"learning_rate": 2.9070189194779707e-05,
"loss": 0.0518,
"step": 7450
},
{
"epoch": 0.10771683398681546,
"grad_norm": 0.01820351369678974,
"learning_rate": 2.9062973528124262e-05,
"loss": 0.0491,
"step": 7500
},
{
"epoch": 0.10843494621339422,
"grad_norm": 0.34972327947616577,
"learning_rate": 2.905575786146882e-05,
"loss": 0.0902,
"step": 7550
},
{
"epoch": 0.109153058439973,
"grad_norm": 0.09330655634403229,
"learning_rate": 2.904854219481338e-05,
"loss": 0.0775,
"step": 7600
},
{
"epoch": 0.10987117066655176,
"grad_norm": 20.732421875,
"learning_rate": 2.904132652815794e-05,
"loss": 0.0485,
"step": 7650
},
{
"epoch": 0.11058928289313054,
"grad_norm": 0.10912938416004181,
"learning_rate": 2.9034110861502494e-05,
"loss": 0.0548,
"step": 7700
},
{
"epoch": 0.1113073951197093,
"grad_norm": 0.05953862518072128,
"learning_rate": 2.9026895194847052e-05,
"loss": 0.051,
"step": 7750
},
{
"epoch": 0.11202550734628808,
"grad_norm": 0.03613553196191788,
"learning_rate": 2.901967952819161e-05,
"loss": 0.0576,
"step": 7800
},
{
"epoch": 0.11274361957286685,
"grad_norm": 0.013742661103606224,
"learning_rate": 2.9012463861536166e-05,
"loss": 0.0435,
"step": 7850
},
{
"epoch": 0.11346173179944562,
"grad_norm": 14.854754447937012,
"learning_rate": 2.9005248194880725e-05,
"loss": 0.0588,
"step": 7900
},
{
"epoch": 0.1141798440260244,
"grad_norm": 0.025177430361509323,
"learning_rate": 2.8998032528225284e-05,
"loss": 0.049,
"step": 7950
},
{
"epoch": 0.11489795625260316,
"grad_norm": 0.10366187989711761,
"learning_rate": 2.8990816861569843e-05,
"loss": 0.0527,
"step": 8000
},
{
"epoch": 0.11561606847918193,
"grad_norm": 17.234394073486328,
"learning_rate": 2.8983601194914398e-05,
"loss": 0.0496,
"step": 8050
},
{
"epoch": 0.1163341807057607,
"grad_norm": 0.20396387577056885,
"learning_rate": 2.8976385528258957e-05,
"loss": 0.0641,
"step": 8100
},
{
"epoch": 0.11705229293233947,
"grad_norm": 0.016371482983231544,
"learning_rate": 2.8969169861603515e-05,
"loss": 0.0379,
"step": 8150
},
{
"epoch": 0.11777040515891823,
"grad_norm": 0.06216276437044144,
"learning_rate": 2.896195419494807e-05,
"loss": 0.0565,
"step": 8200
},
{
"epoch": 0.11848851738549701,
"grad_norm": 0.18494826555252075,
"learning_rate": 2.895473852829263e-05,
"loss": 0.0599,
"step": 8250
},
{
"epoch": 0.11920662961207577,
"grad_norm": 0.012996107339859009,
"learning_rate": 2.8947522861637188e-05,
"loss": 0.0361,
"step": 8300
},
{
"epoch": 0.11992474183865455,
"grad_norm": 26.88838768005371,
"learning_rate": 2.8940307194981747e-05,
"loss": 0.0699,
"step": 8350
},
{
"epoch": 0.12064285406523331,
"grad_norm": 0.03782937675714493,
"learning_rate": 2.8933091528326302e-05,
"loss": 0.0327,
"step": 8400
},
{
"epoch": 0.12136096629181209,
"grad_norm": 0.055766601115465164,
"learning_rate": 2.892587586167086e-05,
"loss": 0.0747,
"step": 8450
},
{
"epoch": 0.12207907851839085,
"grad_norm": 0.009904072619974613,
"learning_rate": 2.891866019501542e-05,
"loss": 0.0611,
"step": 8500
},
{
"epoch": 0.12279719074496963,
"grad_norm": 0.06229407340288162,
"learning_rate": 2.8911444528359975e-05,
"loss": 0.1479,
"step": 8550
},
{
"epoch": 0.12351530297154839,
"grad_norm": 0.033349085599184036,
"learning_rate": 2.8904228861704534e-05,
"loss": 0.0442,
"step": 8600
},
{
"epoch": 0.12423341519812717,
"grad_norm": 0.04250495880842209,
"learning_rate": 2.8897013195049093e-05,
"loss": 0.0646,
"step": 8650
},
{
"epoch": 0.12495152742470593,
"grad_norm": 0.07844238728284836,
"learning_rate": 2.8889797528393648e-05,
"loss": 0.0732,
"step": 8700
},
{
"epoch": 0.1256696396512847,
"grad_norm": 3.7660670280456543,
"learning_rate": 2.8882581861738207e-05,
"loss": 0.0612,
"step": 8750
},
{
"epoch": 0.12638775187786347,
"grad_norm": 0.031977858394384384,
"learning_rate": 2.8875366195082765e-05,
"loss": 0.0511,
"step": 8800
},
{
"epoch": 0.12710586410444225,
"grad_norm": 0.3466741442680359,
"learning_rate": 2.886815052842732e-05,
"loss": 0.0423,
"step": 8850
},
{
"epoch": 0.12782397633102102,
"grad_norm": 0.314433753490448,
"learning_rate": 2.886093486177188e-05,
"loss": 0.0374,
"step": 8900
},
{
"epoch": 0.12854208855759977,
"grad_norm": 0.007800741121172905,
"learning_rate": 2.8853719195116438e-05,
"loss": 0.0683,
"step": 8950
},
{
"epoch": 0.12926020078417855,
"grad_norm": 0.929775595664978,
"learning_rate": 2.8846503528460993e-05,
"loss": 0.0772,
"step": 9000
},
{
"epoch": 0.12997831301075732,
"grad_norm": 0.35541367530822754,
"learning_rate": 2.8839287861805556e-05,
"loss": 0.0577,
"step": 9050
},
{
"epoch": 0.1306964252373361,
"grad_norm": 1.530603051185608,
"learning_rate": 2.883207219515011e-05,
"loss": 0.0563,
"step": 9100
},
{
"epoch": 0.13141453746391485,
"grad_norm": 53.78153991699219,
"learning_rate": 2.8824856528494666e-05,
"loss": 0.0611,
"step": 9150
},
{
"epoch": 0.13213264969049363,
"grad_norm": 2.021066427230835,
"learning_rate": 2.8817640861839228e-05,
"loss": 0.067,
"step": 9200
},
{
"epoch": 0.1328507619170724,
"grad_norm": 0.058540359139442444,
"learning_rate": 2.8810425195183784e-05,
"loss": 0.0431,
"step": 9250
},
{
"epoch": 0.13356887414365118,
"grad_norm": 3.8294928073883057,
"learning_rate": 2.880320952852834e-05,
"loss": 0.0588,
"step": 9300
},
{
"epoch": 0.13428698637022993,
"grad_norm": 0.44677820801734924,
"learning_rate": 2.87959938618729e-05,
"loss": 0.0694,
"step": 9350
},
{
"epoch": 0.1350050985968087,
"grad_norm": 0.11883492767810822,
"learning_rate": 2.8788778195217456e-05,
"loss": 0.0562,
"step": 9400
},
{
"epoch": 0.13572321082338748,
"grad_norm": 26.162769317626953,
"learning_rate": 2.8781562528562012e-05,
"loss": 0.1393,
"step": 9450
},
{
"epoch": 0.13644132304996626,
"grad_norm": 0.02398838847875595,
"learning_rate": 2.8774346861906574e-05,
"loss": 0.0499,
"step": 9500
},
{
"epoch": 0.137159435276545,
"grad_norm": 6.311274528503418,
"learning_rate": 2.876713119525113e-05,
"loss": 0.0582,
"step": 9550
},
{
"epoch": 0.13787754750312378,
"grad_norm": 0.06905557215213776,
"learning_rate": 2.8759915528595685e-05,
"loss": 0.0475,
"step": 9600
},
{
"epoch": 0.13859565972970256,
"grad_norm": 0.012767783366143703,
"learning_rate": 2.8752699861940247e-05,
"loss": 0.0268,
"step": 9650
},
{
"epoch": 0.13931377195628133,
"grad_norm": 0.07180549949407578,
"learning_rate": 2.8745484195284802e-05,
"loss": 0.0566,
"step": 9700
},
{
"epoch": 0.14003188418286008,
"grad_norm": 0.13380548357963562,
"learning_rate": 2.873826852862936e-05,
"loss": 0.0714,
"step": 9750
},
{
"epoch": 0.14074999640943886,
"grad_norm": 0.1076226532459259,
"learning_rate": 2.873105286197392e-05,
"loss": 0.0925,
"step": 9800
},
{
"epoch": 0.14146810863601764,
"grad_norm": 0.09091708809137344,
"learning_rate": 2.8723837195318475e-05,
"loss": 0.0468,
"step": 9850
},
{
"epoch": 0.1421862208625964,
"grad_norm": 0.049373432993888855,
"learning_rate": 2.8716621528663033e-05,
"loss": 0.0672,
"step": 9900
},
{
"epoch": 0.1429043330891752,
"grad_norm": 0.0634181797504425,
"learning_rate": 2.8709405862007592e-05,
"loss": 0.0724,
"step": 9950
},
{
"epoch": 0.14362244531575394,
"grad_norm": 0.3946473002433777,
"learning_rate": 2.8702190195352148e-05,
"loss": 0.0675,
"step": 10000
},
{
"epoch": 0.14434055754233271,
"grad_norm": 0.1286439746618271,
"learning_rate": 2.869497452869671e-05,
"loss": 0.0755,
"step": 10050
},
{
"epoch": 0.1450586697689115,
"grad_norm": 0.04242817312479019,
"learning_rate": 2.8687758862041265e-05,
"loss": 0.0368,
"step": 10100
},
{
"epoch": 0.14577678199549027,
"grad_norm": 0.1294609159231186,
"learning_rate": 2.868054319538582e-05,
"loss": 0.0679,
"step": 10150
},
{
"epoch": 0.14649489422206902,
"grad_norm": 0.0815710574388504,
"learning_rate": 2.8673327528730382e-05,
"loss": 0.0737,
"step": 10200
},
{
"epoch": 0.1472130064486478,
"grad_norm": 0.04006512463092804,
"learning_rate": 2.8666111862074938e-05,
"loss": 0.05,
"step": 10250
},
{
"epoch": 0.14793111867522657,
"grad_norm": 0.015738453716039658,
"learning_rate": 2.8658896195419493e-05,
"loss": 0.0629,
"step": 10300
},
{
"epoch": 0.14864923090180535,
"grad_norm": 0.045333586633205414,
"learning_rate": 2.8651680528764055e-05,
"loss": 0.0534,
"step": 10350
},
{
"epoch": 0.1493673431283841,
"grad_norm": 0.06222152337431908,
"learning_rate": 2.864446486210861e-05,
"loss": 0.0703,
"step": 10400
},
{
"epoch": 0.15008545535496287,
"grad_norm": 0.0591827891767025,
"learning_rate": 2.863724919545317e-05,
"loss": 0.0499,
"step": 10450
},
{
"epoch": 0.15080356758154165,
"grad_norm": 1.1121820211410522,
"learning_rate": 2.8630033528797728e-05,
"loss": 0.3427,
"step": 10500
},
{
"epoch": 0.15152167980812042,
"grad_norm": 0.1692652404308319,
"learning_rate": 2.8622962175475394e-05,
"loss": 0.1411,
"step": 10550
},
{
"epoch": 0.15223979203469917,
"grad_norm": 0.5019534826278687,
"learning_rate": 2.861574650881995e-05,
"loss": 0.085,
"step": 10600
},
{
"epoch": 0.15295790426127795,
"grad_norm": 1.0281404256820679,
"learning_rate": 2.860853084216451e-05,
"loss": 0.1386,
"step": 10650
},
{
"epoch": 0.15367601648785673,
"grad_norm": 0.048874229192733765,
"learning_rate": 2.8601315175509067e-05,
"loss": 0.0843,
"step": 10700
},
{
"epoch": 0.1543941287144355,
"grad_norm": 2.0028481483459473,
"learning_rate": 2.8594099508853622e-05,
"loss": 0.0657,
"step": 10750
},
{
"epoch": 0.15511224094101425,
"grad_norm": 0.1379271149635315,
"learning_rate": 2.8586883842198184e-05,
"loss": 0.0684,
"step": 10800
},
{
"epoch": 0.15583035316759303,
"grad_norm": 0.08114618808031082,
"learning_rate": 2.857966817554274e-05,
"loss": 0.0667,
"step": 10850
},
{
"epoch": 0.1565484653941718,
"grad_norm": 1.392622709274292,
"learning_rate": 2.8572452508887295e-05,
"loss": 0.0688,
"step": 10900
},
{
"epoch": 0.15726657762075058,
"grad_norm": 11.604351997375488,
"learning_rate": 2.8565236842231857e-05,
"loss": 0.0882,
"step": 10950
},
{
"epoch": 0.15798468984732933,
"grad_norm": 0.31855666637420654,
"learning_rate": 2.8558021175576412e-05,
"loss": 0.1015,
"step": 11000
},
{
"epoch": 0.1587028020739081,
"grad_norm": 1.6662945747375488,
"learning_rate": 2.8550805508920967e-05,
"loss": 0.1386,
"step": 11050
},
{
"epoch": 0.15942091430048688,
"grad_norm": 13.3506498336792,
"learning_rate": 2.854358984226553e-05,
"loss": 0.0856,
"step": 11100
},
{
"epoch": 0.16013902652706566,
"grad_norm": 1.4275833368301392,
"learning_rate": 2.8536374175610085e-05,
"loss": 0.0537,
"step": 11150
},
{
"epoch": 0.1608571387536444,
"grad_norm": 0.6070023775100708,
"learning_rate": 2.852915850895464e-05,
"loss": 0.0396,
"step": 11200
},
{
"epoch": 0.16157525098022318,
"grad_norm": 0.4954681694507599,
"learning_rate": 2.8521942842299202e-05,
"loss": 0.0684,
"step": 11250
},
{
"epoch": 0.16229336320680196,
"grad_norm": 0.03993278741836548,
"learning_rate": 2.8514727175643758e-05,
"loss": 0.0556,
"step": 11300
},
{
"epoch": 0.16301147543338074,
"grad_norm": 0.039236683398485184,
"learning_rate": 2.8507511508988316e-05,
"loss": 0.066,
"step": 11350
},
{
"epoch": 0.16372958765995949,
"grad_norm": 0.8015234470367432,
"learning_rate": 2.8500295842332875e-05,
"loss": 0.0636,
"step": 11400
},
{
"epoch": 0.16444769988653826,
"grad_norm": 1.8648189306259155,
"learning_rate": 2.849308017567743e-05,
"loss": 0.0991,
"step": 11450
},
{
"epoch": 0.16516581211311704,
"grad_norm": 0.09908437728881836,
"learning_rate": 2.848586450902199e-05,
"loss": 0.0537,
"step": 11500
},
{
"epoch": 0.16588392433969582,
"grad_norm": 2.209059000015259,
"learning_rate": 2.8478648842366548e-05,
"loss": 0.0793,
"step": 11550
},
{
"epoch": 0.16660203656627456,
"grad_norm": 0.09478212147951126,
"learning_rate": 2.8471433175711103e-05,
"loss": 0.0658,
"step": 11600
},
{
"epoch": 0.16732014879285334,
"grad_norm": 2.0855391025543213,
"learning_rate": 2.8464217509055662e-05,
"loss": 0.0647,
"step": 11650
},
{
"epoch": 0.16803826101943212,
"grad_norm": 0.08406183868646622,
"learning_rate": 2.845700184240022e-05,
"loss": 0.0601,
"step": 11700
},
{
"epoch": 0.1687563732460109,
"grad_norm": 0.054508406668901443,
"learning_rate": 2.8449786175744776e-05,
"loss": 0.0527,
"step": 11750
},
{
"epoch": 0.16947448547258967,
"grad_norm": 0.22775687277317047,
"learning_rate": 2.8442570509089335e-05,
"loss": 0.0712,
"step": 11800
},
{
"epoch": 0.17019259769916842,
"grad_norm": 0.04549324885010719,
"learning_rate": 2.8435354842433893e-05,
"loss": 0.058,
"step": 11850
},
{
"epoch": 0.1709107099257472,
"grad_norm": 0.05884317681193352,
"learning_rate": 2.842813917577845e-05,
"loss": 0.0817,
"step": 11900
},
{
"epoch": 0.17162882215232597,
"grad_norm": 0.07801628112792969,
"learning_rate": 2.8420923509123008e-05,
"loss": 0.0365,
"step": 11950
},
{
"epoch": 0.17234693437890475,
"grad_norm": 1.7730413675308228,
"learning_rate": 2.8413707842467566e-05,
"loss": 0.0913,
"step": 12000
},
{
"epoch": 0.1730650466054835,
"grad_norm": 0.054094549268484116,
"learning_rate": 2.8406492175812125e-05,
"loss": 0.0477,
"step": 12050
},
{
"epoch": 0.17378315883206227,
"grad_norm": 0.9634159207344055,
"learning_rate": 2.839927650915668e-05,
"loss": 0.0455,
"step": 12100
},
{
"epoch": 0.17450127105864105,
"grad_norm": 0.09357574582099915,
"learning_rate": 2.839206084250124e-05,
"loss": 0.0636,
"step": 12150
},
{
"epoch": 0.17521938328521983,
"grad_norm": 0.09306718409061432,
"learning_rate": 2.8384845175845798e-05,
"loss": 0.0541,
"step": 12200
},
{
"epoch": 0.17593749551179858,
"grad_norm": 0.09160356223583221,
"learning_rate": 2.8377629509190356e-05,
"loss": 0.0678,
"step": 12250
},
{
"epoch": 0.17665560773837735,
"grad_norm": 0.04214440658688545,
"learning_rate": 2.8370413842534912e-05,
"loss": 0.0371,
"step": 12300
},
{
"epoch": 0.17737371996495613,
"grad_norm": 11.02153491973877,
"learning_rate": 2.836319817587947e-05,
"loss": 0.0747,
"step": 12350
},
{
"epoch": 0.1780918321915349,
"grad_norm": 0.14870058000087738,
"learning_rate": 2.835598250922403e-05,
"loss": 0.1061,
"step": 12400
},
{
"epoch": 0.17880994441811365,
"grad_norm": 0.07340437173843384,
"learning_rate": 2.8348766842568585e-05,
"loss": 0.0835,
"step": 12450
},
{
"epoch": 0.17952805664469243,
"grad_norm": 0.13345997035503387,
"learning_rate": 2.8341551175913143e-05,
"loss": 0.0566,
"step": 12500
},
{
"epoch": 0.1802461688712712,
"grad_norm": 0.1167718842625618,
"learning_rate": 2.8334335509257702e-05,
"loss": 0.0517,
"step": 12550
},
{
"epoch": 0.18096428109784998,
"grad_norm": 1.0248634815216064,
"learning_rate": 2.832711984260226e-05,
"loss": 0.0857,
"step": 12600
},
{
"epoch": 0.18168239332442873,
"grad_norm": 0.7217876315116882,
"learning_rate": 2.8319904175946816e-05,
"loss": 0.0621,
"step": 12650
},
{
"epoch": 0.1824005055510075,
"grad_norm": 0.13123983144760132,
"learning_rate": 2.8312688509291375e-05,
"loss": 0.047,
"step": 12700
},
{
"epoch": 0.18311861777758628,
"grad_norm": 0.12581509351730347,
"learning_rate": 2.8305472842635934e-05,
"loss": 0.0487,
"step": 12750
},
{
"epoch": 0.18383673000416506,
"grad_norm": 0.11207035183906555,
"learning_rate": 2.829825717598049e-05,
"loss": 0.0846,
"step": 12800
},
{
"epoch": 0.1845548422307438,
"grad_norm": 0.146351158618927,
"learning_rate": 2.8291041509325048e-05,
"loss": 0.0541,
"step": 12850
},
{
"epoch": 0.1852729544573226,
"grad_norm": 0.10366562008857727,
"learning_rate": 2.8283970156002713e-05,
"loss": 0.0882,
"step": 12900
},
{
"epoch": 0.18599106668390136,
"grad_norm": 0.032089706510305405,
"learning_rate": 2.8276754489347272e-05,
"loss": 0.0591,
"step": 12950
},
{
"epoch": 0.18670917891048014,
"grad_norm": 0.4299641251564026,
"learning_rate": 2.826953882269183e-05,
"loss": 0.0877,
"step": 13000
},
{
"epoch": 0.1874272911370589,
"grad_norm": 0.31984642148017883,
"learning_rate": 2.8262323156036386e-05,
"loss": 0.1061,
"step": 13050
},
{
"epoch": 0.18814540336363766,
"grad_norm": 0.21333181858062744,
"learning_rate": 2.8255107489380945e-05,
"loss": 0.0754,
"step": 13100
},
{
"epoch": 0.18886351559021644,
"grad_norm": 44.345706939697266,
"learning_rate": 2.8247891822725504e-05,
"loss": 0.0724,
"step": 13150
},
{
"epoch": 0.18958162781679522,
"grad_norm": 18.202777862548828,
"learning_rate": 2.824067615607006e-05,
"loss": 0.0707,
"step": 13200
},
{
"epoch": 0.19029974004337397,
"grad_norm": 0.09870504587888718,
"learning_rate": 2.8233460489414618e-05,
"loss": 0.0385,
"step": 13250
},
{
"epoch": 0.19101785226995274,
"grad_norm": 0.06396225094795227,
"learning_rate": 2.8226244822759176e-05,
"loss": 0.0695,
"step": 13300
},
{
"epoch": 0.19173596449653152,
"grad_norm": 0.07328931987285614,
"learning_rate": 2.8219029156103732e-05,
"loss": 0.052,
"step": 13350
},
{
"epoch": 0.1924540767231103,
"grad_norm": 0.0672977939248085,
"learning_rate": 2.821181348944829e-05,
"loss": 0.0757,
"step": 13400
},
{
"epoch": 0.19317218894968904,
"grad_norm": 0.09179489314556122,
"learning_rate": 2.820459782279285e-05,
"loss": 0.0716,
"step": 13450
},
{
"epoch": 0.19389030117626782,
"grad_norm": 0.11543264240026474,
"learning_rate": 2.8197382156137408e-05,
"loss": 0.1018,
"step": 13500
},
{
"epoch": 0.1946084134028466,
"grad_norm": 1.2652478218078613,
"learning_rate": 2.8190166489481963e-05,
"loss": 0.0646,
"step": 13550
},
{
"epoch": 0.19532652562942537,
"grad_norm": 0.10677991062402725,
"learning_rate": 2.8182950822826522e-05,
"loss": 0.0634,
"step": 13600
},
{
"epoch": 0.19604463785600412,
"grad_norm": 0.09842602908611298,
"learning_rate": 2.817573515617108e-05,
"loss": 0.0389,
"step": 13650
},
{
"epoch": 0.1967627500825829,
"grad_norm": 0.07868483662605286,
"learning_rate": 2.8168519489515636e-05,
"loss": 0.056,
"step": 13700
},
{
"epoch": 0.19748086230916168,
"grad_norm": 0.10997475683689117,
"learning_rate": 2.8161303822860195e-05,
"loss": 0.0769,
"step": 13750
},
{
"epoch": 0.19819897453574045,
"grad_norm": 5.006385326385498,
"learning_rate": 2.8154088156204753e-05,
"loss": 0.0778,
"step": 13800
},
{
"epoch": 0.19891708676231923,
"grad_norm": 5.939600944519043,
"learning_rate": 2.814687248954931e-05,
"loss": 0.0406,
"step": 13850
},
{
"epoch": 0.19963519898889798,
"grad_norm": 1.339119553565979,
"learning_rate": 2.8139656822893868e-05,
"loss": 0.0715,
"step": 13900
},
{
"epoch": 0.20035331121547675,
"grad_norm": 1.5083401203155518,
"learning_rate": 2.8132441156238426e-05,
"loss": 0.0735,
"step": 13950
},
{
"epoch": 0.20107142344205553,
"grad_norm": 1.6835660934448242,
"learning_rate": 2.812522548958298e-05,
"loss": 0.0568,
"step": 14000
},
{
"epoch": 0.2017895356686343,
"grad_norm": 0.1471836119890213,
"learning_rate": 2.811800982292754e-05,
"loss": 0.0496,
"step": 14050
},
{
"epoch": 0.20250764789521306,
"grad_norm": 0.06911035627126694,
"learning_rate": 2.81107941562721e-05,
"loss": 0.0588,
"step": 14100
},
{
"epoch": 0.20322576012179183,
"grad_norm": 0.05670926347374916,
"learning_rate": 2.8103578489616654e-05,
"loss": 0.0435,
"step": 14150
},
{
"epoch": 0.2039438723483706,
"grad_norm": 20.66596031188965,
"learning_rate": 2.8096362822961216e-05,
"loss": 0.056,
"step": 14200
},
{
"epoch": 0.20466198457494938,
"grad_norm": 2.9100301265716553,
"learning_rate": 2.8089147156305772e-05,
"loss": 0.0737,
"step": 14250
},
{
"epoch": 0.20538009680152813,
"grad_norm": 0.05886685848236084,
"learning_rate": 2.8081931489650327e-05,
"loss": 0.06,
"step": 14300
},
{
"epoch": 0.2060982090281069,
"grad_norm": 2.182786464691162,
"learning_rate": 2.807471582299489e-05,
"loss": 0.0594,
"step": 14350
},
{
"epoch": 0.2068163212546857,
"grad_norm": 0.10666311532258987,
"learning_rate": 2.8067500156339445e-05,
"loss": 0.0562,
"step": 14400
},
{
"epoch": 0.20753443348126446,
"grad_norm": 1.5818663835525513,
"learning_rate": 2.8060284489684003e-05,
"loss": 0.0383,
"step": 14450
},
{
"epoch": 0.2082525457078432,
"grad_norm": 0.05999474972486496,
"learning_rate": 2.8053068823028562e-05,
"loss": 0.05,
"step": 14500
},
{
"epoch": 0.208970657934422,
"grad_norm": 0.09610898047685623,
"learning_rate": 2.8045853156373117e-05,
"loss": 0.053,
"step": 14550
},
{
"epoch": 0.20968877016100076,
"grad_norm": 0.11906774342060089,
"learning_rate": 2.8038637489717676e-05,
"loss": 0.0535,
"step": 14600
},
{
"epoch": 0.21040688238757954,
"grad_norm": 0.033821843564510345,
"learning_rate": 2.8031421823062235e-05,
"loss": 0.027,
"step": 14650
},
{
"epoch": 0.2111249946141583,
"grad_norm": 0.06107163429260254,
"learning_rate": 2.802420615640679e-05,
"loss": 0.0901,
"step": 14700
},
{
"epoch": 0.21184310684073707,
"grad_norm": 0.04277655854821205,
"learning_rate": 2.801699048975135e-05,
"loss": 0.0664,
"step": 14750
},
{
"epoch": 0.21256121906731584,
"grad_norm": 0.07864125818014145,
"learning_rate": 2.8009774823095908e-05,
"loss": 0.0374,
"step": 14800
},
{
"epoch": 0.21327933129389462,
"grad_norm": 19.828433990478516,
"learning_rate": 2.8002559156440463e-05,
"loss": 0.1275,
"step": 14850
},
{
"epoch": 0.21399744352047337,
"grad_norm": 0.11536970734596252,
"learning_rate": 2.7995343489785025e-05,
"loss": 0.0673,
"step": 14900
},
{
"epoch": 0.21471555574705214,
"grad_norm": 0.06297395378351212,
"learning_rate": 2.798812782312958e-05,
"loss": 0.0551,
"step": 14950
},
{
"epoch": 0.21543366797363092,
"grad_norm": 0.1182003766298294,
"learning_rate": 2.7980912156474136e-05,
"loss": 0.0692,
"step": 15000
},
{
"epoch": 0.2161517802002097,
"grad_norm": 0.43064287304878235,
"learning_rate": 2.7973696489818698e-05,
"loss": 0.08,
"step": 15050
},
{
"epoch": 0.21686989242678845,
"grad_norm": 0.07606443017721176,
"learning_rate": 2.7966480823163253e-05,
"loss": 0.0388,
"step": 15100
},
{
"epoch": 0.21758800465336722,
"grad_norm": 12.470568656921387,
"learning_rate": 2.795926515650781e-05,
"loss": 0.0426,
"step": 15150
},
{
"epoch": 0.218306116879946,
"grad_norm": 1.1158839464187622,
"learning_rate": 2.795204948985237e-05,
"loss": 0.0722,
"step": 15200
},
{
"epoch": 0.21902422910652478,
"grad_norm": 0.1124221459031105,
"learning_rate": 2.7944833823196926e-05,
"loss": 0.0454,
"step": 15250
},
{
"epoch": 0.21974234133310352,
"grad_norm": 0.050924718379974365,
"learning_rate": 2.793761815654148e-05,
"loss": 0.0437,
"step": 15300
},
{
"epoch": 0.2204604535596823,
"grad_norm": 0.09370733797550201,
"learning_rate": 2.7930402489886043e-05,
"loss": 0.0514,
"step": 15350
},
{
"epoch": 0.22117856578626108,
"grad_norm": 0.11523021012544632,
"learning_rate": 2.79231868232306e-05,
"loss": 0.0588,
"step": 15400
},
{
"epoch": 0.22189667801283985,
"grad_norm": 0.0939398780465126,
"learning_rate": 2.7915971156575154e-05,
"loss": 0.0585,
"step": 15450
},
{
"epoch": 0.2226147902394186,
"grad_norm": 0.049065787345170975,
"learning_rate": 2.7908755489919716e-05,
"loss": 0.0404,
"step": 15500
},
{
"epoch": 0.22333290246599738,
"grad_norm": 0.03415745124220848,
"learning_rate": 2.790153982326427e-05,
"loss": 0.0536,
"step": 15550
},
{
"epoch": 0.22405101469257616,
"grad_norm": 0.16649800539016724,
"learning_rate": 2.789432415660883e-05,
"loss": 0.0802,
"step": 15600
},
{
"epoch": 0.22476912691915493,
"grad_norm": 0.0649518072605133,
"learning_rate": 2.788710848995339e-05,
"loss": 0.0387,
"step": 15650
},
{
"epoch": 0.2254872391457337,
"grad_norm": 0.03974470496177673,
"learning_rate": 2.7879892823297944e-05,
"loss": 0.0407,
"step": 15700
},
{
"epoch": 0.22620535137231246,
"grad_norm": 0.08658800274133682,
"learning_rate": 2.7872677156642503e-05,
"loss": 0.0486,
"step": 15750
},
{
"epoch": 0.22692346359889123,
"grad_norm": 0.03997417911887169,
"learning_rate": 2.7865461489987062e-05,
"loss": 0.039,
"step": 15800
},
{
"epoch": 0.22764157582547,
"grad_norm": 0.44339972734451294,
"learning_rate": 2.7858245823331617e-05,
"loss": 0.082,
"step": 15850
},
{
"epoch": 0.2283596880520488,
"grad_norm": 0.06587305665016174,
"learning_rate": 2.7851030156676176e-05,
"loss": 0.0627,
"step": 15900
},
{
"epoch": 0.22907780027862754,
"grad_norm": 45.02585983276367,
"learning_rate": 2.7843814490020734e-05,
"loss": 0.0865,
"step": 15950
},
{
"epoch": 0.2297959125052063,
"grad_norm": 0.046719033271074295,
"learning_rate": 2.783659882336529e-05,
"loss": 0.0566,
"step": 16000
},
{
"epoch": 0.2305140247317851,
"grad_norm": 0.06561392545700073,
"learning_rate": 2.7829527470042956e-05,
"loss": 0.0639,
"step": 16050
},
{
"epoch": 0.23123213695836387,
"grad_norm": 0.04073488339781761,
"learning_rate": 2.7822311803387518e-05,
"loss": 0.0421,
"step": 16100
},
{
"epoch": 0.23195024918494261,
"grad_norm": 0.03125346079468727,
"learning_rate": 2.7815096136732073e-05,
"loss": 0.0702,
"step": 16150
},
{
"epoch": 0.2326683614115214,
"grad_norm": 0.1020875945687294,
"learning_rate": 2.780788047007663e-05,
"loss": 0.0601,
"step": 16200
},
{
"epoch": 0.23338647363810017,
"grad_norm": 0.05583386868238449,
"learning_rate": 2.780066480342119e-05,
"loss": 0.0384,
"step": 16250
},
{
"epoch": 0.23410458586467894,
"grad_norm": 0.5046035647392273,
"learning_rate": 2.7793449136765746e-05,
"loss": 0.0562,
"step": 16300
},
{
"epoch": 0.2348226980912577,
"grad_norm": 0.03745768964290619,
"learning_rate": 2.77862334701103e-05,
"loss": 0.0518,
"step": 16350
},
{
"epoch": 0.23554081031783647,
"grad_norm": 4.189239501953125,
"learning_rate": 2.7779017803454863e-05,
"loss": 0.0638,
"step": 16400
},
{
"epoch": 0.23625892254441525,
"grad_norm": 5.748262405395508,
"learning_rate": 2.777180213679942e-05,
"loss": 0.08,
"step": 16450
},
{
"epoch": 0.23697703477099402,
"grad_norm": 0.13202162086963654,
"learning_rate": 2.7764586470143977e-05,
"loss": 0.0986,
"step": 16500
},
{
"epoch": 0.23769514699757277,
"grad_norm": 0.22944556176662445,
"learning_rate": 2.7757370803488536e-05,
"loss": 0.0593,
"step": 16550
},
{
"epoch": 0.23841325922415155,
"grad_norm": 0.041575804352760315,
"learning_rate": 2.775015513683309e-05,
"loss": 0.0316,
"step": 16600
},
{
"epoch": 0.23913137145073032,
"grad_norm": 0.5464422702789307,
"learning_rate": 2.774293947017765e-05,
"loss": 0.049,
"step": 16650
},
{
"epoch": 0.2398494836773091,
"grad_norm": 0.057239070534706116,
"learning_rate": 2.773572380352221e-05,
"loss": 0.0785,
"step": 16700
},
{
"epoch": 0.24056759590388785,
"grad_norm": 0.04813413694500923,
"learning_rate": 2.7728508136866764e-05,
"loss": 0.0426,
"step": 16750
},
{
"epoch": 0.24128570813046663,
"grad_norm": 0.21200266480445862,
"learning_rate": 2.7721292470211326e-05,
"loss": 0.0563,
"step": 16800
},
{
"epoch": 0.2420038203570454,
"grad_norm": 0.054479002952575684,
"learning_rate": 2.771407680355588e-05,
"loss": 0.0372,
"step": 16850
},
{
"epoch": 0.24272193258362418,
"grad_norm": 0.07375594973564148,
"learning_rate": 2.7706861136900437e-05,
"loss": 0.0832,
"step": 16900
},
{
"epoch": 0.24344004481020293,
"grad_norm": 0.0346427857875824,
"learning_rate": 2.7699645470245e-05,
"loss": 0.0324,
"step": 16950
},
{
"epoch": 0.2441581570367817,
"grad_norm": 0.027668170630931854,
"learning_rate": 2.7692429803589554e-05,
"loss": 0.0286,
"step": 17000
},
{
"epoch": 0.24487626926336048,
"grad_norm": 0.16506126523017883,
"learning_rate": 2.768521413693411e-05,
"loss": 0.0454,
"step": 17050
},
{
"epoch": 0.24559438148993926,
"grad_norm": 0.14680777490139008,
"learning_rate": 2.7677998470278672e-05,
"loss": 0.0487,
"step": 17100
},
{
"epoch": 0.246312493716518,
"grad_norm": 1.6522406339645386,
"learning_rate": 2.7670782803623227e-05,
"loss": 0.0592,
"step": 17150
},
{
"epoch": 0.24703060594309678,
"grad_norm": 0.020861970260739326,
"learning_rate": 2.7663567136967786e-05,
"loss": 0.0501,
"step": 17200
},
{
"epoch": 0.24774871816967556,
"grad_norm": 0.17236000299453735,
"learning_rate": 2.7656351470312345e-05,
"loss": 0.0811,
"step": 17250
},
{
"epoch": 0.24846683039625433,
"grad_norm": 0.1144992858171463,
"learning_rate": 2.76491358036569e-05,
"loss": 0.0692,
"step": 17300
},
{
"epoch": 0.24918494262283308,
"grad_norm": 0.1321888267993927,
"learning_rate": 2.764192013700146e-05,
"loss": 0.0439,
"step": 17350
},
{
"epoch": 0.24990305484941186,
"grad_norm": 0.07972278445959091,
"learning_rate": 2.7634704470346017e-05,
"loss": 0.0479,
"step": 17400
},
{
"epoch": 0.25062116707599064,
"grad_norm": 0.029969004914164543,
"learning_rate": 2.7627488803690573e-05,
"loss": 0.0422,
"step": 17450
},
{
"epoch": 0.2513392793025694,
"grad_norm": 0.08719193935394287,
"learning_rate": 2.762027313703513e-05,
"loss": 0.0688,
"step": 17500
},
{
"epoch": 0.2520573915291482,
"grad_norm": 1.5696977376937866,
"learning_rate": 2.761305747037969e-05,
"loss": 0.0543,
"step": 17550
},
{
"epoch": 0.25277550375572694,
"grad_norm": 0.03598424419760704,
"learning_rate": 2.7605841803724246e-05,
"loss": 0.0468,
"step": 17600
},
{
"epoch": 0.25349361598230574,
"grad_norm": 0.0753965675830841,
"learning_rate": 2.7598626137068804e-05,
"loss": 0.0401,
"step": 17650
},
{
"epoch": 0.2542117282088845,
"grad_norm": 0.06296613067388535,
"learning_rate": 2.7591410470413363e-05,
"loss": 0.0523,
"step": 17700
},
{
"epoch": 0.25492984043546324,
"grad_norm": 0.10784654319286346,
"learning_rate": 2.7584194803757918e-05,
"loss": 0.0495,
"step": 17750
},
{
"epoch": 0.25564795266204204,
"grad_norm": 14.075483322143555,
"learning_rate": 2.7576979137102477e-05,
"loss": 0.0783,
"step": 17800
},
{
"epoch": 0.2563660648886208,
"grad_norm": 0.05900178104639053,
"learning_rate": 2.7569763470447036e-05,
"loss": 0.0306,
"step": 17850
},
{
"epoch": 0.25708417711519954,
"grad_norm": 0.07077598571777344,
"learning_rate": 2.7562547803791594e-05,
"loss": 0.0474,
"step": 17900
},
{
"epoch": 0.25780228934177835,
"grad_norm": 0.035241879522800446,
"learning_rate": 2.755533213713615e-05,
"loss": 0.0348,
"step": 17950
},
{
"epoch": 0.2585204015683571,
"grad_norm": 0.031246617436408997,
"learning_rate": 2.754811647048071e-05,
"loss": 0.0335,
"step": 18000
},
{
"epoch": 0.2592385137949359,
"grad_norm": 0.01713900826871395,
"learning_rate": 2.7540900803825267e-05,
"loss": 0.0395,
"step": 18050
},
{
"epoch": 0.25995662602151465,
"grad_norm": 27.746681213378906,
"learning_rate": 2.7533685137169823e-05,
"loss": 0.0682,
"step": 18100
},
{
"epoch": 0.2606747382480934,
"grad_norm": 0.17817170917987823,
"learning_rate": 2.752646947051438e-05,
"loss": 0.0677,
"step": 18150
},
{
"epoch": 0.2613928504746722,
"grad_norm": 0.026913467794656754,
"learning_rate": 2.751925380385894e-05,
"loss": 0.0458,
"step": 18200
},
{
"epoch": 0.26211096270125095,
"grad_norm": 1.5317715406417847,
"learning_rate": 2.7512038137203495e-05,
"loss": 0.062,
"step": 18250
},
{
"epoch": 0.2628290749278297,
"grad_norm": 0.04229836165904999,
"learning_rate": 2.7504822470548054e-05,
"loss": 0.0408,
"step": 18300
},
{
"epoch": 0.2635471871544085,
"grad_norm": 0.021231146529316902,
"learning_rate": 2.7497606803892613e-05,
"loss": 0.0425,
"step": 18350
},
{
"epoch": 0.26426529938098725,
"grad_norm": 1.7055615186691284,
"learning_rate": 2.7490391137237168e-05,
"loss": 0.055,
"step": 18400
},
{
"epoch": 0.26498341160756606,
"grad_norm": 0.1902083307504654,
"learning_rate": 2.7483175470581727e-05,
"loss": 0.0416,
"step": 18450
},
{
"epoch": 0.2657015238341448,
"grad_norm": 0.016266852617263794,
"learning_rate": 2.7475959803926286e-05,
"loss": 0.0543,
"step": 18500
},
{
"epoch": 0.26641963606072355,
"grad_norm": 19.51447868347168,
"learning_rate": 2.746874413727084e-05,
"loss": 0.0517,
"step": 18550
},
{
"epoch": 0.26713774828730236,
"grad_norm": 0.012048796750605106,
"learning_rate": 2.7461528470615403e-05,
"loss": 0.021,
"step": 18600
},
{
"epoch": 0.2678558605138811,
"grad_norm": 1.3456578254699707,
"learning_rate": 2.745431280395996e-05,
"loss": 0.0766,
"step": 18650
},
{
"epoch": 0.26857397274045985,
"grad_norm": 5.304058074951172,
"learning_rate": 2.7447097137304517e-05,
"loss": 0.1094,
"step": 18700
},
{
"epoch": 0.26929208496703866,
"grad_norm": 0.19321753084659576,
"learning_rate": 2.7439881470649076e-05,
"loss": 0.0706,
"step": 18750
},
{
"epoch": 0.2700101971936174,
"grad_norm": 0.23943820595741272,
"learning_rate": 2.743266580399363e-05,
"loss": 0.1026,
"step": 18800
},
{
"epoch": 0.2707283094201962,
"grad_norm": 0.08400757610797882,
"learning_rate": 2.742545013733819e-05,
"loss": 0.0598,
"step": 18850
},
{
"epoch": 0.27144642164677496,
"grad_norm": 0.2430121749639511,
"learning_rate": 2.741823447068275e-05,
"loss": 0.0693,
"step": 18900
},
{
"epoch": 0.2721645338733537,
"grad_norm": 0.06597993522882462,
"learning_rate": 2.7411163117360414e-05,
"loss": 0.0384,
"step": 18950
},
{
"epoch": 0.2728826460999325,
"grad_norm": 7.39203405380249,
"learning_rate": 2.7403947450704973e-05,
"loss": 0.0541,
"step": 19000
},
{
"epoch": 0.27360075832651126,
"grad_norm": 0.1847894936800003,
"learning_rate": 2.739673178404953e-05,
"loss": 0.0606,
"step": 19050
},
{
"epoch": 0.27431887055309,
"grad_norm": 0.03135521709918976,
"learning_rate": 2.7389516117394087e-05,
"loss": 0.042,
"step": 19100
},
{
"epoch": 0.2750369827796688,
"grad_norm": 1.6055335998535156,
"learning_rate": 2.7382300450738646e-05,
"loss": 0.0428,
"step": 19150
},
{
"epoch": 0.27575509500624756,
"grad_norm": 0.15340352058410645,
"learning_rate": 2.73750847840832e-05,
"loss": 0.0712,
"step": 19200
},
{
"epoch": 0.27647320723282637,
"grad_norm": 0.027920212596654892,
"learning_rate": 2.736786911742776e-05,
"loss": 0.0361,
"step": 19250
},
{
"epoch": 0.2771913194594051,
"grad_norm": 0.21404047310352325,
"learning_rate": 2.736065345077232e-05,
"loss": 0.0574,
"step": 19300
},
{
"epoch": 0.27790943168598387,
"grad_norm": 0.02065029740333557,
"learning_rate": 2.7353437784116874e-05,
"loss": 0.0331,
"step": 19350
},
{
"epoch": 0.27862754391256267,
"grad_norm": 6.572165012359619,
"learning_rate": 2.7346222117461433e-05,
"loss": 0.049,
"step": 19400
},
{
"epoch": 0.2793456561391414,
"grad_norm": 0.03197706490755081,
"learning_rate": 2.733900645080599e-05,
"loss": 0.0336,
"step": 19450
},
{
"epoch": 0.28006376836572017,
"grad_norm": 1.9417047500610352,
"learning_rate": 2.733179078415055e-05,
"loss": 0.0781,
"step": 19500
},
{
"epoch": 0.28078188059229897,
"grad_norm": 0.07425859570503235,
"learning_rate": 2.7324575117495106e-05,
"loss": 0.0499,
"step": 19550
},
{
"epoch": 0.2814999928188777,
"grad_norm": 0.09280502051115036,
"learning_rate": 2.7317359450839664e-05,
"loss": 0.0583,
"step": 19600
},
{
"epoch": 0.2822181050454565,
"grad_norm": 0.1167830228805542,
"learning_rate": 2.7310143784184223e-05,
"loss": 0.1167,
"step": 19650
},
{
"epoch": 0.2829362172720353,
"grad_norm": 7.878513813018799,
"learning_rate": 2.7302928117528778e-05,
"loss": 0.0615,
"step": 19700
},
{
"epoch": 0.283654329498614,
"grad_norm": 0.05659603700041771,
"learning_rate": 2.7295712450873337e-05,
"loss": 0.0561,
"step": 19750
},
{
"epoch": 0.2843724417251928,
"grad_norm": 0.061973392963409424,
"learning_rate": 2.7288496784217896e-05,
"loss": 0.0736,
"step": 19800
},
{
"epoch": 0.2850905539517716,
"grad_norm": 0.05980992689728737,
"learning_rate": 2.728128111756245e-05,
"loss": 0.0461,
"step": 19850
},
{
"epoch": 0.2858086661783504,
"grad_norm": 0.025946056470274925,
"learning_rate": 2.727406545090701e-05,
"loss": 0.0202,
"step": 19900
},
{
"epoch": 0.2865267784049291,
"grad_norm": 0.024165382608771324,
"learning_rate": 2.726684978425157e-05,
"loss": 0.0674,
"step": 19950
},
{
"epoch": 0.2872448906315079,
"grad_norm": 0.09668976813554764,
"learning_rate": 2.7259634117596124e-05,
"loss": 0.0576,
"step": 20000
},
{
"epoch": 0.2879630028580867,
"grad_norm": 0.11194832623004913,
"learning_rate": 2.7252418450940683e-05,
"loss": 0.035,
"step": 20050
},
{
"epoch": 0.28868111508466543,
"grad_norm": 0.024094650521874428,
"learning_rate": 2.724520278428524e-05,
"loss": 0.0435,
"step": 20100
},
{
"epoch": 0.2893992273112442,
"grad_norm": 0.05443573743104935,
"learning_rate": 2.7237987117629797e-05,
"loss": 0.0706,
"step": 20150
},
{
"epoch": 0.290117339537823,
"grad_norm": 8.078572273254395,
"learning_rate": 2.723077145097436e-05,
"loss": 0.0559,
"step": 20200
},
{
"epoch": 0.29083545176440173,
"grad_norm": 0.09734756499528885,
"learning_rate": 2.7223555784318914e-05,
"loss": 0.0895,
"step": 20250
},
{
"epoch": 0.29155356399098054,
"grad_norm": 0.12933404743671417,
"learning_rate": 2.721634011766347e-05,
"loss": 0.0383,
"step": 20300
},
{
"epoch": 0.2922716762175593,
"grad_norm": 0.5858670473098755,
"learning_rate": 2.720912445100803e-05,
"loss": 0.0853,
"step": 20350
},
{
"epoch": 0.29298978844413803,
"grad_norm": 40.67034912109375,
"learning_rate": 2.7201908784352587e-05,
"loss": 0.0467,
"step": 20400
},
{
"epoch": 0.29370790067071684,
"grad_norm": 9.080503463745117,
"learning_rate": 2.7194693117697142e-05,
"loss": 0.0588,
"step": 20450
},
{
"epoch": 0.2944260128972956,
"grad_norm": 13.065214157104492,
"learning_rate": 2.7187477451041704e-05,
"loss": 0.0269,
"step": 20500
},
{
"epoch": 0.29514412512387433,
"grad_norm": 0.03507602587342262,
"learning_rate": 2.718026178438626e-05,
"loss": 0.0486,
"step": 20550
},
{
"epoch": 0.29586223735045314,
"grad_norm": 0.053962577134370804,
"learning_rate": 2.7173046117730815e-05,
"loss": 0.0704,
"step": 20600
},
{
"epoch": 0.2965803495770319,
"grad_norm": 0.04702967032790184,
"learning_rate": 2.7165830451075377e-05,
"loss": 0.0503,
"step": 20650
},
{
"epoch": 0.2972984618036107,
"grad_norm": 0.04396852105855942,
"learning_rate": 2.7158614784419932e-05,
"loss": 0.0601,
"step": 20700
},
{
"epoch": 0.29801657403018944,
"grad_norm": 0.053248144686222076,
"learning_rate": 2.7151399117764488e-05,
"loss": 0.048,
"step": 20750
},
{
"epoch": 0.2987346862567682,
"grad_norm": 0.03361482545733452,
"learning_rate": 2.714418345110905e-05,
"loss": 0.0641,
"step": 20800
},
{
"epoch": 0.299452798483347,
"grad_norm": 0.05684931203722954,
"learning_rate": 2.7137112097786716e-05,
"loss": 0.0337,
"step": 20850
},
{
"epoch": 0.30017091070992574,
"grad_norm": 0.06214507669210434,
"learning_rate": 2.712989643113127e-05,
"loss": 0.0386,
"step": 20900
},
{
"epoch": 0.3008890229365045,
"grad_norm": 0.04733401909470558,
"learning_rate": 2.712268076447583e-05,
"loss": 0.071,
"step": 20950
},
{
"epoch": 0.3016071351630833,
"grad_norm": 3.680715560913086,
"learning_rate": 2.711546509782039e-05,
"loss": 0.0499,
"step": 21000
},
{
"epoch": 0.30232524738966204,
"grad_norm": 0.4213371276855469,
"learning_rate": 2.7108249431164944e-05,
"loss": 0.071,
"step": 21050
},
{
"epoch": 0.30304335961624085,
"grad_norm": 0.06743492186069489,
"learning_rate": 2.7101033764509506e-05,
"loss": 0.036,
"step": 21100
},
{
"epoch": 0.3037614718428196,
"grad_norm": 3.1243784427642822,
"learning_rate": 2.709381809785406e-05,
"loss": 0.0787,
"step": 21150
},
{
"epoch": 0.30447958406939835,
"grad_norm": 13.193207740783691,
"learning_rate": 2.7086602431198617e-05,
"loss": 0.0572,
"step": 21200
},
{
"epoch": 0.30519769629597715,
"grad_norm": 0.3255308270454407,
"learning_rate": 2.707938676454318e-05,
"loss": 0.0527,
"step": 21250
},
{
"epoch": 0.3059158085225559,
"grad_norm": 0.05115434527397156,
"learning_rate": 2.7072171097887734e-05,
"loss": 0.0287,
"step": 21300
},
{
"epoch": 0.30663392074913465,
"grad_norm": 1.7095527648925781,
"learning_rate": 2.7064955431232293e-05,
"loss": 0.0632,
"step": 21350
},
{
"epoch": 0.30735203297571345,
"grad_norm": 0.06429759413003922,
"learning_rate": 2.705773976457685e-05,
"loss": 0.0706,
"step": 21400
},
{
"epoch": 0.3080701452022922,
"grad_norm": 58.65373229980469,
"learning_rate": 2.7050524097921407e-05,
"loss": 0.0485,
"step": 21450
},
{
"epoch": 0.308788257428871,
"grad_norm": 0.047401025891304016,
"learning_rate": 2.7043308431265965e-05,
"loss": 0.0839,
"step": 21500
},
{
"epoch": 0.30950636965544975,
"grad_norm": 0.03714323043823242,
"learning_rate": 2.7036092764610524e-05,
"loss": 0.0399,
"step": 21550
},
{
"epoch": 0.3102244818820285,
"grad_norm": 0.06805309653282166,
"learning_rate": 2.702887709795508e-05,
"loss": 0.0665,
"step": 21600
},
{
"epoch": 0.3109425941086073,
"grad_norm": 0.5862714052200317,
"learning_rate": 2.7021661431299638e-05,
"loss": 0.0525,
"step": 21650
},
{
"epoch": 0.31166070633518606,
"grad_norm": 0.38594987988471985,
"learning_rate": 2.7014445764644197e-05,
"loss": 0.0772,
"step": 21700
},
{
"epoch": 0.31237881856176486,
"grad_norm": 0.0842975378036499,
"learning_rate": 2.7007230097988752e-05,
"loss": 0.063,
"step": 21750
},
{
"epoch": 0.3130969307883436,
"grad_norm": 0.1474410593509674,
"learning_rate": 2.7000014431333314e-05,
"loss": 0.0842,
"step": 21800
},
{
"epoch": 0.31381504301492236,
"grad_norm": 0.0783839225769043,
"learning_rate": 2.699279876467787e-05,
"loss": 0.0551,
"step": 21850
},
{
"epoch": 0.31453315524150116,
"grad_norm": 3.7803609371185303,
"learning_rate": 2.6985583098022425e-05,
"loss": 0.0933,
"step": 21900
},
{
"epoch": 0.3152512674680799,
"grad_norm": 0.12234686315059662,
"learning_rate": 2.6978367431366987e-05,
"loss": 0.0787,
"step": 21950
},
{
"epoch": 0.31596937969465866,
"grad_norm": 8.717498779296875,
"learning_rate": 2.6971151764711543e-05,
"loss": 0.0468,
"step": 22000
},
{
"epoch": 0.31668749192123746,
"grad_norm": 87.91850280761719,
"learning_rate": 2.6963936098056098e-05,
"loss": 0.0749,
"step": 22050
},
{
"epoch": 0.3174056041478162,
"grad_norm": 0.047568611800670624,
"learning_rate": 2.695672043140066e-05,
"loss": 0.033,
"step": 22100
},
{
"epoch": 0.318123716374395,
"grad_norm": 0.07533901929855347,
"learning_rate": 2.6949504764745215e-05,
"loss": 0.044,
"step": 22150
},
{
"epoch": 0.31884182860097376,
"grad_norm": 0.05188257247209549,
"learning_rate": 2.694228909808977e-05,
"loss": 0.0358,
"step": 22200
},
{
"epoch": 0.3195599408275525,
"grad_norm": 21.622936248779297,
"learning_rate": 2.6935073431434333e-05,
"loss": 0.06,
"step": 22250
},
{
"epoch": 0.3202780530541313,
"grad_norm": 0.05965098738670349,
"learning_rate": 2.6927857764778888e-05,
"loss": 0.0772,
"step": 22300
},
{
"epoch": 0.32099616528071007,
"grad_norm": 0.09030428528785706,
"learning_rate": 2.6920642098123443e-05,
"loss": 0.044,
"step": 22350
},
{
"epoch": 0.3217142775072888,
"grad_norm": 6.469705104827881,
"learning_rate": 2.6913426431468006e-05,
"loss": 0.0627,
"step": 22400
},
{
"epoch": 0.3224323897338676,
"grad_norm": 0.048977576196193695,
"learning_rate": 2.690621076481256e-05,
"loss": 0.0402,
"step": 22450
},
{
"epoch": 0.32315050196044637,
"grad_norm": 7.34023904800415,
"learning_rate": 2.689899509815712e-05,
"loss": 0.0661,
"step": 22500
},
{
"epoch": 0.32386861418702517,
"grad_norm": 3.076848268508911,
"learning_rate": 2.689177943150168e-05,
"loss": 0.0794,
"step": 22550
},
{
"epoch": 0.3245867264136039,
"grad_norm": 0.1343400776386261,
"learning_rate": 2.6884563764846234e-05,
"loss": 0.0601,
"step": 22600
},
{
"epoch": 0.32530483864018267,
"grad_norm": 0.022573702037334442,
"learning_rate": 2.6877348098190792e-05,
"loss": 0.0309,
"step": 22650
},
{
"epoch": 0.3260229508667615,
"grad_norm": 0.022028079256415367,
"learning_rate": 2.687013243153535e-05,
"loss": 0.0586,
"step": 22700
},
{
"epoch": 0.3267410630933402,
"grad_norm": 0.03373101353645325,
"learning_rate": 2.6862916764879906e-05,
"loss": 0.048,
"step": 22750
},
{
"epoch": 0.32745917531991897,
"grad_norm": 7.364265441894531,
"learning_rate": 2.6855701098224465e-05,
"loss": 0.0303,
"step": 22800
},
{
"epoch": 0.3281772875464978,
"grad_norm": 7.823999881744385,
"learning_rate": 2.6848485431569024e-05,
"loss": 0.0647,
"step": 22850
},
{
"epoch": 0.3288953997730765,
"grad_norm": 0.02031027339398861,
"learning_rate": 2.684126976491358e-05,
"loss": 0.0472,
"step": 22900
},
{
"epoch": 0.32961351199965533,
"grad_norm": 27.554948806762695,
"learning_rate": 2.6834054098258138e-05,
"loss": 0.0451,
"step": 22950
},
{
"epoch": 0.3303316242262341,
"grad_norm": 1.5407648086547852,
"learning_rate": 2.6826838431602697e-05,
"loss": 0.0827,
"step": 23000
},
{
"epoch": 0.3310497364528128,
"grad_norm": 0.15170975029468536,
"learning_rate": 2.6819622764947252e-05,
"loss": 0.0629,
"step": 23050
},
{
"epoch": 0.33176784867939163,
"grad_norm": 9.083374977111816,
"learning_rate": 2.681240709829181e-05,
"loss": 0.0818,
"step": 23100
},
{
"epoch": 0.3324859609059704,
"grad_norm": 0.021673114970326424,
"learning_rate": 2.680519143163637e-05,
"loss": 0.0471,
"step": 23150
},
{
"epoch": 0.33320407313254913,
"grad_norm": 0.01671200804412365,
"learning_rate": 2.6797975764980928e-05,
"loss": 0.0222,
"step": 23200
},
{
"epoch": 0.33392218535912793,
"grad_norm": 0.01688038930296898,
"learning_rate": 2.6790760098325487e-05,
"loss": 0.0343,
"step": 23250
},
{
"epoch": 0.3346402975857067,
"grad_norm": 0.025257810950279236,
"learning_rate": 2.6783544431670042e-05,
"loss": 0.0242,
"step": 23300
},
{
"epoch": 0.3353584098122855,
"grad_norm": 0.016638273373246193,
"learning_rate": 2.67763287650146e-05,
"loss": 0.0408,
"step": 23350
},
{
"epoch": 0.33607652203886423,
"grad_norm": 1.6154401302337646,
"learning_rate": 2.676911309835916e-05,
"loss": 0.0208,
"step": 23400
},
{
"epoch": 0.336794634265443,
"grad_norm": 1.148065209388733,
"learning_rate": 2.6761897431703715e-05,
"loss": 0.0848,
"step": 23450
},
{
"epoch": 0.3375127464920218,
"grad_norm": 3.4515271186828613,
"learning_rate": 2.6754681765048274e-05,
"loss": 0.0402,
"step": 23500
},
{
"epoch": 0.33823085871860054,
"grad_norm": 0.04399702697992325,
"learning_rate": 2.6747466098392832e-05,
"loss": 0.0293,
"step": 23550
},
{
"epoch": 0.33894897094517934,
"grad_norm": 0.023840222507715225,
"learning_rate": 2.6740250431737388e-05,
"loss": 0.0567,
"step": 23600
},
{
"epoch": 0.3396670831717581,
"grad_norm": 0.18096154928207397,
"learning_rate": 2.6733034765081947e-05,
"loss": 0.0792,
"step": 23650
},
{
"epoch": 0.34038519539833684,
"grad_norm": 2.950413227081299,
"learning_rate": 2.6725819098426505e-05,
"loss": 0.0568,
"step": 23700
},
{
"epoch": 0.34110330762491564,
"grad_norm": 0.03382248431444168,
"learning_rate": 2.671860343177106e-05,
"loss": 0.0389,
"step": 23750
},
{
"epoch": 0.3418214198514944,
"grad_norm": 0.06225144863128662,
"learning_rate": 2.671138776511562e-05,
"loss": 0.027,
"step": 23800
},
{
"epoch": 0.34253953207807314,
"grad_norm": 0.11203402280807495,
"learning_rate": 2.6704172098460178e-05,
"loss": 0.0481,
"step": 23850
},
{
"epoch": 0.34325764430465194,
"grad_norm": 0.028412073850631714,
"learning_rate": 2.6696956431804737e-05,
"loss": 0.0472,
"step": 23900
},
{
"epoch": 0.3439757565312307,
"grad_norm": 0.043254122138023376,
"learning_rate": 2.6689740765149292e-05,
"loss": 0.0524,
"step": 23950
},
{
"epoch": 0.3446938687578095,
"grad_norm": 0.1154152899980545,
"learning_rate": 2.668252509849385e-05,
"loss": 0.0625,
"step": 24000
},
{
"epoch": 0.34541198098438824,
"grad_norm": 0.0551941879093647,
"learning_rate": 2.667530943183841e-05,
"loss": 0.065,
"step": 24050
},
{
"epoch": 0.346130093210967,
"grad_norm": 5.94352388381958,
"learning_rate": 2.6668093765182965e-05,
"loss": 0.0645,
"step": 24100
},
{
"epoch": 0.3468482054375458,
"grad_norm": 0.05754236876964569,
"learning_rate": 2.6660878098527524e-05,
"loss": 0.0396,
"step": 24150
},
{
"epoch": 0.34756631766412455,
"grad_norm": 0.032528094947338104,
"learning_rate": 2.6653662431872082e-05,
"loss": 0.0379,
"step": 24200
},
{
"epoch": 0.3482844298907033,
"grad_norm": 1.8535875082015991,
"learning_rate": 2.6646446765216638e-05,
"loss": 0.0479,
"step": 24250
},
{
"epoch": 0.3490025421172821,
"grad_norm": 0.05355213209986687,
"learning_rate": 2.6639231098561196e-05,
"loss": 0.0337,
"step": 24300
},
{
"epoch": 0.34972065434386085,
"grad_norm": 0.12058199197053909,
"learning_rate": 2.6632015431905755e-05,
"loss": 0.0764,
"step": 24350
},
{
"epoch": 0.35043876657043965,
"grad_norm": 20.0172176361084,
"learning_rate": 2.662479976525031e-05,
"loss": 0.0692,
"step": 24400
},
{
"epoch": 0.3511568787970184,
"grad_norm": 1.6164151430130005,
"learning_rate": 2.6617584098594873e-05,
"loss": 0.0617,
"step": 24450
},
{
"epoch": 0.35187499102359715,
"grad_norm": 0.1497562974691391,
"learning_rate": 2.6610368431939428e-05,
"loss": 0.0615,
"step": 24500
},
{
"epoch": 0.35259310325017595,
"grad_norm": 0.18678772449493408,
"learning_rate": 2.6603152765283983e-05,
"loss": 0.051,
"step": 24550
},
{
"epoch": 0.3533112154767547,
"grad_norm": 0.29123035073280334,
"learning_rate": 2.6595937098628545e-05,
"loss": 0.0606,
"step": 24600
},
{
"epoch": 0.35402932770333345,
"grad_norm": 3.2221381664276123,
"learning_rate": 2.65887214319731e-05,
"loss": 0.0341,
"step": 24650
},
{
"epoch": 0.35474743992991226,
"grad_norm": 0.10685055702924728,
"learning_rate": 2.6581505765317656e-05,
"loss": 0.0843,
"step": 24700
},
{
"epoch": 0.355465552156491,
"grad_norm": 0.10422638803720474,
"learning_rate": 2.6574290098662218e-05,
"loss": 0.0547,
"step": 24750
},
{
"epoch": 0.3561836643830698,
"grad_norm": 0.08146257698535919,
"learning_rate": 2.6567074432006773e-05,
"loss": 0.0439,
"step": 24800
},
{
"epoch": 0.35690177660964856,
"grad_norm": 0.15033039450645447,
"learning_rate": 2.655985876535133e-05,
"loss": 0.0713,
"step": 24850
},
{
"epoch": 0.3576198888362273,
"grad_norm": 0.3770735561847687,
"learning_rate": 2.655264309869589e-05,
"loss": 0.0199,
"step": 24900
},
{
"epoch": 0.3583380010628061,
"grad_norm": 0.5093479156494141,
"learning_rate": 2.6545427432040446e-05,
"loss": 0.0632,
"step": 24950
},
{
"epoch": 0.35905611328938486,
"grad_norm": 0.03541666641831398,
"learning_rate": 2.6538211765385e-05,
"loss": 0.0494,
"step": 25000
},
{
"epoch": 0.3597742255159636,
"grad_norm": 0.10205601900815964,
"learning_rate": 2.6530996098729564e-05,
"loss": 0.1011,
"step": 25050
},
{
"epoch": 0.3604923377425424,
"grad_norm": 11.667498588562012,
"learning_rate": 2.652378043207412e-05,
"loss": 0.0367,
"step": 25100
},
{
"epoch": 0.36121044996912116,
"grad_norm": 0.04607768356800079,
"learning_rate": 2.6516564765418678e-05,
"loss": 0.0456,
"step": 25150
},
{
"epoch": 0.36192856219569997,
"grad_norm": 0.3675365746021271,
"learning_rate": 2.6509493412096344e-05,
"loss": 0.0653,
"step": 25200
},
{
"epoch": 0.3626466744222787,
"grad_norm": 0.06698648631572723,
"learning_rate": 2.6502277745440902e-05,
"loss": 0.053,
"step": 25250
},
{
"epoch": 0.36336478664885746,
"grad_norm": 2.359265089035034,
"learning_rate": 2.6495062078785458e-05,
"loss": 0.0399,
"step": 25300
},
{
"epoch": 0.36408289887543627,
"grad_norm": 1.7382354736328125,
"learning_rate": 2.648784641213002e-05,
"loss": 0.0438,
"step": 25350
},
{
"epoch": 0.364801011102015,
"grad_norm": 10.03585433959961,
"learning_rate": 2.6480630745474575e-05,
"loss": 0.0469,
"step": 25400
},
{
"epoch": 0.3655191233285938,
"grad_norm": 0.02849421463906765,
"learning_rate": 2.647341507881913e-05,
"loss": 0.039,
"step": 25450
},
{
"epoch": 0.36623723555517257,
"grad_norm": 0.04304594174027443,
"learning_rate": 2.6466199412163692e-05,
"loss": 0.0481,
"step": 25500
},
{
"epoch": 0.3669553477817513,
"grad_norm": 0.12656258046627045,
"learning_rate": 2.6458983745508248e-05,
"loss": 0.0584,
"step": 25550
},
{
"epoch": 0.3676734600083301,
"grad_norm": 0.01808289811015129,
"learning_rate": 2.6451768078852807e-05,
"loss": 0.0405,
"step": 25600
},
{
"epoch": 0.36839157223490887,
"grad_norm": 3.8107683658599854,
"learning_rate": 2.6444552412197365e-05,
"loss": 0.0542,
"step": 25650
},
{
"epoch": 0.3691096844614876,
"grad_norm": 0.02769179455935955,
"learning_rate": 2.643733674554192e-05,
"loss": 0.0452,
"step": 25700
},
{
"epoch": 0.3698277966880664,
"grad_norm": 0.10629933327436447,
"learning_rate": 2.643012107888648e-05,
"loss": 0.0545,
"step": 25750
},
{
"epoch": 0.3705459089146452,
"grad_norm": 0.025269586592912674,
"learning_rate": 2.6422905412231038e-05,
"loss": 0.0353,
"step": 25800
},
{
"epoch": 0.371264021141224,
"grad_norm": 0.04765285179018974,
"learning_rate": 2.6415689745575593e-05,
"loss": 0.042,
"step": 25850
},
{
"epoch": 0.3719821333678027,
"grad_norm": 0.020955292508006096,
"learning_rate": 2.6408474078920152e-05,
"loss": 0.0403,
"step": 25900
},
{
"epoch": 0.3727002455943815,
"grad_norm": 0.03894009068608284,
"learning_rate": 2.640125841226471e-05,
"loss": 0.0184,
"step": 25950
},
{
"epoch": 0.3734183578209603,
"grad_norm": 0.01537378504872322,
"learning_rate": 2.6394042745609266e-05,
"loss": 0.0491,
"step": 26000
},
{
"epoch": 0.374136470047539,
"grad_norm": 0.17179059982299805,
"learning_rate": 2.6386827078953828e-05,
"loss": 0.032,
"step": 26050
},
{
"epoch": 0.3748545822741178,
"grad_norm": 0.03283374384045601,
"learning_rate": 2.6379611412298384e-05,
"loss": 0.0686,
"step": 26100
},
{
"epoch": 0.3755726945006966,
"grad_norm": 0.015107177197933197,
"learning_rate": 2.637239574564294e-05,
"loss": 0.0412,
"step": 26150
},
{
"epoch": 0.37629080672727533,
"grad_norm": 0.9494103193283081,
"learning_rate": 2.63651800789875e-05,
"loss": 0.1147,
"step": 26200
},
{
"epoch": 0.37700891895385413,
"grad_norm": 0.035837531089782715,
"learning_rate": 2.6357964412332056e-05,
"loss": 0.0311,
"step": 26250
},
{
"epoch": 0.3777270311804329,
"grad_norm": 0.05149250105023384,
"learning_rate": 2.6350893059009722e-05,
"loss": 0.0504,
"step": 26300
},
{
"epoch": 0.37844514340701163,
"grad_norm": 0.05706680193543434,
"learning_rate": 2.634367739235428e-05,
"loss": 0.0697,
"step": 26350
},
{
"epoch": 0.37916325563359043,
"grad_norm": 0.5677555203437805,
"learning_rate": 2.633646172569884e-05,
"loss": 0.0699,
"step": 26400
},
{
"epoch": 0.3798813678601692,
"grad_norm": 2.2311105728149414,
"learning_rate": 2.6329246059043395e-05,
"loss": 0.0491,
"step": 26450
},
{
"epoch": 0.38059948008674793,
"grad_norm": 0.05452633649110794,
"learning_rate": 2.6322030392387954e-05,
"loss": 0.052,
"step": 26500
},
{
"epoch": 0.38131759231332674,
"grad_norm": 0.10342645645141602,
"learning_rate": 2.6314814725732512e-05,
"loss": 0.0431,
"step": 26550
},
{
"epoch": 0.3820357045399055,
"grad_norm": 7.60914421081543,
"learning_rate": 2.6307599059077068e-05,
"loss": 0.0637,
"step": 26600
},
{
"epoch": 0.3827538167664843,
"grad_norm": 5.700191497802734,
"learning_rate": 2.6300383392421626e-05,
"loss": 0.0637,
"step": 26650
},
{
"epoch": 0.38347192899306304,
"grad_norm": 2.0660006999969482,
"learning_rate": 2.6293167725766185e-05,
"loss": 0.0728,
"step": 26700
},
{
"epoch": 0.3841900412196418,
"grad_norm": 0.04174574464559555,
"learning_rate": 2.628595205911074e-05,
"loss": 0.0446,
"step": 26750
},
{
"epoch": 0.3849081534462206,
"grad_norm": 3.7447893619537354,
"learning_rate": 2.62787363924553e-05,
"loss": 0.0301,
"step": 26800
},
{
"epoch": 0.38562626567279934,
"grad_norm": 2.0020968914031982,
"learning_rate": 2.6271520725799858e-05,
"loss": 0.0679,
"step": 26850
},
{
"epoch": 0.3863443778993781,
"grad_norm": 0.04318321496248245,
"learning_rate": 2.6264305059144413e-05,
"loss": 0.0533,
"step": 26900
},
{
"epoch": 0.3870624901259569,
"grad_norm": 0.05878012999892235,
"learning_rate": 2.6257089392488975e-05,
"loss": 0.0537,
"step": 26950
},
{
"epoch": 0.38778060235253564,
"grad_norm": 28.61198616027832,
"learning_rate": 2.624987372583353e-05,
"loss": 0.0554,
"step": 27000
},
{
"epoch": 0.38849871457911445,
"grad_norm": 0.12207271158695221,
"learning_rate": 2.6242658059178086e-05,
"loss": 0.0566,
"step": 27050
},
{
"epoch": 0.3892168268056932,
"grad_norm": 2.0619852542877197,
"learning_rate": 2.6235442392522648e-05,
"loss": 0.0577,
"step": 27100
},
{
"epoch": 0.38993493903227194,
"grad_norm": 0.03043234907090664,
"learning_rate": 2.6228226725867203e-05,
"loss": 0.041,
"step": 27150
},
{
"epoch": 0.39065305125885075,
"grad_norm": 0.07814677804708481,
"learning_rate": 2.622101105921176e-05,
"loss": 0.0335,
"step": 27200
},
{
"epoch": 0.3913711634854295,
"grad_norm": 0.3286628723144531,
"learning_rate": 2.621379539255632e-05,
"loss": 0.0667,
"step": 27250
},
{
"epoch": 0.39208927571200825,
"grad_norm": 0.0443241186439991,
"learning_rate": 2.6206579725900876e-05,
"loss": 0.0337,
"step": 27300
},
{
"epoch": 0.39280738793858705,
"grad_norm": 0.0626874640583992,
"learning_rate": 2.619936405924543e-05,
"loss": 0.0565,
"step": 27350
},
{
"epoch": 0.3935255001651658,
"grad_norm": 4.9472808837890625,
"learning_rate": 2.6192148392589994e-05,
"loss": 0.0407,
"step": 27400
},
{
"epoch": 0.3942436123917446,
"grad_norm": 0.1261199414730072,
"learning_rate": 2.618493272593455e-05,
"loss": 0.0544,
"step": 27450
},
{
"epoch": 0.39496172461832335,
"grad_norm": 0.02840198390185833,
"learning_rate": 2.6177717059279104e-05,
"loss": 0.0342,
"step": 27500
},
{
"epoch": 0.3956798368449021,
"grad_norm": 0.04255662113428116,
"learning_rate": 2.6170501392623667e-05,
"loss": 0.0542,
"step": 27550
},
{
"epoch": 0.3963979490714809,
"grad_norm": 0.08006764948368073,
"learning_rate": 2.6163285725968222e-05,
"loss": 0.0821,
"step": 27600
},
{
"epoch": 0.39711606129805965,
"grad_norm": 2.240722179412842,
"learning_rate": 2.615607005931278e-05,
"loss": 0.0599,
"step": 27650
},
{
"epoch": 0.39783417352463846,
"grad_norm": 0.050311390310525894,
"learning_rate": 2.614885439265734e-05,
"loss": 0.0281,
"step": 27700
},
{
"epoch": 0.3985522857512172,
"grad_norm": 0.03226127102971077,
"learning_rate": 2.6141638726001895e-05,
"loss": 0.0493,
"step": 27750
},
{
"epoch": 0.39927039797779595,
"grad_norm": 0.2031366527080536,
"learning_rate": 2.6134423059346457e-05,
"loss": 0.0696,
"step": 27800
},
{
"epoch": 0.39998851020437476,
"grad_norm": 0.5344455242156982,
"learning_rate": 2.6127207392691012e-05,
"loss": 0.0543,
"step": 27850
},
{
"epoch": 0.4007066224309535,
"grad_norm": 0.02825642190873623,
"learning_rate": 2.6119991726035567e-05,
"loss": 0.0506,
"step": 27900
},
{
"epoch": 0.40142473465753226,
"grad_norm": 0.05845210328698158,
"learning_rate": 2.611277605938013e-05,
"loss": 0.0466,
"step": 27950
},
{
"epoch": 0.40214284688411106,
"grad_norm": 0.07064907997846603,
"learning_rate": 2.6105560392724685e-05,
"loss": 0.0246,
"step": 28000
},
{
"epoch": 0.4028609591106898,
"grad_norm": 71.78559875488281,
"learning_rate": 2.609834472606924e-05,
"loss": 0.0838,
"step": 28050
},
{
"epoch": 0.4035790713372686,
"grad_norm": 0.0453072227537632,
"learning_rate": 2.6091129059413802e-05,
"loss": 0.0378,
"step": 28100
},
{
"epoch": 0.40429718356384736,
"grad_norm": 0.054179996252059937,
"learning_rate": 2.6083913392758358e-05,
"loss": 0.05,
"step": 28150
},
{
"epoch": 0.4050152957904261,
"grad_norm": 0.021717887371778488,
"learning_rate": 2.6076697726102913e-05,
"loss": 0.0223,
"step": 28200
},
{
"epoch": 0.4057334080170049,
"grad_norm": 0.022488858550786972,
"learning_rate": 2.6069482059447475e-05,
"loss": 0.0487,
"step": 28250
},
{
"epoch": 0.40645152024358366,
"grad_norm": 0.056543637067079544,
"learning_rate": 2.606226639279203e-05,
"loss": 0.06,
"step": 28300
},
{
"epoch": 0.4071696324701624,
"grad_norm": 14.041226387023926,
"learning_rate": 2.605505072613659e-05,
"loss": 0.038,
"step": 28350
},
{
"epoch": 0.4078877446967412,
"grad_norm": 0.8094896078109741,
"learning_rate": 2.6047835059481148e-05,
"loss": 0.0489,
"step": 28400
},
{
"epoch": 0.40860585692331997,
"grad_norm": 0.044881440699100494,
"learning_rate": 2.6040619392825703e-05,
"loss": 0.057,
"step": 28450
},
{
"epoch": 0.40932396914989877,
"grad_norm": 0.0832076370716095,
"learning_rate": 2.6033403726170262e-05,
"loss": 0.0505,
"step": 28500
},
{
"epoch": 0.4100420813764775,
"grad_norm": 15.860810279846191,
"learning_rate": 2.602618805951482e-05,
"loss": 0.0532,
"step": 28550
},
{
"epoch": 0.41076019360305627,
"grad_norm": 0.11455155164003372,
"learning_rate": 2.6018972392859376e-05,
"loss": 0.0585,
"step": 28600
},
{
"epoch": 0.41147830582963507,
"grad_norm": 0.07686082273721695,
"learning_rate": 2.6011756726203935e-05,
"loss": 0.0818,
"step": 28650
},
{
"epoch": 0.4121964180562138,
"grad_norm": 63.3516731262207,
"learning_rate": 2.6004541059548493e-05,
"loss": 0.0771,
"step": 28700
},
{
"epoch": 0.41291453028279257,
"grad_norm": 0.06112220883369446,
"learning_rate": 2.599732539289305e-05,
"loss": 0.0917,
"step": 28750
},
{
"epoch": 0.4136326425093714,
"grad_norm": 20.554012298583984,
"learning_rate": 2.5990109726237607e-05,
"loss": 0.0626,
"step": 28800
},
{
"epoch": 0.4143507547359501,
"grad_norm": 0.05520211532711983,
"learning_rate": 2.5982894059582166e-05,
"loss": 0.046,
"step": 28850
},
{
"epoch": 0.4150688669625289,
"grad_norm": 0.0589030385017395,
"learning_rate": 2.597567839292672e-05,
"loss": 0.034,
"step": 28900
},
{
"epoch": 0.4157869791891077,
"grad_norm": 0.03793846070766449,
"learning_rate": 2.596846272627128e-05,
"loss": 0.0324,
"step": 28950
},
{
"epoch": 0.4165050914156864,
"grad_norm": 0.02868301048874855,
"learning_rate": 2.596124705961584e-05,
"loss": 0.0415,
"step": 29000
},
{
"epoch": 0.41722320364226523,
"grad_norm": 14.847681045532227,
"learning_rate": 2.5954031392960398e-05,
"loss": 0.0395,
"step": 29050
},
{
"epoch": 0.417941315868844,
"grad_norm": 0.06720839440822601,
"learning_rate": 2.5946815726304953e-05,
"loss": 0.0361,
"step": 29100
},
{
"epoch": 0.4186594280954227,
"grad_norm": 0.03984254598617554,
"learning_rate": 2.5939600059649512e-05,
"loss": 0.0498,
"step": 29150
},
{
"epoch": 0.41937754032200153,
"grad_norm": 1.7163082361221313,
"learning_rate": 2.593238439299407e-05,
"loss": 0.0174,
"step": 29200
},
{
"epoch": 0.4200956525485803,
"grad_norm": 0.014802789315581322,
"learning_rate": 2.5925168726338626e-05,
"loss": 0.0374,
"step": 29250
},
{
"epoch": 0.4208137647751591,
"grad_norm": 0.017352402210235596,
"learning_rate": 2.5917953059683185e-05,
"loss": 0.0211,
"step": 29300
},
{
"epoch": 0.42153187700173783,
"grad_norm": 0.07344254106283188,
"learning_rate": 2.5910737393027743e-05,
"loss": 0.0502,
"step": 29350
},
{
"epoch": 0.4222499892283166,
"grad_norm": 0.03359295800328255,
"learning_rate": 2.59035217263723e-05,
"loss": 0.0258,
"step": 29400
},
{
"epoch": 0.4229681014548954,
"grad_norm": 0.45311981439590454,
"learning_rate": 2.5896306059716857e-05,
"loss": 0.0318,
"step": 29450
},
{
"epoch": 0.42368621368147413,
"grad_norm": 0.3234398663043976,
"learning_rate": 2.5889090393061416e-05,
"loss": 0.0546,
"step": 29500
},
{
"epoch": 0.42440432590805294,
"grad_norm": 0.024956252425909042,
"learning_rate": 2.588187472640597e-05,
"loss": 0.0266,
"step": 29550
},
{
"epoch": 0.4251224381346317,
"grad_norm": 0.10940051078796387,
"learning_rate": 2.587465905975053e-05,
"loss": 0.0897,
"step": 29600
},
{
"epoch": 0.42584055036121043,
"grad_norm": 0.07653014361858368,
"learning_rate": 2.586744339309509e-05,
"loss": 0.0327,
"step": 29650
},
{
"epoch": 0.42655866258778924,
"grad_norm": 0.1112053170800209,
"learning_rate": 2.5860227726439644e-05,
"loss": 0.0514,
"step": 29700
},
{
"epoch": 0.427276774814368,
"grad_norm": 2.1015751361846924,
"learning_rate": 2.5853012059784206e-05,
"loss": 0.119,
"step": 29750
},
{
"epoch": 0.42799488704094674,
"grad_norm": 0.05717989802360535,
"learning_rate": 2.584579639312876e-05,
"loss": 0.0536,
"step": 29800
},
{
"epoch": 0.42871299926752554,
"grad_norm": 32.660850524902344,
"learning_rate": 2.583858072647332e-05,
"loss": 0.0992,
"step": 29850
},
{
"epoch": 0.4294311114941043,
"grad_norm": 0.06266611069440842,
"learning_rate": 2.583136505981788e-05,
"loss": 0.0568,
"step": 29900
},
{
"epoch": 0.4301492237206831,
"grad_norm": 0.0731596052646637,
"learning_rate": 2.5824149393162434e-05,
"loss": 0.0534,
"step": 29950
},
{
"epoch": 0.43086733594726184,
"grad_norm": 0.0834427997469902,
"learning_rate": 2.5816933726506993e-05,
"loss": 0.0688,
"step": 30000
},
{
"epoch": 0.4315854481738406,
"grad_norm": 0.917650580406189,
"learning_rate": 2.5809718059851552e-05,
"loss": 0.0559,
"step": 30050
},
{
"epoch": 0.4323035604004194,
"grad_norm": 0.11482948809862137,
"learning_rate": 2.5802646706529218e-05,
"loss": 0.0669,
"step": 30100
},
{
"epoch": 0.43302167262699814,
"grad_norm": 0.0454438179731369,
"learning_rate": 2.5795431039873776e-05,
"loss": 0.0308,
"step": 30150
},
{
"epoch": 0.4337397848535769,
"grad_norm": 0.20670810341835022,
"learning_rate": 2.578821537321833e-05,
"loss": 0.0416,
"step": 30200
},
{
"epoch": 0.4344578970801557,
"grad_norm": 0.03354871645569801,
"learning_rate": 2.578099970656289e-05,
"loss": 0.0222,
"step": 30250
},
{
"epoch": 0.43517600930673445,
"grad_norm": 5.863499164581299,
"learning_rate": 2.577378403990745e-05,
"loss": 0.0687,
"step": 30300
},
{
"epoch": 0.43589412153331325,
"grad_norm": 0.5545240044593811,
"learning_rate": 2.5766568373252004e-05,
"loss": 0.0578,
"step": 30350
},
{
"epoch": 0.436612233759892,
"grad_norm": 0.10319915413856506,
"learning_rate": 2.5759352706596563e-05,
"loss": 0.0471,
"step": 30400
},
{
"epoch": 0.43733034598647075,
"grad_norm": 0.030765533447265625,
"learning_rate": 2.5752137039941122e-05,
"loss": 0.0519,
"step": 30450
},
{
"epoch": 0.43804845821304955,
"grad_norm": 477.0625305175781,
"learning_rate": 2.5745209999951898e-05,
"loss": 0.0669,
"step": 30500
},
{
"epoch": 0.4387665704396283,
"grad_norm": 228.48228454589844,
"learning_rate": 2.5737994333296453e-05,
"loss": 0.0614,
"step": 30550
},
{
"epoch": 0.43948468266620705,
"grad_norm": 1.823301076889038,
"learning_rate": 2.5730778666641012e-05,
"loss": 0.052,
"step": 30600
},
{
"epoch": 0.44020279489278585,
"grad_norm": 0.8825114369392395,
"learning_rate": 2.572356299998557e-05,
"loss": 0.0514,
"step": 30650
},
{
"epoch": 0.4409209071193646,
"grad_norm": 0.08648502081632614,
"learning_rate": 2.5716347333330126e-05,
"loss": 0.0344,
"step": 30700
},
{
"epoch": 0.4416390193459434,
"grad_norm": 0.038441527634859085,
"learning_rate": 2.5709275980007792e-05,
"loss": 0.055,
"step": 30750
},
{
"epoch": 0.44235713157252216,
"grad_norm": 0.09304679930210114,
"learning_rate": 2.570206031335235e-05,
"loss": 0.0497,
"step": 30800
},
{
"epoch": 0.4430752437991009,
"grad_norm": 1.5768539905548096,
"learning_rate": 2.569484464669691e-05,
"loss": 0.0578,
"step": 30850
},
{
"epoch": 0.4437933560256797,
"grad_norm": 0.02988545596599579,
"learning_rate": 2.5687628980041468e-05,
"loss": 0.0325,
"step": 30900
},
{
"epoch": 0.44451146825225846,
"grad_norm": 0.10128447413444519,
"learning_rate": 2.5680413313386024e-05,
"loss": 0.03,
"step": 30950
},
{
"epoch": 0.4452295804788372,
"grad_norm": 0.06121072173118591,
"learning_rate": 2.5673197646730582e-05,
"loss": 0.0644,
"step": 31000
},
{
"epoch": 0.445947692705416,
"grad_norm": 0.20161040127277374,
"learning_rate": 2.566598198007514e-05,
"loss": 0.0763,
"step": 31050
},
{
"epoch": 0.44666580493199476,
"grad_norm": 0.619103729724884,
"learning_rate": 2.5658766313419696e-05,
"loss": 0.0457,
"step": 31100
},
{
"epoch": 0.44738391715857356,
"grad_norm": 1.6349852085113525,
"learning_rate": 2.5651550646764255e-05,
"loss": 0.052,
"step": 31150
},
{
"epoch": 0.4481020293851523,
"grad_norm": 0.12041996419429779,
"learning_rate": 2.5644334980108814e-05,
"loss": 0.0495,
"step": 31200
},
{
"epoch": 0.44882014161173106,
"grad_norm": 3.013185977935791,
"learning_rate": 2.5637119313453372e-05,
"loss": 0.0597,
"step": 31250
},
{
"epoch": 0.44953825383830986,
"grad_norm": 0.07766876369714737,
"learning_rate": 2.5629903646797928e-05,
"loss": 0.0674,
"step": 31300
},
{
"epoch": 0.4502563660648886,
"grad_norm": 0.4663377106189728,
"learning_rate": 2.5622687980142487e-05,
"loss": 0.0701,
"step": 31350
},
{
"epoch": 0.4509744782914674,
"grad_norm": 0.045421402901411057,
"learning_rate": 2.5615472313487045e-05,
"loss": 0.0302,
"step": 31400
},
{
"epoch": 0.45169259051804617,
"grad_norm": 0.19363941252231598,
"learning_rate": 2.56082566468316e-05,
"loss": 0.074,
"step": 31450
},
{
"epoch": 0.4524107027446249,
"grad_norm": 3.0277023315429688,
"learning_rate": 2.560104098017616e-05,
"loss": 0.0802,
"step": 31500
},
{
"epoch": 0.4531288149712037,
"grad_norm": 87.38606262207031,
"learning_rate": 2.5593825313520718e-05,
"loss": 0.0384,
"step": 31550
},
{
"epoch": 0.45384692719778247,
"grad_norm": 0.049042146652936935,
"learning_rate": 2.5586609646865273e-05,
"loss": 0.0169,
"step": 31600
},
{
"epoch": 0.4545650394243612,
"grad_norm": 0.051833972334861755,
"learning_rate": 2.5579393980209832e-05,
"loss": 0.0518,
"step": 31650
},
{
"epoch": 0.45528315165094,
"grad_norm": 0.03294980525970459,
"learning_rate": 2.557217831355439e-05,
"loss": 0.0493,
"step": 31700
},
{
"epoch": 0.45600126387751877,
"grad_norm": 19.67607307434082,
"learning_rate": 2.5564962646898946e-05,
"loss": 0.0251,
"step": 31750
},
{
"epoch": 0.4567193761040976,
"grad_norm": 0.062403663992881775,
"learning_rate": 2.5557746980243505e-05,
"loss": 0.0705,
"step": 31800
},
{
"epoch": 0.4574374883306763,
"grad_norm": 2.5241332054138184,
"learning_rate": 2.5550531313588064e-05,
"loss": 0.0559,
"step": 31850
},
{
"epoch": 0.45815560055725507,
"grad_norm": 0.09829045087099075,
"learning_rate": 2.554331564693262e-05,
"loss": 0.0732,
"step": 31900
},
{
"epoch": 0.4588737127838339,
"grad_norm": 0.07366979122161865,
"learning_rate": 2.553609998027718e-05,
"loss": 0.0412,
"step": 31950
},
{
"epoch": 0.4595918250104126,
"grad_norm": 0.4203979969024658,
"learning_rate": 2.5528884313621736e-05,
"loss": 0.0524,
"step": 32000
},
{
"epoch": 0.4603099372369914,
"grad_norm": 0.07752757519483566,
"learning_rate": 2.5521668646966292e-05,
"loss": 0.0611,
"step": 32050
},
{
"epoch": 0.4610280494635702,
"grad_norm": 0.09692207723855972,
"learning_rate": 2.5514452980310854e-05,
"loss": 0.0708,
"step": 32100
},
{
"epoch": 0.4617461616901489,
"grad_norm": 0.07400350272655487,
"learning_rate": 2.550723731365541e-05,
"loss": 0.0525,
"step": 32150
},
{
"epoch": 0.46246427391672773,
"grad_norm": 0.05159100517630577,
"learning_rate": 2.5500021646999965e-05,
"loss": 0.0549,
"step": 32200
},
{
"epoch": 0.4631823861433065,
"grad_norm": 35.71484375,
"learning_rate": 2.5492805980344527e-05,
"loss": 0.0529,
"step": 32250
},
{
"epoch": 0.46390049836988523,
"grad_norm": 0.07319821417331696,
"learning_rate": 2.5485590313689082e-05,
"loss": 0.0655,
"step": 32300
},
{
"epoch": 0.46461861059646403,
"grad_norm": 2.4547245502471924,
"learning_rate": 2.5478374647033637e-05,
"loss": 0.0454,
"step": 32350
},
{
"epoch": 0.4653367228230428,
"grad_norm": 0.10266736894845963,
"learning_rate": 2.54711589803782e-05,
"loss": 0.0313,
"step": 32400
},
{
"epoch": 0.46605483504962153,
"grad_norm": 5.870818138122559,
"learning_rate": 2.5463943313722755e-05,
"loss": 0.0643,
"step": 32450
},
{
"epoch": 0.46677294727620033,
"grad_norm": 0.31677067279815674,
"learning_rate": 2.545672764706731e-05,
"loss": 0.0628,
"step": 32500
},
{
"epoch": 0.4674910595027791,
"grad_norm": 0.07288742810487747,
"learning_rate": 2.5449511980411872e-05,
"loss": 0.0863,
"step": 32550
},
{
"epoch": 0.4682091717293579,
"grad_norm": 0.07232692092657089,
"learning_rate": 2.5442296313756428e-05,
"loss": 0.0583,
"step": 32600
},
{
"epoch": 0.46892728395593664,
"grad_norm": 1.5541493892669678,
"learning_rate": 2.5435080647100986e-05,
"loss": 0.0652,
"step": 32650
},
{
"epoch": 0.4696453961825154,
"grad_norm": 0.14363867044448853,
"learning_rate": 2.5427864980445545e-05,
"loss": 0.0452,
"step": 32700
},
{
"epoch": 0.4703635084090942,
"grad_norm": 0.07765737920999527,
"learning_rate": 2.54206493137901e-05,
"loss": 0.0915,
"step": 32750
},
{
"epoch": 0.47108162063567294,
"grad_norm": 0.6616801023483276,
"learning_rate": 2.541343364713466e-05,
"loss": 0.0343,
"step": 32800
},
{
"epoch": 0.4717997328622517,
"grad_norm": 0.11457184702157974,
"learning_rate": 2.5406217980479218e-05,
"loss": 0.0401,
"step": 32850
},
{
"epoch": 0.4725178450888305,
"grad_norm": 0.03043946996331215,
"learning_rate": 2.5399002313823773e-05,
"loss": 0.0659,
"step": 32900
},
{
"epoch": 0.47323595731540924,
"grad_norm": 0.04275374487042427,
"learning_rate": 2.539193096050144e-05,
"loss": 0.0479,
"step": 32950
},
{
"epoch": 0.47395406954198804,
"grad_norm": 0.036188703030347824,
"learning_rate": 2.5384715293846e-05,
"loss": 0.0397,
"step": 33000
},
{
"epoch": 0.4746721817685668,
"grad_norm": 3.709174156188965,
"learning_rate": 2.5377499627190556e-05,
"loss": 0.0405,
"step": 33050
},
{
"epoch": 0.47539029399514554,
"grad_norm": 0.039049405604600906,
"learning_rate": 2.5370283960535115e-05,
"loss": 0.0399,
"step": 33100
},
{
"epoch": 0.47610840622172435,
"grad_norm": 0.03738459199666977,
"learning_rate": 2.5363068293879674e-05,
"loss": 0.0456,
"step": 33150
},
{
"epoch": 0.4768265184483031,
"grad_norm": 0.07459737360477448,
"learning_rate": 2.535585262722423e-05,
"loss": 0.0718,
"step": 33200
},
{
"epoch": 0.4775446306748819,
"grad_norm": 0.03416149318218231,
"learning_rate": 2.5348636960568788e-05,
"loss": 0.0249,
"step": 33250
},
{
"epoch": 0.47826274290146065,
"grad_norm": 0.05686436593532562,
"learning_rate": 2.5341421293913347e-05,
"loss": 0.0714,
"step": 33300
},
{
"epoch": 0.4789808551280394,
"grad_norm": 0.1602715402841568,
"learning_rate": 2.5334205627257902e-05,
"loss": 0.0463,
"step": 33350
},
{
"epoch": 0.4796989673546182,
"grad_norm": 0.05453362315893173,
"learning_rate": 2.532698996060246e-05,
"loss": 0.0676,
"step": 33400
},
{
"epoch": 0.48041707958119695,
"grad_norm": 4.763365268707275,
"learning_rate": 2.531977429394702e-05,
"loss": 0.0461,
"step": 33450
},
{
"epoch": 0.4811351918077757,
"grad_norm": 0.06556567549705505,
"learning_rate": 2.5312558627291575e-05,
"loss": 0.0432,
"step": 33500
},
{
"epoch": 0.4818533040343545,
"grad_norm": 0.05367936193943024,
"learning_rate": 2.5305342960636137e-05,
"loss": 0.0536,
"step": 33550
},
{
"epoch": 0.48257141626093325,
"grad_norm": 0.7342302203178406,
"learning_rate": 2.5298127293980692e-05,
"loss": 0.1082,
"step": 33600
},
{
"epoch": 0.48328952848751205,
"grad_norm": 1.5394067764282227,
"learning_rate": 2.5290911627325247e-05,
"loss": 0.0691,
"step": 33650
},
{
"epoch": 0.4840076407140908,
"grad_norm": 0.10191819071769714,
"learning_rate": 2.528369596066981e-05,
"loss": 0.0616,
"step": 33700
},
{
"epoch": 0.48472575294066955,
"grad_norm": 0.125881627202034,
"learning_rate": 2.5276480294014365e-05,
"loss": 0.0565,
"step": 33750
},
{
"epoch": 0.48544386516724836,
"grad_norm": 0.12268608063459396,
"learning_rate": 2.526926462735892e-05,
"loss": 0.0535,
"step": 33800
},
{
"epoch": 0.4861619773938271,
"grad_norm": 0.5251419544219971,
"learning_rate": 2.5262048960703482e-05,
"loss": 0.0327,
"step": 33850
},
{
"epoch": 0.48688008962040585,
"grad_norm": 0.049022432416677475,
"learning_rate": 2.5254833294048038e-05,
"loss": 0.0477,
"step": 33900
},
{
"epoch": 0.48759820184698466,
"grad_norm": 17.507314682006836,
"learning_rate": 2.5247617627392593e-05,
"loss": 0.0725,
"step": 33950
},
{
"epoch": 0.4883163140735634,
"grad_norm": 0.5622566938400269,
"learning_rate": 2.5240401960737155e-05,
"loss": 0.0539,
"step": 34000
},
{
"epoch": 0.4890344263001422,
"grad_norm": 0.04837973043322563,
"learning_rate": 2.523318629408171e-05,
"loss": 0.0582,
"step": 34050
},
{
"epoch": 0.48975253852672096,
"grad_norm": 0.5979802012443542,
"learning_rate": 2.5225970627426266e-05,
"loss": 0.0311,
"step": 34100
},
{
"epoch": 0.4904706507532997,
"grad_norm": 0.05467730388045311,
"learning_rate": 2.5218754960770828e-05,
"loss": 0.0395,
"step": 34150
},
{
"epoch": 0.4911887629798785,
"grad_norm": 0.0720251053571701,
"learning_rate": 2.5211539294115383e-05,
"loss": 0.0321,
"step": 34200
},
{
"epoch": 0.49190687520645726,
"grad_norm": 0.05227138102054596,
"learning_rate": 2.5204323627459942e-05,
"loss": 0.0493,
"step": 34250
},
{
"epoch": 0.492624987433036,
"grad_norm": 0.04734648019075394,
"learning_rate": 2.51971079608045e-05,
"loss": 0.0359,
"step": 34300
},
{
"epoch": 0.4933430996596148,
"grad_norm": 0.2445848435163498,
"learning_rate": 2.5189892294149056e-05,
"loss": 0.0218,
"step": 34350
},
{
"epoch": 0.49406121188619356,
"grad_norm": 1.6729121208190918,
"learning_rate": 2.5182676627493615e-05,
"loss": 0.0672,
"step": 34400
},
{
"epoch": 0.49477932411277237,
"grad_norm": 0.0916346088051796,
"learning_rate": 2.5175460960838173e-05,
"loss": 0.0729,
"step": 34450
},
{
"epoch": 0.4954974363393511,
"grad_norm": 0.07457554340362549,
"learning_rate": 2.516824529418273e-05,
"loss": 0.0527,
"step": 34500
},
{
"epoch": 0.49621554856592986,
"grad_norm": 0.9593037962913513,
"learning_rate": 2.5161029627527287e-05,
"loss": 0.0551,
"step": 34550
},
{
"epoch": 0.49693366079250867,
"grad_norm": 0.07710795849561691,
"learning_rate": 2.5153813960871846e-05,
"loss": 0.0708,
"step": 34600
},
{
"epoch": 0.4976517730190874,
"grad_norm": 1.506896734237671,
"learning_rate": 2.51465982942164e-05,
"loss": 0.074,
"step": 34650
},
{
"epoch": 0.49836988524566617,
"grad_norm": 0.15761396288871765,
"learning_rate": 2.513938262756096e-05,
"loss": 0.0563,
"step": 34700
},
{
"epoch": 0.49908799747224497,
"grad_norm": 0.3991515636444092,
"learning_rate": 2.513216696090552e-05,
"loss": 0.0796,
"step": 34750
},
{
"epoch": 0.4998061096988237,
"grad_norm": 0.10450137406587601,
"learning_rate": 2.5124951294250074e-05,
"loss": 0.0555,
"step": 34800
},
{
"epoch": 0.5005242219254025,
"grad_norm": 0.0935065820813179,
"learning_rate": 2.5117735627594633e-05,
"loss": 0.0681,
"step": 34850
},
{
"epoch": 0.5012423341519813,
"grad_norm": 0.2559933066368103,
"learning_rate": 2.5110519960939192e-05,
"loss": 0.0315,
"step": 34900
},
{
"epoch": 0.5019604463785601,
"grad_norm": 0.06852543354034424,
"learning_rate": 2.510330429428375e-05,
"loss": 0.0636,
"step": 34950
},
{
"epoch": 0.5026785586051388,
"grad_norm": 71.8512191772461,
"learning_rate": 2.5096088627628306e-05,
"loss": 0.0565,
"step": 35000
},
{
"epoch": 0.5033966708317176,
"grad_norm": 3.9660584926605225,
"learning_rate": 2.5088872960972865e-05,
"loss": 0.078,
"step": 35050
},
{
"epoch": 0.5041147830582964,
"grad_norm": 0.08855576068162918,
"learning_rate": 2.5081657294317423e-05,
"loss": 0.0274,
"step": 35100
},
{
"epoch": 0.5048328952848751,
"grad_norm": 0.206807479262352,
"learning_rate": 2.5074441627661982e-05,
"loss": 0.0572,
"step": 35150
},
{
"epoch": 0.5055510075114539,
"grad_norm": 0.07930755615234375,
"learning_rate": 2.5067225961006537e-05,
"loss": 0.0958,
"step": 35200
},
{
"epoch": 0.5062691197380327,
"grad_norm": 0.08441215008497238,
"learning_rate": 2.5060010294351096e-05,
"loss": 0.0497,
"step": 35250
},
{
"epoch": 0.5069872319646115,
"grad_norm": 0.11579915881156921,
"learning_rate": 2.5052794627695655e-05,
"loss": 0.0633,
"step": 35300
},
{
"epoch": 0.5077053441911902,
"grad_norm": 0.07134761661291122,
"learning_rate": 2.504557896104021e-05,
"loss": 0.0241,
"step": 35350
},
{
"epoch": 0.508423456417769,
"grad_norm": 0.15993690490722656,
"learning_rate": 2.503836329438477e-05,
"loss": 0.0801,
"step": 35400
},
{
"epoch": 0.5091415686443478,
"grad_norm": 0.24976429343223572,
"learning_rate": 2.5031147627729328e-05,
"loss": 0.0803,
"step": 35450
},
{
"epoch": 0.5098596808709265,
"grad_norm": 4.378284454345703,
"learning_rate": 2.5023931961073883e-05,
"loss": 0.0364,
"step": 35500
},
{
"epoch": 0.5105777930975053,
"grad_norm": 0.07404322922229767,
"learning_rate": 2.501671629441844e-05,
"loss": 0.0604,
"step": 35550
},
{
"epoch": 0.5112959053240841,
"grad_norm": 0.060134898871183395,
"learning_rate": 2.5009500627763e-05,
"loss": 0.0371,
"step": 35600
},
{
"epoch": 0.5120140175506628,
"grad_norm": 0.2887961268424988,
"learning_rate": 2.500228496110756e-05,
"loss": 0.1102,
"step": 35650
},
{
"epoch": 0.5127321297772416,
"grad_norm": 53.177459716796875,
"learning_rate": 2.4995069294452114e-05,
"loss": 0.0537,
"step": 35700
},
{
"epoch": 0.5134502420038204,
"grad_norm": 3.1034953594207764,
"learning_rate": 2.4987853627796673e-05,
"loss": 0.071,
"step": 35750
},
{
"epoch": 0.5141683542303991,
"grad_norm": 34.12720489501953,
"learning_rate": 2.4980637961141232e-05,
"loss": 0.1069,
"step": 35800
},
{
"epoch": 0.5148864664569779,
"grad_norm": 0.102497898042202,
"learning_rate": 2.4973422294485787e-05,
"loss": 0.0722,
"step": 35850
},
{
"epoch": 0.5156045786835567,
"grad_norm": 1164.751953125,
"learning_rate": 2.4966206627830346e-05,
"loss": 0.105,
"step": 35900
},
{
"epoch": 0.5163226909101354,
"grad_norm": 48.3822021484375,
"learning_rate": 2.4958990961174905e-05,
"loss": 0.1142,
"step": 35950
},
{
"epoch": 0.5170408031367142,
"grad_norm": 0.11598493903875351,
"learning_rate": 2.495177529451946e-05,
"loss": 0.076,
"step": 36000
},
{
"epoch": 0.517758915363293,
"grad_norm": 16.16822052001953,
"learning_rate": 2.494455962786402e-05,
"loss": 0.0823,
"step": 36050
},
{
"epoch": 0.5184770275898718,
"grad_norm": 1.6366539001464844,
"learning_rate": 2.4937343961208577e-05,
"loss": 0.0503,
"step": 36100
},
{
"epoch": 0.5191951398164505,
"grad_norm": 3.8799993991851807,
"learning_rate": 2.4930128294553133e-05,
"loss": 0.0792,
"step": 36150
},
{
"epoch": 0.5199132520430293,
"grad_norm": 2.2162089347839355,
"learning_rate": 2.4922912627897695e-05,
"loss": 0.0683,
"step": 36200
},
{
"epoch": 0.5206313642696081,
"grad_norm": 63.66940689086914,
"learning_rate": 2.491569696124225e-05,
"loss": 0.0728,
"step": 36250
},
{
"epoch": 0.5213494764961868,
"grad_norm": 1.301426887512207,
"learning_rate": 2.4908481294586806e-05,
"loss": 0.0623,
"step": 36300
},
{
"epoch": 0.5220675887227656,
"grad_norm": 0.48745834827423096,
"learning_rate": 2.4901265627931368e-05,
"loss": 0.0557,
"step": 36350
},
{
"epoch": 0.5227857009493444,
"grad_norm": 0.7868464589118958,
"learning_rate": 2.4894049961275923e-05,
"loss": 0.0373,
"step": 36400
},
{
"epoch": 0.5235038131759231,
"grad_norm": 4.382079601287842,
"learning_rate": 2.488683429462048e-05,
"loss": 0.0625,
"step": 36450
},
{
"epoch": 0.5242219254025019,
"grad_norm": 12.262738227844238,
"learning_rate": 2.487961862796504e-05,
"loss": 0.0645,
"step": 36500
},
{
"epoch": 0.5249400376290807,
"grad_norm": 1.408593773841858,
"learning_rate": 2.4872402961309596e-05,
"loss": 0.0888,
"step": 36550
},
{
"epoch": 0.5256581498556594,
"grad_norm": 0.11477218568325043,
"learning_rate": 2.486518729465415e-05,
"loss": 0.0614,
"step": 36600
},
{
"epoch": 0.5263762620822382,
"grad_norm": 0.086027592420578,
"learning_rate": 2.4857971627998713e-05,
"loss": 0.0485,
"step": 36650
},
{
"epoch": 0.527094374308817,
"grad_norm": 0.03868116810917854,
"learning_rate": 2.485075596134327e-05,
"loss": 0.0707,
"step": 36700
},
{
"epoch": 0.5278124865353957,
"grad_norm": 0.06335175037384033,
"learning_rate": 2.4843540294687824e-05,
"loss": 0.0476,
"step": 36750
},
{
"epoch": 0.5285305987619745,
"grad_norm": 0.36757639050483704,
"learning_rate": 2.4836324628032386e-05,
"loss": 0.0448,
"step": 36800
},
{
"epoch": 0.5292487109885533,
"grad_norm": 0.03587363660335541,
"learning_rate": 2.482910896137694e-05,
"loss": 0.0507,
"step": 36850
},
{
"epoch": 0.5299668232151321,
"grad_norm": 3.4281718730926514,
"learning_rate": 2.48218932947215e-05,
"loss": 0.0674,
"step": 36900
},
{
"epoch": 0.5306849354417108,
"grad_norm": 2.3952221870422363,
"learning_rate": 2.481467762806606e-05,
"loss": 0.0721,
"step": 36950
},
{
"epoch": 0.5314030476682896,
"grad_norm": 0.034354522824287415,
"learning_rate": 2.4807461961410614e-05,
"loss": 0.0834,
"step": 37000
},
{
"epoch": 0.5321211598948684,
"grad_norm": 1.9441754817962646,
"learning_rate": 2.4800246294755173e-05,
"loss": 0.1208,
"step": 37050
},
{
"epoch": 0.5328392721214471,
"grad_norm": 15.068452835083008,
"learning_rate": 2.479303062809973e-05,
"loss": 0.0937,
"step": 37100
},
{
"epoch": 0.5335573843480259,
"grad_norm": 0.0747528076171875,
"learning_rate": 2.4785814961444287e-05,
"loss": 0.0299,
"step": 37150
},
{
"epoch": 0.5342754965746047,
"grad_norm": 0.04734047129750252,
"learning_rate": 2.477859929478885e-05,
"loss": 0.0448,
"step": 37200
},
{
"epoch": 0.5349936088011834,
"grad_norm": 0.04633970558643341,
"learning_rate": 2.4771383628133404e-05,
"loss": 0.0488,
"step": 37250
},
{
"epoch": 0.5357117210277622,
"grad_norm": 0.2701416611671448,
"learning_rate": 2.476416796147796e-05,
"loss": 0.0407,
"step": 37300
},
{
"epoch": 0.536429833254341,
"grad_norm": 3592.864501953125,
"learning_rate": 2.4756952294822522e-05,
"loss": 0.0573,
"step": 37350
},
{
"epoch": 0.5371479454809197,
"grad_norm": 0.03521255776286125,
"learning_rate": 2.4749736628167077e-05,
"loss": 0.0265,
"step": 37400
},
{
"epoch": 0.5378660577074985,
"grad_norm": 0.056420013308525085,
"learning_rate": 2.4742520961511632e-05,
"loss": 0.0249,
"step": 37450
},
{
"epoch": 0.5385841699340773,
"grad_norm": 2.5189876556396484,
"learning_rate": 2.4735305294856195e-05,
"loss": 0.064,
"step": 37500
},
{
"epoch": 0.5393022821606561,
"grad_norm": 3.7576839923858643,
"learning_rate": 2.472808962820075e-05,
"loss": 0.0922,
"step": 37550
},
{
"epoch": 0.5400203943872348,
"grad_norm": 0.048396721482276917,
"learning_rate": 2.472087396154531e-05,
"loss": 0.0637,
"step": 37600
},
{
"epoch": 0.5407385066138136,
"grad_norm": 0.1982671469449997,
"learning_rate": 2.4713658294889867e-05,
"loss": 0.0396,
"step": 37650
},
{
"epoch": 0.5414566188403924,
"grad_norm": 1.0207908153533936,
"learning_rate": 2.4706442628234423e-05,
"loss": 0.0304,
"step": 37700
},
{
"epoch": 0.5421747310669711,
"grad_norm": 0.02437562867999077,
"learning_rate": 2.469922696157898e-05,
"loss": 0.0334,
"step": 37750
},
{
"epoch": 0.5428928432935499,
"grad_norm": 1.7860028743743896,
"learning_rate": 2.469201129492354e-05,
"loss": 0.0539,
"step": 37800
},
{
"epoch": 0.5436109555201287,
"grad_norm": 0.15321147441864014,
"learning_rate": 2.4684795628268095e-05,
"loss": 0.0629,
"step": 37850
},
{
"epoch": 0.5443290677467074,
"grad_norm": 0.15857814252376556,
"learning_rate": 2.4677579961612654e-05,
"loss": 0.0685,
"step": 37900
},
{
"epoch": 0.5450471799732862,
"grad_norm": 0.031938180327415466,
"learning_rate": 2.4670364294957213e-05,
"loss": 0.0584,
"step": 37950
},
{
"epoch": 0.545765292199865,
"grad_norm": 0.07411850243806839,
"learning_rate": 2.4663148628301768e-05,
"loss": 0.0442,
"step": 38000
},
{
"epoch": 0.5464834044264437,
"grad_norm": 2.765964984893799,
"learning_rate": 2.4655932961646327e-05,
"loss": 0.0919,
"step": 38050
},
{
"epoch": 0.5472015166530225,
"grad_norm": 4.264487266540527,
"learning_rate": 2.4648717294990886e-05,
"loss": 0.0493,
"step": 38100
},
{
"epoch": 0.5479196288796013,
"grad_norm": 6.618762969970703,
"learning_rate": 2.464150162833544e-05,
"loss": 0.0318,
"step": 38150
},
{
"epoch": 0.54863774110618,
"grad_norm": 2.5328927040100098,
"learning_rate": 2.463428596168e-05,
"loss": 0.0643,
"step": 38200
},
{
"epoch": 0.5493558533327588,
"grad_norm": 0.025601988658308983,
"learning_rate": 2.462707029502456e-05,
"loss": 0.069,
"step": 38250
},
{
"epoch": 0.5500739655593376,
"grad_norm": 0.07905403524637222,
"learning_rate": 2.4619854628369117e-05,
"loss": 0.068,
"step": 38300
},
{
"epoch": 0.5507920777859164,
"grad_norm": 0.034113090485334396,
"learning_rate": 2.4612638961713673e-05,
"loss": 0.0451,
"step": 38350
},
{
"epoch": 0.5515101900124951,
"grad_norm": 109.80461883544922,
"learning_rate": 2.460542329505823e-05,
"loss": 0.0712,
"step": 38400
},
{
"epoch": 0.5522283022390739,
"grad_norm": 0.12972399592399597,
"learning_rate": 2.459820762840279e-05,
"loss": 0.042,
"step": 38450
},
{
"epoch": 0.5529464144656527,
"grad_norm": 0.1377432644367218,
"learning_rate": 2.4590991961747345e-05,
"loss": 0.0359,
"step": 38500
},
{
"epoch": 0.5536645266922314,
"grad_norm": 0.021726874634623528,
"learning_rate": 2.4583776295091904e-05,
"loss": 0.0328,
"step": 38550
},
{
"epoch": 0.5543826389188102,
"grad_norm": 0.020801221951842308,
"learning_rate": 2.4576560628436463e-05,
"loss": 0.0628,
"step": 38600
},
{
"epoch": 0.555100751145389,
"grad_norm": 0.08038417994976044,
"learning_rate": 2.4569344961781018e-05,
"loss": 0.0351,
"step": 38650
},
{
"epoch": 0.5558188633719677,
"grad_norm": 18.516040802001953,
"learning_rate": 2.4562129295125577e-05,
"loss": 0.0494,
"step": 38700
},
{
"epoch": 0.5565369755985465,
"grad_norm": 0.055549539625644684,
"learning_rate": 2.4554913628470136e-05,
"loss": 0.06,
"step": 38750
},
{
"epoch": 0.5572550878251253,
"grad_norm": 0.03651417791843414,
"learning_rate": 2.454769796181469e-05,
"loss": 0.04,
"step": 38800
},
{
"epoch": 0.557973200051704,
"grad_norm": 0.015556145459413528,
"learning_rate": 2.454048229515925e-05,
"loss": 0.0434,
"step": 38850
},
{
"epoch": 0.5586913122782828,
"grad_norm": 0.011846818961203098,
"learning_rate": 2.4533266628503808e-05,
"loss": 0.0419,
"step": 38900
},
{
"epoch": 0.5594094245048616,
"grad_norm": 0.011472227051854134,
"learning_rate": 2.4526050961848364e-05,
"loss": 0.0197,
"step": 38950
},
{
"epoch": 0.5601275367314403,
"grad_norm": 0.028384173288941383,
"learning_rate": 2.4518835295192926e-05,
"loss": 0.0454,
"step": 39000
},
{
"epoch": 0.5608456489580191,
"grad_norm": 0.021673662588000298,
"learning_rate": 2.451161962853748e-05,
"loss": 0.039,
"step": 39050
},
{
"epoch": 0.5615637611845979,
"grad_norm": 0.01948692835867405,
"learning_rate": 2.450440396188204e-05,
"loss": 0.0446,
"step": 39100
},
{
"epoch": 0.5622818734111767,
"grad_norm": 0.011734464205801487,
"learning_rate": 2.44971882952266e-05,
"loss": 0.0187,
"step": 39150
},
{
"epoch": 0.5629999856377554,
"grad_norm": 19.443992614746094,
"learning_rate": 2.4489972628571154e-05,
"loss": 0.0651,
"step": 39200
},
{
"epoch": 0.5637180978643342,
"grad_norm": 0.06333588063716888,
"learning_rate": 2.4482756961915713e-05,
"loss": 0.0602,
"step": 39250
},
{
"epoch": 0.564436210090913,
"grad_norm": 0.03405553475022316,
"learning_rate": 2.447554129526027e-05,
"loss": 0.0383,
"step": 39300
},
{
"epoch": 0.5651543223174917,
"grad_norm": 4.403933048248291,
"learning_rate": 2.4468325628604827e-05,
"loss": 0.0364,
"step": 39350
},
{
"epoch": 0.5658724345440705,
"grad_norm": 45.96012496948242,
"learning_rate": 2.4461109961949385e-05,
"loss": 0.0267,
"step": 39400
},
{
"epoch": 0.5665905467706494,
"grad_norm": 0.03902297466993332,
"learning_rate": 2.4453894295293944e-05,
"loss": 0.0364,
"step": 39450
},
{
"epoch": 0.567308658997228,
"grad_norm": 0.012141493149101734,
"learning_rate": 2.44466786286385e-05,
"loss": 0.0419,
"step": 39500
},
{
"epoch": 0.5680267712238068,
"grad_norm": 32.401885986328125,
"learning_rate": 2.443960727531617e-05,
"loss": 0.0508,
"step": 39550
},
{
"epoch": 0.5687448834503857,
"grad_norm": 9.376387596130371,
"learning_rate": 2.4432391608660724e-05,
"loss": 0.0529,
"step": 39600
},
{
"epoch": 0.5694629956769643,
"grad_norm": 5.138739585876465,
"learning_rate": 2.4425175942005283e-05,
"loss": 0.042,
"step": 39650
},
{
"epoch": 0.5701811079035431,
"grad_norm": 0.014239504933357239,
"learning_rate": 2.441796027534984e-05,
"loss": 0.0397,
"step": 39700
},
{
"epoch": 0.570899220130122,
"grad_norm": 0.016275152564048767,
"learning_rate": 2.4410744608694397e-05,
"loss": 0.0516,
"step": 39750
},
{
"epoch": 0.5716173323567008,
"grad_norm": 0.03984437137842178,
"learning_rate": 2.4403528942038955e-05,
"loss": 0.0502,
"step": 39800
},
{
"epoch": 0.5723354445832795,
"grad_norm": 0.817456066608429,
"learning_rate": 2.4396313275383514e-05,
"loss": 0.0471,
"step": 39850
},
{
"epoch": 0.5730535568098583,
"grad_norm": 0.42270442843437195,
"learning_rate": 2.4389097608728073e-05,
"loss": 0.0527,
"step": 39900
},
{
"epoch": 0.5737716690364371,
"grad_norm": 0.029552064836025238,
"learning_rate": 2.4381881942072628e-05,
"loss": 0.0526,
"step": 39950
},
{
"epoch": 0.5744897812630158,
"grad_norm": 21.022872924804688,
"learning_rate": 2.4374666275417187e-05,
"loss": 0.053,
"step": 40000
},
{
"epoch": 0.5752078934895946,
"grad_norm": 0.0365062914788723,
"learning_rate": 2.4367450608761746e-05,
"loss": 0.0596,
"step": 40050
},
{
"epoch": 0.5759260057161734,
"grad_norm": 0.025358131155371666,
"learning_rate": 2.43602349421063e-05,
"loss": 0.0578,
"step": 40100
},
{
"epoch": 0.576644117942752,
"grad_norm": 0.033615849912166595,
"learning_rate": 2.435301927545086e-05,
"loss": 0.0245,
"step": 40150
},
{
"epoch": 0.5773622301693309,
"grad_norm": 0.030600300058722496,
"learning_rate": 2.434580360879542e-05,
"loss": 0.0596,
"step": 40200
},
{
"epoch": 0.5780803423959097,
"grad_norm": 0.04510806128382683,
"learning_rate": 2.4338587942139974e-05,
"loss": 0.0435,
"step": 40250
},
{
"epoch": 0.5787984546224884,
"grad_norm": 0.024853043258190155,
"learning_rate": 2.4331372275484532e-05,
"loss": 0.0391,
"step": 40300
},
{
"epoch": 0.5795165668490672,
"grad_norm": 0.07882773131132126,
"learning_rate": 2.432415660882909e-05,
"loss": 0.0566,
"step": 40350
},
{
"epoch": 0.580234679075646,
"grad_norm": 0.04951098933815956,
"learning_rate": 2.4316940942173647e-05,
"loss": 0.0768,
"step": 40400
},
{
"epoch": 0.5809527913022247,
"grad_norm": 0.09358759224414825,
"learning_rate": 2.4309725275518205e-05,
"loss": 0.0828,
"step": 40450
},
{
"epoch": 0.5816709035288035,
"grad_norm": 0.05936616286635399,
"learning_rate": 2.4302509608862764e-05,
"loss": 0.0406,
"step": 40500
},
{
"epoch": 0.5823890157553823,
"grad_norm": 0.03657994046807289,
"learning_rate": 2.429543825554043e-05,
"loss": 0.0516,
"step": 40550
},
{
"epoch": 0.5831071279819611,
"grad_norm": 8.627974510192871,
"learning_rate": 2.428822258888499e-05,
"loss": 0.0561,
"step": 40600
},
{
"epoch": 0.5838252402085398,
"grad_norm": 1.1441830396652222,
"learning_rate": 2.4281006922229544e-05,
"loss": 0.0463,
"step": 40650
},
{
"epoch": 0.5845433524351186,
"grad_norm": 0.07954325526952744,
"learning_rate": 2.4273791255574103e-05,
"loss": 0.0922,
"step": 40700
},
{
"epoch": 0.5852614646616974,
"grad_norm": 2.8461921215057373,
"learning_rate": 2.426657558891866e-05,
"loss": 0.0852,
"step": 40750
},
{
"epoch": 0.5859795768882761,
"grad_norm": 0.10721123218536377,
"learning_rate": 2.425935992226322e-05,
"loss": 0.0508,
"step": 40800
},
{
"epoch": 0.5866976891148549,
"grad_norm": 0.0931866392493248,
"learning_rate": 2.4252288568940882e-05,
"loss": 0.0788,
"step": 40850
},
{
"epoch": 0.5874158013414337,
"grad_norm": 0.061198893934488297,
"learning_rate": 2.4245072902285445e-05,
"loss": 0.0379,
"step": 40900
},
{
"epoch": 0.5881339135680124,
"grad_norm": 3.5173628330230713,
"learning_rate": 2.423785723563e-05,
"loss": 0.1242,
"step": 40950
},
{
"epoch": 0.5888520257945912,
"grad_norm": 2.4062447547912598,
"learning_rate": 2.423064156897456e-05,
"loss": 0.0625,
"step": 41000
},
{
"epoch": 0.58957013802117,
"grad_norm": 31.462963104248047,
"learning_rate": 2.4223425902319117e-05,
"loss": 0.0261,
"step": 41050
},
{
"epoch": 0.5902882502477487,
"grad_norm": 0.02886788733303547,
"learning_rate": 2.4216210235663673e-05,
"loss": 0.0291,
"step": 41100
},
{
"epoch": 0.5910063624743275,
"grad_norm": 0.0616585947573185,
"learning_rate": 2.420899456900823e-05,
"loss": 0.0408,
"step": 41150
},
{
"epoch": 0.5917244747009063,
"grad_norm": 0.048470042645931244,
"learning_rate": 2.420177890235279e-05,
"loss": 0.0297,
"step": 41200
},
{
"epoch": 0.5924425869274851,
"grad_norm": 10.751079559326172,
"learning_rate": 2.4194563235697345e-05,
"loss": 0.0538,
"step": 41250
},
{
"epoch": 0.5931606991540638,
"grad_norm": 2.543215274810791,
"learning_rate": 2.4187347569041904e-05,
"loss": 0.0796,
"step": 41300
},
{
"epoch": 0.5938788113806426,
"grad_norm": 0.0643203854560852,
"learning_rate": 2.4180131902386463e-05,
"loss": 0.0278,
"step": 41350
},
{
"epoch": 0.5945969236072214,
"grad_norm": 0.11458639800548553,
"learning_rate": 2.4172916235731018e-05,
"loss": 0.0531,
"step": 41400
},
{
"epoch": 0.5953150358338001,
"grad_norm": 0.14419472217559814,
"learning_rate": 2.4165700569075577e-05,
"loss": 0.0717,
"step": 41450
},
{
"epoch": 0.5960331480603789,
"grad_norm": 84.442138671875,
"learning_rate": 2.4158484902420136e-05,
"loss": 0.0399,
"step": 41500
},
{
"epoch": 0.5967512602869577,
"grad_norm": 0.08792130649089813,
"learning_rate": 2.415126923576469e-05,
"loss": 0.0632,
"step": 41550
},
{
"epoch": 0.5974693725135364,
"grad_norm": 0.21101626753807068,
"learning_rate": 2.414405356910925e-05,
"loss": 0.0574,
"step": 41600
},
{
"epoch": 0.5981874847401152,
"grad_norm": 0.052498213946819305,
"learning_rate": 2.413698221578692e-05,
"loss": 0.0439,
"step": 41650
},
{
"epoch": 0.598905596966694,
"grad_norm": 0.3066045641899109,
"learning_rate": 2.4129766549131474e-05,
"loss": 0.0677,
"step": 41700
},
{
"epoch": 0.5996237091932727,
"grad_norm": 0.0755561888217926,
"learning_rate": 2.412255088247603e-05,
"loss": 0.0518,
"step": 41750
},
{
"epoch": 0.6003418214198515,
"grad_norm": 0.07415340840816498,
"learning_rate": 2.411533521582059e-05,
"loss": 0.0503,
"step": 41800
},
{
"epoch": 0.6010599336464303,
"grad_norm": 0.15020149946212769,
"learning_rate": 2.4108119549165147e-05,
"loss": 0.0817,
"step": 41850
},
{
"epoch": 0.601778045873009,
"grad_norm": 0.06178657338023186,
"learning_rate": 2.4100903882509706e-05,
"loss": 0.0601,
"step": 41900
},
{
"epoch": 0.6024961580995878,
"grad_norm": 1.5859713554382324,
"learning_rate": 2.4093688215854264e-05,
"loss": 0.0599,
"step": 41950
},
{
"epoch": 0.6032142703261666,
"grad_norm": 1.891103744506836,
"learning_rate": 2.408647254919882e-05,
"loss": 0.065,
"step": 42000
},
{
"epoch": 0.6039323825527454,
"grad_norm": 0.3377256989479065,
"learning_rate": 2.407925688254338e-05,
"loss": 0.0688,
"step": 42050
},
{
"epoch": 0.6046504947793241,
"grad_norm": 16.288217544555664,
"learning_rate": 2.4072041215887937e-05,
"loss": 0.0413,
"step": 42100
},
{
"epoch": 0.6053686070059029,
"grad_norm": 0.11780572682619095,
"learning_rate": 2.4064825549232493e-05,
"loss": 0.0695,
"step": 42150
},
{
"epoch": 0.6060867192324817,
"grad_norm": 11.723837852478027,
"learning_rate": 2.405760988257705e-05,
"loss": 0.0692,
"step": 42200
},
{
"epoch": 0.6068048314590604,
"grad_norm": 0.0517687052488327,
"learning_rate": 2.405039421592161e-05,
"loss": 0.0356,
"step": 42250
},
{
"epoch": 0.6075229436856392,
"grad_norm": 0.06792303919792175,
"learning_rate": 2.4043178549266165e-05,
"loss": 0.0443,
"step": 42300
},
{
"epoch": 0.608241055912218,
"grad_norm": 0.09773588925600052,
"learning_rate": 2.4035962882610727e-05,
"loss": 0.0476,
"step": 42350
},
{
"epoch": 0.6089591681387967,
"grad_norm": 0.03788350522518158,
"learning_rate": 2.4028747215955283e-05,
"loss": 0.0362,
"step": 42400
},
{
"epoch": 0.6096772803653755,
"grad_norm": 0.047992464154958725,
"learning_rate": 2.4021531549299838e-05,
"loss": 0.0679,
"step": 42450
},
{
"epoch": 0.6103953925919543,
"grad_norm": 0.060338061302900314,
"learning_rate": 2.40143158826444e-05,
"loss": 0.052,
"step": 42500
},
{
"epoch": 0.611113504818533,
"grad_norm": 0.07789593189954758,
"learning_rate": 2.4007100215988956e-05,
"loss": 0.0203,
"step": 42550
},
{
"epoch": 0.6118316170451118,
"grad_norm": 0.03540048375725746,
"learning_rate": 2.3999884549333514e-05,
"loss": 0.0509,
"step": 42600
},
{
"epoch": 0.6125497292716906,
"grad_norm": 1.3714178800582886,
"learning_rate": 2.3992668882678073e-05,
"loss": 0.0938,
"step": 42650
},
{
"epoch": 0.6132678414982693,
"grad_norm": 0.08646780997514725,
"learning_rate": 2.398545321602263e-05,
"loss": 0.0548,
"step": 42700
},
{
"epoch": 0.6139859537248481,
"grad_norm": 0.10025494545698166,
"learning_rate": 2.3978237549367187e-05,
"loss": 0.0739,
"step": 42750
},
{
"epoch": 0.6147040659514269,
"grad_norm": 18.440555572509766,
"learning_rate": 2.3971021882711746e-05,
"loss": 0.0416,
"step": 42800
},
{
"epoch": 0.6154221781780057,
"grad_norm": 13.679024696350098,
"learning_rate": 2.39638062160563e-05,
"loss": 0.5511,
"step": 42850
},
{
"epoch": 0.6161402904045844,
"grad_norm": 4.185995578765869,
"learning_rate": 2.395659054940086e-05,
"loss": 1.2419,
"step": 42900
},
{
"epoch": 0.6168584026311632,
"grad_norm": 1.135672688484192,
"learning_rate": 2.394937488274542e-05,
"loss": 0.5634,
"step": 42950
},
{
"epoch": 0.617576514857742,
"grad_norm": 2.3285813331604004,
"learning_rate": 2.3942159216089974e-05,
"loss": 0.466,
"step": 43000
},
{
"epoch": 0.6182946270843207,
"grad_norm": 1.1176297664642334,
"learning_rate": 2.3934943549434533e-05,
"loss": 0.0652,
"step": 43050
},
{
"epoch": 0.6190127393108995,
"grad_norm": 51.0164794921875,
"learning_rate": 2.392772788277909e-05,
"loss": 0.0546,
"step": 43100
},
{
"epoch": 0.6197308515374783,
"grad_norm": 0.07210852950811386,
"learning_rate": 2.3920512216123647e-05,
"loss": 0.051,
"step": 43150
},
{
"epoch": 0.620448963764057,
"grad_norm": 0.8415210843086243,
"learning_rate": 2.3913296549468205e-05,
"loss": 0.0681,
"step": 43200
},
{
"epoch": 0.6211670759906358,
"grad_norm": 0.32454317808151245,
"learning_rate": 2.3906080882812764e-05,
"loss": 0.043,
"step": 43250
},
{
"epoch": 0.6218851882172146,
"grad_norm": 0.062497854232788086,
"learning_rate": 2.3898865216157323e-05,
"loss": 0.0698,
"step": 43300
},
{
"epoch": 0.6226033004437933,
"grad_norm": 27.095462799072266,
"learning_rate": 2.3891649549501878e-05,
"loss": 0.0621,
"step": 43350
},
{
"epoch": 0.6233214126703721,
"grad_norm": 0.11430204659700394,
"learning_rate": 2.3884433882846437e-05,
"loss": 0.0679,
"step": 43400
},
{
"epoch": 0.6240395248969509,
"grad_norm": 7.836416244506836,
"learning_rate": 2.3877218216190996e-05,
"loss": 0.068,
"step": 43450
},
{
"epoch": 0.6247576371235297,
"grad_norm": 0.03914704546332359,
"learning_rate": 2.387000254953555e-05,
"loss": 0.0456,
"step": 43500
},
{
"epoch": 0.6254757493501084,
"grad_norm": 0.13007937371730804,
"learning_rate": 2.386278688288011e-05,
"loss": 0.0488,
"step": 43550
},
{
"epoch": 0.6261938615766872,
"grad_norm": 0.1002429947257042,
"learning_rate": 2.385557121622467e-05,
"loss": 0.0534,
"step": 43600
},
{
"epoch": 0.626911973803266,
"grad_norm": 0.10558202117681503,
"learning_rate": 2.3848355549569224e-05,
"loss": 0.0921,
"step": 43650
},
{
"epoch": 0.6276300860298447,
"grad_norm": 0.027950316667556763,
"learning_rate": 2.3841139882913782e-05,
"loss": 0.027,
"step": 43700
},
{
"epoch": 0.6283481982564235,
"grad_norm": 0.0834326446056366,
"learning_rate": 2.383392421625834e-05,
"loss": 0.0552,
"step": 43750
},
{
"epoch": 0.6290663104830023,
"grad_norm": 0.03636254370212555,
"learning_rate": 2.3826708549602897e-05,
"loss": 0.0686,
"step": 43800
},
{
"epoch": 0.629784422709581,
"grad_norm": 24.880334854125977,
"learning_rate": 2.3819492882947455e-05,
"loss": 0.0433,
"step": 43850
},
{
"epoch": 0.6305025349361598,
"grad_norm": 0.04653225839138031,
"learning_rate": 2.3812277216292014e-05,
"loss": 0.0343,
"step": 43900
},
{
"epoch": 0.6312206471627386,
"grad_norm": 0.027844512835144997,
"learning_rate": 2.380506154963657e-05,
"loss": 0.0469,
"step": 43950
},
{
"epoch": 0.6319387593893173,
"grad_norm": 0.04555247724056244,
"learning_rate": 2.379784588298113e-05,
"loss": 0.0176,
"step": 44000
},
{
"epoch": 0.6326568716158961,
"grad_norm": 0.06262699514627457,
"learning_rate": 2.3790630216325687e-05,
"loss": 0.0272,
"step": 44050
},
{
"epoch": 0.6333749838424749,
"grad_norm": 0.26737722754478455,
"learning_rate": 2.3783414549670242e-05,
"loss": 0.0445,
"step": 44100
},
{
"epoch": 0.6340930960690536,
"grad_norm": 0.06617012619972229,
"learning_rate": 2.3776198883014804e-05,
"loss": 0.0222,
"step": 44150
},
{
"epoch": 0.6348112082956324,
"grad_norm": 0.02425909787416458,
"learning_rate": 2.376898321635936e-05,
"loss": 0.0522,
"step": 44200
},
{
"epoch": 0.6355293205222112,
"grad_norm": 0.0274038128554821,
"learning_rate": 2.3761767549703918e-05,
"loss": 0.0202,
"step": 44250
},
{
"epoch": 0.63624743274879,
"grad_norm": 9.2883939743042,
"learning_rate": 2.3754551883048477e-05,
"loss": 0.1086,
"step": 44300
},
{
"epoch": 0.6369655449753687,
"grad_norm": 0.02403036691248417,
"learning_rate": 2.3747336216393032e-05,
"loss": 0.0571,
"step": 44350
},
{
"epoch": 0.6376836572019475,
"grad_norm": 2.334660768508911,
"learning_rate": 2.374012054973759e-05,
"loss": 0.0452,
"step": 44400
},
{
"epoch": 0.6384017694285263,
"grad_norm": 0.0580274872481823,
"learning_rate": 2.373290488308215e-05,
"loss": 0.0455,
"step": 44450
},
{
"epoch": 0.639119881655105,
"grad_norm": 0.04195809364318848,
"learning_rate": 2.3725689216426705e-05,
"loss": 0.0381,
"step": 44500
},
{
"epoch": 0.6398379938816838,
"grad_norm": 1.6940784454345703,
"learning_rate": 2.3718473549771264e-05,
"loss": 0.0364,
"step": 44550
},
{
"epoch": 0.6405561061082626,
"grad_norm": 0.06187320500612259,
"learning_rate": 2.3711257883115823e-05,
"loss": 0.0816,
"step": 44600
},
{
"epoch": 0.6412742183348413,
"grad_norm": 0.03540629893541336,
"learning_rate": 2.3704042216460378e-05,
"loss": 0.0352,
"step": 44650
},
{
"epoch": 0.6419923305614201,
"grad_norm": 0.06254927068948746,
"learning_rate": 2.369682654980494e-05,
"loss": 0.0472,
"step": 44700
},
{
"epoch": 0.6427104427879989,
"grad_norm": 0.041405342519283295,
"learning_rate": 2.3689610883149495e-05,
"loss": 0.0404,
"step": 44750
},
{
"epoch": 0.6434285550145776,
"grad_norm": 0.03468115255236626,
"learning_rate": 2.368239521649405e-05,
"loss": 0.0436,
"step": 44800
},
{
"epoch": 0.6441466672411564,
"grad_norm": 0.027466176077723503,
"learning_rate": 2.3675179549838613e-05,
"loss": 0.0201,
"step": 44850
},
{
"epoch": 0.6448647794677352,
"grad_norm": 0.024088064208626747,
"learning_rate": 2.3667963883183168e-05,
"loss": 0.0464,
"step": 44900
},
{
"epoch": 0.645582891694314,
"grad_norm": 0.030090169981122017,
"learning_rate": 2.3660748216527723e-05,
"loss": 0.0711,
"step": 44950
},
{
"epoch": 0.6463010039208927,
"grad_norm": 0.041597750037908554,
"learning_rate": 2.3653532549872286e-05,
"loss": 0.0962,
"step": 45000
},
{
"epoch": 0.6470191161474715,
"grad_norm": 0.06742467731237411,
"learning_rate": 2.364631688321684e-05,
"loss": 0.0578,
"step": 45050
},
{
"epoch": 0.6477372283740503,
"grad_norm": 1.151302695274353,
"learning_rate": 2.3639101216561396e-05,
"loss": 0.0372,
"step": 45100
},
{
"epoch": 0.648455340600629,
"grad_norm": 0.024876704439520836,
"learning_rate": 2.363188554990596e-05,
"loss": 0.0394,
"step": 45150
},
{
"epoch": 0.6491734528272078,
"grad_norm": 0.05010320246219635,
"learning_rate": 2.3624669883250514e-05,
"loss": 0.0553,
"step": 45200
},
{
"epoch": 0.6498915650537866,
"grad_norm": 0.03817157447338104,
"learning_rate": 2.361745421659507e-05,
"loss": 0.0474,
"step": 45250
},
{
"epoch": 0.6506096772803653,
"grad_norm": 0.22997663915157318,
"learning_rate": 2.361023854993963e-05,
"loss": 0.0544,
"step": 45300
},
{
"epoch": 0.6513277895069441,
"grad_norm": 0.03478986397385597,
"learning_rate": 2.3603022883284186e-05,
"loss": 0.0403,
"step": 45350
},
{
"epoch": 0.652045901733523,
"grad_norm": 0.02963252365589142,
"learning_rate": 2.3595807216628745e-05,
"loss": 0.0477,
"step": 45400
},
{
"epoch": 0.6527640139601016,
"grad_norm": 0.031180815771222115,
"learning_rate": 2.3588591549973304e-05,
"loss": 0.0466,
"step": 45450
},
{
"epoch": 0.6534821261866804,
"grad_norm": 5.740962982177734,
"learning_rate": 2.358137588331786e-05,
"loss": 0.0608,
"step": 45500
},
{
"epoch": 0.6542002384132592,
"grad_norm": 0.03586834669113159,
"learning_rate": 2.3574160216662418e-05,
"loss": 0.0414,
"step": 45550
},
{
"epoch": 0.6549183506398379,
"grad_norm": 0.037541572004556656,
"learning_rate": 2.3566944550006977e-05,
"loss": 0.0365,
"step": 45600
},
{
"epoch": 0.6556364628664167,
"grad_norm": 0.02523988112807274,
"learning_rate": 2.3559728883351532e-05,
"loss": 0.0834,
"step": 45650
},
{
"epoch": 0.6563545750929956,
"grad_norm": 0.09356506913900375,
"learning_rate": 2.355251321669609e-05,
"loss": 0.0641,
"step": 45700
},
{
"epoch": 0.6570726873195744,
"grad_norm": 2.7426700592041016,
"learning_rate": 2.354529755004065e-05,
"loss": 0.0629,
"step": 45750
},
{
"epoch": 0.657790799546153,
"grad_norm": 0.03408382460474968,
"learning_rate": 2.3538081883385205e-05,
"loss": 0.0448,
"step": 45800
},
{
"epoch": 0.6585089117727319,
"grad_norm": 0.042699478566646576,
"learning_rate": 2.3530866216729764e-05,
"loss": 0.0256,
"step": 45850
},
{
"epoch": 0.6592270239993107,
"grad_norm": 0.07323332875967026,
"learning_rate": 2.3523650550074322e-05,
"loss": 0.0574,
"step": 45900
},
{
"epoch": 0.6599451362258894,
"grad_norm": 0.08284715563058853,
"learning_rate": 2.3516434883418878e-05,
"loss": 0.0516,
"step": 45950
},
{
"epoch": 0.6606632484524682,
"grad_norm": 0.05623028054833412,
"learning_rate": 2.3509219216763436e-05,
"loss": 0.0256,
"step": 46000
},
{
"epoch": 0.661381360679047,
"grad_norm": 4.314274311065674,
"learning_rate": 2.3502003550107995e-05,
"loss": 0.0436,
"step": 46050
},
{
"epoch": 0.6620994729056257,
"grad_norm": 33.84172058105469,
"learning_rate": 2.3494787883452554e-05,
"loss": 0.0542,
"step": 46100
},
{
"epoch": 0.6628175851322045,
"grad_norm": 0.04924549162387848,
"learning_rate": 2.348757221679711e-05,
"loss": 0.0574,
"step": 46150
},
{
"epoch": 0.6635356973587833,
"grad_norm": 0.04736483097076416,
"learning_rate": 2.3480356550141668e-05,
"loss": 0.0614,
"step": 46200
},
{
"epoch": 0.664253809585362,
"grad_norm": 0.06680673360824585,
"learning_rate": 2.3473140883486227e-05,
"loss": 0.0511,
"step": 46250
},
{
"epoch": 0.6649719218119408,
"grad_norm": 0.13749228417873383,
"learning_rate": 2.3465925216830785e-05,
"loss": 0.0642,
"step": 46300
},
{
"epoch": 0.6656900340385196,
"grad_norm": 1.6317678689956665,
"learning_rate": 2.345885386350845e-05,
"loss": 0.0787,
"step": 46350
},
{
"epoch": 0.6664081462650983,
"grad_norm": 0.05337180197238922,
"learning_rate": 2.3451638196853006e-05,
"loss": 0.0747,
"step": 46400
},
{
"epoch": 0.6671262584916771,
"grad_norm": 6.942692279815674,
"learning_rate": 2.3444422530197565e-05,
"loss": 0.0694,
"step": 46450
},
{
"epoch": 0.6678443707182559,
"grad_norm": 0.13905298709869385,
"learning_rate": 2.3437206863542124e-05,
"loss": 0.0425,
"step": 46500
},
{
"epoch": 0.6685624829448347,
"grad_norm": 18.240694046020508,
"learning_rate": 2.342999119688668e-05,
"loss": 0.0532,
"step": 46550
},
{
"epoch": 0.6692805951714134,
"grad_norm": 0.1489611268043518,
"learning_rate": 2.342277553023124e-05,
"loss": 0.0698,
"step": 46600
},
{
"epoch": 0.6699987073979922,
"grad_norm": 0.06899626553058624,
"learning_rate": 2.3415559863575797e-05,
"loss": 0.0595,
"step": 46650
},
{
"epoch": 0.670716819624571,
"grad_norm": 0.0636877715587616,
"learning_rate": 2.3408344196920352e-05,
"loss": 0.049,
"step": 46700
},
{
"epoch": 0.6714349318511497,
"grad_norm": 8.30612564086914,
"learning_rate": 2.3401128530264914e-05,
"loss": 0.0551,
"step": 46750
},
{
"epoch": 0.6721530440777285,
"grad_norm": 2.128549575805664,
"learning_rate": 2.339391286360947e-05,
"loss": 0.0701,
"step": 46800
},
{
"epoch": 0.6728711563043073,
"grad_norm": 0.06187480315566063,
"learning_rate": 2.3386697196954025e-05,
"loss": 0.0827,
"step": 46850
},
{
"epoch": 0.673589268530886,
"grad_norm": 0.03272176533937454,
"learning_rate": 2.3379481530298587e-05,
"loss": 0.0612,
"step": 46900
},
{
"epoch": 0.6743073807574648,
"grad_norm": 0.9007086753845215,
"learning_rate": 2.3372265863643142e-05,
"loss": 0.0303,
"step": 46950
},
{
"epoch": 0.6750254929840436,
"grad_norm": 0.48982569575309753,
"learning_rate": 2.33650501969877e-05,
"loss": 0.0446,
"step": 47000
},
{
"epoch": 0.6757436052106223,
"grad_norm": 0.0573323518037796,
"learning_rate": 2.335783453033226e-05,
"loss": 0.0466,
"step": 47050
},
{
"epoch": 0.6764617174372011,
"grad_norm": 0.035964153707027435,
"learning_rate": 2.3350618863676815e-05,
"loss": 0.0385,
"step": 47100
},
{
"epoch": 0.6771798296637799,
"grad_norm": 0.10130264610052109,
"learning_rate": 2.3343403197021374e-05,
"loss": 0.0615,
"step": 47150
},
{
"epoch": 0.6778979418903587,
"grad_norm": 0.08190751075744629,
"learning_rate": 2.3336187530365932e-05,
"loss": 0.0342,
"step": 47200
},
{
"epoch": 0.6786160541169374,
"grad_norm": 0.061607372015714645,
"learning_rate": 2.3328971863710488e-05,
"loss": 0.0418,
"step": 47250
},
{
"epoch": 0.6793341663435162,
"grad_norm": 0.16257597506046295,
"learning_rate": 2.3321756197055046e-05,
"loss": 0.0375,
"step": 47300
},
{
"epoch": 0.680052278570095,
"grad_norm": 3.986081123352051,
"learning_rate": 2.3314540530399605e-05,
"loss": 0.0343,
"step": 47350
},
{
"epoch": 0.6807703907966737,
"grad_norm": 0.6988328099250793,
"learning_rate": 2.330732486374416e-05,
"loss": 0.0137,
"step": 47400
},
{
"epoch": 0.6814885030232525,
"grad_norm": 1.6210227012634277,
"learning_rate": 2.330010919708872e-05,
"loss": 0.0755,
"step": 47450
},
{
"epoch": 0.6822066152498313,
"grad_norm": 0.025823798030614853,
"learning_rate": 2.3292893530433278e-05,
"loss": 0.0405,
"step": 47500
},
{
"epoch": 0.68292472747641,
"grad_norm": 9.932079315185547,
"learning_rate": 2.3285677863777833e-05,
"loss": 0.0547,
"step": 47550
},
{
"epoch": 0.6836428397029888,
"grad_norm": 0.060679610818624496,
"learning_rate": 2.3278462197122392e-05,
"loss": 0.0827,
"step": 47600
},
{
"epoch": 0.6843609519295676,
"grad_norm": 0.05937912315130234,
"learning_rate": 2.327124653046695e-05,
"loss": 0.0997,
"step": 47650
},
{
"epoch": 0.6850790641561463,
"grad_norm": 0.19867148995399475,
"learning_rate": 2.326403086381151e-05,
"loss": 0.0405,
"step": 47700
},
{
"epoch": 0.6857971763827251,
"grad_norm": 0.05542898178100586,
"learning_rate": 2.3256815197156065e-05,
"loss": 0.0463,
"step": 47750
},
{
"epoch": 0.6865152886093039,
"grad_norm": 0.027509741485118866,
"learning_rate": 2.3249599530500623e-05,
"loss": 0.0291,
"step": 47800
},
{
"epoch": 0.6872334008358826,
"grad_norm": 2.161909818649292,
"learning_rate": 2.3242383863845182e-05,
"loss": 0.0709,
"step": 47850
},
{
"epoch": 0.6879515130624614,
"grad_norm": 0.05292743816971779,
"learning_rate": 2.3235168197189738e-05,
"loss": 0.0551,
"step": 47900
},
{
"epoch": 0.6886696252890402,
"grad_norm": 0.043459270149469376,
"learning_rate": 2.3227952530534296e-05,
"loss": 0.0444,
"step": 47950
},
{
"epoch": 0.689387737515619,
"grad_norm": 0.08632846176624298,
"learning_rate": 2.3220736863878855e-05,
"loss": 0.0678,
"step": 48000
},
{
"epoch": 0.6901058497421977,
"grad_norm": 0.12333878874778748,
"learning_rate": 2.321352119722341e-05,
"loss": 0.0509,
"step": 48050
},
{
"epoch": 0.6908239619687765,
"grad_norm": 0.0415424220263958,
"learning_rate": 2.320630553056797e-05,
"loss": 0.063,
"step": 48100
},
{
"epoch": 0.6915420741953553,
"grad_norm": 0.7142232656478882,
"learning_rate": 2.3199089863912528e-05,
"loss": 0.0596,
"step": 48150
},
{
"epoch": 0.692260186421934,
"grad_norm": 0.036065757274627686,
"learning_rate": 2.3191874197257083e-05,
"loss": 0.0404,
"step": 48200
},
{
"epoch": 0.6929782986485128,
"grad_norm": 0.02197784185409546,
"learning_rate": 2.3184658530601645e-05,
"loss": 0.025,
"step": 48250
},
{
"epoch": 0.6936964108750916,
"grad_norm": 12.290438652038574,
"learning_rate": 2.31774428639462e-05,
"loss": 0.0583,
"step": 48300
},
{
"epoch": 0.6944145231016703,
"grad_norm": 0.08092088252305984,
"learning_rate": 2.3170227197290756e-05,
"loss": 0.0656,
"step": 48350
},
{
"epoch": 0.6951326353282491,
"grad_norm": 0.07545096427202225,
"learning_rate": 2.3163011530635318e-05,
"loss": 0.0505,
"step": 48400
},
{
"epoch": 0.6958507475548279,
"grad_norm": 1.5212199687957764,
"learning_rate": 2.3155795863979873e-05,
"loss": 0.0378,
"step": 48450
},
{
"epoch": 0.6965688597814066,
"grad_norm": 0.05450819805264473,
"learning_rate": 2.3148580197324432e-05,
"loss": 0.0255,
"step": 48500
},
{
"epoch": 0.6972869720079854,
"grad_norm": 0.30951157212257385,
"learning_rate": 2.314136453066899e-05,
"loss": 0.0911,
"step": 48550
},
{
"epoch": 0.6980050842345642,
"grad_norm": 0.032920509576797485,
"learning_rate": 2.3134148864013546e-05,
"loss": 0.0368,
"step": 48600
},
{
"epoch": 0.6987231964611429,
"grad_norm": 0.09513316303491592,
"learning_rate": 2.3126933197358105e-05,
"loss": 0.0462,
"step": 48650
},
{
"epoch": 0.6994413086877217,
"grad_norm": 0.05050795152783394,
"learning_rate": 2.3119717530702664e-05,
"loss": 0.0413,
"step": 48700
},
{
"epoch": 0.7001594209143005,
"grad_norm": 4.3214430809021,
"learning_rate": 2.311250186404722e-05,
"loss": 0.0649,
"step": 48750
},
{
"epoch": 0.7008775331408793,
"grad_norm": 0.04189575836062431,
"learning_rate": 2.3105286197391778e-05,
"loss": 0.0509,
"step": 48800
},
{
"epoch": 0.701595645367458,
"grad_norm": 57.516822814941406,
"learning_rate": 2.3098070530736336e-05,
"loss": 0.0302,
"step": 48850
},
{
"epoch": 0.7023137575940368,
"grad_norm": 0.05342291295528412,
"learning_rate": 2.309085486408089e-05,
"loss": 0.0981,
"step": 48900
},
{
"epoch": 0.7030318698206156,
"grad_norm": 0.06606775522232056,
"learning_rate": 2.3083639197425454e-05,
"loss": 0.0375,
"step": 48950
},
{
"epoch": 0.7037499820471943,
"grad_norm": 4.33423376083374,
"learning_rate": 2.307642353077001e-05,
"loss": 0.0372,
"step": 49000
},
{
"epoch": 0.7044680942737731,
"grad_norm": 0.1970718950033188,
"learning_rate": 2.3069207864114564e-05,
"loss": 0.0447,
"step": 49050
},
{
"epoch": 0.7051862065003519,
"grad_norm": 0.03607802465558052,
"learning_rate": 2.3061992197459127e-05,
"loss": 0.0762,
"step": 49100
},
{
"epoch": 0.7059043187269306,
"grad_norm": 0.03366904705762863,
"learning_rate": 2.3054776530803682e-05,
"loss": 0.0685,
"step": 49150
},
{
"epoch": 0.7066224309535094,
"grad_norm": 7.310495376586914,
"learning_rate": 2.3047560864148237e-05,
"loss": 0.0862,
"step": 49200
},
{
"epoch": 0.7073405431800882,
"grad_norm": 0.07653423398733139,
"learning_rate": 2.30403451974928e-05,
"loss": 0.0479,
"step": 49250
},
{
"epoch": 0.7080586554066669,
"grad_norm": 0.03945968300104141,
"learning_rate": 2.3033129530837355e-05,
"loss": 0.0297,
"step": 49300
},
{
"epoch": 0.7087767676332457,
"grad_norm": 0.015093422494828701,
"learning_rate": 2.302591386418191e-05,
"loss": 0.0314,
"step": 49350
},
{
"epoch": 0.7094948798598245,
"grad_norm": 0.025879524648189545,
"learning_rate": 2.3018698197526472e-05,
"loss": 0.0603,
"step": 49400
},
{
"epoch": 0.7102129920864033,
"grad_norm": 0.01879737712442875,
"learning_rate": 2.3011482530871027e-05,
"loss": 0.0545,
"step": 49450
},
{
"epoch": 0.710931104312982,
"grad_norm": 0.027388766407966614,
"learning_rate": 2.3004266864215583e-05,
"loss": 0.0252,
"step": 49500
},
{
"epoch": 0.7116492165395608,
"grad_norm": 0.03858887404203415,
"learning_rate": 2.2997051197560145e-05,
"loss": 0.042,
"step": 49550
},
{
"epoch": 0.7123673287661396,
"grad_norm": 0.21305541694164276,
"learning_rate": 2.29898355309047e-05,
"loss": 0.0649,
"step": 49600
},
{
"epoch": 0.7130854409927183,
"grad_norm": 0.13431255519390106,
"learning_rate": 2.298261986424926e-05,
"loss": 0.0599,
"step": 49650
},
{
"epoch": 0.7138035532192971,
"grad_norm": 4.919084072113037,
"learning_rate": 2.2975404197593818e-05,
"loss": 0.0439,
"step": 49700
},
{
"epoch": 0.7145216654458759,
"grad_norm": 0.0446869432926178,
"learning_rate": 2.2968188530938373e-05,
"loss": 0.027,
"step": 49750
},
{
"epoch": 0.7152397776724546,
"grad_norm": 0.018511831760406494,
"learning_rate": 2.2960972864282932e-05,
"loss": 0.0274,
"step": 49800
},
{
"epoch": 0.7159578898990334,
"grad_norm": 1.278128743171692,
"learning_rate": 2.295375719762749e-05,
"loss": 0.0496,
"step": 49850
},
{
"epoch": 0.7166760021256122,
"grad_norm": 0.04163791239261627,
"learning_rate": 2.2946541530972046e-05,
"loss": 0.0374,
"step": 49900
},
{
"epoch": 0.7173941143521909,
"grad_norm": 2.655949115753174,
"learning_rate": 2.2939325864316605e-05,
"loss": 0.0497,
"step": 49950
},
{
"epoch": 0.7181122265787697,
"grad_norm": 2.2864792346954346,
"learning_rate": 2.2932110197661163e-05,
"loss": 0.0375,
"step": 50000
},
{
"epoch": 0.7188303388053485,
"grad_norm": 1.0832637548446655,
"learning_rate": 2.292489453100572e-05,
"loss": 0.033,
"step": 50050
},
{
"epoch": 0.7195484510319272,
"grad_norm": 0.036344680935144424,
"learning_rate": 2.2917678864350277e-05,
"loss": 0.0441,
"step": 50100
},
{
"epoch": 0.720266563258506,
"grad_norm": 0.07024290412664413,
"learning_rate": 2.2910463197694836e-05,
"loss": 0.0464,
"step": 50150
},
{
"epoch": 0.7209846754850848,
"grad_norm": 0.04781986400485039,
"learning_rate": 2.290324753103939e-05,
"loss": 0.0235,
"step": 50200
},
{
"epoch": 0.7217027877116636,
"grad_norm": 7.497429847717285,
"learning_rate": 2.289603186438395e-05,
"loss": 0.0568,
"step": 50250
},
{
"epoch": 0.7224208999382423,
"grad_norm": 0.03674536198377609,
"learning_rate": 2.288881619772851e-05,
"loss": 0.0229,
"step": 50300
},
{
"epoch": 0.7231390121648211,
"grad_norm": 27.460649490356445,
"learning_rate": 2.2881600531073068e-05,
"loss": 0.0394,
"step": 50350
},
{
"epoch": 0.7238571243913999,
"grad_norm": 0.5803928375244141,
"learning_rate": 2.2874384864417623e-05,
"loss": 0.0319,
"step": 50400
},
{
"epoch": 0.7245752366179786,
"grad_norm": 0.22803975641727448,
"learning_rate": 2.286716919776218e-05,
"loss": 0.0652,
"step": 50450
},
{
"epoch": 0.7252933488445574,
"grad_norm": 0.061174940317869186,
"learning_rate": 2.285995353110674e-05,
"loss": 0.0387,
"step": 50500
},
{
"epoch": 0.7260114610711362,
"grad_norm": 9.43375015258789,
"learning_rate": 2.28527378644513e-05,
"loss": 0.0538,
"step": 50550
},
{
"epoch": 0.7267295732977149,
"grad_norm": 0.11616658419370651,
"learning_rate": 2.2845522197795854e-05,
"loss": 0.0566,
"step": 50600
},
{
"epoch": 0.7274476855242937,
"grad_norm": 0.30462756752967834,
"learning_rate": 2.2838306531140413e-05,
"loss": 0.0836,
"step": 50650
},
{
"epoch": 0.7281657977508725,
"grad_norm": 2.478368043899536,
"learning_rate": 2.2831090864484972e-05,
"loss": 0.0556,
"step": 50700
},
{
"epoch": 0.7288839099774512,
"grad_norm": 1.8832333087921143,
"learning_rate": 2.2823875197829527e-05,
"loss": 0.0547,
"step": 50750
},
{
"epoch": 0.72960202220403,
"grad_norm": 0.07902120053768158,
"learning_rate": 2.2816659531174086e-05,
"loss": 0.0332,
"step": 50800
},
{
"epoch": 0.7303201344306088,
"grad_norm": 3.213438034057617,
"learning_rate": 2.2809443864518645e-05,
"loss": 0.0287,
"step": 50850
},
{
"epoch": 0.7310382466571876,
"grad_norm": 1.2780394554138184,
"learning_rate": 2.28022281978632e-05,
"loss": 0.0519,
"step": 50900
},
{
"epoch": 0.7317563588837663,
"grad_norm": 0.04258378595113754,
"learning_rate": 2.279501253120776e-05,
"loss": 0.0229,
"step": 50950
},
{
"epoch": 0.7324744711103451,
"grad_norm": 0.04968509078025818,
"learning_rate": 2.2787796864552317e-05,
"loss": 0.0492,
"step": 51000
},
{
"epoch": 0.7331925833369239,
"grad_norm": 15.08222770690918,
"learning_rate": 2.2780581197896876e-05,
"loss": 0.0727,
"step": 51050
},
{
"epoch": 0.7339106955635026,
"grad_norm": 0.038185350596904755,
"learning_rate": 2.277336553124143e-05,
"loss": 0.0455,
"step": 51100
},
{
"epoch": 0.7346288077900814,
"grad_norm": 37.51081466674805,
"learning_rate": 2.276614986458599e-05,
"loss": 0.0562,
"step": 51150
},
{
"epoch": 0.7353469200166602,
"grad_norm": 1.8346716165542603,
"learning_rate": 2.275893419793055e-05,
"loss": 0.0362,
"step": 51200
},
{
"epoch": 0.7360650322432389,
"grad_norm": 0.023607393726706505,
"learning_rate": 2.2751718531275104e-05,
"loss": 0.0701,
"step": 51250
},
{
"epoch": 0.7367831444698177,
"grad_norm": 0.03187699615955353,
"learning_rate": 2.2744502864619663e-05,
"loss": 0.0289,
"step": 51300
},
{
"epoch": 0.7375012566963965,
"grad_norm": 0.48830530047416687,
"learning_rate": 2.273728719796422e-05,
"loss": 0.0756,
"step": 51350
},
{
"epoch": 0.7382193689229752,
"grad_norm": 0.08487236499786377,
"learning_rate": 2.2730071531308777e-05,
"loss": 0.0709,
"step": 51400
},
{
"epoch": 0.738937481149554,
"grad_norm": 0.061929915100336075,
"learning_rate": 2.2722855864653336e-05,
"loss": 0.0424,
"step": 51450
},
{
"epoch": 0.7396555933761328,
"grad_norm": 0.02025146409869194,
"learning_rate": 2.2715640197997894e-05,
"loss": 0.0261,
"step": 51500
},
{
"epoch": 0.7403737056027115,
"grad_norm": 0.04376785829663277,
"learning_rate": 2.270842453134245e-05,
"loss": 0.062,
"step": 51550
},
{
"epoch": 0.7410918178292903,
"grad_norm": 0.05691489577293396,
"learning_rate": 2.270120886468701e-05,
"loss": 0.0492,
"step": 51600
},
{
"epoch": 0.7418099300558691,
"grad_norm": 0.04114688187837601,
"learning_rate": 2.2693993198031567e-05,
"loss": 0.0477,
"step": 51650
},
{
"epoch": 0.742528042282448,
"grad_norm": 0.04338749125599861,
"learning_rate": 2.2686777531376123e-05,
"loss": 0.0524,
"step": 51700
},
{
"epoch": 0.7432461545090266,
"grad_norm": 0.031342532485723495,
"learning_rate": 2.2679561864720685e-05,
"loss": 0.0278,
"step": 51750
},
{
"epoch": 0.7439642667356055,
"grad_norm": 24.92247200012207,
"learning_rate": 2.267234619806524e-05,
"loss": 0.0359,
"step": 51800
},
{
"epoch": 0.7446823789621843,
"grad_norm": 0.12787804007530212,
"learning_rate": 2.2665130531409795e-05,
"loss": 0.0413,
"step": 51850
},
{
"epoch": 0.745400491188763,
"grad_norm": 0.013693703338503838,
"learning_rate": 2.2657914864754357e-05,
"loss": 0.0279,
"step": 51900
},
{
"epoch": 0.7461186034153418,
"grad_norm": 408.0387878417969,
"learning_rate": 2.2650699198098913e-05,
"loss": 0.0313,
"step": 51950
},
{
"epoch": 0.7468367156419206,
"grad_norm": 0.014424344524741173,
"learning_rate": 2.2643483531443468e-05,
"loss": 0.034,
"step": 52000
},
{
"epoch": 0.7475548278684992,
"grad_norm": 0.08368874341249466,
"learning_rate": 2.263626786478803e-05,
"loss": 0.0298,
"step": 52050
},
{
"epoch": 0.748272940095078,
"grad_norm": 0.20141181349754333,
"learning_rate": 2.2629052198132586e-05,
"loss": 0.0459,
"step": 52100
},
{
"epoch": 0.7489910523216569,
"grad_norm": 0.14126254618167877,
"learning_rate": 2.262183653147714e-05,
"loss": 0.0477,
"step": 52150
},
{
"epoch": 0.7497091645482356,
"grad_norm": 0.013539042323827744,
"learning_rate": 2.2614620864821703e-05,
"loss": 0.0348,
"step": 52200
},
{
"epoch": 0.7504272767748144,
"grad_norm": 0.15344727039337158,
"learning_rate": 2.260740519816626e-05,
"loss": 0.0672,
"step": 52250
},
{
"epoch": 0.7511453890013932,
"grad_norm": 0.028053514659404755,
"learning_rate": 2.2600189531510814e-05,
"loss": 0.0288,
"step": 52300
},
{
"epoch": 0.7518635012279719,
"grad_norm": 0.030573882162570953,
"learning_rate": 2.2592973864855376e-05,
"loss": 0.0338,
"step": 52350
},
{
"epoch": 0.7525816134545507,
"grad_norm": 0.03588308021426201,
"learning_rate": 2.258575819819993e-05,
"loss": 0.0362,
"step": 52400
},
{
"epoch": 0.7532997256811295,
"grad_norm": 0.031105153262615204,
"learning_rate": 2.2578542531544493e-05,
"loss": 0.0311,
"step": 52450
},
{
"epoch": 0.7540178379077083,
"grad_norm": 4.209131717681885,
"learning_rate": 2.257132686488905e-05,
"loss": 0.014,
"step": 52500
},
{
"epoch": 0.754735950134287,
"grad_norm": 0.23591452836990356,
"learning_rate": 2.2564111198233604e-05,
"loss": 0.0755,
"step": 52550
},
{
"epoch": 0.7554540623608658,
"grad_norm": 2.0754292011260986,
"learning_rate": 2.2556895531578166e-05,
"loss": 0.026,
"step": 52600
},
{
"epoch": 0.7561721745874446,
"grad_norm": 0.00931734312325716,
"learning_rate": 2.254967986492272e-05,
"loss": 0.0423,
"step": 52650
},
{
"epoch": 0.7568902868140233,
"grad_norm": 0.04183708503842354,
"learning_rate": 2.2542464198267277e-05,
"loss": 0.0501,
"step": 52700
},
{
"epoch": 0.7576083990406021,
"grad_norm": 1.4638354778289795,
"learning_rate": 2.253524853161184e-05,
"loss": 0.0654,
"step": 52750
},
{
"epoch": 0.7583265112671809,
"grad_norm": 0.03767836093902588,
"learning_rate": 2.2528032864956394e-05,
"loss": 0.0318,
"step": 52800
},
{
"epoch": 0.7590446234937596,
"grad_norm": 0.021045122295618057,
"learning_rate": 2.252081719830095e-05,
"loss": 0.0148,
"step": 52850
},
{
"epoch": 0.7597627357203384,
"grad_norm": 0.024530332535505295,
"learning_rate": 2.251360153164551e-05,
"loss": 0.0532,
"step": 52900
},
{
"epoch": 0.7604808479469172,
"grad_norm": 0.029710279777646065,
"learning_rate": 2.2506385864990067e-05,
"loss": 0.0595,
"step": 52950
},
{
"epoch": 0.7611989601734959,
"grad_norm": 0.04434856027364731,
"learning_rate": 2.2499170198334622e-05,
"loss": 0.036,
"step": 53000
},
{
"epoch": 0.7619170724000747,
"grad_norm": 18.918506622314453,
"learning_rate": 2.2491954531679184e-05,
"loss": 0.0553,
"step": 53050
},
{
"epoch": 0.7626351846266535,
"grad_norm": 0.049203574657440186,
"learning_rate": 2.248473886502374e-05,
"loss": 0.0348,
"step": 53100
},
{
"epoch": 0.7633532968532323,
"grad_norm": 1.5684940814971924,
"learning_rate": 2.24775231983683e-05,
"loss": 0.0347,
"step": 53150
},
{
"epoch": 0.764071409079811,
"grad_norm": 0.02496212162077427,
"learning_rate": 2.2470307531712857e-05,
"loss": 0.0626,
"step": 53200
},
{
"epoch": 0.7647895213063898,
"grad_norm": 0.024081533774733543,
"learning_rate": 2.2463091865057412e-05,
"loss": 0.0538,
"step": 53250
},
{
"epoch": 0.7655076335329686,
"grad_norm": 0.460574746131897,
"learning_rate": 2.245587619840197e-05,
"loss": 0.0426,
"step": 53300
},
{
"epoch": 0.7662257457595473,
"grad_norm": 0.01899988390505314,
"learning_rate": 2.244866053174653e-05,
"loss": 0.0283,
"step": 53350
},
{
"epoch": 0.7669438579861261,
"grad_norm": 2.701721668243408,
"learning_rate": 2.2441444865091085e-05,
"loss": 0.0628,
"step": 53400
},
{
"epoch": 0.7676619702127049,
"grad_norm": 0.5999152660369873,
"learning_rate": 2.2434229198435644e-05,
"loss": 0.019,
"step": 53450
},
{
"epoch": 0.7683800824392836,
"grad_norm": 0.2346739023923874,
"learning_rate": 2.2427013531780203e-05,
"loss": 0.035,
"step": 53500
},
{
"epoch": 0.7690981946658624,
"grad_norm": 0.01378537155687809,
"learning_rate": 2.2419797865124758e-05,
"loss": 0.0494,
"step": 53550
},
{
"epoch": 0.7698163068924412,
"grad_norm": 0.010730593465268612,
"learning_rate": 2.2412582198469317e-05,
"loss": 0.0394,
"step": 53600
},
{
"epoch": 0.7705344191190199,
"grad_norm": 85.68353271484375,
"learning_rate": 2.2405366531813875e-05,
"loss": 0.0542,
"step": 53650
},
{
"epoch": 0.7712525313455987,
"grad_norm": 0.024320660158991814,
"learning_rate": 2.239815086515843e-05,
"loss": 0.039,
"step": 53700
},
{
"epoch": 0.7719706435721775,
"grad_norm": 0.03382103890180588,
"learning_rate": 2.239093519850299e-05,
"loss": 0.0696,
"step": 53750
},
{
"epoch": 0.7726887557987562,
"grad_norm": 0.1424567699432373,
"learning_rate": 2.2383719531847548e-05,
"loss": 0.0431,
"step": 53800
},
{
"epoch": 0.773406868025335,
"grad_norm": 6.650714874267578,
"learning_rate": 2.2376503865192107e-05,
"loss": 0.0292,
"step": 53850
},
{
"epoch": 0.7741249802519138,
"grad_norm": 0.012601901777088642,
"learning_rate": 2.2369288198536662e-05,
"loss": 0.0089,
"step": 53900
},
{
"epoch": 0.7748430924784926,
"grad_norm": 0.017761344090104103,
"learning_rate": 2.236207253188122e-05,
"loss": 0.0547,
"step": 53950
},
{
"epoch": 0.7755612047050713,
"grad_norm": 0.021128958091139793,
"learning_rate": 2.235485686522578e-05,
"loss": 0.0539,
"step": 54000
},
{
"epoch": 0.7762793169316501,
"grad_norm": 0.03616216406226158,
"learning_rate": 2.2347641198570335e-05,
"loss": 0.0574,
"step": 54050
},
{
"epoch": 0.7769974291582289,
"grad_norm": 0.0315951332449913,
"learning_rate": 2.2340425531914894e-05,
"loss": 0.0553,
"step": 54100
},
{
"epoch": 0.7777155413848076,
"grad_norm": 0.1365536004304886,
"learning_rate": 2.2333209865259453e-05,
"loss": 0.0376,
"step": 54150
},
{
"epoch": 0.7784336536113864,
"grad_norm": 0.064913310110569,
"learning_rate": 2.2325994198604008e-05,
"loss": 0.0362,
"step": 54200
},
{
"epoch": 0.7791517658379652,
"grad_norm": 0.2787727415561676,
"learning_rate": 2.2318778531948567e-05,
"loss": 0.0333,
"step": 54250
},
{
"epoch": 0.7798698780645439,
"grad_norm": 0.06095251441001892,
"learning_rate": 2.2311562865293125e-05,
"loss": 0.0442,
"step": 54300
},
{
"epoch": 0.7805879902911227,
"grad_norm": 0.06852833181619644,
"learning_rate": 2.230434719863768e-05,
"loss": 0.0483,
"step": 54350
},
{
"epoch": 0.7813061025177015,
"grad_norm": 5.220205307006836,
"learning_rate": 2.229713153198224e-05,
"loss": 0.0524,
"step": 54400
},
{
"epoch": 0.7820242147442802,
"grad_norm": 0.03756846487522125,
"learning_rate": 2.2289915865326798e-05,
"loss": 0.0208,
"step": 54450
},
{
"epoch": 0.782742326970859,
"grad_norm": 0.027425279840826988,
"learning_rate": 2.2282700198671357e-05,
"loss": 0.0503,
"step": 54500
},
{
"epoch": 0.7834604391974378,
"grad_norm": 0.040080614387989044,
"learning_rate": 2.2275484532015916e-05,
"loss": 0.0201,
"step": 54550
},
{
"epoch": 0.7841785514240165,
"grad_norm": 46.412166595458984,
"learning_rate": 2.226826886536047e-05,
"loss": 0.0613,
"step": 54600
},
{
"epoch": 0.7848966636505953,
"grad_norm": 8.979586601257324,
"learning_rate": 2.226105319870503e-05,
"loss": 0.0199,
"step": 54650
},
{
"epoch": 0.7856147758771741,
"grad_norm": 0.05250140652060509,
"learning_rate": 2.225383753204959e-05,
"loss": 0.0367,
"step": 54700
},
{
"epoch": 0.7863328881037529,
"grad_norm": 1.2878022193908691,
"learning_rate": 2.2246621865394144e-05,
"loss": 0.049,
"step": 54750
},
{
"epoch": 0.7870510003303316,
"grad_norm": 1.8124881982803345,
"learning_rate": 2.2239406198738702e-05,
"loss": 0.0195,
"step": 54800
},
{
"epoch": 0.7877691125569104,
"grad_norm": 0.08563178777694702,
"learning_rate": 2.223219053208326e-05,
"loss": 0.0463,
"step": 54850
},
{
"epoch": 0.7884872247834892,
"grad_norm": 0.10941838473081589,
"learning_rate": 2.2224974865427816e-05,
"loss": 0.0641,
"step": 54900
},
{
"epoch": 0.7892053370100679,
"grad_norm": 0.09081951528787613,
"learning_rate": 2.2217759198772375e-05,
"loss": 0.0405,
"step": 54950
},
{
"epoch": 0.7899234492366467,
"grad_norm": 3.70243239402771,
"learning_rate": 2.2210543532116934e-05,
"loss": 0.0546,
"step": 55000
},
{
"epoch": 0.7906415614632255,
"grad_norm": 0.4412289559841156,
"learning_rate": 2.220332786546149e-05,
"loss": 0.0465,
"step": 55050
},
{
"epoch": 0.7913596736898042,
"grad_norm": 3.0478625297546387,
"learning_rate": 2.2196112198806048e-05,
"loss": 0.0307,
"step": 55100
},
{
"epoch": 0.792077785916383,
"grad_norm": 26.179941177368164,
"learning_rate": 2.2188896532150607e-05,
"loss": 0.0603,
"step": 55150
},
{
"epoch": 0.7927958981429618,
"grad_norm": 6.677818298339844,
"learning_rate": 2.2181680865495162e-05,
"loss": 0.0224,
"step": 55200
},
{
"epoch": 0.7935140103695405,
"grad_norm": 0.05255184322595596,
"learning_rate": 2.2174465198839724e-05,
"loss": 0.0374,
"step": 55250
},
{
"epoch": 0.7942321225961193,
"grad_norm": 0.07613476365804672,
"learning_rate": 2.216724953218428e-05,
"loss": 0.0273,
"step": 55300
},
{
"epoch": 0.7949502348226981,
"grad_norm": 0.021911421790719032,
"learning_rate": 2.2160033865528835e-05,
"loss": 0.0326,
"step": 55350
},
{
"epoch": 0.7956683470492769,
"grad_norm": 2.5375399589538574,
"learning_rate": 2.2152818198873397e-05,
"loss": 0.0549,
"step": 55400
},
{
"epoch": 0.7963864592758556,
"grad_norm": 0.011430012993514538,
"learning_rate": 2.2145602532217952e-05,
"loss": 0.0511,
"step": 55450
},
{
"epoch": 0.7971045715024344,
"grad_norm": 0.04169834032654762,
"learning_rate": 2.2138386865562508e-05,
"loss": 0.0571,
"step": 55500
},
{
"epoch": 0.7978226837290132,
"grad_norm": 0.03755054622888565,
"learning_rate": 2.213117119890707e-05,
"loss": 0.0327,
"step": 55550
},
{
"epoch": 0.7985407959555919,
"grad_norm": 17.928287506103516,
"learning_rate": 2.2123955532251625e-05,
"loss": 0.0356,
"step": 55600
},
{
"epoch": 0.7992589081821707,
"grad_norm": 0.16650213301181793,
"learning_rate": 2.211673986559618e-05,
"loss": 0.0801,
"step": 55650
},
{
"epoch": 0.7999770204087495,
"grad_norm": 0.09039030224084854,
"learning_rate": 2.2109524198940742e-05,
"loss": 0.0364,
"step": 55700
},
{
"epoch": 0.8006951326353282,
"grad_norm": 0.6012157797813416,
"learning_rate": 2.2102308532285298e-05,
"loss": 0.0696,
"step": 55750
},
{
"epoch": 0.801413244861907,
"grad_norm": 0.10176081955432892,
"learning_rate": 2.2095092865629853e-05,
"loss": 0.0163,
"step": 55800
},
{
"epoch": 0.8021313570884858,
"grad_norm": 0.047887321561574936,
"learning_rate": 2.2087877198974415e-05,
"loss": 0.0308,
"step": 55850
},
{
"epoch": 0.8028494693150645,
"grad_norm": 0.04983760043978691,
"learning_rate": 2.208066153231897e-05,
"loss": 0.0274,
"step": 55900
},
{
"epoch": 0.8035675815416433,
"grad_norm": 0.30719447135925293,
"learning_rate": 2.207344586566353e-05,
"loss": 0.0187,
"step": 55950
},
{
"epoch": 0.8042856937682221,
"grad_norm": 0.024881890043616295,
"learning_rate": 2.2066230199008088e-05,
"loss": 0.0512,
"step": 56000
},
{
"epoch": 0.8050038059948008,
"grad_norm": 11.138710975646973,
"learning_rate": 2.2059014532352643e-05,
"loss": 0.0306,
"step": 56050
},
{
"epoch": 0.8057219182213796,
"grad_norm": 0.06446365267038345,
"learning_rate": 2.2051798865697202e-05,
"loss": 0.0406,
"step": 56100
},
{
"epoch": 0.8064400304479584,
"grad_norm": 0.054006390273571014,
"learning_rate": 2.204458319904176e-05,
"loss": 0.0479,
"step": 56150
},
{
"epoch": 0.8071581426745372,
"grad_norm": 1.996266484260559,
"learning_rate": 2.2037367532386316e-05,
"loss": 0.0436,
"step": 56200
},
{
"epoch": 0.8078762549011159,
"grad_norm": 0.07242928445339203,
"learning_rate": 2.2030151865730875e-05,
"loss": 0.0208,
"step": 56250
},
{
"epoch": 0.8085943671276947,
"grad_norm": 0.02184985764324665,
"learning_rate": 2.2022936199075434e-05,
"loss": 0.055,
"step": 56300
},
{
"epoch": 0.8093124793542735,
"grad_norm": 0.04901851341128349,
"learning_rate": 2.201572053241999e-05,
"loss": 0.0448,
"step": 56350
},
{
"epoch": 0.8100305915808522,
"grad_norm": 0.05226970463991165,
"learning_rate": 2.200850486576455e-05,
"loss": 0.0315,
"step": 56400
},
{
"epoch": 0.810748703807431,
"grad_norm": 0.006065180990844965,
"learning_rate": 2.2001289199109106e-05,
"loss": 0.0244,
"step": 56450
},
{
"epoch": 0.8114668160340098,
"grad_norm": 0.027511123567819595,
"learning_rate": 2.1994073532453662e-05,
"loss": 0.0454,
"step": 56500
},
{
"epoch": 0.8121849282605885,
"grad_norm": 1.7325809001922607,
"learning_rate": 2.1986857865798224e-05,
"loss": 0.0392,
"step": 56550
},
{
"epoch": 0.8129030404871673,
"grad_norm": 33.937747955322266,
"learning_rate": 2.197964219914278e-05,
"loss": 0.0374,
"step": 56600
},
{
"epoch": 0.8136211527137461,
"grad_norm": 0.05114162340760231,
"learning_rate": 2.1972426532487338e-05,
"loss": 0.0551,
"step": 56650
},
{
"epoch": 0.8143392649403248,
"grad_norm": 0.015771133825182915,
"learning_rate": 2.1965210865831897e-05,
"loss": 0.0464,
"step": 56700
},
{
"epoch": 0.8150573771669036,
"grad_norm": 0.032142359763383865,
"learning_rate": 2.1957995199176452e-05,
"loss": 0.0581,
"step": 56750
},
{
"epoch": 0.8157754893934824,
"grad_norm": 8.521427154541016,
"learning_rate": 2.195077953252101e-05,
"loss": 0.0469,
"step": 56800
},
{
"epoch": 0.8164936016200612,
"grad_norm": 0.06911306083202362,
"learning_rate": 2.194356386586557e-05,
"loss": 0.055,
"step": 56850
},
{
"epoch": 0.8172117138466399,
"grad_norm": 0.028982315212488174,
"learning_rate": 2.1936348199210125e-05,
"loss": 0.0365,
"step": 56900
},
{
"epoch": 0.8179298260732187,
"grad_norm": 0.04234936460852623,
"learning_rate": 2.1929132532554683e-05,
"loss": 0.0474,
"step": 56950
},
{
"epoch": 0.8186479382997975,
"grad_norm": 48.88418960571289,
"learning_rate": 2.1921916865899242e-05,
"loss": 0.0318,
"step": 57000
},
{
"epoch": 0.8193660505263762,
"grad_norm": 0.007455866783857346,
"learning_rate": 2.1914701199243797e-05,
"loss": 0.0322,
"step": 57050
},
{
"epoch": 0.820084162752955,
"grad_norm": 0.013227002695202827,
"learning_rate": 2.1907485532588356e-05,
"loss": 0.0464,
"step": 57100
},
{
"epoch": 0.8208022749795338,
"grad_norm": 0.017085455358028412,
"learning_rate": 2.1900269865932915e-05,
"loss": 0.0502,
"step": 57150
},
{
"epoch": 0.8215203872061125,
"grad_norm": 0.9700308442115784,
"learning_rate": 2.189305419927747e-05,
"loss": 0.067,
"step": 57200
},
{
"epoch": 0.8222384994326913,
"grad_norm": 0.015162572264671326,
"learning_rate": 2.188583853262203e-05,
"loss": 0.0436,
"step": 57250
},
{
"epoch": 0.8229566116592701,
"grad_norm": 1.4316377639770508,
"learning_rate": 2.1878622865966588e-05,
"loss": 0.0339,
"step": 57300
},
{
"epoch": 0.8236747238858488,
"grad_norm": 0.008070161566138268,
"learning_rate": 2.1871407199311146e-05,
"loss": 0.0334,
"step": 57350
},
{
"epoch": 0.8243928361124276,
"grad_norm": 0.1689612716436386,
"learning_rate": 2.1864191532655702e-05,
"loss": 0.0489,
"step": 57400
},
{
"epoch": 0.8251109483390064,
"grad_norm": 10.127570152282715,
"learning_rate": 2.185697586600026e-05,
"loss": 0.0928,
"step": 57450
},
{
"epoch": 0.8258290605655851,
"grad_norm": 0.34898841381073,
"learning_rate": 2.184976019934482e-05,
"loss": 0.0967,
"step": 57500
},
{
"epoch": 0.8265471727921639,
"grad_norm": 0.0451822392642498,
"learning_rate": 2.1842544532689375e-05,
"loss": 0.0426,
"step": 57550
},
{
"epoch": 0.8272652850187427,
"grad_norm": 0.016039999201893806,
"learning_rate": 2.1835328866033933e-05,
"loss": 0.0262,
"step": 57600
},
{
"epoch": 0.8279833972453216,
"grad_norm": 0.01928391307592392,
"learning_rate": 2.1828113199378492e-05,
"loss": 0.0528,
"step": 57650
},
{
"epoch": 0.8287015094719002,
"grad_norm": 0.024930743500590324,
"learning_rate": 2.1820897532723047e-05,
"loss": 0.0286,
"step": 57700
},
{
"epoch": 0.829419621698479,
"grad_norm": 1.6972570419311523,
"learning_rate": 2.1813681866067606e-05,
"loss": 0.0308,
"step": 57750
},
{
"epoch": 0.8301377339250579,
"grad_norm": 3.600100040435791,
"learning_rate": 2.1806466199412165e-05,
"loss": 0.0596,
"step": 57800
},
{
"epoch": 0.8308558461516365,
"grad_norm": 0.044996269047260284,
"learning_rate": 2.179925053275672e-05,
"loss": 0.0271,
"step": 57850
},
{
"epoch": 0.8315739583782153,
"grad_norm": 0.05033883824944496,
"learning_rate": 2.179203486610128e-05,
"loss": 0.0512,
"step": 57900
},
{
"epoch": 0.8322920706047942,
"grad_norm": 0.043055735528469086,
"learning_rate": 2.1784819199445838e-05,
"loss": 0.0393,
"step": 57950
},
{
"epoch": 0.8330101828313728,
"grad_norm": 0.013648094609379768,
"learning_rate": 2.1777603532790393e-05,
"loss": 0.0292,
"step": 58000
},
{
"epoch": 0.8337282950579517,
"grad_norm": 4.960720062255859,
"learning_rate": 2.1770387866134955e-05,
"loss": 0.0294,
"step": 58050
},
{
"epoch": 0.8344464072845305,
"grad_norm": 0.045770492404699326,
"learning_rate": 2.176317219947951e-05,
"loss": 0.042,
"step": 58100
},
{
"epoch": 0.8351645195111091,
"grad_norm": 0.026327569037675858,
"learning_rate": 2.1755956532824066e-05,
"loss": 0.0096,
"step": 58150
},
{
"epoch": 0.835882631737688,
"grad_norm": 0.015447799116373062,
"learning_rate": 2.1748740866168628e-05,
"loss": 0.0227,
"step": 58200
},
{
"epoch": 0.8366007439642668,
"grad_norm": 1.7331517934799194,
"learning_rate": 2.1741525199513183e-05,
"loss": 0.0297,
"step": 58250
},
{
"epoch": 0.8373188561908455,
"grad_norm": 0.031074518337845802,
"learning_rate": 2.173430953285774e-05,
"loss": 0.027,
"step": 58300
},
{
"epoch": 0.8380369684174243,
"grad_norm": 0.16247278451919556,
"learning_rate": 2.17270938662023e-05,
"loss": 0.0526,
"step": 58350
},
{
"epoch": 0.8387550806440031,
"grad_norm": 0.011318527162075043,
"learning_rate": 2.1719878199546856e-05,
"loss": 0.0411,
"step": 58400
},
{
"epoch": 0.8394731928705819,
"grad_norm": 0.21910272538661957,
"learning_rate": 2.1712662532891415e-05,
"loss": 0.0467,
"step": 58450
},
{
"epoch": 0.8401913050971606,
"grad_norm": 0.047498930245637894,
"learning_rate": 2.1705446866235973e-05,
"loss": 0.0387,
"step": 58500
},
{
"epoch": 0.8409094173237394,
"grad_norm": 0.8327946662902832,
"learning_rate": 2.169823119958053e-05,
"loss": 0.0189,
"step": 58550
},
{
"epoch": 0.8416275295503182,
"grad_norm": 0.08080939203500748,
"learning_rate": 2.1691015532925087e-05,
"loss": 0.0261,
"step": 58600
},
{
"epoch": 0.8423456417768969,
"grad_norm": 0.3818748891353607,
"learning_rate": 2.1683799866269646e-05,
"loss": 0.0157,
"step": 58650
},
{
"epoch": 0.8430637540034757,
"grad_norm": 0.173528254032135,
"learning_rate": 2.16765841996142e-05,
"loss": 0.0432,
"step": 58700
},
{
"epoch": 0.8437818662300545,
"grad_norm": 7.027990818023682,
"learning_rate": 2.1669368532958764e-05,
"loss": 0.0362,
"step": 58750
},
{
"epoch": 0.8444999784566332,
"grad_norm": 0.05787297338247299,
"learning_rate": 2.166215286630332e-05,
"loss": 0.0425,
"step": 58800
},
{
"epoch": 0.845218090683212,
"grad_norm": 0.012338577769696712,
"learning_rate": 2.1654937199647874e-05,
"loss": 0.0314,
"step": 58850
},
{
"epoch": 0.8459362029097908,
"grad_norm": 0.25181296467781067,
"learning_rate": 2.1647721532992436e-05,
"loss": 0.0335,
"step": 58900
},
{
"epoch": 0.8466543151363695,
"grad_norm": 3.5599021911621094,
"learning_rate": 2.164050586633699e-05,
"loss": 0.046,
"step": 58950
},
{
"epoch": 0.8473724273629483,
"grad_norm": 33.42082595825195,
"learning_rate": 2.1633434513014657e-05,
"loss": 0.0575,
"step": 59000
},
{
"epoch": 0.8480905395895271,
"grad_norm": 17.722803115844727,
"learning_rate": 2.1626218846359216e-05,
"loss": 0.024,
"step": 59050
},
{
"epoch": 0.8488086518161059,
"grad_norm": 0.07347019761800766,
"learning_rate": 2.1619003179703775e-05,
"loss": 0.0199,
"step": 59100
},
{
"epoch": 0.8495267640426846,
"grad_norm": 0.017910439521074295,
"learning_rate": 2.161178751304833e-05,
"loss": 0.0336,
"step": 59150
},
{
"epoch": 0.8502448762692634,
"grad_norm": 0.014987743459641933,
"learning_rate": 2.160457184639289e-05,
"loss": 0.0373,
"step": 59200
},
{
"epoch": 0.8509629884958422,
"grad_norm": 0.013330746442079544,
"learning_rate": 2.1597500493070555e-05,
"loss": 0.0231,
"step": 59250
},
{
"epoch": 0.8516811007224209,
"grad_norm": 0.2872665226459503,
"learning_rate": 2.1590284826415113e-05,
"loss": 0.0446,
"step": 59300
},
{
"epoch": 0.8523992129489997,
"grad_norm": 0.04101574420928955,
"learning_rate": 2.1583069159759672e-05,
"loss": 0.0475,
"step": 59350
},
{
"epoch": 0.8531173251755785,
"grad_norm": 0.025700349360704422,
"learning_rate": 2.1575853493104228e-05,
"loss": 0.042,
"step": 59400
},
{
"epoch": 0.8538354374021572,
"grad_norm": 0.022509241476655006,
"learning_rate": 2.1568637826448786e-05,
"loss": 0.036,
"step": 59450
},
{
"epoch": 0.854553549628736,
"grad_norm": 0.013904242776334286,
"learning_rate": 2.1561422159793345e-05,
"loss": 0.0276,
"step": 59500
},
{
"epoch": 0.8552716618553148,
"grad_norm": 0.025636285543441772,
"learning_rate": 2.15542064931379e-05,
"loss": 0.0475,
"step": 59550
},
{
"epoch": 0.8559897740818935,
"grad_norm": 0.13354450464248657,
"learning_rate": 2.154699082648246e-05,
"loss": 0.0432,
"step": 59600
},
{
"epoch": 0.8567078863084723,
"grad_norm": 0.057024553418159485,
"learning_rate": 2.1539775159827018e-05,
"loss": 0.0476,
"step": 59650
},
{
"epoch": 0.8574259985350511,
"grad_norm": 0.02197224833071232,
"learning_rate": 2.1532559493171573e-05,
"loss": 0.0551,
"step": 59700
},
{
"epoch": 0.8581441107616298,
"grad_norm": 8.308976173400879,
"learning_rate": 2.1525343826516132e-05,
"loss": 0.0309,
"step": 59750
},
{
"epoch": 0.8588622229882086,
"grad_norm": 0.030680665746331215,
"learning_rate": 2.151812815986069e-05,
"loss": 0.0315,
"step": 59800
},
{
"epoch": 0.8595803352147874,
"grad_norm": 0.03317005932331085,
"learning_rate": 2.151091249320525e-05,
"loss": 0.0189,
"step": 59850
},
{
"epoch": 0.8602984474413662,
"grad_norm": 0.0691901445388794,
"learning_rate": 2.1503696826549805e-05,
"loss": 0.0566,
"step": 59900
},
{
"epoch": 0.8610165596679449,
"grad_norm": 2.728785991668701,
"learning_rate": 2.1496481159894363e-05,
"loss": 0.0303,
"step": 59950
},
{
"epoch": 0.8617346718945237,
"grad_norm": 110.03862762451172,
"learning_rate": 2.1489265493238922e-05,
"loss": 0.0477,
"step": 60000
},
{
"epoch": 0.8624527841211025,
"grad_norm": 0.3772555887699127,
"learning_rate": 2.1482338453249695e-05,
"loss": 0.0699,
"step": 60050
},
{
"epoch": 0.8631708963476812,
"grad_norm": 0.03029673732817173,
"learning_rate": 2.1475122786594254e-05,
"loss": 0.0445,
"step": 60100
},
{
"epoch": 0.86388900857426,
"grad_norm": 0.059018220752477646,
"learning_rate": 2.1467907119938812e-05,
"loss": 0.0407,
"step": 60150
},
{
"epoch": 0.8646071208008388,
"grad_norm": 0.018615849316120148,
"learning_rate": 2.1460691453283368e-05,
"loss": 0.0241,
"step": 60200
},
{
"epoch": 0.8653252330274175,
"grad_norm": 0.048536475747823715,
"learning_rate": 2.145347578662793e-05,
"loss": 0.0361,
"step": 60250
},
{
"epoch": 0.8660433452539963,
"grad_norm": 0.02451617270708084,
"learning_rate": 2.1446260119972485e-05,
"loss": 0.0208,
"step": 60300
},
{
"epoch": 0.8667614574805751,
"grad_norm": 43.35445022583008,
"learning_rate": 2.143904445331704e-05,
"loss": 0.0227,
"step": 60350
},
{
"epoch": 0.8674795697071538,
"grad_norm": 0.02983827330172062,
"learning_rate": 2.1431828786661603e-05,
"loss": 0.0637,
"step": 60400
},
{
"epoch": 0.8681976819337326,
"grad_norm": 0.034458402544260025,
"learning_rate": 2.1424613120006158e-05,
"loss": 0.0372,
"step": 60450
},
{
"epoch": 0.8689157941603114,
"grad_norm": 3.9216103553771973,
"learning_rate": 2.1417397453350713e-05,
"loss": 0.0657,
"step": 60500
},
{
"epoch": 0.8696339063868902,
"grad_norm": 0.036895278841257095,
"learning_rate": 2.1410181786695275e-05,
"loss": 0.0248,
"step": 60550
},
{
"epoch": 0.8703520186134689,
"grad_norm": 0.026578128337860107,
"learning_rate": 2.140296612003983e-05,
"loss": 0.0368,
"step": 60600
},
{
"epoch": 0.8710701308400477,
"grad_norm": 0.0387389212846756,
"learning_rate": 2.1395750453384386e-05,
"loss": 0.0618,
"step": 60650
},
{
"epoch": 0.8717882430666265,
"grad_norm": 0.021623022854328156,
"learning_rate": 2.1388534786728948e-05,
"loss": 0.0489,
"step": 60700
},
{
"epoch": 0.8725063552932052,
"grad_norm": 0.05269308388233185,
"learning_rate": 2.1381319120073503e-05,
"loss": 0.0339,
"step": 60750
},
{
"epoch": 0.873224467519784,
"grad_norm": 0.3148886561393738,
"learning_rate": 2.1374103453418062e-05,
"loss": 0.0268,
"step": 60800
},
{
"epoch": 0.8739425797463628,
"grad_norm": 0.022542864084243774,
"learning_rate": 2.136688778676262e-05,
"loss": 0.0567,
"step": 60850
},
{
"epoch": 0.8746606919729415,
"grad_norm": 0.04053495079278946,
"learning_rate": 2.1359672120107176e-05,
"loss": 0.0141,
"step": 60900
},
{
"epoch": 0.8753788041995203,
"grad_norm": 0.031727734953165054,
"learning_rate": 2.1352456453451735e-05,
"loss": 0.0357,
"step": 60950
},
{
"epoch": 0.8760969164260991,
"grad_norm": 0.019135266542434692,
"learning_rate": 2.1345240786796294e-05,
"loss": 0.0729,
"step": 61000
},
{
"epoch": 0.8768150286526778,
"grad_norm": 0.0378669910132885,
"learning_rate": 2.133802512014085e-05,
"loss": 0.0322,
"step": 61050
},
{
"epoch": 0.8775331408792566,
"grad_norm": 0.7534425258636475,
"learning_rate": 2.1330809453485408e-05,
"loss": 0.0449,
"step": 61100
},
{
"epoch": 0.8782512531058354,
"grad_norm": 0.028031494468450546,
"learning_rate": 2.1323593786829966e-05,
"loss": 0.0164,
"step": 61150
},
{
"epoch": 0.8789693653324141,
"grad_norm": 1.30186128616333,
"learning_rate": 2.1316378120174522e-05,
"loss": 0.0094,
"step": 61200
},
{
"epoch": 0.8796874775589929,
"grad_norm": 0.01870095543563366,
"learning_rate": 2.130916245351908e-05,
"loss": 0.0413,
"step": 61250
},
{
"epoch": 0.8804055897855717,
"grad_norm": 0.010914398357272148,
"learning_rate": 2.130194678686364e-05,
"loss": 0.0453,
"step": 61300
},
{
"epoch": 0.8811237020121505,
"grad_norm": 0.02804506942629814,
"learning_rate": 2.1294731120208195e-05,
"loss": 0.0662,
"step": 61350
},
{
"epoch": 0.8818418142387292,
"grad_norm": 0.022548319771885872,
"learning_rate": 2.1287515453552753e-05,
"loss": 0.0409,
"step": 61400
},
{
"epoch": 0.882559926465308,
"grad_norm": 5.709629058837891,
"learning_rate": 2.1280299786897312e-05,
"loss": 0.0509,
"step": 61450
},
{
"epoch": 0.8832780386918868,
"grad_norm": 1.946844220161438,
"learning_rate": 2.127308412024187e-05,
"loss": 0.0619,
"step": 61500
},
{
"epoch": 0.8839961509184655,
"grad_norm": 0.020764781162142754,
"learning_rate": 2.126586845358643e-05,
"loss": 0.02,
"step": 61550
},
{
"epoch": 0.8847142631450443,
"grad_norm": 0.03979656100273132,
"learning_rate": 2.1258652786930985e-05,
"loss": 0.0477,
"step": 61600
},
{
"epoch": 0.8854323753716231,
"grad_norm": 1.5372118949890137,
"learning_rate": 2.1251437120275544e-05,
"loss": 0.0439,
"step": 61650
},
{
"epoch": 0.8861504875982018,
"grad_norm": 2.8347742557525635,
"learning_rate": 2.1244221453620102e-05,
"loss": 0.0552,
"step": 61700
},
{
"epoch": 0.8868685998247806,
"grad_norm": 0.5707604885101318,
"learning_rate": 2.1237005786964658e-05,
"loss": 0.0492,
"step": 61750
},
{
"epoch": 0.8875867120513594,
"grad_norm": 0.018220653757452965,
"learning_rate": 2.1229790120309216e-05,
"loss": 0.0215,
"step": 61800
},
{
"epoch": 0.8883048242779381,
"grad_norm": 0.025390001013875008,
"learning_rate": 2.1222574453653775e-05,
"loss": 0.0496,
"step": 61850
},
{
"epoch": 0.8890229365045169,
"grad_norm": 0.015123003162443638,
"learning_rate": 2.121535878699833e-05,
"loss": 0.0103,
"step": 61900
},
{
"epoch": 0.8897410487310957,
"grad_norm": 0.02207750454545021,
"learning_rate": 2.120814312034289e-05,
"loss": 0.0303,
"step": 61950
},
{
"epoch": 0.8904591609576744,
"grad_norm": 0.30433520674705505,
"learning_rate": 2.1200927453687448e-05,
"loss": 0.0432,
"step": 62000
},
{
"epoch": 0.8911772731842532,
"grad_norm": 0.3474635183811188,
"learning_rate": 2.1193711787032003e-05,
"loss": 0.0413,
"step": 62050
},
{
"epoch": 0.891895385410832,
"grad_norm": 0.011259687133133411,
"learning_rate": 2.1186496120376562e-05,
"loss": 0.0248,
"step": 62100
},
{
"epoch": 0.8926134976374108,
"grad_norm": 0.32533684372901917,
"learning_rate": 2.117928045372112e-05,
"loss": 0.0463,
"step": 62150
},
{
"epoch": 0.8933316098639895,
"grad_norm": 12.774140357971191,
"learning_rate": 2.117206478706568e-05,
"loss": 0.0858,
"step": 62200
},
{
"epoch": 0.8940497220905683,
"grad_norm": 1.7531408071517944,
"learning_rate": 2.1164849120410235e-05,
"loss": 0.0398,
"step": 62250
},
{
"epoch": 0.8947678343171471,
"grad_norm": 0.021759033203125,
"learning_rate": 2.1157633453754793e-05,
"loss": 0.0274,
"step": 62300
},
{
"epoch": 0.8954859465437258,
"grad_norm": 0.01539881806820631,
"learning_rate": 2.1150417787099352e-05,
"loss": 0.0437,
"step": 62350
},
{
"epoch": 0.8962040587703046,
"grad_norm": 0.06527174264192581,
"learning_rate": 2.1143202120443907e-05,
"loss": 0.0316,
"step": 62400
},
{
"epoch": 0.8969221709968834,
"grad_norm": 0.07329878211021423,
"learning_rate": 2.1135986453788466e-05,
"loss": 0.0175,
"step": 62450
},
{
"epoch": 0.8976402832234621,
"grad_norm": 1.6692662239074707,
"learning_rate": 2.1128770787133025e-05,
"loss": 0.0746,
"step": 62500
},
{
"epoch": 0.8983583954500409,
"grad_norm": 0.054094549268484116,
"learning_rate": 2.112155512047758e-05,
"loss": 0.0319,
"step": 62550
},
{
"epoch": 0.8990765076766197,
"grad_norm": 0.02748558297753334,
"learning_rate": 2.111433945382214e-05,
"loss": 0.0273,
"step": 62600
},
{
"epoch": 0.8997946199031984,
"grad_norm": 0.05098741501569748,
"learning_rate": 2.1107123787166698e-05,
"loss": 0.0551,
"step": 62650
},
{
"epoch": 0.9005127321297772,
"grad_norm": 0.03316742926836014,
"learning_rate": 2.1099908120511253e-05,
"loss": 0.0349,
"step": 62700
},
{
"epoch": 0.901230844356356,
"grad_norm": 0.16046391427516937,
"learning_rate": 2.1092692453855812e-05,
"loss": 0.022,
"step": 62750
},
{
"epoch": 0.9019489565829348,
"grad_norm": 0.6568982601165771,
"learning_rate": 2.108547678720037e-05,
"loss": 0.0152,
"step": 62800
},
{
"epoch": 0.9026670688095135,
"grad_norm": 0.017578423023223877,
"learning_rate": 2.1078261120544926e-05,
"loss": 0.042,
"step": 62850
},
{
"epoch": 0.9033851810360923,
"grad_norm": 1.6092095375061035,
"learning_rate": 2.1071045453889488e-05,
"loss": 0.0478,
"step": 62900
},
{
"epoch": 0.9041032932626711,
"grad_norm": 0.12522783875465393,
"learning_rate": 2.1063829787234043e-05,
"loss": 0.045,
"step": 62950
},
{
"epoch": 0.9048214054892498,
"grad_norm": 0.016538824886083603,
"learning_rate": 2.10566141205786e-05,
"loss": 0.0262,
"step": 63000
},
{
"epoch": 0.9055395177158286,
"grad_norm": 0.019631749019026756,
"learning_rate": 2.104939845392316e-05,
"loss": 0.0338,
"step": 63050
},
{
"epoch": 0.9062576299424074,
"grad_norm": 2.552657127380371,
"learning_rate": 2.1042182787267716e-05,
"loss": 0.0543,
"step": 63100
},
{
"epoch": 0.9069757421689861,
"grad_norm": 8.885793685913086,
"learning_rate": 2.103496712061227e-05,
"loss": 0.0543,
"step": 63150
},
{
"epoch": 0.9076938543955649,
"grad_norm": 0.014104747213423252,
"learning_rate": 2.1027751453956833e-05,
"loss": 0.0587,
"step": 63200
},
{
"epoch": 0.9084119666221437,
"grad_norm": 0.07887265086174011,
"learning_rate": 2.102053578730139e-05,
"loss": 0.0164,
"step": 63250
},
{
"epoch": 0.9091300788487224,
"grad_norm": 0.018778778612613678,
"learning_rate": 2.1013320120645944e-05,
"loss": 0.027,
"step": 63300
},
{
"epoch": 0.9098481910753012,
"grad_norm": 0.05986733362078667,
"learning_rate": 2.1006104453990506e-05,
"loss": 0.0321,
"step": 63350
},
{
"epoch": 0.91056630330188,
"grad_norm": 0.03736149147152901,
"learning_rate": 2.099888878733506e-05,
"loss": 0.0349,
"step": 63400
},
{
"epoch": 0.9112844155284587,
"grad_norm": 9.391321182250977,
"learning_rate": 2.099167312067962e-05,
"loss": 0.0424,
"step": 63450
},
{
"epoch": 0.9120025277550375,
"grad_norm": 0.01603916846215725,
"learning_rate": 2.098445745402418e-05,
"loss": 0.0337,
"step": 63500
},
{
"epoch": 0.9127206399816163,
"grad_norm": 5.215839385986328,
"learning_rate": 2.0977241787368734e-05,
"loss": 0.0306,
"step": 63550
},
{
"epoch": 0.9134387522081951,
"grad_norm": 0.03886263445019722,
"learning_rate": 2.0970026120713296e-05,
"loss": 0.0331,
"step": 63600
},
{
"epoch": 0.9141568644347738,
"grad_norm": 0.3178030848503113,
"learning_rate": 2.0962810454057852e-05,
"loss": 0.0417,
"step": 63650
},
{
"epoch": 0.9148749766613526,
"grad_norm": 0.026981327682733536,
"learning_rate": 2.0955594787402407e-05,
"loss": 0.043,
"step": 63700
},
{
"epoch": 0.9155930888879314,
"grad_norm": 0.019250132143497467,
"learning_rate": 2.094837912074697e-05,
"loss": 0.0451,
"step": 63750
},
{
"epoch": 0.9163112011145101,
"grad_norm": 0.016561010852456093,
"learning_rate": 2.0941163454091525e-05,
"loss": 0.0355,
"step": 63800
},
{
"epoch": 0.917029313341089,
"grad_norm": 0.011019432917237282,
"learning_rate": 2.093394778743608e-05,
"loss": 0.0514,
"step": 63850
},
{
"epoch": 0.9177474255676678,
"grad_norm": 2.2995824813842773,
"learning_rate": 2.0926732120780642e-05,
"loss": 0.0302,
"step": 63900
},
{
"epoch": 0.9184655377942464,
"grad_norm": 0.013754851184785366,
"learning_rate": 2.0919516454125197e-05,
"loss": 0.0452,
"step": 63950
},
{
"epoch": 0.9191836500208252,
"grad_norm": 0.028004441410303116,
"learning_rate": 2.0912300787469753e-05,
"loss": 0.0459,
"step": 64000
},
{
"epoch": 0.919901762247404,
"grad_norm": 0.016767442226409912,
"learning_rate": 2.0905085120814315e-05,
"loss": 0.0177,
"step": 64050
},
{
"epoch": 0.9206198744739827,
"grad_norm": 3.818044900894165,
"learning_rate": 2.089801376749198e-05,
"loss": 0.064,
"step": 64100
},
{
"epoch": 0.9213379867005616,
"grad_norm": 0.04125185310840607,
"learning_rate": 2.0890798100836536e-05,
"loss": 0.0469,
"step": 64150
},
{
"epoch": 0.9220560989271404,
"grad_norm": 0.02808566577732563,
"learning_rate": 2.0883582434181095e-05,
"loss": 0.0454,
"step": 64200
},
{
"epoch": 0.922774211153719,
"grad_norm": 0.03282586857676506,
"learning_rate": 2.0876366767525653e-05,
"loss": 0.0288,
"step": 64250
},
{
"epoch": 0.9234923233802979,
"grad_norm": 0.018743840977549553,
"learning_rate": 2.086915110087021e-05,
"loss": 0.046,
"step": 64300
},
{
"epoch": 0.9242104356068767,
"grad_norm": 0.049901194870471954,
"learning_rate": 2.0861935434214767e-05,
"loss": 0.0595,
"step": 64350
},
{
"epoch": 0.9249285478334555,
"grad_norm": 34.75381851196289,
"learning_rate": 2.0854719767559326e-05,
"loss": 0.0414,
"step": 64400
},
{
"epoch": 0.9256466600600342,
"grad_norm": 0.02926110103726387,
"learning_rate": 2.084750410090388e-05,
"loss": 0.0481,
"step": 64450
},
{
"epoch": 0.926364772286613,
"grad_norm": 0.06628941744565964,
"learning_rate": 2.0840288434248444e-05,
"loss": 0.0238,
"step": 64500
},
{
"epoch": 0.9270828845131918,
"grad_norm": 0.5184805393218994,
"learning_rate": 2.0833072767593e-05,
"loss": 0.0169,
"step": 64550
},
{
"epoch": 0.9278009967397705,
"grad_norm": 0.03416599705815315,
"learning_rate": 2.0825857100937554e-05,
"loss": 0.0477,
"step": 64600
},
{
"epoch": 0.9285191089663493,
"grad_norm": 0.10312914848327637,
"learning_rate": 2.0818641434282116e-05,
"loss": 0.0476,
"step": 64650
},
{
"epoch": 0.9292372211929281,
"grad_norm": 0.9014744758605957,
"learning_rate": 2.0811425767626672e-05,
"loss": 0.0496,
"step": 64700
},
{
"epoch": 0.9299553334195068,
"grad_norm": 0.015887757763266563,
"learning_rate": 2.0804210100971227e-05,
"loss": 0.0311,
"step": 64750
},
{
"epoch": 0.9306734456460856,
"grad_norm": 0.5302422642707825,
"learning_rate": 2.079699443431579e-05,
"loss": 0.0492,
"step": 64800
},
{
"epoch": 0.9313915578726644,
"grad_norm": 0.29680338501930237,
"learning_rate": 2.0789778767660344e-05,
"loss": 0.0251,
"step": 64850
},
{
"epoch": 0.9321096700992431,
"grad_norm": 0.003984624985605478,
"learning_rate": 2.07825631010049e-05,
"loss": 0.0304,
"step": 64900
},
{
"epoch": 0.9328277823258219,
"grad_norm": 0.3101426064968109,
"learning_rate": 2.0775347434349462e-05,
"loss": 0.0151,
"step": 64950
},
{
"epoch": 0.9335458945524007,
"grad_norm": 0.15551728010177612,
"learning_rate": 2.0768131767694017e-05,
"loss": 0.0299,
"step": 65000
},
{
"epoch": 0.9342640067789795,
"grad_norm": 0.03081173077225685,
"learning_rate": 2.0760916101038573e-05,
"loss": 0.0626,
"step": 65050
},
{
"epoch": 0.9349821190055582,
"grad_norm": 0.07034569978713989,
"learning_rate": 2.0753700434383135e-05,
"loss": 0.0275,
"step": 65100
},
{
"epoch": 0.935700231232137,
"grad_norm": 21.155750274658203,
"learning_rate": 2.074648476772769e-05,
"loss": 0.0584,
"step": 65150
},
{
"epoch": 0.9364183434587158,
"grad_norm": 0.02575441636145115,
"learning_rate": 2.073926910107225e-05,
"loss": 0.0262,
"step": 65200
},
{
"epoch": 0.9371364556852945,
"grad_norm": 0.5820505023002625,
"learning_rate": 2.0732053434416808e-05,
"loss": 0.0502,
"step": 65250
},
{
"epoch": 0.9378545679118733,
"grad_norm": 0.1326725035905838,
"learning_rate": 2.0724837767761363e-05,
"loss": 0.026,
"step": 65300
},
{
"epoch": 0.9385726801384521,
"grad_norm": 0.29643315076828003,
"learning_rate": 2.071762210110592e-05,
"loss": 0.0224,
"step": 65350
},
{
"epoch": 0.9392907923650308,
"grad_norm": 0.006369379349052906,
"learning_rate": 2.071040643445048e-05,
"loss": 0.0427,
"step": 65400
},
{
"epoch": 0.9400089045916096,
"grad_norm": 0.052178192883729935,
"learning_rate": 2.0703190767795036e-05,
"loss": 0.0478,
"step": 65450
},
{
"epoch": 0.9407270168181884,
"grad_norm": 0.07716100662946701,
"learning_rate": 2.0695975101139594e-05,
"loss": 0.0271,
"step": 65500
},
{
"epoch": 0.9414451290447671,
"grad_norm": 0.019032573327422142,
"learning_rate": 2.0688759434484153e-05,
"loss": 0.0303,
"step": 65550
},
{
"epoch": 0.9421632412713459,
"grad_norm": 0.01876581460237503,
"learning_rate": 2.068154376782871e-05,
"loss": 0.0201,
"step": 65600
},
{
"epoch": 0.9428813534979247,
"grad_norm": 0.013573722913861275,
"learning_rate": 2.0674328101173267e-05,
"loss": 0.0584,
"step": 65650
},
{
"epoch": 0.9435994657245034,
"grad_norm": 0.0182351041585207,
"learning_rate": 2.0667112434517826e-05,
"loss": 0.0464,
"step": 65700
},
{
"epoch": 0.9443175779510822,
"grad_norm": 0.044160228222608566,
"learning_rate": 2.065989676786238e-05,
"loss": 0.0364,
"step": 65750
},
{
"epoch": 0.945035690177661,
"grad_norm": 0.17557665705680847,
"learning_rate": 2.0652681101206943e-05,
"loss": 0.0362,
"step": 65800
},
{
"epoch": 0.9457538024042398,
"grad_norm": 0.008380572311580181,
"learning_rate": 2.06454654345515e-05,
"loss": 0.0328,
"step": 65850
},
{
"epoch": 0.9464719146308185,
"grad_norm": 0.020001381635665894,
"learning_rate": 2.0638249767896057e-05,
"loss": 0.0243,
"step": 65900
},
{
"epoch": 0.9471900268573973,
"grad_norm": 0.14650805294513702,
"learning_rate": 2.0631034101240616e-05,
"loss": 0.0409,
"step": 65950
},
{
"epoch": 0.9479081390839761,
"grad_norm": 0.016235264018177986,
"learning_rate": 2.062381843458517e-05,
"loss": 0.0453,
"step": 66000
},
{
"epoch": 0.9486262513105548,
"grad_norm": 2.048232316970825,
"learning_rate": 2.061660276792973e-05,
"loss": 0.0701,
"step": 66050
},
{
"epoch": 0.9493443635371336,
"grad_norm": 0.05848151445388794,
"learning_rate": 2.060938710127429e-05,
"loss": 0.0345,
"step": 66100
},
{
"epoch": 0.9500624757637124,
"grad_norm": 0.03294019028544426,
"learning_rate": 2.0602171434618844e-05,
"loss": 0.0328,
"step": 66150
},
{
"epoch": 0.9507805879902911,
"grad_norm": 0.01830822415649891,
"learning_rate": 2.0594955767963403e-05,
"loss": 0.0138,
"step": 66200
},
{
"epoch": 0.9514987002168699,
"grad_norm": 0.01788957603275776,
"learning_rate": 2.058774010130796e-05,
"loss": 0.015,
"step": 66250
},
{
"epoch": 0.9522168124434487,
"grad_norm": 0.6261688470840454,
"learning_rate": 2.0580524434652517e-05,
"loss": 0.0485,
"step": 66300
},
{
"epoch": 0.9529349246700274,
"grad_norm": 0.030795125290751457,
"learning_rate": 2.0573308767997076e-05,
"loss": 0.0503,
"step": 66350
},
{
"epoch": 0.9536530368966062,
"grad_norm": 0.02338263764977455,
"learning_rate": 2.0566093101341634e-05,
"loss": 0.0448,
"step": 66400
},
{
"epoch": 0.954371149123185,
"grad_norm": 0.39836427569389343,
"learning_rate": 2.055887743468619e-05,
"loss": 0.0601,
"step": 66450
},
{
"epoch": 0.9550892613497638,
"grad_norm": 0.0230514295399189,
"learning_rate": 2.055166176803075e-05,
"loss": 0.0371,
"step": 66500
},
{
"epoch": 0.9558073735763425,
"grad_norm": 0.02001722902059555,
"learning_rate": 2.0544446101375307e-05,
"loss": 0.0435,
"step": 66550
},
{
"epoch": 0.9565254858029213,
"grad_norm": 0.03101685456931591,
"learning_rate": 2.0537230434719866e-05,
"loss": 0.0594,
"step": 66600
},
{
"epoch": 0.9572435980295001,
"grad_norm": 0.02627638168632984,
"learning_rate": 2.053001476806442e-05,
"loss": 0.0273,
"step": 66650
},
{
"epoch": 0.9579617102560788,
"grad_norm": 0.02331429533660412,
"learning_rate": 2.052279910140898e-05,
"loss": 0.0452,
"step": 66700
},
{
"epoch": 0.9586798224826576,
"grad_norm": 0.04160163179039955,
"learning_rate": 2.051558343475354e-05,
"loss": 0.031,
"step": 66750
},
{
"epoch": 0.9593979347092364,
"grad_norm": 123.35272979736328,
"learning_rate": 2.0508367768098094e-05,
"loss": 0.0523,
"step": 66800
},
{
"epoch": 0.9601160469358151,
"grad_norm": 0.07898234575986862,
"learning_rate": 2.0501152101442653e-05,
"loss": 0.0292,
"step": 66850
},
{
"epoch": 0.9608341591623939,
"grad_norm": 1.0723378658294678,
"learning_rate": 2.049393643478721e-05,
"loss": 0.0563,
"step": 66900
},
{
"epoch": 0.9615522713889727,
"grad_norm": 0.06465094536542892,
"learning_rate": 2.0486720768131767e-05,
"loss": 0.0157,
"step": 66950
},
{
"epoch": 0.9622703836155514,
"grad_norm": 0.03426423668861389,
"learning_rate": 2.0479505101476326e-05,
"loss": 0.0287,
"step": 67000
},
{
"epoch": 0.9629884958421302,
"grad_norm": 0.013101859949529171,
"learning_rate": 2.0472289434820884e-05,
"loss": 0.0195,
"step": 67050
},
{
"epoch": 0.963706608068709,
"grad_norm": 3.0221145153045654,
"learning_rate": 2.046507376816544e-05,
"loss": 0.0597,
"step": 67100
},
{
"epoch": 0.9644247202952877,
"grad_norm": 0.02117176167666912,
"learning_rate": 2.045785810151e-05,
"loss": 0.0303,
"step": 67150
},
{
"epoch": 0.9651428325218665,
"grad_norm": 3.6087586879730225,
"learning_rate": 2.0450642434854557e-05,
"loss": 0.057,
"step": 67200
},
{
"epoch": 0.9658609447484453,
"grad_norm": 0.014843679033219814,
"learning_rate": 2.0443426768199112e-05,
"loss": 0.0242,
"step": 67250
},
{
"epoch": 0.9665790569750241,
"grad_norm": 0.5270505547523499,
"learning_rate": 2.0436211101543674e-05,
"loss": 0.0342,
"step": 67300
},
{
"epoch": 0.9672971692016028,
"grad_norm": 0.031143778935074806,
"learning_rate": 2.042899543488823e-05,
"loss": 0.0415,
"step": 67350
},
{
"epoch": 0.9680152814281816,
"grad_norm": 30.298582077026367,
"learning_rate": 2.0421779768232785e-05,
"loss": 0.0355,
"step": 67400
},
{
"epoch": 0.9687333936547604,
"grad_norm": 0.10604274272918701,
"learning_rate": 2.0414564101577347e-05,
"loss": 0.0599,
"step": 67450
},
{
"epoch": 0.9694515058813391,
"grad_norm": 0.03374650701880455,
"learning_rate": 2.0407348434921903e-05,
"loss": 0.0312,
"step": 67500
},
{
"epoch": 0.9701696181079179,
"grad_norm": 14.213685989379883,
"learning_rate": 2.0400132768266458e-05,
"loss": 0.0481,
"step": 67550
},
{
"epoch": 0.9708877303344967,
"grad_norm": 1.0365869998931885,
"learning_rate": 2.039291710161102e-05,
"loss": 0.0324,
"step": 67600
},
{
"epoch": 0.9716058425610754,
"grad_norm": 0.015141056850552559,
"learning_rate": 2.0385701434955575e-05,
"loss": 0.0085,
"step": 67650
},
{
"epoch": 0.9723239547876542,
"grad_norm": 0.015030864626169205,
"learning_rate": 2.0378485768300134e-05,
"loss": 0.0297,
"step": 67700
},
{
"epoch": 0.973042067014233,
"grad_norm": 0.0173374954611063,
"learning_rate": 2.0371270101644693e-05,
"loss": 0.0255,
"step": 67750
},
{
"epoch": 0.9737601792408117,
"grad_norm": 0.011118406429886818,
"learning_rate": 2.0364054434989248e-05,
"loss": 0.0145,
"step": 67800
},
{
"epoch": 0.9744782914673905,
"grad_norm": 0.0857364684343338,
"learning_rate": 2.0356838768333807e-05,
"loss": 0.0356,
"step": 67850
},
{
"epoch": 0.9751964036939693,
"grad_norm": 0.0073992363177239895,
"learning_rate": 2.0349623101678366e-05,
"loss": 0.026,
"step": 67900
},
{
"epoch": 0.975914515920548,
"grad_norm": 0.03137329965829849,
"learning_rate": 2.034240743502292e-05,
"loss": 0.0345,
"step": 67950
},
{
"epoch": 0.9766326281471268,
"grad_norm": 4.4521050453186035,
"learning_rate": 2.0335191768367483e-05,
"loss": 0.0393,
"step": 68000
},
{
"epoch": 0.9773507403737056,
"grad_norm": 0.43478429317474365,
"learning_rate": 2.032797610171204e-05,
"loss": 0.0281,
"step": 68050
},
{
"epoch": 0.9780688526002844,
"grad_norm": 1.3469187021255493,
"learning_rate": 2.0320760435056594e-05,
"loss": 0.0349,
"step": 68100
},
{
"epoch": 0.9787869648268631,
"grad_norm": 2.2233638763427734,
"learning_rate": 2.0313544768401156e-05,
"loss": 0.0639,
"step": 68150
},
{
"epoch": 0.9795050770534419,
"grad_norm": 2.721013307571411,
"learning_rate": 2.030632910174571e-05,
"loss": 0.0153,
"step": 68200
},
{
"epoch": 0.9802231892800207,
"grad_norm": 0.013984871096909046,
"learning_rate": 2.0299113435090266e-05,
"loss": 0.0468,
"step": 68250
},
{
"epoch": 0.9809413015065994,
"grad_norm": 0.13640810549259186,
"learning_rate": 2.029189776843483e-05,
"loss": 0.0539,
"step": 68300
},
{
"epoch": 0.9816594137331782,
"grad_norm": 0.023162130266427994,
"learning_rate": 2.0284682101779384e-05,
"loss": 0.048,
"step": 68350
},
{
"epoch": 0.982377525959757,
"grad_norm": 0.007806818932294846,
"learning_rate": 2.027746643512394e-05,
"loss": 0.0221,
"step": 68400
},
{
"epoch": 0.9830956381863357,
"grad_norm": 0.1331024169921875,
"learning_rate": 2.02702507684685e-05,
"loss": 0.045,
"step": 68450
},
{
"epoch": 0.9838137504129145,
"grad_norm": 0.027483688667416573,
"learning_rate": 2.0263035101813057e-05,
"loss": 0.02,
"step": 68500
},
{
"epoch": 0.9845318626394933,
"grad_norm": 0.017357712611556053,
"learning_rate": 2.0255819435157612e-05,
"loss": 0.0299,
"step": 68550
},
{
"epoch": 0.985249974866072,
"grad_norm": 0.0151291498914361,
"learning_rate": 2.0248603768502174e-05,
"loss": 0.0305,
"step": 68600
},
{
"epoch": 0.9859680870926508,
"grad_norm": 0.03477681055665016,
"learning_rate": 2.024138810184673e-05,
"loss": 0.0561,
"step": 68650
},
{
"epoch": 0.9866861993192296,
"grad_norm": 0.016767321154475212,
"learning_rate": 2.0234172435191288e-05,
"loss": 0.0401,
"step": 68700
},
{
"epoch": 0.9874043115458084,
"grad_norm": 0.32830455899238586,
"learning_rate": 2.0226956768535847e-05,
"loss": 0.0557,
"step": 68750
},
{
"epoch": 0.9881224237723871,
"grad_norm": 5.949401378631592,
"learning_rate": 2.0219741101880402e-05,
"loss": 0.0062,
"step": 68800
},
{
"epoch": 0.9888405359989659,
"grad_norm": 0.01927843689918518,
"learning_rate": 2.021252543522496e-05,
"loss": 0.0371,
"step": 68850
},
{
"epoch": 0.9895586482255447,
"grad_norm": 0.024355091154575348,
"learning_rate": 2.020530976856952e-05,
"loss": 0.0392,
"step": 68900
},
{
"epoch": 0.9902767604521234,
"grad_norm": 2.1073899269104004,
"learning_rate": 2.0198094101914075e-05,
"loss": 0.0172,
"step": 68950
},
{
"epoch": 0.9909948726787022,
"grad_norm": 30.42909049987793,
"learning_rate": 2.0190878435258634e-05,
"loss": 0.0356,
"step": 69000
},
{
"epoch": 0.991712984905281,
"grad_norm": 2.0507357120513916,
"learning_rate": 2.0183662768603193e-05,
"loss": 0.0344,
"step": 69050
},
{
"epoch": 0.9924310971318597,
"grad_norm": 0.010869896970689297,
"learning_rate": 2.0176447101947748e-05,
"loss": 0.0201,
"step": 69100
},
{
"epoch": 0.9931492093584385,
"grad_norm": 170.85975646972656,
"learning_rate": 2.0169231435292307e-05,
"loss": 0.0148,
"step": 69150
},
{
"epoch": 0.9938673215850173,
"grad_norm": 34.987667083740234,
"learning_rate": 2.0162015768636865e-05,
"loss": 0.0465,
"step": 69200
},
{
"epoch": 0.994585433811596,
"grad_norm": 4.180880546569824,
"learning_rate": 2.015480010198142e-05,
"loss": 0.0156,
"step": 69250
},
{
"epoch": 0.9953035460381748,
"grad_norm": 0.004112455993890762,
"learning_rate": 2.014758443532598e-05,
"loss": 0.0458,
"step": 69300
},
{
"epoch": 0.9960216582647536,
"grad_norm": 0.01970968022942543,
"learning_rate": 2.0140368768670538e-05,
"loss": 0.0239,
"step": 69350
},
{
"epoch": 0.9967397704913323,
"grad_norm": 0.024871932342648506,
"learning_rate": 2.0133153102015097e-05,
"loss": 0.0472,
"step": 69400
},
{
"epoch": 0.9974578827179111,
"grad_norm": 79.59233093261719,
"learning_rate": 2.0125937435359652e-05,
"loss": 0.042,
"step": 69450
},
{
"epoch": 0.9981759949444899,
"grad_norm": 0.008201238699257374,
"learning_rate": 2.011872176870421e-05,
"loss": 0.0375,
"step": 69500
},
{
"epoch": 0.9988941071710687,
"grad_norm": 5.825506210327148,
"learning_rate": 2.011150610204877e-05,
"loss": 0.0456,
"step": 69550
},
{
"epoch": 0.9996122193976474,
"grad_norm": 8.27955436706543,
"learning_rate": 2.0104290435393325e-05,
"loss": 0.0497,
"step": 69600
},
{
"epoch": 1.0,
"eval_accuracy": {
"accuracy": 0.9902156155185724
},
"eval_f1": {
"f1": 0.9816237212470244
},
"eval_loss": 0.04228377714753151,
"eval_precision": {
"precision": 0.9665604249667995
},
"eval_recall": {
"recall": 0.99716395621258
},
"eval_runtime": 478.7526,
"eval_samples_per_second": 581.731,
"eval_steps_per_second": 18.181,
"step": 69627
}
],
"logging_steps": 50,
"max_steps": 208881,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.931101885589811e+17,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}