icelandic_lora8_full / trainer_state.json
rominaoji's picture
Upload folder using huggingface_hub
03b2b18 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 193455,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.007753741180119408,
"grad_norm": 2.8055944442749023,
"learning_rate": 4.9870770980331344e-05,
"loss": 8.6951,
"step": 500
},
{
"epoch": 0.015507482360238816,
"grad_norm": 2.7864086627960205,
"learning_rate": 4.974154196066269e-05,
"loss": 6.4397,
"step": 1000
},
{
"epoch": 0.023261223540358224,
"grad_norm": 2.835425853729248,
"learning_rate": 4.961231294099403e-05,
"loss": 5.8474,
"step": 1500
},
{
"epoch": 0.03101496472047763,
"grad_norm": 3.0123701095581055,
"learning_rate": 4.9483083921325374e-05,
"loss": 5.5019,
"step": 2000
},
{
"epoch": 0.038768705900597035,
"grad_norm": 2.9796297550201416,
"learning_rate": 4.935385490165672e-05,
"loss": 5.2444,
"step": 2500
},
{
"epoch": 0.04652244708071645,
"grad_norm": 3.042416572570801,
"learning_rate": 4.9224625881988064e-05,
"loss": 5.0392,
"step": 3000
},
{
"epoch": 0.054276188260835856,
"grad_norm": 2.9367804527282715,
"learning_rate": 4.9095396862319405e-05,
"loss": 4.8613,
"step": 3500
},
{
"epoch": 0.06202992944095526,
"grad_norm": 2.9740755558013916,
"learning_rate": 4.8966167842650746e-05,
"loss": 4.7246,
"step": 4000
},
{
"epoch": 0.06978367062107467,
"grad_norm": 3.364341974258423,
"learning_rate": 4.8836938822982094e-05,
"loss": 4.5969,
"step": 4500
},
{
"epoch": 0.07753741180119407,
"grad_norm": 3.4089107513427734,
"learning_rate": 4.8707709803313436e-05,
"loss": 4.4929,
"step": 5000
},
{
"epoch": 0.08529115298131348,
"grad_norm": 3.324126720428467,
"learning_rate": 4.857848078364478e-05,
"loss": 4.3966,
"step": 5500
},
{
"epoch": 0.0930448941614329,
"grad_norm": 3.3835084438323975,
"learning_rate": 4.8449251763976125e-05,
"loss": 4.3176,
"step": 6000
},
{
"epoch": 0.1007986353415523,
"grad_norm": 3.6219263076782227,
"learning_rate": 4.8320022744307466e-05,
"loss": 4.2318,
"step": 6500
},
{
"epoch": 0.10855237652167171,
"grad_norm": 3.2887094020843506,
"learning_rate": 4.819079372463881e-05,
"loss": 4.161,
"step": 7000
},
{
"epoch": 0.11630611770179111,
"grad_norm": 3.6443707942962646,
"learning_rate": 4.806156470497015e-05,
"loss": 4.0995,
"step": 7500
},
{
"epoch": 0.12405985888191053,
"grad_norm": 3.623699903488159,
"learning_rate": 4.79323356853015e-05,
"loss": 4.0583,
"step": 8000
},
{
"epoch": 0.13181360006202994,
"grad_norm": 3.6302268505096436,
"learning_rate": 4.780310666563284e-05,
"loss": 4.0,
"step": 8500
},
{
"epoch": 0.13956734124214934,
"grad_norm": 3.6418278217315674,
"learning_rate": 4.767387764596418e-05,
"loss": 3.9486,
"step": 9000
},
{
"epoch": 0.14732108242226874,
"grad_norm": 3.635089159011841,
"learning_rate": 4.754464862629552e-05,
"loss": 3.896,
"step": 9500
},
{
"epoch": 0.15507482360238814,
"grad_norm": 3.8315062522888184,
"learning_rate": 4.741541960662686e-05,
"loss": 3.8629,
"step": 10000
},
{
"epoch": 0.16282856478250757,
"grad_norm": 3.7439329624176025,
"learning_rate": 4.728619058695821e-05,
"loss": 3.8255,
"step": 10500
},
{
"epoch": 0.17058230596262697,
"grad_norm": 3.732562303543091,
"learning_rate": 4.715696156728955e-05,
"loss": 3.7887,
"step": 11000
},
{
"epoch": 0.17833604714274637,
"grad_norm": 3.7784509658813477,
"learning_rate": 4.702773254762089e-05,
"loss": 3.749,
"step": 11500
},
{
"epoch": 0.1860897883228658,
"grad_norm": 3.9984617233276367,
"learning_rate": 4.6898503527952234e-05,
"loss": 3.7065,
"step": 12000
},
{
"epoch": 0.1938435295029852,
"grad_norm": 3.954652786254883,
"learning_rate": 4.676927450828358e-05,
"loss": 3.6931,
"step": 12500
},
{
"epoch": 0.2015972706831046,
"grad_norm": 3.9958720207214355,
"learning_rate": 4.6640045488614924e-05,
"loss": 3.6545,
"step": 13000
},
{
"epoch": 0.209351011863224,
"grad_norm": 3.922849178314209,
"learning_rate": 4.6510816468946265e-05,
"loss": 3.6207,
"step": 13500
},
{
"epoch": 0.21710475304334342,
"grad_norm": 4.092968463897705,
"learning_rate": 4.638158744927761e-05,
"loss": 3.5974,
"step": 14000
},
{
"epoch": 0.22485849422346282,
"grad_norm": 3.8097622394561768,
"learning_rate": 4.6252358429608954e-05,
"loss": 3.5775,
"step": 14500
},
{
"epoch": 0.23261223540358222,
"grad_norm": 4.094768047332764,
"learning_rate": 4.6123129409940296e-05,
"loss": 3.5543,
"step": 15000
},
{
"epoch": 0.24036597658370162,
"grad_norm": 3.8095757961273193,
"learning_rate": 4.5993900390271644e-05,
"loss": 3.5392,
"step": 15500
},
{
"epoch": 0.24811971776382105,
"grad_norm": 3.8520195484161377,
"learning_rate": 4.5864671370602985e-05,
"loss": 3.5047,
"step": 16000
},
{
"epoch": 0.2558734589439404,
"grad_norm": 4.280955791473389,
"learning_rate": 4.5735442350934326e-05,
"loss": 3.495,
"step": 16500
},
{
"epoch": 0.2636272001240599,
"grad_norm": 4.455172538757324,
"learning_rate": 4.5606213331265674e-05,
"loss": 3.4756,
"step": 17000
},
{
"epoch": 0.2713809413041793,
"grad_norm": 4.1695733070373535,
"learning_rate": 4.5476984311597016e-05,
"loss": 3.4505,
"step": 17500
},
{
"epoch": 0.2791346824842987,
"grad_norm": 4.223578453063965,
"learning_rate": 4.534775529192836e-05,
"loss": 3.4446,
"step": 18000
},
{
"epoch": 0.2868884236644181,
"grad_norm": 4.616567134857178,
"learning_rate": 4.52185262722597e-05,
"loss": 3.4202,
"step": 18500
},
{
"epoch": 0.2946421648445375,
"grad_norm": 4.264205455780029,
"learning_rate": 4.5089297252591046e-05,
"loss": 3.4024,
"step": 19000
},
{
"epoch": 0.3023959060246569,
"grad_norm": 4.493732929229736,
"learning_rate": 4.496006823292239e-05,
"loss": 3.397,
"step": 19500
},
{
"epoch": 0.3101496472047763,
"grad_norm": 4.321922779083252,
"learning_rate": 4.483083921325373e-05,
"loss": 3.3678,
"step": 20000
},
{
"epoch": 0.31790338838489574,
"grad_norm": 4.248241424560547,
"learning_rate": 4.470161019358508e-05,
"loss": 3.3634,
"step": 20500
},
{
"epoch": 0.32565712956501514,
"grad_norm": 4.490056037902832,
"learning_rate": 4.457238117391642e-05,
"loss": 3.3529,
"step": 21000
},
{
"epoch": 0.33341087074513454,
"grad_norm": 4.652819633483887,
"learning_rate": 4.444315215424776e-05,
"loss": 3.3491,
"step": 21500
},
{
"epoch": 0.34116461192525394,
"grad_norm": 4.65127420425415,
"learning_rate": 4.431392313457911e-05,
"loss": 3.3239,
"step": 22000
},
{
"epoch": 0.34891835310537334,
"grad_norm": 4.279469013214111,
"learning_rate": 4.418469411491045e-05,
"loss": 3.3061,
"step": 22500
},
{
"epoch": 0.35667209428549274,
"grad_norm": 4.562509059906006,
"learning_rate": 4.405546509524179e-05,
"loss": 3.3014,
"step": 23000
},
{
"epoch": 0.36442583546561214,
"grad_norm": 4.664194583892822,
"learning_rate": 4.392623607557314e-05,
"loss": 3.2963,
"step": 23500
},
{
"epoch": 0.3721795766457316,
"grad_norm": 4.531868934631348,
"learning_rate": 4.379700705590448e-05,
"loss": 3.2776,
"step": 24000
},
{
"epoch": 0.379933317825851,
"grad_norm": 4.553177833557129,
"learning_rate": 4.366777803623582e-05,
"loss": 3.2559,
"step": 24500
},
{
"epoch": 0.3876870590059704,
"grad_norm": 4.358414649963379,
"learning_rate": 4.353854901656716e-05,
"loss": 3.253,
"step": 25000
},
{
"epoch": 0.3954408001860898,
"grad_norm": 4.921834468841553,
"learning_rate": 4.3409319996898503e-05,
"loss": 3.257,
"step": 25500
},
{
"epoch": 0.4031945413662092,
"grad_norm": 4.523054599761963,
"learning_rate": 4.3280090977229845e-05,
"loss": 3.2368,
"step": 26000
},
{
"epoch": 0.4109482825463286,
"grad_norm": 4.663913249969482,
"learning_rate": 4.315086195756119e-05,
"loss": 3.2314,
"step": 26500
},
{
"epoch": 0.418702023726448,
"grad_norm": 4.529434680938721,
"learning_rate": 4.3021632937892534e-05,
"loss": 3.2216,
"step": 27000
},
{
"epoch": 0.4264557649065674,
"grad_norm": 4.806913375854492,
"learning_rate": 4.2892403918223875e-05,
"loss": 3.2176,
"step": 27500
},
{
"epoch": 0.43420950608668685,
"grad_norm": 4.8940019607543945,
"learning_rate": 4.276317489855522e-05,
"loss": 3.2061,
"step": 28000
},
{
"epoch": 0.44196324726680625,
"grad_norm": 4.507465839385986,
"learning_rate": 4.2633945878886565e-05,
"loss": 3.1957,
"step": 28500
},
{
"epoch": 0.44971698844692565,
"grad_norm": 5.141822338104248,
"learning_rate": 4.2504716859217906e-05,
"loss": 3.1849,
"step": 29000
},
{
"epoch": 0.45747072962704505,
"grad_norm": 4.68912935256958,
"learning_rate": 4.237548783954925e-05,
"loss": 3.1707,
"step": 29500
},
{
"epoch": 0.46522447080716445,
"grad_norm": 5.046304225921631,
"learning_rate": 4.2246258819880595e-05,
"loss": 3.1705,
"step": 30000
},
{
"epoch": 0.47297821198728385,
"grad_norm": 4.986753940582275,
"learning_rate": 4.211702980021194e-05,
"loss": 3.1641,
"step": 30500
},
{
"epoch": 0.48073195316740325,
"grad_norm": 5.041975498199463,
"learning_rate": 4.198780078054328e-05,
"loss": 3.1545,
"step": 31000
},
{
"epoch": 0.4884856943475227,
"grad_norm": 5.255220413208008,
"learning_rate": 4.1858571760874626e-05,
"loss": 3.1566,
"step": 31500
},
{
"epoch": 0.4962394355276421,
"grad_norm": 5.2884697914123535,
"learning_rate": 4.172934274120597e-05,
"loss": 3.1497,
"step": 32000
},
{
"epoch": 0.5039931767077614,
"grad_norm": 4.990237712860107,
"learning_rate": 4.160011372153731e-05,
"loss": 3.1417,
"step": 32500
},
{
"epoch": 0.5117469178878808,
"grad_norm": 5.165491104125977,
"learning_rate": 4.147088470186866e-05,
"loss": 3.1227,
"step": 33000
},
{
"epoch": 0.5195006590680004,
"grad_norm": 5.079947471618652,
"learning_rate": 4.13416556822e-05,
"loss": 3.1276,
"step": 33500
},
{
"epoch": 0.5272544002481198,
"grad_norm": 4.830618858337402,
"learning_rate": 4.121242666253134e-05,
"loss": 3.1195,
"step": 34000
},
{
"epoch": 0.5350081414282392,
"grad_norm": 5.067671775817871,
"learning_rate": 4.108319764286268e-05,
"loss": 3.1168,
"step": 34500
},
{
"epoch": 0.5427618826083586,
"grad_norm": 5.153751373291016,
"learning_rate": 4.095396862319403e-05,
"loss": 3.1087,
"step": 35000
},
{
"epoch": 0.550515623788478,
"grad_norm": 5.0933027267456055,
"learning_rate": 4.082473960352537e-05,
"loss": 3.0945,
"step": 35500
},
{
"epoch": 0.5582693649685974,
"grad_norm": 5.372694492340088,
"learning_rate": 4.069551058385671e-05,
"loss": 3.0906,
"step": 36000
},
{
"epoch": 0.5660231061487168,
"grad_norm": 5.414623737335205,
"learning_rate": 4.056628156418806e-05,
"loss": 3.0894,
"step": 36500
},
{
"epoch": 0.5737768473288362,
"grad_norm": 5.126060962677002,
"learning_rate": 4.04370525445194e-05,
"loss": 3.0848,
"step": 37000
},
{
"epoch": 0.5815305885089556,
"grad_norm": 4.877682209014893,
"learning_rate": 4.030782352485074e-05,
"loss": 3.0752,
"step": 37500
},
{
"epoch": 0.589284329689075,
"grad_norm": 5.10503625869751,
"learning_rate": 4.017859450518209e-05,
"loss": 3.0678,
"step": 38000
},
{
"epoch": 0.5970380708691944,
"grad_norm": 5.342639923095703,
"learning_rate": 4.004936548551343e-05,
"loss": 3.074,
"step": 38500
},
{
"epoch": 0.6047918120493138,
"grad_norm": 5.377997875213623,
"learning_rate": 3.992013646584477e-05,
"loss": 3.0633,
"step": 39000
},
{
"epoch": 0.6125455532294332,
"grad_norm": 5.535395622253418,
"learning_rate": 3.979090744617612e-05,
"loss": 3.0667,
"step": 39500
},
{
"epoch": 0.6202992944095526,
"grad_norm": 5.380632400512695,
"learning_rate": 3.966167842650746e-05,
"loss": 3.042,
"step": 40000
},
{
"epoch": 0.6280530355896721,
"grad_norm": 5.2183027267456055,
"learning_rate": 3.95324494068388e-05,
"loss": 3.0488,
"step": 40500
},
{
"epoch": 0.6358067767697915,
"grad_norm": 5.565989971160889,
"learning_rate": 3.9403220387170145e-05,
"loss": 3.0477,
"step": 41000
},
{
"epoch": 0.6435605179499109,
"grad_norm": 5.33280611038208,
"learning_rate": 3.9273991367501486e-05,
"loss": 3.0429,
"step": 41500
},
{
"epoch": 0.6513142591300303,
"grad_norm": 5.605506896972656,
"learning_rate": 3.9144762347832834e-05,
"loss": 3.0423,
"step": 42000
},
{
"epoch": 0.6590680003101497,
"grad_norm": 5.62191915512085,
"learning_rate": 3.9015533328164175e-05,
"loss": 3.0205,
"step": 42500
},
{
"epoch": 0.6668217414902691,
"grad_norm": 5.318583965301514,
"learning_rate": 3.888630430849552e-05,
"loss": 3.0277,
"step": 43000
},
{
"epoch": 0.6745754826703885,
"grad_norm": 5.763892650604248,
"learning_rate": 3.875707528882686e-05,
"loss": 3.0342,
"step": 43500
},
{
"epoch": 0.6823292238505079,
"grad_norm": 5.43319034576416,
"learning_rate": 3.86278462691582e-05,
"loss": 3.0134,
"step": 44000
},
{
"epoch": 0.6900829650306273,
"grad_norm": 5.857462406158447,
"learning_rate": 3.849861724948955e-05,
"loss": 3.0091,
"step": 44500
},
{
"epoch": 0.6978367062107467,
"grad_norm": 5.6617021560668945,
"learning_rate": 3.836938822982089e-05,
"loss": 3.0117,
"step": 45000
},
{
"epoch": 0.7055904473908661,
"grad_norm": 5.857166290283203,
"learning_rate": 3.824015921015223e-05,
"loss": 3.0189,
"step": 45500
},
{
"epoch": 0.7133441885709855,
"grad_norm": 5.425033092498779,
"learning_rate": 3.811093019048358e-05,
"loss": 3.019,
"step": 46000
},
{
"epoch": 0.7210979297511049,
"grad_norm": 5.529470920562744,
"learning_rate": 3.798170117081492e-05,
"loss": 3.002,
"step": 46500
},
{
"epoch": 0.7288516709312243,
"grad_norm": 5.534538745880127,
"learning_rate": 3.785247215114626e-05,
"loss": 2.995,
"step": 47000
},
{
"epoch": 0.7366054121113437,
"grad_norm": 5.707602024078369,
"learning_rate": 3.772324313147761e-05,
"loss": 2.9942,
"step": 47500
},
{
"epoch": 0.7443591532914632,
"grad_norm": 5.565040588378906,
"learning_rate": 3.759401411180895e-05,
"loss": 2.9947,
"step": 48000
},
{
"epoch": 0.7521128944715826,
"grad_norm": 5.714329242706299,
"learning_rate": 3.746478509214029e-05,
"loss": 2.9941,
"step": 48500
},
{
"epoch": 0.759866635651702,
"grad_norm": 5.562424182891846,
"learning_rate": 3.733555607247163e-05,
"loss": 2.9827,
"step": 49000
},
{
"epoch": 0.7676203768318214,
"grad_norm": 5.685086250305176,
"learning_rate": 3.720632705280298e-05,
"loss": 2.9812,
"step": 49500
},
{
"epoch": 0.7753741180119408,
"grad_norm": 5.563987731933594,
"learning_rate": 3.707709803313432e-05,
"loss": 2.9781,
"step": 50000
},
{
"epoch": 0.7831278591920602,
"grad_norm": 5.545105934143066,
"learning_rate": 3.694786901346566e-05,
"loss": 2.9586,
"step": 50500
},
{
"epoch": 0.7908816003721796,
"grad_norm": 5.9238386154174805,
"learning_rate": 3.681863999379701e-05,
"loss": 2.9713,
"step": 51000
},
{
"epoch": 0.798635341552299,
"grad_norm": 5.929417133331299,
"learning_rate": 3.668941097412835e-05,
"loss": 2.9623,
"step": 51500
},
{
"epoch": 0.8063890827324184,
"grad_norm": 5.734675884246826,
"learning_rate": 3.6560181954459694e-05,
"loss": 2.9659,
"step": 52000
},
{
"epoch": 0.8141428239125378,
"grad_norm": 5.726919651031494,
"learning_rate": 3.643095293479104e-05,
"loss": 2.9624,
"step": 52500
},
{
"epoch": 0.8218965650926572,
"grad_norm": 5.474340438842773,
"learning_rate": 3.630172391512238e-05,
"loss": 2.9597,
"step": 53000
},
{
"epoch": 0.8296503062727766,
"grad_norm": 5.879449367523193,
"learning_rate": 3.6172494895453725e-05,
"loss": 2.9601,
"step": 53500
},
{
"epoch": 0.837404047452896,
"grad_norm": 6.051661491394043,
"learning_rate": 3.604326587578507e-05,
"loss": 2.9606,
"step": 54000
},
{
"epoch": 0.8451577886330154,
"grad_norm": 6.062263488769531,
"learning_rate": 3.5914036856116414e-05,
"loss": 2.9573,
"step": 54500
},
{
"epoch": 0.8529115298131348,
"grad_norm": 5.804770469665527,
"learning_rate": 3.5784807836447755e-05,
"loss": 2.9512,
"step": 55000
},
{
"epoch": 0.8606652709932543,
"grad_norm": 5.899106025695801,
"learning_rate": 3.5655578816779097e-05,
"loss": 2.9563,
"step": 55500
},
{
"epoch": 0.8684190121733737,
"grad_norm": 5.6610894203186035,
"learning_rate": 3.5526349797110445e-05,
"loss": 2.9468,
"step": 56000
},
{
"epoch": 0.8761727533534931,
"grad_norm": 5.676564693450928,
"learning_rate": 3.5397120777441786e-05,
"loss": 2.9442,
"step": 56500
},
{
"epoch": 0.8839264945336125,
"grad_norm": 6.026761531829834,
"learning_rate": 3.526789175777313e-05,
"loss": 2.945,
"step": 57000
},
{
"epoch": 0.8916802357137319,
"grad_norm": 6.285312652587891,
"learning_rate": 3.5138662738104475e-05,
"loss": 2.935,
"step": 57500
},
{
"epoch": 0.8994339768938513,
"grad_norm": 5.787561416625977,
"learning_rate": 3.5009433718435817e-05,
"loss": 2.9395,
"step": 58000
},
{
"epoch": 0.9071877180739707,
"grad_norm": 5.658621311187744,
"learning_rate": 3.488020469876716e-05,
"loss": 2.9288,
"step": 58500
},
{
"epoch": 0.9149414592540901,
"grad_norm": 5.896640300750732,
"learning_rate": 3.47509756790985e-05,
"loss": 2.9351,
"step": 59000
},
{
"epoch": 0.9226952004342095,
"grad_norm": 6.219537734985352,
"learning_rate": 3.462174665942984e-05,
"loss": 2.9257,
"step": 59500
},
{
"epoch": 0.9304489416143289,
"grad_norm": 6.03794527053833,
"learning_rate": 3.449251763976118e-05,
"loss": 2.924,
"step": 60000
},
{
"epoch": 0.9382026827944483,
"grad_norm": 6.291288375854492,
"learning_rate": 3.436328862009253e-05,
"loss": 2.9141,
"step": 60500
},
{
"epoch": 0.9459564239745677,
"grad_norm": 6.239747524261475,
"learning_rate": 3.423405960042387e-05,
"loss": 2.9212,
"step": 61000
},
{
"epoch": 0.9537101651546871,
"grad_norm": 5.852220058441162,
"learning_rate": 3.410483058075521e-05,
"loss": 2.9187,
"step": 61500
},
{
"epoch": 0.9614639063348065,
"grad_norm": 5.890344142913818,
"learning_rate": 3.397560156108656e-05,
"loss": 2.9167,
"step": 62000
},
{
"epoch": 0.969217647514926,
"grad_norm": 6.417314529418945,
"learning_rate": 3.38463725414179e-05,
"loss": 2.9147,
"step": 62500
},
{
"epoch": 0.9769713886950454,
"grad_norm": 6.077672481536865,
"learning_rate": 3.371714352174924e-05,
"loss": 2.9144,
"step": 63000
},
{
"epoch": 0.9847251298751648,
"grad_norm": 6.114253520965576,
"learning_rate": 3.3587914502080584e-05,
"loss": 2.9025,
"step": 63500
},
{
"epoch": 0.9924788710552842,
"grad_norm": 6.094882965087891,
"learning_rate": 3.345868548241193e-05,
"loss": 2.9058,
"step": 64000
},
{
"epoch": 1.0002326122354035,
"grad_norm": 5.9865498542785645,
"learning_rate": 3.3329456462743274e-05,
"loss": 2.9067,
"step": 64500
},
{
"epoch": 1.007986353415523,
"grad_norm": 5.904115200042725,
"learning_rate": 3.3200227443074615e-05,
"loss": 2.8973,
"step": 65000
},
{
"epoch": 1.0157400945956423,
"grad_norm": 6.1497392654418945,
"learning_rate": 3.307099842340596e-05,
"loss": 2.897,
"step": 65500
},
{
"epoch": 1.0234938357757617,
"grad_norm": 6.136323928833008,
"learning_rate": 3.2941769403737304e-05,
"loss": 2.9023,
"step": 66000
},
{
"epoch": 1.0312475769558813,
"grad_norm": 6.581076145172119,
"learning_rate": 3.2812540384068646e-05,
"loss": 2.8962,
"step": 66500
},
{
"epoch": 1.0390013181360007,
"grad_norm": 6.236713886260986,
"learning_rate": 3.2683311364399994e-05,
"loss": 2.8953,
"step": 67000
},
{
"epoch": 1.0467550593161201,
"grad_norm": 6.375150680541992,
"learning_rate": 3.2554082344731335e-05,
"loss": 2.8951,
"step": 67500
},
{
"epoch": 1.0545088004962395,
"grad_norm": 6.454219818115234,
"learning_rate": 3.2424853325062676e-05,
"loss": 2.8902,
"step": 68000
},
{
"epoch": 1.062262541676359,
"grad_norm": 6.4552226066589355,
"learning_rate": 3.2295624305394024e-05,
"loss": 2.9028,
"step": 68500
},
{
"epoch": 1.0700162828564783,
"grad_norm": 6.323915481567383,
"learning_rate": 3.2166395285725366e-05,
"loss": 2.8793,
"step": 69000
},
{
"epoch": 1.0777700240365977,
"grad_norm": 6.413472652435303,
"learning_rate": 3.203716626605671e-05,
"loss": 2.8704,
"step": 69500
},
{
"epoch": 1.0855237652167171,
"grad_norm": 6.230509281158447,
"learning_rate": 3.190793724638805e-05,
"loss": 2.8867,
"step": 70000
},
{
"epoch": 1.0932775063968365,
"grad_norm": 6.334902286529541,
"learning_rate": 3.1778708226719396e-05,
"loss": 2.8851,
"step": 70500
},
{
"epoch": 1.101031247576956,
"grad_norm": 6.225689888000488,
"learning_rate": 3.164947920705074e-05,
"loss": 2.8783,
"step": 71000
},
{
"epoch": 1.1087849887570753,
"grad_norm": 6.088467597961426,
"learning_rate": 3.152025018738208e-05,
"loss": 2.8837,
"step": 71500
},
{
"epoch": 1.1165387299371947,
"grad_norm": 6.089876174926758,
"learning_rate": 3.139102116771343e-05,
"loss": 2.8681,
"step": 72000
},
{
"epoch": 1.1242924711173141,
"grad_norm": 6.693448066711426,
"learning_rate": 3.126179214804477e-05,
"loss": 2.8678,
"step": 72500
},
{
"epoch": 1.1320462122974335,
"grad_norm": 6.45464563369751,
"learning_rate": 3.113256312837611e-05,
"loss": 2.8717,
"step": 73000
},
{
"epoch": 1.139799953477553,
"grad_norm": 6.7853803634643555,
"learning_rate": 3.100333410870746e-05,
"loss": 2.8607,
"step": 73500
},
{
"epoch": 1.1475536946576723,
"grad_norm": 6.709279537200928,
"learning_rate": 3.08741050890388e-05,
"loss": 2.862,
"step": 74000
},
{
"epoch": 1.1553074358377917,
"grad_norm": 6.519172668457031,
"learning_rate": 3.074487606937014e-05,
"loss": 2.8704,
"step": 74500
},
{
"epoch": 1.1630611770179111,
"grad_norm": 6.5100507736206055,
"learning_rate": 3.061564704970148e-05,
"loss": 2.8636,
"step": 75000
},
{
"epoch": 1.1708149181980305,
"grad_norm": 6.233548164367676,
"learning_rate": 3.0486418030032826e-05,
"loss": 2.8773,
"step": 75500
},
{
"epoch": 1.17856865937815,
"grad_norm": 6.704037666320801,
"learning_rate": 3.0357189010364168e-05,
"loss": 2.8704,
"step": 76000
},
{
"epoch": 1.1863224005582693,
"grad_norm": 6.638470649719238,
"learning_rate": 3.0227959990695516e-05,
"loss": 2.8708,
"step": 76500
},
{
"epoch": 1.1940761417383887,
"grad_norm": 6.931632995605469,
"learning_rate": 3.0098730971026857e-05,
"loss": 2.8619,
"step": 77000
},
{
"epoch": 1.2018298829185081,
"grad_norm": 6.729213714599609,
"learning_rate": 2.99695019513582e-05,
"loss": 2.8558,
"step": 77500
},
{
"epoch": 1.2095836240986275,
"grad_norm": 6.931024551391602,
"learning_rate": 2.984027293168954e-05,
"loss": 2.8559,
"step": 78000
},
{
"epoch": 1.217337365278747,
"grad_norm": 6.658525466918945,
"learning_rate": 2.9711043912020888e-05,
"loss": 2.8573,
"step": 78500
},
{
"epoch": 1.2250911064588663,
"grad_norm": 6.135016918182373,
"learning_rate": 2.958181489235223e-05,
"loss": 2.8604,
"step": 79000
},
{
"epoch": 1.2328448476389857,
"grad_norm": 6.685146331787109,
"learning_rate": 2.945258587268357e-05,
"loss": 2.844,
"step": 79500
},
{
"epoch": 1.2405985888191053,
"grad_norm": 6.7349982261657715,
"learning_rate": 2.9323356853014915e-05,
"loss": 2.8451,
"step": 80000
},
{
"epoch": 1.2483523299992245,
"grad_norm": 6.538317680358887,
"learning_rate": 2.9194127833346256e-05,
"loss": 2.8531,
"step": 80500
},
{
"epoch": 1.2561060711793441,
"grad_norm": 6.23037576675415,
"learning_rate": 2.9064898813677598e-05,
"loss": 2.8454,
"step": 81000
},
{
"epoch": 1.2638598123594633,
"grad_norm": 6.324411392211914,
"learning_rate": 2.8935669794008946e-05,
"loss": 2.8462,
"step": 81500
},
{
"epoch": 1.271613553539583,
"grad_norm": 6.693195343017578,
"learning_rate": 2.8806440774340287e-05,
"loss": 2.8382,
"step": 82000
},
{
"epoch": 1.2793672947197021,
"grad_norm": 6.47921085357666,
"learning_rate": 2.8677211754671628e-05,
"loss": 2.8492,
"step": 82500
},
{
"epoch": 1.2871210358998217,
"grad_norm": 6.768202304840088,
"learning_rate": 2.8547982735002976e-05,
"loss": 2.8344,
"step": 83000
},
{
"epoch": 1.2948747770799411,
"grad_norm": 6.594978332519531,
"learning_rate": 2.8418753715334318e-05,
"loss": 2.8337,
"step": 83500
},
{
"epoch": 1.3026285182600605,
"grad_norm": 6.703185081481934,
"learning_rate": 2.828952469566566e-05,
"loss": 2.834,
"step": 84000
},
{
"epoch": 1.31038225944018,
"grad_norm": 6.614627838134766,
"learning_rate": 2.8160295675997e-05,
"loss": 2.835,
"step": 84500
},
{
"epoch": 1.3181360006202993,
"grad_norm": 6.38785457611084,
"learning_rate": 2.8031066656328348e-05,
"loss": 2.8439,
"step": 85000
},
{
"epoch": 1.3258897418004187,
"grad_norm": 6.479560852050781,
"learning_rate": 2.790183763665969e-05,
"loss": 2.8338,
"step": 85500
},
{
"epoch": 1.3336434829805381,
"grad_norm": 6.241596698760986,
"learning_rate": 2.777260861699103e-05,
"loss": 2.8304,
"step": 86000
},
{
"epoch": 1.3413972241606575,
"grad_norm": 7.731629371643066,
"learning_rate": 2.764337959732238e-05,
"loss": 2.828,
"step": 86500
},
{
"epoch": 1.349150965340777,
"grad_norm": 6.706104278564453,
"learning_rate": 2.751415057765372e-05,
"loss": 2.8297,
"step": 87000
},
{
"epoch": 1.3569047065208963,
"grad_norm": 6.772350311279297,
"learning_rate": 2.738492155798506e-05,
"loss": 2.8331,
"step": 87500
},
{
"epoch": 1.3646584477010157,
"grad_norm": 6.890201091766357,
"learning_rate": 2.7255692538316406e-05,
"loss": 2.8286,
"step": 88000
},
{
"epoch": 1.3724121888811351,
"grad_norm": 6.540558815002441,
"learning_rate": 2.7126463518647748e-05,
"loss": 2.8306,
"step": 88500
},
{
"epoch": 1.3801659300612545,
"grad_norm": 6.890176773071289,
"learning_rate": 2.699723449897909e-05,
"loss": 2.827,
"step": 89000
},
{
"epoch": 1.387919671241374,
"grad_norm": 6.634540557861328,
"learning_rate": 2.6868005479310437e-05,
"loss": 2.8263,
"step": 89500
},
{
"epoch": 1.3956734124214933,
"grad_norm": 7.228022575378418,
"learning_rate": 2.6738776459641778e-05,
"loss": 2.8365,
"step": 90000
},
{
"epoch": 1.4034271536016127,
"grad_norm": 6.7347869873046875,
"learning_rate": 2.660954743997312e-05,
"loss": 2.825,
"step": 90500
},
{
"epoch": 1.4111808947817321,
"grad_norm": 6.936458110809326,
"learning_rate": 2.6480318420304468e-05,
"loss": 2.8266,
"step": 91000
},
{
"epoch": 1.4189346359618515,
"grad_norm": 6.489315509796143,
"learning_rate": 2.635108940063581e-05,
"loss": 2.8173,
"step": 91500
},
{
"epoch": 1.426688377141971,
"grad_norm": 7.1031012535095215,
"learning_rate": 2.622186038096715e-05,
"loss": 2.8114,
"step": 92000
},
{
"epoch": 1.4344421183220903,
"grad_norm": 6.918934345245361,
"learning_rate": 2.609263136129849e-05,
"loss": 2.8252,
"step": 92500
},
{
"epoch": 1.4421958595022097,
"grad_norm": 6.686205863952637,
"learning_rate": 2.596340234162984e-05,
"loss": 2.8138,
"step": 93000
},
{
"epoch": 1.4499496006823291,
"grad_norm": 6.464860439300537,
"learning_rate": 2.583417332196118e-05,
"loss": 2.8284,
"step": 93500
},
{
"epoch": 1.4577033418624485,
"grad_norm": 6.871826171875,
"learning_rate": 2.5704944302292522e-05,
"loss": 2.8215,
"step": 94000
},
{
"epoch": 1.4654570830425682,
"grad_norm": 6.555510520935059,
"learning_rate": 2.557571528262387e-05,
"loss": 2.8166,
"step": 94500
},
{
"epoch": 1.4732108242226873,
"grad_norm": 6.931303977966309,
"learning_rate": 2.544648626295521e-05,
"loss": 2.8218,
"step": 95000
},
{
"epoch": 1.480964565402807,
"grad_norm": 6.583662509918213,
"learning_rate": 2.5317257243286553e-05,
"loss": 2.8075,
"step": 95500
},
{
"epoch": 1.4887183065829261,
"grad_norm": 6.624995231628418,
"learning_rate": 2.5188028223617897e-05,
"loss": 2.8281,
"step": 96000
},
{
"epoch": 1.4964720477630458,
"grad_norm": 6.899562835693359,
"learning_rate": 2.505879920394924e-05,
"loss": 2.8195,
"step": 96500
},
{
"epoch": 1.504225788943165,
"grad_norm": 6.846054553985596,
"learning_rate": 2.4929570184280583e-05,
"loss": 2.8149,
"step": 97000
},
{
"epoch": 1.5119795301232846,
"grad_norm": 6.590377330780029,
"learning_rate": 2.4800341164611925e-05,
"loss": 2.7984,
"step": 97500
},
{
"epoch": 1.5197332713034037,
"grad_norm": 6.771044731140137,
"learning_rate": 2.467111214494327e-05,
"loss": 2.82,
"step": 98000
},
{
"epoch": 1.5274870124835234,
"grad_norm": 6.996868133544922,
"learning_rate": 2.454188312527461e-05,
"loss": 2.803,
"step": 98500
},
{
"epoch": 1.5352407536636425,
"grad_norm": 6.782078742980957,
"learning_rate": 2.4412654105605955e-05,
"loss": 2.8156,
"step": 99000
},
{
"epoch": 1.5429944948437622,
"grad_norm": 7.141603469848633,
"learning_rate": 2.42834250859373e-05,
"loss": 2.8081,
"step": 99500
},
{
"epoch": 1.5507482360238816,
"grad_norm": 7.204538822174072,
"learning_rate": 2.415419606626864e-05,
"loss": 2.7939,
"step": 100000
},
{
"epoch": 1.558501977204001,
"grad_norm": 7.218080043792725,
"learning_rate": 2.4024967046599986e-05,
"loss": 2.8011,
"step": 100500
},
{
"epoch": 1.5662557183841204,
"grad_norm": 6.774955749511719,
"learning_rate": 2.3895738026931327e-05,
"loss": 2.8086,
"step": 101000
},
{
"epoch": 1.5740094595642398,
"grad_norm": 6.7942657470703125,
"learning_rate": 2.3766509007262672e-05,
"loss": 2.7981,
"step": 101500
},
{
"epoch": 1.5817632007443592,
"grad_norm": 6.575582027435303,
"learning_rate": 2.3637279987594017e-05,
"loss": 2.8048,
"step": 102000
},
{
"epoch": 1.5895169419244786,
"grad_norm": 6.921658515930176,
"learning_rate": 2.3508050967925358e-05,
"loss": 2.8001,
"step": 102500
},
{
"epoch": 1.597270683104598,
"grad_norm": 7.207976341247559,
"learning_rate": 2.3378821948256703e-05,
"loss": 2.8112,
"step": 103000
},
{
"epoch": 1.6050244242847174,
"grad_norm": 7.6573710441589355,
"learning_rate": 2.3249592928588047e-05,
"loss": 2.8044,
"step": 103500
},
{
"epoch": 1.6127781654648368,
"grad_norm": 7.072439670562744,
"learning_rate": 2.312036390891939e-05,
"loss": 2.7871,
"step": 104000
},
{
"epoch": 1.6205319066449562,
"grad_norm": 7.3211259841918945,
"learning_rate": 2.299113488925073e-05,
"loss": 2.7921,
"step": 104500
},
{
"epoch": 1.6282856478250756,
"grad_norm": 7.107245445251465,
"learning_rate": 2.2861905869582075e-05,
"loss": 2.8031,
"step": 105000
},
{
"epoch": 1.636039389005195,
"grad_norm": 6.947020530700684,
"learning_rate": 2.2732676849913416e-05,
"loss": 2.7869,
"step": 105500
},
{
"epoch": 1.6437931301853144,
"grad_norm": 7.2329301834106445,
"learning_rate": 2.260344783024476e-05,
"loss": 2.7876,
"step": 106000
},
{
"epoch": 1.6515468713654338,
"grad_norm": 6.861079692840576,
"learning_rate": 2.2474218810576102e-05,
"loss": 2.7875,
"step": 106500
},
{
"epoch": 1.6593006125455534,
"grad_norm": 7.405232906341553,
"learning_rate": 2.2344989790907447e-05,
"loss": 2.7955,
"step": 107000
},
{
"epoch": 1.6670543537256726,
"grad_norm": 7.370352745056152,
"learning_rate": 2.221576077123879e-05,
"loss": 2.7931,
"step": 107500
},
{
"epoch": 1.6748080949057922,
"grad_norm": 7.008327960968018,
"learning_rate": 2.2086531751570133e-05,
"loss": 2.7936,
"step": 108000
},
{
"epoch": 1.6825618360859114,
"grad_norm": 7.268221378326416,
"learning_rate": 2.1957302731901477e-05,
"loss": 2.7969,
"step": 108500
},
{
"epoch": 1.690315577266031,
"grad_norm": 6.869812488555908,
"learning_rate": 2.182807371223282e-05,
"loss": 2.7879,
"step": 109000
},
{
"epoch": 1.6980693184461502,
"grad_norm": 6.86647891998291,
"learning_rate": 2.1698844692564163e-05,
"loss": 2.7992,
"step": 109500
},
{
"epoch": 1.7058230596262698,
"grad_norm": 7.082624435424805,
"learning_rate": 2.1569615672895508e-05,
"loss": 2.7829,
"step": 110000
},
{
"epoch": 1.713576800806389,
"grad_norm": 6.880459308624268,
"learning_rate": 2.144038665322685e-05,
"loss": 2.7797,
"step": 110500
},
{
"epoch": 1.7213305419865086,
"grad_norm": 7.15917444229126,
"learning_rate": 2.1311157633558194e-05,
"loss": 2.7952,
"step": 111000
},
{
"epoch": 1.7290842831666278,
"grad_norm": 7.239593982696533,
"learning_rate": 2.118192861388954e-05,
"loss": 2.7836,
"step": 111500
},
{
"epoch": 1.7368380243467474,
"grad_norm": 6.907558441162109,
"learning_rate": 2.105269959422088e-05,
"loss": 2.7974,
"step": 112000
},
{
"epoch": 1.7445917655268666,
"grad_norm": 7.07895040512085,
"learning_rate": 2.0923470574552225e-05,
"loss": 2.7847,
"step": 112500
},
{
"epoch": 1.7523455067069862,
"grad_norm": 6.944314956665039,
"learning_rate": 2.0794241554883566e-05,
"loss": 2.7875,
"step": 113000
},
{
"epoch": 1.7600992478871054,
"grad_norm": 6.936674118041992,
"learning_rate": 2.0665012535214907e-05,
"loss": 2.7743,
"step": 113500
},
{
"epoch": 1.767852989067225,
"grad_norm": 7.579113960266113,
"learning_rate": 2.0535783515546252e-05,
"loss": 2.7748,
"step": 114000
},
{
"epoch": 1.7756067302473444,
"grad_norm": 6.939824104309082,
"learning_rate": 2.0406554495877593e-05,
"loss": 2.7813,
"step": 114500
},
{
"epoch": 1.7833604714274638,
"grad_norm": 6.831909656524658,
"learning_rate": 2.0277325476208938e-05,
"loss": 2.7755,
"step": 115000
},
{
"epoch": 1.7911142126075832,
"grad_norm": 6.841889381408691,
"learning_rate": 2.0148096456540283e-05,
"loss": 2.7833,
"step": 115500
},
{
"epoch": 1.7988679537877026,
"grad_norm": 6.934596538543701,
"learning_rate": 2.0018867436871624e-05,
"loss": 2.7816,
"step": 116000
},
{
"epoch": 1.806621694967822,
"grad_norm": 7.232493877410889,
"learning_rate": 1.988963841720297e-05,
"loss": 2.7718,
"step": 116500
},
{
"epoch": 1.8143754361479414,
"grad_norm": 6.8913421630859375,
"learning_rate": 1.976040939753431e-05,
"loss": 2.7806,
"step": 117000
},
{
"epoch": 1.8221291773280608,
"grad_norm": 7.189756393432617,
"learning_rate": 1.9631180377865655e-05,
"loss": 2.7696,
"step": 117500
},
{
"epoch": 1.8298829185081802,
"grad_norm": 7.165264129638672,
"learning_rate": 1.9501951358197e-05,
"loss": 2.772,
"step": 118000
},
{
"epoch": 1.8376366596882996,
"grad_norm": 7.418449878692627,
"learning_rate": 1.937272233852834e-05,
"loss": 2.7673,
"step": 118500
},
{
"epoch": 1.845390400868419,
"grad_norm": 7.016151428222656,
"learning_rate": 1.9243493318859685e-05,
"loss": 2.7721,
"step": 119000
},
{
"epoch": 1.8531441420485384,
"grad_norm": 7.69176721572876,
"learning_rate": 1.9114264299191027e-05,
"loss": 2.7712,
"step": 119500
},
{
"epoch": 1.8608978832286578,
"grad_norm": 7.078608512878418,
"learning_rate": 1.898503527952237e-05,
"loss": 2.7626,
"step": 120000
},
{
"epoch": 1.8686516244087772,
"grad_norm": 7.167757034301758,
"learning_rate": 1.8855806259853716e-05,
"loss": 2.7757,
"step": 120500
},
{
"epoch": 1.8764053655888966,
"grad_norm": 7.261277675628662,
"learning_rate": 1.8726577240185057e-05,
"loss": 2.7638,
"step": 121000
},
{
"epoch": 1.8841591067690162,
"grad_norm": 7.32155179977417,
"learning_rate": 1.85973482205164e-05,
"loss": 2.7743,
"step": 121500
},
{
"epoch": 1.8919128479491354,
"grad_norm": 7.3756103515625,
"learning_rate": 1.8468119200847743e-05,
"loss": 2.7751,
"step": 122000
},
{
"epoch": 1.899666589129255,
"grad_norm": 7.422321796417236,
"learning_rate": 1.8338890181179085e-05,
"loss": 2.7766,
"step": 122500
},
{
"epoch": 1.9074203303093742,
"grad_norm": 7.091059684753418,
"learning_rate": 1.820966116151043e-05,
"loss": 2.7643,
"step": 123000
},
{
"epoch": 1.9151740714894938,
"grad_norm": 6.582401275634766,
"learning_rate": 1.808043214184177e-05,
"loss": 2.7665,
"step": 123500
},
{
"epoch": 1.922927812669613,
"grad_norm": 7.574552536010742,
"learning_rate": 1.7951203122173115e-05,
"loss": 2.7548,
"step": 124000
},
{
"epoch": 1.9306815538497326,
"grad_norm": 7.082491874694824,
"learning_rate": 1.782197410250446e-05,
"loss": 2.7577,
"step": 124500
},
{
"epoch": 1.9384352950298518,
"grad_norm": 7.546943187713623,
"learning_rate": 1.76927450828358e-05,
"loss": 2.7709,
"step": 125000
},
{
"epoch": 1.9461890362099714,
"grad_norm": 7.498143196105957,
"learning_rate": 1.7563516063167146e-05,
"loss": 2.7674,
"step": 125500
},
{
"epoch": 1.9539427773900906,
"grad_norm": 7.182895660400391,
"learning_rate": 1.743428704349849e-05,
"loss": 2.7595,
"step": 126000
},
{
"epoch": 1.9616965185702102,
"grad_norm": 7.754599094390869,
"learning_rate": 1.7305058023829832e-05,
"loss": 2.7586,
"step": 126500
},
{
"epoch": 1.9694502597503294,
"grad_norm": 7.348043918609619,
"learning_rate": 1.7175829004161177e-05,
"loss": 2.7671,
"step": 127000
},
{
"epoch": 1.977204000930449,
"grad_norm": 7.5025835037231445,
"learning_rate": 1.7046599984492518e-05,
"loss": 2.7596,
"step": 127500
},
{
"epoch": 1.9849577421105682,
"grad_norm": 7.277556896209717,
"learning_rate": 1.6917370964823863e-05,
"loss": 2.7554,
"step": 128000
},
{
"epoch": 1.9927114832906878,
"grad_norm": 7.15894079208374,
"learning_rate": 1.6788141945155207e-05,
"loss": 2.758,
"step": 128500
},
{
"epoch": 2.000465224470807,
"grad_norm": 7.221950054168701,
"learning_rate": 1.665891292548655e-05,
"loss": 2.7564,
"step": 129000
},
{
"epoch": 2.0082189656509266,
"grad_norm": 7.185346603393555,
"learning_rate": 1.6529683905817893e-05,
"loss": 2.7671,
"step": 129500
},
{
"epoch": 2.015972706831046,
"grad_norm": 7.411344528198242,
"learning_rate": 1.6400454886149234e-05,
"loss": 2.7698,
"step": 130000
},
{
"epoch": 2.0237264480111654,
"grad_norm": 7.3418498039245605,
"learning_rate": 1.6271225866480576e-05,
"loss": 2.7508,
"step": 130500
},
{
"epoch": 2.0314801891912846,
"grad_norm": 7.749533176422119,
"learning_rate": 1.614199684681192e-05,
"loss": 2.7643,
"step": 131000
},
{
"epoch": 2.039233930371404,
"grad_norm": 7.400169372558594,
"learning_rate": 1.6012767827143262e-05,
"loss": 2.7523,
"step": 131500
},
{
"epoch": 2.0469876715515234,
"grad_norm": 6.999739646911621,
"learning_rate": 1.5883538807474606e-05,
"loss": 2.7525,
"step": 132000
},
{
"epoch": 2.054741412731643,
"grad_norm": 7.423766613006592,
"learning_rate": 1.575430978780595e-05,
"loss": 2.7497,
"step": 132500
},
{
"epoch": 2.0624951539117626,
"grad_norm": 7.121034622192383,
"learning_rate": 1.5625080768137292e-05,
"loss": 2.7644,
"step": 133000
},
{
"epoch": 2.070248895091882,
"grad_norm": 7.697927951812744,
"learning_rate": 1.5495851748468637e-05,
"loss": 2.7599,
"step": 133500
},
{
"epoch": 2.0780026362720014,
"grad_norm": 7.259957313537598,
"learning_rate": 1.5366622728799982e-05,
"loss": 2.7569,
"step": 134000
},
{
"epoch": 2.0857563774521206,
"grad_norm": 7.2549943923950195,
"learning_rate": 1.5237393709131323e-05,
"loss": 2.764,
"step": 134500
},
{
"epoch": 2.0935101186322402,
"grad_norm": 7.1079535484313965,
"learning_rate": 1.5108164689462668e-05,
"loss": 2.7463,
"step": 135000
},
{
"epoch": 2.1012638598123594,
"grad_norm": 7.20269250869751,
"learning_rate": 1.4978935669794009e-05,
"loss": 2.7464,
"step": 135500
},
{
"epoch": 2.109017600992479,
"grad_norm": 7.686685085296631,
"learning_rate": 1.4849706650125352e-05,
"loss": 2.7546,
"step": 136000
},
{
"epoch": 2.116771342172598,
"grad_norm": 6.865842342376709,
"learning_rate": 1.4720477630456697e-05,
"loss": 2.743,
"step": 136500
},
{
"epoch": 2.124525083352718,
"grad_norm": 6.892743110656738,
"learning_rate": 1.4591248610788038e-05,
"loss": 2.7597,
"step": 137000
},
{
"epoch": 2.132278824532837,
"grad_norm": 7.216090679168701,
"learning_rate": 1.4462019591119383e-05,
"loss": 2.7638,
"step": 137500
},
{
"epoch": 2.1400325657129566,
"grad_norm": 7.859537601470947,
"learning_rate": 1.4332790571450724e-05,
"loss": 2.7522,
"step": 138000
},
{
"epoch": 2.147786306893076,
"grad_norm": 7.197884559631348,
"learning_rate": 1.4203561551782069e-05,
"loss": 2.7524,
"step": 138500
},
{
"epoch": 2.1555400480731954,
"grad_norm": 7.401318550109863,
"learning_rate": 1.4074332532113413e-05,
"loss": 2.7629,
"step": 139000
},
{
"epoch": 2.1632937892533146,
"grad_norm": 7.095146656036377,
"learning_rate": 1.3945103512444755e-05,
"loss": 2.7571,
"step": 139500
},
{
"epoch": 2.1710475304334342,
"grad_norm": 7.200826168060303,
"learning_rate": 1.38158744927761e-05,
"loss": 2.7411,
"step": 140000
},
{
"epoch": 2.1788012716135534,
"grad_norm": 7.727132797241211,
"learning_rate": 1.3686645473107442e-05,
"loss": 2.7518,
"step": 140500
},
{
"epoch": 2.186555012793673,
"grad_norm": 7.624775409698486,
"learning_rate": 1.3557416453438784e-05,
"loss": 2.7461,
"step": 141000
},
{
"epoch": 2.194308753973792,
"grad_norm": 7.7125935554504395,
"learning_rate": 1.3428187433770128e-05,
"loss": 2.7506,
"step": 141500
},
{
"epoch": 2.202062495153912,
"grad_norm": 7.944740295410156,
"learning_rate": 1.329895841410147e-05,
"loss": 2.7435,
"step": 142000
},
{
"epoch": 2.209816236334031,
"grad_norm": 7.168126106262207,
"learning_rate": 1.3169729394432814e-05,
"loss": 2.7577,
"step": 142500
},
{
"epoch": 2.2175699775141506,
"grad_norm": 7.608551979064941,
"learning_rate": 1.3040500374764159e-05,
"loss": 2.7514,
"step": 143000
},
{
"epoch": 2.22532371869427,
"grad_norm": 7.155666351318359,
"learning_rate": 1.29112713550955e-05,
"loss": 2.756,
"step": 143500
},
{
"epoch": 2.2330774598743894,
"grad_norm": 7.49126672744751,
"learning_rate": 1.2782042335426845e-05,
"loss": 2.7452,
"step": 144000
},
{
"epoch": 2.2408312010545086,
"grad_norm": 7.515799522399902,
"learning_rate": 1.2652813315758188e-05,
"loss": 2.7485,
"step": 144500
},
{
"epoch": 2.2485849422346282,
"grad_norm": 7.652871608734131,
"learning_rate": 1.252358429608953e-05,
"loss": 2.7519,
"step": 145000
},
{
"epoch": 2.2563386834147474,
"grad_norm": 7.006313800811768,
"learning_rate": 1.2394355276420874e-05,
"loss": 2.7401,
"step": 145500
},
{
"epoch": 2.264092424594867,
"grad_norm": 7.337978839874268,
"learning_rate": 1.2265126256752217e-05,
"loss": 2.7458,
"step": 146000
},
{
"epoch": 2.2718461657749867,
"grad_norm": 7.185283184051514,
"learning_rate": 1.213589723708356e-05,
"loss": 2.7546,
"step": 146500
},
{
"epoch": 2.279599906955106,
"grad_norm": 7.885451316833496,
"learning_rate": 1.2006668217414903e-05,
"loss": 2.738,
"step": 147000
},
{
"epoch": 2.287353648135225,
"grad_norm": 7.163339138031006,
"learning_rate": 1.1877439197746248e-05,
"loss": 2.7403,
"step": 147500
},
{
"epoch": 2.2951073893153446,
"grad_norm": 7.566407680511475,
"learning_rate": 1.174821017807759e-05,
"loss": 2.7441,
"step": 148000
},
{
"epoch": 2.3028611304954643,
"grad_norm": 7.626791477203369,
"learning_rate": 1.1618981158408934e-05,
"loss": 2.7442,
"step": 148500
},
{
"epoch": 2.3106148716755834,
"grad_norm": 7.609415054321289,
"learning_rate": 1.1489752138740275e-05,
"loss": 2.7442,
"step": 149000
},
{
"epoch": 2.318368612855703,
"grad_norm": 7.549880504608154,
"learning_rate": 1.136052311907162e-05,
"loss": 2.7398,
"step": 149500
},
{
"epoch": 2.3261223540358222,
"grad_norm": 7.753575325012207,
"learning_rate": 1.1231294099402963e-05,
"loss": 2.7376,
"step": 150000
},
{
"epoch": 2.333876095215942,
"grad_norm": 7.579866886138916,
"learning_rate": 1.1102065079734306e-05,
"loss": 2.7449,
"step": 150500
},
{
"epoch": 2.341629836396061,
"grad_norm": 7.787561893463135,
"learning_rate": 1.0972836060065649e-05,
"loss": 2.7418,
"step": 151000
},
{
"epoch": 2.3493835775761807,
"grad_norm": 7.163692474365234,
"learning_rate": 1.0843607040396992e-05,
"loss": 2.7459,
"step": 151500
},
{
"epoch": 2.3571373187563,
"grad_norm": 8.124524116516113,
"learning_rate": 1.0714378020728336e-05,
"loss": 2.7472,
"step": 152000
},
{
"epoch": 2.3648910599364195,
"grad_norm": 7.68442964553833,
"learning_rate": 1.058514900105968e-05,
"loss": 2.7454,
"step": 152500
},
{
"epoch": 2.3726448011165386,
"grad_norm": 7.561328887939453,
"learning_rate": 1.045591998139102e-05,
"loss": 2.7314,
"step": 153000
},
{
"epoch": 2.3803985422966583,
"grad_norm": 7.480719566345215,
"learning_rate": 1.0326690961722364e-05,
"loss": 2.7385,
"step": 153500
},
{
"epoch": 2.3881522834767774,
"grad_norm": 7.676718235015869,
"learning_rate": 1.0197461942053708e-05,
"loss": 2.7363,
"step": 154000
},
{
"epoch": 2.395906024656897,
"grad_norm": 7.30204963684082,
"learning_rate": 1.0068232922385051e-05,
"loss": 2.7469,
"step": 154500
},
{
"epoch": 2.4036597658370162,
"grad_norm": 7.684243202209473,
"learning_rate": 9.939003902716394e-06,
"loss": 2.7404,
"step": 155000
},
{
"epoch": 2.411413507017136,
"grad_norm": 7.187122821807861,
"learning_rate": 9.809774883047737e-06,
"loss": 2.7364,
"step": 155500
},
{
"epoch": 2.419167248197255,
"grad_norm": 7.586068153381348,
"learning_rate": 9.680545863379082e-06,
"loss": 2.7331,
"step": 156000
},
{
"epoch": 2.4269209893773747,
"grad_norm": 7.374856948852539,
"learning_rate": 9.551316843710425e-06,
"loss": 2.739,
"step": 156500
},
{
"epoch": 2.434674730557494,
"grad_norm": 7.3092474937438965,
"learning_rate": 9.422087824041768e-06,
"loss": 2.7412,
"step": 157000
},
{
"epoch": 2.4424284717376135,
"grad_norm": 7.9172844886779785,
"learning_rate": 9.29285880437311e-06,
"loss": 2.7384,
"step": 157500
},
{
"epoch": 2.4501822129177326,
"grad_norm": 7.155998706817627,
"learning_rate": 9.163629784704454e-06,
"loss": 2.7324,
"step": 158000
},
{
"epoch": 2.4579359540978523,
"grad_norm": 7.371484756469727,
"learning_rate": 9.034400765035797e-06,
"loss": 2.7382,
"step": 158500
},
{
"epoch": 2.4656896952779714,
"grad_norm": 7.271082401275635,
"learning_rate": 8.90517174536714e-06,
"loss": 2.7238,
"step": 159000
},
{
"epoch": 2.473443436458091,
"grad_norm": 7.525820255279541,
"learning_rate": 8.775942725698483e-06,
"loss": 2.7353,
"step": 159500
},
{
"epoch": 2.4811971776382107,
"grad_norm": 7.422860622406006,
"learning_rate": 8.646713706029828e-06,
"loss": 2.7312,
"step": 160000
},
{
"epoch": 2.48895091881833,
"grad_norm": 7.786092758178711,
"learning_rate": 8.51748468636117e-06,
"loss": 2.729,
"step": 160500
},
{
"epoch": 2.496704659998449,
"grad_norm": 7.733543872833252,
"learning_rate": 8.388255666692514e-06,
"loss": 2.7377,
"step": 161000
},
{
"epoch": 2.5044584011785687,
"grad_norm": 7.477449417114258,
"learning_rate": 8.259026647023855e-06,
"loss": 2.7431,
"step": 161500
},
{
"epoch": 2.5122121423586883,
"grad_norm": 7.466070652008057,
"learning_rate": 8.1297976273552e-06,
"loss": 2.743,
"step": 162000
},
{
"epoch": 2.5199658835388075,
"grad_norm": 7.578529357910156,
"learning_rate": 8.000568607686542e-06,
"loss": 2.738,
"step": 162500
},
{
"epoch": 2.5277196247189266,
"grad_norm": 7.481320381164551,
"learning_rate": 7.871339588017885e-06,
"loss": 2.7332,
"step": 163000
},
{
"epoch": 2.5354733658990463,
"grad_norm": 8.073503494262695,
"learning_rate": 7.742110568349228e-06,
"loss": 2.7517,
"step": 163500
},
{
"epoch": 2.543227107079166,
"grad_norm": 7.4196457862854,
"learning_rate": 7.612881548680572e-06,
"loss": 2.7231,
"step": 164000
},
{
"epoch": 2.550980848259285,
"grad_norm": 7.558558940887451,
"learning_rate": 7.483652529011915e-06,
"loss": 2.7384,
"step": 164500
},
{
"epoch": 2.5587345894394042,
"grad_norm": 7.38846492767334,
"learning_rate": 7.354423509343258e-06,
"loss": 2.7348,
"step": 165000
},
{
"epoch": 2.566488330619524,
"grad_norm": 7.8365864753723145,
"learning_rate": 7.225194489674601e-06,
"loss": 2.7254,
"step": 165500
},
{
"epoch": 2.5742420717996435,
"grad_norm": 7.362669944763184,
"learning_rate": 7.095965470005945e-06,
"loss": 2.729,
"step": 166000
},
{
"epoch": 2.5819958129797627,
"grad_norm": 7.646996974945068,
"learning_rate": 6.966736450337288e-06,
"loss": 2.7333,
"step": 166500
},
{
"epoch": 2.5897495541598823,
"grad_norm": 7.944218158721924,
"learning_rate": 6.837507430668631e-06,
"loss": 2.7423,
"step": 167000
},
{
"epoch": 2.5975032953400015,
"grad_norm": 7.502200603485107,
"learning_rate": 6.708278410999974e-06,
"loss": 2.7225,
"step": 167500
},
{
"epoch": 2.605257036520121,
"grad_norm": 7.175666809082031,
"learning_rate": 6.579049391331319e-06,
"loss": 2.7366,
"step": 168000
},
{
"epoch": 2.6130107777002403,
"grad_norm": 7.814846992492676,
"learning_rate": 6.449820371662661e-06,
"loss": 2.7421,
"step": 168500
},
{
"epoch": 2.62076451888036,
"grad_norm": 7.270232677459717,
"learning_rate": 6.320591351994004e-06,
"loss": 2.7335,
"step": 169000
},
{
"epoch": 2.628518260060479,
"grad_norm": 7.920383930206299,
"learning_rate": 6.191362332325347e-06,
"loss": 2.7352,
"step": 169500
},
{
"epoch": 2.6362720012405987,
"grad_norm": 7.142765998840332,
"learning_rate": 6.062133312656691e-06,
"loss": 2.7318,
"step": 170000
},
{
"epoch": 2.644025742420718,
"grad_norm": 8.12151050567627,
"learning_rate": 5.932904292988034e-06,
"loss": 2.7168,
"step": 170500
},
{
"epoch": 2.6517794836008375,
"grad_norm": 7.717370510101318,
"learning_rate": 5.803675273319377e-06,
"loss": 2.7145,
"step": 171000
},
{
"epoch": 2.6595332247809567,
"grad_norm": 7.359320640563965,
"learning_rate": 5.67444625365072e-06,
"loss": 2.7382,
"step": 171500
},
{
"epoch": 2.6672869659610763,
"grad_norm": 7.525691509246826,
"learning_rate": 5.5452172339820636e-06,
"loss": 2.7228,
"step": 172000
},
{
"epoch": 2.6750407071411955,
"grad_norm": 7.967082500457764,
"learning_rate": 5.4159882143134066e-06,
"loss": 2.7258,
"step": 172500
},
{
"epoch": 2.682794448321315,
"grad_norm": 7.760034561157227,
"learning_rate": 5.2867591946447495e-06,
"loss": 2.7385,
"step": 173000
},
{
"epoch": 2.6905481895014347,
"grad_norm": 7.141742706298828,
"learning_rate": 5.1575301749760925e-06,
"loss": 2.7282,
"step": 173500
},
{
"epoch": 2.698301930681554,
"grad_norm": 7.685527801513672,
"learning_rate": 5.028301155307436e-06,
"loss": 2.7216,
"step": 174000
},
{
"epoch": 2.706055671861673,
"grad_norm": 7.3134236335754395,
"learning_rate": 4.899072135638779e-06,
"loss": 2.7188,
"step": 174500
},
{
"epoch": 2.7138094130417927,
"grad_norm": 7.750508785247803,
"learning_rate": 4.769843115970122e-06,
"loss": 2.7277,
"step": 175000
},
{
"epoch": 2.7215631542219123,
"grad_norm": 7.504671096801758,
"learning_rate": 4.640614096301465e-06,
"loss": 2.738,
"step": 175500
},
{
"epoch": 2.7293168954020315,
"grad_norm": 7.484751224517822,
"learning_rate": 4.511385076632809e-06,
"loss": 2.7148,
"step": 176000
},
{
"epoch": 2.7370706365821507,
"grad_norm": 7.809044361114502,
"learning_rate": 4.382156056964152e-06,
"loss": 2.7388,
"step": 176500
},
{
"epoch": 2.7448243777622703,
"grad_norm": 7.876001834869385,
"learning_rate": 4.252927037295495e-06,
"loss": 2.7214,
"step": 177000
},
{
"epoch": 2.75257811894239,
"grad_norm": 7.753846645355225,
"learning_rate": 4.123698017626838e-06,
"loss": 2.719,
"step": 177500
},
{
"epoch": 2.760331860122509,
"grad_norm": 7.285833358764648,
"learning_rate": 3.994468997958182e-06,
"loss": 2.7347,
"step": 178000
},
{
"epoch": 2.7680856013026283,
"grad_norm": 7.894680023193359,
"learning_rate": 3.865239978289525e-06,
"loss": 2.7376,
"step": 178500
},
{
"epoch": 2.775839342482748,
"grad_norm": 7.850667953491211,
"learning_rate": 3.7360109586208684e-06,
"loss": 2.7289,
"step": 179000
},
{
"epoch": 2.7835930836628675,
"grad_norm": 7.380823135375977,
"learning_rate": 3.606781938952211e-06,
"loss": 2.7289,
"step": 179500
},
{
"epoch": 2.7913468248429867,
"grad_norm": 7.752573490142822,
"learning_rate": 3.477552919283555e-06,
"loss": 2.724,
"step": 180000
},
{
"epoch": 2.7991005660231063,
"grad_norm": 7.117413520812988,
"learning_rate": 3.3483238996148974e-06,
"loss": 2.7332,
"step": 180500
},
{
"epoch": 2.8068543072032255,
"grad_norm": 7.8615522384643555,
"learning_rate": 3.2190948799462412e-06,
"loss": 2.7316,
"step": 181000
},
{
"epoch": 2.814608048383345,
"grad_norm": 7.938878059387207,
"learning_rate": 3.0898658602775842e-06,
"loss": 2.7223,
"step": 181500
},
{
"epoch": 2.8223617895634643,
"grad_norm": 7.760583877563477,
"learning_rate": 2.9606368406089272e-06,
"loss": 2.7222,
"step": 182000
},
{
"epoch": 2.830115530743584,
"grad_norm": 7.352213382720947,
"learning_rate": 2.8314078209402706e-06,
"loss": 2.7233,
"step": 182500
},
{
"epoch": 2.837869271923703,
"grad_norm": 7.541159629821777,
"learning_rate": 2.7021788012716136e-06,
"loss": 2.7225,
"step": 183000
},
{
"epoch": 2.8456230131038227,
"grad_norm": 7.890182018280029,
"learning_rate": 2.572949781602957e-06,
"loss": 2.7219,
"step": 183500
},
{
"epoch": 2.853376754283942,
"grad_norm": 7.695311546325684,
"learning_rate": 2.4437207619343e-06,
"loss": 2.7172,
"step": 184000
},
{
"epoch": 2.8611304954640615,
"grad_norm": 7.7702317237854,
"learning_rate": 2.3144917422656434e-06,
"loss": 2.728,
"step": 184500
},
{
"epoch": 2.8688842366441807,
"grad_norm": 7.646172046661377,
"learning_rate": 2.1852627225969864e-06,
"loss": 2.7312,
"step": 185000
},
{
"epoch": 2.8766379778243003,
"grad_norm": 7.06711483001709,
"learning_rate": 2.05603370292833e-06,
"loss": 2.7175,
"step": 185500
},
{
"epoch": 2.8843917190044195,
"grad_norm": 7.974971294403076,
"learning_rate": 1.926804683259673e-06,
"loss": 2.7244,
"step": 186000
},
{
"epoch": 2.892145460184539,
"grad_norm": 7.5829315185546875,
"learning_rate": 1.797575663591016e-06,
"loss": 2.7298,
"step": 186500
},
{
"epoch": 2.8998992013646583,
"grad_norm": 7.224939823150635,
"learning_rate": 1.6683466439223592e-06,
"loss": 2.7227,
"step": 187000
},
{
"epoch": 2.907652942544778,
"grad_norm": 8.057891845703125,
"learning_rate": 1.5391176242537025e-06,
"loss": 2.7393,
"step": 187500
},
{
"epoch": 2.915406683724897,
"grad_norm": 7.886134624481201,
"learning_rate": 1.4098886045850457e-06,
"loss": 2.7264,
"step": 188000
},
{
"epoch": 2.9231604249050167,
"grad_norm": 7.65654993057251,
"learning_rate": 1.2806595849163889e-06,
"loss": 2.7277,
"step": 188500
},
{
"epoch": 2.9309141660851363,
"grad_norm": 7.524332046508789,
"learning_rate": 1.1514305652477323e-06,
"loss": 2.7281,
"step": 189000
},
{
"epoch": 2.9386679072652555,
"grad_norm": 7.878385543823242,
"learning_rate": 1.0222015455790753e-06,
"loss": 2.7275,
"step": 189500
},
{
"epoch": 2.9464216484453747,
"grad_norm": 7.491950035095215,
"learning_rate": 8.929725259104185e-07,
"loss": 2.7225,
"step": 190000
},
{
"epoch": 2.9541753896254943,
"grad_norm": 8.570446968078613,
"learning_rate": 7.637435062417617e-07,
"loss": 2.722,
"step": 190500
},
{
"epoch": 2.961929130805614,
"grad_norm": 7.909883975982666,
"learning_rate": 6.345144865731049e-07,
"loss": 2.7175,
"step": 191000
},
{
"epoch": 2.969682871985733,
"grad_norm": 7.273110389709473,
"learning_rate": 5.052854669044481e-07,
"loss": 2.7167,
"step": 191500
},
{
"epoch": 2.9774366131658523,
"grad_norm": 7.78535270690918,
"learning_rate": 3.760564472357913e-07,
"loss": 2.7301,
"step": 192000
},
{
"epoch": 2.985190354345972,
"grad_norm": 7.7455973625183105,
"learning_rate": 2.468274275671345e-07,
"loss": 2.7192,
"step": 192500
},
{
"epoch": 2.9929440955260915,
"grad_norm": 7.988417148590088,
"learning_rate": 1.1759840789847768e-07,
"loss": 2.7195,
"step": 193000
}
],
"logging_steps": 500,
"max_steps": 193455,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.6392157746049843e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}