{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 193455, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007753741180119408, "grad_norm": 2.8055944442749023, "learning_rate": 4.9870770980331344e-05, "loss": 8.6951, "step": 500 }, { "epoch": 0.015507482360238816, "grad_norm": 2.7864086627960205, "learning_rate": 4.974154196066269e-05, "loss": 6.4397, "step": 1000 }, { "epoch": 0.023261223540358224, "grad_norm": 2.835425853729248, "learning_rate": 4.961231294099403e-05, "loss": 5.8474, "step": 1500 }, { "epoch": 0.03101496472047763, "grad_norm": 3.0123701095581055, "learning_rate": 4.9483083921325374e-05, "loss": 5.5019, "step": 2000 }, { "epoch": 0.038768705900597035, "grad_norm": 2.9796297550201416, "learning_rate": 4.935385490165672e-05, "loss": 5.2444, "step": 2500 }, { "epoch": 0.04652244708071645, "grad_norm": 3.042416572570801, "learning_rate": 4.9224625881988064e-05, "loss": 5.0392, "step": 3000 }, { "epoch": 0.054276188260835856, "grad_norm": 2.9367804527282715, "learning_rate": 4.9095396862319405e-05, "loss": 4.8613, "step": 3500 }, { "epoch": 0.06202992944095526, "grad_norm": 2.9740755558013916, "learning_rate": 4.8966167842650746e-05, "loss": 4.7246, "step": 4000 }, { "epoch": 0.06978367062107467, "grad_norm": 3.364341974258423, "learning_rate": 4.8836938822982094e-05, "loss": 4.5969, "step": 4500 }, { "epoch": 0.07753741180119407, "grad_norm": 3.4089107513427734, "learning_rate": 4.8707709803313436e-05, "loss": 4.4929, "step": 5000 }, { "epoch": 0.08529115298131348, "grad_norm": 3.324126720428467, "learning_rate": 4.857848078364478e-05, "loss": 4.3966, "step": 5500 }, { "epoch": 0.0930448941614329, "grad_norm": 3.3835084438323975, "learning_rate": 4.8449251763976125e-05, "loss": 4.3176, "step": 6000 }, { "epoch": 0.1007986353415523, "grad_norm": 3.6219263076782227, "learning_rate": 4.8320022744307466e-05, "loss": 4.2318, "step": 6500 }, { "epoch": 0.10855237652167171, "grad_norm": 3.2887094020843506, "learning_rate": 4.819079372463881e-05, "loss": 4.161, "step": 7000 }, { "epoch": 0.11630611770179111, "grad_norm": 3.6443707942962646, "learning_rate": 4.806156470497015e-05, "loss": 4.0995, "step": 7500 }, { "epoch": 0.12405985888191053, "grad_norm": 3.623699903488159, "learning_rate": 4.79323356853015e-05, "loss": 4.0583, "step": 8000 }, { "epoch": 0.13181360006202994, "grad_norm": 3.6302268505096436, "learning_rate": 4.780310666563284e-05, "loss": 4.0, "step": 8500 }, { "epoch": 0.13956734124214934, "grad_norm": 3.6418278217315674, "learning_rate": 4.767387764596418e-05, "loss": 3.9486, "step": 9000 }, { "epoch": 0.14732108242226874, "grad_norm": 3.635089159011841, "learning_rate": 4.754464862629552e-05, "loss": 3.896, "step": 9500 }, { "epoch": 0.15507482360238814, "grad_norm": 3.8315062522888184, "learning_rate": 4.741541960662686e-05, "loss": 3.8629, "step": 10000 }, { "epoch": 0.16282856478250757, "grad_norm": 3.7439329624176025, "learning_rate": 4.728619058695821e-05, "loss": 3.8255, "step": 10500 }, { "epoch": 0.17058230596262697, "grad_norm": 3.732562303543091, "learning_rate": 4.715696156728955e-05, "loss": 3.7887, "step": 11000 }, { "epoch": 0.17833604714274637, "grad_norm": 3.7784509658813477, "learning_rate": 4.702773254762089e-05, "loss": 3.749, "step": 11500 }, { "epoch": 0.1860897883228658, "grad_norm": 3.9984617233276367, "learning_rate": 4.6898503527952234e-05, "loss": 3.7065, "step": 12000 }, { "epoch": 0.1938435295029852, "grad_norm": 3.954652786254883, "learning_rate": 4.676927450828358e-05, "loss": 3.6931, "step": 12500 }, { "epoch": 0.2015972706831046, "grad_norm": 3.9958720207214355, "learning_rate": 4.6640045488614924e-05, "loss": 3.6545, "step": 13000 }, { "epoch": 0.209351011863224, "grad_norm": 3.922849178314209, "learning_rate": 4.6510816468946265e-05, "loss": 3.6207, "step": 13500 }, { "epoch": 0.21710475304334342, "grad_norm": 4.092968463897705, "learning_rate": 4.638158744927761e-05, "loss": 3.5974, "step": 14000 }, { "epoch": 0.22485849422346282, "grad_norm": 3.8097622394561768, "learning_rate": 4.6252358429608954e-05, "loss": 3.5775, "step": 14500 }, { "epoch": 0.23261223540358222, "grad_norm": 4.094768047332764, "learning_rate": 4.6123129409940296e-05, "loss": 3.5543, "step": 15000 }, { "epoch": 0.24036597658370162, "grad_norm": 3.8095757961273193, "learning_rate": 4.5993900390271644e-05, "loss": 3.5392, "step": 15500 }, { "epoch": 0.24811971776382105, "grad_norm": 3.8520195484161377, "learning_rate": 4.5864671370602985e-05, "loss": 3.5047, "step": 16000 }, { "epoch": 0.2558734589439404, "grad_norm": 4.280955791473389, "learning_rate": 4.5735442350934326e-05, "loss": 3.495, "step": 16500 }, { "epoch": 0.2636272001240599, "grad_norm": 4.455172538757324, "learning_rate": 4.5606213331265674e-05, "loss": 3.4756, "step": 17000 }, { "epoch": 0.2713809413041793, "grad_norm": 4.1695733070373535, "learning_rate": 4.5476984311597016e-05, "loss": 3.4505, "step": 17500 }, { "epoch": 0.2791346824842987, "grad_norm": 4.223578453063965, "learning_rate": 4.534775529192836e-05, "loss": 3.4446, "step": 18000 }, { "epoch": 0.2868884236644181, "grad_norm": 4.616567134857178, "learning_rate": 4.52185262722597e-05, "loss": 3.4202, "step": 18500 }, { "epoch": 0.2946421648445375, "grad_norm": 4.264205455780029, "learning_rate": 4.5089297252591046e-05, "loss": 3.4024, "step": 19000 }, { "epoch": 0.3023959060246569, "grad_norm": 4.493732929229736, "learning_rate": 4.496006823292239e-05, "loss": 3.397, "step": 19500 }, { "epoch": 0.3101496472047763, "grad_norm": 4.321922779083252, "learning_rate": 4.483083921325373e-05, "loss": 3.3678, "step": 20000 }, { "epoch": 0.31790338838489574, "grad_norm": 4.248241424560547, "learning_rate": 4.470161019358508e-05, "loss": 3.3634, "step": 20500 }, { "epoch": 0.32565712956501514, "grad_norm": 4.490056037902832, "learning_rate": 4.457238117391642e-05, "loss": 3.3529, "step": 21000 }, { "epoch": 0.33341087074513454, "grad_norm": 4.652819633483887, "learning_rate": 4.444315215424776e-05, "loss": 3.3491, "step": 21500 }, { "epoch": 0.34116461192525394, "grad_norm": 4.65127420425415, "learning_rate": 4.431392313457911e-05, "loss": 3.3239, "step": 22000 }, { "epoch": 0.34891835310537334, "grad_norm": 4.279469013214111, "learning_rate": 4.418469411491045e-05, "loss": 3.3061, "step": 22500 }, { "epoch": 0.35667209428549274, "grad_norm": 4.562509059906006, "learning_rate": 4.405546509524179e-05, "loss": 3.3014, "step": 23000 }, { "epoch": 0.36442583546561214, "grad_norm": 4.664194583892822, "learning_rate": 4.392623607557314e-05, "loss": 3.2963, "step": 23500 }, { "epoch": 0.3721795766457316, "grad_norm": 4.531868934631348, "learning_rate": 4.379700705590448e-05, "loss": 3.2776, "step": 24000 }, { "epoch": 0.379933317825851, "grad_norm": 4.553177833557129, "learning_rate": 4.366777803623582e-05, "loss": 3.2559, "step": 24500 }, { "epoch": 0.3876870590059704, "grad_norm": 4.358414649963379, "learning_rate": 4.353854901656716e-05, "loss": 3.253, "step": 25000 }, { "epoch": 0.3954408001860898, "grad_norm": 4.921834468841553, "learning_rate": 4.3409319996898503e-05, "loss": 3.257, "step": 25500 }, { "epoch": 0.4031945413662092, "grad_norm": 4.523054599761963, "learning_rate": 4.3280090977229845e-05, "loss": 3.2368, "step": 26000 }, { "epoch": 0.4109482825463286, "grad_norm": 4.663913249969482, "learning_rate": 4.315086195756119e-05, "loss": 3.2314, "step": 26500 }, { "epoch": 0.418702023726448, "grad_norm": 4.529434680938721, "learning_rate": 4.3021632937892534e-05, "loss": 3.2216, "step": 27000 }, { "epoch": 0.4264557649065674, "grad_norm": 4.806913375854492, "learning_rate": 4.2892403918223875e-05, "loss": 3.2176, "step": 27500 }, { "epoch": 0.43420950608668685, "grad_norm": 4.8940019607543945, "learning_rate": 4.276317489855522e-05, "loss": 3.2061, "step": 28000 }, { "epoch": 0.44196324726680625, "grad_norm": 4.507465839385986, "learning_rate": 4.2633945878886565e-05, "loss": 3.1957, "step": 28500 }, { "epoch": 0.44971698844692565, "grad_norm": 5.141822338104248, "learning_rate": 4.2504716859217906e-05, "loss": 3.1849, "step": 29000 }, { "epoch": 0.45747072962704505, "grad_norm": 4.68912935256958, "learning_rate": 4.237548783954925e-05, "loss": 3.1707, "step": 29500 }, { "epoch": 0.46522447080716445, "grad_norm": 5.046304225921631, "learning_rate": 4.2246258819880595e-05, "loss": 3.1705, "step": 30000 }, { "epoch": 0.47297821198728385, "grad_norm": 4.986753940582275, "learning_rate": 4.211702980021194e-05, "loss": 3.1641, "step": 30500 }, { "epoch": 0.48073195316740325, "grad_norm": 5.041975498199463, "learning_rate": 4.198780078054328e-05, "loss": 3.1545, "step": 31000 }, { "epoch": 0.4884856943475227, "grad_norm": 5.255220413208008, "learning_rate": 4.1858571760874626e-05, "loss": 3.1566, "step": 31500 }, { "epoch": 0.4962394355276421, "grad_norm": 5.2884697914123535, "learning_rate": 4.172934274120597e-05, "loss": 3.1497, "step": 32000 }, { "epoch": 0.5039931767077614, "grad_norm": 4.990237712860107, "learning_rate": 4.160011372153731e-05, "loss": 3.1417, "step": 32500 }, { "epoch": 0.5117469178878808, "grad_norm": 5.165491104125977, "learning_rate": 4.147088470186866e-05, "loss": 3.1227, "step": 33000 }, { "epoch": 0.5195006590680004, "grad_norm": 5.079947471618652, "learning_rate": 4.13416556822e-05, "loss": 3.1276, "step": 33500 }, { "epoch": 0.5272544002481198, "grad_norm": 4.830618858337402, "learning_rate": 4.121242666253134e-05, "loss": 3.1195, "step": 34000 }, { "epoch": 0.5350081414282392, "grad_norm": 5.067671775817871, "learning_rate": 4.108319764286268e-05, "loss": 3.1168, "step": 34500 }, { "epoch": 0.5427618826083586, "grad_norm": 5.153751373291016, "learning_rate": 4.095396862319403e-05, "loss": 3.1087, "step": 35000 }, { "epoch": 0.550515623788478, "grad_norm": 5.0933027267456055, "learning_rate": 4.082473960352537e-05, "loss": 3.0945, "step": 35500 }, { "epoch": 0.5582693649685974, "grad_norm": 5.372694492340088, "learning_rate": 4.069551058385671e-05, "loss": 3.0906, "step": 36000 }, { "epoch": 0.5660231061487168, "grad_norm": 5.414623737335205, "learning_rate": 4.056628156418806e-05, "loss": 3.0894, "step": 36500 }, { "epoch": 0.5737768473288362, "grad_norm": 5.126060962677002, "learning_rate": 4.04370525445194e-05, "loss": 3.0848, "step": 37000 }, { "epoch": 0.5815305885089556, "grad_norm": 4.877682209014893, "learning_rate": 4.030782352485074e-05, "loss": 3.0752, "step": 37500 }, { "epoch": 0.589284329689075, "grad_norm": 5.10503625869751, "learning_rate": 4.017859450518209e-05, "loss": 3.0678, "step": 38000 }, { "epoch": 0.5970380708691944, "grad_norm": 5.342639923095703, "learning_rate": 4.004936548551343e-05, "loss": 3.074, "step": 38500 }, { "epoch": 0.6047918120493138, "grad_norm": 5.377997875213623, "learning_rate": 3.992013646584477e-05, "loss": 3.0633, "step": 39000 }, { "epoch": 0.6125455532294332, "grad_norm": 5.535395622253418, "learning_rate": 3.979090744617612e-05, "loss": 3.0667, "step": 39500 }, { "epoch": 0.6202992944095526, "grad_norm": 5.380632400512695, "learning_rate": 3.966167842650746e-05, "loss": 3.042, "step": 40000 }, { "epoch": 0.6280530355896721, "grad_norm": 5.2183027267456055, "learning_rate": 3.95324494068388e-05, "loss": 3.0488, "step": 40500 }, { "epoch": 0.6358067767697915, "grad_norm": 5.565989971160889, "learning_rate": 3.9403220387170145e-05, "loss": 3.0477, "step": 41000 }, { "epoch": 0.6435605179499109, "grad_norm": 5.33280611038208, "learning_rate": 3.9273991367501486e-05, "loss": 3.0429, "step": 41500 }, { "epoch": 0.6513142591300303, "grad_norm": 5.605506896972656, "learning_rate": 3.9144762347832834e-05, "loss": 3.0423, "step": 42000 }, { "epoch": 0.6590680003101497, "grad_norm": 5.62191915512085, "learning_rate": 3.9015533328164175e-05, "loss": 3.0205, "step": 42500 }, { "epoch": 0.6668217414902691, "grad_norm": 5.318583965301514, "learning_rate": 3.888630430849552e-05, "loss": 3.0277, "step": 43000 }, { "epoch": 0.6745754826703885, "grad_norm": 5.763892650604248, "learning_rate": 3.875707528882686e-05, "loss": 3.0342, "step": 43500 }, { "epoch": 0.6823292238505079, "grad_norm": 5.43319034576416, "learning_rate": 3.86278462691582e-05, "loss": 3.0134, "step": 44000 }, { "epoch": 0.6900829650306273, "grad_norm": 5.857462406158447, "learning_rate": 3.849861724948955e-05, "loss": 3.0091, "step": 44500 }, { "epoch": 0.6978367062107467, "grad_norm": 5.6617021560668945, "learning_rate": 3.836938822982089e-05, "loss": 3.0117, "step": 45000 }, { "epoch": 0.7055904473908661, "grad_norm": 5.857166290283203, "learning_rate": 3.824015921015223e-05, "loss": 3.0189, "step": 45500 }, { "epoch": 0.7133441885709855, "grad_norm": 5.425033092498779, "learning_rate": 3.811093019048358e-05, "loss": 3.019, "step": 46000 }, { "epoch": 0.7210979297511049, "grad_norm": 5.529470920562744, "learning_rate": 3.798170117081492e-05, "loss": 3.002, "step": 46500 }, { "epoch": 0.7288516709312243, "grad_norm": 5.534538745880127, "learning_rate": 3.785247215114626e-05, "loss": 2.995, "step": 47000 }, { "epoch": 0.7366054121113437, "grad_norm": 5.707602024078369, "learning_rate": 3.772324313147761e-05, "loss": 2.9942, "step": 47500 }, { "epoch": 0.7443591532914632, "grad_norm": 5.565040588378906, "learning_rate": 3.759401411180895e-05, "loss": 2.9947, "step": 48000 }, { "epoch": 0.7521128944715826, "grad_norm": 5.714329242706299, "learning_rate": 3.746478509214029e-05, "loss": 2.9941, "step": 48500 }, { "epoch": 0.759866635651702, "grad_norm": 5.562424182891846, "learning_rate": 3.733555607247163e-05, "loss": 2.9827, "step": 49000 }, { "epoch": 0.7676203768318214, "grad_norm": 5.685086250305176, "learning_rate": 3.720632705280298e-05, "loss": 2.9812, "step": 49500 }, { "epoch": 0.7753741180119408, "grad_norm": 5.563987731933594, "learning_rate": 3.707709803313432e-05, "loss": 2.9781, "step": 50000 }, { "epoch": 0.7831278591920602, "grad_norm": 5.545105934143066, "learning_rate": 3.694786901346566e-05, "loss": 2.9586, "step": 50500 }, { "epoch": 0.7908816003721796, "grad_norm": 5.9238386154174805, "learning_rate": 3.681863999379701e-05, "loss": 2.9713, "step": 51000 }, { "epoch": 0.798635341552299, "grad_norm": 5.929417133331299, "learning_rate": 3.668941097412835e-05, "loss": 2.9623, "step": 51500 }, { "epoch": 0.8063890827324184, "grad_norm": 5.734675884246826, "learning_rate": 3.6560181954459694e-05, "loss": 2.9659, "step": 52000 }, { "epoch": 0.8141428239125378, "grad_norm": 5.726919651031494, "learning_rate": 3.643095293479104e-05, "loss": 2.9624, "step": 52500 }, { "epoch": 0.8218965650926572, "grad_norm": 5.474340438842773, "learning_rate": 3.630172391512238e-05, "loss": 2.9597, "step": 53000 }, { "epoch": 0.8296503062727766, "grad_norm": 5.879449367523193, "learning_rate": 3.6172494895453725e-05, "loss": 2.9601, "step": 53500 }, { "epoch": 0.837404047452896, "grad_norm": 6.051661491394043, "learning_rate": 3.604326587578507e-05, "loss": 2.9606, "step": 54000 }, { "epoch": 0.8451577886330154, "grad_norm": 6.062263488769531, "learning_rate": 3.5914036856116414e-05, "loss": 2.9573, "step": 54500 }, { "epoch": 0.8529115298131348, "grad_norm": 5.804770469665527, "learning_rate": 3.5784807836447755e-05, "loss": 2.9512, "step": 55000 }, { "epoch": 0.8606652709932543, "grad_norm": 5.899106025695801, "learning_rate": 3.5655578816779097e-05, "loss": 2.9563, "step": 55500 }, { "epoch": 0.8684190121733737, "grad_norm": 5.6610894203186035, "learning_rate": 3.5526349797110445e-05, "loss": 2.9468, "step": 56000 }, { "epoch": 0.8761727533534931, "grad_norm": 5.676564693450928, "learning_rate": 3.5397120777441786e-05, "loss": 2.9442, "step": 56500 }, { "epoch": 0.8839264945336125, "grad_norm": 6.026761531829834, "learning_rate": 3.526789175777313e-05, "loss": 2.945, "step": 57000 }, { "epoch": 0.8916802357137319, "grad_norm": 6.285312652587891, "learning_rate": 3.5138662738104475e-05, "loss": 2.935, "step": 57500 }, { "epoch": 0.8994339768938513, "grad_norm": 5.787561416625977, "learning_rate": 3.5009433718435817e-05, "loss": 2.9395, "step": 58000 }, { "epoch": 0.9071877180739707, "grad_norm": 5.658621311187744, "learning_rate": 3.488020469876716e-05, "loss": 2.9288, "step": 58500 }, { "epoch": 0.9149414592540901, "grad_norm": 5.896640300750732, "learning_rate": 3.47509756790985e-05, "loss": 2.9351, "step": 59000 }, { "epoch": 0.9226952004342095, "grad_norm": 6.219537734985352, "learning_rate": 3.462174665942984e-05, "loss": 2.9257, "step": 59500 }, { "epoch": 0.9304489416143289, "grad_norm": 6.03794527053833, "learning_rate": 3.449251763976118e-05, "loss": 2.924, "step": 60000 }, { "epoch": 0.9382026827944483, "grad_norm": 6.291288375854492, "learning_rate": 3.436328862009253e-05, "loss": 2.9141, "step": 60500 }, { "epoch": 0.9459564239745677, "grad_norm": 6.239747524261475, "learning_rate": 3.423405960042387e-05, "loss": 2.9212, "step": 61000 }, { "epoch": 0.9537101651546871, "grad_norm": 5.852220058441162, "learning_rate": 3.410483058075521e-05, "loss": 2.9187, "step": 61500 }, { "epoch": 0.9614639063348065, "grad_norm": 5.890344142913818, "learning_rate": 3.397560156108656e-05, "loss": 2.9167, "step": 62000 }, { "epoch": 0.969217647514926, "grad_norm": 6.417314529418945, "learning_rate": 3.38463725414179e-05, "loss": 2.9147, "step": 62500 }, { "epoch": 0.9769713886950454, "grad_norm": 6.077672481536865, "learning_rate": 3.371714352174924e-05, "loss": 2.9144, "step": 63000 }, { "epoch": 0.9847251298751648, "grad_norm": 6.114253520965576, "learning_rate": 3.3587914502080584e-05, "loss": 2.9025, "step": 63500 }, { "epoch": 0.9924788710552842, "grad_norm": 6.094882965087891, "learning_rate": 3.345868548241193e-05, "loss": 2.9058, "step": 64000 }, { "epoch": 1.0002326122354035, "grad_norm": 5.9865498542785645, "learning_rate": 3.3329456462743274e-05, "loss": 2.9067, "step": 64500 }, { "epoch": 1.007986353415523, "grad_norm": 5.904115200042725, "learning_rate": 3.3200227443074615e-05, "loss": 2.8973, "step": 65000 }, { "epoch": 1.0157400945956423, "grad_norm": 6.1497392654418945, "learning_rate": 3.307099842340596e-05, "loss": 2.897, "step": 65500 }, { "epoch": 1.0234938357757617, "grad_norm": 6.136323928833008, "learning_rate": 3.2941769403737304e-05, "loss": 2.9023, "step": 66000 }, { "epoch": 1.0312475769558813, "grad_norm": 6.581076145172119, "learning_rate": 3.2812540384068646e-05, "loss": 2.8962, "step": 66500 }, { "epoch": 1.0390013181360007, "grad_norm": 6.236713886260986, "learning_rate": 3.2683311364399994e-05, "loss": 2.8953, "step": 67000 }, { "epoch": 1.0467550593161201, "grad_norm": 6.375150680541992, "learning_rate": 3.2554082344731335e-05, "loss": 2.8951, "step": 67500 }, { "epoch": 1.0545088004962395, "grad_norm": 6.454219818115234, "learning_rate": 3.2424853325062676e-05, "loss": 2.8902, "step": 68000 }, { "epoch": 1.062262541676359, "grad_norm": 6.4552226066589355, "learning_rate": 3.2295624305394024e-05, "loss": 2.9028, "step": 68500 }, { "epoch": 1.0700162828564783, "grad_norm": 6.323915481567383, "learning_rate": 3.2166395285725366e-05, "loss": 2.8793, "step": 69000 }, { "epoch": 1.0777700240365977, "grad_norm": 6.413472652435303, "learning_rate": 3.203716626605671e-05, "loss": 2.8704, "step": 69500 }, { "epoch": 1.0855237652167171, "grad_norm": 6.230509281158447, "learning_rate": 3.190793724638805e-05, "loss": 2.8867, "step": 70000 }, { "epoch": 1.0932775063968365, "grad_norm": 6.334902286529541, "learning_rate": 3.1778708226719396e-05, "loss": 2.8851, "step": 70500 }, { "epoch": 1.101031247576956, "grad_norm": 6.225689888000488, "learning_rate": 3.164947920705074e-05, "loss": 2.8783, "step": 71000 }, { "epoch": 1.1087849887570753, "grad_norm": 6.088467597961426, "learning_rate": 3.152025018738208e-05, "loss": 2.8837, "step": 71500 }, { "epoch": 1.1165387299371947, "grad_norm": 6.089876174926758, "learning_rate": 3.139102116771343e-05, "loss": 2.8681, "step": 72000 }, { "epoch": 1.1242924711173141, "grad_norm": 6.693448066711426, "learning_rate": 3.126179214804477e-05, "loss": 2.8678, "step": 72500 }, { "epoch": 1.1320462122974335, "grad_norm": 6.45464563369751, "learning_rate": 3.113256312837611e-05, "loss": 2.8717, "step": 73000 }, { "epoch": 1.139799953477553, "grad_norm": 6.7853803634643555, "learning_rate": 3.100333410870746e-05, "loss": 2.8607, "step": 73500 }, { "epoch": 1.1475536946576723, "grad_norm": 6.709279537200928, "learning_rate": 3.08741050890388e-05, "loss": 2.862, "step": 74000 }, { "epoch": 1.1553074358377917, "grad_norm": 6.519172668457031, "learning_rate": 3.074487606937014e-05, "loss": 2.8704, "step": 74500 }, { "epoch": 1.1630611770179111, "grad_norm": 6.5100507736206055, "learning_rate": 3.061564704970148e-05, "loss": 2.8636, "step": 75000 }, { "epoch": 1.1708149181980305, "grad_norm": 6.233548164367676, "learning_rate": 3.0486418030032826e-05, "loss": 2.8773, "step": 75500 }, { "epoch": 1.17856865937815, "grad_norm": 6.704037666320801, "learning_rate": 3.0357189010364168e-05, "loss": 2.8704, "step": 76000 }, { "epoch": 1.1863224005582693, "grad_norm": 6.638470649719238, "learning_rate": 3.0227959990695516e-05, "loss": 2.8708, "step": 76500 }, { "epoch": 1.1940761417383887, "grad_norm": 6.931632995605469, "learning_rate": 3.0098730971026857e-05, "loss": 2.8619, "step": 77000 }, { "epoch": 1.2018298829185081, "grad_norm": 6.729213714599609, "learning_rate": 2.99695019513582e-05, "loss": 2.8558, "step": 77500 }, { "epoch": 1.2095836240986275, "grad_norm": 6.931024551391602, "learning_rate": 2.984027293168954e-05, "loss": 2.8559, "step": 78000 }, { "epoch": 1.217337365278747, "grad_norm": 6.658525466918945, "learning_rate": 2.9711043912020888e-05, "loss": 2.8573, "step": 78500 }, { "epoch": 1.2250911064588663, "grad_norm": 6.135016918182373, "learning_rate": 2.958181489235223e-05, "loss": 2.8604, "step": 79000 }, { "epoch": 1.2328448476389857, "grad_norm": 6.685146331787109, "learning_rate": 2.945258587268357e-05, "loss": 2.844, "step": 79500 }, { "epoch": 1.2405985888191053, "grad_norm": 6.7349982261657715, "learning_rate": 2.9323356853014915e-05, "loss": 2.8451, "step": 80000 }, { "epoch": 1.2483523299992245, "grad_norm": 6.538317680358887, "learning_rate": 2.9194127833346256e-05, "loss": 2.8531, "step": 80500 }, { "epoch": 1.2561060711793441, "grad_norm": 6.23037576675415, "learning_rate": 2.9064898813677598e-05, "loss": 2.8454, "step": 81000 }, { "epoch": 1.2638598123594633, "grad_norm": 6.324411392211914, "learning_rate": 2.8935669794008946e-05, "loss": 2.8462, "step": 81500 }, { "epoch": 1.271613553539583, "grad_norm": 6.693195343017578, "learning_rate": 2.8806440774340287e-05, "loss": 2.8382, "step": 82000 }, { "epoch": 1.2793672947197021, "grad_norm": 6.47921085357666, "learning_rate": 2.8677211754671628e-05, "loss": 2.8492, "step": 82500 }, { "epoch": 1.2871210358998217, "grad_norm": 6.768202304840088, "learning_rate": 2.8547982735002976e-05, "loss": 2.8344, "step": 83000 }, { "epoch": 1.2948747770799411, "grad_norm": 6.594978332519531, "learning_rate": 2.8418753715334318e-05, "loss": 2.8337, "step": 83500 }, { "epoch": 1.3026285182600605, "grad_norm": 6.703185081481934, "learning_rate": 2.828952469566566e-05, "loss": 2.834, "step": 84000 }, { "epoch": 1.31038225944018, "grad_norm": 6.614627838134766, "learning_rate": 2.8160295675997e-05, "loss": 2.835, "step": 84500 }, { "epoch": 1.3181360006202993, "grad_norm": 6.38785457611084, "learning_rate": 2.8031066656328348e-05, "loss": 2.8439, "step": 85000 }, { "epoch": 1.3258897418004187, "grad_norm": 6.479560852050781, "learning_rate": 2.790183763665969e-05, "loss": 2.8338, "step": 85500 }, { "epoch": 1.3336434829805381, "grad_norm": 6.241596698760986, "learning_rate": 2.777260861699103e-05, "loss": 2.8304, "step": 86000 }, { "epoch": 1.3413972241606575, "grad_norm": 7.731629371643066, "learning_rate": 2.764337959732238e-05, "loss": 2.828, "step": 86500 }, { "epoch": 1.349150965340777, "grad_norm": 6.706104278564453, "learning_rate": 2.751415057765372e-05, "loss": 2.8297, "step": 87000 }, { "epoch": 1.3569047065208963, "grad_norm": 6.772350311279297, "learning_rate": 2.738492155798506e-05, "loss": 2.8331, "step": 87500 }, { "epoch": 1.3646584477010157, "grad_norm": 6.890201091766357, "learning_rate": 2.7255692538316406e-05, "loss": 2.8286, "step": 88000 }, { "epoch": 1.3724121888811351, "grad_norm": 6.540558815002441, "learning_rate": 2.7126463518647748e-05, "loss": 2.8306, "step": 88500 }, { "epoch": 1.3801659300612545, "grad_norm": 6.890176773071289, "learning_rate": 2.699723449897909e-05, "loss": 2.827, "step": 89000 }, { "epoch": 1.387919671241374, "grad_norm": 6.634540557861328, "learning_rate": 2.6868005479310437e-05, "loss": 2.8263, "step": 89500 }, { "epoch": 1.3956734124214933, "grad_norm": 7.228022575378418, "learning_rate": 2.6738776459641778e-05, "loss": 2.8365, "step": 90000 }, { "epoch": 1.4034271536016127, "grad_norm": 6.7347869873046875, "learning_rate": 2.660954743997312e-05, "loss": 2.825, "step": 90500 }, { "epoch": 1.4111808947817321, "grad_norm": 6.936458110809326, "learning_rate": 2.6480318420304468e-05, "loss": 2.8266, "step": 91000 }, { "epoch": 1.4189346359618515, "grad_norm": 6.489315509796143, "learning_rate": 2.635108940063581e-05, "loss": 2.8173, "step": 91500 }, { "epoch": 1.426688377141971, "grad_norm": 7.1031012535095215, "learning_rate": 2.622186038096715e-05, "loss": 2.8114, "step": 92000 }, { "epoch": 1.4344421183220903, "grad_norm": 6.918934345245361, "learning_rate": 2.609263136129849e-05, "loss": 2.8252, "step": 92500 }, { "epoch": 1.4421958595022097, "grad_norm": 6.686205863952637, "learning_rate": 2.596340234162984e-05, "loss": 2.8138, "step": 93000 }, { "epoch": 1.4499496006823291, "grad_norm": 6.464860439300537, "learning_rate": 2.583417332196118e-05, "loss": 2.8284, "step": 93500 }, { "epoch": 1.4577033418624485, "grad_norm": 6.871826171875, "learning_rate": 2.5704944302292522e-05, "loss": 2.8215, "step": 94000 }, { "epoch": 1.4654570830425682, "grad_norm": 6.555510520935059, "learning_rate": 2.557571528262387e-05, "loss": 2.8166, "step": 94500 }, { "epoch": 1.4732108242226873, "grad_norm": 6.931303977966309, "learning_rate": 2.544648626295521e-05, "loss": 2.8218, "step": 95000 }, { "epoch": 1.480964565402807, "grad_norm": 6.583662509918213, "learning_rate": 2.5317257243286553e-05, "loss": 2.8075, "step": 95500 }, { "epoch": 1.4887183065829261, "grad_norm": 6.624995231628418, "learning_rate": 2.5188028223617897e-05, "loss": 2.8281, "step": 96000 }, { "epoch": 1.4964720477630458, "grad_norm": 6.899562835693359, "learning_rate": 2.505879920394924e-05, "loss": 2.8195, "step": 96500 }, { "epoch": 1.504225788943165, "grad_norm": 6.846054553985596, "learning_rate": 2.4929570184280583e-05, "loss": 2.8149, "step": 97000 }, { "epoch": 1.5119795301232846, "grad_norm": 6.590377330780029, "learning_rate": 2.4800341164611925e-05, "loss": 2.7984, "step": 97500 }, { "epoch": 1.5197332713034037, "grad_norm": 6.771044731140137, "learning_rate": 2.467111214494327e-05, "loss": 2.82, "step": 98000 }, { "epoch": 1.5274870124835234, "grad_norm": 6.996868133544922, "learning_rate": 2.454188312527461e-05, "loss": 2.803, "step": 98500 }, { "epoch": 1.5352407536636425, "grad_norm": 6.782078742980957, "learning_rate": 2.4412654105605955e-05, "loss": 2.8156, "step": 99000 }, { "epoch": 1.5429944948437622, "grad_norm": 7.141603469848633, "learning_rate": 2.42834250859373e-05, "loss": 2.8081, "step": 99500 }, { "epoch": 1.5507482360238816, "grad_norm": 7.204538822174072, "learning_rate": 2.415419606626864e-05, "loss": 2.7939, "step": 100000 }, { "epoch": 1.558501977204001, "grad_norm": 7.218080043792725, "learning_rate": 2.4024967046599986e-05, "loss": 2.8011, "step": 100500 }, { "epoch": 1.5662557183841204, "grad_norm": 6.774955749511719, "learning_rate": 2.3895738026931327e-05, "loss": 2.8086, "step": 101000 }, { "epoch": 1.5740094595642398, "grad_norm": 6.7942657470703125, "learning_rate": 2.3766509007262672e-05, "loss": 2.7981, "step": 101500 }, { "epoch": 1.5817632007443592, "grad_norm": 6.575582027435303, "learning_rate": 2.3637279987594017e-05, "loss": 2.8048, "step": 102000 }, { "epoch": 1.5895169419244786, "grad_norm": 6.921658515930176, "learning_rate": 2.3508050967925358e-05, "loss": 2.8001, "step": 102500 }, { "epoch": 1.597270683104598, "grad_norm": 7.207976341247559, "learning_rate": 2.3378821948256703e-05, "loss": 2.8112, "step": 103000 }, { "epoch": 1.6050244242847174, "grad_norm": 7.6573710441589355, "learning_rate": 2.3249592928588047e-05, "loss": 2.8044, "step": 103500 }, { "epoch": 1.6127781654648368, "grad_norm": 7.072439670562744, "learning_rate": 2.312036390891939e-05, "loss": 2.7871, "step": 104000 }, { "epoch": 1.6205319066449562, "grad_norm": 7.3211259841918945, "learning_rate": 2.299113488925073e-05, "loss": 2.7921, "step": 104500 }, { "epoch": 1.6282856478250756, "grad_norm": 7.107245445251465, "learning_rate": 2.2861905869582075e-05, "loss": 2.8031, "step": 105000 }, { "epoch": 1.636039389005195, "grad_norm": 6.947020530700684, "learning_rate": 2.2732676849913416e-05, "loss": 2.7869, "step": 105500 }, { "epoch": 1.6437931301853144, "grad_norm": 7.2329301834106445, "learning_rate": 2.260344783024476e-05, "loss": 2.7876, "step": 106000 }, { "epoch": 1.6515468713654338, "grad_norm": 6.861079692840576, "learning_rate": 2.2474218810576102e-05, "loss": 2.7875, "step": 106500 }, { "epoch": 1.6593006125455534, "grad_norm": 7.405232906341553, "learning_rate": 2.2344989790907447e-05, "loss": 2.7955, "step": 107000 }, { "epoch": 1.6670543537256726, "grad_norm": 7.370352745056152, "learning_rate": 2.221576077123879e-05, "loss": 2.7931, "step": 107500 }, { "epoch": 1.6748080949057922, "grad_norm": 7.008327960968018, "learning_rate": 2.2086531751570133e-05, "loss": 2.7936, "step": 108000 }, { "epoch": 1.6825618360859114, "grad_norm": 7.268221378326416, "learning_rate": 2.1957302731901477e-05, "loss": 2.7969, "step": 108500 }, { "epoch": 1.690315577266031, "grad_norm": 6.869812488555908, "learning_rate": 2.182807371223282e-05, "loss": 2.7879, "step": 109000 }, { "epoch": 1.6980693184461502, "grad_norm": 6.86647891998291, "learning_rate": 2.1698844692564163e-05, "loss": 2.7992, "step": 109500 }, { "epoch": 1.7058230596262698, "grad_norm": 7.082624435424805, "learning_rate": 2.1569615672895508e-05, "loss": 2.7829, "step": 110000 }, { "epoch": 1.713576800806389, "grad_norm": 6.880459308624268, "learning_rate": 2.144038665322685e-05, "loss": 2.7797, "step": 110500 }, { "epoch": 1.7213305419865086, "grad_norm": 7.15917444229126, "learning_rate": 2.1311157633558194e-05, "loss": 2.7952, "step": 111000 }, { "epoch": 1.7290842831666278, "grad_norm": 7.239593982696533, "learning_rate": 2.118192861388954e-05, "loss": 2.7836, "step": 111500 }, { "epoch": 1.7368380243467474, "grad_norm": 6.907558441162109, "learning_rate": 2.105269959422088e-05, "loss": 2.7974, "step": 112000 }, { "epoch": 1.7445917655268666, "grad_norm": 7.07895040512085, "learning_rate": 2.0923470574552225e-05, "loss": 2.7847, "step": 112500 }, { "epoch": 1.7523455067069862, "grad_norm": 6.944314956665039, "learning_rate": 2.0794241554883566e-05, "loss": 2.7875, "step": 113000 }, { "epoch": 1.7600992478871054, "grad_norm": 6.936674118041992, "learning_rate": 2.0665012535214907e-05, "loss": 2.7743, "step": 113500 }, { "epoch": 1.767852989067225, "grad_norm": 7.579113960266113, "learning_rate": 2.0535783515546252e-05, "loss": 2.7748, "step": 114000 }, { "epoch": 1.7756067302473444, "grad_norm": 6.939824104309082, "learning_rate": 2.0406554495877593e-05, "loss": 2.7813, "step": 114500 }, { "epoch": 1.7833604714274638, "grad_norm": 6.831909656524658, "learning_rate": 2.0277325476208938e-05, "loss": 2.7755, "step": 115000 }, { "epoch": 1.7911142126075832, "grad_norm": 6.841889381408691, "learning_rate": 2.0148096456540283e-05, "loss": 2.7833, "step": 115500 }, { "epoch": 1.7988679537877026, "grad_norm": 6.934596538543701, "learning_rate": 2.0018867436871624e-05, "loss": 2.7816, "step": 116000 }, { "epoch": 1.806621694967822, "grad_norm": 7.232493877410889, "learning_rate": 1.988963841720297e-05, "loss": 2.7718, "step": 116500 }, { "epoch": 1.8143754361479414, "grad_norm": 6.8913421630859375, "learning_rate": 1.976040939753431e-05, "loss": 2.7806, "step": 117000 }, { "epoch": 1.8221291773280608, "grad_norm": 7.189756393432617, "learning_rate": 1.9631180377865655e-05, "loss": 2.7696, "step": 117500 }, { "epoch": 1.8298829185081802, "grad_norm": 7.165264129638672, "learning_rate": 1.9501951358197e-05, "loss": 2.772, "step": 118000 }, { "epoch": 1.8376366596882996, "grad_norm": 7.418449878692627, "learning_rate": 1.937272233852834e-05, "loss": 2.7673, "step": 118500 }, { "epoch": 1.845390400868419, "grad_norm": 7.016151428222656, "learning_rate": 1.9243493318859685e-05, "loss": 2.7721, "step": 119000 }, { "epoch": 1.8531441420485384, "grad_norm": 7.69176721572876, "learning_rate": 1.9114264299191027e-05, "loss": 2.7712, "step": 119500 }, { "epoch": 1.8608978832286578, "grad_norm": 7.078608512878418, "learning_rate": 1.898503527952237e-05, "loss": 2.7626, "step": 120000 }, { "epoch": 1.8686516244087772, "grad_norm": 7.167757034301758, "learning_rate": 1.8855806259853716e-05, "loss": 2.7757, "step": 120500 }, { "epoch": 1.8764053655888966, "grad_norm": 7.261277675628662, "learning_rate": 1.8726577240185057e-05, "loss": 2.7638, "step": 121000 }, { "epoch": 1.8841591067690162, "grad_norm": 7.32155179977417, "learning_rate": 1.85973482205164e-05, "loss": 2.7743, "step": 121500 }, { "epoch": 1.8919128479491354, "grad_norm": 7.3756103515625, "learning_rate": 1.8468119200847743e-05, "loss": 2.7751, "step": 122000 }, { "epoch": 1.899666589129255, "grad_norm": 7.422321796417236, "learning_rate": 1.8338890181179085e-05, "loss": 2.7766, "step": 122500 }, { "epoch": 1.9074203303093742, "grad_norm": 7.091059684753418, "learning_rate": 1.820966116151043e-05, "loss": 2.7643, "step": 123000 }, { "epoch": 1.9151740714894938, "grad_norm": 6.582401275634766, "learning_rate": 1.808043214184177e-05, "loss": 2.7665, "step": 123500 }, { "epoch": 1.922927812669613, "grad_norm": 7.574552536010742, "learning_rate": 1.7951203122173115e-05, "loss": 2.7548, "step": 124000 }, { "epoch": 1.9306815538497326, "grad_norm": 7.082491874694824, "learning_rate": 1.782197410250446e-05, "loss": 2.7577, "step": 124500 }, { "epoch": 1.9384352950298518, "grad_norm": 7.546943187713623, "learning_rate": 1.76927450828358e-05, "loss": 2.7709, "step": 125000 }, { "epoch": 1.9461890362099714, "grad_norm": 7.498143196105957, "learning_rate": 1.7563516063167146e-05, "loss": 2.7674, "step": 125500 }, { "epoch": 1.9539427773900906, "grad_norm": 7.182895660400391, "learning_rate": 1.743428704349849e-05, "loss": 2.7595, "step": 126000 }, { "epoch": 1.9616965185702102, "grad_norm": 7.754599094390869, "learning_rate": 1.7305058023829832e-05, "loss": 2.7586, "step": 126500 }, { "epoch": 1.9694502597503294, "grad_norm": 7.348043918609619, "learning_rate": 1.7175829004161177e-05, "loss": 2.7671, "step": 127000 }, { "epoch": 1.977204000930449, "grad_norm": 7.5025835037231445, "learning_rate": 1.7046599984492518e-05, "loss": 2.7596, "step": 127500 }, { "epoch": 1.9849577421105682, "grad_norm": 7.277556896209717, "learning_rate": 1.6917370964823863e-05, "loss": 2.7554, "step": 128000 }, { "epoch": 1.9927114832906878, "grad_norm": 7.15894079208374, "learning_rate": 1.6788141945155207e-05, "loss": 2.758, "step": 128500 }, { "epoch": 2.000465224470807, "grad_norm": 7.221950054168701, "learning_rate": 1.665891292548655e-05, "loss": 2.7564, "step": 129000 }, { "epoch": 2.0082189656509266, "grad_norm": 7.185346603393555, "learning_rate": 1.6529683905817893e-05, "loss": 2.7671, "step": 129500 }, { "epoch": 2.015972706831046, "grad_norm": 7.411344528198242, "learning_rate": 1.6400454886149234e-05, "loss": 2.7698, "step": 130000 }, { "epoch": 2.0237264480111654, "grad_norm": 7.3418498039245605, "learning_rate": 1.6271225866480576e-05, "loss": 2.7508, "step": 130500 }, { "epoch": 2.0314801891912846, "grad_norm": 7.749533176422119, "learning_rate": 1.614199684681192e-05, "loss": 2.7643, "step": 131000 }, { "epoch": 2.039233930371404, "grad_norm": 7.400169372558594, "learning_rate": 1.6012767827143262e-05, "loss": 2.7523, "step": 131500 }, { "epoch": 2.0469876715515234, "grad_norm": 6.999739646911621, "learning_rate": 1.5883538807474606e-05, "loss": 2.7525, "step": 132000 }, { "epoch": 2.054741412731643, "grad_norm": 7.423766613006592, "learning_rate": 1.575430978780595e-05, "loss": 2.7497, "step": 132500 }, { "epoch": 2.0624951539117626, "grad_norm": 7.121034622192383, "learning_rate": 1.5625080768137292e-05, "loss": 2.7644, "step": 133000 }, { "epoch": 2.070248895091882, "grad_norm": 7.697927951812744, "learning_rate": 1.5495851748468637e-05, "loss": 2.7599, "step": 133500 }, { "epoch": 2.0780026362720014, "grad_norm": 7.259957313537598, "learning_rate": 1.5366622728799982e-05, "loss": 2.7569, "step": 134000 }, { "epoch": 2.0857563774521206, "grad_norm": 7.2549943923950195, "learning_rate": 1.5237393709131323e-05, "loss": 2.764, "step": 134500 }, { "epoch": 2.0935101186322402, "grad_norm": 7.1079535484313965, "learning_rate": 1.5108164689462668e-05, "loss": 2.7463, "step": 135000 }, { "epoch": 2.1012638598123594, "grad_norm": 7.20269250869751, "learning_rate": 1.4978935669794009e-05, "loss": 2.7464, "step": 135500 }, { "epoch": 2.109017600992479, "grad_norm": 7.686685085296631, "learning_rate": 1.4849706650125352e-05, "loss": 2.7546, "step": 136000 }, { "epoch": 2.116771342172598, "grad_norm": 6.865842342376709, "learning_rate": 1.4720477630456697e-05, "loss": 2.743, "step": 136500 }, { "epoch": 2.124525083352718, "grad_norm": 6.892743110656738, "learning_rate": 1.4591248610788038e-05, "loss": 2.7597, "step": 137000 }, { "epoch": 2.132278824532837, "grad_norm": 7.216090679168701, "learning_rate": 1.4462019591119383e-05, "loss": 2.7638, "step": 137500 }, { "epoch": 2.1400325657129566, "grad_norm": 7.859537601470947, "learning_rate": 1.4332790571450724e-05, "loss": 2.7522, "step": 138000 }, { "epoch": 2.147786306893076, "grad_norm": 7.197884559631348, "learning_rate": 1.4203561551782069e-05, "loss": 2.7524, "step": 138500 }, { "epoch": 2.1555400480731954, "grad_norm": 7.401318550109863, "learning_rate": 1.4074332532113413e-05, "loss": 2.7629, "step": 139000 }, { "epoch": 2.1632937892533146, "grad_norm": 7.095146656036377, "learning_rate": 1.3945103512444755e-05, "loss": 2.7571, "step": 139500 }, { "epoch": 2.1710475304334342, "grad_norm": 7.200826168060303, "learning_rate": 1.38158744927761e-05, "loss": 2.7411, "step": 140000 }, { "epoch": 2.1788012716135534, "grad_norm": 7.727132797241211, "learning_rate": 1.3686645473107442e-05, "loss": 2.7518, "step": 140500 }, { "epoch": 2.186555012793673, "grad_norm": 7.624775409698486, "learning_rate": 1.3557416453438784e-05, "loss": 2.7461, "step": 141000 }, { "epoch": 2.194308753973792, "grad_norm": 7.7125935554504395, "learning_rate": 1.3428187433770128e-05, "loss": 2.7506, "step": 141500 }, { "epoch": 2.202062495153912, "grad_norm": 7.944740295410156, "learning_rate": 1.329895841410147e-05, "loss": 2.7435, "step": 142000 }, { "epoch": 2.209816236334031, "grad_norm": 7.168126106262207, "learning_rate": 1.3169729394432814e-05, "loss": 2.7577, "step": 142500 }, { "epoch": 2.2175699775141506, "grad_norm": 7.608551979064941, "learning_rate": 1.3040500374764159e-05, "loss": 2.7514, "step": 143000 }, { "epoch": 2.22532371869427, "grad_norm": 7.155666351318359, "learning_rate": 1.29112713550955e-05, "loss": 2.756, "step": 143500 }, { "epoch": 2.2330774598743894, "grad_norm": 7.49126672744751, "learning_rate": 1.2782042335426845e-05, "loss": 2.7452, "step": 144000 }, { "epoch": 2.2408312010545086, "grad_norm": 7.515799522399902, "learning_rate": 1.2652813315758188e-05, "loss": 2.7485, "step": 144500 }, { "epoch": 2.2485849422346282, "grad_norm": 7.652871608734131, "learning_rate": 1.252358429608953e-05, "loss": 2.7519, "step": 145000 }, { "epoch": 2.2563386834147474, "grad_norm": 7.006313800811768, "learning_rate": 1.2394355276420874e-05, "loss": 2.7401, "step": 145500 }, { "epoch": 2.264092424594867, "grad_norm": 7.337978839874268, "learning_rate": 1.2265126256752217e-05, "loss": 2.7458, "step": 146000 }, { "epoch": 2.2718461657749867, "grad_norm": 7.185283184051514, "learning_rate": 1.213589723708356e-05, "loss": 2.7546, "step": 146500 }, { "epoch": 2.279599906955106, "grad_norm": 7.885451316833496, "learning_rate": 1.2006668217414903e-05, "loss": 2.738, "step": 147000 }, { "epoch": 2.287353648135225, "grad_norm": 7.163339138031006, "learning_rate": 1.1877439197746248e-05, "loss": 2.7403, "step": 147500 }, { "epoch": 2.2951073893153446, "grad_norm": 7.566407680511475, "learning_rate": 1.174821017807759e-05, "loss": 2.7441, "step": 148000 }, { "epoch": 2.3028611304954643, "grad_norm": 7.626791477203369, "learning_rate": 1.1618981158408934e-05, "loss": 2.7442, "step": 148500 }, { "epoch": 2.3106148716755834, "grad_norm": 7.609415054321289, "learning_rate": 1.1489752138740275e-05, "loss": 2.7442, "step": 149000 }, { "epoch": 2.318368612855703, "grad_norm": 7.549880504608154, "learning_rate": 1.136052311907162e-05, "loss": 2.7398, "step": 149500 }, { "epoch": 2.3261223540358222, "grad_norm": 7.753575325012207, "learning_rate": 1.1231294099402963e-05, "loss": 2.7376, "step": 150000 }, { "epoch": 2.333876095215942, "grad_norm": 7.579866886138916, "learning_rate": 1.1102065079734306e-05, "loss": 2.7449, "step": 150500 }, { "epoch": 2.341629836396061, "grad_norm": 7.787561893463135, "learning_rate": 1.0972836060065649e-05, "loss": 2.7418, "step": 151000 }, { "epoch": 2.3493835775761807, "grad_norm": 7.163692474365234, "learning_rate": 1.0843607040396992e-05, "loss": 2.7459, "step": 151500 }, { "epoch": 2.3571373187563, "grad_norm": 8.124524116516113, "learning_rate": 1.0714378020728336e-05, "loss": 2.7472, "step": 152000 }, { "epoch": 2.3648910599364195, "grad_norm": 7.68442964553833, "learning_rate": 1.058514900105968e-05, "loss": 2.7454, "step": 152500 }, { "epoch": 2.3726448011165386, "grad_norm": 7.561328887939453, "learning_rate": 1.045591998139102e-05, "loss": 2.7314, "step": 153000 }, { "epoch": 2.3803985422966583, "grad_norm": 7.480719566345215, "learning_rate": 1.0326690961722364e-05, "loss": 2.7385, "step": 153500 }, { "epoch": 2.3881522834767774, "grad_norm": 7.676718235015869, "learning_rate": 1.0197461942053708e-05, "loss": 2.7363, "step": 154000 }, { "epoch": 2.395906024656897, "grad_norm": 7.30204963684082, "learning_rate": 1.0068232922385051e-05, "loss": 2.7469, "step": 154500 }, { "epoch": 2.4036597658370162, "grad_norm": 7.684243202209473, "learning_rate": 9.939003902716394e-06, "loss": 2.7404, "step": 155000 }, { "epoch": 2.411413507017136, "grad_norm": 7.187122821807861, "learning_rate": 9.809774883047737e-06, "loss": 2.7364, "step": 155500 }, { "epoch": 2.419167248197255, "grad_norm": 7.586068153381348, "learning_rate": 9.680545863379082e-06, "loss": 2.7331, "step": 156000 }, { "epoch": 2.4269209893773747, "grad_norm": 7.374856948852539, "learning_rate": 9.551316843710425e-06, "loss": 2.739, "step": 156500 }, { "epoch": 2.434674730557494, "grad_norm": 7.3092474937438965, "learning_rate": 9.422087824041768e-06, "loss": 2.7412, "step": 157000 }, { "epoch": 2.4424284717376135, "grad_norm": 7.9172844886779785, "learning_rate": 9.29285880437311e-06, "loss": 2.7384, "step": 157500 }, { "epoch": 2.4501822129177326, "grad_norm": 7.155998706817627, "learning_rate": 9.163629784704454e-06, "loss": 2.7324, "step": 158000 }, { "epoch": 2.4579359540978523, "grad_norm": 7.371484756469727, "learning_rate": 9.034400765035797e-06, "loss": 2.7382, "step": 158500 }, { "epoch": 2.4656896952779714, "grad_norm": 7.271082401275635, "learning_rate": 8.90517174536714e-06, "loss": 2.7238, "step": 159000 }, { "epoch": 2.473443436458091, "grad_norm": 7.525820255279541, "learning_rate": 8.775942725698483e-06, "loss": 2.7353, "step": 159500 }, { "epoch": 2.4811971776382107, "grad_norm": 7.422860622406006, "learning_rate": 8.646713706029828e-06, "loss": 2.7312, "step": 160000 }, { "epoch": 2.48895091881833, "grad_norm": 7.786092758178711, "learning_rate": 8.51748468636117e-06, "loss": 2.729, "step": 160500 }, { "epoch": 2.496704659998449, "grad_norm": 7.733543872833252, "learning_rate": 8.388255666692514e-06, "loss": 2.7377, "step": 161000 }, { "epoch": 2.5044584011785687, "grad_norm": 7.477449417114258, "learning_rate": 8.259026647023855e-06, "loss": 2.7431, "step": 161500 }, { "epoch": 2.5122121423586883, "grad_norm": 7.466070652008057, "learning_rate": 8.1297976273552e-06, "loss": 2.743, "step": 162000 }, { "epoch": 2.5199658835388075, "grad_norm": 7.578529357910156, "learning_rate": 8.000568607686542e-06, "loss": 2.738, "step": 162500 }, { "epoch": 2.5277196247189266, "grad_norm": 7.481320381164551, "learning_rate": 7.871339588017885e-06, "loss": 2.7332, "step": 163000 }, { "epoch": 2.5354733658990463, "grad_norm": 8.073503494262695, "learning_rate": 7.742110568349228e-06, "loss": 2.7517, "step": 163500 }, { "epoch": 2.543227107079166, "grad_norm": 7.4196457862854, "learning_rate": 7.612881548680572e-06, "loss": 2.7231, "step": 164000 }, { "epoch": 2.550980848259285, "grad_norm": 7.558558940887451, "learning_rate": 7.483652529011915e-06, "loss": 2.7384, "step": 164500 }, { "epoch": 2.5587345894394042, "grad_norm": 7.38846492767334, "learning_rate": 7.354423509343258e-06, "loss": 2.7348, "step": 165000 }, { "epoch": 2.566488330619524, "grad_norm": 7.8365864753723145, "learning_rate": 7.225194489674601e-06, "loss": 2.7254, "step": 165500 }, { "epoch": 2.5742420717996435, "grad_norm": 7.362669944763184, "learning_rate": 7.095965470005945e-06, "loss": 2.729, "step": 166000 }, { "epoch": 2.5819958129797627, "grad_norm": 7.646996974945068, "learning_rate": 6.966736450337288e-06, "loss": 2.7333, "step": 166500 }, { "epoch": 2.5897495541598823, "grad_norm": 7.944218158721924, "learning_rate": 6.837507430668631e-06, "loss": 2.7423, "step": 167000 }, { "epoch": 2.5975032953400015, "grad_norm": 7.502200603485107, "learning_rate": 6.708278410999974e-06, "loss": 2.7225, "step": 167500 }, { "epoch": 2.605257036520121, "grad_norm": 7.175666809082031, "learning_rate": 6.579049391331319e-06, "loss": 2.7366, "step": 168000 }, { "epoch": 2.6130107777002403, "grad_norm": 7.814846992492676, "learning_rate": 6.449820371662661e-06, "loss": 2.7421, "step": 168500 }, { "epoch": 2.62076451888036, "grad_norm": 7.270232677459717, "learning_rate": 6.320591351994004e-06, "loss": 2.7335, "step": 169000 }, { "epoch": 2.628518260060479, "grad_norm": 7.920383930206299, "learning_rate": 6.191362332325347e-06, "loss": 2.7352, "step": 169500 }, { "epoch": 2.6362720012405987, "grad_norm": 7.142765998840332, "learning_rate": 6.062133312656691e-06, "loss": 2.7318, "step": 170000 }, { "epoch": 2.644025742420718, "grad_norm": 8.12151050567627, "learning_rate": 5.932904292988034e-06, "loss": 2.7168, "step": 170500 }, { "epoch": 2.6517794836008375, "grad_norm": 7.717370510101318, "learning_rate": 5.803675273319377e-06, "loss": 2.7145, "step": 171000 }, { "epoch": 2.6595332247809567, "grad_norm": 7.359320640563965, "learning_rate": 5.67444625365072e-06, "loss": 2.7382, "step": 171500 }, { "epoch": 2.6672869659610763, "grad_norm": 7.525691509246826, "learning_rate": 5.5452172339820636e-06, "loss": 2.7228, "step": 172000 }, { "epoch": 2.6750407071411955, "grad_norm": 7.967082500457764, "learning_rate": 5.4159882143134066e-06, "loss": 2.7258, "step": 172500 }, { "epoch": 2.682794448321315, "grad_norm": 7.760034561157227, "learning_rate": 5.2867591946447495e-06, "loss": 2.7385, "step": 173000 }, { "epoch": 2.6905481895014347, "grad_norm": 7.141742706298828, "learning_rate": 5.1575301749760925e-06, "loss": 2.7282, "step": 173500 }, { "epoch": 2.698301930681554, "grad_norm": 7.685527801513672, "learning_rate": 5.028301155307436e-06, "loss": 2.7216, "step": 174000 }, { "epoch": 2.706055671861673, "grad_norm": 7.3134236335754395, "learning_rate": 4.899072135638779e-06, "loss": 2.7188, "step": 174500 }, { "epoch": 2.7138094130417927, "grad_norm": 7.750508785247803, "learning_rate": 4.769843115970122e-06, "loss": 2.7277, "step": 175000 }, { "epoch": 2.7215631542219123, "grad_norm": 7.504671096801758, "learning_rate": 4.640614096301465e-06, "loss": 2.738, "step": 175500 }, { "epoch": 2.7293168954020315, "grad_norm": 7.484751224517822, "learning_rate": 4.511385076632809e-06, "loss": 2.7148, "step": 176000 }, { "epoch": 2.7370706365821507, "grad_norm": 7.809044361114502, "learning_rate": 4.382156056964152e-06, "loss": 2.7388, "step": 176500 }, { "epoch": 2.7448243777622703, "grad_norm": 7.876001834869385, "learning_rate": 4.252927037295495e-06, "loss": 2.7214, "step": 177000 }, { "epoch": 2.75257811894239, "grad_norm": 7.753846645355225, "learning_rate": 4.123698017626838e-06, "loss": 2.719, "step": 177500 }, { "epoch": 2.760331860122509, "grad_norm": 7.285833358764648, "learning_rate": 3.994468997958182e-06, "loss": 2.7347, "step": 178000 }, { "epoch": 2.7680856013026283, "grad_norm": 7.894680023193359, "learning_rate": 3.865239978289525e-06, "loss": 2.7376, "step": 178500 }, { "epoch": 2.775839342482748, "grad_norm": 7.850667953491211, "learning_rate": 3.7360109586208684e-06, "loss": 2.7289, "step": 179000 }, { "epoch": 2.7835930836628675, "grad_norm": 7.380823135375977, "learning_rate": 3.606781938952211e-06, "loss": 2.7289, "step": 179500 }, { "epoch": 2.7913468248429867, "grad_norm": 7.752573490142822, "learning_rate": 3.477552919283555e-06, "loss": 2.724, "step": 180000 }, { "epoch": 2.7991005660231063, "grad_norm": 7.117413520812988, "learning_rate": 3.3483238996148974e-06, "loss": 2.7332, "step": 180500 }, { "epoch": 2.8068543072032255, "grad_norm": 7.8615522384643555, "learning_rate": 3.2190948799462412e-06, "loss": 2.7316, "step": 181000 }, { "epoch": 2.814608048383345, "grad_norm": 7.938878059387207, "learning_rate": 3.0898658602775842e-06, "loss": 2.7223, "step": 181500 }, { "epoch": 2.8223617895634643, "grad_norm": 7.760583877563477, "learning_rate": 2.9606368406089272e-06, "loss": 2.7222, "step": 182000 }, { "epoch": 2.830115530743584, "grad_norm": 7.352213382720947, "learning_rate": 2.8314078209402706e-06, "loss": 2.7233, "step": 182500 }, { "epoch": 2.837869271923703, "grad_norm": 7.541159629821777, "learning_rate": 2.7021788012716136e-06, "loss": 2.7225, "step": 183000 }, { "epoch": 2.8456230131038227, "grad_norm": 7.890182018280029, "learning_rate": 2.572949781602957e-06, "loss": 2.7219, "step": 183500 }, { "epoch": 2.853376754283942, "grad_norm": 7.695311546325684, "learning_rate": 2.4437207619343e-06, "loss": 2.7172, "step": 184000 }, { "epoch": 2.8611304954640615, "grad_norm": 7.7702317237854, "learning_rate": 2.3144917422656434e-06, "loss": 2.728, "step": 184500 }, { "epoch": 2.8688842366441807, "grad_norm": 7.646172046661377, "learning_rate": 2.1852627225969864e-06, "loss": 2.7312, "step": 185000 }, { "epoch": 2.8766379778243003, "grad_norm": 7.06711483001709, "learning_rate": 2.05603370292833e-06, "loss": 2.7175, "step": 185500 }, { "epoch": 2.8843917190044195, "grad_norm": 7.974971294403076, "learning_rate": 1.926804683259673e-06, "loss": 2.7244, "step": 186000 }, { "epoch": 2.892145460184539, "grad_norm": 7.5829315185546875, "learning_rate": 1.797575663591016e-06, "loss": 2.7298, "step": 186500 }, { "epoch": 2.8998992013646583, "grad_norm": 7.224939823150635, "learning_rate": 1.6683466439223592e-06, "loss": 2.7227, "step": 187000 }, { "epoch": 2.907652942544778, "grad_norm": 8.057891845703125, "learning_rate": 1.5391176242537025e-06, "loss": 2.7393, "step": 187500 }, { "epoch": 2.915406683724897, "grad_norm": 7.886134624481201, "learning_rate": 1.4098886045850457e-06, "loss": 2.7264, "step": 188000 }, { "epoch": 2.9231604249050167, "grad_norm": 7.65654993057251, "learning_rate": 1.2806595849163889e-06, "loss": 2.7277, "step": 188500 }, { "epoch": 2.9309141660851363, "grad_norm": 7.524332046508789, "learning_rate": 1.1514305652477323e-06, "loss": 2.7281, "step": 189000 }, { "epoch": 2.9386679072652555, "grad_norm": 7.878385543823242, "learning_rate": 1.0222015455790753e-06, "loss": 2.7275, "step": 189500 }, { "epoch": 2.9464216484453747, "grad_norm": 7.491950035095215, "learning_rate": 8.929725259104185e-07, "loss": 2.7225, "step": 190000 }, { "epoch": 2.9541753896254943, "grad_norm": 8.570446968078613, "learning_rate": 7.637435062417617e-07, "loss": 2.722, "step": 190500 }, { "epoch": 2.961929130805614, "grad_norm": 7.909883975982666, "learning_rate": 6.345144865731049e-07, "loss": 2.7175, "step": 191000 }, { "epoch": 2.969682871985733, "grad_norm": 7.273110389709473, "learning_rate": 5.052854669044481e-07, "loss": 2.7167, "step": 191500 }, { "epoch": 2.9774366131658523, "grad_norm": 7.78535270690918, "learning_rate": 3.760564472357913e-07, "loss": 2.7301, "step": 192000 }, { "epoch": 2.985190354345972, "grad_norm": 7.7455973625183105, "learning_rate": 2.468274275671345e-07, "loss": 2.7192, "step": 192500 }, { "epoch": 2.9929440955260915, "grad_norm": 7.988417148590088, "learning_rate": 1.1759840789847768e-07, "loss": 2.7195, "step": 193000 } ], "logging_steps": 500, "max_steps": 193455, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.6392157746049843e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }