|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.0, |
|
"eval_steps": 500, |
|
"global_step": 193455, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.007753741180119408, |
|
"grad_norm": 2.8055944442749023, |
|
"learning_rate": 4.9870770980331344e-05, |
|
"loss": 8.6951, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.015507482360238816, |
|
"grad_norm": 2.7864086627960205, |
|
"learning_rate": 4.974154196066269e-05, |
|
"loss": 6.4397, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.023261223540358224, |
|
"grad_norm": 2.835425853729248, |
|
"learning_rate": 4.961231294099403e-05, |
|
"loss": 5.8474, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.03101496472047763, |
|
"grad_norm": 3.0123701095581055, |
|
"learning_rate": 4.9483083921325374e-05, |
|
"loss": 5.5019, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.038768705900597035, |
|
"grad_norm": 2.9796297550201416, |
|
"learning_rate": 4.935385490165672e-05, |
|
"loss": 5.2444, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.04652244708071645, |
|
"grad_norm": 3.042416572570801, |
|
"learning_rate": 4.9224625881988064e-05, |
|
"loss": 5.0392, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.054276188260835856, |
|
"grad_norm": 2.9367804527282715, |
|
"learning_rate": 4.9095396862319405e-05, |
|
"loss": 4.8613, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.06202992944095526, |
|
"grad_norm": 2.9740755558013916, |
|
"learning_rate": 4.8966167842650746e-05, |
|
"loss": 4.7246, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.06978367062107467, |
|
"grad_norm": 3.364341974258423, |
|
"learning_rate": 4.8836938822982094e-05, |
|
"loss": 4.5969, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.07753741180119407, |
|
"grad_norm": 3.4089107513427734, |
|
"learning_rate": 4.8707709803313436e-05, |
|
"loss": 4.4929, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.08529115298131348, |
|
"grad_norm": 3.324126720428467, |
|
"learning_rate": 4.857848078364478e-05, |
|
"loss": 4.3966, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.0930448941614329, |
|
"grad_norm": 3.3835084438323975, |
|
"learning_rate": 4.8449251763976125e-05, |
|
"loss": 4.3176, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.1007986353415523, |
|
"grad_norm": 3.6219263076782227, |
|
"learning_rate": 4.8320022744307466e-05, |
|
"loss": 4.2318, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.10855237652167171, |
|
"grad_norm": 3.2887094020843506, |
|
"learning_rate": 4.819079372463881e-05, |
|
"loss": 4.161, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.11630611770179111, |
|
"grad_norm": 3.6443707942962646, |
|
"learning_rate": 4.806156470497015e-05, |
|
"loss": 4.0995, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.12405985888191053, |
|
"grad_norm": 3.623699903488159, |
|
"learning_rate": 4.79323356853015e-05, |
|
"loss": 4.0583, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.13181360006202994, |
|
"grad_norm": 3.6302268505096436, |
|
"learning_rate": 4.780310666563284e-05, |
|
"loss": 4.0, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.13956734124214934, |
|
"grad_norm": 3.6418278217315674, |
|
"learning_rate": 4.767387764596418e-05, |
|
"loss": 3.9486, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.14732108242226874, |
|
"grad_norm": 3.635089159011841, |
|
"learning_rate": 4.754464862629552e-05, |
|
"loss": 3.896, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.15507482360238814, |
|
"grad_norm": 3.8315062522888184, |
|
"learning_rate": 4.741541960662686e-05, |
|
"loss": 3.8629, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.16282856478250757, |
|
"grad_norm": 3.7439329624176025, |
|
"learning_rate": 4.728619058695821e-05, |
|
"loss": 3.8255, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.17058230596262697, |
|
"grad_norm": 3.732562303543091, |
|
"learning_rate": 4.715696156728955e-05, |
|
"loss": 3.7887, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.17833604714274637, |
|
"grad_norm": 3.7784509658813477, |
|
"learning_rate": 4.702773254762089e-05, |
|
"loss": 3.749, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.1860897883228658, |
|
"grad_norm": 3.9984617233276367, |
|
"learning_rate": 4.6898503527952234e-05, |
|
"loss": 3.7065, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.1938435295029852, |
|
"grad_norm": 3.954652786254883, |
|
"learning_rate": 4.676927450828358e-05, |
|
"loss": 3.6931, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 0.2015972706831046, |
|
"grad_norm": 3.9958720207214355, |
|
"learning_rate": 4.6640045488614924e-05, |
|
"loss": 3.6545, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.209351011863224, |
|
"grad_norm": 3.922849178314209, |
|
"learning_rate": 4.6510816468946265e-05, |
|
"loss": 3.6207, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 0.21710475304334342, |
|
"grad_norm": 4.092968463897705, |
|
"learning_rate": 4.638158744927761e-05, |
|
"loss": 3.5974, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.22485849422346282, |
|
"grad_norm": 3.8097622394561768, |
|
"learning_rate": 4.6252358429608954e-05, |
|
"loss": 3.5775, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 0.23261223540358222, |
|
"grad_norm": 4.094768047332764, |
|
"learning_rate": 4.6123129409940296e-05, |
|
"loss": 3.5543, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.24036597658370162, |
|
"grad_norm": 3.8095757961273193, |
|
"learning_rate": 4.5993900390271644e-05, |
|
"loss": 3.5392, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 0.24811971776382105, |
|
"grad_norm": 3.8520195484161377, |
|
"learning_rate": 4.5864671370602985e-05, |
|
"loss": 3.5047, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.2558734589439404, |
|
"grad_norm": 4.280955791473389, |
|
"learning_rate": 4.5735442350934326e-05, |
|
"loss": 3.495, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 0.2636272001240599, |
|
"grad_norm": 4.455172538757324, |
|
"learning_rate": 4.5606213331265674e-05, |
|
"loss": 3.4756, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.2713809413041793, |
|
"grad_norm": 4.1695733070373535, |
|
"learning_rate": 4.5476984311597016e-05, |
|
"loss": 3.4505, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 0.2791346824842987, |
|
"grad_norm": 4.223578453063965, |
|
"learning_rate": 4.534775529192836e-05, |
|
"loss": 3.4446, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.2868884236644181, |
|
"grad_norm": 4.616567134857178, |
|
"learning_rate": 4.52185262722597e-05, |
|
"loss": 3.4202, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 0.2946421648445375, |
|
"grad_norm": 4.264205455780029, |
|
"learning_rate": 4.5089297252591046e-05, |
|
"loss": 3.4024, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 0.3023959060246569, |
|
"grad_norm": 4.493732929229736, |
|
"learning_rate": 4.496006823292239e-05, |
|
"loss": 3.397, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 0.3101496472047763, |
|
"grad_norm": 4.321922779083252, |
|
"learning_rate": 4.483083921325373e-05, |
|
"loss": 3.3678, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 0.31790338838489574, |
|
"grad_norm": 4.248241424560547, |
|
"learning_rate": 4.470161019358508e-05, |
|
"loss": 3.3634, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 0.32565712956501514, |
|
"grad_norm": 4.490056037902832, |
|
"learning_rate": 4.457238117391642e-05, |
|
"loss": 3.3529, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 0.33341087074513454, |
|
"grad_norm": 4.652819633483887, |
|
"learning_rate": 4.444315215424776e-05, |
|
"loss": 3.3491, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 0.34116461192525394, |
|
"grad_norm": 4.65127420425415, |
|
"learning_rate": 4.431392313457911e-05, |
|
"loss": 3.3239, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 0.34891835310537334, |
|
"grad_norm": 4.279469013214111, |
|
"learning_rate": 4.418469411491045e-05, |
|
"loss": 3.3061, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 0.35667209428549274, |
|
"grad_norm": 4.562509059906006, |
|
"learning_rate": 4.405546509524179e-05, |
|
"loss": 3.3014, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 0.36442583546561214, |
|
"grad_norm": 4.664194583892822, |
|
"learning_rate": 4.392623607557314e-05, |
|
"loss": 3.2963, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 0.3721795766457316, |
|
"grad_norm": 4.531868934631348, |
|
"learning_rate": 4.379700705590448e-05, |
|
"loss": 3.2776, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 0.379933317825851, |
|
"grad_norm": 4.553177833557129, |
|
"learning_rate": 4.366777803623582e-05, |
|
"loss": 3.2559, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 0.3876870590059704, |
|
"grad_norm": 4.358414649963379, |
|
"learning_rate": 4.353854901656716e-05, |
|
"loss": 3.253, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 0.3954408001860898, |
|
"grad_norm": 4.921834468841553, |
|
"learning_rate": 4.3409319996898503e-05, |
|
"loss": 3.257, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 0.4031945413662092, |
|
"grad_norm": 4.523054599761963, |
|
"learning_rate": 4.3280090977229845e-05, |
|
"loss": 3.2368, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 0.4109482825463286, |
|
"grad_norm": 4.663913249969482, |
|
"learning_rate": 4.315086195756119e-05, |
|
"loss": 3.2314, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 0.418702023726448, |
|
"grad_norm": 4.529434680938721, |
|
"learning_rate": 4.3021632937892534e-05, |
|
"loss": 3.2216, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 0.4264557649065674, |
|
"grad_norm": 4.806913375854492, |
|
"learning_rate": 4.2892403918223875e-05, |
|
"loss": 3.2176, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 0.43420950608668685, |
|
"grad_norm": 4.8940019607543945, |
|
"learning_rate": 4.276317489855522e-05, |
|
"loss": 3.2061, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 0.44196324726680625, |
|
"grad_norm": 4.507465839385986, |
|
"learning_rate": 4.2633945878886565e-05, |
|
"loss": 3.1957, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 0.44971698844692565, |
|
"grad_norm": 5.141822338104248, |
|
"learning_rate": 4.2504716859217906e-05, |
|
"loss": 3.1849, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 0.45747072962704505, |
|
"grad_norm": 4.68912935256958, |
|
"learning_rate": 4.237548783954925e-05, |
|
"loss": 3.1707, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 0.46522447080716445, |
|
"grad_norm": 5.046304225921631, |
|
"learning_rate": 4.2246258819880595e-05, |
|
"loss": 3.1705, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 0.47297821198728385, |
|
"grad_norm": 4.986753940582275, |
|
"learning_rate": 4.211702980021194e-05, |
|
"loss": 3.1641, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 0.48073195316740325, |
|
"grad_norm": 5.041975498199463, |
|
"learning_rate": 4.198780078054328e-05, |
|
"loss": 3.1545, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 0.4884856943475227, |
|
"grad_norm": 5.255220413208008, |
|
"learning_rate": 4.1858571760874626e-05, |
|
"loss": 3.1566, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 0.4962394355276421, |
|
"grad_norm": 5.2884697914123535, |
|
"learning_rate": 4.172934274120597e-05, |
|
"loss": 3.1497, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 0.5039931767077614, |
|
"grad_norm": 4.990237712860107, |
|
"learning_rate": 4.160011372153731e-05, |
|
"loss": 3.1417, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 0.5117469178878808, |
|
"grad_norm": 5.165491104125977, |
|
"learning_rate": 4.147088470186866e-05, |
|
"loss": 3.1227, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 0.5195006590680004, |
|
"grad_norm": 5.079947471618652, |
|
"learning_rate": 4.13416556822e-05, |
|
"loss": 3.1276, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 0.5272544002481198, |
|
"grad_norm": 4.830618858337402, |
|
"learning_rate": 4.121242666253134e-05, |
|
"loss": 3.1195, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 0.5350081414282392, |
|
"grad_norm": 5.067671775817871, |
|
"learning_rate": 4.108319764286268e-05, |
|
"loss": 3.1168, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 0.5427618826083586, |
|
"grad_norm": 5.153751373291016, |
|
"learning_rate": 4.095396862319403e-05, |
|
"loss": 3.1087, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 0.550515623788478, |
|
"grad_norm": 5.0933027267456055, |
|
"learning_rate": 4.082473960352537e-05, |
|
"loss": 3.0945, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 0.5582693649685974, |
|
"grad_norm": 5.372694492340088, |
|
"learning_rate": 4.069551058385671e-05, |
|
"loss": 3.0906, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 0.5660231061487168, |
|
"grad_norm": 5.414623737335205, |
|
"learning_rate": 4.056628156418806e-05, |
|
"loss": 3.0894, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 0.5737768473288362, |
|
"grad_norm": 5.126060962677002, |
|
"learning_rate": 4.04370525445194e-05, |
|
"loss": 3.0848, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 0.5815305885089556, |
|
"grad_norm": 4.877682209014893, |
|
"learning_rate": 4.030782352485074e-05, |
|
"loss": 3.0752, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 0.589284329689075, |
|
"grad_norm": 5.10503625869751, |
|
"learning_rate": 4.017859450518209e-05, |
|
"loss": 3.0678, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 0.5970380708691944, |
|
"grad_norm": 5.342639923095703, |
|
"learning_rate": 4.004936548551343e-05, |
|
"loss": 3.074, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 0.6047918120493138, |
|
"grad_norm": 5.377997875213623, |
|
"learning_rate": 3.992013646584477e-05, |
|
"loss": 3.0633, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 0.6125455532294332, |
|
"grad_norm": 5.535395622253418, |
|
"learning_rate": 3.979090744617612e-05, |
|
"loss": 3.0667, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 0.6202992944095526, |
|
"grad_norm": 5.380632400512695, |
|
"learning_rate": 3.966167842650746e-05, |
|
"loss": 3.042, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 0.6280530355896721, |
|
"grad_norm": 5.2183027267456055, |
|
"learning_rate": 3.95324494068388e-05, |
|
"loss": 3.0488, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 0.6358067767697915, |
|
"grad_norm": 5.565989971160889, |
|
"learning_rate": 3.9403220387170145e-05, |
|
"loss": 3.0477, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 0.6435605179499109, |
|
"grad_norm": 5.33280611038208, |
|
"learning_rate": 3.9273991367501486e-05, |
|
"loss": 3.0429, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 0.6513142591300303, |
|
"grad_norm": 5.605506896972656, |
|
"learning_rate": 3.9144762347832834e-05, |
|
"loss": 3.0423, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 0.6590680003101497, |
|
"grad_norm": 5.62191915512085, |
|
"learning_rate": 3.9015533328164175e-05, |
|
"loss": 3.0205, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 0.6668217414902691, |
|
"grad_norm": 5.318583965301514, |
|
"learning_rate": 3.888630430849552e-05, |
|
"loss": 3.0277, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 0.6745754826703885, |
|
"grad_norm": 5.763892650604248, |
|
"learning_rate": 3.875707528882686e-05, |
|
"loss": 3.0342, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 0.6823292238505079, |
|
"grad_norm": 5.43319034576416, |
|
"learning_rate": 3.86278462691582e-05, |
|
"loss": 3.0134, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 0.6900829650306273, |
|
"grad_norm": 5.857462406158447, |
|
"learning_rate": 3.849861724948955e-05, |
|
"loss": 3.0091, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 0.6978367062107467, |
|
"grad_norm": 5.6617021560668945, |
|
"learning_rate": 3.836938822982089e-05, |
|
"loss": 3.0117, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 0.7055904473908661, |
|
"grad_norm": 5.857166290283203, |
|
"learning_rate": 3.824015921015223e-05, |
|
"loss": 3.0189, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 0.7133441885709855, |
|
"grad_norm": 5.425033092498779, |
|
"learning_rate": 3.811093019048358e-05, |
|
"loss": 3.019, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 0.7210979297511049, |
|
"grad_norm": 5.529470920562744, |
|
"learning_rate": 3.798170117081492e-05, |
|
"loss": 3.002, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 0.7288516709312243, |
|
"grad_norm": 5.534538745880127, |
|
"learning_rate": 3.785247215114626e-05, |
|
"loss": 2.995, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 0.7366054121113437, |
|
"grad_norm": 5.707602024078369, |
|
"learning_rate": 3.772324313147761e-05, |
|
"loss": 2.9942, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 0.7443591532914632, |
|
"grad_norm": 5.565040588378906, |
|
"learning_rate": 3.759401411180895e-05, |
|
"loss": 2.9947, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 0.7521128944715826, |
|
"grad_norm": 5.714329242706299, |
|
"learning_rate": 3.746478509214029e-05, |
|
"loss": 2.9941, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 0.759866635651702, |
|
"grad_norm": 5.562424182891846, |
|
"learning_rate": 3.733555607247163e-05, |
|
"loss": 2.9827, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 0.7676203768318214, |
|
"grad_norm": 5.685086250305176, |
|
"learning_rate": 3.720632705280298e-05, |
|
"loss": 2.9812, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 0.7753741180119408, |
|
"grad_norm": 5.563987731933594, |
|
"learning_rate": 3.707709803313432e-05, |
|
"loss": 2.9781, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 0.7831278591920602, |
|
"grad_norm": 5.545105934143066, |
|
"learning_rate": 3.694786901346566e-05, |
|
"loss": 2.9586, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 0.7908816003721796, |
|
"grad_norm": 5.9238386154174805, |
|
"learning_rate": 3.681863999379701e-05, |
|
"loss": 2.9713, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 0.798635341552299, |
|
"grad_norm": 5.929417133331299, |
|
"learning_rate": 3.668941097412835e-05, |
|
"loss": 2.9623, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 0.8063890827324184, |
|
"grad_norm": 5.734675884246826, |
|
"learning_rate": 3.6560181954459694e-05, |
|
"loss": 2.9659, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 0.8141428239125378, |
|
"grad_norm": 5.726919651031494, |
|
"learning_rate": 3.643095293479104e-05, |
|
"loss": 2.9624, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 0.8218965650926572, |
|
"grad_norm": 5.474340438842773, |
|
"learning_rate": 3.630172391512238e-05, |
|
"loss": 2.9597, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 0.8296503062727766, |
|
"grad_norm": 5.879449367523193, |
|
"learning_rate": 3.6172494895453725e-05, |
|
"loss": 2.9601, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 0.837404047452896, |
|
"grad_norm": 6.051661491394043, |
|
"learning_rate": 3.604326587578507e-05, |
|
"loss": 2.9606, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 0.8451577886330154, |
|
"grad_norm": 6.062263488769531, |
|
"learning_rate": 3.5914036856116414e-05, |
|
"loss": 2.9573, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 0.8529115298131348, |
|
"grad_norm": 5.804770469665527, |
|
"learning_rate": 3.5784807836447755e-05, |
|
"loss": 2.9512, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 0.8606652709932543, |
|
"grad_norm": 5.899106025695801, |
|
"learning_rate": 3.5655578816779097e-05, |
|
"loss": 2.9563, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 0.8684190121733737, |
|
"grad_norm": 5.6610894203186035, |
|
"learning_rate": 3.5526349797110445e-05, |
|
"loss": 2.9468, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 0.8761727533534931, |
|
"grad_norm": 5.676564693450928, |
|
"learning_rate": 3.5397120777441786e-05, |
|
"loss": 2.9442, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 0.8839264945336125, |
|
"grad_norm": 6.026761531829834, |
|
"learning_rate": 3.526789175777313e-05, |
|
"loss": 2.945, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 0.8916802357137319, |
|
"grad_norm": 6.285312652587891, |
|
"learning_rate": 3.5138662738104475e-05, |
|
"loss": 2.935, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 0.8994339768938513, |
|
"grad_norm": 5.787561416625977, |
|
"learning_rate": 3.5009433718435817e-05, |
|
"loss": 2.9395, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 0.9071877180739707, |
|
"grad_norm": 5.658621311187744, |
|
"learning_rate": 3.488020469876716e-05, |
|
"loss": 2.9288, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 0.9149414592540901, |
|
"grad_norm": 5.896640300750732, |
|
"learning_rate": 3.47509756790985e-05, |
|
"loss": 2.9351, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 0.9226952004342095, |
|
"grad_norm": 6.219537734985352, |
|
"learning_rate": 3.462174665942984e-05, |
|
"loss": 2.9257, |
|
"step": 59500 |
|
}, |
|
{ |
|
"epoch": 0.9304489416143289, |
|
"grad_norm": 6.03794527053833, |
|
"learning_rate": 3.449251763976118e-05, |
|
"loss": 2.924, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 0.9382026827944483, |
|
"grad_norm": 6.291288375854492, |
|
"learning_rate": 3.436328862009253e-05, |
|
"loss": 2.9141, |
|
"step": 60500 |
|
}, |
|
{ |
|
"epoch": 0.9459564239745677, |
|
"grad_norm": 6.239747524261475, |
|
"learning_rate": 3.423405960042387e-05, |
|
"loss": 2.9212, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 0.9537101651546871, |
|
"grad_norm": 5.852220058441162, |
|
"learning_rate": 3.410483058075521e-05, |
|
"loss": 2.9187, |
|
"step": 61500 |
|
}, |
|
{ |
|
"epoch": 0.9614639063348065, |
|
"grad_norm": 5.890344142913818, |
|
"learning_rate": 3.397560156108656e-05, |
|
"loss": 2.9167, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 0.969217647514926, |
|
"grad_norm": 6.417314529418945, |
|
"learning_rate": 3.38463725414179e-05, |
|
"loss": 2.9147, |
|
"step": 62500 |
|
}, |
|
{ |
|
"epoch": 0.9769713886950454, |
|
"grad_norm": 6.077672481536865, |
|
"learning_rate": 3.371714352174924e-05, |
|
"loss": 2.9144, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 0.9847251298751648, |
|
"grad_norm": 6.114253520965576, |
|
"learning_rate": 3.3587914502080584e-05, |
|
"loss": 2.9025, |
|
"step": 63500 |
|
}, |
|
{ |
|
"epoch": 0.9924788710552842, |
|
"grad_norm": 6.094882965087891, |
|
"learning_rate": 3.345868548241193e-05, |
|
"loss": 2.9058, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 1.0002326122354035, |
|
"grad_norm": 5.9865498542785645, |
|
"learning_rate": 3.3329456462743274e-05, |
|
"loss": 2.9067, |
|
"step": 64500 |
|
}, |
|
{ |
|
"epoch": 1.007986353415523, |
|
"grad_norm": 5.904115200042725, |
|
"learning_rate": 3.3200227443074615e-05, |
|
"loss": 2.8973, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 1.0157400945956423, |
|
"grad_norm": 6.1497392654418945, |
|
"learning_rate": 3.307099842340596e-05, |
|
"loss": 2.897, |
|
"step": 65500 |
|
}, |
|
{ |
|
"epoch": 1.0234938357757617, |
|
"grad_norm": 6.136323928833008, |
|
"learning_rate": 3.2941769403737304e-05, |
|
"loss": 2.9023, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 1.0312475769558813, |
|
"grad_norm": 6.581076145172119, |
|
"learning_rate": 3.2812540384068646e-05, |
|
"loss": 2.8962, |
|
"step": 66500 |
|
}, |
|
{ |
|
"epoch": 1.0390013181360007, |
|
"grad_norm": 6.236713886260986, |
|
"learning_rate": 3.2683311364399994e-05, |
|
"loss": 2.8953, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 1.0467550593161201, |
|
"grad_norm": 6.375150680541992, |
|
"learning_rate": 3.2554082344731335e-05, |
|
"loss": 2.8951, |
|
"step": 67500 |
|
}, |
|
{ |
|
"epoch": 1.0545088004962395, |
|
"grad_norm": 6.454219818115234, |
|
"learning_rate": 3.2424853325062676e-05, |
|
"loss": 2.8902, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 1.062262541676359, |
|
"grad_norm": 6.4552226066589355, |
|
"learning_rate": 3.2295624305394024e-05, |
|
"loss": 2.9028, |
|
"step": 68500 |
|
}, |
|
{ |
|
"epoch": 1.0700162828564783, |
|
"grad_norm": 6.323915481567383, |
|
"learning_rate": 3.2166395285725366e-05, |
|
"loss": 2.8793, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 1.0777700240365977, |
|
"grad_norm": 6.413472652435303, |
|
"learning_rate": 3.203716626605671e-05, |
|
"loss": 2.8704, |
|
"step": 69500 |
|
}, |
|
{ |
|
"epoch": 1.0855237652167171, |
|
"grad_norm": 6.230509281158447, |
|
"learning_rate": 3.190793724638805e-05, |
|
"loss": 2.8867, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 1.0932775063968365, |
|
"grad_norm": 6.334902286529541, |
|
"learning_rate": 3.1778708226719396e-05, |
|
"loss": 2.8851, |
|
"step": 70500 |
|
}, |
|
{ |
|
"epoch": 1.101031247576956, |
|
"grad_norm": 6.225689888000488, |
|
"learning_rate": 3.164947920705074e-05, |
|
"loss": 2.8783, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 1.1087849887570753, |
|
"grad_norm": 6.088467597961426, |
|
"learning_rate": 3.152025018738208e-05, |
|
"loss": 2.8837, |
|
"step": 71500 |
|
}, |
|
{ |
|
"epoch": 1.1165387299371947, |
|
"grad_norm": 6.089876174926758, |
|
"learning_rate": 3.139102116771343e-05, |
|
"loss": 2.8681, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 1.1242924711173141, |
|
"grad_norm": 6.693448066711426, |
|
"learning_rate": 3.126179214804477e-05, |
|
"loss": 2.8678, |
|
"step": 72500 |
|
}, |
|
{ |
|
"epoch": 1.1320462122974335, |
|
"grad_norm": 6.45464563369751, |
|
"learning_rate": 3.113256312837611e-05, |
|
"loss": 2.8717, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 1.139799953477553, |
|
"grad_norm": 6.7853803634643555, |
|
"learning_rate": 3.100333410870746e-05, |
|
"loss": 2.8607, |
|
"step": 73500 |
|
}, |
|
{ |
|
"epoch": 1.1475536946576723, |
|
"grad_norm": 6.709279537200928, |
|
"learning_rate": 3.08741050890388e-05, |
|
"loss": 2.862, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 1.1553074358377917, |
|
"grad_norm": 6.519172668457031, |
|
"learning_rate": 3.074487606937014e-05, |
|
"loss": 2.8704, |
|
"step": 74500 |
|
}, |
|
{ |
|
"epoch": 1.1630611770179111, |
|
"grad_norm": 6.5100507736206055, |
|
"learning_rate": 3.061564704970148e-05, |
|
"loss": 2.8636, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 1.1708149181980305, |
|
"grad_norm": 6.233548164367676, |
|
"learning_rate": 3.0486418030032826e-05, |
|
"loss": 2.8773, |
|
"step": 75500 |
|
}, |
|
{ |
|
"epoch": 1.17856865937815, |
|
"grad_norm": 6.704037666320801, |
|
"learning_rate": 3.0357189010364168e-05, |
|
"loss": 2.8704, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 1.1863224005582693, |
|
"grad_norm": 6.638470649719238, |
|
"learning_rate": 3.0227959990695516e-05, |
|
"loss": 2.8708, |
|
"step": 76500 |
|
}, |
|
{ |
|
"epoch": 1.1940761417383887, |
|
"grad_norm": 6.931632995605469, |
|
"learning_rate": 3.0098730971026857e-05, |
|
"loss": 2.8619, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 1.2018298829185081, |
|
"grad_norm": 6.729213714599609, |
|
"learning_rate": 2.99695019513582e-05, |
|
"loss": 2.8558, |
|
"step": 77500 |
|
}, |
|
{ |
|
"epoch": 1.2095836240986275, |
|
"grad_norm": 6.931024551391602, |
|
"learning_rate": 2.984027293168954e-05, |
|
"loss": 2.8559, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 1.217337365278747, |
|
"grad_norm": 6.658525466918945, |
|
"learning_rate": 2.9711043912020888e-05, |
|
"loss": 2.8573, |
|
"step": 78500 |
|
}, |
|
{ |
|
"epoch": 1.2250911064588663, |
|
"grad_norm": 6.135016918182373, |
|
"learning_rate": 2.958181489235223e-05, |
|
"loss": 2.8604, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 1.2328448476389857, |
|
"grad_norm": 6.685146331787109, |
|
"learning_rate": 2.945258587268357e-05, |
|
"loss": 2.844, |
|
"step": 79500 |
|
}, |
|
{ |
|
"epoch": 1.2405985888191053, |
|
"grad_norm": 6.7349982261657715, |
|
"learning_rate": 2.9323356853014915e-05, |
|
"loss": 2.8451, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 1.2483523299992245, |
|
"grad_norm": 6.538317680358887, |
|
"learning_rate": 2.9194127833346256e-05, |
|
"loss": 2.8531, |
|
"step": 80500 |
|
}, |
|
{ |
|
"epoch": 1.2561060711793441, |
|
"grad_norm": 6.23037576675415, |
|
"learning_rate": 2.9064898813677598e-05, |
|
"loss": 2.8454, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 1.2638598123594633, |
|
"grad_norm": 6.324411392211914, |
|
"learning_rate": 2.8935669794008946e-05, |
|
"loss": 2.8462, |
|
"step": 81500 |
|
}, |
|
{ |
|
"epoch": 1.271613553539583, |
|
"grad_norm": 6.693195343017578, |
|
"learning_rate": 2.8806440774340287e-05, |
|
"loss": 2.8382, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 1.2793672947197021, |
|
"grad_norm": 6.47921085357666, |
|
"learning_rate": 2.8677211754671628e-05, |
|
"loss": 2.8492, |
|
"step": 82500 |
|
}, |
|
{ |
|
"epoch": 1.2871210358998217, |
|
"grad_norm": 6.768202304840088, |
|
"learning_rate": 2.8547982735002976e-05, |
|
"loss": 2.8344, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 1.2948747770799411, |
|
"grad_norm": 6.594978332519531, |
|
"learning_rate": 2.8418753715334318e-05, |
|
"loss": 2.8337, |
|
"step": 83500 |
|
}, |
|
{ |
|
"epoch": 1.3026285182600605, |
|
"grad_norm": 6.703185081481934, |
|
"learning_rate": 2.828952469566566e-05, |
|
"loss": 2.834, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 1.31038225944018, |
|
"grad_norm": 6.614627838134766, |
|
"learning_rate": 2.8160295675997e-05, |
|
"loss": 2.835, |
|
"step": 84500 |
|
}, |
|
{ |
|
"epoch": 1.3181360006202993, |
|
"grad_norm": 6.38785457611084, |
|
"learning_rate": 2.8031066656328348e-05, |
|
"loss": 2.8439, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 1.3258897418004187, |
|
"grad_norm": 6.479560852050781, |
|
"learning_rate": 2.790183763665969e-05, |
|
"loss": 2.8338, |
|
"step": 85500 |
|
}, |
|
{ |
|
"epoch": 1.3336434829805381, |
|
"grad_norm": 6.241596698760986, |
|
"learning_rate": 2.777260861699103e-05, |
|
"loss": 2.8304, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 1.3413972241606575, |
|
"grad_norm": 7.731629371643066, |
|
"learning_rate": 2.764337959732238e-05, |
|
"loss": 2.828, |
|
"step": 86500 |
|
}, |
|
{ |
|
"epoch": 1.349150965340777, |
|
"grad_norm": 6.706104278564453, |
|
"learning_rate": 2.751415057765372e-05, |
|
"loss": 2.8297, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 1.3569047065208963, |
|
"grad_norm": 6.772350311279297, |
|
"learning_rate": 2.738492155798506e-05, |
|
"loss": 2.8331, |
|
"step": 87500 |
|
}, |
|
{ |
|
"epoch": 1.3646584477010157, |
|
"grad_norm": 6.890201091766357, |
|
"learning_rate": 2.7255692538316406e-05, |
|
"loss": 2.8286, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 1.3724121888811351, |
|
"grad_norm": 6.540558815002441, |
|
"learning_rate": 2.7126463518647748e-05, |
|
"loss": 2.8306, |
|
"step": 88500 |
|
}, |
|
{ |
|
"epoch": 1.3801659300612545, |
|
"grad_norm": 6.890176773071289, |
|
"learning_rate": 2.699723449897909e-05, |
|
"loss": 2.827, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 1.387919671241374, |
|
"grad_norm": 6.634540557861328, |
|
"learning_rate": 2.6868005479310437e-05, |
|
"loss": 2.8263, |
|
"step": 89500 |
|
}, |
|
{ |
|
"epoch": 1.3956734124214933, |
|
"grad_norm": 7.228022575378418, |
|
"learning_rate": 2.6738776459641778e-05, |
|
"loss": 2.8365, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 1.4034271536016127, |
|
"grad_norm": 6.7347869873046875, |
|
"learning_rate": 2.660954743997312e-05, |
|
"loss": 2.825, |
|
"step": 90500 |
|
}, |
|
{ |
|
"epoch": 1.4111808947817321, |
|
"grad_norm": 6.936458110809326, |
|
"learning_rate": 2.6480318420304468e-05, |
|
"loss": 2.8266, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 1.4189346359618515, |
|
"grad_norm": 6.489315509796143, |
|
"learning_rate": 2.635108940063581e-05, |
|
"loss": 2.8173, |
|
"step": 91500 |
|
}, |
|
{ |
|
"epoch": 1.426688377141971, |
|
"grad_norm": 7.1031012535095215, |
|
"learning_rate": 2.622186038096715e-05, |
|
"loss": 2.8114, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 1.4344421183220903, |
|
"grad_norm": 6.918934345245361, |
|
"learning_rate": 2.609263136129849e-05, |
|
"loss": 2.8252, |
|
"step": 92500 |
|
}, |
|
{ |
|
"epoch": 1.4421958595022097, |
|
"grad_norm": 6.686205863952637, |
|
"learning_rate": 2.596340234162984e-05, |
|
"loss": 2.8138, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 1.4499496006823291, |
|
"grad_norm": 6.464860439300537, |
|
"learning_rate": 2.583417332196118e-05, |
|
"loss": 2.8284, |
|
"step": 93500 |
|
}, |
|
{ |
|
"epoch": 1.4577033418624485, |
|
"grad_norm": 6.871826171875, |
|
"learning_rate": 2.5704944302292522e-05, |
|
"loss": 2.8215, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 1.4654570830425682, |
|
"grad_norm": 6.555510520935059, |
|
"learning_rate": 2.557571528262387e-05, |
|
"loss": 2.8166, |
|
"step": 94500 |
|
}, |
|
{ |
|
"epoch": 1.4732108242226873, |
|
"grad_norm": 6.931303977966309, |
|
"learning_rate": 2.544648626295521e-05, |
|
"loss": 2.8218, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 1.480964565402807, |
|
"grad_norm": 6.583662509918213, |
|
"learning_rate": 2.5317257243286553e-05, |
|
"loss": 2.8075, |
|
"step": 95500 |
|
}, |
|
{ |
|
"epoch": 1.4887183065829261, |
|
"grad_norm": 6.624995231628418, |
|
"learning_rate": 2.5188028223617897e-05, |
|
"loss": 2.8281, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 1.4964720477630458, |
|
"grad_norm": 6.899562835693359, |
|
"learning_rate": 2.505879920394924e-05, |
|
"loss": 2.8195, |
|
"step": 96500 |
|
}, |
|
{ |
|
"epoch": 1.504225788943165, |
|
"grad_norm": 6.846054553985596, |
|
"learning_rate": 2.4929570184280583e-05, |
|
"loss": 2.8149, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 1.5119795301232846, |
|
"grad_norm": 6.590377330780029, |
|
"learning_rate": 2.4800341164611925e-05, |
|
"loss": 2.7984, |
|
"step": 97500 |
|
}, |
|
{ |
|
"epoch": 1.5197332713034037, |
|
"grad_norm": 6.771044731140137, |
|
"learning_rate": 2.467111214494327e-05, |
|
"loss": 2.82, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 1.5274870124835234, |
|
"grad_norm": 6.996868133544922, |
|
"learning_rate": 2.454188312527461e-05, |
|
"loss": 2.803, |
|
"step": 98500 |
|
}, |
|
{ |
|
"epoch": 1.5352407536636425, |
|
"grad_norm": 6.782078742980957, |
|
"learning_rate": 2.4412654105605955e-05, |
|
"loss": 2.8156, |
|
"step": 99000 |
|
}, |
|
{ |
|
"epoch": 1.5429944948437622, |
|
"grad_norm": 7.141603469848633, |
|
"learning_rate": 2.42834250859373e-05, |
|
"loss": 2.8081, |
|
"step": 99500 |
|
}, |
|
{ |
|
"epoch": 1.5507482360238816, |
|
"grad_norm": 7.204538822174072, |
|
"learning_rate": 2.415419606626864e-05, |
|
"loss": 2.7939, |
|
"step": 100000 |
|
}, |
|
{ |
|
"epoch": 1.558501977204001, |
|
"grad_norm": 7.218080043792725, |
|
"learning_rate": 2.4024967046599986e-05, |
|
"loss": 2.8011, |
|
"step": 100500 |
|
}, |
|
{ |
|
"epoch": 1.5662557183841204, |
|
"grad_norm": 6.774955749511719, |
|
"learning_rate": 2.3895738026931327e-05, |
|
"loss": 2.8086, |
|
"step": 101000 |
|
}, |
|
{ |
|
"epoch": 1.5740094595642398, |
|
"grad_norm": 6.7942657470703125, |
|
"learning_rate": 2.3766509007262672e-05, |
|
"loss": 2.7981, |
|
"step": 101500 |
|
}, |
|
{ |
|
"epoch": 1.5817632007443592, |
|
"grad_norm": 6.575582027435303, |
|
"learning_rate": 2.3637279987594017e-05, |
|
"loss": 2.8048, |
|
"step": 102000 |
|
}, |
|
{ |
|
"epoch": 1.5895169419244786, |
|
"grad_norm": 6.921658515930176, |
|
"learning_rate": 2.3508050967925358e-05, |
|
"loss": 2.8001, |
|
"step": 102500 |
|
}, |
|
{ |
|
"epoch": 1.597270683104598, |
|
"grad_norm": 7.207976341247559, |
|
"learning_rate": 2.3378821948256703e-05, |
|
"loss": 2.8112, |
|
"step": 103000 |
|
}, |
|
{ |
|
"epoch": 1.6050244242847174, |
|
"grad_norm": 7.6573710441589355, |
|
"learning_rate": 2.3249592928588047e-05, |
|
"loss": 2.8044, |
|
"step": 103500 |
|
}, |
|
{ |
|
"epoch": 1.6127781654648368, |
|
"grad_norm": 7.072439670562744, |
|
"learning_rate": 2.312036390891939e-05, |
|
"loss": 2.7871, |
|
"step": 104000 |
|
}, |
|
{ |
|
"epoch": 1.6205319066449562, |
|
"grad_norm": 7.3211259841918945, |
|
"learning_rate": 2.299113488925073e-05, |
|
"loss": 2.7921, |
|
"step": 104500 |
|
}, |
|
{ |
|
"epoch": 1.6282856478250756, |
|
"grad_norm": 7.107245445251465, |
|
"learning_rate": 2.2861905869582075e-05, |
|
"loss": 2.8031, |
|
"step": 105000 |
|
}, |
|
{ |
|
"epoch": 1.636039389005195, |
|
"grad_norm": 6.947020530700684, |
|
"learning_rate": 2.2732676849913416e-05, |
|
"loss": 2.7869, |
|
"step": 105500 |
|
}, |
|
{ |
|
"epoch": 1.6437931301853144, |
|
"grad_norm": 7.2329301834106445, |
|
"learning_rate": 2.260344783024476e-05, |
|
"loss": 2.7876, |
|
"step": 106000 |
|
}, |
|
{ |
|
"epoch": 1.6515468713654338, |
|
"grad_norm": 6.861079692840576, |
|
"learning_rate": 2.2474218810576102e-05, |
|
"loss": 2.7875, |
|
"step": 106500 |
|
}, |
|
{ |
|
"epoch": 1.6593006125455534, |
|
"grad_norm": 7.405232906341553, |
|
"learning_rate": 2.2344989790907447e-05, |
|
"loss": 2.7955, |
|
"step": 107000 |
|
}, |
|
{ |
|
"epoch": 1.6670543537256726, |
|
"grad_norm": 7.370352745056152, |
|
"learning_rate": 2.221576077123879e-05, |
|
"loss": 2.7931, |
|
"step": 107500 |
|
}, |
|
{ |
|
"epoch": 1.6748080949057922, |
|
"grad_norm": 7.008327960968018, |
|
"learning_rate": 2.2086531751570133e-05, |
|
"loss": 2.7936, |
|
"step": 108000 |
|
}, |
|
{ |
|
"epoch": 1.6825618360859114, |
|
"grad_norm": 7.268221378326416, |
|
"learning_rate": 2.1957302731901477e-05, |
|
"loss": 2.7969, |
|
"step": 108500 |
|
}, |
|
{ |
|
"epoch": 1.690315577266031, |
|
"grad_norm": 6.869812488555908, |
|
"learning_rate": 2.182807371223282e-05, |
|
"loss": 2.7879, |
|
"step": 109000 |
|
}, |
|
{ |
|
"epoch": 1.6980693184461502, |
|
"grad_norm": 6.86647891998291, |
|
"learning_rate": 2.1698844692564163e-05, |
|
"loss": 2.7992, |
|
"step": 109500 |
|
}, |
|
{ |
|
"epoch": 1.7058230596262698, |
|
"grad_norm": 7.082624435424805, |
|
"learning_rate": 2.1569615672895508e-05, |
|
"loss": 2.7829, |
|
"step": 110000 |
|
}, |
|
{ |
|
"epoch": 1.713576800806389, |
|
"grad_norm": 6.880459308624268, |
|
"learning_rate": 2.144038665322685e-05, |
|
"loss": 2.7797, |
|
"step": 110500 |
|
}, |
|
{ |
|
"epoch": 1.7213305419865086, |
|
"grad_norm": 7.15917444229126, |
|
"learning_rate": 2.1311157633558194e-05, |
|
"loss": 2.7952, |
|
"step": 111000 |
|
}, |
|
{ |
|
"epoch": 1.7290842831666278, |
|
"grad_norm": 7.239593982696533, |
|
"learning_rate": 2.118192861388954e-05, |
|
"loss": 2.7836, |
|
"step": 111500 |
|
}, |
|
{ |
|
"epoch": 1.7368380243467474, |
|
"grad_norm": 6.907558441162109, |
|
"learning_rate": 2.105269959422088e-05, |
|
"loss": 2.7974, |
|
"step": 112000 |
|
}, |
|
{ |
|
"epoch": 1.7445917655268666, |
|
"grad_norm": 7.07895040512085, |
|
"learning_rate": 2.0923470574552225e-05, |
|
"loss": 2.7847, |
|
"step": 112500 |
|
}, |
|
{ |
|
"epoch": 1.7523455067069862, |
|
"grad_norm": 6.944314956665039, |
|
"learning_rate": 2.0794241554883566e-05, |
|
"loss": 2.7875, |
|
"step": 113000 |
|
}, |
|
{ |
|
"epoch": 1.7600992478871054, |
|
"grad_norm": 6.936674118041992, |
|
"learning_rate": 2.0665012535214907e-05, |
|
"loss": 2.7743, |
|
"step": 113500 |
|
}, |
|
{ |
|
"epoch": 1.767852989067225, |
|
"grad_norm": 7.579113960266113, |
|
"learning_rate": 2.0535783515546252e-05, |
|
"loss": 2.7748, |
|
"step": 114000 |
|
}, |
|
{ |
|
"epoch": 1.7756067302473444, |
|
"grad_norm": 6.939824104309082, |
|
"learning_rate": 2.0406554495877593e-05, |
|
"loss": 2.7813, |
|
"step": 114500 |
|
}, |
|
{ |
|
"epoch": 1.7833604714274638, |
|
"grad_norm": 6.831909656524658, |
|
"learning_rate": 2.0277325476208938e-05, |
|
"loss": 2.7755, |
|
"step": 115000 |
|
}, |
|
{ |
|
"epoch": 1.7911142126075832, |
|
"grad_norm": 6.841889381408691, |
|
"learning_rate": 2.0148096456540283e-05, |
|
"loss": 2.7833, |
|
"step": 115500 |
|
}, |
|
{ |
|
"epoch": 1.7988679537877026, |
|
"grad_norm": 6.934596538543701, |
|
"learning_rate": 2.0018867436871624e-05, |
|
"loss": 2.7816, |
|
"step": 116000 |
|
}, |
|
{ |
|
"epoch": 1.806621694967822, |
|
"grad_norm": 7.232493877410889, |
|
"learning_rate": 1.988963841720297e-05, |
|
"loss": 2.7718, |
|
"step": 116500 |
|
}, |
|
{ |
|
"epoch": 1.8143754361479414, |
|
"grad_norm": 6.8913421630859375, |
|
"learning_rate": 1.976040939753431e-05, |
|
"loss": 2.7806, |
|
"step": 117000 |
|
}, |
|
{ |
|
"epoch": 1.8221291773280608, |
|
"grad_norm": 7.189756393432617, |
|
"learning_rate": 1.9631180377865655e-05, |
|
"loss": 2.7696, |
|
"step": 117500 |
|
}, |
|
{ |
|
"epoch": 1.8298829185081802, |
|
"grad_norm": 7.165264129638672, |
|
"learning_rate": 1.9501951358197e-05, |
|
"loss": 2.772, |
|
"step": 118000 |
|
}, |
|
{ |
|
"epoch": 1.8376366596882996, |
|
"grad_norm": 7.418449878692627, |
|
"learning_rate": 1.937272233852834e-05, |
|
"loss": 2.7673, |
|
"step": 118500 |
|
}, |
|
{ |
|
"epoch": 1.845390400868419, |
|
"grad_norm": 7.016151428222656, |
|
"learning_rate": 1.9243493318859685e-05, |
|
"loss": 2.7721, |
|
"step": 119000 |
|
}, |
|
{ |
|
"epoch": 1.8531441420485384, |
|
"grad_norm": 7.69176721572876, |
|
"learning_rate": 1.9114264299191027e-05, |
|
"loss": 2.7712, |
|
"step": 119500 |
|
}, |
|
{ |
|
"epoch": 1.8608978832286578, |
|
"grad_norm": 7.078608512878418, |
|
"learning_rate": 1.898503527952237e-05, |
|
"loss": 2.7626, |
|
"step": 120000 |
|
}, |
|
{ |
|
"epoch": 1.8686516244087772, |
|
"grad_norm": 7.167757034301758, |
|
"learning_rate": 1.8855806259853716e-05, |
|
"loss": 2.7757, |
|
"step": 120500 |
|
}, |
|
{ |
|
"epoch": 1.8764053655888966, |
|
"grad_norm": 7.261277675628662, |
|
"learning_rate": 1.8726577240185057e-05, |
|
"loss": 2.7638, |
|
"step": 121000 |
|
}, |
|
{ |
|
"epoch": 1.8841591067690162, |
|
"grad_norm": 7.32155179977417, |
|
"learning_rate": 1.85973482205164e-05, |
|
"loss": 2.7743, |
|
"step": 121500 |
|
}, |
|
{ |
|
"epoch": 1.8919128479491354, |
|
"grad_norm": 7.3756103515625, |
|
"learning_rate": 1.8468119200847743e-05, |
|
"loss": 2.7751, |
|
"step": 122000 |
|
}, |
|
{ |
|
"epoch": 1.899666589129255, |
|
"grad_norm": 7.422321796417236, |
|
"learning_rate": 1.8338890181179085e-05, |
|
"loss": 2.7766, |
|
"step": 122500 |
|
}, |
|
{ |
|
"epoch": 1.9074203303093742, |
|
"grad_norm": 7.091059684753418, |
|
"learning_rate": 1.820966116151043e-05, |
|
"loss": 2.7643, |
|
"step": 123000 |
|
}, |
|
{ |
|
"epoch": 1.9151740714894938, |
|
"grad_norm": 6.582401275634766, |
|
"learning_rate": 1.808043214184177e-05, |
|
"loss": 2.7665, |
|
"step": 123500 |
|
}, |
|
{ |
|
"epoch": 1.922927812669613, |
|
"grad_norm": 7.574552536010742, |
|
"learning_rate": 1.7951203122173115e-05, |
|
"loss": 2.7548, |
|
"step": 124000 |
|
}, |
|
{ |
|
"epoch": 1.9306815538497326, |
|
"grad_norm": 7.082491874694824, |
|
"learning_rate": 1.782197410250446e-05, |
|
"loss": 2.7577, |
|
"step": 124500 |
|
}, |
|
{ |
|
"epoch": 1.9384352950298518, |
|
"grad_norm": 7.546943187713623, |
|
"learning_rate": 1.76927450828358e-05, |
|
"loss": 2.7709, |
|
"step": 125000 |
|
}, |
|
{ |
|
"epoch": 1.9461890362099714, |
|
"grad_norm": 7.498143196105957, |
|
"learning_rate": 1.7563516063167146e-05, |
|
"loss": 2.7674, |
|
"step": 125500 |
|
}, |
|
{ |
|
"epoch": 1.9539427773900906, |
|
"grad_norm": 7.182895660400391, |
|
"learning_rate": 1.743428704349849e-05, |
|
"loss": 2.7595, |
|
"step": 126000 |
|
}, |
|
{ |
|
"epoch": 1.9616965185702102, |
|
"grad_norm": 7.754599094390869, |
|
"learning_rate": 1.7305058023829832e-05, |
|
"loss": 2.7586, |
|
"step": 126500 |
|
}, |
|
{ |
|
"epoch": 1.9694502597503294, |
|
"grad_norm": 7.348043918609619, |
|
"learning_rate": 1.7175829004161177e-05, |
|
"loss": 2.7671, |
|
"step": 127000 |
|
}, |
|
{ |
|
"epoch": 1.977204000930449, |
|
"grad_norm": 7.5025835037231445, |
|
"learning_rate": 1.7046599984492518e-05, |
|
"loss": 2.7596, |
|
"step": 127500 |
|
}, |
|
{ |
|
"epoch": 1.9849577421105682, |
|
"grad_norm": 7.277556896209717, |
|
"learning_rate": 1.6917370964823863e-05, |
|
"loss": 2.7554, |
|
"step": 128000 |
|
}, |
|
{ |
|
"epoch": 1.9927114832906878, |
|
"grad_norm": 7.15894079208374, |
|
"learning_rate": 1.6788141945155207e-05, |
|
"loss": 2.758, |
|
"step": 128500 |
|
}, |
|
{ |
|
"epoch": 2.000465224470807, |
|
"grad_norm": 7.221950054168701, |
|
"learning_rate": 1.665891292548655e-05, |
|
"loss": 2.7564, |
|
"step": 129000 |
|
}, |
|
{ |
|
"epoch": 2.0082189656509266, |
|
"grad_norm": 7.185346603393555, |
|
"learning_rate": 1.6529683905817893e-05, |
|
"loss": 2.7671, |
|
"step": 129500 |
|
}, |
|
{ |
|
"epoch": 2.015972706831046, |
|
"grad_norm": 7.411344528198242, |
|
"learning_rate": 1.6400454886149234e-05, |
|
"loss": 2.7698, |
|
"step": 130000 |
|
}, |
|
{ |
|
"epoch": 2.0237264480111654, |
|
"grad_norm": 7.3418498039245605, |
|
"learning_rate": 1.6271225866480576e-05, |
|
"loss": 2.7508, |
|
"step": 130500 |
|
}, |
|
{ |
|
"epoch": 2.0314801891912846, |
|
"grad_norm": 7.749533176422119, |
|
"learning_rate": 1.614199684681192e-05, |
|
"loss": 2.7643, |
|
"step": 131000 |
|
}, |
|
{ |
|
"epoch": 2.039233930371404, |
|
"grad_norm": 7.400169372558594, |
|
"learning_rate": 1.6012767827143262e-05, |
|
"loss": 2.7523, |
|
"step": 131500 |
|
}, |
|
{ |
|
"epoch": 2.0469876715515234, |
|
"grad_norm": 6.999739646911621, |
|
"learning_rate": 1.5883538807474606e-05, |
|
"loss": 2.7525, |
|
"step": 132000 |
|
}, |
|
{ |
|
"epoch": 2.054741412731643, |
|
"grad_norm": 7.423766613006592, |
|
"learning_rate": 1.575430978780595e-05, |
|
"loss": 2.7497, |
|
"step": 132500 |
|
}, |
|
{ |
|
"epoch": 2.0624951539117626, |
|
"grad_norm": 7.121034622192383, |
|
"learning_rate": 1.5625080768137292e-05, |
|
"loss": 2.7644, |
|
"step": 133000 |
|
}, |
|
{ |
|
"epoch": 2.070248895091882, |
|
"grad_norm": 7.697927951812744, |
|
"learning_rate": 1.5495851748468637e-05, |
|
"loss": 2.7599, |
|
"step": 133500 |
|
}, |
|
{ |
|
"epoch": 2.0780026362720014, |
|
"grad_norm": 7.259957313537598, |
|
"learning_rate": 1.5366622728799982e-05, |
|
"loss": 2.7569, |
|
"step": 134000 |
|
}, |
|
{ |
|
"epoch": 2.0857563774521206, |
|
"grad_norm": 7.2549943923950195, |
|
"learning_rate": 1.5237393709131323e-05, |
|
"loss": 2.764, |
|
"step": 134500 |
|
}, |
|
{ |
|
"epoch": 2.0935101186322402, |
|
"grad_norm": 7.1079535484313965, |
|
"learning_rate": 1.5108164689462668e-05, |
|
"loss": 2.7463, |
|
"step": 135000 |
|
}, |
|
{ |
|
"epoch": 2.1012638598123594, |
|
"grad_norm": 7.20269250869751, |
|
"learning_rate": 1.4978935669794009e-05, |
|
"loss": 2.7464, |
|
"step": 135500 |
|
}, |
|
{ |
|
"epoch": 2.109017600992479, |
|
"grad_norm": 7.686685085296631, |
|
"learning_rate": 1.4849706650125352e-05, |
|
"loss": 2.7546, |
|
"step": 136000 |
|
}, |
|
{ |
|
"epoch": 2.116771342172598, |
|
"grad_norm": 6.865842342376709, |
|
"learning_rate": 1.4720477630456697e-05, |
|
"loss": 2.743, |
|
"step": 136500 |
|
}, |
|
{ |
|
"epoch": 2.124525083352718, |
|
"grad_norm": 6.892743110656738, |
|
"learning_rate": 1.4591248610788038e-05, |
|
"loss": 2.7597, |
|
"step": 137000 |
|
}, |
|
{ |
|
"epoch": 2.132278824532837, |
|
"grad_norm": 7.216090679168701, |
|
"learning_rate": 1.4462019591119383e-05, |
|
"loss": 2.7638, |
|
"step": 137500 |
|
}, |
|
{ |
|
"epoch": 2.1400325657129566, |
|
"grad_norm": 7.859537601470947, |
|
"learning_rate": 1.4332790571450724e-05, |
|
"loss": 2.7522, |
|
"step": 138000 |
|
}, |
|
{ |
|
"epoch": 2.147786306893076, |
|
"grad_norm": 7.197884559631348, |
|
"learning_rate": 1.4203561551782069e-05, |
|
"loss": 2.7524, |
|
"step": 138500 |
|
}, |
|
{ |
|
"epoch": 2.1555400480731954, |
|
"grad_norm": 7.401318550109863, |
|
"learning_rate": 1.4074332532113413e-05, |
|
"loss": 2.7629, |
|
"step": 139000 |
|
}, |
|
{ |
|
"epoch": 2.1632937892533146, |
|
"grad_norm": 7.095146656036377, |
|
"learning_rate": 1.3945103512444755e-05, |
|
"loss": 2.7571, |
|
"step": 139500 |
|
}, |
|
{ |
|
"epoch": 2.1710475304334342, |
|
"grad_norm": 7.200826168060303, |
|
"learning_rate": 1.38158744927761e-05, |
|
"loss": 2.7411, |
|
"step": 140000 |
|
}, |
|
{ |
|
"epoch": 2.1788012716135534, |
|
"grad_norm": 7.727132797241211, |
|
"learning_rate": 1.3686645473107442e-05, |
|
"loss": 2.7518, |
|
"step": 140500 |
|
}, |
|
{ |
|
"epoch": 2.186555012793673, |
|
"grad_norm": 7.624775409698486, |
|
"learning_rate": 1.3557416453438784e-05, |
|
"loss": 2.7461, |
|
"step": 141000 |
|
}, |
|
{ |
|
"epoch": 2.194308753973792, |
|
"grad_norm": 7.7125935554504395, |
|
"learning_rate": 1.3428187433770128e-05, |
|
"loss": 2.7506, |
|
"step": 141500 |
|
}, |
|
{ |
|
"epoch": 2.202062495153912, |
|
"grad_norm": 7.944740295410156, |
|
"learning_rate": 1.329895841410147e-05, |
|
"loss": 2.7435, |
|
"step": 142000 |
|
}, |
|
{ |
|
"epoch": 2.209816236334031, |
|
"grad_norm": 7.168126106262207, |
|
"learning_rate": 1.3169729394432814e-05, |
|
"loss": 2.7577, |
|
"step": 142500 |
|
}, |
|
{ |
|
"epoch": 2.2175699775141506, |
|
"grad_norm": 7.608551979064941, |
|
"learning_rate": 1.3040500374764159e-05, |
|
"loss": 2.7514, |
|
"step": 143000 |
|
}, |
|
{ |
|
"epoch": 2.22532371869427, |
|
"grad_norm": 7.155666351318359, |
|
"learning_rate": 1.29112713550955e-05, |
|
"loss": 2.756, |
|
"step": 143500 |
|
}, |
|
{ |
|
"epoch": 2.2330774598743894, |
|
"grad_norm": 7.49126672744751, |
|
"learning_rate": 1.2782042335426845e-05, |
|
"loss": 2.7452, |
|
"step": 144000 |
|
}, |
|
{ |
|
"epoch": 2.2408312010545086, |
|
"grad_norm": 7.515799522399902, |
|
"learning_rate": 1.2652813315758188e-05, |
|
"loss": 2.7485, |
|
"step": 144500 |
|
}, |
|
{ |
|
"epoch": 2.2485849422346282, |
|
"grad_norm": 7.652871608734131, |
|
"learning_rate": 1.252358429608953e-05, |
|
"loss": 2.7519, |
|
"step": 145000 |
|
}, |
|
{ |
|
"epoch": 2.2563386834147474, |
|
"grad_norm": 7.006313800811768, |
|
"learning_rate": 1.2394355276420874e-05, |
|
"loss": 2.7401, |
|
"step": 145500 |
|
}, |
|
{ |
|
"epoch": 2.264092424594867, |
|
"grad_norm": 7.337978839874268, |
|
"learning_rate": 1.2265126256752217e-05, |
|
"loss": 2.7458, |
|
"step": 146000 |
|
}, |
|
{ |
|
"epoch": 2.2718461657749867, |
|
"grad_norm": 7.185283184051514, |
|
"learning_rate": 1.213589723708356e-05, |
|
"loss": 2.7546, |
|
"step": 146500 |
|
}, |
|
{ |
|
"epoch": 2.279599906955106, |
|
"grad_norm": 7.885451316833496, |
|
"learning_rate": 1.2006668217414903e-05, |
|
"loss": 2.738, |
|
"step": 147000 |
|
}, |
|
{ |
|
"epoch": 2.287353648135225, |
|
"grad_norm": 7.163339138031006, |
|
"learning_rate": 1.1877439197746248e-05, |
|
"loss": 2.7403, |
|
"step": 147500 |
|
}, |
|
{ |
|
"epoch": 2.2951073893153446, |
|
"grad_norm": 7.566407680511475, |
|
"learning_rate": 1.174821017807759e-05, |
|
"loss": 2.7441, |
|
"step": 148000 |
|
}, |
|
{ |
|
"epoch": 2.3028611304954643, |
|
"grad_norm": 7.626791477203369, |
|
"learning_rate": 1.1618981158408934e-05, |
|
"loss": 2.7442, |
|
"step": 148500 |
|
}, |
|
{ |
|
"epoch": 2.3106148716755834, |
|
"grad_norm": 7.609415054321289, |
|
"learning_rate": 1.1489752138740275e-05, |
|
"loss": 2.7442, |
|
"step": 149000 |
|
}, |
|
{ |
|
"epoch": 2.318368612855703, |
|
"grad_norm": 7.549880504608154, |
|
"learning_rate": 1.136052311907162e-05, |
|
"loss": 2.7398, |
|
"step": 149500 |
|
}, |
|
{ |
|
"epoch": 2.3261223540358222, |
|
"grad_norm": 7.753575325012207, |
|
"learning_rate": 1.1231294099402963e-05, |
|
"loss": 2.7376, |
|
"step": 150000 |
|
}, |
|
{ |
|
"epoch": 2.333876095215942, |
|
"grad_norm": 7.579866886138916, |
|
"learning_rate": 1.1102065079734306e-05, |
|
"loss": 2.7449, |
|
"step": 150500 |
|
}, |
|
{ |
|
"epoch": 2.341629836396061, |
|
"grad_norm": 7.787561893463135, |
|
"learning_rate": 1.0972836060065649e-05, |
|
"loss": 2.7418, |
|
"step": 151000 |
|
}, |
|
{ |
|
"epoch": 2.3493835775761807, |
|
"grad_norm": 7.163692474365234, |
|
"learning_rate": 1.0843607040396992e-05, |
|
"loss": 2.7459, |
|
"step": 151500 |
|
}, |
|
{ |
|
"epoch": 2.3571373187563, |
|
"grad_norm": 8.124524116516113, |
|
"learning_rate": 1.0714378020728336e-05, |
|
"loss": 2.7472, |
|
"step": 152000 |
|
}, |
|
{ |
|
"epoch": 2.3648910599364195, |
|
"grad_norm": 7.68442964553833, |
|
"learning_rate": 1.058514900105968e-05, |
|
"loss": 2.7454, |
|
"step": 152500 |
|
}, |
|
{ |
|
"epoch": 2.3726448011165386, |
|
"grad_norm": 7.561328887939453, |
|
"learning_rate": 1.045591998139102e-05, |
|
"loss": 2.7314, |
|
"step": 153000 |
|
}, |
|
{ |
|
"epoch": 2.3803985422966583, |
|
"grad_norm": 7.480719566345215, |
|
"learning_rate": 1.0326690961722364e-05, |
|
"loss": 2.7385, |
|
"step": 153500 |
|
}, |
|
{ |
|
"epoch": 2.3881522834767774, |
|
"grad_norm": 7.676718235015869, |
|
"learning_rate": 1.0197461942053708e-05, |
|
"loss": 2.7363, |
|
"step": 154000 |
|
}, |
|
{ |
|
"epoch": 2.395906024656897, |
|
"grad_norm": 7.30204963684082, |
|
"learning_rate": 1.0068232922385051e-05, |
|
"loss": 2.7469, |
|
"step": 154500 |
|
}, |
|
{ |
|
"epoch": 2.4036597658370162, |
|
"grad_norm": 7.684243202209473, |
|
"learning_rate": 9.939003902716394e-06, |
|
"loss": 2.7404, |
|
"step": 155000 |
|
}, |
|
{ |
|
"epoch": 2.411413507017136, |
|
"grad_norm": 7.187122821807861, |
|
"learning_rate": 9.809774883047737e-06, |
|
"loss": 2.7364, |
|
"step": 155500 |
|
}, |
|
{ |
|
"epoch": 2.419167248197255, |
|
"grad_norm": 7.586068153381348, |
|
"learning_rate": 9.680545863379082e-06, |
|
"loss": 2.7331, |
|
"step": 156000 |
|
}, |
|
{ |
|
"epoch": 2.4269209893773747, |
|
"grad_norm": 7.374856948852539, |
|
"learning_rate": 9.551316843710425e-06, |
|
"loss": 2.739, |
|
"step": 156500 |
|
}, |
|
{ |
|
"epoch": 2.434674730557494, |
|
"grad_norm": 7.3092474937438965, |
|
"learning_rate": 9.422087824041768e-06, |
|
"loss": 2.7412, |
|
"step": 157000 |
|
}, |
|
{ |
|
"epoch": 2.4424284717376135, |
|
"grad_norm": 7.9172844886779785, |
|
"learning_rate": 9.29285880437311e-06, |
|
"loss": 2.7384, |
|
"step": 157500 |
|
}, |
|
{ |
|
"epoch": 2.4501822129177326, |
|
"grad_norm": 7.155998706817627, |
|
"learning_rate": 9.163629784704454e-06, |
|
"loss": 2.7324, |
|
"step": 158000 |
|
}, |
|
{ |
|
"epoch": 2.4579359540978523, |
|
"grad_norm": 7.371484756469727, |
|
"learning_rate": 9.034400765035797e-06, |
|
"loss": 2.7382, |
|
"step": 158500 |
|
}, |
|
{ |
|
"epoch": 2.4656896952779714, |
|
"grad_norm": 7.271082401275635, |
|
"learning_rate": 8.90517174536714e-06, |
|
"loss": 2.7238, |
|
"step": 159000 |
|
}, |
|
{ |
|
"epoch": 2.473443436458091, |
|
"grad_norm": 7.525820255279541, |
|
"learning_rate": 8.775942725698483e-06, |
|
"loss": 2.7353, |
|
"step": 159500 |
|
}, |
|
{ |
|
"epoch": 2.4811971776382107, |
|
"grad_norm": 7.422860622406006, |
|
"learning_rate": 8.646713706029828e-06, |
|
"loss": 2.7312, |
|
"step": 160000 |
|
}, |
|
{ |
|
"epoch": 2.48895091881833, |
|
"grad_norm": 7.786092758178711, |
|
"learning_rate": 8.51748468636117e-06, |
|
"loss": 2.729, |
|
"step": 160500 |
|
}, |
|
{ |
|
"epoch": 2.496704659998449, |
|
"grad_norm": 7.733543872833252, |
|
"learning_rate": 8.388255666692514e-06, |
|
"loss": 2.7377, |
|
"step": 161000 |
|
}, |
|
{ |
|
"epoch": 2.5044584011785687, |
|
"grad_norm": 7.477449417114258, |
|
"learning_rate": 8.259026647023855e-06, |
|
"loss": 2.7431, |
|
"step": 161500 |
|
}, |
|
{ |
|
"epoch": 2.5122121423586883, |
|
"grad_norm": 7.466070652008057, |
|
"learning_rate": 8.1297976273552e-06, |
|
"loss": 2.743, |
|
"step": 162000 |
|
}, |
|
{ |
|
"epoch": 2.5199658835388075, |
|
"grad_norm": 7.578529357910156, |
|
"learning_rate": 8.000568607686542e-06, |
|
"loss": 2.738, |
|
"step": 162500 |
|
}, |
|
{ |
|
"epoch": 2.5277196247189266, |
|
"grad_norm": 7.481320381164551, |
|
"learning_rate": 7.871339588017885e-06, |
|
"loss": 2.7332, |
|
"step": 163000 |
|
}, |
|
{ |
|
"epoch": 2.5354733658990463, |
|
"grad_norm": 8.073503494262695, |
|
"learning_rate": 7.742110568349228e-06, |
|
"loss": 2.7517, |
|
"step": 163500 |
|
}, |
|
{ |
|
"epoch": 2.543227107079166, |
|
"grad_norm": 7.4196457862854, |
|
"learning_rate": 7.612881548680572e-06, |
|
"loss": 2.7231, |
|
"step": 164000 |
|
}, |
|
{ |
|
"epoch": 2.550980848259285, |
|
"grad_norm": 7.558558940887451, |
|
"learning_rate": 7.483652529011915e-06, |
|
"loss": 2.7384, |
|
"step": 164500 |
|
}, |
|
{ |
|
"epoch": 2.5587345894394042, |
|
"grad_norm": 7.38846492767334, |
|
"learning_rate": 7.354423509343258e-06, |
|
"loss": 2.7348, |
|
"step": 165000 |
|
}, |
|
{ |
|
"epoch": 2.566488330619524, |
|
"grad_norm": 7.8365864753723145, |
|
"learning_rate": 7.225194489674601e-06, |
|
"loss": 2.7254, |
|
"step": 165500 |
|
}, |
|
{ |
|
"epoch": 2.5742420717996435, |
|
"grad_norm": 7.362669944763184, |
|
"learning_rate": 7.095965470005945e-06, |
|
"loss": 2.729, |
|
"step": 166000 |
|
}, |
|
{ |
|
"epoch": 2.5819958129797627, |
|
"grad_norm": 7.646996974945068, |
|
"learning_rate": 6.966736450337288e-06, |
|
"loss": 2.7333, |
|
"step": 166500 |
|
}, |
|
{ |
|
"epoch": 2.5897495541598823, |
|
"grad_norm": 7.944218158721924, |
|
"learning_rate": 6.837507430668631e-06, |
|
"loss": 2.7423, |
|
"step": 167000 |
|
}, |
|
{ |
|
"epoch": 2.5975032953400015, |
|
"grad_norm": 7.502200603485107, |
|
"learning_rate": 6.708278410999974e-06, |
|
"loss": 2.7225, |
|
"step": 167500 |
|
}, |
|
{ |
|
"epoch": 2.605257036520121, |
|
"grad_norm": 7.175666809082031, |
|
"learning_rate": 6.579049391331319e-06, |
|
"loss": 2.7366, |
|
"step": 168000 |
|
}, |
|
{ |
|
"epoch": 2.6130107777002403, |
|
"grad_norm": 7.814846992492676, |
|
"learning_rate": 6.449820371662661e-06, |
|
"loss": 2.7421, |
|
"step": 168500 |
|
}, |
|
{ |
|
"epoch": 2.62076451888036, |
|
"grad_norm": 7.270232677459717, |
|
"learning_rate": 6.320591351994004e-06, |
|
"loss": 2.7335, |
|
"step": 169000 |
|
}, |
|
{ |
|
"epoch": 2.628518260060479, |
|
"grad_norm": 7.920383930206299, |
|
"learning_rate": 6.191362332325347e-06, |
|
"loss": 2.7352, |
|
"step": 169500 |
|
}, |
|
{ |
|
"epoch": 2.6362720012405987, |
|
"grad_norm": 7.142765998840332, |
|
"learning_rate": 6.062133312656691e-06, |
|
"loss": 2.7318, |
|
"step": 170000 |
|
}, |
|
{ |
|
"epoch": 2.644025742420718, |
|
"grad_norm": 8.12151050567627, |
|
"learning_rate": 5.932904292988034e-06, |
|
"loss": 2.7168, |
|
"step": 170500 |
|
}, |
|
{ |
|
"epoch": 2.6517794836008375, |
|
"grad_norm": 7.717370510101318, |
|
"learning_rate": 5.803675273319377e-06, |
|
"loss": 2.7145, |
|
"step": 171000 |
|
}, |
|
{ |
|
"epoch": 2.6595332247809567, |
|
"grad_norm": 7.359320640563965, |
|
"learning_rate": 5.67444625365072e-06, |
|
"loss": 2.7382, |
|
"step": 171500 |
|
}, |
|
{ |
|
"epoch": 2.6672869659610763, |
|
"grad_norm": 7.525691509246826, |
|
"learning_rate": 5.5452172339820636e-06, |
|
"loss": 2.7228, |
|
"step": 172000 |
|
}, |
|
{ |
|
"epoch": 2.6750407071411955, |
|
"grad_norm": 7.967082500457764, |
|
"learning_rate": 5.4159882143134066e-06, |
|
"loss": 2.7258, |
|
"step": 172500 |
|
}, |
|
{ |
|
"epoch": 2.682794448321315, |
|
"grad_norm": 7.760034561157227, |
|
"learning_rate": 5.2867591946447495e-06, |
|
"loss": 2.7385, |
|
"step": 173000 |
|
}, |
|
{ |
|
"epoch": 2.6905481895014347, |
|
"grad_norm": 7.141742706298828, |
|
"learning_rate": 5.1575301749760925e-06, |
|
"loss": 2.7282, |
|
"step": 173500 |
|
}, |
|
{ |
|
"epoch": 2.698301930681554, |
|
"grad_norm": 7.685527801513672, |
|
"learning_rate": 5.028301155307436e-06, |
|
"loss": 2.7216, |
|
"step": 174000 |
|
}, |
|
{ |
|
"epoch": 2.706055671861673, |
|
"grad_norm": 7.3134236335754395, |
|
"learning_rate": 4.899072135638779e-06, |
|
"loss": 2.7188, |
|
"step": 174500 |
|
}, |
|
{ |
|
"epoch": 2.7138094130417927, |
|
"grad_norm": 7.750508785247803, |
|
"learning_rate": 4.769843115970122e-06, |
|
"loss": 2.7277, |
|
"step": 175000 |
|
}, |
|
{ |
|
"epoch": 2.7215631542219123, |
|
"grad_norm": 7.504671096801758, |
|
"learning_rate": 4.640614096301465e-06, |
|
"loss": 2.738, |
|
"step": 175500 |
|
}, |
|
{ |
|
"epoch": 2.7293168954020315, |
|
"grad_norm": 7.484751224517822, |
|
"learning_rate": 4.511385076632809e-06, |
|
"loss": 2.7148, |
|
"step": 176000 |
|
}, |
|
{ |
|
"epoch": 2.7370706365821507, |
|
"grad_norm": 7.809044361114502, |
|
"learning_rate": 4.382156056964152e-06, |
|
"loss": 2.7388, |
|
"step": 176500 |
|
}, |
|
{ |
|
"epoch": 2.7448243777622703, |
|
"grad_norm": 7.876001834869385, |
|
"learning_rate": 4.252927037295495e-06, |
|
"loss": 2.7214, |
|
"step": 177000 |
|
}, |
|
{ |
|
"epoch": 2.75257811894239, |
|
"grad_norm": 7.753846645355225, |
|
"learning_rate": 4.123698017626838e-06, |
|
"loss": 2.719, |
|
"step": 177500 |
|
}, |
|
{ |
|
"epoch": 2.760331860122509, |
|
"grad_norm": 7.285833358764648, |
|
"learning_rate": 3.994468997958182e-06, |
|
"loss": 2.7347, |
|
"step": 178000 |
|
}, |
|
{ |
|
"epoch": 2.7680856013026283, |
|
"grad_norm": 7.894680023193359, |
|
"learning_rate": 3.865239978289525e-06, |
|
"loss": 2.7376, |
|
"step": 178500 |
|
}, |
|
{ |
|
"epoch": 2.775839342482748, |
|
"grad_norm": 7.850667953491211, |
|
"learning_rate": 3.7360109586208684e-06, |
|
"loss": 2.7289, |
|
"step": 179000 |
|
}, |
|
{ |
|
"epoch": 2.7835930836628675, |
|
"grad_norm": 7.380823135375977, |
|
"learning_rate": 3.606781938952211e-06, |
|
"loss": 2.7289, |
|
"step": 179500 |
|
}, |
|
{ |
|
"epoch": 2.7913468248429867, |
|
"grad_norm": 7.752573490142822, |
|
"learning_rate": 3.477552919283555e-06, |
|
"loss": 2.724, |
|
"step": 180000 |
|
}, |
|
{ |
|
"epoch": 2.7991005660231063, |
|
"grad_norm": 7.117413520812988, |
|
"learning_rate": 3.3483238996148974e-06, |
|
"loss": 2.7332, |
|
"step": 180500 |
|
}, |
|
{ |
|
"epoch": 2.8068543072032255, |
|
"grad_norm": 7.8615522384643555, |
|
"learning_rate": 3.2190948799462412e-06, |
|
"loss": 2.7316, |
|
"step": 181000 |
|
}, |
|
{ |
|
"epoch": 2.814608048383345, |
|
"grad_norm": 7.938878059387207, |
|
"learning_rate": 3.0898658602775842e-06, |
|
"loss": 2.7223, |
|
"step": 181500 |
|
}, |
|
{ |
|
"epoch": 2.8223617895634643, |
|
"grad_norm": 7.760583877563477, |
|
"learning_rate": 2.9606368406089272e-06, |
|
"loss": 2.7222, |
|
"step": 182000 |
|
}, |
|
{ |
|
"epoch": 2.830115530743584, |
|
"grad_norm": 7.352213382720947, |
|
"learning_rate": 2.8314078209402706e-06, |
|
"loss": 2.7233, |
|
"step": 182500 |
|
}, |
|
{ |
|
"epoch": 2.837869271923703, |
|
"grad_norm": 7.541159629821777, |
|
"learning_rate": 2.7021788012716136e-06, |
|
"loss": 2.7225, |
|
"step": 183000 |
|
}, |
|
{ |
|
"epoch": 2.8456230131038227, |
|
"grad_norm": 7.890182018280029, |
|
"learning_rate": 2.572949781602957e-06, |
|
"loss": 2.7219, |
|
"step": 183500 |
|
}, |
|
{ |
|
"epoch": 2.853376754283942, |
|
"grad_norm": 7.695311546325684, |
|
"learning_rate": 2.4437207619343e-06, |
|
"loss": 2.7172, |
|
"step": 184000 |
|
}, |
|
{ |
|
"epoch": 2.8611304954640615, |
|
"grad_norm": 7.7702317237854, |
|
"learning_rate": 2.3144917422656434e-06, |
|
"loss": 2.728, |
|
"step": 184500 |
|
}, |
|
{ |
|
"epoch": 2.8688842366441807, |
|
"grad_norm": 7.646172046661377, |
|
"learning_rate": 2.1852627225969864e-06, |
|
"loss": 2.7312, |
|
"step": 185000 |
|
}, |
|
{ |
|
"epoch": 2.8766379778243003, |
|
"grad_norm": 7.06711483001709, |
|
"learning_rate": 2.05603370292833e-06, |
|
"loss": 2.7175, |
|
"step": 185500 |
|
}, |
|
{ |
|
"epoch": 2.8843917190044195, |
|
"grad_norm": 7.974971294403076, |
|
"learning_rate": 1.926804683259673e-06, |
|
"loss": 2.7244, |
|
"step": 186000 |
|
}, |
|
{ |
|
"epoch": 2.892145460184539, |
|
"grad_norm": 7.5829315185546875, |
|
"learning_rate": 1.797575663591016e-06, |
|
"loss": 2.7298, |
|
"step": 186500 |
|
}, |
|
{ |
|
"epoch": 2.8998992013646583, |
|
"grad_norm": 7.224939823150635, |
|
"learning_rate": 1.6683466439223592e-06, |
|
"loss": 2.7227, |
|
"step": 187000 |
|
}, |
|
{ |
|
"epoch": 2.907652942544778, |
|
"grad_norm": 8.057891845703125, |
|
"learning_rate": 1.5391176242537025e-06, |
|
"loss": 2.7393, |
|
"step": 187500 |
|
}, |
|
{ |
|
"epoch": 2.915406683724897, |
|
"grad_norm": 7.886134624481201, |
|
"learning_rate": 1.4098886045850457e-06, |
|
"loss": 2.7264, |
|
"step": 188000 |
|
}, |
|
{ |
|
"epoch": 2.9231604249050167, |
|
"grad_norm": 7.65654993057251, |
|
"learning_rate": 1.2806595849163889e-06, |
|
"loss": 2.7277, |
|
"step": 188500 |
|
}, |
|
{ |
|
"epoch": 2.9309141660851363, |
|
"grad_norm": 7.524332046508789, |
|
"learning_rate": 1.1514305652477323e-06, |
|
"loss": 2.7281, |
|
"step": 189000 |
|
}, |
|
{ |
|
"epoch": 2.9386679072652555, |
|
"grad_norm": 7.878385543823242, |
|
"learning_rate": 1.0222015455790753e-06, |
|
"loss": 2.7275, |
|
"step": 189500 |
|
}, |
|
{ |
|
"epoch": 2.9464216484453747, |
|
"grad_norm": 7.491950035095215, |
|
"learning_rate": 8.929725259104185e-07, |
|
"loss": 2.7225, |
|
"step": 190000 |
|
}, |
|
{ |
|
"epoch": 2.9541753896254943, |
|
"grad_norm": 8.570446968078613, |
|
"learning_rate": 7.637435062417617e-07, |
|
"loss": 2.722, |
|
"step": 190500 |
|
}, |
|
{ |
|
"epoch": 2.961929130805614, |
|
"grad_norm": 7.909883975982666, |
|
"learning_rate": 6.345144865731049e-07, |
|
"loss": 2.7175, |
|
"step": 191000 |
|
}, |
|
{ |
|
"epoch": 2.969682871985733, |
|
"grad_norm": 7.273110389709473, |
|
"learning_rate": 5.052854669044481e-07, |
|
"loss": 2.7167, |
|
"step": 191500 |
|
}, |
|
{ |
|
"epoch": 2.9774366131658523, |
|
"grad_norm": 7.78535270690918, |
|
"learning_rate": 3.760564472357913e-07, |
|
"loss": 2.7301, |
|
"step": 192000 |
|
}, |
|
{ |
|
"epoch": 2.985190354345972, |
|
"grad_norm": 7.7455973625183105, |
|
"learning_rate": 2.468274275671345e-07, |
|
"loss": 2.7192, |
|
"step": 192500 |
|
}, |
|
{ |
|
"epoch": 2.9929440955260915, |
|
"grad_norm": 7.988417148590088, |
|
"learning_rate": 1.1759840789847768e-07, |
|
"loss": 2.7195, |
|
"step": 193000 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 193455, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.6392157746049843e+18, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|