|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 200, |
|
"global_step": 17057, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 5.8626956674679014e-05, |
|
"grad_norm": 0.232421875, |
|
"learning_rate": 9.999413730433253e-06, |
|
"loss": 1.0997, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0014656739168669754, |
|
"grad_norm": 0.25, |
|
"learning_rate": 9.985343260831331e-06, |
|
"loss": 0.8715, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.0029313478337339507, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 9.970686521662662e-06, |
|
"loss": 0.8613, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.004397021750600926, |
|
"grad_norm": 0.3671875, |
|
"learning_rate": 9.95602978249399e-06, |
|
"loss": 0.9177, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.0058626956674679015, |
|
"grad_norm": 0.412109375, |
|
"learning_rate": 9.941373043325322e-06, |
|
"loss": 0.8869, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.007328369584334877, |
|
"grad_norm": 0.25, |
|
"learning_rate": 9.926716304156651e-06, |
|
"loss": 0.852, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.008794043501201852, |
|
"grad_norm": 0.4921875, |
|
"learning_rate": 9.912059564987983e-06, |
|
"loss": 0.8612, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.010259717418068828, |
|
"grad_norm": 0.484375, |
|
"learning_rate": 9.897402825819312e-06, |
|
"loss": 0.9075, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.011725391334935803, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 9.882746086650642e-06, |
|
"loss": 0.9488, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.013191065251802778, |
|
"grad_norm": 0.486328125, |
|
"learning_rate": 9.868089347481973e-06, |
|
"loss": 0.8637, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.014656739168669754, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 9.853432608313303e-06, |
|
"loss": 0.9587, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.01612241308553673, |
|
"grad_norm": 0.2451171875, |
|
"learning_rate": 9.838775869144634e-06, |
|
"loss": 0.8171, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.017588087002403704, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 9.824119129975964e-06, |
|
"loss": 0.8413, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.01905376091927068, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 9.809462390807294e-06, |
|
"loss": 0.796, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.020519434836137655, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 9.794805651638625e-06, |
|
"loss": 0.8025, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.02198510875300463, |
|
"grad_norm": 0.1630859375, |
|
"learning_rate": 9.780148912469955e-06, |
|
"loss": 0.8279, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.023450782669871606, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 9.765492173301284e-06, |
|
"loss": 0.807, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.02491645658673858, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 9.750835434132614e-06, |
|
"loss": 0.9146, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.026382130503605557, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 9.736178694963945e-06, |
|
"loss": 0.8667, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.027847804420472532, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 9.721521955795275e-06, |
|
"loss": 0.8888, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.029313478337339507, |
|
"grad_norm": 0.1982421875, |
|
"learning_rate": 9.706865216626606e-06, |
|
"loss": 0.8417, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.030779152254206483, |
|
"grad_norm": 0.20703125, |
|
"learning_rate": 9.692208477457936e-06, |
|
"loss": 0.8205, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.03224482617107346, |
|
"grad_norm": 0.2216796875, |
|
"learning_rate": 9.677551738289266e-06, |
|
"loss": 0.8273, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.03371050008794044, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 9.662894999120597e-06, |
|
"loss": 0.8107, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.03517617400480741, |
|
"grad_norm": 0.1728515625, |
|
"learning_rate": 9.648238259951927e-06, |
|
"loss": 0.7997, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.03664184792167439, |
|
"grad_norm": 0.458984375, |
|
"learning_rate": 9.633581520783258e-06, |
|
"loss": 0.8802, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.03810752183854136, |
|
"grad_norm": 0.2333984375, |
|
"learning_rate": 9.618924781614586e-06, |
|
"loss": 0.8859, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.03957319575540834, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 9.604268042445918e-06, |
|
"loss": 0.8317, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.04103886967227531, |
|
"grad_norm": 0.208984375, |
|
"learning_rate": 9.589611303277247e-06, |
|
"loss": 0.8483, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.04250454358914229, |
|
"grad_norm": 0.2490234375, |
|
"learning_rate": 9.57495456410858e-06, |
|
"loss": 0.8267, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.04397021750600926, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 9.560297824939908e-06, |
|
"loss": 0.9042, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.04543589142287624, |
|
"grad_norm": 0.126953125, |
|
"learning_rate": 9.545641085771238e-06, |
|
"loss": 0.8058, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.04690156533974321, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 9.530984346602569e-06, |
|
"loss": 0.7874, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.04836723925661019, |
|
"grad_norm": 0.216796875, |
|
"learning_rate": 9.516327607433899e-06, |
|
"loss": 0.8562, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.04983291317347716, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 9.50167086826523e-06, |
|
"loss": 0.8571, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.05129858709034414, |
|
"grad_norm": 0.2138671875, |
|
"learning_rate": 9.487014129096558e-06, |
|
"loss": 0.8252, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.05276426100721111, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 9.47235738992789e-06, |
|
"loss": 0.9093, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.05422993492407809, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 9.457700650759219e-06, |
|
"loss": 0.7995, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.055695608840945064, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 9.443043911590551e-06, |
|
"loss": 0.9485, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.05716128275781204, |
|
"grad_norm": 0.248046875, |
|
"learning_rate": 9.42838717242188e-06, |
|
"loss": 0.8695, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.058626956674679015, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 9.41373043325321e-06, |
|
"loss": 0.8645, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.060092630591545994, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 9.39907369408454e-06, |
|
"loss": 0.9043, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.061558304508412966, |
|
"grad_norm": 2.25, |
|
"learning_rate": 9.384416954915871e-06, |
|
"loss": 0.8794, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.06302397842527994, |
|
"grad_norm": 0.248046875, |
|
"learning_rate": 9.369760215747201e-06, |
|
"loss": 0.8327, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 0.06448965234214692, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 9.355103476578532e-06, |
|
"loss": 0.9179, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.06595532625901389, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 9.340446737409862e-06, |
|
"loss": 0.8321, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 0.06742100017588087, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 9.325789998241193e-06, |
|
"loss": 0.8061, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.06888667409274785, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 9.311133259072523e-06, |
|
"loss": 0.8817, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 0.07035234800961482, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 9.296476519903852e-06, |
|
"loss": 0.9002, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.07181802192648179, |
|
"grad_norm": 0.38671875, |
|
"learning_rate": 9.281819780735182e-06, |
|
"loss": 0.9576, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 0.07328369584334878, |
|
"grad_norm": 0.439453125, |
|
"learning_rate": 9.267163041566513e-06, |
|
"loss": 0.8482, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.07474936976021575, |
|
"grad_norm": 0.1474609375, |
|
"learning_rate": 9.252506302397843e-06, |
|
"loss": 0.8323, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 0.07621504367708272, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 9.237849563229173e-06, |
|
"loss": 0.8371, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.07768071759394969, |
|
"grad_norm": 0.232421875, |
|
"learning_rate": 9.223192824060504e-06, |
|
"loss": 0.8462, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 0.07914639151081668, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 9.208536084891834e-06, |
|
"loss": 0.9729, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.08061206542768365, |
|
"grad_norm": 0.2177734375, |
|
"learning_rate": 9.193879345723165e-06, |
|
"loss": 1.0097, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 0.08207773934455062, |
|
"grad_norm": 0.205078125, |
|
"learning_rate": 9.179222606554495e-06, |
|
"loss": 0.8492, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.0835434132614176, |
|
"grad_norm": 0.2197265625, |
|
"learning_rate": 9.164565867385825e-06, |
|
"loss": 0.8352, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 0.08500908717828458, |
|
"grad_norm": 0.40625, |
|
"learning_rate": 9.149909128217154e-06, |
|
"loss": 0.82, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.08647476109515155, |
|
"grad_norm": 0.23828125, |
|
"learning_rate": 9.135252389048486e-06, |
|
"loss": 0.8827, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 0.08794043501201852, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 9.120595649879815e-06, |
|
"loss": 0.8238, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.08940610892888551, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 9.105938910711145e-06, |
|
"loss": 0.9153, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 0.09087178284575248, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 9.091282171542476e-06, |
|
"loss": 0.8383, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.09233745676261945, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 9.076625432373806e-06, |
|
"loss": 0.8326, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 0.09380313067948642, |
|
"grad_norm": 0.197265625, |
|
"learning_rate": 9.061968693205137e-06, |
|
"loss": 0.9013, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.09526880459635341, |
|
"grad_norm": 0.23828125, |
|
"learning_rate": 9.047311954036467e-06, |
|
"loss": 0.882, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 0.09673447851322038, |
|
"grad_norm": 0.427734375, |
|
"learning_rate": 9.032655214867797e-06, |
|
"loss": 0.8533, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.09820015243008735, |
|
"grad_norm": 0.44140625, |
|
"learning_rate": 9.017998475699126e-06, |
|
"loss": 0.8696, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 0.09966582634695433, |
|
"grad_norm": 5.71875, |
|
"learning_rate": 9.003341736530458e-06, |
|
"loss": 1.0214, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.10113150026382131, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 8.988684997361787e-06, |
|
"loss": 0.8732, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 0.10259717418068828, |
|
"grad_norm": 0.1787109375, |
|
"learning_rate": 8.974028258193117e-06, |
|
"loss": 0.9454, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.10406284809755526, |
|
"grad_norm": 0.2294921875, |
|
"learning_rate": 8.959371519024448e-06, |
|
"loss": 0.8994, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 0.10552852201442223, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 8.944714779855778e-06, |
|
"loss": 0.9015, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.10699419593128921, |
|
"grad_norm": 0.138671875, |
|
"learning_rate": 8.930058040687109e-06, |
|
"loss": 0.884, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 0.10845986984815618, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 8.915401301518439e-06, |
|
"loss": 0.7944, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.10992554376502316, |
|
"grad_norm": 0.1953125, |
|
"learning_rate": 8.90074456234977e-06, |
|
"loss": 0.8855, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 0.11139121768189013, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 8.8860878231811e-06, |
|
"loss": 1.0339, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.11285689159875711, |
|
"grad_norm": 0.48046875, |
|
"learning_rate": 8.87143108401243e-06, |
|
"loss": 1.0966, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 0.11432256551562409, |
|
"grad_norm": 0.2421875, |
|
"learning_rate": 8.85677434484376e-06, |
|
"loss": 0.8355, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.11578823943249106, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 8.84211760567509e-06, |
|
"loss": 0.8109, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 0.11725391334935803, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 8.82746086650642e-06, |
|
"loss": 0.9596, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.11871958726622502, |
|
"grad_norm": 0.375, |
|
"learning_rate": 8.81280412733775e-06, |
|
"loss": 0.8931, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 0.12018526118309199, |
|
"grad_norm": 0.20703125, |
|
"learning_rate": 8.79814738816908e-06, |
|
"loss": 0.818, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.12165093509995896, |
|
"grad_norm": 0.2265625, |
|
"learning_rate": 8.783490649000411e-06, |
|
"loss": 0.8465, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 0.12311660901682593, |
|
"grad_norm": 0.1689453125, |
|
"learning_rate": 8.768833909831741e-06, |
|
"loss": 0.8754, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.12458228293369292, |
|
"grad_norm": 0.158203125, |
|
"learning_rate": 8.754177170663072e-06, |
|
"loss": 0.8368, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 0.1260479568505599, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 8.739520431494402e-06, |
|
"loss": 0.8626, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.12751363076742686, |
|
"grad_norm": 0.22265625, |
|
"learning_rate": 8.724863692325733e-06, |
|
"loss": 0.9038, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 0.12897930468429383, |
|
"grad_norm": 0.1796875, |
|
"learning_rate": 8.710206953157061e-06, |
|
"loss": 0.8421, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.1304449786011608, |
|
"grad_norm": 0.5, |
|
"learning_rate": 8.695550213988393e-06, |
|
"loss": 0.9368, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 0.13191065251802778, |
|
"grad_norm": 0.3671875, |
|
"learning_rate": 8.680893474819722e-06, |
|
"loss": 0.8676, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.13337632643489478, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 8.666236735651054e-06, |
|
"loss": 0.7942, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 0.13484200035176175, |
|
"grad_norm": 0.2216796875, |
|
"learning_rate": 8.651579996482383e-06, |
|
"loss": 0.895, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.13630767426862872, |
|
"grad_norm": 0.2119140625, |
|
"learning_rate": 8.636923257313713e-06, |
|
"loss": 0.8239, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 0.1377733481854957, |
|
"grad_norm": 0.1650390625, |
|
"learning_rate": 8.622266518145044e-06, |
|
"loss": 0.8594, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.13923902210236266, |
|
"grad_norm": 0.1923828125, |
|
"learning_rate": 8.607609778976374e-06, |
|
"loss": 0.9231, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 0.14070469601922964, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 8.592953039807705e-06, |
|
"loss": 0.8337, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.1421703699360966, |
|
"grad_norm": 0.10498046875, |
|
"learning_rate": 8.578296300639033e-06, |
|
"loss": 0.8142, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 0.14363604385296358, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 8.563639561470365e-06, |
|
"loss": 0.8561, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.14510171776983058, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 8.548982822301694e-06, |
|
"loss": 0.9175, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 0.14656739168669755, |
|
"grad_norm": 0.146484375, |
|
"learning_rate": 8.534326083133026e-06, |
|
"loss": 0.9498, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.14803306560356452, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 8.519669343964355e-06, |
|
"loss": 0.9313, |
|
"step": 2525 |
|
}, |
|
{ |
|
"epoch": 0.1494987395204315, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 8.505012604795685e-06, |
|
"loss": 0.9784, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.15096441343729847, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 8.490355865627016e-06, |
|
"loss": 0.9337, |
|
"step": 2575 |
|
}, |
|
{ |
|
"epoch": 0.15243008735416544, |
|
"grad_norm": 0.1025390625, |
|
"learning_rate": 8.475699126458346e-06, |
|
"loss": 0.975, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.1538957612710324, |
|
"grad_norm": 0.375, |
|
"learning_rate": 8.461042387289676e-06, |
|
"loss": 0.8933, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 0.15536143518789938, |
|
"grad_norm": 0.234375, |
|
"learning_rate": 8.446385648121007e-06, |
|
"loss": 0.7984, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.15682710910476638, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 8.431728908952337e-06, |
|
"loss": 0.8407, |
|
"step": 2675 |
|
}, |
|
{ |
|
"epoch": 0.15829278302163335, |
|
"grad_norm": 0.23828125, |
|
"learning_rate": 8.417072169783668e-06, |
|
"loss": 0.8591, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.15975845693850033, |
|
"grad_norm": 0.439453125, |
|
"learning_rate": 8.402415430614998e-06, |
|
"loss": 0.906, |
|
"step": 2725 |
|
}, |
|
{ |
|
"epoch": 0.1612241308553673, |
|
"grad_norm": 0.455078125, |
|
"learning_rate": 8.387758691446328e-06, |
|
"loss": 0.8644, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.16268980477223427, |
|
"grad_norm": 0.162109375, |
|
"learning_rate": 8.373101952277657e-06, |
|
"loss": 0.8917, |
|
"step": 2775 |
|
}, |
|
{ |
|
"epoch": 0.16415547868910124, |
|
"grad_norm": 0.2197265625, |
|
"learning_rate": 8.358445213108988e-06, |
|
"loss": 0.852, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.1656211526059682, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 8.343788473940318e-06, |
|
"loss": 0.9059, |
|
"step": 2825 |
|
}, |
|
{ |
|
"epoch": 0.1670868265228352, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 8.329131734771648e-06, |
|
"loss": 0.9189, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.16855250043970219, |
|
"grad_norm": 0.19140625, |
|
"learning_rate": 8.314474995602979e-06, |
|
"loss": 0.9356, |
|
"step": 2875 |
|
}, |
|
{ |
|
"epoch": 0.17001817435656916, |
|
"grad_norm": 0.232421875, |
|
"learning_rate": 8.29981825643431e-06, |
|
"loss": 0.8653, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.17148384827343613, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 8.28516151726564e-06, |
|
"loss": 0.9481, |
|
"step": 2925 |
|
}, |
|
{ |
|
"epoch": 0.1729495221903031, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 8.27050477809697e-06, |
|
"loss": 0.8533, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.17441519610717007, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 8.2558480389283e-06, |
|
"loss": 0.9067, |
|
"step": 2975 |
|
}, |
|
{ |
|
"epoch": 0.17588087002403704, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 8.24119129975963e-06, |
|
"loss": 0.8769, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.17734654394090402, |
|
"grad_norm": 0.19921875, |
|
"learning_rate": 8.226534560590961e-06, |
|
"loss": 1.0016, |
|
"step": 3025 |
|
}, |
|
{ |
|
"epoch": 0.17881221785777102, |
|
"grad_norm": 0.2060546875, |
|
"learning_rate": 8.21187782142229e-06, |
|
"loss": 0.8201, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.180277891774638, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 8.197221082253622e-06, |
|
"loss": 0.9004, |
|
"step": 3075 |
|
}, |
|
{ |
|
"epoch": 0.18174356569150496, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 8.18256434308495e-06, |
|
"loss": 0.9973, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.18320923960837193, |
|
"grad_norm": 0.2353515625, |
|
"learning_rate": 8.167907603916281e-06, |
|
"loss": 0.8702, |
|
"step": 3125 |
|
}, |
|
{ |
|
"epoch": 0.1846749135252389, |
|
"grad_norm": 0.21875, |
|
"learning_rate": 8.153250864747612e-06, |
|
"loss": 1.1181, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.18614058744210588, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 8.138594125578942e-06, |
|
"loss": 0.9449, |
|
"step": 3175 |
|
}, |
|
{ |
|
"epoch": 0.18760626135897285, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 8.123937386410272e-06, |
|
"loss": 1.0326, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.18907193527583982, |
|
"grad_norm": 0.2099609375, |
|
"learning_rate": 8.109280647241601e-06, |
|
"loss": 0.836, |
|
"step": 3225 |
|
}, |
|
{ |
|
"epoch": 0.19053760919270682, |
|
"grad_norm": 0.201171875, |
|
"learning_rate": 8.094623908072933e-06, |
|
"loss": 0.8702, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.1920032831095738, |
|
"grad_norm": 0.18359375, |
|
"learning_rate": 8.079967168904262e-06, |
|
"loss": 0.9676, |
|
"step": 3275 |
|
}, |
|
{ |
|
"epoch": 0.19346895702644076, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 8.065310429735594e-06, |
|
"loss": 0.9803, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.19493463094330774, |
|
"grad_norm": 0.125, |
|
"learning_rate": 8.050653690566923e-06, |
|
"loss": 0.8786, |
|
"step": 3325 |
|
}, |
|
{ |
|
"epoch": 0.1964003048601747, |
|
"grad_norm": 0.216796875, |
|
"learning_rate": 8.035996951398253e-06, |
|
"loss": 0.8793, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 0.19786597877704168, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 8.021340212229584e-06, |
|
"loss": 0.9167, |
|
"step": 3375 |
|
}, |
|
{ |
|
"epoch": 0.19933165269390865, |
|
"grad_norm": 0.1484375, |
|
"learning_rate": 8.006683473060914e-06, |
|
"loss": 0.8833, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.20079732661077562, |
|
"grad_norm": 0.431640625, |
|
"learning_rate": 7.992026733892244e-06, |
|
"loss": 0.9054, |
|
"step": 3425 |
|
}, |
|
{ |
|
"epoch": 0.20226300052764262, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 7.977369994723575e-06, |
|
"loss": 0.9006, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.2037286744445096, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 7.962713255554905e-06, |
|
"loss": 0.8351, |
|
"step": 3475 |
|
}, |
|
{ |
|
"epoch": 0.20519434836137657, |
|
"grad_norm": 0.2099609375, |
|
"learning_rate": 7.948056516386236e-06, |
|
"loss": 0.8815, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.20666002227824354, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 7.933399777217566e-06, |
|
"loss": 0.8492, |
|
"step": 3525 |
|
}, |
|
{ |
|
"epoch": 0.2081256961951105, |
|
"grad_norm": 0.23046875, |
|
"learning_rate": 7.918743038048895e-06, |
|
"loss": 0.7399, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 0.20959137011197748, |
|
"grad_norm": 0.2490234375, |
|
"learning_rate": 7.904086298880225e-06, |
|
"loss": 0.8195, |
|
"step": 3575 |
|
}, |
|
{ |
|
"epoch": 0.21105704402884445, |
|
"grad_norm": 0.2099609375, |
|
"learning_rate": 7.889429559711556e-06, |
|
"loss": 0.8085, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.21252271794571143, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 7.874772820542886e-06, |
|
"loss": 0.9241, |
|
"step": 3625 |
|
}, |
|
{ |
|
"epoch": 0.21398839186257843, |
|
"grad_norm": 0.189453125, |
|
"learning_rate": 7.860116081374216e-06, |
|
"loss": 0.8614, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 0.2154540657794454, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 7.845459342205547e-06, |
|
"loss": 0.8509, |
|
"step": 3675 |
|
}, |
|
{ |
|
"epoch": 0.21691973969631237, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 7.830802603036877e-06, |
|
"loss": 0.9153, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.21838541361317934, |
|
"grad_norm": 0.435546875, |
|
"learning_rate": 7.816145863868208e-06, |
|
"loss": 0.857, |
|
"step": 3725 |
|
}, |
|
{ |
|
"epoch": 0.2198510875300463, |
|
"grad_norm": 0.1904296875, |
|
"learning_rate": 7.801489124699538e-06, |
|
"loss": 0.8623, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.22131676144691328, |
|
"grad_norm": 0.2216796875, |
|
"learning_rate": 7.786832385530868e-06, |
|
"loss": 0.9246, |
|
"step": 3775 |
|
}, |
|
{ |
|
"epoch": 0.22278243536378026, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 7.772175646362197e-06, |
|
"loss": 0.9, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.22424810928064723, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 7.75751890719353e-06, |
|
"loss": 1.0074, |
|
"step": 3825 |
|
}, |
|
{ |
|
"epoch": 0.22571378319751423, |
|
"grad_norm": 0.98046875, |
|
"learning_rate": 7.742862168024858e-06, |
|
"loss": 0.8933, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 0.2271794571143812, |
|
"grad_norm": 0.462890625, |
|
"learning_rate": 7.72820542885619e-06, |
|
"loss": 0.8541, |
|
"step": 3875 |
|
}, |
|
{ |
|
"epoch": 0.22864513103124817, |
|
"grad_norm": 0.48828125, |
|
"learning_rate": 7.713548689687519e-06, |
|
"loss": 0.8043, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.23011080494811514, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 7.698891950518849e-06, |
|
"loss": 0.9046, |
|
"step": 3925 |
|
}, |
|
{ |
|
"epoch": 0.23157647886498212, |
|
"grad_norm": 0.3671875, |
|
"learning_rate": 7.68423521135018e-06, |
|
"loss": 1.0241, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 0.2330421527818491, |
|
"grad_norm": 0.412109375, |
|
"learning_rate": 7.66957847218151e-06, |
|
"loss": 0.8257, |
|
"step": 3975 |
|
}, |
|
{ |
|
"epoch": 0.23450782669871606, |
|
"grad_norm": 0.248046875, |
|
"learning_rate": 7.65492173301284e-06, |
|
"loss": 0.9551, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.23597350061558303, |
|
"grad_norm": 0.2314453125, |
|
"learning_rate": 7.640264993844169e-06, |
|
"loss": 0.9757, |
|
"step": 4025 |
|
}, |
|
{ |
|
"epoch": 0.23743917453245003, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 7.6256082546755e-06, |
|
"loss": 0.9187, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 0.238904848449317, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 7.610951515506831e-06, |
|
"loss": 0.9166, |
|
"step": 4075 |
|
}, |
|
{ |
|
"epoch": 0.24037052236618398, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 7.596294776338161e-06, |
|
"loss": 0.9277, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.24183619628305095, |
|
"grad_norm": 0.16796875, |
|
"learning_rate": 7.581638037169491e-06, |
|
"loss": 0.8612, |
|
"step": 4125 |
|
}, |
|
{ |
|
"epoch": 0.24330187019991792, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 7.566981298000822e-06, |
|
"loss": 0.8533, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 0.2447675441167849, |
|
"grad_norm": 0.2470703125, |
|
"learning_rate": 7.5523245588321515e-06, |
|
"loss": 0.8259, |
|
"step": 4175 |
|
}, |
|
{ |
|
"epoch": 0.24623321803365186, |
|
"grad_norm": 0.126953125, |
|
"learning_rate": 7.537667819663482e-06, |
|
"loss": 0.882, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.24769889195051886, |
|
"grad_norm": 0.193359375, |
|
"learning_rate": 7.523011080494812e-06, |
|
"loss": 0.9179, |
|
"step": 4225 |
|
}, |
|
{ |
|
"epoch": 0.24916456586738583, |
|
"grad_norm": 0.2138671875, |
|
"learning_rate": 7.508354341326143e-06, |
|
"loss": 0.7686, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 0.2506302397842528, |
|
"grad_norm": 0.1982421875, |
|
"learning_rate": 7.493697602157472e-06, |
|
"loss": 0.854, |
|
"step": 4275 |
|
}, |
|
{ |
|
"epoch": 0.2520959137011198, |
|
"grad_norm": 0.2275390625, |
|
"learning_rate": 7.4790408629888035e-06, |
|
"loss": 0.9073, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.25356158761798675, |
|
"grad_norm": 0.232421875, |
|
"learning_rate": 7.464384123820133e-06, |
|
"loss": 0.8411, |
|
"step": 4325 |
|
}, |
|
{ |
|
"epoch": 0.2550272615348537, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 7.449727384651463e-06, |
|
"loss": 0.911, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 0.2564929354517207, |
|
"grad_norm": 0.2158203125, |
|
"learning_rate": 7.435070645482794e-06, |
|
"loss": 0.8707, |
|
"step": 4375 |
|
}, |
|
{ |
|
"epoch": 0.25795860936858767, |
|
"grad_norm": 0.1962890625, |
|
"learning_rate": 7.420413906314123e-06, |
|
"loss": 0.8607, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.25942428328545464, |
|
"grad_norm": 0.45703125, |
|
"learning_rate": 7.405757167145454e-06, |
|
"loss": 0.8238, |
|
"step": 4425 |
|
}, |
|
{ |
|
"epoch": 0.2608899572023216, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 7.391100427976784e-06, |
|
"loss": 0.9302, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 0.2623556311191886, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 7.376443688808115e-06, |
|
"loss": 0.8033, |
|
"step": 4475 |
|
}, |
|
{ |
|
"epoch": 0.26382130503605555, |
|
"grad_norm": 0.2060546875, |
|
"learning_rate": 7.361786949639444e-06, |
|
"loss": 0.8793, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.2652869789529226, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 7.3471302104707754e-06, |
|
"loss": 0.8292, |
|
"step": 4525 |
|
}, |
|
{ |
|
"epoch": 0.26675265286978955, |
|
"grad_norm": 0.232421875, |
|
"learning_rate": 7.332473471302105e-06, |
|
"loss": 0.8131, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 0.2682183267866565, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 7.317816732133436e-06, |
|
"loss": 0.8427, |
|
"step": 4575 |
|
}, |
|
{ |
|
"epoch": 0.2696840007035235, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 7.303159992964766e-06, |
|
"loss": 0.9036, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.27114967462039047, |
|
"grad_norm": 0.484375, |
|
"learning_rate": 7.288503253796096e-06, |
|
"loss": 0.9235, |
|
"step": 4625 |
|
}, |
|
{ |
|
"epoch": 0.27261534853725744, |
|
"grad_norm": 0.201171875, |
|
"learning_rate": 7.273846514627426e-06, |
|
"loss": 0.9909, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 0.2740810224541244, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 7.259189775458757e-06, |
|
"loss": 0.9281, |
|
"step": 4675 |
|
}, |
|
{ |
|
"epoch": 0.2755466963709914, |
|
"grad_norm": 0.189453125, |
|
"learning_rate": 7.244533036290087e-06, |
|
"loss": 0.8731, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.27701237028785836, |
|
"grad_norm": 0.15234375, |
|
"learning_rate": 7.229876297121416e-06, |
|
"loss": 0.8967, |
|
"step": 4725 |
|
}, |
|
{ |
|
"epoch": 0.27847804420472533, |
|
"grad_norm": 0.376953125, |
|
"learning_rate": 7.215219557952747e-06, |
|
"loss": 0.8448, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 0.2799437181215923, |
|
"grad_norm": 0.376953125, |
|
"learning_rate": 7.200562818784077e-06, |
|
"loss": 0.846, |
|
"step": 4775 |
|
}, |
|
{ |
|
"epoch": 0.28140939203845927, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 7.185906079615408e-06, |
|
"loss": 0.785, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.28287506595532624, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 7.171249340446738e-06, |
|
"loss": 0.995, |
|
"step": 4825 |
|
}, |
|
{ |
|
"epoch": 0.2843407398721932, |
|
"grad_norm": 0.162109375, |
|
"learning_rate": 7.156592601278068e-06, |
|
"loss": 0.879, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 0.2858064137890602, |
|
"grad_norm": 0.1748046875, |
|
"learning_rate": 7.141935862109399e-06, |
|
"loss": 0.9504, |
|
"step": 4875 |
|
}, |
|
{ |
|
"epoch": 0.28727208770592716, |
|
"grad_norm": 0.2080078125, |
|
"learning_rate": 7.127279122940729e-06, |
|
"loss": 1.0018, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.2887377616227942, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 7.1126223837720585e-06, |
|
"loss": 0.9797, |
|
"step": 4925 |
|
}, |
|
{ |
|
"epoch": 0.29020343553966116, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 7.09796564460339e-06, |
|
"loss": 0.8494, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 0.29166910945652813, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 7.083308905434719e-06, |
|
"loss": 0.9362, |
|
"step": 4975 |
|
}, |
|
{ |
|
"epoch": 0.2931347833733951, |
|
"grad_norm": 0.146484375, |
|
"learning_rate": 7.06865216626605e-06, |
|
"loss": 0.7694, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.2946004572902621, |
|
"grad_norm": 0.166015625, |
|
"learning_rate": 7.05399542709738e-06, |
|
"loss": 0.9893, |
|
"step": 5025 |
|
}, |
|
{ |
|
"epoch": 0.29606613120712905, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 7.0393386879287106e-06, |
|
"loss": 0.8555, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 0.297531805123996, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 7.02468194876004e-06, |
|
"loss": 0.8452, |
|
"step": 5075 |
|
}, |
|
{ |
|
"epoch": 0.298997479040863, |
|
"grad_norm": 0.21484375, |
|
"learning_rate": 7.010025209591371e-06, |
|
"loss": 0.8275, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.30046315295772996, |
|
"grad_norm": 0.185546875, |
|
"learning_rate": 6.995368470422701e-06, |
|
"loss": 0.7464, |
|
"step": 5125 |
|
}, |
|
{ |
|
"epoch": 0.30192882687459693, |
|
"grad_norm": 0.140625, |
|
"learning_rate": 6.9807117312540305e-06, |
|
"loss": 0.8753, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 0.3033945007914639, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 6.966054992085362e-06, |
|
"loss": 1.016, |
|
"step": 5175 |
|
}, |
|
{ |
|
"epoch": 0.3048601747083309, |
|
"grad_norm": 0.1796875, |
|
"learning_rate": 6.951398252916691e-06, |
|
"loss": 0.8956, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.30632584862519785, |
|
"grad_norm": 0.185546875, |
|
"learning_rate": 6.936741513748022e-06, |
|
"loss": 0.8358, |
|
"step": 5225 |
|
}, |
|
{ |
|
"epoch": 0.3077915225420648, |
|
"grad_norm": 0.23046875, |
|
"learning_rate": 6.922084774579352e-06, |
|
"loss": 0.9687, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 0.3092571964589318, |
|
"grad_norm": 0.216796875, |
|
"learning_rate": 6.9074280354106825e-06, |
|
"loss": 0.8358, |
|
"step": 5275 |
|
}, |
|
{ |
|
"epoch": 0.31072287037579877, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 6.892771296242012e-06, |
|
"loss": 0.8345, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.3121885442926658, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 6.878114557073343e-06, |
|
"loss": 0.9074, |
|
"step": 5325 |
|
}, |
|
{ |
|
"epoch": 0.31365421820953276, |
|
"grad_norm": 4.96875, |
|
"learning_rate": 6.863457817904673e-06, |
|
"loss": 0.8969, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 0.31511989212639974, |
|
"grad_norm": 0.2373046875, |
|
"learning_rate": 6.848801078736004e-06, |
|
"loss": 0.7103, |
|
"step": 5375 |
|
}, |
|
{ |
|
"epoch": 0.3165855660432667, |
|
"grad_norm": 0.1552734375, |
|
"learning_rate": 6.834144339567334e-06, |
|
"loss": 0.8087, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.3180512399601337, |
|
"grad_norm": 0.1982421875, |
|
"learning_rate": 6.819487600398664e-06, |
|
"loss": 0.9289, |
|
"step": 5425 |
|
}, |
|
{ |
|
"epoch": 0.31951691387700065, |
|
"grad_norm": 0.24609375, |
|
"learning_rate": 6.804830861229994e-06, |
|
"loss": 0.8306, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 0.3209825877938676, |
|
"grad_norm": 0.19921875, |
|
"learning_rate": 6.790174122061325e-06, |
|
"loss": 0.793, |
|
"step": 5475 |
|
}, |
|
{ |
|
"epoch": 0.3224482617107346, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 6.7755173828926545e-06, |
|
"loss": 0.768, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.32391393562760157, |
|
"grad_norm": 0.2041015625, |
|
"learning_rate": 6.760860643723984e-06, |
|
"loss": 0.8649, |
|
"step": 5525 |
|
}, |
|
{ |
|
"epoch": 0.32537960954446854, |
|
"grad_norm": 0.2373046875, |
|
"learning_rate": 6.746203904555315e-06, |
|
"loss": 0.9173, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 0.3268452834613355, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 6.731547165386645e-06, |
|
"loss": 0.8569, |
|
"step": 5575 |
|
}, |
|
{ |
|
"epoch": 0.3283109573782025, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 6.716890426217976e-06, |
|
"loss": 0.8013, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.32977663129506946, |
|
"grad_norm": 0.1767578125, |
|
"learning_rate": 6.702233687049306e-06, |
|
"loss": 0.9285, |
|
"step": 5625 |
|
}, |
|
{ |
|
"epoch": 0.3312423052119364, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 6.687576947880636e-06, |
|
"loss": 0.9202, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 0.3327079791288034, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 6.672920208711966e-06, |
|
"loss": 0.8727, |
|
"step": 5675 |
|
}, |
|
{ |
|
"epoch": 0.3341736530456704, |
|
"grad_norm": 0.205078125, |
|
"learning_rate": 6.658263469543297e-06, |
|
"loss": 0.8877, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.3356393269625374, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 6.6436067303746264e-06, |
|
"loss": 0.7748, |
|
"step": 5725 |
|
}, |
|
{ |
|
"epoch": 0.33710500087940437, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 6.628949991205958e-06, |
|
"loss": 0.805, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 0.33857067479627134, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 6.614293252037287e-06, |
|
"loss": 0.8786, |
|
"step": 5775 |
|
}, |
|
{ |
|
"epoch": 0.3400363487131383, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 6.599636512868618e-06, |
|
"loss": 0.8441, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.3415020226300053, |
|
"grad_norm": 0.228515625, |
|
"learning_rate": 6.584979773699948e-06, |
|
"loss": 0.8751, |
|
"step": 5825 |
|
}, |
|
{ |
|
"epoch": 0.34296769654687226, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 6.5703230345312785e-06, |
|
"loss": 0.9569, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 0.34443337046373923, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 6.555666295362608e-06, |
|
"loss": 0.9255, |
|
"step": 5875 |
|
}, |
|
{ |
|
"epoch": 0.3458990443806062, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 6.541009556193938e-06, |
|
"loss": 0.8501, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.3473647182974732, |
|
"grad_norm": 0.2001953125, |
|
"learning_rate": 6.526352817025269e-06, |
|
"loss": 0.8933, |
|
"step": 5925 |
|
}, |
|
{ |
|
"epoch": 0.34883039221434015, |
|
"grad_norm": 0.19921875, |
|
"learning_rate": 6.511696077856598e-06, |
|
"loss": 0.8941, |
|
"step": 5950 |
|
}, |
|
{ |
|
"epoch": 0.3502960661312071, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 6.49703933868793e-06, |
|
"loss": 1.0143, |
|
"step": 5975 |
|
}, |
|
{ |
|
"epoch": 0.3517617400480741, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 6.482382599519259e-06, |
|
"loss": 0.8929, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.35322741396494106, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 6.46772586035059e-06, |
|
"loss": 0.8641, |
|
"step": 6025 |
|
}, |
|
{ |
|
"epoch": 0.35469308788180803, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 6.45306912118192e-06, |
|
"loss": 0.8743, |
|
"step": 6050 |
|
}, |
|
{ |
|
"epoch": 0.356158761798675, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 6.4384123820132504e-06, |
|
"loss": 0.8072, |
|
"step": 6075 |
|
}, |
|
{ |
|
"epoch": 0.35762443571554203, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 6.42375564284458e-06, |
|
"loss": 0.8964, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.359090109632409, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 6.409098903675911e-06, |
|
"loss": 0.9049, |
|
"step": 6125 |
|
}, |
|
{ |
|
"epoch": 0.360555783549276, |
|
"grad_norm": 0.17578125, |
|
"learning_rate": 6.394442164507241e-06, |
|
"loss": 0.9438, |
|
"step": 6150 |
|
}, |
|
{ |
|
"epoch": 0.36202145746614295, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 6.379785425338571e-06, |
|
"loss": 0.8891, |
|
"step": 6175 |
|
}, |
|
{ |
|
"epoch": 0.3634871313830099, |
|
"grad_norm": 0.130859375, |
|
"learning_rate": 6.365128686169902e-06, |
|
"loss": 1.1601, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 0.3649528052998769, |
|
"grad_norm": 0.216796875, |
|
"learning_rate": 6.350471947001232e-06, |
|
"loss": 0.968, |
|
"step": 6225 |
|
}, |
|
{ |
|
"epoch": 0.36641847921674386, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 6.3358152078325616e-06, |
|
"loss": 0.962, |
|
"step": 6250 |
|
}, |
|
{ |
|
"epoch": 0.36788415313361084, |
|
"grad_norm": 0.4453125, |
|
"learning_rate": 6.321158468663893e-06, |
|
"loss": 0.8971, |
|
"step": 6275 |
|
}, |
|
{ |
|
"epoch": 0.3693498270504778, |
|
"grad_norm": 0.2255859375, |
|
"learning_rate": 6.306501729495222e-06, |
|
"loss": 0.8835, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 0.3708155009673448, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 6.291844990326552e-06, |
|
"loss": 0.8783, |
|
"step": 6325 |
|
}, |
|
{ |
|
"epoch": 0.37228117488421175, |
|
"grad_norm": 0.490234375, |
|
"learning_rate": 6.277188251157883e-06, |
|
"loss": 0.8959, |
|
"step": 6350 |
|
}, |
|
{ |
|
"epoch": 0.3737468488010787, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 6.262531511989213e-06, |
|
"loss": 0.8945, |
|
"step": 6375 |
|
}, |
|
{ |
|
"epoch": 0.3752125227179457, |
|
"grad_norm": 0.173828125, |
|
"learning_rate": 6.247874772820543e-06, |
|
"loss": 0.9492, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 0.37667819663481267, |
|
"grad_norm": 0.37890625, |
|
"learning_rate": 6.2332180336518736e-06, |
|
"loss": 0.8728, |
|
"step": 6425 |
|
}, |
|
{ |
|
"epoch": 0.37814387055167964, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 6.218561294483204e-06, |
|
"loss": 0.8508, |
|
"step": 6450 |
|
}, |
|
{ |
|
"epoch": 0.3796095444685466, |
|
"grad_norm": 0.1884765625, |
|
"learning_rate": 6.2039045553145335e-06, |
|
"loss": 0.9759, |
|
"step": 6475 |
|
}, |
|
{ |
|
"epoch": 0.38107521838541364, |
|
"grad_norm": 0.216796875, |
|
"learning_rate": 6.189247816145865e-06, |
|
"loss": 1.0576, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.3825408923022806, |
|
"grad_norm": 0.240234375, |
|
"learning_rate": 6.174591076977194e-06, |
|
"loss": 0.7754, |
|
"step": 6525 |
|
}, |
|
{ |
|
"epoch": 0.3840065662191476, |
|
"grad_norm": 0.189453125, |
|
"learning_rate": 6.159934337808526e-06, |
|
"loss": 0.9326, |
|
"step": 6550 |
|
}, |
|
{ |
|
"epoch": 0.38547224013601455, |
|
"grad_norm": 0.1796875, |
|
"learning_rate": 6.145277598639855e-06, |
|
"loss": 0.8764, |
|
"step": 6575 |
|
}, |
|
{ |
|
"epoch": 0.3869379140528815, |
|
"grad_norm": 0.2314453125, |
|
"learning_rate": 6.1306208594711856e-06, |
|
"loss": 0.8589, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 0.3884035879697485, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 6.115964120302515e-06, |
|
"loss": 1.0026, |
|
"step": 6625 |
|
}, |
|
{ |
|
"epoch": 0.38986926188661547, |
|
"grad_norm": 0.19140625, |
|
"learning_rate": 6.101307381133846e-06, |
|
"loss": 0.8483, |
|
"step": 6650 |
|
}, |
|
{ |
|
"epoch": 0.39133493580348244, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 6.086650641965176e-06, |
|
"loss": 0.9063, |
|
"step": 6675 |
|
}, |
|
{ |
|
"epoch": 0.3928006097203494, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 6.0719939027965055e-06, |
|
"loss": 0.8658, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 0.3942662836372164, |
|
"grad_norm": 0.2197265625, |
|
"learning_rate": 6.057337163627837e-06, |
|
"loss": 0.8389, |
|
"step": 6725 |
|
}, |
|
{ |
|
"epoch": 0.39573195755408336, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 6.042680424459166e-06, |
|
"loss": 0.8769, |
|
"step": 6750 |
|
}, |
|
{ |
|
"epoch": 0.39719763147095033, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 6.0280236852904975e-06, |
|
"loss": 0.9461, |
|
"step": 6775 |
|
}, |
|
{ |
|
"epoch": 0.3986633053878173, |
|
"grad_norm": 0.17578125, |
|
"learning_rate": 6.013366946121827e-06, |
|
"loss": 0.9144, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 0.4001289793046843, |
|
"grad_norm": 0.2431640625, |
|
"learning_rate": 5.9987102069531575e-06, |
|
"loss": 0.8746, |
|
"step": 6825 |
|
}, |
|
{ |
|
"epoch": 0.40159465322155125, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 5.984053467784487e-06, |
|
"loss": 0.8977, |
|
"step": 6850 |
|
}, |
|
{ |
|
"epoch": 0.4030603271384182, |
|
"grad_norm": 0.478515625, |
|
"learning_rate": 5.969396728615818e-06, |
|
"loss": 0.8404, |
|
"step": 6875 |
|
}, |
|
{ |
|
"epoch": 0.40452600105528524, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 5.954739989447148e-06, |
|
"loss": 0.8321, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 0.4059916749721522, |
|
"grad_norm": 0.169921875, |
|
"learning_rate": 5.940083250278479e-06, |
|
"loss": 0.8963, |
|
"step": 6925 |
|
}, |
|
{ |
|
"epoch": 0.4074573488890192, |
|
"grad_norm": 0.234375, |
|
"learning_rate": 5.925426511109809e-06, |
|
"loss": 0.9668, |
|
"step": 6950 |
|
}, |
|
{ |
|
"epoch": 0.40892302280588616, |
|
"grad_norm": 0.15625, |
|
"learning_rate": 5.910769771941139e-06, |
|
"loss": 0.8921, |
|
"step": 6975 |
|
}, |
|
{ |
|
"epoch": 0.41038869672275313, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 5.8961130327724695e-06, |
|
"loss": 0.9034, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.4118543706396201, |
|
"grad_norm": 0.236328125, |
|
"learning_rate": 5.8814562936038e-06, |
|
"loss": 0.873, |
|
"step": 7025 |
|
}, |
|
{ |
|
"epoch": 0.4133200445564871, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 5.8667995544351295e-06, |
|
"loss": 0.9082, |
|
"step": 7050 |
|
}, |
|
{ |
|
"epoch": 0.41478571847335405, |
|
"grad_norm": 3.171875, |
|
"learning_rate": 5.852142815266459e-06, |
|
"loss": 0.9186, |
|
"step": 7075 |
|
}, |
|
{ |
|
"epoch": 0.416251392390221, |
|
"grad_norm": 0.1396484375, |
|
"learning_rate": 5.83748607609779e-06, |
|
"loss": 0.9711, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 0.417717066307088, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 5.82282933692912e-06, |
|
"loss": 0.8168, |
|
"step": 7125 |
|
}, |
|
{ |
|
"epoch": 0.41918274022395496, |
|
"grad_norm": 0.205078125, |
|
"learning_rate": 5.808172597760451e-06, |
|
"loss": 0.926, |
|
"step": 7150 |
|
}, |
|
{ |
|
"epoch": 0.42064841414082194, |
|
"grad_norm": 0.134765625, |
|
"learning_rate": 5.793515858591781e-06, |
|
"loss": 0.7931, |
|
"step": 7175 |
|
}, |
|
{ |
|
"epoch": 0.4221140880576889, |
|
"grad_norm": 0.1904296875, |
|
"learning_rate": 5.778859119423111e-06, |
|
"loss": 0.9143, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 0.4235797619745559, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 5.7642023802544415e-06, |
|
"loss": 0.846, |
|
"step": 7225 |
|
}, |
|
{ |
|
"epoch": 0.42504543589142285, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 5.749545641085772e-06, |
|
"loss": 0.8936, |
|
"step": 7250 |
|
}, |
|
{ |
|
"epoch": 0.4265111098082899, |
|
"grad_norm": 0.1923828125, |
|
"learning_rate": 5.7348889019171014e-06, |
|
"loss": 0.9142, |
|
"step": 7275 |
|
}, |
|
{ |
|
"epoch": 0.42797678372515685, |
|
"grad_norm": 0.470703125, |
|
"learning_rate": 5.720232162748433e-06, |
|
"loss": 0.9059, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 0.4294424576420238, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 5.705575423579762e-06, |
|
"loss": 0.7794, |
|
"step": 7325 |
|
}, |
|
{ |
|
"epoch": 0.4309081315588908, |
|
"grad_norm": 0.2060546875, |
|
"learning_rate": 5.6909186844110935e-06, |
|
"loss": 0.7724, |
|
"step": 7350 |
|
}, |
|
{ |
|
"epoch": 0.43237380547575777, |
|
"grad_norm": 0.169921875, |
|
"learning_rate": 5.676261945242423e-06, |
|
"loss": 0.9143, |
|
"step": 7375 |
|
}, |
|
{ |
|
"epoch": 0.43383947939262474, |
|
"grad_norm": 0.2275390625, |
|
"learning_rate": 5.6616052060737535e-06, |
|
"loss": 0.8556, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 0.4353051533094917, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 5.646948466905083e-06, |
|
"loss": 0.8668, |
|
"step": 7425 |
|
}, |
|
{ |
|
"epoch": 0.4367708272263587, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 5.632291727736414e-06, |
|
"loss": 0.87, |
|
"step": 7450 |
|
}, |
|
{ |
|
"epoch": 0.43823650114322565, |
|
"grad_norm": 0.173828125, |
|
"learning_rate": 5.617634988567744e-06, |
|
"loss": 0.8162, |
|
"step": 7475 |
|
}, |
|
{ |
|
"epoch": 0.4397021750600926, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 5.602978249399073e-06, |
|
"loss": 0.8705, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.4411678489769596, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 5.588321510230405e-06, |
|
"loss": 0.8221, |
|
"step": 7525 |
|
}, |
|
{ |
|
"epoch": 0.44263352289382657, |
|
"grad_norm": 0.236328125, |
|
"learning_rate": 5.573664771061734e-06, |
|
"loss": 0.9005, |
|
"step": 7550 |
|
}, |
|
{ |
|
"epoch": 0.44409919681069354, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 5.5590080318930654e-06, |
|
"loss": 0.8577, |
|
"step": 7575 |
|
}, |
|
{ |
|
"epoch": 0.4455648707275605, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 5.544351292724395e-06, |
|
"loss": 0.9489, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 0.4470305446444275, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 5.529694553555725e-06, |
|
"loss": 0.78, |
|
"step": 7625 |
|
}, |
|
{ |
|
"epoch": 0.44849621856129446, |
|
"grad_norm": 0.390625, |
|
"learning_rate": 5.515037814387055e-06, |
|
"loss": 0.7977, |
|
"step": 7650 |
|
}, |
|
{ |
|
"epoch": 0.4499618924781615, |
|
"grad_norm": 0.203125, |
|
"learning_rate": 5.500381075218386e-06, |
|
"loss": 0.8348, |
|
"step": 7675 |
|
}, |
|
{ |
|
"epoch": 0.45142756639502846, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 5.485724336049716e-06, |
|
"loss": 0.9294, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 0.45289324031189543, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 5.471067596881047e-06, |
|
"loss": 0.905, |
|
"step": 7725 |
|
}, |
|
{ |
|
"epoch": 0.4543589142287624, |
|
"grad_norm": 0.2490234375, |
|
"learning_rate": 5.456410857712377e-06, |
|
"loss": 0.865, |
|
"step": 7750 |
|
}, |
|
{ |
|
"epoch": 0.4558245881456294, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 5.441754118543707e-06, |
|
"loss": 0.8706, |
|
"step": 7775 |
|
}, |
|
{ |
|
"epoch": 0.45729026206249634, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 5.427097379375037e-06, |
|
"loss": 1.0095, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 0.4587559359793633, |
|
"grad_norm": 0.177734375, |
|
"learning_rate": 5.412440640206368e-06, |
|
"loss": 0.801, |
|
"step": 7825 |
|
}, |
|
{ |
|
"epoch": 0.4602216098962303, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 5.397783901037697e-06, |
|
"loss": 0.8175, |
|
"step": 7850 |
|
}, |
|
{ |
|
"epoch": 0.46168728381309726, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 5.383127161869027e-06, |
|
"loss": 0.9689, |
|
"step": 7875 |
|
}, |
|
{ |
|
"epoch": 0.46315295772996423, |
|
"grad_norm": 0.2373046875, |
|
"learning_rate": 5.368470422700358e-06, |
|
"loss": 0.8495, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 0.4646186316468312, |
|
"grad_norm": 0.1796875, |
|
"learning_rate": 5.353813683531688e-06, |
|
"loss": 0.9133, |
|
"step": 7925 |
|
}, |
|
{ |
|
"epoch": 0.4660843055636982, |
|
"grad_norm": 0.2021484375, |
|
"learning_rate": 5.339156944363019e-06, |
|
"loss": 0.8828, |
|
"step": 7950 |
|
}, |
|
{ |
|
"epoch": 0.46754997948056515, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 5.3245002051943485e-06, |
|
"loss": 1.0062, |
|
"step": 7975 |
|
}, |
|
{ |
|
"epoch": 0.4690156533974321, |
|
"grad_norm": 0.240234375, |
|
"learning_rate": 5.309843466025679e-06, |
|
"loss": 0.8171, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.4704813273142991, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 5.295186726857009e-06, |
|
"loss": 0.8704, |
|
"step": 8025 |
|
}, |
|
{ |
|
"epoch": 0.47194700123116606, |
|
"grad_norm": 0.4140625, |
|
"learning_rate": 5.28052998768834e-06, |
|
"loss": 0.866, |
|
"step": 8050 |
|
}, |
|
{ |
|
"epoch": 0.4734126751480331, |
|
"grad_norm": 0.15625, |
|
"learning_rate": 5.265873248519669e-06, |
|
"loss": 0.8298, |
|
"step": 8075 |
|
}, |
|
{ |
|
"epoch": 0.47487834906490006, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 5.251216509351001e-06, |
|
"loss": 0.8654, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 0.47634402298176703, |
|
"grad_norm": 0.205078125, |
|
"learning_rate": 5.23655977018233e-06, |
|
"loss": 0.9916, |
|
"step": 8125 |
|
}, |
|
{ |
|
"epoch": 0.477809696898634, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 5.2219030310136605e-06, |
|
"loss": 0.9645, |
|
"step": 8150 |
|
}, |
|
{ |
|
"epoch": 0.479275370815501, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 5.207246291844991e-06, |
|
"loss": 0.8867, |
|
"step": 8175 |
|
}, |
|
{ |
|
"epoch": 0.48074104473236795, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 5.192589552676321e-06, |
|
"loss": 0.9363, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 0.4822067186492349, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 5.177932813507651e-06, |
|
"loss": 0.7848, |
|
"step": 8225 |
|
}, |
|
{ |
|
"epoch": 0.4836723925661019, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 5.163276074338982e-06, |
|
"loss": 0.8899, |
|
"step": 8250 |
|
}, |
|
{ |
|
"epoch": 0.48513806648296887, |
|
"grad_norm": 0.1748046875, |
|
"learning_rate": 5.148619335170312e-06, |
|
"loss": 0.8092, |
|
"step": 8275 |
|
}, |
|
{ |
|
"epoch": 0.48660374039983584, |
|
"grad_norm": 0.41796875, |
|
"learning_rate": 5.133962596001641e-06, |
|
"loss": 0.899, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 0.4880694143167028, |
|
"grad_norm": 0.169921875, |
|
"learning_rate": 5.1193058568329725e-06, |
|
"loss": 0.9528, |
|
"step": 8325 |
|
}, |
|
{ |
|
"epoch": 0.4895350882335698, |
|
"grad_norm": 0.1474609375, |
|
"learning_rate": 5.104649117664302e-06, |
|
"loss": 0.8668, |
|
"step": 8350 |
|
}, |
|
{ |
|
"epoch": 0.49100076215043675, |
|
"grad_norm": 0.3828125, |
|
"learning_rate": 5.0899923784956325e-06, |
|
"loss": 0.9176, |
|
"step": 8375 |
|
}, |
|
{ |
|
"epoch": 0.4924664360673037, |
|
"grad_norm": 0.44140625, |
|
"learning_rate": 5.075335639326963e-06, |
|
"loss": 0.9135, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 0.4939321099841707, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 5.060678900158293e-06, |
|
"loss": 0.8201, |
|
"step": 8425 |
|
}, |
|
{ |
|
"epoch": 0.4953977839010377, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 5.046022160989623e-06, |
|
"loss": 0.8947, |
|
"step": 8450 |
|
}, |
|
{ |
|
"epoch": 0.4968634578179047, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 5.031365421820954e-06, |
|
"loss": 0.8838, |
|
"step": 8475 |
|
}, |
|
{ |
|
"epoch": 0.49832913173477167, |
|
"grad_norm": 0.1962890625, |
|
"learning_rate": 5.016708682652284e-06, |
|
"loss": 0.8648, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.49979480565163864, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 5.002051943483615e-06, |
|
"loss": 0.9247, |
|
"step": 8525 |
|
}, |
|
{ |
|
"epoch": 0.5012604795685056, |
|
"grad_norm": 0.2333984375, |
|
"learning_rate": 4.9873952043149445e-06, |
|
"loss": 0.8839, |
|
"step": 8550 |
|
}, |
|
{ |
|
"epoch": 0.5027261534853725, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 4.972738465146275e-06, |
|
"loss": 0.9194, |
|
"step": 8575 |
|
}, |
|
{ |
|
"epoch": 0.5041918274022396, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 4.9580817259776045e-06, |
|
"loss": 0.8232, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 0.5056575013191065, |
|
"grad_norm": 0.37890625, |
|
"learning_rate": 4.943424986808935e-06, |
|
"loss": 1.0866, |
|
"step": 8625 |
|
}, |
|
{ |
|
"epoch": 0.5071231752359735, |
|
"grad_norm": 0.23046875, |
|
"learning_rate": 4.928768247640265e-06, |
|
"loss": 0.9213, |
|
"step": 8650 |
|
}, |
|
{ |
|
"epoch": 0.5085888491528405, |
|
"grad_norm": 0.2236328125, |
|
"learning_rate": 4.914111508471596e-06, |
|
"loss": 0.993, |
|
"step": 8675 |
|
}, |
|
{ |
|
"epoch": 0.5100545230697074, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 4.899454769302926e-06, |
|
"loss": 0.9335, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 0.5115201969865745, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 4.8847980301342565e-06, |
|
"loss": 0.9327, |
|
"step": 8725 |
|
}, |
|
{ |
|
"epoch": 0.5129858709034414, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 4.870141290965587e-06, |
|
"loss": 0.8948, |
|
"step": 8750 |
|
}, |
|
{ |
|
"epoch": 0.5144515448203084, |
|
"grad_norm": 0.1591796875, |
|
"learning_rate": 4.8554845517969164e-06, |
|
"loss": 0.9082, |
|
"step": 8775 |
|
}, |
|
{ |
|
"epoch": 0.5159172187371753, |
|
"grad_norm": 0.2138671875, |
|
"learning_rate": 4.840827812628247e-06, |
|
"loss": 0.894, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 0.5173828926540424, |
|
"grad_norm": 0.21484375, |
|
"learning_rate": 4.826171073459577e-06, |
|
"loss": 0.9065, |
|
"step": 8825 |
|
}, |
|
{ |
|
"epoch": 0.5188485665709093, |
|
"grad_norm": 0.1669921875, |
|
"learning_rate": 4.811514334290907e-06, |
|
"loss": 0.8453, |
|
"step": 8850 |
|
}, |
|
{ |
|
"epoch": 0.5203142404877763, |
|
"grad_norm": 0.21484375, |
|
"learning_rate": 4.796857595122237e-06, |
|
"loss": 1.0002, |
|
"step": 8875 |
|
}, |
|
{ |
|
"epoch": 0.5217799144046432, |
|
"grad_norm": 0.9921875, |
|
"learning_rate": 4.782200855953568e-06, |
|
"loss": 0.9558, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 0.5232455883215102, |
|
"grad_norm": 0.1884765625, |
|
"learning_rate": 4.767544116784898e-06, |
|
"loss": 0.7789, |
|
"step": 8925 |
|
}, |
|
{ |
|
"epoch": 0.5247112622383772, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 4.7528873776162284e-06, |
|
"loss": 0.8602, |
|
"step": 8950 |
|
}, |
|
{ |
|
"epoch": 0.5261769361552442, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 4.738230638447559e-06, |
|
"loss": 0.849, |
|
"step": 8975 |
|
}, |
|
{ |
|
"epoch": 0.5276426100721111, |
|
"grad_norm": 0.1591796875, |
|
"learning_rate": 4.723573899278888e-06, |
|
"loss": 0.8461, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.5291082839889781, |
|
"grad_norm": 0.47265625, |
|
"learning_rate": 4.708917160110219e-06, |
|
"loss": 0.8729, |
|
"step": 9025 |
|
}, |
|
{ |
|
"epoch": 0.5305739579058452, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 4.694260420941549e-06, |
|
"loss": 1.0451, |
|
"step": 9050 |
|
}, |
|
{ |
|
"epoch": 0.5320396318227121, |
|
"grad_norm": 0.1943359375, |
|
"learning_rate": 4.67960368177288e-06, |
|
"loss": 0.817, |
|
"step": 9075 |
|
}, |
|
{ |
|
"epoch": 0.5335053057395791, |
|
"grad_norm": 0.2119140625, |
|
"learning_rate": 4.66494694260421e-06, |
|
"loss": 0.9171, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 0.534970979656446, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 4.6502902034355404e-06, |
|
"loss": 0.7783, |
|
"step": 9125 |
|
}, |
|
{ |
|
"epoch": 0.536436653573313, |
|
"grad_norm": 0.44921875, |
|
"learning_rate": 4.635633464266871e-06, |
|
"loss": 0.8943, |
|
"step": 9150 |
|
}, |
|
{ |
|
"epoch": 0.53790232749018, |
|
"grad_norm": 0.2490234375, |
|
"learning_rate": 4.6209767250982e-06, |
|
"loss": 0.9554, |
|
"step": 9175 |
|
}, |
|
{ |
|
"epoch": 0.539368001407047, |
|
"grad_norm": 0.162109375, |
|
"learning_rate": 4.606319985929531e-06, |
|
"loss": 0.7875, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 0.5408336753239139, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 4.591663246760861e-06, |
|
"loss": 0.9086, |
|
"step": 9225 |
|
}, |
|
{ |
|
"epoch": 0.5422993492407809, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 4.577006507592191e-06, |
|
"loss": 0.9511, |
|
"step": 9250 |
|
}, |
|
{ |
|
"epoch": 0.5437650231576479, |
|
"grad_norm": 0.2353515625, |
|
"learning_rate": 4.562349768423521e-06, |
|
"loss": 0.8182, |
|
"step": 9275 |
|
}, |
|
{ |
|
"epoch": 0.5452306970745149, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 4.547693029254852e-06, |
|
"loss": 0.8487, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 0.5466963709913818, |
|
"grad_norm": 0.1845703125, |
|
"learning_rate": 4.533036290086182e-06, |
|
"loss": 0.8708, |
|
"step": 9325 |
|
}, |
|
{ |
|
"epoch": 0.5481620449082488, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 4.518379550917512e-06, |
|
"loss": 0.8577, |
|
"step": 9350 |
|
}, |
|
{ |
|
"epoch": 0.5496277188251157, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 4.503722811748843e-06, |
|
"loss": 0.8896, |
|
"step": 9375 |
|
}, |
|
{ |
|
"epoch": 0.5510933927419828, |
|
"grad_norm": 0.2314453125, |
|
"learning_rate": 4.489066072580172e-06, |
|
"loss": 0.8778, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 0.5525590666588497, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 4.474409333411503e-06, |
|
"loss": 0.847, |
|
"step": 9425 |
|
}, |
|
{ |
|
"epoch": 0.5540247405757167, |
|
"grad_norm": 0.384765625, |
|
"learning_rate": 4.459752594242833e-06, |
|
"loss": 0.8678, |
|
"step": 9450 |
|
}, |
|
{ |
|
"epoch": 0.5554904144925837, |
|
"grad_norm": 0.228515625, |
|
"learning_rate": 4.4450958550741636e-06, |
|
"loss": 0.7334, |
|
"step": 9475 |
|
}, |
|
{ |
|
"epoch": 0.5569560884094507, |
|
"grad_norm": 0.2080078125, |
|
"learning_rate": 4.430439115905494e-06, |
|
"loss": 1.1603, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.5584217623263177, |
|
"grad_norm": 0.19921875, |
|
"learning_rate": 4.415782376736824e-06, |
|
"loss": 0.8804, |
|
"step": 9525 |
|
}, |
|
{ |
|
"epoch": 0.5598874362431846, |
|
"grad_norm": 0.431640625, |
|
"learning_rate": 4.401125637568155e-06, |
|
"loss": 0.9023, |
|
"step": 9550 |
|
}, |
|
{ |
|
"epoch": 0.5613531101600516, |
|
"grad_norm": 0.2353515625, |
|
"learning_rate": 4.386468898399484e-06, |
|
"loss": 0.8169, |
|
"step": 9575 |
|
}, |
|
{ |
|
"epoch": 0.5628187840769185, |
|
"grad_norm": 0.119140625, |
|
"learning_rate": 4.371812159230815e-06, |
|
"loss": 0.919, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 0.5642844579937856, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 4.357155420062145e-06, |
|
"loss": 0.8916, |
|
"step": 9625 |
|
}, |
|
{ |
|
"epoch": 0.5657501319106525, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 4.342498680893475e-06, |
|
"loss": 0.832, |
|
"step": 9650 |
|
}, |
|
{ |
|
"epoch": 0.5672158058275195, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 4.327841941724805e-06, |
|
"loss": 0.824, |
|
"step": 9675 |
|
}, |
|
{ |
|
"epoch": 0.5686814797443864, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 4.3131852025561355e-06, |
|
"loss": 0.8998, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 0.5701471536612535, |
|
"grad_norm": 0.44140625, |
|
"learning_rate": 4.298528463387466e-06, |
|
"loss": 0.8192, |
|
"step": 9725 |
|
}, |
|
{ |
|
"epoch": 0.5716128275781204, |
|
"grad_norm": 0.169921875, |
|
"learning_rate": 4.283871724218796e-06, |
|
"loss": 0.825, |
|
"step": 9750 |
|
}, |
|
{ |
|
"epoch": 0.5730785014949874, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 4.269214985050127e-06, |
|
"loss": 0.9014, |
|
"step": 9775 |
|
}, |
|
{ |
|
"epoch": 0.5745441754118543, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 4.254558245881456e-06, |
|
"loss": 1.1582, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 0.5760098493287213, |
|
"grad_norm": 0.2138671875, |
|
"learning_rate": 4.239901506712787e-06, |
|
"loss": 0.9225, |
|
"step": 9825 |
|
}, |
|
{ |
|
"epoch": 0.5774755232455884, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 4.225244767544117e-06, |
|
"loss": 0.8293, |
|
"step": 9850 |
|
}, |
|
{ |
|
"epoch": 0.5789411971624553, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 4.2105880283754475e-06, |
|
"loss": 0.9205, |
|
"step": 9875 |
|
}, |
|
{ |
|
"epoch": 0.5804068710793223, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 4.195931289206778e-06, |
|
"loss": 0.9015, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 0.5818725449961892, |
|
"grad_norm": 0.205078125, |
|
"learning_rate": 4.181274550038108e-06, |
|
"loss": 0.7924, |
|
"step": 9925 |
|
}, |
|
{ |
|
"epoch": 0.5833382189130563, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 4.166617810869438e-06, |
|
"loss": 1.0072, |
|
"step": 9950 |
|
}, |
|
{ |
|
"epoch": 0.5848038928299232, |
|
"grad_norm": 0.2353515625, |
|
"learning_rate": 4.151961071700768e-06, |
|
"loss": 0.9099, |
|
"step": 9975 |
|
}, |
|
{ |
|
"epoch": 0.5862695667467902, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 4.137304332532099e-06, |
|
"loss": 1.1021, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.5877352406636571, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 4.122647593363428e-06, |
|
"loss": 0.8166, |
|
"step": 10025 |
|
}, |
|
{ |
|
"epoch": 0.5892009145805241, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 4.107990854194759e-06, |
|
"loss": 0.9566, |
|
"step": 10050 |
|
}, |
|
{ |
|
"epoch": 0.5906665884973911, |
|
"grad_norm": 0.2119140625, |
|
"learning_rate": 4.093334115026089e-06, |
|
"loss": 0.8494, |
|
"step": 10075 |
|
}, |
|
{ |
|
"epoch": 0.5921322624142581, |
|
"grad_norm": 0.2451171875, |
|
"learning_rate": 4.0786773758574195e-06, |
|
"loss": 0.9279, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 0.593597936331125, |
|
"grad_norm": 0.2275390625, |
|
"learning_rate": 4.06402063668875e-06, |
|
"loss": 0.8628, |
|
"step": 10125 |
|
}, |
|
{ |
|
"epoch": 0.595063610247992, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 4.04936389752008e-06, |
|
"loss": 0.9406, |
|
"step": 10150 |
|
}, |
|
{ |
|
"epoch": 0.596529284164859, |
|
"grad_norm": 0.212890625, |
|
"learning_rate": 4.03470715835141e-06, |
|
"loss": 0.896, |
|
"step": 10175 |
|
}, |
|
{ |
|
"epoch": 0.597994958081726, |
|
"grad_norm": 0.421875, |
|
"learning_rate": 4.02005041918274e-06, |
|
"loss": 0.8608, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 0.599460631998593, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 4.005393680014071e-06, |
|
"loss": 0.9103, |
|
"step": 10225 |
|
}, |
|
{ |
|
"epoch": 0.6009263059154599, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 3.990736940845401e-06, |
|
"loss": 0.8136, |
|
"step": 10250 |
|
}, |
|
{ |
|
"epoch": 0.602391979832327, |
|
"grad_norm": 0.19921875, |
|
"learning_rate": 3.9760802016767315e-06, |
|
"loss": 0.8374, |
|
"step": 10275 |
|
}, |
|
{ |
|
"epoch": 0.6038576537491939, |
|
"grad_norm": 0.1806640625, |
|
"learning_rate": 3.961423462508062e-06, |
|
"loss": 0.8049, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 0.6053233276660609, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 3.946766723339392e-06, |
|
"loss": 0.7943, |
|
"step": 10325 |
|
}, |
|
{ |
|
"epoch": 0.6067890015829278, |
|
"grad_norm": 1.0, |
|
"learning_rate": 3.932109984170722e-06, |
|
"loss": 0.8691, |
|
"step": 10350 |
|
}, |
|
{ |
|
"epoch": 0.6082546754997948, |
|
"grad_norm": 0.146484375, |
|
"learning_rate": 3.917453245002052e-06, |
|
"loss": 0.8565, |
|
"step": 10375 |
|
}, |
|
{ |
|
"epoch": 0.6097203494166618, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 3.902796505833383e-06, |
|
"loss": 0.9866, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 0.6111860233335288, |
|
"grad_norm": 0.236328125, |
|
"learning_rate": 3.888139766664712e-06, |
|
"loss": 0.9026, |
|
"step": 10425 |
|
}, |
|
{ |
|
"epoch": 0.6126516972503957, |
|
"grad_norm": 0.416015625, |
|
"learning_rate": 3.873483027496043e-06, |
|
"loss": 0.9329, |
|
"step": 10450 |
|
}, |
|
{ |
|
"epoch": 0.6141173711672627, |
|
"grad_norm": 0.458984375, |
|
"learning_rate": 3.858826288327373e-06, |
|
"loss": 0.8534, |
|
"step": 10475 |
|
}, |
|
{ |
|
"epoch": 0.6155830450841296, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 3.8441695491587034e-06, |
|
"loss": 0.824, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.6170487190009967, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 3.829512809990034e-06, |
|
"loss": 0.8444, |
|
"step": 10525 |
|
}, |
|
{ |
|
"epoch": 0.6185143929178636, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 3.814856070821364e-06, |
|
"loss": 0.8843, |
|
"step": 10550 |
|
}, |
|
{ |
|
"epoch": 0.6199800668347306, |
|
"grad_norm": 0.1748046875, |
|
"learning_rate": 3.8001993316526942e-06, |
|
"loss": 0.8791, |
|
"step": 10575 |
|
}, |
|
{ |
|
"epoch": 0.6214457407515975, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 3.7855425924840246e-06, |
|
"loss": 0.8695, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 0.6229114146684646, |
|
"grad_norm": 0.1748046875, |
|
"learning_rate": 3.7708858533153546e-06, |
|
"loss": 0.8917, |
|
"step": 10625 |
|
}, |
|
{ |
|
"epoch": 0.6243770885853316, |
|
"grad_norm": 0.4921875, |
|
"learning_rate": 3.756229114146685e-06, |
|
"loss": 0.7642, |
|
"step": 10650 |
|
}, |
|
{ |
|
"epoch": 0.6258427625021985, |
|
"grad_norm": 0.2421875, |
|
"learning_rate": 3.7415723749780154e-06, |
|
"loss": 0.8826, |
|
"step": 10675 |
|
}, |
|
{ |
|
"epoch": 0.6273084364190655, |
|
"grad_norm": 0.197265625, |
|
"learning_rate": 3.7269156358093454e-06, |
|
"loss": 0.8513, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 0.6287741103359324, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 3.712258896640676e-06, |
|
"loss": 1.0076, |
|
"step": 10725 |
|
}, |
|
{ |
|
"epoch": 0.6302397842527995, |
|
"grad_norm": 0.220703125, |
|
"learning_rate": 3.697602157472006e-06, |
|
"loss": 0.8629, |
|
"step": 10750 |
|
}, |
|
{ |
|
"epoch": 0.6317054581696664, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 3.6829454183033366e-06, |
|
"loss": 0.7775, |
|
"step": 10775 |
|
}, |
|
{ |
|
"epoch": 0.6331711320865334, |
|
"grad_norm": 0.2265625, |
|
"learning_rate": 3.6682886791346666e-06, |
|
"loss": 0.9906, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 0.6346368060034003, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 3.6536319399659966e-06, |
|
"loss": 0.8601, |
|
"step": 10825 |
|
}, |
|
{ |
|
"epoch": 0.6361024799202674, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 3.6389752007973266e-06, |
|
"loss": 0.848, |
|
"step": 10850 |
|
}, |
|
{ |
|
"epoch": 0.6375681538371343, |
|
"grad_norm": 0.2490234375, |
|
"learning_rate": 3.624318461628657e-06, |
|
"loss": 0.9411, |
|
"step": 10875 |
|
}, |
|
{ |
|
"epoch": 0.6390338277540013, |
|
"grad_norm": 0.2431640625, |
|
"learning_rate": 3.6096617224599874e-06, |
|
"loss": 0.9371, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 0.6404995016708682, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 3.5950049832913174e-06, |
|
"loss": 0.8688, |
|
"step": 10925 |
|
}, |
|
{ |
|
"epoch": 0.6419651755877352, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 3.5803482441226478e-06, |
|
"loss": 0.9022, |
|
"step": 10950 |
|
}, |
|
{ |
|
"epoch": 0.6434308495046022, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 3.565691504953978e-06, |
|
"loss": 0.8537, |
|
"step": 10975 |
|
}, |
|
{ |
|
"epoch": 0.6448965234214692, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 3.5510347657853086e-06, |
|
"loss": 0.8228, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.6463621973383362, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 3.5363780266166386e-06, |
|
"loss": 0.8286, |
|
"step": 11025 |
|
}, |
|
{ |
|
"epoch": 0.6478278712552031, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 3.521721287447969e-06, |
|
"loss": 0.8312, |
|
"step": 11050 |
|
}, |
|
{ |
|
"epoch": 0.6492935451720702, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 3.5070645482792994e-06, |
|
"loss": 0.8555, |
|
"step": 11075 |
|
}, |
|
{ |
|
"epoch": 0.6507592190889371, |
|
"grad_norm": 0.37890625, |
|
"learning_rate": 3.4924078091106293e-06, |
|
"loss": 0.8602, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 0.6522248930058041, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 3.4777510699419598e-06, |
|
"loss": 0.875, |
|
"step": 11125 |
|
}, |
|
{ |
|
"epoch": 0.653690566922671, |
|
"grad_norm": 3.171875, |
|
"learning_rate": 3.46309433077329e-06, |
|
"loss": 1.0157, |
|
"step": 11150 |
|
}, |
|
{ |
|
"epoch": 0.655156240839538, |
|
"grad_norm": 0.2197265625, |
|
"learning_rate": 3.44843759160462e-06, |
|
"loss": 0.8448, |
|
"step": 11175 |
|
}, |
|
{ |
|
"epoch": 0.656621914756405, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 3.4337808524359505e-06, |
|
"loss": 0.8291, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 0.658087588673272, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 3.4191241132672805e-06, |
|
"loss": 0.9121, |
|
"step": 11225 |
|
}, |
|
{ |
|
"epoch": 0.6595532625901389, |
|
"grad_norm": 0.1962890625, |
|
"learning_rate": 3.4044673740986105e-06, |
|
"loss": 0.9451, |
|
"step": 11250 |
|
}, |
|
{ |
|
"epoch": 0.6610189365070059, |
|
"grad_norm": 0.458984375, |
|
"learning_rate": 3.389810634929941e-06, |
|
"loss": 0.8054, |
|
"step": 11275 |
|
}, |
|
{ |
|
"epoch": 0.6624846104238729, |
|
"grad_norm": 0.2216796875, |
|
"learning_rate": 3.3751538957612713e-06, |
|
"loss": 0.7991, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 0.6639502843407399, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 3.3604971565926013e-06, |
|
"loss": 0.9013, |
|
"step": 11325 |
|
}, |
|
{ |
|
"epoch": 0.6654159582576068, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 3.3458404174239317e-06, |
|
"loss": 1.0514, |
|
"step": 11350 |
|
}, |
|
{ |
|
"epoch": 0.6668816321744738, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 3.331183678255262e-06, |
|
"loss": 0.8199, |
|
"step": 11375 |
|
}, |
|
{ |
|
"epoch": 0.6683473060913409, |
|
"grad_norm": 0.416015625, |
|
"learning_rate": 3.316526939086592e-06, |
|
"loss": 0.8791, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 0.6698129800082078, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 3.3018701999179225e-06, |
|
"loss": 0.9326, |
|
"step": 11425 |
|
}, |
|
{ |
|
"epoch": 0.6712786539250748, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 3.287213460749253e-06, |
|
"loss": 0.8829, |
|
"step": 11450 |
|
}, |
|
{ |
|
"epoch": 0.6727443278419417, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 3.2725567215805833e-06, |
|
"loss": 0.8332, |
|
"step": 11475 |
|
}, |
|
{ |
|
"epoch": 0.6742100017588087, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 3.2578999824119133e-06, |
|
"loss": 0.9214, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.6756756756756757, |
|
"grad_norm": 0.203125, |
|
"learning_rate": 3.2432432432432437e-06, |
|
"loss": 0.814, |
|
"step": 11525 |
|
}, |
|
{ |
|
"epoch": 0.6771413495925427, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 3.228586504074574e-06, |
|
"loss": 0.8175, |
|
"step": 11550 |
|
}, |
|
{ |
|
"epoch": 0.6786070235094096, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 3.213929764905904e-06, |
|
"loss": 0.8749, |
|
"step": 11575 |
|
}, |
|
{ |
|
"epoch": 0.6800726974262766, |
|
"grad_norm": 0.2177734375, |
|
"learning_rate": 3.199273025737234e-06, |
|
"loss": 0.7743, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 0.6815383713431435, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 3.184616286568564e-06, |
|
"loss": 0.8811, |
|
"step": 11625 |
|
}, |
|
{ |
|
"epoch": 0.6830040452600106, |
|
"grad_norm": 0.173828125, |
|
"learning_rate": 3.1699595473998945e-06, |
|
"loss": 0.8967, |
|
"step": 11650 |
|
}, |
|
{ |
|
"epoch": 0.6844697191768775, |
|
"grad_norm": 0.18359375, |
|
"learning_rate": 3.155302808231225e-06, |
|
"loss": 0.9144, |
|
"step": 11675 |
|
}, |
|
{ |
|
"epoch": 0.6859353930937445, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 3.1406460690625553e-06, |
|
"loss": 0.8852, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 0.6874010670106114, |
|
"grad_norm": 0.248046875, |
|
"learning_rate": 3.1259893298938853e-06, |
|
"loss": 0.875, |
|
"step": 11725 |
|
}, |
|
{ |
|
"epoch": 0.6888667409274785, |
|
"grad_norm": 0.189453125, |
|
"learning_rate": 3.1113325907252157e-06, |
|
"loss": 0.9148, |
|
"step": 11750 |
|
}, |
|
{ |
|
"epoch": 0.6903324148443454, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 3.096675851556546e-06, |
|
"loss": 0.8413, |
|
"step": 11775 |
|
}, |
|
{ |
|
"epoch": 0.6917980887612124, |
|
"grad_norm": 0.1787109375, |
|
"learning_rate": 3.082019112387876e-06, |
|
"loss": 0.9005, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 0.6932637626780794, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 3.0673623732192065e-06, |
|
"loss": 0.8366, |
|
"step": 11825 |
|
}, |
|
{ |
|
"epoch": 0.6947294365949463, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 3.052705634050537e-06, |
|
"loss": 0.9477, |
|
"step": 11850 |
|
}, |
|
{ |
|
"epoch": 0.6961951105118134, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 3.038048894881867e-06, |
|
"loss": 0.9308, |
|
"step": 11875 |
|
}, |
|
{ |
|
"epoch": 0.6976607844286803, |
|
"grad_norm": 0.119140625, |
|
"learning_rate": 3.0233921557131972e-06, |
|
"loss": 0.7818, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 0.6991264583455473, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 3.0087354165445277e-06, |
|
"loss": 0.9406, |
|
"step": 11925 |
|
}, |
|
{ |
|
"epoch": 0.7005921322624142, |
|
"grad_norm": 0.244140625, |
|
"learning_rate": 2.994078677375858e-06, |
|
"loss": 1.0378, |
|
"step": 11950 |
|
}, |
|
{ |
|
"epoch": 0.7020578061792813, |
|
"grad_norm": 5.25, |
|
"learning_rate": 2.979421938207188e-06, |
|
"loss": 0.8277, |
|
"step": 11975 |
|
}, |
|
{ |
|
"epoch": 0.7035234800961482, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 2.964765199038518e-06, |
|
"loss": 0.8431, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.7049891540130152, |
|
"grad_norm": 0.1982421875, |
|
"learning_rate": 2.950108459869848e-06, |
|
"loss": 0.8696, |
|
"step": 12025 |
|
}, |
|
{ |
|
"epoch": 0.7064548279298821, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 2.9354517207011784e-06, |
|
"loss": 0.8615, |
|
"step": 12050 |
|
}, |
|
{ |
|
"epoch": 0.7079205018467492, |
|
"grad_norm": 0.416015625, |
|
"learning_rate": 2.920794981532509e-06, |
|
"loss": 0.9367, |
|
"step": 12075 |
|
}, |
|
{ |
|
"epoch": 0.7093861757636161, |
|
"grad_norm": 0.2294921875, |
|
"learning_rate": 2.9061382423638392e-06, |
|
"loss": 0.9749, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 0.7108518496804831, |
|
"grad_norm": 0.375, |
|
"learning_rate": 2.891481503195169e-06, |
|
"loss": 0.8747, |
|
"step": 12125 |
|
}, |
|
{ |
|
"epoch": 0.71231752359735, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 2.8768247640264996e-06, |
|
"loss": 0.8151, |
|
"step": 12150 |
|
}, |
|
{ |
|
"epoch": 0.713783197514217, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 2.86216802485783e-06, |
|
"loss": 0.8211, |
|
"step": 12175 |
|
}, |
|
{ |
|
"epoch": 0.7152488714310841, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 2.84751128568916e-06, |
|
"loss": 0.8662, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 0.716714545347951, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 2.8328545465204904e-06, |
|
"loss": 0.909, |
|
"step": 12225 |
|
}, |
|
{ |
|
"epoch": 0.718180219264818, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 2.818197807351821e-06, |
|
"loss": 0.9167, |
|
"step": 12250 |
|
}, |
|
{ |
|
"epoch": 0.7196458931816849, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 2.803541068183151e-06, |
|
"loss": 0.8634, |
|
"step": 12275 |
|
}, |
|
{ |
|
"epoch": 0.721111567098552, |
|
"grad_norm": 0.16015625, |
|
"learning_rate": 2.788884329014481e-06, |
|
"loss": 1.0852, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 0.7225772410154189, |
|
"grad_norm": 0.2216796875, |
|
"learning_rate": 2.7742275898458116e-06, |
|
"loss": 0.9097, |
|
"step": 12325 |
|
}, |
|
{ |
|
"epoch": 0.7240429149322859, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 2.759570850677142e-06, |
|
"loss": 0.8853, |
|
"step": 12350 |
|
}, |
|
{ |
|
"epoch": 0.7255085888491528, |
|
"grad_norm": 0.1513671875, |
|
"learning_rate": 2.744914111508472e-06, |
|
"loss": 0.9239, |
|
"step": 12375 |
|
}, |
|
{ |
|
"epoch": 0.7269742627660198, |
|
"grad_norm": 0.1708984375, |
|
"learning_rate": 2.730257372339802e-06, |
|
"loss": 1.0119, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 0.7284399366828868, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 2.715600633171132e-06, |
|
"loss": 0.9003, |
|
"step": 12425 |
|
}, |
|
{ |
|
"epoch": 0.7299056105997538, |
|
"grad_norm": 0.2333984375, |
|
"learning_rate": 2.7009438940024624e-06, |
|
"loss": 0.8912, |
|
"step": 12450 |
|
}, |
|
{ |
|
"epoch": 0.7313712845166207, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 2.6862871548337928e-06, |
|
"loss": 0.9916, |
|
"step": 12475 |
|
}, |
|
{ |
|
"epoch": 0.7328369584334877, |
|
"grad_norm": 0.208984375, |
|
"learning_rate": 2.6716304156651227e-06, |
|
"loss": 1.1092, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 0.7343026323503546, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 2.656973676496453e-06, |
|
"loss": 1.1346, |
|
"step": 12525 |
|
}, |
|
{ |
|
"epoch": 0.7357683062672217, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 2.6423169373277836e-06, |
|
"loss": 0.9109, |
|
"step": 12550 |
|
}, |
|
{ |
|
"epoch": 0.7372339801840886, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 2.627660198159114e-06, |
|
"loss": 0.8605, |
|
"step": 12575 |
|
}, |
|
{ |
|
"epoch": 0.7386996541009556, |
|
"grad_norm": 4.1875, |
|
"learning_rate": 2.613003458990444e-06, |
|
"loss": 0.8093, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 0.7401653280178226, |
|
"grad_norm": 0.1875, |
|
"learning_rate": 2.5983467198217744e-06, |
|
"loss": 0.9045, |
|
"step": 12625 |
|
}, |
|
{ |
|
"epoch": 0.7416310019346896, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 2.5836899806531048e-06, |
|
"loss": 0.9586, |
|
"step": 12650 |
|
}, |
|
{ |
|
"epoch": 0.7430966758515566, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 2.5690332414844347e-06, |
|
"loss": 0.8817, |
|
"step": 12675 |
|
}, |
|
{ |
|
"epoch": 0.7445623497684235, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 2.554376502315765e-06, |
|
"loss": 1.0672, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 0.7460280236852905, |
|
"grad_norm": 0.37890625, |
|
"learning_rate": 2.5397197631470956e-06, |
|
"loss": 0.7547, |
|
"step": 12725 |
|
}, |
|
{ |
|
"epoch": 0.7474936976021574, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 2.5250630239784255e-06, |
|
"loss": 0.86, |
|
"step": 12750 |
|
}, |
|
{ |
|
"epoch": 0.7489593715190245, |
|
"grad_norm": 0.19921875, |
|
"learning_rate": 2.5104062848097555e-06, |
|
"loss": 0.837, |
|
"step": 12775 |
|
}, |
|
{ |
|
"epoch": 0.7504250454358914, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 2.495749545641086e-06, |
|
"loss": 0.969, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 0.7518907193527584, |
|
"grad_norm": 0.2275390625, |
|
"learning_rate": 2.4810928064724163e-06, |
|
"loss": 0.858, |
|
"step": 12825 |
|
}, |
|
{ |
|
"epoch": 0.7533563932696253, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 2.4664360673037467e-06, |
|
"loss": 0.8171, |
|
"step": 12850 |
|
}, |
|
{ |
|
"epoch": 0.7548220671864924, |
|
"grad_norm": 0.23046875, |
|
"learning_rate": 2.4517793281350767e-06, |
|
"loss": 0.9102, |
|
"step": 12875 |
|
}, |
|
{ |
|
"epoch": 0.7562877411033593, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 2.4371225889664067e-06, |
|
"loss": 0.991, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 0.7577534150202263, |
|
"grad_norm": 0.2333984375, |
|
"learning_rate": 2.422465849797737e-06, |
|
"loss": 0.8257, |
|
"step": 12925 |
|
}, |
|
{ |
|
"epoch": 0.7592190889370932, |
|
"grad_norm": 0.435546875, |
|
"learning_rate": 2.4078091106290675e-06, |
|
"loss": 0.9095, |
|
"step": 12950 |
|
}, |
|
{ |
|
"epoch": 0.7606847628539603, |
|
"grad_norm": 0.1337890625, |
|
"learning_rate": 2.3931523714603975e-06, |
|
"loss": 0.9126, |
|
"step": 12975 |
|
}, |
|
{ |
|
"epoch": 0.7621504367708273, |
|
"grad_norm": 0.2314453125, |
|
"learning_rate": 2.378495632291728e-06, |
|
"loss": 0.8467, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.7636161106876942, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 2.3638388931230583e-06, |
|
"loss": 0.8176, |
|
"step": 13025 |
|
}, |
|
{ |
|
"epoch": 0.7650817846045612, |
|
"grad_norm": 0.2333984375, |
|
"learning_rate": 2.3491821539543887e-06, |
|
"loss": 0.8675, |
|
"step": 13050 |
|
}, |
|
{ |
|
"epoch": 0.7665474585214281, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 2.3345254147857187e-06, |
|
"loss": 0.8793, |
|
"step": 13075 |
|
}, |
|
{ |
|
"epoch": 0.7680131324382952, |
|
"grad_norm": 0.1865234375, |
|
"learning_rate": 2.3198686756170487e-06, |
|
"loss": 0.9777, |
|
"step": 13100 |
|
}, |
|
{ |
|
"epoch": 0.7694788063551621, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 2.305211936448379e-06, |
|
"loss": 0.8507, |
|
"step": 13125 |
|
}, |
|
{ |
|
"epoch": 0.7709444802720291, |
|
"grad_norm": 0.189453125, |
|
"learning_rate": 2.2905551972797095e-06, |
|
"loss": 0.8181, |
|
"step": 13150 |
|
}, |
|
{ |
|
"epoch": 0.772410154188896, |
|
"grad_norm": 0.2451171875, |
|
"learning_rate": 2.2758984581110395e-06, |
|
"loss": 0.8968, |
|
"step": 13175 |
|
}, |
|
{ |
|
"epoch": 0.773875828105763, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 2.26124171894237e-06, |
|
"loss": 0.8177, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 0.77534150202263, |
|
"grad_norm": 0.2216796875, |
|
"learning_rate": 2.2465849797737003e-06, |
|
"loss": 0.8724, |
|
"step": 13225 |
|
}, |
|
{ |
|
"epoch": 0.776807175939497, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 2.2319282406050307e-06, |
|
"loss": 0.965, |
|
"step": 13250 |
|
}, |
|
{ |
|
"epoch": 0.7782728498563639, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 2.2172715014363607e-06, |
|
"loss": 0.8441, |
|
"step": 13275 |
|
}, |
|
{ |
|
"epoch": 0.7797385237732309, |
|
"grad_norm": 0.18359375, |
|
"learning_rate": 2.2026147622676906e-06, |
|
"loss": 0.8694, |
|
"step": 13300 |
|
}, |
|
{ |
|
"epoch": 0.7812041976900979, |
|
"grad_norm": 0.3671875, |
|
"learning_rate": 2.187958023099021e-06, |
|
"loss": 0.8556, |
|
"step": 13325 |
|
}, |
|
{ |
|
"epoch": 0.7826698716069649, |
|
"grad_norm": 0.2197265625, |
|
"learning_rate": 2.1733012839303515e-06, |
|
"loss": 0.8436, |
|
"step": 13350 |
|
}, |
|
{ |
|
"epoch": 0.7841355455238319, |
|
"grad_norm": 0.1748046875, |
|
"learning_rate": 2.1586445447616814e-06, |
|
"loss": 0.8208, |
|
"step": 13375 |
|
}, |
|
{ |
|
"epoch": 0.7856012194406988, |
|
"grad_norm": 0.146484375, |
|
"learning_rate": 2.143987805593012e-06, |
|
"loss": 0.8849, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 0.7870668933575659, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 2.1293310664243422e-06, |
|
"loss": 0.785, |
|
"step": 13425 |
|
}, |
|
{ |
|
"epoch": 0.7885325672744328, |
|
"grad_norm": 0.1630859375, |
|
"learning_rate": 2.1146743272556727e-06, |
|
"loss": 0.8334, |
|
"step": 13450 |
|
}, |
|
{ |
|
"epoch": 0.7899982411912998, |
|
"grad_norm": 0.1455078125, |
|
"learning_rate": 2.1000175880870026e-06, |
|
"loss": 0.8887, |
|
"step": 13475 |
|
}, |
|
{ |
|
"epoch": 0.7914639151081667, |
|
"grad_norm": 4.5, |
|
"learning_rate": 2.0853608489183326e-06, |
|
"loss": 0.9221, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 0.7929295890250337, |
|
"grad_norm": 0.166015625, |
|
"learning_rate": 2.070704109749663e-06, |
|
"loss": 0.9046, |
|
"step": 13525 |
|
}, |
|
{ |
|
"epoch": 0.7943952629419007, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 2.0560473705809934e-06, |
|
"loss": 0.9419, |
|
"step": 13550 |
|
}, |
|
{ |
|
"epoch": 0.7958609368587677, |
|
"grad_norm": 0.17578125, |
|
"learning_rate": 2.0413906314123234e-06, |
|
"loss": 0.9732, |
|
"step": 13575 |
|
}, |
|
{ |
|
"epoch": 0.7973266107756346, |
|
"grad_norm": 0.376953125, |
|
"learning_rate": 2.026733892243654e-06, |
|
"loss": 0.8457, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 0.7987922846925016, |
|
"grad_norm": 0.23046875, |
|
"learning_rate": 2.0120771530749842e-06, |
|
"loss": 0.9258, |
|
"step": 13625 |
|
}, |
|
{ |
|
"epoch": 0.8002579586093685, |
|
"grad_norm": 0.25, |
|
"learning_rate": 1.997420413906314e-06, |
|
"loss": 0.8751, |
|
"step": 13650 |
|
}, |
|
{ |
|
"epoch": 0.8017236325262356, |
|
"grad_norm": 0.240234375, |
|
"learning_rate": 1.9827636747376446e-06, |
|
"loss": 0.892, |
|
"step": 13675 |
|
}, |
|
{ |
|
"epoch": 0.8031893064431025, |
|
"grad_norm": 0.234375, |
|
"learning_rate": 1.9681069355689746e-06, |
|
"loss": 0.8987, |
|
"step": 13700 |
|
}, |
|
{ |
|
"epoch": 0.8046549803599695, |
|
"grad_norm": 0.1474609375, |
|
"learning_rate": 1.953450196400305e-06, |
|
"loss": 0.8482, |
|
"step": 13725 |
|
}, |
|
{ |
|
"epoch": 0.8061206542768364, |
|
"grad_norm": 0.2314453125, |
|
"learning_rate": 1.9387934572316354e-06, |
|
"loss": 0.965, |
|
"step": 13750 |
|
}, |
|
{ |
|
"epoch": 0.8075863281937035, |
|
"grad_norm": 0.2275390625, |
|
"learning_rate": 1.9241367180629654e-06, |
|
"loss": 0.8086, |
|
"step": 13775 |
|
}, |
|
{ |
|
"epoch": 0.8090520021105705, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 1.909479978894296e-06, |
|
"loss": 0.8894, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 0.8105176760274374, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 1.8948232397256262e-06, |
|
"loss": 0.8153, |
|
"step": 13825 |
|
}, |
|
{ |
|
"epoch": 0.8119833499443044, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 1.8801665005569564e-06, |
|
"loss": 0.8529, |
|
"step": 13850 |
|
}, |
|
{ |
|
"epoch": 0.8134490238611713, |
|
"grad_norm": 0.1884765625, |
|
"learning_rate": 1.8655097613882864e-06, |
|
"loss": 0.9254, |
|
"step": 13875 |
|
}, |
|
{ |
|
"epoch": 0.8149146977780384, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 1.8508530222196166e-06, |
|
"loss": 0.932, |
|
"step": 13900 |
|
}, |
|
{ |
|
"epoch": 0.8163803716949053, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 1.836196283050947e-06, |
|
"loss": 0.9038, |
|
"step": 13925 |
|
}, |
|
{ |
|
"epoch": 0.8178460456117723, |
|
"grad_norm": 0.193359375, |
|
"learning_rate": 1.8215395438822772e-06, |
|
"loss": 0.8293, |
|
"step": 13950 |
|
}, |
|
{ |
|
"epoch": 0.8193117195286392, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 1.8068828047136076e-06, |
|
"loss": 0.8581, |
|
"step": 13975 |
|
}, |
|
{ |
|
"epoch": 0.8207773934455063, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 1.7922260655449378e-06, |
|
"loss": 0.8774, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.8222430673623732, |
|
"grad_norm": 0.166015625, |
|
"learning_rate": 1.777569326376268e-06, |
|
"loss": 0.9594, |
|
"step": 14025 |
|
}, |
|
{ |
|
"epoch": 0.8237087412792402, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 1.7629125872075984e-06, |
|
"loss": 0.9275, |
|
"step": 14050 |
|
}, |
|
{ |
|
"epoch": 0.8251744151961071, |
|
"grad_norm": 0.212890625, |
|
"learning_rate": 1.7482558480389283e-06, |
|
"loss": 0.8318, |
|
"step": 14075 |
|
}, |
|
{ |
|
"epoch": 0.8266400891129742, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 1.7335991088702585e-06, |
|
"loss": 0.9462, |
|
"step": 14100 |
|
}, |
|
{ |
|
"epoch": 0.8281057630298411, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 1.718942369701589e-06, |
|
"loss": 0.879, |
|
"step": 14125 |
|
}, |
|
{ |
|
"epoch": 0.8295714369467081, |
|
"grad_norm": 0.486328125, |
|
"learning_rate": 1.7042856305329191e-06, |
|
"loss": 0.8837, |
|
"step": 14150 |
|
}, |
|
{ |
|
"epoch": 0.8310371108635751, |
|
"grad_norm": 0.390625, |
|
"learning_rate": 1.6896288913642495e-06, |
|
"loss": 0.8956, |
|
"step": 14175 |
|
}, |
|
{ |
|
"epoch": 0.832502784780442, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 1.6749721521955797e-06, |
|
"loss": 0.9283, |
|
"step": 14200 |
|
}, |
|
{ |
|
"epoch": 0.8339684586973091, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 1.66031541302691e-06, |
|
"loss": 0.9279, |
|
"step": 14225 |
|
}, |
|
{ |
|
"epoch": 0.835434132614176, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 1.64565867385824e-06, |
|
"loss": 0.8446, |
|
"step": 14250 |
|
}, |
|
{ |
|
"epoch": 0.836899806531043, |
|
"grad_norm": 0.154296875, |
|
"learning_rate": 1.6310019346895703e-06, |
|
"loss": 0.8585, |
|
"step": 14275 |
|
}, |
|
{ |
|
"epoch": 0.8383654804479099, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 1.6163451955209005e-06, |
|
"loss": 0.9229, |
|
"step": 14300 |
|
}, |
|
{ |
|
"epoch": 0.839831154364777, |
|
"grad_norm": 0.197265625, |
|
"learning_rate": 1.601688456352231e-06, |
|
"loss": 0.7812, |
|
"step": 14325 |
|
}, |
|
{ |
|
"epoch": 0.8412968282816439, |
|
"grad_norm": 0.228515625, |
|
"learning_rate": 1.5870317171835611e-06, |
|
"loss": 0.925, |
|
"step": 14350 |
|
}, |
|
{ |
|
"epoch": 0.8427625021985109, |
|
"grad_norm": 0.25, |
|
"learning_rate": 1.5723749780148915e-06, |
|
"loss": 0.9276, |
|
"step": 14375 |
|
}, |
|
{ |
|
"epoch": 0.8442281761153778, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 1.5577182388462217e-06, |
|
"loss": 0.8881, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 0.8456938500322448, |
|
"grad_norm": 0.166015625, |
|
"learning_rate": 1.543061499677552e-06, |
|
"loss": 0.8279, |
|
"step": 14425 |
|
}, |
|
{ |
|
"epoch": 0.8471595239491118, |
|
"grad_norm": 0.466796875, |
|
"learning_rate": 1.5284047605088819e-06, |
|
"loss": 0.8415, |
|
"step": 14450 |
|
}, |
|
{ |
|
"epoch": 0.8486251978659788, |
|
"grad_norm": 0.220703125, |
|
"learning_rate": 1.5137480213402123e-06, |
|
"loss": 0.9089, |
|
"step": 14475 |
|
}, |
|
{ |
|
"epoch": 0.8500908717828457, |
|
"grad_norm": 0.1953125, |
|
"learning_rate": 1.4990912821715425e-06, |
|
"loss": 0.8786, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 0.8515565456997127, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 1.484434543002873e-06, |
|
"loss": 1.0085, |
|
"step": 14525 |
|
}, |
|
{ |
|
"epoch": 0.8530222196165798, |
|
"grad_norm": 0.240234375, |
|
"learning_rate": 1.469777803834203e-06, |
|
"loss": 0.9566, |
|
"step": 14550 |
|
}, |
|
{ |
|
"epoch": 0.8544878935334467, |
|
"grad_norm": 0.3828125, |
|
"learning_rate": 1.4551210646655333e-06, |
|
"loss": 0.7709, |
|
"step": 14575 |
|
}, |
|
{ |
|
"epoch": 0.8559535674503137, |
|
"grad_norm": 0.18359375, |
|
"learning_rate": 1.4404643254968637e-06, |
|
"loss": 0.8845, |
|
"step": 14600 |
|
}, |
|
{ |
|
"epoch": 0.8574192413671806, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 1.4258075863281939e-06, |
|
"loss": 0.8734, |
|
"step": 14625 |
|
}, |
|
{ |
|
"epoch": 0.8588849152840476, |
|
"grad_norm": 0.2099609375, |
|
"learning_rate": 1.4111508471595239e-06, |
|
"loss": 0.8424, |
|
"step": 14650 |
|
}, |
|
{ |
|
"epoch": 0.8603505892009146, |
|
"grad_norm": 0.4375, |
|
"learning_rate": 1.3964941079908543e-06, |
|
"loss": 0.8982, |
|
"step": 14675 |
|
}, |
|
{ |
|
"epoch": 0.8618162631177816, |
|
"grad_norm": 0.18359375, |
|
"learning_rate": 1.3818373688221845e-06, |
|
"loss": 0.8867, |
|
"step": 14700 |
|
}, |
|
{ |
|
"epoch": 0.8632819370346485, |
|
"grad_norm": 0.2138671875, |
|
"learning_rate": 1.3671806296535149e-06, |
|
"loss": 0.768, |
|
"step": 14725 |
|
}, |
|
{ |
|
"epoch": 0.8647476109515155, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 1.352523890484845e-06, |
|
"loss": 0.7438, |
|
"step": 14750 |
|
}, |
|
{ |
|
"epoch": 0.8662132848683824, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 1.3378671513161753e-06, |
|
"loss": 0.8944, |
|
"step": 14775 |
|
}, |
|
{ |
|
"epoch": 0.8676789587852495, |
|
"grad_norm": 0.419921875, |
|
"learning_rate": 1.3232104121475057e-06, |
|
"loss": 0.8471, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 0.8691446327021164, |
|
"grad_norm": 0.2275390625, |
|
"learning_rate": 1.3085536729788359e-06, |
|
"loss": 0.9804, |
|
"step": 14825 |
|
}, |
|
{ |
|
"epoch": 0.8706103066189834, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 1.2938969338101658e-06, |
|
"loss": 0.8258, |
|
"step": 14850 |
|
}, |
|
{ |
|
"epoch": 0.8720759805358503, |
|
"grad_norm": 4.28125, |
|
"learning_rate": 1.2792401946414962e-06, |
|
"loss": 1.0303, |
|
"step": 14875 |
|
}, |
|
{ |
|
"epoch": 0.8735416544527174, |
|
"grad_norm": 0.2314453125, |
|
"learning_rate": 1.2645834554728264e-06, |
|
"loss": 0.9401, |
|
"step": 14900 |
|
}, |
|
{ |
|
"epoch": 0.8750073283695843, |
|
"grad_norm": 0.1845703125, |
|
"learning_rate": 1.2499267163041566e-06, |
|
"loss": 0.8907, |
|
"step": 14925 |
|
}, |
|
{ |
|
"epoch": 0.8764730022864513, |
|
"grad_norm": 0.1865234375, |
|
"learning_rate": 1.235269977135487e-06, |
|
"loss": 1.0413, |
|
"step": 14950 |
|
}, |
|
{ |
|
"epoch": 0.8779386762033183, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 1.2206132379668172e-06, |
|
"loss": 0.8915, |
|
"step": 14975 |
|
}, |
|
{ |
|
"epoch": 0.8794043501201853, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 1.2059564987981474e-06, |
|
"loss": 1.0251, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.8808700240370523, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 1.1912997596294776e-06, |
|
"loss": 0.9363, |
|
"step": 15025 |
|
}, |
|
{ |
|
"epoch": 0.8823356979539192, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 1.176643020460808e-06, |
|
"loss": 0.9041, |
|
"step": 15050 |
|
}, |
|
{ |
|
"epoch": 0.8838013718707862, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 1.1619862812921382e-06, |
|
"loss": 0.9011, |
|
"step": 15075 |
|
}, |
|
{ |
|
"epoch": 0.8852670457876531, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 1.1473295421234684e-06, |
|
"loss": 0.8542, |
|
"step": 15100 |
|
}, |
|
{ |
|
"epoch": 0.8867327197045202, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 1.1326728029547986e-06, |
|
"loss": 0.8591, |
|
"step": 15125 |
|
}, |
|
{ |
|
"epoch": 0.8881983936213871, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 1.118016063786129e-06, |
|
"loss": 0.8726, |
|
"step": 15150 |
|
}, |
|
{ |
|
"epoch": 0.8896640675382541, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 1.1033593246174592e-06, |
|
"loss": 0.8511, |
|
"step": 15175 |
|
}, |
|
{ |
|
"epoch": 0.891129741455121, |
|
"grad_norm": 0.1884765625, |
|
"learning_rate": 1.0887025854487894e-06, |
|
"loss": 0.8862, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 0.892595415371988, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 1.0740458462801196e-06, |
|
"loss": 0.8561, |
|
"step": 15225 |
|
}, |
|
{ |
|
"epoch": 0.894061089288855, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 1.05938910711145e-06, |
|
"loss": 0.9929, |
|
"step": 15250 |
|
}, |
|
{ |
|
"epoch": 0.895526763205722, |
|
"grad_norm": 0.2470703125, |
|
"learning_rate": 1.0447323679427802e-06, |
|
"loss": 0.8928, |
|
"step": 15275 |
|
}, |
|
{ |
|
"epoch": 0.8969924371225889, |
|
"grad_norm": 0.1650390625, |
|
"learning_rate": 1.0300756287741104e-06, |
|
"loss": 0.8723, |
|
"step": 15300 |
|
}, |
|
{ |
|
"epoch": 0.8984581110394559, |
|
"grad_norm": 0.2109375, |
|
"learning_rate": 1.0154188896054406e-06, |
|
"loss": 0.8111, |
|
"step": 15325 |
|
}, |
|
{ |
|
"epoch": 0.899923784956323, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 1.000762150436771e-06, |
|
"loss": 0.8699, |
|
"step": 15350 |
|
}, |
|
{ |
|
"epoch": 0.9013894588731899, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 9.861054112681012e-07, |
|
"loss": 0.9412, |
|
"step": 15375 |
|
}, |
|
{ |
|
"epoch": 0.9028551327900569, |
|
"grad_norm": 0.171875, |
|
"learning_rate": 9.714486720994314e-07, |
|
"loss": 0.9887, |
|
"step": 15400 |
|
}, |
|
{ |
|
"epoch": 0.9043208067069238, |
|
"grad_norm": 0.1533203125, |
|
"learning_rate": 9.567919329307616e-07, |
|
"loss": 0.9231, |
|
"step": 15425 |
|
}, |
|
{ |
|
"epoch": 0.9057864806237909, |
|
"grad_norm": 0.228515625, |
|
"learning_rate": 9.421351937620919e-07, |
|
"loss": 0.8929, |
|
"step": 15450 |
|
}, |
|
{ |
|
"epoch": 0.9072521545406578, |
|
"grad_norm": 0.2197265625, |
|
"learning_rate": 9.274784545934222e-07, |
|
"loss": 0.8695, |
|
"step": 15475 |
|
}, |
|
{ |
|
"epoch": 0.9087178284575248, |
|
"grad_norm": 0.2392578125, |
|
"learning_rate": 9.128217154247524e-07, |
|
"loss": 0.8565, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 0.9101835023743917, |
|
"grad_norm": 0.2109375, |
|
"learning_rate": 8.981649762560827e-07, |
|
"loss": 0.8357, |
|
"step": 15525 |
|
}, |
|
{ |
|
"epoch": 0.9116491762912587, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 8.835082370874129e-07, |
|
"loss": 0.9423, |
|
"step": 15550 |
|
}, |
|
{ |
|
"epoch": 0.9131148502081257, |
|
"grad_norm": 0.21484375, |
|
"learning_rate": 8.688514979187431e-07, |
|
"loss": 0.7956, |
|
"step": 15575 |
|
}, |
|
{ |
|
"epoch": 0.9145805241249927, |
|
"grad_norm": 0.193359375, |
|
"learning_rate": 8.541947587500734e-07, |
|
"loss": 0.8043, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 0.9160461980418596, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 8.395380195814037e-07, |
|
"loss": 1.0452, |
|
"step": 15625 |
|
}, |
|
{ |
|
"epoch": 0.9175118719587266, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 8.248812804127338e-07, |
|
"loss": 0.8549, |
|
"step": 15650 |
|
}, |
|
{ |
|
"epoch": 0.9189775458755935, |
|
"grad_norm": 0.1689453125, |
|
"learning_rate": 8.10224541244064e-07, |
|
"loss": 0.904, |
|
"step": 15675 |
|
}, |
|
{ |
|
"epoch": 0.9204432197924606, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 7.955678020753943e-07, |
|
"loss": 0.9144, |
|
"step": 15700 |
|
}, |
|
{ |
|
"epoch": 0.9219088937093276, |
|
"grad_norm": 0.2080078125, |
|
"learning_rate": 7.809110629067245e-07, |
|
"loss": 0.9322, |
|
"step": 15725 |
|
}, |
|
{ |
|
"epoch": 0.9233745676261945, |
|
"grad_norm": 0.1689453125, |
|
"learning_rate": 7.662543237380548e-07, |
|
"loss": 0.8968, |
|
"step": 15750 |
|
}, |
|
{ |
|
"epoch": 0.9248402415430615, |
|
"grad_norm": 0.45703125, |
|
"learning_rate": 7.51597584569385e-07, |
|
"loss": 0.834, |
|
"step": 15775 |
|
}, |
|
{ |
|
"epoch": 0.9263059154599285, |
|
"grad_norm": 0.2158203125, |
|
"learning_rate": 7.369408454007153e-07, |
|
"loss": 0.8769, |
|
"step": 15800 |
|
}, |
|
{ |
|
"epoch": 0.9277715893767955, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 7.222841062320455e-07, |
|
"loss": 0.8159, |
|
"step": 15825 |
|
}, |
|
{ |
|
"epoch": 0.9292372632936624, |
|
"grad_norm": 0.24609375, |
|
"learning_rate": 7.076273670633758e-07, |
|
"loss": 0.8529, |
|
"step": 15850 |
|
}, |
|
{ |
|
"epoch": 0.9307029372105294, |
|
"grad_norm": 0.2236328125, |
|
"learning_rate": 6.92970627894706e-07, |
|
"loss": 0.8882, |
|
"step": 15875 |
|
}, |
|
{ |
|
"epoch": 0.9321686111273964, |
|
"grad_norm": 0.2041015625, |
|
"learning_rate": 6.783138887260363e-07, |
|
"loss": 0.9047, |
|
"step": 15900 |
|
}, |
|
{ |
|
"epoch": 0.9336342850442634, |
|
"grad_norm": 0.3671875, |
|
"learning_rate": 6.636571495573665e-07, |
|
"loss": 0.9791, |
|
"step": 15925 |
|
}, |
|
{ |
|
"epoch": 0.9350999589611303, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 6.490004103886968e-07, |
|
"loss": 0.8344, |
|
"step": 15950 |
|
}, |
|
{ |
|
"epoch": 0.9365656328779973, |
|
"grad_norm": 0.2158203125, |
|
"learning_rate": 6.34343671220027e-07, |
|
"loss": 0.8912, |
|
"step": 15975 |
|
}, |
|
{ |
|
"epoch": 0.9380313067948642, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 6.196869320513572e-07, |
|
"loss": 0.8693, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.9394969807117313, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 6.050301928826875e-07, |
|
"loss": 0.897, |
|
"step": 16025 |
|
}, |
|
{ |
|
"epoch": 0.9409626546285982, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 5.903734537140177e-07, |
|
"loss": 0.8517, |
|
"step": 16050 |
|
}, |
|
{ |
|
"epoch": 0.9424283285454652, |
|
"grad_norm": 0.2197265625, |
|
"learning_rate": 5.75716714545348e-07, |
|
"loss": 0.9227, |
|
"step": 16075 |
|
}, |
|
{ |
|
"epoch": 0.9438940024623321, |
|
"grad_norm": 0.12060546875, |
|
"learning_rate": 5.610599753766782e-07, |
|
"loss": 0.8516, |
|
"step": 16100 |
|
}, |
|
{ |
|
"epoch": 0.9453596763791992, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 5.464032362080085e-07, |
|
"loss": 0.9763, |
|
"step": 16125 |
|
}, |
|
{ |
|
"epoch": 0.9468253502960662, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 5.317464970393387e-07, |
|
"loss": 0.8946, |
|
"step": 16150 |
|
}, |
|
{ |
|
"epoch": 0.9482910242129331, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 5.17089757870669e-07, |
|
"loss": 0.7753, |
|
"step": 16175 |
|
}, |
|
{ |
|
"epoch": 0.9497566981298001, |
|
"grad_norm": 0.43359375, |
|
"learning_rate": 5.024330187019992e-07, |
|
"loss": 0.7681, |
|
"step": 16200 |
|
}, |
|
{ |
|
"epoch": 0.951222372046667, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 4.877762795333295e-07, |
|
"loss": 0.9284, |
|
"step": 16225 |
|
}, |
|
{ |
|
"epoch": 0.9526880459635341, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 4.731195403646597e-07, |
|
"loss": 0.8657, |
|
"step": 16250 |
|
}, |
|
{ |
|
"epoch": 0.954153719880401, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 4.5846280119598996e-07, |
|
"loss": 0.8839, |
|
"step": 16275 |
|
}, |
|
{ |
|
"epoch": 0.955619393797268, |
|
"grad_norm": 0.1845703125, |
|
"learning_rate": 4.4380606202732016e-07, |
|
"loss": 0.9618, |
|
"step": 16300 |
|
}, |
|
{ |
|
"epoch": 0.9570850677141349, |
|
"grad_norm": 0.2294921875, |
|
"learning_rate": 4.2914932285865046e-07, |
|
"loss": 0.9123, |
|
"step": 16325 |
|
}, |
|
{ |
|
"epoch": 0.958550741631002, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 4.1449258368998065e-07, |
|
"loss": 1.0314, |
|
"step": 16350 |
|
}, |
|
{ |
|
"epoch": 0.9600164155478689, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 3.9983584452131095e-07, |
|
"loss": 0.8674, |
|
"step": 16375 |
|
}, |
|
{ |
|
"epoch": 0.9614820894647359, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 3.8517910535264115e-07, |
|
"loss": 0.8991, |
|
"step": 16400 |
|
}, |
|
{ |
|
"epoch": 0.9629477633816028, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 3.7052236618397145e-07, |
|
"loss": 0.8511, |
|
"step": 16425 |
|
}, |
|
{ |
|
"epoch": 0.9644134372984698, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 3.5586562701530164e-07, |
|
"loss": 0.8599, |
|
"step": 16450 |
|
}, |
|
{ |
|
"epoch": 0.9658791112153368, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 3.4120888784663194e-07, |
|
"loss": 0.8352, |
|
"step": 16475 |
|
}, |
|
{ |
|
"epoch": 0.9673447851322038, |
|
"grad_norm": 0.470703125, |
|
"learning_rate": 3.2655214867796213e-07, |
|
"loss": 0.7832, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 0.9688104590490708, |
|
"grad_norm": 0.1982421875, |
|
"learning_rate": 3.118954095092924e-07, |
|
"loss": 0.8793, |
|
"step": 16525 |
|
}, |
|
{ |
|
"epoch": 0.9702761329659377, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 2.972386703406226e-07, |
|
"loss": 0.8602, |
|
"step": 16550 |
|
}, |
|
{ |
|
"epoch": 0.9717418068828048, |
|
"grad_norm": 0.2353515625, |
|
"learning_rate": 2.8258193117195287e-07, |
|
"loss": 0.8972, |
|
"step": 16575 |
|
}, |
|
{ |
|
"epoch": 0.9732074807996717, |
|
"grad_norm": 0.111328125, |
|
"learning_rate": 2.679251920032831e-07, |
|
"loss": 0.8231, |
|
"step": 16600 |
|
}, |
|
{ |
|
"epoch": 0.9746731547165387, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 2.5326845283461337e-07, |
|
"loss": 0.8406, |
|
"step": 16625 |
|
}, |
|
{ |
|
"epoch": 0.9761388286334056, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 2.386117136659436e-07, |
|
"loss": 0.9921, |
|
"step": 16650 |
|
}, |
|
{ |
|
"epoch": 0.9776045025502726, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 2.2395497449727386e-07, |
|
"loss": 0.8458, |
|
"step": 16675 |
|
}, |
|
{ |
|
"epoch": 0.9790701764671396, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 2.092982353286041e-07, |
|
"loss": 0.9056, |
|
"step": 16700 |
|
}, |
|
{ |
|
"epoch": 0.9805358503840066, |
|
"grad_norm": 0.189453125, |
|
"learning_rate": 1.9464149615993435e-07, |
|
"loss": 0.8358, |
|
"step": 16725 |
|
}, |
|
{ |
|
"epoch": 0.9820015243008735, |
|
"grad_norm": 0.1552734375, |
|
"learning_rate": 1.799847569912646e-07, |
|
"loss": 0.9285, |
|
"step": 16750 |
|
}, |
|
{ |
|
"epoch": 0.9834671982177405, |
|
"grad_norm": 0.248046875, |
|
"learning_rate": 1.6532801782259485e-07, |
|
"loss": 0.8284, |
|
"step": 16775 |
|
}, |
|
{ |
|
"epoch": 0.9849328721346075, |
|
"grad_norm": 0.216796875, |
|
"learning_rate": 1.506712786539251e-07, |
|
"loss": 0.8364, |
|
"step": 16800 |
|
}, |
|
{ |
|
"epoch": 0.9863985460514745, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 1.3601453948525534e-07, |
|
"loss": 0.9212, |
|
"step": 16825 |
|
}, |
|
{ |
|
"epoch": 0.9878642199683414, |
|
"grad_norm": 0.234375, |
|
"learning_rate": 1.213578003165856e-07, |
|
"loss": 0.8225, |
|
"step": 16850 |
|
}, |
|
{ |
|
"epoch": 0.9893298938852084, |
|
"grad_norm": 0.16015625, |
|
"learning_rate": 1.0670106114791582e-07, |
|
"loss": 0.9779, |
|
"step": 16875 |
|
}, |
|
{ |
|
"epoch": 0.9907955678020754, |
|
"grad_norm": 0.1845703125, |
|
"learning_rate": 9.204432197924607e-08, |
|
"loss": 0.816, |
|
"step": 16900 |
|
}, |
|
{ |
|
"epoch": 0.9922612417189424, |
|
"grad_norm": 0.23046875, |
|
"learning_rate": 7.738758281057632e-08, |
|
"loss": 0.8379, |
|
"step": 16925 |
|
}, |
|
{ |
|
"epoch": 0.9937269156358094, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 6.273084364190655e-08, |
|
"loss": 0.8998, |
|
"step": 16950 |
|
}, |
|
{ |
|
"epoch": 0.9951925895526763, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 4.80741044732368e-08, |
|
"loss": 0.9126, |
|
"step": 16975 |
|
}, |
|
{ |
|
"epoch": 0.9966582634695433, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 3.341736530456704e-08, |
|
"loss": 0.8822, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.9981239373864103, |
|
"grad_norm": 0.466796875, |
|
"learning_rate": 1.8760626135897286e-08, |
|
"loss": 0.8701, |
|
"step": 17025 |
|
}, |
|
{ |
|
"epoch": 0.9995896113032773, |
|
"grad_norm": 4.75, |
|
"learning_rate": 4.103886967227532e-09, |
|
"loss": 0.9079, |
|
"step": 17050 |
|
} |
|
], |
|
"logging_steps": 25, |
|
"max_steps": 17057, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.4293390811789451e+19, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|