{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 200,
"global_step": 17057,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 5.8626956674679014e-05,
"grad_norm": 0.232421875,
"learning_rate": 9.999413730433253e-06,
"loss": 1.0997,
"step": 1
},
{
"epoch": 0.0014656739168669754,
"grad_norm": 0.25,
"learning_rate": 9.985343260831331e-06,
"loss": 0.8715,
"step": 25
},
{
"epoch": 0.0029313478337339507,
"grad_norm": 0.59765625,
"learning_rate": 9.970686521662662e-06,
"loss": 0.8613,
"step": 50
},
{
"epoch": 0.004397021750600926,
"grad_norm": 0.3671875,
"learning_rate": 9.95602978249399e-06,
"loss": 0.9177,
"step": 75
},
{
"epoch": 0.0058626956674679015,
"grad_norm": 0.412109375,
"learning_rate": 9.941373043325322e-06,
"loss": 0.8869,
"step": 100
},
{
"epoch": 0.007328369584334877,
"grad_norm": 0.25,
"learning_rate": 9.926716304156651e-06,
"loss": 0.852,
"step": 125
},
{
"epoch": 0.008794043501201852,
"grad_norm": 0.4921875,
"learning_rate": 9.912059564987983e-06,
"loss": 0.8612,
"step": 150
},
{
"epoch": 0.010259717418068828,
"grad_norm": 0.484375,
"learning_rate": 9.897402825819312e-06,
"loss": 0.9075,
"step": 175
},
{
"epoch": 0.011725391334935803,
"grad_norm": 0.578125,
"learning_rate": 9.882746086650642e-06,
"loss": 0.9488,
"step": 200
},
{
"epoch": 0.013191065251802778,
"grad_norm": 0.486328125,
"learning_rate": 9.868089347481973e-06,
"loss": 0.8637,
"step": 225
},
{
"epoch": 0.014656739168669754,
"grad_norm": 0.578125,
"learning_rate": 9.853432608313303e-06,
"loss": 0.9587,
"step": 250
},
{
"epoch": 0.01612241308553673,
"grad_norm": 0.2451171875,
"learning_rate": 9.838775869144634e-06,
"loss": 0.8171,
"step": 275
},
{
"epoch": 0.017588087002403704,
"grad_norm": 0.251953125,
"learning_rate": 9.824119129975964e-06,
"loss": 0.8413,
"step": 300
},
{
"epoch": 0.01905376091927068,
"grad_norm": 0.39453125,
"learning_rate": 9.809462390807294e-06,
"loss": 0.796,
"step": 325
},
{
"epoch": 0.020519434836137655,
"grad_norm": 0.326171875,
"learning_rate": 9.794805651638625e-06,
"loss": 0.8025,
"step": 350
},
{
"epoch": 0.02198510875300463,
"grad_norm": 0.1630859375,
"learning_rate": 9.780148912469955e-06,
"loss": 0.8279,
"step": 375
},
{
"epoch": 0.023450782669871606,
"grad_norm": 0.5078125,
"learning_rate": 9.765492173301284e-06,
"loss": 0.807,
"step": 400
},
{
"epoch": 0.02491645658673858,
"grad_norm": 0.29296875,
"learning_rate": 9.750835434132614e-06,
"loss": 0.9146,
"step": 425
},
{
"epoch": 0.026382130503605557,
"grad_norm": 0.3125,
"learning_rate": 9.736178694963945e-06,
"loss": 0.8667,
"step": 450
},
{
"epoch": 0.027847804420472532,
"grad_norm": 0.349609375,
"learning_rate": 9.721521955795275e-06,
"loss": 0.8888,
"step": 475
},
{
"epoch": 0.029313478337339507,
"grad_norm": 0.1982421875,
"learning_rate": 9.706865216626606e-06,
"loss": 0.8417,
"step": 500
},
{
"epoch": 0.030779152254206483,
"grad_norm": 0.20703125,
"learning_rate": 9.692208477457936e-06,
"loss": 0.8205,
"step": 525
},
{
"epoch": 0.03224482617107346,
"grad_norm": 0.2216796875,
"learning_rate": 9.677551738289266e-06,
"loss": 0.8273,
"step": 550
},
{
"epoch": 0.03371050008794044,
"grad_norm": 0.39453125,
"learning_rate": 9.662894999120597e-06,
"loss": 0.8107,
"step": 575
},
{
"epoch": 0.03517617400480741,
"grad_norm": 0.1728515625,
"learning_rate": 9.648238259951927e-06,
"loss": 0.7997,
"step": 600
},
{
"epoch": 0.03664184792167439,
"grad_norm": 0.458984375,
"learning_rate": 9.633581520783258e-06,
"loss": 0.8802,
"step": 625
},
{
"epoch": 0.03810752183854136,
"grad_norm": 0.2333984375,
"learning_rate": 9.618924781614586e-06,
"loss": 0.8859,
"step": 650
},
{
"epoch": 0.03957319575540834,
"grad_norm": 0.294921875,
"learning_rate": 9.604268042445918e-06,
"loss": 0.8317,
"step": 675
},
{
"epoch": 0.04103886967227531,
"grad_norm": 0.208984375,
"learning_rate": 9.589611303277247e-06,
"loss": 0.8483,
"step": 700
},
{
"epoch": 0.04250454358914229,
"grad_norm": 0.2490234375,
"learning_rate": 9.57495456410858e-06,
"loss": 0.8267,
"step": 725
},
{
"epoch": 0.04397021750600926,
"grad_norm": 0.3984375,
"learning_rate": 9.560297824939908e-06,
"loss": 0.9042,
"step": 750
},
{
"epoch": 0.04543589142287624,
"grad_norm": 0.126953125,
"learning_rate": 9.545641085771238e-06,
"loss": 0.8058,
"step": 775
},
{
"epoch": 0.04690156533974321,
"grad_norm": 0.34375,
"learning_rate": 9.530984346602569e-06,
"loss": 0.7874,
"step": 800
},
{
"epoch": 0.04836723925661019,
"grad_norm": 0.216796875,
"learning_rate": 9.516327607433899e-06,
"loss": 0.8562,
"step": 825
},
{
"epoch": 0.04983291317347716,
"grad_norm": 0.2890625,
"learning_rate": 9.50167086826523e-06,
"loss": 0.8571,
"step": 850
},
{
"epoch": 0.05129858709034414,
"grad_norm": 0.2138671875,
"learning_rate": 9.487014129096558e-06,
"loss": 0.8252,
"step": 875
},
{
"epoch": 0.05276426100721111,
"grad_norm": 0.2890625,
"learning_rate": 9.47235738992789e-06,
"loss": 0.9093,
"step": 900
},
{
"epoch": 0.05422993492407809,
"grad_norm": 0.369140625,
"learning_rate": 9.457700650759219e-06,
"loss": 0.7995,
"step": 925
},
{
"epoch": 0.055695608840945064,
"grad_norm": 0.330078125,
"learning_rate": 9.443043911590551e-06,
"loss": 0.9485,
"step": 950
},
{
"epoch": 0.05716128275781204,
"grad_norm": 0.248046875,
"learning_rate": 9.42838717242188e-06,
"loss": 0.8695,
"step": 975
},
{
"epoch": 0.058626956674679015,
"grad_norm": 0.33203125,
"learning_rate": 9.41373043325321e-06,
"loss": 0.8645,
"step": 1000
},
{
"epoch": 0.060092630591545994,
"grad_norm": 1.5546875,
"learning_rate": 9.39907369408454e-06,
"loss": 0.9043,
"step": 1025
},
{
"epoch": 0.061558304508412966,
"grad_norm": 2.25,
"learning_rate": 9.384416954915871e-06,
"loss": 0.8794,
"step": 1050
},
{
"epoch": 0.06302397842527994,
"grad_norm": 0.248046875,
"learning_rate": 9.369760215747201e-06,
"loss": 0.8327,
"step": 1075
},
{
"epoch": 0.06448965234214692,
"grad_norm": 0.353515625,
"learning_rate": 9.355103476578532e-06,
"loss": 0.9179,
"step": 1100
},
{
"epoch": 0.06595532625901389,
"grad_norm": 1.140625,
"learning_rate": 9.340446737409862e-06,
"loss": 0.8321,
"step": 1125
},
{
"epoch": 0.06742100017588087,
"grad_norm": 0.2412109375,
"learning_rate": 9.325789998241193e-06,
"loss": 0.8061,
"step": 1150
},
{
"epoch": 0.06888667409274785,
"grad_norm": 0.25390625,
"learning_rate": 9.311133259072523e-06,
"loss": 0.8817,
"step": 1175
},
{
"epoch": 0.07035234800961482,
"grad_norm": 0.283203125,
"learning_rate": 9.296476519903852e-06,
"loss": 0.9002,
"step": 1200
},
{
"epoch": 0.07181802192648179,
"grad_norm": 0.38671875,
"learning_rate": 9.281819780735182e-06,
"loss": 0.9576,
"step": 1225
},
{
"epoch": 0.07328369584334878,
"grad_norm": 0.439453125,
"learning_rate": 9.267163041566513e-06,
"loss": 0.8482,
"step": 1250
},
{
"epoch": 0.07474936976021575,
"grad_norm": 0.1474609375,
"learning_rate": 9.252506302397843e-06,
"loss": 0.8323,
"step": 1275
},
{
"epoch": 0.07621504367708272,
"grad_norm": 0.29296875,
"learning_rate": 9.237849563229173e-06,
"loss": 0.8371,
"step": 1300
},
{
"epoch": 0.07768071759394969,
"grad_norm": 0.232421875,
"learning_rate": 9.223192824060504e-06,
"loss": 0.8462,
"step": 1325
},
{
"epoch": 0.07914639151081668,
"grad_norm": 0.333984375,
"learning_rate": 9.208536084891834e-06,
"loss": 0.9729,
"step": 1350
},
{
"epoch": 0.08061206542768365,
"grad_norm": 0.2177734375,
"learning_rate": 9.193879345723165e-06,
"loss": 1.0097,
"step": 1375
},
{
"epoch": 0.08207773934455062,
"grad_norm": 0.205078125,
"learning_rate": 9.179222606554495e-06,
"loss": 0.8492,
"step": 1400
},
{
"epoch": 0.0835434132614176,
"grad_norm": 0.2197265625,
"learning_rate": 9.164565867385825e-06,
"loss": 0.8352,
"step": 1425
},
{
"epoch": 0.08500908717828458,
"grad_norm": 0.40625,
"learning_rate": 9.149909128217154e-06,
"loss": 0.82,
"step": 1450
},
{
"epoch": 0.08647476109515155,
"grad_norm": 0.23828125,
"learning_rate": 9.135252389048486e-06,
"loss": 0.8827,
"step": 1475
},
{
"epoch": 0.08794043501201852,
"grad_norm": 0.267578125,
"learning_rate": 9.120595649879815e-06,
"loss": 0.8238,
"step": 1500
},
{
"epoch": 0.08940610892888551,
"grad_norm": 0.32421875,
"learning_rate": 9.105938910711145e-06,
"loss": 0.9153,
"step": 1525
},
{
"epoch": 0.09087178284575248,
"grad_norm": 0.265625,
"learning_rate": 9.091282171542476e-06,
"loss": 0.8383,
"step": 1550
},
{
"epoch": 0.09233745676261945,
"grad_norm": 0.5546875,
"learning_rate": 9.076625432373806e-06,
"loss": 0.8326,
"step": 1575
},
{
"epoch": 0.09380313067948642,
"grad_norm": 0.197265625,
"learning_rate": 9.061968693205137e-06,
"loss": 0.9013,
"step": 1600
},
{
"epoch": 0.09526880459635341,
"grad_norm": 0.23828125,
"learning_rate": 9.047311954036467e-06,
"loss": 0.882,
"step": 1625
},
{
"epoch": 0.09673447851322038,
"grad_norm": 0.427734375,
"learning_rate": 9.032655214867797e-06,
"loss": 0.8533,
"step": 1650
},
{
"epoch": 0.09820015243008735,
"grad_norm": 0.44140625,
"learning_rate": 9.017998475699126e-06,
"loss": 0.8696,
"step": 1675
},
{
"epoch": 0.09966582634695433,
"grad_norm": 5.71875,
"learning_rate": 9.003341736530458e-06,
"loss": 1.0214,
"step": 1700
},
{
"epoch": 0.10113150026382131,
"grad_norm": 0.404296875,
"learning_rate": 8.988684997361787e-06,
"loss": 0.8732,
"step": 1725
},
{
"epoch": 0.10259717418068828,
"grad_norm": 0.1787109375,
"learning_rate": 8.974028258193117e-06,
"loss": 0.9454,
"step": 1750
},
{
"epoch": 0.10406284809755526,
"grad_norm": 0.2294921875,
"learning_rate": 8.959371519024448e-06,
"loss": 0.8994,
"step": 1775
},
{
"epoch": 0.10552852201442223,
"grad_norm": 0.25390625,
"learning_rate": 8.944714779855778e-06,
"loss": 0.9015,
"step": 1800
},
{
"epoch": 0.10699419593128921,
"grad_norm": 0.138671875,
"learning_rate": 8.930058040687109e-06,
"loss": 0.884,
"step": 1825
},
{
"epoch": 0.10845986984815618,
"grad_norm": 0.404296875,
"learning_rate": 8.915401301518439e-06,
"loss": 0.7944,
"step": 1850
},
{
"epoch": 0.10992554376502316,
"grad_norm": 0.1953125,
"learning_rate": 8.90074456234977e-06,
"loss": 0.8855,
"step": 1875
},
{
"epoch": 0.11139121768189013,
"grad_norm": 0.55859375,
"learning_rate": 8.8860878231811e-06,
"loss": 1.0339,
"step": 1900
},
{
"epoch": 0.11285689159875711,
"grad_norm": 0.48046875,
"learning_rate": 8.87143108401243e-06,
"loss": 1.0966,
"step": 1925
},
{
"epoch": 0.11432256551562409,
"grad_norm": 0.2421875,
"learning_rate": 8.85677434484376e-06,
"loss": 0.8355,
"step": 1950
},
{
"epoch": 0.11578823943249106,
"grad_norm": 0.314453125,
"learning_rate": 8.84211760567509e-06,
"loss": 0.8109,
"step": 1975
},
{
"epoch": 0.11725391334935803,
"grad_norm": 0.26171875,
"learning_rate": 8.82746086650642e-06,
"loss": 0.9596,
"step": 2000
},
{
"epoch": 0.11871958726622502,
"grad_norm": 0.375,
"learning_rate": 8.81280412733775e-06,
"loss": 0.8931,
"step": 2025
},
{
"epoch": 0.12018526118309199,
"grad_norm": 0.20703125,
"learning_rate": 8.79814738816908e-06,
"loss": 0.818,
"step": 2050
},
{
"epoch": 0.12165093509995896,
"grad_norm": 0.2265625,
"learning_rate": 8.783490649000411e-06,
"loss": 0.8465,
"step": 2075
},
{
"epoch": 0.12311660901682593,
"grad_norm": 0.1689453125,
"learning_rate": 8.768833909831741e-06,
"loss": 0.8754,
"step": 2100
},
{
"epoch": 0.12458228293369292,
"grad_norm": 0.158203125,
"learning_rate": 8.754177170663072e-06,
"loss": 0.8368,
"step": 2125
},
{
"epoch": 0.1260479568505599,
"grad_norm": 0.2734375,
"learning_rate": 8.739520431494402e-06,
"loss": 0.8626,
"step": 2150
},
{
"epoch": 0.12751363076742686,
"grad_norm": 0.22265625,
"learning_rate": 8.724863692325733e-06,
"loss": 0.9038,
"step": 2175
},
{
"epoch": 0.12897930468429383,
"grad_norm": 0.1796875,
"learning_rate": 8.710206953157061e-06,
"loss": 0.8421,
"step": 2200
},
{
"epoch": 0.1304449786011608,
"grad_norm": 0.5,
"learning_rate": 8.695550213988393e-06,
"loss": 0.9368,
"step": 2225
},
{
"epoch": 0.13191065251802778,
"grad_norm": 0.3671875,
"learning_rate": 8.680893474819722e-06,
"loss": 0.8676,
"step": 2250
},
{
"epoch": 0.13337632643489478,
"grad_norm": 0.271484375,
"learning_rate": 8.666236735651054e-06,
"loss": 0.7942,
"step": 2275
},
{
"epoch": 0.13484200035176175,
"grad_norm": 0.2216796875,
"learning_rate": 8.651579996482383e-06,
"loss": 0.895,
"step": 2300
},
{
"epoch": 0.13630767426862872,
"grad_norm": 0.2119140625,
"learning_rate": 8.636923257313713e-06,
"loss": 0.8239,
"step": 2325
},
{
"epoch": 0.1377733481854957,
"grad_norm": 0.1650390625,
"learning_rate": 8.622266518145044e-06,
"loss": 0.8594,
"step": 2350
},
{
"epoch": 0.13923902210236266,
"grad_norm": 0.1923828125,
"learning_rate": 8.607609778976374e-06,
"loss": 0.9231,
"step": 2375
},
{
"epoch": 0.14070469601922964,
"grad_norm": 0.400390625,
"learning_rate": 8.592953039807705e-06,
"loss": 0.8337,
"step": 2400
},
{
"epoch": 0.1421703699360966,
"grad_norm": 0.10498046875,
"learning_rate": 8.578296300639033e-06,
"loss": 0.8142,
"step": 2425
},
{
"epoch": 0.14363604385296358,
"grad_norm": 0.7421875,
"learning_rate": 8.563639561470365e-06,
"loss": 0.8561,
"step": 2450
},
{
"epoch": 0.14510171776983058,
"grad_norm": 0.263671875,
"learning_rate": 8.548982822301694e-06,
"loss": 0.9175,
"step": 2475
},
{
"epoch": 0.14656739168669755,
"grad_norm": 0.146484375,
"learning_rate": 8.534326083133026e-06,
"loss": 0.9498,
"step": 2500
},
{
"epoch": 0.14803306560356452,
"grad_norm": 0.36328125,
"learning_rate": 8.519669343964355e-06,
"loss": 0.9313,
"step": 2525
},
{
"epoch": 0.1494987395204315,
"grad_norm": 0.392578125,
"learning_rate": 8.505012604795685e-06,
"loss": 0.9784,
"step": 2550
},
{
"epoch": 0.15096441343729847,
"grad_norm": 0.35546875,
"learning_rate": 8.490355865627016e-06,
"loss": 0.9337,
"step": 2575
},
{
"epoch": 0.15243008735416544,
"grad_norm": 0.1025390625,
"learning_rate": 8.475699126458346e-06,
"loss": 0.975,
"step": 2600
},
{
"epoch": 0.1538957612710324,
"grad_norm": 0.375,
"learning_rate": 8.461042387289676e-06,
"loss": 0.8933,
"step": 2625
},
{
"epoch": 0.15536143518789938,
"grad_norm": 0.234375,
"learning_rate": 8.446385648121007e-06,
"loss": 0.7984,
"step": 2650
},
{
"epoch": 0.15682710910476638,
"grad_norm": 0.2578125,
"learning_rate": 8.431728908952337e-06,
"loss": 0.8407,
"step": 2675
},
{
"epoch": 0.15829278302163335,
"grad_norm": 0.23828125,
"learning_rate": 8.417072169783668e-06,
"loss": 0.8591,
"step": 2700
},
{
"epoch": 0.15975845693850033,
"grad_norm": 0.439453125,
"learning_rate": 8.402415430614998e-06,
"loss": 0.906,
"step": 2725
},
{
"epoch": 0.1612241308553673,
"grad_norm": 0.455078125,
"learning_rate": 8.387758691446328e-06,
"loss": 0.8644,
"step": 2750
},
{
"epoch": 0.16268980477223427,
"grad_norm": 0.162109375,
"learning_rate": 8.373101952277657e-06,
"loss": 0.8917,
"step": 2775
},
{
"epoch": 0.16415547868910124,
"grad_norm": 0.2197265625,
"learning_rate": 8.358445213108988e-06,
"loss": 0.852,
"step": 2800
},
{
"epoch": 0.1656211526059682,
"grad_norm": 0.25390625,
"learning_rate": 8.343788473940318e-06,
"loss": 0.9059,
"step": 2825
},
{
"epoch": 0.1670868265228352,
"grad_norm": 0.392578125,
"learning_rate": 8.329131734771648e-06,
"loss": 0.9189,
"step": 2850
},
{
"epoch": 0.16855250043970219,
"grad_norm": 0.19140625,
"learning_rate": 8.314474995602979e-06,
"loss": 0.9356,
"step": 2875
},
{
"epoch": 0.17001817435656916,
"grad_norm": 0.232421875,
"learning_rate": 8.29981825643431e-06,
"loss": 0.8653,
"step": 2900
},
{
"epoch": 0.17148384827343613,
"grad_norm": 0.3125,
"learning_rate": 8.28516151726564e-06,
"loss": 0.9481,
"step": 2925
},
{
"epoch": 0.1729495221903031,
"grad_norm": 0.267578125,
"learning_rate": 8.27050477809697e-06,
"loss": 0.8533,
"step": 2950
},
{
"epoch": 0.17441519610717007,
"grad_norm": 0.353515625,
"learning_rate": 8.2558480389283e-06,
"loss": 0.9067,
"step": 2975
},
{
"epoch": 0.17588087002403704,
"grad_norm": 0.7890625,
"learning_rate": 8.24119129975963e-06,
"loss": 0.8769,
"step": 3000
},
{
"epoch": 0.17734654394090402,
"grad_norm": 0.19921875,
"learning_rate": 8.226534560590961e-06,
"loss": 1.0016,
"step": 3025
},
{
"epoch": 0.17881221785777102,
"grad_norm": 0.2060546875,
"learning_rate": 8.21187782142229e-06,
"loss": 0.8201,
"step": 3050
},
{
"epoch": 0.180277891774638,
"grad_norm": 0.287109375,
"learning_rate": 8.197221082253622e-06,
"loss": 0.9004,
"step": 3075
},
{
"epoch": 0.18174356569150496,
"grad_norm": 0.26953125,
"learning_rate": 8.18256434308495e-06,
"loss": 0.9973,
"step": 3100
},
{
"epoch": 0.18320923960837193,
"grad_norm": 0.2353515625,
"learning_rate": 8.167907603916281e-06,
"loss": 0.8702,
"step": 3125
},
{
"epoch": 0.1846749135252389,
"grad_norm": 0.21875,
"learning_rate": 8.153250864747612e-06,
"loss": 1.1181,
"step": 3150
},
{
"epoch": 0.18614058744210588,
"grad_norm": 0.671875,
"learning_rate": 8.138594125578942e-06,
"loss": 0.9449,
"step": 3175
},
{
"epoch": 0.18760626135897285,
"grad_norm": 0.392578125,
"learning_rate": 8.123937386410272e-06,
"loss": 1.0326,
"step": 3200
},
{
"epoch": 0.18907193527583982,
"grad_norm": 0.2099609375,
"learning_rate": 8.109280647241601e-06,
"loss": 0.836,
"step": 3225
},
{
"epoch": 0.19053760919270682,
"grad_norm": 0.201171875,
"learning_rate": 8.094623908072933e-06,
"loss": 0.8702,
"step": 3250
},
{
"epoch": 0.1920032831095738,
"grad_norm": 0.18359375,
"learning_rate": 8.079967168904262e-06,
"loss": 0.9676,
"step": 3275
},
{
"epoch": 0.19346895702644076,
"grad_norm": 0.31640625,
"learning_rate": 8.065310429735594e-06,
"loss": 0.9803,
"step": 3300
},
{
"epoch": 0.19493463094330774,
"grad_norm": 0.125,
"learning_rate": 8.050653690566923e-06,
"loss": 0.8786,
"step": 3325
},
{
"epoch": 0.1964003048601747,
"grad_norm": 0.216796875,
"learning_rate": 8.035996951398253e-06,
"loss": 0.8793,
"step": 3350
},
{
"epoch": 0.19786597877704168,
"grad_norm": 0.263671875,
"learning_rate": 8.021340212229584e-06,
"loss": 0.9167,
"step": 3375
},
{
"epoch": 0.19933165269390865,
"grad_norm": 0.1484375,
"learning_rate": 8.006683473060914e-06,
"loss": 0.8833,
"step": 3400
},
{
"epoch": 0.20079732661077562,
"grad_norm": 0.431640625,
"learning_rate": 7.992026733892244e-06,
"loss": 0.9054,
"step": 3425
},
{
"epoch": 0.20226300052764262,
"grad_norm": 0.287109375,
"learning_rate": 7.977369994723575e-06,
"loss": 0.9006,
"step": 3450
},
{
"epoch": 0.2037286744445096,
"grad_norm": 0.271484375,
"learning_rate": 7.962713255554905e-06,
"loss": 0.8351,
"step": 3475
},
{
"epoch": 0.20519434836137657,
"grad_norm": 0.2099609375,
"learning_rate": 7.948056516386236e-06,
"loss": 0.8815,
"step": 3500
},
{
"epoch": 0.20666002227824354,
"grad_norm": 0.396484375,
"learning_rate": 7.933399777217566e-06,
"loss": 0.8492,
"step": 3525
},
{
"epoch": 0.2081256961951105,
"grad_norm": 0.23046875,
"learning_rate": 7.918743038048895e-06,
"loss": 0.7399,
"step": 3550
},
{
"epoch": 0.20959137011197748,
"grad_norm": 0.2490234375,
"learning_rate": 7.904086298880225e-06,
"loss": 0.8195,
"step": 3575
},
{
"epoch": 0.21105704402884445,
"grad_norm": 0.2099609375,
"learning_rate": 7.889429559711556e-06,
"loss": 0.8085,
"step": 3600
},
{
"epoch": 0.21252271794571143,
"grad_norm": 0.2578125,
"learning_rate": 7.874772820542886e-06,
"loss": 0.9241,
"step": 3625
},
{
"epoch": 0.21398839186257843,
"grad_norm": 0.189453125,
"learning_rate": 7.860116081374216e-06,
"loss": 0.8614,
"step": 3650
},
{
"epoch": 0.2154540657794454,
"grad_norm": 0.279296875,
"learning_rate": 7.845459342205547e-06,
"loss": 0.8509,
"step": 3675
},
{
"epoch": 0.21691973969631237,
"grad_norm": 0.26171875,
"learning_rate": 7.830802603036877e-06,
"loss": 0.9153,
"step": 3700
},
{
"epoch": 0.21838541361317934,
"grad_norm": 0.435546875,
"learning_rate": 7.816145863868208e-06,
"loss": 0.857,
"step": 3725
},
{
"epoch": 0.2198510875300463,
"grad_norm": 0.1904296875,
"learning_rate": 7.801489124699538e-06,
"loss": 0.8623,
"step": 3750
},
{
"epoch": 0.22131676144691328,
"grad_norm": 0.2216796875,
"learning_rate": 7.786832385530868e-06,
"loss": 0.9246,
"step": 3775
},
{
"epoch": 0.22278243536378026,
"grad_norm": 0.2890625,
"learning_rate": 7.772175646362197e-06,
"loss": 0.9,
"step": 3800
},
{
"epoch": 0.22424810928064723,
"grad_norm": 0.294921875,
"learning_rate": 7.75751890719353e-06,
"loss": 1.0074,
"step": 3825
},
{
"epoch": 0.22571378319751423,
"grad_norm": 0.98046875,
"learning_rate": 7.742862168024858e-06,
"loss": 0.8933,
"step": 3850
},
{
"epoch": 0.2271794571143812,
"grad_norm": 0.462890625,
"learning_rate": 7.72820542885619e-06,
"loss": 0.8541,
"step": 3875
},
{
"epoch": 0.22864513103124817,
"grad_norm": 0.48828125,
"learning_rate": 7.713548689687519e-06,
"loss": 0.8043,
"step": 3900
},
{
"epoch": 0.23011080494811514,
"grad_norm": 0.365234375,
"learning_rate": 7.698891950518849e-06,
"loss": 0.9046,
"step": 3925
},
{
"epoch": 0.23157647886498212,
"grad_norm": 0.3671875,
"learning_rate": 7.68423521135018e-06,
"loss": 1.0241,
"step": 3950
},
{
"epoch": 0.2330421527818491,
"grad_norm": 0.412109375,
"learning_rate": 7.66957847218151e-06,
"loss": 0.8257,
"step": 3975
},
{
"epoch": 0.23450782669871606,
"grad_norm": 0.248046875,
"learning_rate": 7.65492173301284e-06,
"loss": 0.9551,
"step": 4000
},
{
"epoch": 0.23597350061558303,
"grad_norm": 0.2314453125,
"learning_rate": 7.640264993844169e-06,
"loss": 0.9757,
"step": 4025
},
{
"epoch": 0.23743917453245003,
"grad_norm": 0.287109375,
"learning_rate": 7.6256082546755e-06,
"loss": 0.9187,
"step": 4050
},
{
"epoch": 0.238904848449317,
"grad_norm": 0.5546875,
"learning_rate": 7.610951515506831e-06,
"loss": 0.9166,
"step": 4075
},
{
"epoch": 0.24037052236618398,
"grad_norm": 0.275390625,
"learning_rate": 7.596294776338161e-06,
"loss": 0.9277,
"step": 4100
},
{
"epoch": 0.24183619628305095,
"grad_norm": 0.16796875,
"learning_rate": 7.581638037169491e-06,
"loss": 0.8612,
"step": 4125
},
{
"epoch": 0.24330187019991792,
"grad_norm": 0.357421875,
"learning_rate": 7.566981298000822e-06,
"loss": 0.8533,
"step": 4150
},
{
"epoch": 0.2447675441167849,
"grad_norm": 0.2470703125,
"learning_rate": 7.5523245588321515e-06,
"loss": 0.8259,
"step": 4175
},
{
"epoch": 0.24623321803365186,
"grad_norm": 0.126953125,
"learning_rate": 7.537667819663482e-06,
"loss": 0.882,
"step": 4200
},
{
"epoch": 0.24769889195051886,
"grad_norm": 0.193359375,
"learning_rate": 7.523011080494812e-06,
"loss": 0.9179,
"step": 4225
},
{
"epoch": 0.24916456586738583,
"grad_norm": 0.2138671875,
"learning_rate": 7.508354341326143e-06,
"loss": 0.7686,
"step": 4250
},
{
"epoch": 0.2506302397842528,
"grad_norm": 0.1982421875,
"learning_rate": 7.493697602157472e-06,
"loss": 0.854,
"step": 4275
},
{
"epoch": 0.2520959137011198,
"grad_norm": 0.2275390625,
"learning_rate": 7.4790408629888035e-06,
"loss": 0.9073,
"step": 4300
},
{
"epoch": 0.25356158761798675,
"grad_norm": 0.232421875,
"learning_rate": 7.464384123820133e-06,
"loss": 0.8411,
"step": 4325
},
{
"epoch": 0.2550272615348537,
"grad_norm": 0.2578125,
"learning_rate": 7.449727384651463e-06,
"loss": 0.911,
"step": 4350
},
{
"epoch": 0.2564929354517207,
"grad_norm": 0.2158203125,
"learning_rate": 7.435070645482794e-06,
"loss": 0.8707,
"step": 4375
},
{
"epoch": 0.25795860936858767,
"grad_norm": 0.1962890625,
"learning_rate": 7.420413906314123e-06,
"loss": 0.8607,
"step": 4400
},
{
"epoch": 0.25942428328545464,
"grad_norm": 0.45703125,
"learning_rate": 7.405757167145454e-06,
"loss": 0.8238,
"step": 4425
},
{
"epoch": 0.2608899572023216,
"grad_norm": 0.255859375,
"learning_rate": 7.391100427976784e-06,
"loss": 0.9302,
"step": 4450
},
{
"epoch": 0.2623556311191886,
"grad_norm": 0.59375,
"learning_rate": 7.376443688808115e-06,
"loss": 0.8033,
"step": 4475
},
{
"epoch": 0.26382130503605555,
"grad_norm": 0.2060546875,
"learning_rate": 7.361786949639444e-06,
"loss": 0.8793,
"step": 4500
},
{
"epoch": 0.2652869789529226,
"grad_norm": 0.27734375,
"learning_rate": 7.3471302104707754e-06,
"loss": 0.8292,
"step": 4525
},
{
"epoch": 0.26675265286978955,
"grad_norm": 0.232421875,
"learning_rate": 7.332473471302105e-06,
"loss": 0.8131,
"step": 4550
},
{
"epoch": 0.2682183267866565,
"grad_norm": 0.51171875,
"learning_rate": 7.317816732133436e-06,
"loss": 0.8427,
"step": 4575
},
{
"epoch": 0.2696840007035235,
"grad_norm": 0.333984375,
"learning_rate": 7.303159992964766e-06,
"loss": 0.9036,
"step": 4600
},
{
"epoch": 0.27114967462039047,
"grad_norm": 0.484375,
"learning_rate": 7.288503253796096e-06,
"loss": 0.9235,
"step": 4625
},
{
"epoch": 0.27261534853725744,
"grad_norm": 0.201171875,
"learning_rate": 7.273846514627426e-06,
"loss": 0.9909,
"step": 4650
},
{
"epoch": 0.2740810224541244,
"grad_norm": 0.373046875,
"learning_rate": 7.259189775458757e-06,
"loss": 0.9281,
"step": 4675
},
{
"epoch": 0.2755466963709914,
"grad_norm": 0.189453125,
"learning_rate": 7.244533036290087e-06,
"loss": 0.8731,
"step": 4700
},
{
"epoch": 0.27701237028785836,
"grad_norm": 0.15234375,
"learning_rate": 7.229876297121416e-06,
"loss": 0.8967,
"step": 4725
},
{
"epoch": 0.27847804420472533,
"grad_norm": 0.376953125,
"learning_rate": 7.215219557952747e-06,
"loss": 0.8448,
"step": 4750
},
{
"epoch": 0.2799437181215923,
"grad_norm": 0.376953125,
"learning_rate": 7.200562818784077e-06,
"loss": 0.846,
"step": 4775
},
{
"epoch": 0.28140939203845927,
"grad_norm": 0.2412109375,
"learning_rate": 7.185906079615408e-06,
"loss": 0.785,
"step": 4800
},
{
"epoch": 0.28287506595532624,
"grad_norm": 0.396484375,
"learning_rate": 7.171249340446738e-06,
"loss": 0.995,
"step": 4825
},
{
"epoch": 0.2843407398721932,
"grad_norm": 0.162109375,
"learning_rate": 7.156592601278068e-06,
"loss": 0.879,
"step": 4850
},
{
"epoch": 0.2858064137890602,
"grad_norm": 0.1748046875,
"learning_rate": 7.141935862109399e-06,
"loss": 0.9504,
"step": 4875
},
{
"epoch": 0.28727208770592716,
"grad_norm": 0.2080078125,
"learning_rate": 7.127279122940729e-06,
"loss": 1.0018,
"step": 4900
},
{
"epoch": 0.2887377616227942,
"grad_norm": 0.26171875,
"learning_rate": 7.1126223837720585e-06,
"loss": 0.9797,
"step": 4925
},
{
"epoch": 0.29020343553966116,
"grad_norm": 0.271484375,
"learning_rate": 7.09796564460339e-06,
"loss": 0.8494,
"step": 4950
},
{
"epoch": 0.29166910945652813,
"grad_norm": 0.341796875,
"learning_rate": 7.083308905434719e-06,
"loss": 0.9362,
"step": 4975
},
{
"epoch": 0.2931347833733951,
"grad_norm": 0.146484375,
"learning_rate": 7.06865216626605e-06,
"loss": 0.7694,
"step": 5000
},
{
"epoch": 0.2946004572902621,
"grad_norm": 0.166015625,
"learning_rate": 7.05399542709738e-06,
"loss": 0.9893,
"step": 5025
},
{
"epoch": 0.29606613120712905,
"grad_norm": 0.333984375,
"learning_rate": 7.0393386879287106e-06,
"loss": 0.8555,
"step": 5050
},
{
"epoch": 0.297531805123996,
"grad_norm": 0.3046875,
"learning_rate": 7.02468194876004e-06,
"loss": 0.8452,
"step": 5075
},
{
"epoch": 0.298997479040863,
"grad_norm": 0.21484375,
"learning_rate": 7.010025209591371e-06,
"loss": 0.8275,
"step": 5100
},
{
"epoch": 0.30046315295772996,
"grad_norm": 0.185546875,
"learning_rate": 6.995368470422701e-06,
"loss": 0.7464,
"step": 5125
},
{
"epoch": 0.30192882687459693,
"grad_norm": 0.140625,
"learning_rate": 6.9807117312540305e-06,
"loss": 0.8753,
"step": 5150
},
{
"epoch": 0.3033945007914639,
"grad_norm": 0.2734375,
"learning_rate": 6.966054992085362e-06,
"loss": 1.016,
"step": 5175
},
{
"epoch": 0.3048601747083309,
"grad_norm": 0.1796875,
"learning_rate": 6.951398252916691e-06,
"loss": 0.8956,
"step": 5200
},
{
"epoch": 0.30632584862519785,
"grad_norm": 0.185546875,
"learning_rate": 6.936741513748022e-06,
"loss": 0.8358,
"step": 5225
},
{
"epoch": 0.3077915225420648,
"grad_norm": 0.23046875,
"learning_rate": 6.922084774579352e-06,
"loss": 0.9687,
"step": 5250
},
{
"epoch": 0.3092571964589318,
"grad_norm": 0.216796875,
"learning_rate": 6.9074280354106825e-06,
"loss": 0.8358,
"step": 5275
},
{
"epoch": 0.31072287037579877,
"grad_norm": 0.451171875,
"learning_rate": 6.892771296242012e-06,
"loss": 0.8345,
"step": 5300
},
{
"epoch": 0.3121885442926658,
"grad_norm": 0.40234375,
"learning_rate": 6.878114557073343e-06,
"loss": 0.9074,
"step": 5325
},
{
"epoch": 0.31365421820953276,
"grad_norm": 4.96875,
"learning_rate": 6.863457817904673e-06,
"loss": 0.8969,
"step": 5350
},
{
"epoch": 0.31511989212639974,
"grad_norm": 0.2373046875,
"learning_rate": 6.848801078736004e-06,
"loss": 0.7103,
"step": 5375
},
{
"epoch": 0.3165855660432667,
"grad_norm": 0.1552734375,
"learning_rate": 6.834144339567334e-06,
"loss": 0.8087,
"step": 5400
},
{
"epoch": 0.3180512399601337,
"grad_norm": 0.1982421875,
"learning_rate": 6.819487600398664e-06,
"loss": 0.9289,
"step": 5425
},
{
"epoch": 0.31951691387700065,
"grad_norm": 0.24609375,
"learning_rate": 6.804830861229994e-06,
"loss": 0.8306,
"step": 5450
},
{
"epoch": 0.3209825877938676,
"grad_norm": 0.19921875,
"learning_rate": 6.790174122061325e-06,
"loss": 0.793,
"step": 5475
},
{
"epoch": 0.3224482617107346,
"grad_norm": 0.7109375,
"learning_rate": 6.7755173828926545e-06,
"loss": 0.768,
"step": 5500
},
{
"epoch": 0.32391393562760157,
"grad_norm": 0.2041015625,
"learning_rate": 6.760860643723984e-06,
"loss": 0.8649,
"step": 5525
},
{
"epoch": 0.32537960954446854,
"grad_norm": 0.2373046875,
"learning_rate": 6.746203904555315e-06,
"loss": 0.9173,
"step": 5550
},
{
"epoch": 0.3268452834613355,
"grad_norm": 0.341796875,
"learning_rate": 6.731547165386645e-06,
"loss": 0.8569,
"step": 5575
},
{
"epoch": 0.3283109573782025,
"grad_norm": 0.28515625,
"learning_rate": 6.716890426217976e-06,
"loss": 0.8013,
"step": 5600
},
{
"epoch": 0.32977663129506946,
"grad_norm": 0.1767578125,
"learning_rate": 6.702233687049306e-06,
"loss": 0.9285,
"step": 5625
},
{
"epoch": 0.3312423052119364,
"grad_norm": 0.357421875,
"learning_rate": 6.687576947880636e-06,
"loss": 0.9202,
"step": 5650
},
{
"epoch": 0.3327079791288034,
"grad_norm": 0.61328125,
"learning_rate": 6.672920208711966e-06,
"loss": 0.8727,
"step": 5675
},
{
"epoch": 0.3341736530456704,
"grad_norm": 0.205078125,
"learning_rate": 6.658263469543297e-06,
"loss": 0.8877,
"step": 5700
},
{
"epoch": 0.3356393269625374,
"grad_norm": 0.28515625,
"learning_rate": 6.6436067303746264e-06,
"loss": 0.7748,
"step": 5725
},
{
"epoch": 0.33710500087940437,
"grad_norm": 0.2578125,
"learning_rate": 6.628949991205958e-06,
"loss": 0.805,
"step": 5750
},
{
"epoch": 0.33857067479627134,
"grad_norm": 0.341796875,
"learning_rate": 6.614293252037287e-06,
"loss": 0.8786,
"step": 5775
},
{
"epoch": 0.3400363487131383,
"grad_norm": 0.64453125,
"learning_rate": 6.599636512868618e-06,
"loss": 0.8441,
"step": 5800
},
{
"epoch": 0.3415020226300053,
"grad_norm": 0.228515625,
"learning_rate": 6.584979773699948e-06,
"loss": 0.8751,
"step": 5825
},
{
"epoch": 0.34296769654687226,
"grad_norm": 0.37109375,
"learning_rate": 6.5703230345312785e-06,
"loss": 0.9569,
"step": 5850
},
{
"epoch": 0.34443337046373923,
"grad_norm": 0.294921875,
"learning_rate": 6.555666295362608e-06,
"loss": 0.9255,
"step": 5875
},
{
"epoch": 0.3458990443806062,
"grad_norm": 0.3125,
"learning_rate": 6.541009556193938e-06,
"loss": 0.8501,
"step": 5900
},
{
"epoch": 0.3473647182974732,
"grad_norm": 0.2001953125,
"learning_rate": 6.526352817025269e-06,
"loss": 0.8933,
"step": 5925
},
{
"epoch": 0.34883039221434015,
"grad_norm": 0.19921875,
"learning_rate": 6.511696077856598e-06,
"loss": 0.8941,
"step": 5950
},
{
"epoch": 0.3502960661312071,
"grad_norm": 0.25390625,
"learning_rate": 6.49703933868793e-06,
"loss": 1.0143,
"step": 5975
},
{
"epoch": 0.3517617400480741,
"grad_norm": 0.279296875,
"learning_rate": 6.482382599519259e-06,
"loss": 0.8929,
"step": 6000
},
{
"epoch": 0.35322741396494106,
"grad_norm": 0.333984375,
"learning_rate": 6.46772586035059e-06,
"loss": 0.8641,
"step": 6025
},
{
"epoch": 0.35469308788180803,
"grad_norm": 0.263671875,
"learning_rate": 6.45306912118192e-06,
"loss": 0.8743,
"step": 6050
},
{
"epoch": 0.356158761798675,
"grad_norm": 0.333984375,
"learning_rate": 6.4384123820132504e-06,
"loss": 0.8072,
"step": 6075
},
{
"epoch": 0.35762443571554203,
"grad_norm": 0.345703125,
"learning_rate": 6.42375564284458e-06,
"loss": 0.8964,
"step": 6100
},
{
"epoch": 0.359090109632409,
"grad_norm": 0.27734375,
"learning_rate": 6.409098903675911e-06,
"loss": 0.9049,
"step": 6125
},
{
"epoch": 0.360555783549276,
"grad_norm": 0.17578125,
"learning_rate": 6.394442164507241e-06,
"loss": 0.9438,
"step": 6150
},
{
"epoch": 0.36202145746614295,
"grad_norm": 0.255859375,
"learning_rate": 6.379785425338571e-06,
"loss": 0.8891,
"step": 6175
},
{
"epoch": 0.3634871313830099,
"grad_norm": 0.130859375,
"learning_rate": 6.365128686169902e-06,
"loss": 1.1601,
"step": 6200
},
{
"epoch": 0.3649528052998769,
"grad_norm": 0.216796875,
"learning_rate": 6.350471947001232e-06,
"loss": 0.968,
"step": 6225
},
{
"epoch": 0.36641847921674386,
"grad_norm": 0.7109375,
"learning_rate": 6.3358152078325616e-06,
"loss": 0.962,
"step": 6250
},
{
"epoch": 0.36788415313361084,
"grad_norm": 0.4453125,
"learning_rate": 6.321158468663893e-06,
"loss": 0.8971,
"step": 6275
},
{
"epoch": 0.3693498270504778,
"grad_norm": 0.2255859375,
"learning_rate": 6.306501729495222e-06,
"loss": 0.8835,
"step": 6300
},
{
"epoch": 0.3708155009673448,
"grad_norm": 0.361328125,
"learning_rate": 6.291844990326552e-06,
"loss": 0.8783,
"step": 6325
},
{
"epoch": 0.37228117488421175,
"grad_norm": 0.490234375,
"learning_rate": 6.277188251157883e-06,
"loss": 0.8959,
"step": 6350
},
{
"epoch": 0.3737468488010787,
"grad_norm": 0.3125,
"learning_rate": 6.262531511989213e-06,
"loss": 0.8945,
"step": 6375
},
{
"epoch": 0.3752125227179457,
"grad_norm": 0.173828125,
"learning_rate": 6.247874772820543e-06,
"loss": 0.9492,
"step": 6400
},
{
"epoch": 0.37667819663481267,
"grad_norm": 0.37890625,
"learning_rate": 6.2332180336518736e-06,
"loss": 0.8728,
"step": 6425
},
{
"epoch": 0.37814387055167964,
"grad_norm": 0.255859375,
"learning_rate": 6.218561294483204e-06,
"loss": 0.8508,
"step": 6450
},
{
"epoch": 0.3796095444685466,
"grad_norm": 0.1884765625,
"learning_rate": 6.2039045553145335e-06,
"loss": 0.9759,
"step": 6475
},
{
"epoch": 0.38107521838541364,
"grad_norm": 0.216796875,
"learning_rate": 6.189247816145865e-06,
"loss": 1.0576,
"step": 6500
},
{
"epoch": 0.3825408923022806,
"grad_norm": 0.240234375,
"learning_rate": 6.174591076977194e-06,
"loss": 0.7754,
"step": 6525
},
{
"epoch": 0.3840065662191476,
"grad_norm": 0.189453125,
"learning_rate": 6.159934337808526e-06,
"loss": 0.9326,
"step": 6550
},
{
"epoch": 0.38547224013601455,
"grad_norm": 0.1796875,
"learning_rate": 6.145277598639855e-06,
"loss": 0.8764,
"step": 6575
},
{
"epoch": 0.3869379140528815,
"grad_norm": 0.2314453125,
"learning_rate": 6.1306208594711856e-06,
"loss": 0.8589,
"step": 6600
},
{
"epoch": 0.3884035879697485,
"grad_norm": 1.1953125,
"learning_rate": 6.115964120302515e-06,
"loss": 1.0026,
"step": 6625
},
{
"epoch": 0.38986926188661547,
"grad_norm": 0.19140625,
"learning_rate": 6.101307381133846e-06,
"loss": 0.8483,
"step": 6650
},
{
"epoch": 0.39133493580348244,
"grad_norm": 0.294921875,
"learning_rate": 6.086650641965176e-06,
"loss": 0.9063,
"step": 6675
},
{
"epoch": 0.3928006097203494,
"grad_norm": 0.353515625,
"learning_rate": 6.0719939027965055e-06,
"loss": 0.8658,
"step": 6700
},
{
"epoch": 0.3942662836372164,
"grad_norm": 0.2197265625,
"learning_rate": 6.057337163627837e-06,
"loss": 0.8389,
"step": 6725
},
{
"epoch": 0.39573195755408336,
"grad_norm": 0.94921875,
"learning_rate": 6.042680424459166e-06,
"loss": 0.8769,
"step": 6750
},
{
"epoch": 0.39719763147095033,
"grad_norm": 1.1796875,
"learning_rate": 6.0280236852904975e-06,
"loss": 0.9461,
"step": 6775
},
{
"epoch": 0.3986633053878173,
"grad_norm": 0.17578125,
"learning_rate": 6.013366946121827e-06,
"loss": 0.9144,
"step": 6800
},
{
"epoch": 0.4001289793046843,
"grad_norm": 0.2431640625,
"learning_rate": 5.9987102069531575e-06,
"loss": 0.8746,
"step": 6825
},
{
"epoch": 0.40159465322155125,
"grad_norm": 0.60546875,
"learning_rate": 5.984053467784487e-06,
"loss": 0.8977,
"step": 6850
},
{
"epoch": 0.4030603271384182,
"grad_norm": 0.478515625,
"learning_rate": 5.969396728615818e-06,
"loss": 0.8404,
"step": 6875
},
{
"epoch": 0.40452600105528524,
"grad_norm": 0.326171875,
"learning_rate": 5.954739989447148e-06,
"loss": 0.8321,
"step": 6900
},
{
"epoch": 0.4059916749721522,
"grad_norm": 0.169921875,
"learning_rate": 5.940083250278479e-06,
"loss": 0.8963,
"step": 6925
},
{
"epoch": 0.4074573488890192,
"grad_norm": 0.234375,
"learning_rate": 5.925426511109809e-06,
"loss": 0.9668,
"step": 6950
},
{
"epoch": 0.40892302280588616,
"grad_norm": 0.15625,
"learning_rate": 5.910769771941139e-06,
"loss": 0.8921,
"step": 6975
},
{
"epoch": 0.41038869672275313,
"grad_norm": 0.2734375,
"learning_rate": 5.8961130327724695e-06,
"loss": 0.9034,
"step": 7000
},
{
"epoch": 0.4118543706396201,
"grad_norm": 0.236328125,
"learning_rate": 5.8814562936038e-06,
"loss": 0.873,
"step": 7025
},
{
"epoch": 0.4133200445564871,
"grad_norm": 0.28515625,
"learning_rate": 5.8667995544351295e-06,
"loss": 0.9082,
"step": 7050
},
{
"epoch": 0.41478571847335405,
"grad_norm": 3.171875,
"learning_rate": 5.852142815266459e-06,
"loss": 0.9186,
"step": 7075
},
{
"epoch": 0.416251392390221,
"grad_norm": 0.1396484375,
"learning_rate": 5.83748607609779e-06,
"loss": 0.9711,
"step": 7100
},
{
"epoch": 0.417717066307088,
"grad_norm": 0.8046875,
"learning_rate": 5.82282933692912e-06,
"loss": 0.8168,
"step": 7125
},
{
"epoch": 0.41918274022395496,
"grad_norm": 0.205078125,
"learning_rate": 5.808172597760451e-06,
"loss": 0.926,
"step": 7150
},
{
"epoch": 0.42064841414082194,
"grad_norm": 0.134765625,
"learning_rate": 5.793515858591781e-06,
"loss": 0.7931,
"step": 7175
},
{
"epoch": 0.4221140880576889,
"grad_norm": 0.1904296875,
"learning_rate": 5.778859119423111e-06,
"loss": 0.9143,
"step": 7200
},
{
"epoch": 0.4235797619745559,
"grad_norm": 0.298828125,
"learning_rate": 5.7642023802544415e-06,
"loss": 0.846,
"step": 7225
},
{
"epoch": 0.42504543589142285,
"grad_norm": 0.328125,
"learning_rate": 5.749545641085772e-06,
"loss": 0.8936,
"step": 7250
},
{
"epoch": 0.4265111098082899,
"grad_norm": 0.1923828125,
"learning_rate": 5.7348889019171014e-06,
"loss": 0.9142,
"step": 7275
},
{
"epoch": 0.42797678372515685,
"grad_norm": 0.470703125,
"learning_rate": 5.720232162748433e-06,
"loss": 0.9059,
"step": 7300
},
{
"epoch": 0.4294424576420238,
"grad_norm": 1.1015625,
"learning_rate": 5.705575423579762e-06,
"loss": 0.7794,
"step": 7325
},
{
"epoch": 0.4309081315588908,
"grad_norm": 0.2060546875,
"learning_rate": 5.6909186844110935e-06,
"loss": 0.7724,
"step": 7350
},
{
"epoch": 0.43237380547575777,
"grad_norm": 0.169921875,
"learning_rate": 5.676261945242423e-06,
"loss": 0.9143,
"step": 7375
},
{
"epoch": 0.43383947939262474,
"grad_norm": 0.2275390625,
"learning_rate": 5.6616052060737535e-06,
"loss": 0.8556,
"step": 7400
},
{
"epoch": 0.4353051533094917,
"grad_norm": 0.259765625,
"learning_rate": 5.646948466905083e-06,
"loss": 0.8668,
"step": 7425
},
{
"epoch": 0.4367708272263587,
"grad_norm": 0.6328125,
"learning_rate": 5.632291727736414e-06,
"loss": 0.87,
"step": 7450
},
{
"epoch": 0.43823650114322565,
"grad_norm": 0.173828125,
"learning_rate": 5.617634988567744e-06,
"loss": 0.8162,
"step": 7475
},
{
"epoch": 0.4397021750600926,
"grad_norm": 0.279296875,
"learning_rate": 5.602978249399073e-06,
"loss": 0.8705,
"step": 7500
},
{
"epoch": 0.4411678489769596,
"grad_norm": 0.26953125,
"learning_rate": 5.588321510230405e-06,
"loss": 0.8221,
"step": 7525
},
{
"epoch": 0.44263352289382657,
"grad_norm": 0.236328125,
"learning_rate": 5.573664771061734e-06,
"loss": 0.9005,
"step": 7550
},
{
"epoch": 0.44409919681069354,
"grad_norm": 0.2890625,
"learning_rate": 5.5590080318930654e-06,
"loss": 0.8577,
"step": 7575
},
{
"epoch": 0.4455648707275605,
"grad_norm": 0.28125,
"learning_rate": 5.544351292724395e-06,
"loss": 0.9489,
"step": 7600
},
{
"epoch": 0.4470305446444275,
"grad_norm": 0.29296875,
"learning_rate": 5.529694553555725e-06,
"loss": 0.78,
"step": 7625
},
{
"epoch": 0.44849621856129446,
"grad_norm": 0.390625,
"learning_rate": 5.515037814387055e-06,
"loss": 0.7977,
"step": 7650
},
{
"epoch": 0.4499618924781615,
"grad_norm": 0.203125,
"learning_rate": 5.500381075218386e-06,
"loss": 0.8348,
"step": 7675
},
{
"epoch": 0.45142756639502846,
"grad_norm": 0.287109375,
"learning_rate": 5.485724336049716e-06,
"loss": 0.9294,
"step": 7700
},
{
"epoch": 0.45289324031189543,
"grad_norm": 0.333984375,
"learning_rate": 5.471067596881047e-06,
"loss": 0.905,
"step": 7725
},
{
"epoch": 0.4543589142287624,
"grad_norm": 0.2490234375,
"learning_rate": 5.456410857712377e-06,
"loss": 0.865,
"step": 7750
},
{
"epoch": 0.4558245881456294,
"grad_norm": 0.2578125,
"learning_rate": 5.441754118543707e-06,
"loss": 0.8706,
"step": 7775
},
{
"epoch": 0.45729026206249634,
"grad_norm": 1.2109375,
"learning_rate": 5.427097379375037e-06,
"loss": 1.0095,
"step": 7800
},
{
"epoch": 0.4587559359793633,
"grad_norm": 0.177734375,
"learning_rate": 5.412440640206368e-06,
"loss": 0.801,
"step": 7825
},
{
"epoch": 0.4602216098962303,
"grad_norm": 0.2578125,
"learning_rate": 5.397783901037697e-06,
"loss": 0.8175,
"step": 7850
},
{
"epoch": 0.46168728381309726,
"grad_norm": 0.298828125,
"learning_rate": 5.383127161869027e-06,
"loss": 0.9689,
"step": 7875
},
{
"epoch": 0.46315295772996423,
"grad_norm": 0.2373046875,
"learning_rate": 5.368470422700358e-06,
"loss": 0.8495,
"step": 7900
},
{
"epoch": 0.4646186316468312,
"grad_norm": 0.1796875,
"learning_rate": 5.353813683531688e-06,
"loss": 0.9133,
"step": 7925
},
{
"epoch": 0.4660843055636982,
"grad_norm": 0.2021484375,
"learning_rate": 5.339156944363019e-06,
"loss": 0.8828,
"step": 7950
},
{
"epoch": 0.46754997948056515,
"grad_norm": 0.2578125,
"learning_rate": 5.3245002051943485e-06,
"loss": 1.0062,
"step": 7975
},
{
"epoch": 0.4690156533974321,
"grad_norm": 0.240234375,
"learning_rate": 5.309843466025679e-06,
"loss": 0.8171,
"step": 8000
},
{
"epoch": 0.4704813273142991,
"grad_norm": 0.28515625,
"learning_rate": 5.295186726857009e-06,
"loss": 0.8704,
"step": 8025
},
{
"epoch": 0.47194700123116606,
"grad_norm": 0.4140625,
"learning_rate": 5.28052998768834e-06,
"loss": 0.866,
"step": 8050
},
{
"epoch": 0.4734126751480331,
"grad_norm": 0.15625,
"learning_rate": 5.265873248519669e-06,
"loss": 0.8298,
"step": 8075
},
{
"epoch": 0.47487834906490006,
"grad_norm": 0.330078125,
"learning_rate": 5.251216509351001e-06,
"loss": 0.8654,
"step": 8100
},
{
"epoch": 0.47634402298176703,
"grad_norm": 0.205078125,
"learning_rate": 5.23655977018233e-06,
"loss": 0.9916,
"step": 8125
},
{
"epoch": 0.477809696898634,
"grad_norm": 0.9609375,
"learning_rate": 5.2219030310136605e-06,
"loss": 0.9645,
"step": 8150
},
{
"epoch": 0.479275370815501,
"grad_norm": 0.314453125,
"learning_rate": 5.207246291844991e-06,
"loss": 0.8867,
"step": 8175
},
{
"epoch": 0.48074104473236795,
"grad_norm": 0.326171875,
"learning_rate": 5.192589552676321e-06,
"loss": 0.9363,
"step": 8200
},
{
"epoch": 0.4822067186492349,
"grad_norm": 0.283203125,
"learning_rate": 5.177932813507651e-06,
"loss": 0.7848,
"step": 8225
},
{
"epoch": 0.4836723925661019,
"grad_norm": 0.33203125,
"learning_rate": 5.163276074338982e-06,
"loss": 0.8899,
"step": 8250
},
{
"epoch": 0.48513806648296887,
"grad_norm": 0.1748046875,
"learning_rate": 5.148619335170312e-06,
"loss": 0.8092,
"step": 8275
},
{
"epoch": 0.48660374039983584,
"grad_norm": 0.41796875,
"learning_rate": 5.133962596001641e-06,
"loss": 0.899,
"step": 8300
},
{
"epoch": 0.4880694143167028,
"grad_norm": 0.169921875,
"learning_rate": 5.1193058568329725e-06,
"loss": 0.9528,
"step": 8325
},
{
"epoch": 0.4895350882335698,
"grad_norm": 0.1474609375,
"learning_rate": 5.104649117664302e-06,
"loss": 0.8668,
"step": 8350
},
{
"epoch": 0.49100076215043675,
"grad_norm": 0.3828125,
"learning_rate": 5.0899923784956325e-06,
"loss": 0.9176,
"step": 8375
},
{
"epoch": 0.4924664360673037,
"grad_norm": 0.44140625,
"learning_rate": 5.075335639326963e-06,
"loss": 0.9135,
"step": 8400
},
{
"epoch": 0.4939321099841707,
"grad_norm": 0.279296875,
"learning_rate": 5.060678900158293e-06,
"loss": 0.8201,
"step": 8425
},
{
"epoch": 0.4953977839010377,
"grad_norm": 0.62109375,
"learning_rate": 5.046022160989623e-06,
"loss": 0.8947,
"step": 8450
},
{
"epoch": 0.4968634578179047,
"grad_norm": 0.279296875,
"learning_rate": 5.031365421820954e-06,
"loss": 0.8838,
"step": 8475
},
{
"epoch": 0.49832913173477167,
"grad_norm": 0.1962890625,
"learning_rate": 5.016708682652284e-06,
"loss": 0.8648,
"step": 8500
},
{
"epoch": 0.49979480565163864,
"grad_norm": 2.390625,
"learning_rate": 5.002051943483615e-06,
"loss": 0.9247,
"step": 8525
},
{
"epoch": 0.5012604795685056,
"grad_norm": 0.2333984375,
"learning_rate": 4.9873952043149445e-06,
"loss": 0.8839,
"step": 8550
},
{
"epoch": 0.5027261534853725,
"grad_norm": 0.51953125,
"learning_rate": 4.972738465146275e-06,
"loss": 0.9194,
"step": 8575
},
{
"epoch": 0.5041918274022396,
"grad_norm": 0.26953125,
"learning_rate": 4.9580817259776045e-06,
"loss": 0.8232,
"step": 8600
},
{
"epoch": 0.5056575013191065,
"grad_norm": 0.37890625,
"learning_rate": 4.943424986808935e-06,
"loss": 1.0866,
"step": 8625
},
{
"epoch": 0.5071231752359735,
"grad_norm": 0.23046875,
"learning_rate": 4.928768247640265e-06,
"loss": 0.9213,
"step": 8650
},
{
"epoch": 0.5085888491528405,
"grad_norm": 0.2236328125,
"learning_rate": 4.914111508471596e-06,
"loss": 0.993,
"step": 8675
},
{
"epoch": 0.5100545230697074,
"grad_norm": 0.5234375,
"learning_rate": 4.899454769302926e-06,
"loss": 0.9335,
"step": 8700
},
{
"epoch": 0.5115201969865745,
"grad_norm": 0.3125,
"learning_rate": 4.8847980301342565e-06,
"loss": 0.9327,
"step": 8725
},
{
"epoch": 0.5129858709034414,
"grad_norm": 0.515625,
"learning_rate": 4.870141290965587e-06,
"loss": 0.8948,
"step": 8750
},
{
"epoch": 0.5144515448203084,
"grad_norm": 0.1591796875,
"learning_rate": 4.8554845517969164e-06,
"loss": 0.9082,
"step": 8775
},
{
"epoch": 0.5159172187371753,
"grad_norm": 0.2138671875,
"learning_rate": 4.840827812628247e-06,
"loss": 0.894,
"step": 8800
},
{
"epoch": 0.5173828926540424,
"grad_norm": 0.21484375,
"learning_rate": 4.826171073459577e-06,
"loss": 0.9065,
"step": 8825
},
{
"epoch": 0.5188485665709093,
"grad_norm": 0.1669921875,
"learning_rate": 4.811514334290907e-06,
"loss": 0.8453,
"step": 8850
},
{
"epoch": 0.5203142404877763,
"grad_norm": 0.21484375,
"learning_rate": 4.796857595122237e-06,
"loss": 1.0002,
"step": 8875
},
{
"epoch": 0.5217799144046432,
"grad_norm": 0.9921875,
"learning_rate": 4.782200855953568e-06,
"loss": 0.9558,
"step": 8900
},
{
"epoch": 0.5232455883215102,
"grad_norm": 0.1884765625,
"learning_rate": 4.767544116784898e-06,
"loss": 0.7789,
"step": 8925
},
{
"epoch": 0.5247112622383772,
"grad_norm": 0.365234375,
"learning_rate": 4.7528873776162284e-06,
"loss": 0.8602,
"step": 8950
},
{
"epoch": 0.5261769361552442,
"grad_norm": 0.451171875,
"learning_rate": 4.738230638447559e-06,
"loss": 0.849,
"step": 8975
},
{
"epoch": 0.5276426100721111,
"grad_norm": 0.1591796875,
"learning_rate": 4.723573899278888e-06,
"loss": 0.8461,
"step": 9000
},
{
"epoch": 0.5291082839889781,
"grad_norm": 0.47265625,
"learning_rate": 4.708917160110219e-06,
"loss": 0.8729,
"step": 9025
},
{
"epoch": 0.5305739579058452,
"grad_norm": 0.341796875,
"learning_rate": 4.694260420941549e-06,
"loss": 1.0451,
"step": 9050
},
{
"epoch": 0.5320396318227121,
"grad_norm": 0.1943359375,
"learning_rate": 4.67960368177288e-06,
"loss": 0.817,
"step": 9075
},
{
"epoch": 0.5335053057395791,
"grad_norm": 0.2119140625,
"learning_rate": 4.66494694260421e-06,
"loss": 0.9171,
"step": 9100
},
{
"epoch": 0.534970979656446,
"grad_norm": 0.353515625,
"learning_rate": 4.6502902034355404e-06,
"loss": 0.7783,
"step": 9125
},
{
"epoch": 0.536436653573313,
"grad_norm": 0.44921875,
"learning_rate": 4.635633464266871e-06,
"loss": 0.8943,
"step": 9150
},
{
"epoch": 0.53790232749018,
"grad_norm": 0.2490234375,
"learning_rate": 4.6209767250982e-06,
"loss": 0.9554,
"step": 9175
},
{
"epoch": 0.539368001407047,
"grad_norm": 0.162109375,
"learning_rate": 4.606319985929531e-06,
"loss": 0.7875,
"step": 9200
},
{
"epoch": 0.5408336753239139,
"grad_norm": 0.33984375,
"learning_rate": 4.591663246760861e-06,
"loss": 0.9086,
"step": 9225
},
{
"epoch": 0.5422993492407809,
"grad_norm": 0.279296875,
"learning_rate": 4.577006507592191e-06,
"loss": 0.9511,
"step": 9250
},
{
"epoch": 0.5437650231576479,
"grad_norm": 0.2353515625,
"learning_rate": 4.562349768423521e-06,
"loss": 0.8182,
"step": 9275
},
{
"epoch": 0.5452306970745149,
"grad_norm": 0.302734375,
"learning_rate": 4.547693029254852e-06,
"loss": 0.8487,
"step": 9300
},
{
"epoch": 0.5466963709913818,
"grad_norm": 0.1845703125,
"learning_rate": 4.533036290086182e-06,
"loss": 0.8708,
"step": 9325
},
{
"epoch": 0.5481620449082488,
"grad_norm": 0.265625,
"learning_rate": 4.518379550917512e-06,
"loss": 0.8577,
"step": 9350
},
{
"epoch": 0.5496277188251157,
"grad_norm": 0.36328125,
"learning_rate": 4.503722811748843e-06,
"loss": 0.8896,
"step": 9375
},
{
"epoch": 0.5510933927419828,
"grad_norm": 0.2314453125,
"learning_rate": 4.489066072580172e-06,
"loss": 0.8778,
"step": 9400
},
{
"epoch": 0.5525590666588497,
"grad_norm": 0.283203125,
"learning_rate": 4.474409333411503e-06,
"loss": 0.847,
"step": 9425
},
{
"epoch": 0.5540247405757167,
"grad_norm": 0.384765625,
"learning_rate": 4.459752594242833e-06,
"loss": 0.8678,
"step": 9450
},
{
"epoch": 0.5554904144925837,
"grad_norm": 0.228515625,
"learning_rate": 4.4450958550741636e-06,
"loss": 0.7334,
"step": 9475
},
{
"epoch": 0.5569560884094507,
"grad_norm": 0.2080078125,
"learning_rate": 4.430439115905494e-06,
"loss": 1.1603,
"step": 9500
},
{
"epoch": 0.5584217623263177,
"grad_norm": 0.19921875,
"learning_rate": 4.415782376736824e-06,
"loss": 0.8804,
"step": 9525
},
{
"epoch": 0.5598874362431846,
"grad_norm": 0.431640625,
"learning_rate": 4.401125637568155e-06,
"loss": 0.9023,
"step": 9550
},
{
"epoch": 0.5613531101600516,
"grad_norm": 0.2353515625,
"learning_rate": 4.386468898399484e-06,
"loss": 0.8169,
"step": 9575
},
{
"epoch": 0.5628187840769185,
"grad_norm": 0.119140625,
"learning_rate": 4.371812159230815e-06,
"loss": 0.919,
"step": 9600
},
{
"epoch": 0.5642844579937856,
"grad_norm": 0.546875,
"learning_rate": 4.357155420062145e-06,
"loss": 0.8916,
"step": 9625
},
{
"epoch": 0.5657501319106525,
"grad_norm": 0.359375,
"learning_rate": 4.342498680893475e-06,
"loss": 0.832,
"step": 9650
},
{
"epoch": 0.5672158058275195,
"grad_norm": 0.302734375,
"learning_rate": 4.327841941724805e-06,
"loss": 0.824,
"step": 9675
},
{
"epoch": 0.5686814797443864,
"grad_norm": 0.361328125,
"learning_rate": 4.3131852025561355e-06,
"loss": 0.8998,
"step": 9700
},
{
"epoch": 0.5701471536612535,
"grad_norm": 0.44140625,
"learning_rate": 4.298528463387466e-06,
"loss": 0.8192,
"step": 9725
},
{
"epoch": 0.5716128275781204,
"grad_norm": 0.169921875,
"learning_rate": 4.283871724218796e-06,
"loss": 0.825,
"step": 9750
},
{
"epoch": 0.5730785014949874,
"grad_norm": 0.2734375,
"learning_rate": 4.269214985050127e-06,
"loss": 0.9014,
"step": 9775
},
{
"epoch": 0.5745441754118543,
"grad_norm": 0.65234375,
"learning_rate": 4.254558245881456e-06,
"loss": 1.1582,
"step": 9800
},
{
"epoch": 0.5760098493287213,
"grad_norm": 0.2138671875,
"learning_rate": 4.239901506712787e-06,
"loss": 0.9225,
"step": 9825
},
{
"epoch": 0.5774755232455884,
"grad_norm": 0.328125,
"learning_rate": 4.225244767544117e-06,
"loss": 0.8293,
"step": 9850
},
{
"epoch": 0.5789411971624553,
"grad_norm": 0.26171875,
"learning_rate": 4.2105880283754475e-06,
"loss": 0.9205,
"step": 9875
},
{
"epoch": 0.5804068710793223,
"grad_norm": 0.28125,
"learning_rate": 4.195931289206778e-06,
"loss": 0.9015,
"step": 9900
},
{
"epoch": 0.5818725449961892,
"grad_norm": 0.205078125,
"learning_rate": 4.181274550038108e-06,
"loss": 0.7924,
"step": 9925
},
{
"epoch": 0.5833382189130563,
"grad_norm": 0.78515625,
"learning_rate": 4.166617810869438e-06,
"loss": 1.0072,
"step": 9950
},
{
"epoch": 0.5848038928299232,
"grad_norm": 0.2353515625,
"learning_rate": 4.151961071700768e-06,
"loss": 0.9099,
"step": 9975
},
{
"epoch": 0.5862695667467902,
"grad_norm": 1.2265625,
"learning_rate": 4.137304332532099e-06,
"loss": 1.1021,
"step": 10000
},
{
"epoch": 0.5877352406636571,
"grad_norm": 0.33984375,
"learning_rate": 4.122647593363428e-06,
"loss": 0.8166,
"step": 10025
},
{
"epoch": 0.5892009145805241,
"grad_norm": 0.296875,
"learning_rate": 4.107990854194759e-06,
"loss": 0.9566,
"step": 10050
},
{
"epoch": 0.5906665884973911,
"grad_norm": 0.2119140625,
"learning_rate": 4.093334115026089e-06,
"loss": 0.8494,
"step": 10075
},
{
"epoch": 0.5921322624142581,
"grad_norm": 0.2451171875,
"learning_rate": 4.0786773758574195e-06,
"loss": 0.9279,
"step": 10100
},
{
"epoch": 0.593597936331125,
"grad_norm": 0.2275390625,
"learning_rate": 4.06402063668875e-06,
"loss": 0.8628,
"step": 10125
},
{
"epoch": 0.595063610247992,
"grad_norm": 0.310546875,
"learning_rate": 4.04936389752008e-06,
"loss": 0.9406,
"step": 10150
},
{
"epoch": 0.596529284164859,
"grad_norm": 0.212890625,
"learning_rate": 4.03470715835141e-06,
"loss": 0.896,
"step": 10175
},
{
"epoch": 0.597994958081726,
"grad_norm": 0.421875,
"learning_rate": 4.02005041918274e-06,
"loss": 0.8608,
"step": 10200
},
{
"epoch": 0.599460631998593,
"grad_norm": 0.255859375,
"learning_rate": 4.005393680014071e-06,
"loss": 0.9103,
"step": 10225
},
{
"epoch": 0.6009263059154599,
"grad_norm": 0.29296875,
"learning_rate": 3.990736940845401e-06,
"loss": 0.8136,
"step": 10250
},
{
"epoch": 0.602391979832327,
"grad_norm": 0.19921875,
"learning_rate": 3.9760802016767315e-06,
"loss": 0.8374,
"step": 10275
},
{
"epoch": 0.6038576537491939,
"grad_norm": 0.1806640625,
"learning_rate": 3.961423462508062e-06,
"loss": 0.8049,
"step": 10300
},
{
"epoch": 0.6053233276660609,
"grad_norm": 0.392578125,
"learning_rate": 3.946766723339392e-06,
"loss": 0.7943,
"step": 10325
},
{
"epoch": 0.6067890015829278,
"grad_norm": 1.0,
"learning_rate": 3.932109984170722e-06,
"loss": 0.8691,
"step": 10350
},
{
"epoch": 0.6082546754997948,
"grad_norm": 0.146484375,
"learning_rate": 3.917453245002052e-06,
"loss": 0.8565,
"step": 10375
},
{
"epoch": 0.6097203494166618,
"grad_norm": 0.357421875,
"learning_rate": 3.902796505833383e-06,
"loss": 0.9866,
"step": 10400
},
{
"epoch": 0.6111860233335288,
"grad_norm": 0.236328125,
"learning_rate": 3.888139766664712e-06,
"loss": 0.9026,
"step": 10425
},
{
"epoch": 0.6126516972503957,
"grad_norm": 0.416015625,
"learning_rate": 3.873483027496043e-06,
"loss": 0.9329,
"step": 10450
},
{
"epoch": 0.6141173711672627,
"grad_norm": 0.458984375,
"learning_rate": 3.858826288327373e-06,
"loss": 0.8534,
"step": 10475
},
{
"epoch": 0.6155830450841296,
"grad_norm": 0.279296875,
"learning_rate": 3.8441695491587034e-06,
"loss": 0.824,
"step": 10500
},
{
"epoch": 0.6170487190009967,
"grad_norm": 0.255859375,
"learning_rate": 3.829512809990034e-06,
"loss": 0.8444,
"step": 10525
},
{
"epoch": 0.6185143929178636,
"grad_norm": 0.26953125,
"learning_rate": 3.814856070821364e-06,
"loss": 0.8843,
"step": 10550
},
{
"epoch": 0.6199800668347306,
"grad_norm": 0.1748046875,
"learning_rate": 3.8001993316526942e-06,
"loss": 0.8791,
"step": 10575
},
{
"epoch": 0.6214457407515975,
"grad_norm": 0.287109375,
"learning_rate": 3.7855425924840246e-06,
"loss": 0.8695,
"step": 10600
},
{
"epoch": 0.6229114146684646,
"grad_norm": 0.1748046875,
"learning_rate": 3.7708858533153546e-06,
"loss": 0.8917,
"step": 10625
},
{
"epoch": 0.6243770885853316,
"grad_norm": 0.4921875,
"learning_rate": 3.756229114146685e-06,
"loss": 0.7642,
"step": 10650
},
{
"epoch": 0.6258427625021985,
"grad_norm": 0.2421875,
"learning_rate": 3.7415723749780154e-06,
"loss": 0.8826,
"step": 10675
},
{
"epoch": 0.6273084364190655,
"grad_norm": 0.197265625,
"learning_rate": 3.7269156358093454e-06,
"loss": 0.8513,
"step": 10700
},
{
"epoch": 0.6287741103359324,
"grad_norm": 0.50390625,
"learning_rate": 3.712258896640676e-06,
"loss": 1.0076,
"step": 10725
},
{
"epoch": 0.6302397842527995,
"grad_norm": 0.220703125,
"learning_rate": 3.697602157472006e-06,
"loss": 0.8629,
"step": 10750
},
{
"epoch": 0.6317054581696664,
"grad_norm": 0.251953125,
"learning_rate": 3.6829454183033366e-06,
"loss": 0.7775,
"step": 10775
},
{
"epoch": 0.6331711320865334,
"grad_norm": 0.2265625,
"learning_rate": 3.6682886791346666e-06,
"loss": 0.9906,
"step": 10800
},
{
"epoch": 0.6346368060034003,
"grad_norm": 0.298828125,
"learning_rate": 3.6536319399659966e-06,
"loss": 0.8601,
"step": 10825
},
{
"epoch": 0.6361024799202674,
"grad_norm": 1.6875,
"learning_rate": 3.6389752007973266e-06,
"loss": 0.848,
"step": 10850
},
{
"epoch": 0.6375681538371343,
"grad_norm": 0.2490234375,
"learning_rate": 3.624318461628657e-06,
"loss": 0.9411,
"step": 10875
},
{
"epoch": 0.6390338277540013,
"grad_norm": 0.2431640625,
"learning_rate": 3.6096617224599874e-06,
"loss": 0.9371,
"step": 10900
},
{
"epoch": 0.6404995016708682,
"grad_norm": 0.302734375,
"learning_rate": 3.5950049832913174e-06,
"loss": 0.8688,
"step": 10925
},
{
"epoch": 0.6419651755877352,
"grad_norm": 0.333984375,
"learning_rate": 3.5803482441226478e-06,
"loss": 0.9022,
"step": 10950
},
{
"epoch": 0.6434308495046022,
"grad_norm": 0.259765625,
"learning_rate": 3.565691504953978e-06,
"loss": 0.8537,
"step": 10975
},
{
"epoch": 0.6448965234214692,
"grad_norm": 0.5703125,
"learning_rate": 3.5510347657853086e-06,
"loss": 0.8228,
"step": 11000
},
{
"epoch": 0.6463621973383362,
"grad_norm": 0.259765625,
"learning_rate": 3.5363780266166386e-06,
"loss": 0.8286,
"step": 11025
},
{
"epoch": 0.6478278712552031,
"grad_norm": 0.26171875,
"learning_rate": 3.521721287447969e-06,
"loss": 0.8312,
"step": 11050
},
{
"epoch": 0.6492935451720702,
"grad_norm": 0.294921875,
"learning_rate": 3.5070645482792994e-06,
"loss": 0.8555,
"step": 11075
},
{
"epoch": 0.6507592190889371,
"grad_norm": 0.37890625,
"learning_rate": 3.4924078091106293e-06,
"loss": 0.8602,
"step": 11100
},
{
"epoch": 0.6522248930058041,
"grad_norm": 0.294921875,
"learning_rate": 3.4777510699419598e-06,
"loss": 0.875,
"step": 11125
},
{
"epoch": 0.653690566922671,
"grad_norm": 3.171875,
"learning_rate": 3.46309433077329e-06,
"loss": 1.0157,
"step": 11150
},
{
"epoch": 0.655156240839538,
"grad_norm": 0.2197265625,
"learning_rate": 3.44843759160462e-06,
"loss": 0.8448,
"step": 11175
},
{
"epoch": 0.656621914756405,
"grad_norm": 0.53515625,
"learning_rate": 3.4337808524359505e-06,
"loss": 0.8291,
"step": 11200
},
{
"epoch": 0.658087588673272,
"grad_norm": 0.2578125,
"learning_rate": 3.4191241132672805e-06,
"loss": 0.9121,
"step": 11225
},
{
"epoch": 0.6595532625901389,
"grad_norm": 0.1962890625,
"learning_rate": 3.4044673740986105e-06,
"loss": 0.9451,
"step": 11250
},
{
"epoch": 0.6610189365070059,
"grad_norm": 0.458984375,
"learning_rate": 3.389810634929941e-06,
"loss": 0.8054,
"step": 11275
},
{
"epoch": 0.6624846104238729,
"grad_norm": 0.2216796875,
"learning_rate": 3.3751538957612713e-06,
"loss": 0.7991,
"step": 11300
},
{
"epoch": 0.6639502843407399,
"grad_norm": 0.57421875,
"learning_rate": 3.3604971565926013e-06,
"loss": 0.9013,
"step": 11325
},
{
"epoch": 0.6654159582576068,
"grad_norm": 0.373046875,
"learning_rate": 3.3458404174239317e-06,
"loss": 1.0514,
"step": 11350
},
{
"epoch": 0.6668816321744738,
"grad_norm": 0.3125,
"learning_rate": 3.331183678255262e-06,
"loss": 0.8199,
"step": 11375
},
{
"epoch": 0.6683473060913409,
"grad_norm": 0.416015625,
"learning_rate": 3.316526939086592e-06,
"loss": 0.8791,
"step": 11400
},
{
"epoch": 0.6698129800082078,
"grad_norm": 0.39453125,
"learning_rate": 3.3018701999179225e-06,
"loss": 0.9326,
"step": 11425
},
{
"epoch": 0.6712786539250748,
"grad_norm": 0.365234375,
"learning_rate": 3.287213460749253e-06,
"loss": 0.8829,
"step": 11450
},
{
"epoch": 0.6727443278419417,
"grad_norm": 0.30859375,
"learning_rate": 3.2725567215805833e-06,
"loss": 0.8332,
"step": 11475
},
{
"epoch": 0.6742100017588087,
"grad_norm": 0.65625,
"learning_rate": 3.2578999824119133e-06,
"loss": 0.9214,
"step": 11500
},
{
"epoch": 0.6756756756756757,
"grad_norm": 0.203125,
"learning_rate": 3.2432432432432437e-06,
"loss": 0.814,
"step": 11525
},
{
"epoch": 0.6771413495925427,
"grad_norm": 0.255859375,
"learning_rate": 3.228586504074574e-06,
"loss": 0.8175,
"step": 11550
},
{
"epoch": 0.6786070235094096,
"grad_norm": 0.369140625,
"learning_rate": 3.213929764905904e-06,
"loss": 0.8749,
"step": 11575
},
{
"epoch": 0.6800726974262766,
"grad_norm": 0.2177734375,
"learning_rate": 3.199273025737234e-06,
"loss": 0.7743,
"step": 11600
},
{
"epoch": 0.6815383713431435,
"grad_norm": 0.26171875,
"learning_rate": 3.184616286568564e-06,
"loss": 0.8811,
"step": 11625
},
{
"epoch": 0.6830040452600106,
"grad_norm": 0.173828125,
"learning_rate": 3.1699595473998945e-06,
"loss": 0.8967,
"step": 11650
},
{
"epoch": 0.6844697191768775,
"grad_norm": 0.18359375,
"learning_rate": 3.155302808231225e-06,
"loss": 0.9144,
"step": 11675
},
{
"epoch": 0.6859353930937445,
"grad_norm": 0.296875,
"learning_rate": 3.1406460690625553e-06,
"loss": 0.8852,
"step": 11700
},
{
"epoch": 0.6874010670106114,
"grad_norm": 0.248046875,
"learning_rate": 3.1259893298938853e-06,
"loss": 0.875,
"step": 11725
},
{
"epoch": 0.6888667409274785,
"grad_norm": 0.189453125,
"learning_rate": 3.1113325907252157e-06,
"loss": 0.9148,
"step": 11750
},
{
"epoch": 0.6903324148443454,
"grad_norm": 0.63671875,
"learning_rate": 3.096675851556546e-06,
"loss": 0.8413,
"step": 11775
},
{
"epoch": 0.6917980887612124,
"grad_norm": 0.1787109375,
"learning_rate": 3.082019112387876e-06,
"loss": 0.9005,
"step": 11800
},
{
"epoch": 0.6932637626780794,
"grad_norm": 0.302734375,
"learning_rate": 3.0673623732192065e-06,
"loss": 0.8366,
"step": 11825
},
{
"epoch": 0.6947294365949463,
"grad_norm": 0.318359375,
"learning_rate": 3.052705634050537e-06,
"loss": 0.9477,
"step": 11850
},
{
"epoch": 0.6961951105118134,
"grad_norm": 0.302734375,
"learning_rate": 3.038048894881867e-06,
"loss": 0.9308,
"step": 11875
},
{
"epoch": 0.6976607844286803,
"grad_norm": 0.119140625,
"learning_rate": 3.0233921557131972e-06,
"loss": 0.7818,
"step": 11900
},
{
"epoch": 0.6991264583455473,
"grad_norm": 0.296875,
"learning_rate": 3.0087354165445277e-06,
"loss": 0.9406,
"step": 11925
},
{
"epoch": 0.7005921322624142,
"grad_norm": 0.244140625,
"learning_rate": 2.994078677375858e-06,
"loss": 1.0378,
"step": 11950
},
{
"epoch": 0.7020578061792813,
"grad_norm": 5.25,
"learning_rate": 2.979421938207188e-06,
"loss": 0.8277,
"step": 11975
},
{
"epoch": 0.7035234800961482,
"grad_norm": 0.275390625,
"learning_rate": 2.964765199038518e-06,
"loss": 0.8431,
"step": 12000
},
{
"epoch": 0.7049891540130152,
"grad_norm": 0.1982421875,
"learning_rate": 2.950108459869848e-06,
"loss": 0.8696,
"step": 12025
},
{
"epoch": 0.7064548279298821,
"grad_norm": 0.287109375,
"learning_rate": 2.9354517207011784e-06,
"loss": 0.8615,
"step": 12050
},
{
"epoch": 0.7079205018467492,
"grad_norm": 0.416015625,
"learning_rate": 2.920794981532509e-06,
"loss": 0.9367,
"step": 12075
},
{
"epoch": 0.7093861757636161,
"grad_norm": 0.2294921875,
"learning_rate": 2.9061382423638392e-06,
"loss": 0.9749,
"step": 12100
},
{
"epoch": 0.7108518496804831,
"grad_norm": 0.375,
"learning_rate": 2.891481503195169e-06,
"loss": 0.8747,
"step": 12125
},
{
"epoch": 0.71231752359735,
"grad_norm": 0.306640625,
"learning_rate": 2.8768247640264996e-06,
"loss": 0.8151,
"step": 12150
},
{
"epoch": 0.713783197514217,
"grad_norm": 0.30078125,
"learning_rate": 2.86216802485783e-06,
"loss": 0.8211,
"step": 12175
},
{
"epoch": 0.7152488714310841,
"grad_norm": 0.318359375,
"learning_rate": 2.84751128568916e-06,
"loss": 0.8662,
"step": 12200
},
{
"epoch": 0.716714545347951,
"grad_norm": 0.6640625,
"learning_rate": 2.8328545465204904e-06,
"loss": 0.909,
"step": 12225
},
{
"epoch": 0.718180219264818,
"grad_norm": 1.0234375,
"learning_rate": 2.818197807351821e-06,
"loss": 0.9167,
"step": 12250
},
{
"epoch": 0.7196458931816849,
"grad_norm": 0.73828125,
"learning_rate": 2.803541068183151e-06,
"loss": 0.8634,
"step": 12275
},
{
"epoch": 0.721111567098552,
"grad_norm": 0.16015625,
"learning_rate": 2.788884329014481e-06,
"loss": 1.0852,
"step": 12300
},
{
"epoch": 0.7225772410154189,
"grad_norm": 0.2216796875,
"learning_rate": 2.7742275898458116e-06,
"loss": 0.9097,
"step": 12325
},
{
"epoch": 0.7240429149322859,
"grad_norm": 0.373046875,
"learning_rate": 2.759570850677142e-06,
"loss": 0.8853,
"step": 12350
},
{
"epoch": 0.7255085888491528,
"grad_norm": 0.1513671875,
"learning_rate": 2.744914111508472e-06,
"loss": 0.9239,
"step": 12375
},
{
"epoch": 0.7269742627660198,
"grad_norm": 0.1708984375,
"learning_rate": 2.730257372339802e-06,
"loss": 1.0119,
"step": 12400
},
{
"epoch": 0.7284399366828868,
"grad_norm": 0.279296875,
"learning_rate": 2.715600633171132e-06,
"loss": 0.9003,
"step": 12425
},
{
"epoch": 0.7299056105997538,
"grad_norm": 0.2333984375,
"learning_rate": 2.7009438940024624e-06,
"loss": 0.8912,
"step": 12450
},
{
"epoch": 0.7313712845166207,
"grad_norm": 0.3125,
"learning_rate": 2.6862871548337928e-06,
"loss": 0.9916,
"step": 12475
},
{
"epoch": 0.7328369584334877,
"grad_norm": 0.208984375,
"learning_rate": 2.6716304156651227e-06,
"loss": 1.1092,
"step": 12500
},
{
"epoch": 0.7343026323503546,
"grad_norm": 0.27734375,
"learning_rate": 2.656973676496453e-06,
"loss": 1.1346,
"step": 12525
},
{
"epoch": 0.7357683062672217,
"grad_norm": 0.546875,
"learning_rate": 2.6423169373277836e-06,
"loss": 0.9109,
"step": 12550
},
{
"epoch": 0.7372339801840886,
"grad_norm": 0.25390625,
"learning_rate": 2.627660198159114e-06,
"loss": 0.8605,
"step": 12575
},
{
"epoch": 0.7386996541009556,
"grad_norm": 4.1875,
"learning_rate": 2.613003458990444e-06,
"loss": 0.8093,
"step": 12600
},
{
"epoch": 0.7401653280178226,
"grad_norm": 0.1875,
"learning_rate": 2.5983467198217744e-06,
"loss": 0.9045,
"step": 12625
},
{
"epoch": 0.7416310019346896,
"grad_norm": 0.25390625,
"learning_rate": 2.5836899806531048e-06,
"loss": 0.9586,
"step": 12650
},
{
"epoch": 0.7430966758515566,
"grad_norm": 0.318359375,
"learning_rate": 2.5690332414844347e-06,
"loss": 0.8817,
"step": 12675
},
{
"epoch": 0.7445623497684235,
"grad_norm": 0.6484375,
"learning_rate": 2.554376502315765e-06,
"loss": 1.0672,
"step": 12700
},
{
"epoch": 0.7460280236852905,
"grad_norm": 0.37890625,
"learning_rate": 2.5397197631470956e-06,
"loss": 0.7547,
"step": 12725
},
{
"epoch": 0.7474936976021574,
"grad_norm": 0.271484375,
"learning_rate": 2.5250630239784255e-06,
"loss": 0.86,
"step": 12750
},
{
"epoch": 0.7489593715190245,
"grad_norm": 0.19921875,
"learning_rate": 2.5104062848097555e-06,
"loss": 0.837,
"step": 12775
},
{
"epoch": 0.7504250454358914,
"grad_norm": 0.271484375,
"learning_rate": 2.495749545641086e-06,
"loss": 0.969,
"step": 12800
},
{
"epoch": 0.7518907193527584,
"grad_norm": 0.2275390625,
"learning_rate": 2.4810928064724163e-06,
"loss": 0.858,
"step": 12825
},
{
"epoch": 0.7533563932696253,
"grad_norm": 0.318359375,
"learning_rate": 2.4664360673037467e-06,
"loss": 0.8171,
"step": 12850
},
{
"epoch": 0.7548220671864924,
"grad_norm": 0.23046875,
"learning_rate": 2.4517793281350767e-06,
"loss": 0.9102,
"step": 12875
},
{
"epoch": 0.7562877411033593,
"grad_norm": 0.59765625,
"learning_rate": 2.4371225889664067e-06,
"loss": 0.991,
"step": 12900
},
{
"epoch": 0.7577534150202263,
"grad_norm": 0.2333984375,
"learning_rate": 2.422465849797737e-06,
"loss": 0.8257,
"step": 12925
},
{
"epoch": 0.7592190889370932,
"grad_norm": 0.435546875,
"learning_rate": 2.4078091106290675e-06,
"loss": 0.9095,
"step": 12950
},
{
"epoch": 0.7606847628539603,
"grad_norm": 0.1337890625,
"learning_rate": 2.3931523714603975e-06,
"loss": 0.9126,
"step": 12975
},
{
"epoch": 0.7621504367708273,
"grad_norm": 0.2314453125,
"learning_rate": 2.378495632291728e-06,
"loss": 0.8467,
"step": 13000
},
{
"epoch": 0.7636161106876942,
"grad_norm": 0.291015625,
"learning_rate": 2.3638388931230583e-06,
"loss": 0.8176,
"step": 13025
},
{
"epoch": 0.7650817846045612,
"grad_norm": 0.2333984375,
"learning_rate": 2.3491821539543887e-06,
"loss": 0.8675,
"step": 13050
},
{
"epoch": 0.7665474585214281,
"grad_norm": 0.35546875,
"learning_rate": 2.3345254147857187e-06,
"loss": 0.8793,
"step": 13075
},
{
"epoch": 0.7680131324382952,
"grad_norm": 0.1865234375,
"learning_rate": 2.3198686756170487e-06,
"loss": 0.9777,
"step": 13100
},
{
"epoch": 0.7694788063551621,
"grad_norm": 0.31640625,
"learning_rate": 2.305211936448379e-06,
"loss": 0.8507,
"step": 13125
},
{
"epoch": 0.7709444802720291,
"grad_norm": 0.189453125,
"learning_rate": 2.2905551972797095e-06,
"loss": 0.8181,
"step": 13150
},
{
"epoch": 0.772410154188896,
"grad_norm": 0.2451171875,
"learning_rate": 2.2758984581110395e-06,
"loss": 0.8968,
"step": 13175
},
{
"epoch": 0.773875828105763,
"grad_norm": 0.35546875,
"learning_rate": 2.26124171894237e-06,
"loss": 0.8177,
"step": 13200
},
{
"epoch": 0.77534150202263,
"grad_norm": 0.2216796875,
"learning_rate": 2.2465849797737003e-06,
"loss": 0.8724,
"step": 13225
},
{
"epoch": 0.776807175939497,
"grad_norm": 0.404296875,
"learning_rate": 2.2319282406050307e-06,
"loss": 0.965,
"step": 13250
},
{
"epoch": 0.7782728498563639,
"grad_norm": 0.263671875,
"learning_rate": 2.2172715014363607e-06,
"loss": 0.8441,
"step": 13275
},
{
"epoch": 0.7797385237732309,
"grad_norm": 0.18359375,
"learning_rate": 2.2026147622676906e-06,
"loss": 0.8694,
"step": 13300
},
{
"epoch": 0.7812041976900979,
"grad_norm": 0.3671875,
"learning_rate": 2.187958023099021e-06,
"loss": 0.8556,
"step": 13325
},
{
"epoch": 0.7826698716069649,
"grad_norm": 0.2197265625,
"learning_rate": 2.1733012839303515e-06,
"loss": 0.8436,
"step": 13350
},
{
"epoch": 0.7841355455238319,
"grad_norm": 0.1748046875,
"learning_rate": 2.1586445447616814e-06,
"loss": 0.8208,
"step": 13375
},
{
"epoch": 0.7856012194406988,
"grad_norm": 0.146484375,
"learning_rate": 2.143987805593012e-06,
"loss": 0.8849,
"step": 13400
},
{
"epoch": 0.7870668933575659,
"grad_norm": 0.25390625,
"learning_rate": 2.1293310664243422e-06,
"loss": 0.785,
"step": 13425
},
{
"epoch": 0.7885325672744328,
"grad_norm": 0.1630859375,
"learning_rate": 2.1146743272556727e-06,
"loss": 0.8334,
"step": 13450
},
{
"epoch": 0.7899982411912998,
"grad_norm": 0.1455078125,
"learning_rate": 2.1000175880870026e-06,
"loss": 0.8887,
"step": 13475
},
{
"epoch": 0.7914639151081667,
"grad_norm": 4.5,
"learning_rate": 2.0853608489183326e-06,
"loss": 0.9221,
"step": 13500
},
{
"epoch": 0.7929295890250337,
"grad_norm": 0.166015625,
"learning_rate": 2.070704109749663e-06,
"loss": 0.9046,
"step": 13525
},
{
"epoch": 0.7943952629419007,
"grad_norm": 0.94921875,
"learning_rate": 2.0560473705809934e-06,
"loss": 0.9419,
"step": 13550
},
{
"epoch": 0.7958609368587677,
"grad_norm": 0.17578125,
"learning_rate": 2.0413906314123234e-06,
"loss": 0.9732,
"step": 13575
},
{
"epoch": 0.7973266107756346,
"grad_norm": 0.376953125,
"learning_rate": 2.026733892243654e-06,
"loss": 0.8457,
"step": 13600
},
{
"epoch": 0.7987922846925016,
"grad_norm": 0.23046875,
"learning_rate": 2.0120771530749842e-06,
"loss": 0.9258,
"step": 13625
},
{
"epoch": 0.8002579586093685,
"grad_norm": 0.25,
"learning_rate": 1.997420413906314e-06,
"loss": 0.8751,
"step": 13650
},
{
"epoch": 0.8017236325262356,
"grad_norm": 0.240234375,
"learning_rate": 1.9827636747376446e-06,
"loss": 0.892,
"step": 13675
},
{
"epoch": 0.8031893064431025,
"grad_norm": 0.234375,
"learning_rate": 1.9681069355689746e-06,
"loss": 0.8987,
"step": 13700
},
{
"epoch": 0.8046549803599695,
"grad_norm": 0.1474609375,
"learning_rate": 1.953450196400305e-06,
"loss": 0.8482,
"step": 13725
},
{
"epoch": 0.8061206542768364,
"grad_norm": 0.2314453125,
"learning_rate": 1.9387934572316354e-06,
"loss": 0.965,
"step": 13750
},
{
"epoch": 0.8075863281937035,
"grad_norm": 0.2275390625,
"learning_rate": 1.9241367180629654e-06,
"loss": 0.8086,
"step": 13775
},
{
"epoch": 0.8090520021105705,
"grad_norm": 0.359375,
"learning_rate": 1.909479978894296e-06,
"loss": 0.8894,
"step": 13800
},
{
"epoch": 0.8105176760274374,
"grad_norm": 1.953125,
"learning_rate": 1.8948232397256262e-06,
"loss": 0.8153,
"step": 13825
},
{
"epoch": 0.8119833499443044,
"grad_norm": 0.50390625,
"learning_rate": 1.8801665005569564e-06,
"loss": 0.8529,
"step": 13850
},
{
"epoch": 0.8134490238611713,
"grad_norm": 0.1884765625,
"learning_rate": 1.8655097613882864e-06,
"loss": 0.9254,
"step": 13875
},
{
"epoch": 0.8149146977780384,
"grad_norm": 0.6015625,
"learning_rate": 1.8508530222196166e-06,
"loss": 0.932,
"step": 13900
},
{
"epoch": 0.8163803716949053,
"grad_norm": 0.275390625,
"learning_rate": 1.836196283050947e-06,
"loss": 0.9038,
"step": 13925
},
{
"epoch": 0.8178460456117723,
"grad_norm": 0.193359375,
"learning_rate": 1.8215395438822772e-06,
"loss": 0.8293,
"step": 13950
},
{
"epoch": 0.8193117195286392,
"grad_norm": 1.078125,
"learning_rate": 1.8068828047136076e-06,
"loss": 0.8581,
"step": 13975
},
{
"epoch": 0.8207773934455063,
"grad_norm": 0.294921875,
"learning_rate": 1.7922260655449378e-06,
"loss": 0.8774,
"step": 14000
},
{
"epoch": 0.8222430673623732,
"grad_norm": 0.166015625,
"learning_rate": 1.777569326376268e-06,
"loss": 0.9594,
"step": 14025
},
{
"epoch": 0.8237087412792402,
"grad_norm": 0.251953125,
"learning_rate": 1.7629125872075984e-06,
"loss": 0.9275,
"step": 14050
},
{
"epoch": 0.8251744151961071,
"grad_norm": 0.212890625,
"learning_rate": 1.7482558480389283e-06,
"loss": 0.8318,
"step": 14075
},
{
"epoch": 0.8266400891129742,
"grad_norm": 0.64453125,
"learning_rate": 1.7335991088702585e-06,
"loss": 0.9462,
"step": 14100
},
{
"epoch": 0.8281057630298411,
"grad_norm": 0.33203125,
"learning_rate": 1.718942369701589e-06,
"loss": 0.879,
"step": 14125
},
{
"epoch": 0.8295714369467081,
"grad_norm": 0.486328125,
"learning_rate": 1.7042856305329191e-06,
"loss": 0.8837,
"step": 14150
},
{
"epoch": 0.8310371108635751,
"grad_norm": 0.390625,
"learning_rate": 1.6896288913642495e-06,
"loss": 0.8956,
"step": 14175
},
{
"epoch": 0.832502784780442,
"grad_norm": 0.765625,
"learning_rate": 1.6749721521955797e-06,
"loss": 0.9283,
"step": 14200
},
{
"epoch": 0.8339684586973091,
"grad_norm": 0.64453125,
"learning_rate": 1.66031541302691e-06,
"loss": 0.9279,
"step": 14225
},
{
"epoch": 0.835434132614176,
"grad_norm": 0.5859375,
"learning_rate": 1.64565867385824e-06,
"loss": 0.8446,
"step": 14250
},
{
"epoch": 0.836899806531043,
"grad_norm": 0.154296875,
"learning_rate": 1.6310019346895703e-06,
"loss": 0.8585,
"step": 14275
},
{
"epoch": 0.8383654804479099,
"grad_norm": 0.2412109375,
"learning_rate": 1.6163451955209005e-06,
"loss": 0.9229,
"step": 14300
},
{
"epoch": 0.839831154364777,
"grad_norm": 0.197265625,
"learning_rate": 1.601688456352231e-06,
"loss": 0.7812,
"step": 14325
},
{
"epoch": 0.8412968282816439,
"grad_norm": 0.228515625,
"learning_rate": 1.5870317171835611e-06,
"loss": 0.925,
"step": 14350
},
{
"epoch": 0.8427625021985109,
"grad_norm": 0.25,
"learning_rate": 1.5723749780148915e-06,
"loss": 0.9276,
"step": 14375
},
{
"epoch": 0.8442281761153778,
"grad_norm": 0.365234375,
"learning_rate": 1.5577182388462217e-06,
"loss": 0.8881,
"step": 14400
},
{
"epoch": 0.8456938500322448,
"grad_norm": 0.166015625,
"learning_rate": 1.543061499677552e-06,
"loss": 0.8279,
"step": 14425
},
{
"epoch": 0.8471595239491118,
"grad_norm": 0.466796875,
"learning_rate": 1.5284047605088819e-06,
"loss": 0.8415,
"step": 14450
},
{
"epoch": 0.8486251978659788,
"grad_norm": 0.220703125,
"learning_rate": 1.5137480213402123e-06,
"loss": 0.9089,
"step": 14475
},
{
"epoch": 0.8500908717828457,
"grad_norm": 0.1953125,
"learning_rate": 1.4990912821715425e-06,
"loss": 0.8786,
"step": 14500
},
{
"epoch": 0.8515565456997127,
"grad_norm": 0.29296875,
"learning_rate": 1.484434543002873e-06,
"loss": 1.0085,
"step": 14525
},
{
"epoch": 0.8530222196165798,
"grad_norm": 0.240234375,
"learning_rate": 1.469777803834203e-06,
"loss": 0.9566,
"step": 14550
},
{
"epoch": 0.8544878935334467,
"grad_norm": 0.3828125,
"learning_rate": 1.4551210646655333e-06,
"loss": 0.7709,
"step": 14575
},
{
"epoch": 0.8559535674503137,
"grad_norm": 0.18359375,
"learning_rate": 1.4404643254968637e-06,
"loss": 0.8845,
"step": 14600
},
{
"epoch": 0.8574192413671806,
"grad_norm": 0.2890625,
"learning_rate": 1.4258075863281939e-06,
"loss": 0.8734,
"step": 14625
},
{
"epoch": 0.8588849152840476,
"grad_norm": 0.2099609375,
"learning_rate": 1.4111508471595239e-06,
"loss": 0.8424,
"step": 14650
},
{
"epoch": 0.8603505892009146,
"grad_norm": 0.4375,
"learning_rate": 1.3964941079908543e-06,
"loss": 0.8982,
"step": 14675
},
{
"epoch": 0.8618162631177816,
"grad_norm": 0.18359375,
"learning_rate": 1.3818373688221845e-06,
"loss": 0.8867,
"step": 14700
},
{
"epoch": 0.8632819370346485,
"grad_norm": 0.2138671875,
"learning_rate": 1.3671806296535149e-06,
"loss": 0.768,
"step": 14725
},
{
"epoch": 0.8647476109515155,
"grad_norm": 0.265625,
"learning_rate": 1.352523890484845e-06,
"loss": 0.7438,
"step": 14750
},
{
"epoch": 0.8662132848683824,
"grad_norm": 0.61328125,
"learning_rate": 1.3378671513161753e-06,
"loss": 0.8944,
"step": 14775
},
{
"epoch": 0.8676789587852495,
"grad_norm": 0.419921875,
"learning_rate": 1.3232104121475057e-06,
"loss": 0.8471,
"step": 14800
},
{
"epoch": 0.8691446327021164,
"grad_norm": 0.2275390625,
"learning_rate": 1.3085536729788359e-06,
"loss": 0.9804,
"step": 14825
},
{
"epoch": 0.8706103066189834,
"grad_norm": 0.39453125,
"learning_rate": 1.2938969338101658e-06,
"loss": 0.8258,
"step": 14850
},
{
"epoch": 0.8720759805358503,
"grad_norm": 4.28125,
"learning_rate": 1.2792401946414962e-06,
"loss": 1.0303,
"step": 14875
},
{
"epoch": 0.8735416544527174,
"grad_norm": 0.2314453125,
"learning_rate": 1.2645834554728264e-06,
"loss": 0.9401,
"step": 14900
},
{
"epoch": 0.8750073283695843,
"grad_norm": 0.1845703125,
"learning_rate": 1.2499267163041566e-06,
"loss": 0.8907,
"step": 14925
},
{
"epoch": 0.8764730022864513,
"grad_norm": 0.1865234375,
"learning_rate": 1.235269977135487e-06,
"loss": 1.0413,
"step": 14950
},
{
"epoch": 0.8779386762033183,
"grad_norm": 0.263671875,
"learning_rate": 1.2206132379668172e-06,
"loss": 0.8915,
"step": 14975
},
{
"epoch": 0.8794043501201853,
"grad_norm": 0.62109375,
"learning_rate": 1.2059564987981474e-06,
"loss": 1.0251,
"step": 15000
},
{
"epoch": 0.8808700240370523,
"grad_norm": 0.291015625,
"learning_rate": 1.1912997596294776e-06,
"loss": 0.9363,
"step": 15025
},
{
"epoch": 0.8823356979539192,
"grad_norm": 0.28515625,
"learning_rate": 1.176643020460808e-06,
"loss": 0.9041,
"step": 15050
},
{
"epoch": 0.8838013718707862,
"grad_norm": 0.28515625,
"learning_rate": 1.1619862812921382e-06,
"loss": 0.9011,
"step": 15075
},
{
"epoch": 0.8852670457876531,
"grad_norm": 0.640625,
"learning_rate": 1.1473295421234684e-06,
"loss": 0.8542,
"step": 15100
},
{
"epoch": 0.8867327197045202,
"grad_norm": 0.26953125,
"learning_rate": 1.1326728029547986e-06,
"loss": 0.8591,
"step": 15125
},
{
"epoch": 0.8881983936213871,
"grad_norm": 0.337890625,
"learning_rate": 1.118016063786129e-06,
"loss": 0.8726,
"step": 15150
},
{
"epoch": 0.8896640675382541,
"grad_norm": 0.62890625,
"learning_rate": 1.1033593246174592e-06,
"loss": 0.8511,
"step": 15175
},
{
"epoch": 0.891129741455121,
"grad_norm": 0.1884765625,
"learning_rate": 1.0887025854487894e-06,
"loss": 0.8862,
"step": 15200
},
{
"epoch": 0.892595415371988,
"grad_norm": 0.400390625,
"learning_rate": 1.0740458462801196e-06,
"loss": 0.8561,
"step": 15225
},
{
"epoch": 0.894061089288855,
"grad_norm": 0.271484375,
"learning_rate": 1.05938910711145e-06,
"loss": 0.9929,
"step": 15250
},
{
"epoch": 0.895526763205722,
"grad_norm": 0.2470703125,
"learning_rate": 1.0447323679427802e-06,
"loss": 0.8928,
"step": 15275
},
{
"epoch": 0.8969924371225889,
"grad_norm": 0.1650390625,
"learning_rate": 1.0300756287741104e-06,
"loss": 0.8723,
"step": 15300
},
{
"epoch": 0.8984581110394559,
"grad_norm": 0.2109375,
"learning_rate": 1.0154188896054406e-06,
"loss": 0.8111,
"step": 15325
},
{
"epoch": 0.899923784956323,
"grad_norm": 0.29296875,
"learning_rate": 1.000762150436771e-06,
"loss": 0.8699,
"step": 15350
},
{
"epoch": 0.9013894588731899,
"grad_norm": 0.369140625,
"learning_rate": 9.861054112681012e-07,
"loss": 0.9412,
"step": 15375
},
{
"epoch": 0.9028551327900569,
"grad_norm": 0.171875,
"learning_rate": 9.714486720994314e-07,
"loss": 0.9887,
"step": 15400
},
{
"epoch": 0.9043208067069238,
"grad_norm": 0.1533203125,
"learning_rate": 9.567919329307616e-07,
"loss": 0.9231,
"step": 15425
},
{
"epoch": 0.9057864806237909,
"grad_norm": 0.228515625,
"learning_rate": 9.421351937620919e-07,
"loss": 0.8929,
"step": 15450
},
{
"epoch": 0.9072521545406578,
"grad_norm": 0.2197265625,
"learning_rate": 9.274784545934222e-07,
"loss": 0.8695,
"step": 15475
},
{
"epoch": 0.9087178284575248,
"grad_norm": 0.2392578125,
"learning_rate": 9.128217154247524e-07,
"loss": 0.8565,
"step": 15500
},
{
"epoch": 0.9101835023743917,
"grad_norm": 0.2109375,
"learning_rate": 8.981649762560827e-07,
"loss": 0.8357,
"step": 15525
},
{
"epoch": 0.9116491762912587,
"grad_norm": 0.28515625,
"learning_rate": 8.835082370874129e-07,
"loss": 0.9423,
"step": 15550
},
{
"epoch": 0.9131148502081257,
"grad_norm": 0.21484375,
"learning_rate": 8.688514979187431e-07,
"loss": 0.7956,
"step": 15575
},
{
"epoch": 0.9145805241249927,
"grad_norm": 0.193359375,
"learning_rate": 8.541947587500734e-07,
"loss": 0.8043,
"step": 15600
},
{
"epoch": 0.9160461980418596,
"grad_norm": 0.54296875,
"learning_rate": 8.395380195814037e-07,
"loss": 1.0452,
"step": 15625
},
{
"epoch": 0.9175118719587266,
"grad_norm": 0.357421875,
"learning_rate": 8.248812804127338e-07,
"loss": 0.8549,
"step": 15650
},
{
"epoch": 0.9189775458755935,
"grad_norm": 0.1689453125,
"learning_rate": 8.10224541244064e-07,
"loss": 0.904,
"step": 15675
},
{
"epoch": 0.9204432197924606,
"grad_norm": 2.1875,
"learning_rate": 7.955678020753943e-07,
"loss": 0.9144,
"step": 15700
},
{
"epoch": 0.9219088937093276,
"grad_norm": 0.2080078125,
"learning_rate": 7.809110629067245e-07,
"loss": 0.9322,
"step": 15725
},
{
"epoch": 0.9233745676261945,
"grad_norm": 0.1689453125,
"learning_rate": 7.662543237380548e-07,
"loss": 0.8968,
"step": 15750
},
{
"epoch": 0.9248402415430615,
"grad_norm": 0.45703125,
"learning_rate": 7.51597584569385e-07,
"loss": 0.834,
"step": 15775
},
{
"epoch": 0.9263059154599285,
"grad_norm": 0.2158203125,
"learning_rate": 7.369408454007153e-07,
"loss": 0.8769,
"step": 15800
},
{
"epoch": 0.9277715893767955,
"grad_norm": 0.359375,
"learning_rate": 7.222841062320455e-07,
"loss": 0.8159,
"step": 15825
},
{
"epoch": 0.9292372632936624,
"grad_norm": 0.24609375,
"learning_rate": 7.076273670633758e-07,
"loss": 0.8529,
"step": 15850
},
{
"epoch": 0.9307029372105294,
"grad_norm": 0.2236328125,
"learning_rate": 6.92970627894706e-07,
"loss": 0.8882,
"step": 15875
},
{
"epoch": 0.9321686111273964,
"grad_norm": 0.2041015625,
"learning_rate": 6.783138887260363e-07,
"loss": 0.9047,
"step": 15900
},
{
"epoch": 0.9336342850442634,
"grad_norm": 0.3671875,
"learning_rate": 6.636571495573665e-07,
"loss": 0.9791,
"step": 15925
},
{
"epoch": 0.9350999589611303,
"grad_norm": 0.66796875,
"learning_rate": 6.490004103886968e-07,
"loss": 0.8344,
"step": 15950
},
{
"epoch": 0.9365656328779973,
"grad_norm": 0.2158203125,
"learning_rate": 6.34343671220027e-07,
"loss": 0.8912,
"step": 15975
},
{
"epoch": 0.9380313067948642,
"grad_norm": 0.26171875,
"learning_rate": 6.196869320513572e-07,
"loss": 0.8693,
"step": 16000
},
{
"epoch": 0.9394969807117313,
"grad_norm": 0.296875,
"learning_rate": 6.050301928826875e-07,
"loss": 0.897,
"step": 16025
},
{
"epoch": 0.9409626546285982,
"grad_norm": 0.2890625,
"learning_rate": 5.903734537140177e-07,
"loss": 0.8517,
"step": 16050
},
{
"epoch": 0.9424283285454652,
"grad_norm": 0.2197265625,
"learning_rate": 5.75716714545348e-07,
"loss": 0.9227,
"step": 16075
},
{
"epoch": 0.9438940024623321,
"grad_norm": 0.12060546875,
"learning_rate": 5.610599753766782e-07,
"loss": 0.8516,
"step": 16100
},
{
"epoch": 0.9453596763791992,
"grad_norm": 0.451171875,
"learning_rate": 5.464032362080085e-07,
"loss": 0.9763,
"step": 16125
},
{
"epoch": 0.9468253502960662,
"grad_norm": 0.765625,
"learning_rate": 5.317464970393387e-07,
"loss": 0.8946,
"step": 16150
},
{
"epoch": 0.9482910242129331,
"grad_norm": 0.57421875,
"learning_rate": 5.17089757870669e-07,
"loss": 0.7753,
"step": 16175
},
{
"epoch": 0.9497566981298001,
"grad_norm": 0.43359375,
"learning_rate": 5.024330187019992e-07,
"loss": 0.7681,
"step": 16200
},
{
"epoch": 0.951222372046667,
"grad_norm": 0.267578125,
"learning_rate": 4.877762795333295e-07,
"loss": 0.9284,
"step": 16225
},
{
"epoch": 0.9526880459635341,
"grad_norm": 0.447265625,
"learning_rate": 4.731195403646597e-07,
"loss": 0.8657,
"step": 16250
},
{
"epoch": 0.954153719880401,
"grad_norm": 0.546875,
"learning_rate": 4.5846280119598996e-07,
"loss": 0.8839,
"step": 16275
},
{
"epoch": 0.955619393797268,
"grad_norm": 0.1845703125,
"learning_rate": 4.4380606202732016e-07,
"loss": 0.9618,
"step": 16300
},
{
"epoch": 0.9570850677141349,
"grad_norm": 0.2294921875,
"learning_rate": 4.2914932285865046e-07,
"loss": 0.9123,
"step": 16325
},
{
"epoch": 0.958550741631002,
"grad_norm": 0.291015625,
"learning_rate": 4.1449258368998065e-07,
"loss": 1.0314,
"step": 16350
},
{
"epoch": 0.9600164155478689,
"grad_norm": 0.267578125,
"learning_rate": 3.9983584452131095e-07,
"loss": 0.8674,
"step": 16375
},
{
"epoch": 0.9614820894647359,
"grad_norm": 0.318359375,
"learning_rate": 3.8517910535264115e-07,
"loss": 0.8991,
"step": 16400
},
{
"epoch": 0.9629477633816028,
"grad_norm": 0.2412109375,
"learning_rate": 3.7052236618397145e-07,
"loss": 0.8511,
"step": 16425
},
{
"epoch": 0.9644134372984698,
"grad_norm": 1.140625,
"learning_rate": 3.5586562701530164e-07,
"loss": 0.8599,
"step": 16450
},
{
"epoch": 0.9658791112153368,
"grad_norm": 0.279296875,
"learning_rate": 3.4120888784663194e-07,
"loss": 0.8352,
"step": 16475
},
{
"epoch": 0.9673447851322038,
"grad_norm": 0.470703125,
"learning_rate": 3.2655214867796213e-07,
"loss": 0.7832,
"step": 16500
},
{
"epoch": 0.9688104590490708,
"grad_norm": 0.1982421875,
"learning_rate": 3.118954095092924e-07,
"loss": 0.8793,
"step": 16525
},
{
"epoch": 0.9702761329659377,
"grad_norm": 0.306640625,
"learning_rate": 2.972386703406226e-07,
"loss": 0.8602,
"step": 16550
},
{
"epoch": 0.9717418068828048,
"grad_norm": 0.2353515625,
"learning_rate": 2.8258193117195287e-07,
"loss": 0.8972,
"step": 16575
},
{
"epoch": 0.9732074807996717,
"grad_norm": 0.111328125,
"learning_rate": 2.679251920032831e-07,
"loss": 0.8231,
"step": 16600
},
{
"epoch": 0.9746731547165387,
"grad_norm": 0.2734375,
"learning_rate": 2.5326845283461337e-07,
"loss": 0.8406,
"step": 16625
},
{
"epoch": 0.9761388286334056,
"grad_norm": 0.77734375,
"learning_rate": 2.386117136659436e-07,
"loss": 0.9921,
"step": 16650
},
{
"epoch": 0.9776045025502726,
"grad_norm": 0.357421875,
"learning_rate": 2.2395497449727386e-07,
"loss": 0.8458,
"step": 16675
},
{
"epoch": 0.9790701764671396,
"grad_norm": 0.287109375,
"learning_rate": 2.092982353286041e-07,
"loss": 0.9056,
"step": 16700
},
{
"epoch": 0.9805358503840066,
"grad_norm": 0.189453125,
"learning_rate": 1.9464149615993435e-07,
"loss": 0.8358,
"step": 16725
},
{
"epoch": 0.9820015243008735,
"grad_norm": 0.1552734375,
"learning_rate": 1.799847569912646e-07,
"loss": 0.9285,
"step": 16750
},
{
"epoch": 0.9834671982177405,
"grad_norm": 0.248046875,
"learning_rate": 1.6532801782259485e-07,
"loss": 0.8284,
"step": 16775
},
{
"epoch": 0.9849328721346075,
"grad_norm": 0.216796875,
"learning_rate": 1.506712786539251e-07,
"loss": 0.8364,
"step": 16800
},
{
"epoch": 0.9863985460514745,
"grad_norm": 0.326171875,
"learning_rate": 1.3601453948525534e-07,
"loss": 0.9212,
"step": 16825
},
{
"epoch": 0.9878642199683414,
"grad_norm": 0.234375,
"learning_rate": 1.213578003165856e-07,
"loss": 0.8225,
"step": 16850
},
{
"epoch": 0.9893298938852084,
"grad_norm": 0.16015625,
"learning_rate": 1.0670106114791582e-07,
"loss": 0.9779,
"step": 16875
},
{
"epoch": 0.9907955678020754,
"grad_norm": 0.1845703125,
"learning_rate": 9.204432197924607e-08,
"loss": 0.816,
"step": 16900
},
{
"epoch": 0.9922612417189424,
"grad_norm": 0.23046875,
"learning_rate": 7.738758281057632e-08,
"loss": 0.8379,
"step": 16925
},
{
"epoch": 0.9937269156358094,
"grad_norm": 0.275390625,
"learning_rate": 6.273084364190655e-08,
"loss": 0.8998,
"step": 16950
},
{
"epoch": 0.9951925895526763,
"grad_norm": 0.34375,
"learning_rate": 4.80741044732368e-08,
"loss": 0.9126,
"step": 16975
},
{
"epoch": 0.9966582634695433,
"grad_norm": 0.275390625,
"learning_rate": 3.341736530456704e-08,
"loss": 0.8822,
"step": 17000
},
{
"epoch": 0.9981239373864103,
"grad_norm": 0.466796875,
"learning_rate": 1.8760626135897286e-08,
"loss": 0.8701,
"step": 17025
},
{
"epoch": 0.9995896113032773,
"grad_norm": 4.75,
"learning_rate": 4.103886967227532e-09,
"loss": 0.9079,
"step": 17050
}
],
"logging_steps": 25,
"max_steps": 17057,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.4293390811789451e+19,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}
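
The object above is the complete trainer state written by the Hugging Face Trainer: `log_history` holds one record per logging step (epoch, grad_norm, learning_rate, loss, step), followed by the run-level fields (logging_steps, max_steps, save_steps, total_flos, and so on). As a convenience, here is a minimal, hedged sketch of how one might load this file and summarize the logged loss curve. It assumes the file is saved locally under the Trainer's default name `trainer_state.json`; adjust the path as needed. Only the Python standard library is used.

```python
import json

# Minimal sketch: read the trainer state produced by the Hugging Face Trainer
# and summarize the logged loss curve. The filename below is an assumption
# (the Trainer's usual default); point it at wherever this file actually lives.
with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only log entries that carry a training loss (the records shown above).
history = [h for h in state["log_history"] if "loss" in h]

steps = [h["step"] for h in history]
losses = [h["loss"] for h in history]

print(f"logged points : {len(history)} (every {state['logging_steps']} steps)")
print(f"first loss    : {losses[0]:.4f} at step {steps[0]}")
print(f"last loss     : {losses[-1]:.4f} at step {steps[-1]}")
best = min(range(len(losses)), key=losses.__getitem__)
print(f"lowest loss   : {losses[best]:.4f} at step {steps[best]} of {state['max_steps']}")
```

The same `history` list can be fed to any plotting library to visualize the loss and learning-rate schedules; the summary above sticks to the standard library so the sketch stays self-contained.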