iamTangsang's picture
Complete Epoch 4
0a6e8dc verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.9998452192082965,
"eval_steps": 12921,
"global_step": 103368,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.003869519792593739,
"grad_norm": 773.3179931640625,
"learning_rate": 1.9992260960414815e-05,
"loss": 17.8357,
"step": 100
},
{
"epoch": 0.007739039585187478,
"grad_norm": 327.2199401855469,
"learning_rate": 1.9984521920829628e-05,
"loss": 11.1504,
"step": 200
},
{
"epoch": 0.011608559377781217,
"grad_norm": 78.84376525878906,
"learning_rate": 1.9976782881244438e-05,
"loss": 8.0846,
"step": 300
},
{
"epoch": 0.015478079170374956,
"grad_norm": 78.74186706542969,
"learning_rate": 1.9969043841659254e-05,
"loss": 6.1805,
"step": 400
},
{
"epoch": 0.019347598962968695,
"grad_norm": 10.000066757202148,
"learning_rate": 1.9961304802074064e-05,
"loss": 4.9342,
"step": 500
},
{
"epoch": 0.023217118755562434,
"grad_norm": 11.822905540466309,
"learning_rate": 1.9953565762488877e-05,
"loss": 4.1502,
"step": 600
},
{
"epoch": 0.027086638548156173,
"grad_norm": 7.014156818389893,
"learning_rate": 1.994582672290369e-05,
"loss": 3.7055,
"step": 700
},
{
"epoch": 0.030956158340749912,
"grad_norm": 9.273381233215332,
"learning_rate": 1.99380876833185e-05,
"loss": 3.3966,
"step": 800
},
{
"epoch": 0.034825678133343654,
"grad_norm": 8.681159973144531,
"learning_rate": 1.9930348643733316e-05,
"loss": 3.1075,
"step": 900
},
{
"epoch": 0.03869519792593739,
"grad_norm": 5.212049961090088,
"learning_rate": 1.9922609604148126e-05,
"loss": 2.9555,
"step": 1000
},
{
"epoch": 0.04256471771853113,
"grad_norm": 5.840594291687012,
"learning_rate": 1.991487056456294e-05,
"loss": 2.8528,
"step": 1100
},
{
"epoch": 0.04643423751112487,
"grad_norm": 5.89320707321167,
"learning_rate": 1.9907131524977752e-05,
"loss": 2.6982,
"step": 1200
},
{
"epoch": 0.05030375730371861,
"grad_norm": 9.517736434936523,
"learning_rate": 1.9899392485392565e-05,
"loss": 2.5849,
"step": 1300
},
{
"epoch": 0.054173277096312346,
"grad_norm": 10.960714340209961,
"learning_rate": 1.989165344580738e-05,
"loss": 2.5084,
"step": 1400
},
{
"epoch": 0.05804279688890609,
"grad_norm": 5.337602615356445,
"learning_rate": 1.9883914406222188e-05,
"loss": 2.4456,
"step": 1500
},
{
"epoch": 0.061912316681499824,
"grad_norm": 7.735531806945801,
"learning_rate": 1.9876175366637e-05,
"loss": 2.33,
"step": 1600
},
{
"epoch": 0.06578183647409357,
"grad_norm": 7.02548885345459,
"learning_rate": 1.9868436327051814e-05,
"loss": 2.3016,
"step": 1700
},
{
"epoch": 0.06965135626668731,
"grad_norm": 4.705838680267334,
"learning_rate": 1.9860697287466627e-05,
"loss": 2.2375,
"step": 1800
},
{
"epoch": 0.07352087605928104,
"grad_norm": 4.2120256423950195,
"learning_rate": 1.985295824788144e-05,
"loss": 2.2007,
"step": 1900
},
{
"epoch": 0.07739039585187478,
"grad_norm": 6.315709114074707,
"learning_rate": 1.984521920829625e-05,
"loss": 2.161,
"step": 2000
},
{
"epoch": 0.08125991564446852,
"grad_norm": 5.227051734924316,
"learning_rate": 1.9837480168711067e-05,
"loss": 2.0787,
"step": 2100
},
{
"epoch": 0.08512943543706226,
"grad_norm": 7.011941432952881,
"learning_rate": 1.9829741129125876e-05,
"loss": 2.0563,
"step": 2200
},
{
"epoch": 0.088998955229656,
"grad_norm": 6.2574143409729,
"learning_rate": 1.982200208954069e-05,
"loss": 1.9629,
"step": 2300
},
{
"epoch": 0.09286847502224974,
"grad_norm": 6.2574381828308105,
"learning_rate": 1.9814263049955503e-05,
"loss": 1.9871,
"step": 2400
},
{
"epoch": 0.09673799481484348,
"grad_norm": 6.882974147796631,
"learning_rate": 1.9806524010370312e-05,
"loss": 1.9569,
"step": 2500
},
{
"epoch": 0.10060751460743722,
"grad_norm": 7.7587199211120605,
"learning_rate": 1.979878497078513e-05,
"loss": 1.8799,
"step": 2600
},
{
"epoch": 0.10447703440003096,
"grad_norm": 4.561262607574463,
"learning_rate": 1.979104593119994e-05,
"loss": 1.8605,
"step": 2700
},
{
"epoch": 0.10834655419262469,
"grad_norm": 7.243245601654053,
"learning_rate": 1.978330689161475e-05,
"loss": 1.8433,
"step": 2800
},
{
"epoch": 0.11221607398521843,
"grad_norm": 5.04033088684082,
"learning_rate": 1.9775567852029565e-05,
"loss": 1.8112,
"step": 2900
},
{
"epoch": 0.11608559377781218,
"grad_norm": 4.307803630828857,
"learning_rate": 1.9767828812444378e-05,
"loss": 1.7924,
"step": 3000
},
{
"epoch": 0.11995511357040592,
"grad_norm": 5.578695774078369,
"learning_rate": 1.976008977285919e-05,
"loss": 1.8002,
"step": 3100
},
{
"epoch": 0.12382463336299965,
"grad_norm": 5.035937309265137,
"learning_rate": 1.9752350733274e-05,
"loss": 1.7425,
"step": 3200
},
{
"epoch": 0.1276941531555934,
"grad_norm": 5.792506694793701,
"learning_rate": 1.9744611693688817e-05,
"loss": 1.6971,
"step": 3300
},
{
"epoch": 0.13156367294818713,
"grad_norm": 3.9911370277404785,
"learning_rate": 1.9736872654103627e-05,
"loss": 1.673,
"step": 3400
},
{
"epoch": 0.13543319274078086,
"grad_norm": 6.748035907745361,
"learning_rate": 1.972913361451844e-05,
"loss": 1.673,
"step": 3500
},
{
"epoch": 0.13930271253337462,
"grad_norm": 6.547277450561523,
"learning_rate": 1.9721394574933253e-05,
"loss": 1.6237,
"step": 3600
},
{
"epoch": 0.14317223232596835,
"grad_norm": 4.524691581726074,
"learning_rate": 1.9713655535348063e-05,
"loss": 1.6557,
"step": 3700
},
{
"epoch": 0.14704175211856207,
"grad_norm": 4.387489318847656,
"learning_rate": 1.970591649576288e-05,
"loss": 1.611,
"step": 3800
},
{
"epoch": 0.15091127191115583,
"grad_norm": 4.25224494934082,
"learning_rate": 1.969817745617769e-05,
"loss": 1.5658,
"step": 3900
},
{
"epoch": 0.15478079170374956,
"grad_norm": 3.9016988277435303,
"learning_rate": 1.9690438416592502e-05,
"loss": 1.5737,
"step": 4000
},
{
"epoch": 0.15865031149634332,
"grad_norm": 4.547232151031494,
"learning_rate": 1.9682699377007315e-05,
"loss": 1.5481,
"step": 4100
},
{
"epoch": 0.16251983128893704,
"grad_norm": 5.446467399597168,
"learning_rate": 1.9674960337422125e-05,
"loss": 1.5275,
"step": 4200
},
{
"epoch": 0.16638935108153077,
"grad_norm": 5.614628791809082,
"learning_rate": 1.966722129783694e-05,
"loss": 1.519,
"step": 4300
},
{
"epoch": 0.17025887087412453,
"grad_norm": 9.042234420776367,
"learning_rate": 1.965948225825175e-05,
"loss": 1.4762,
"step": 4400
},
{
"epoch": 0.17412839066671826,
"grad_norm": 3.2087180614471436,
"learning_rate": 1.9651743218666564e-05,
"loss": 1.5133,
"step": 4500
},
{
"epoch": 0.177997910459312,
"grad_norm": 4.113743305206299,
"learning_rate": 1.9644004179081377e-05,
"loss": 1.4572,
"step": 4600
},
{
"epoch": 0.18186743025190574,
"grad_norm": 3.658928155899048,
"learning_rate": 1.963626513949619e-05,
"loss": 1.4591,
"step": 4700
},
{
"epoch": 0.18573695004449947,
"grad_norm": 3.779625177383423,
"learning_rate": 1.9628526099911004e-05,
"loss": 1.4307,
"step": 4800
},
{
"epoch": 0.18960646983709323,
"grad_norm": 3.7647228240966797,
"learning_rate": 1.9620787060325813e-05,
"loss": 1.4438,
"step": 4900
},
{
"epoch": 0.19347598962968696,
"grad_norm": 3.0192203521728516,
"learning_rate": 1.961304802074063e-05,
"loss": 1.53,
"step": 5000
},
{
"epoch": 0.19734550942228068,
"grad_norm": 4.725248336791992,
"learning_rate": 1.960530898115544e-05,
"loss": 1.3875,
"step": 5100
},
{
"epoch": 0.20121502921487444,
"grad_norm": 4.87757682800293,
"learning_rate": 1.9597569941570253e-05,
"loss": 1.3919,
"step": 5200
},
{
"epoch": 0.20508454900746817,
"grad_norm": 6.212441921234131,
"learning_rate": 1.9589830901985066e-05,
"loss": 1.3626,
"step": 5300
},
{
"epoch": 0.20895406880006193,
"grad_norm": 6.060988903045654,
"learning_rate": 1.9582091862399875e-05,
"loss": 1.3816,
"step": 5400
},
{
"epoch": 0.21282358859265565,
"grad_norm": 3.0998196601867676,
"learning_rate": 1.9574352822814692e-05,
"loss": 1.2983,
"step": 5500
},
{
"epoch": 0.21669310838524938,
"grad_norm": 9.51986026763916,
"learning_rate": 1.95666137832295e-05,
"loss": 1.306,
"step": 5600
},
{
"epoch": 0.22056262817784314,
"grad_norm": 3.9112932682037354,
"learning_rate": 1.9558874743644315e-05,
"loss": 1.2809,
"step": 5700
},
{
"epoch": 0.22443214797043687,
"grad_norm": 3.84000563621521,
"learning_rate": 1.9551135704059128e-05,
"loss": 1.3083,
"step": 5800
},
{
"epoch": 0.2283016677630306,
"grad_norm": 4.876347541809082,
"learning_rate": 1.954339666447394e-05,
"loss": 1.2847,
"step": 5900
},
{
"epoch": 0.23217118755562435,
"grad_norm": 4.156209468841553,
"learning_rate": 1.9535657624888754e-05,
"loss": 1.2499,
"step": 6000
},
{
"epoch": 0.23604070734821808,
"grad_norm": 4.460844039916992,
"learning_rate": 1.9527918585303564e-05,
"loss": 1.2547,
"step": 6100
},
{
"epoch": 0.23991022714081184,
"grad_norm": 3.557577133178711,
"learning_rate": 1.9520179545718377e-05,
"loss": 1.2362,
"step": 6200
},
{
"epoch": 0.24377974693340557,
"grad_norm": 4.56212854385376,
"learning_rate": 1.951244050613319e-05,
"loss": 1.2362,
"step": 6300
},
{
"epoch": 0.2476492667259993,
"grad_norm": 2.713700294494629,
"learning_rate": 1.9504701466548003e-05,
"loss": 1.2355,
"step": 6400
},
{
"epoch": 0.25151878651859305,
"grad_norm": 3.288674831390381,
"learning_rate": 1.9496962426962816e-05,
"loss": 1.2082,
"step": 6500
},
{
"epoch": 0.2553883063111868,
"grad_norm": 3.3763837814331055,
"learning_rate": 1.9489223387377626e-05,
"loss": 1.2144,
"step": 6600
},
{
"epoch": 0.2592578261037805,
"grad_norm": 4.2473297119140625,
"learning_rate": 1.9481484347792442e-05,
"loss": 1.2034,
"step": 6700
},
{
"epoch": 0.26312734589637427,
"grad_norm": 3.2804677486419678,
"learning_rate": 1.9473745308207252e-05,
"loss": 1.1913,
"step": 6800
},
{
"epoch": 0.266996865688968,
"grad_norm": 4.3531060218811035,
"learning_rate": 1.9466006268622065e-05,
"loss": 1.1794,
"step": 6900
},
{
"epoch": 0.2708663854815617,
"grad_norm": 4.2846503257751465,
"learning_rate": 1.9458267229036878e-05,
"loss": 1.2,
"step": 7000
},
{
"epoch": 0.2747359052741555,
"grad_norm": 4.860626220703125,
"learning_rate": 1.945052818945169e-05,
"loss": 1.1758,
"step": 7100
},
{
"epoch": 0.27860542506674923,
"grad_norm": 5.262426376342773,
"learning_rate": 1.9442789149866504e-05,
"loss": 1.1542,
"step": 7200
},
{
"epoch": 0.28247494485934294,
"grad_norm": 2.705568552017212,
"learning_rate": 1.9435050110281314e-05,
"loss": 1.1487,
"step": 7300
},
{
"epoch": 0.2863444646519367,
"grad_norm": 2.9500765800476074,
"learning_rate": 1.9427311070696127e-05,
"loss": 1.15,
"step": 7400
},
{
"epoch": 0.29021398444453045,
"grad_norm": 3.6077990531921387,
"learning_rate": 1.941957203111094e-05,
"loss": 1.1282,
"step": 7500
},
{
"epoch": 0.29408350423712415,
"grad_norm": 4.17680549621582,
"learning_rate": 1.9411832991525753e-05,
"loss": 1.122,
"step": 7600
},
{
"epoch": 0.2979530240297179,
"grad_norm": 4.31356954574585,
"learning_rate": 1.9404093951940567e-05,
"loss": 1.1312,
"step": 7700
},
{
"epoch": 0.30182254382231166,
"grad_norm": 4.263576984405518,
"learning_rate": 1.9396354912355376e-05,
"loss": 1.0749,
"step": 7800
},
{
"epoch": 0.30569206361490536,
"grad_norm": 5.233335971832275,
"learning_rate": 1.9388615872770193e-05,
"loss": 1.1188,
"step": 7900
},
{
"epoch": 0.3095615834074991,
"grad_norm": 4.0638747215271,
"learning_rate": 1.9380876833185003e-05,
"loss": 1.0796,
"step": 8000
},
{
"epoch": 0.3134311032000929,
"grad_norm": 3.467556953430176,
"learning_rate": 1.9373137793599816e-05,
"loss": 1.0611,
"step": 8100
},
{
"epoch": 0.31730062299268663,
"grad_norm": 3.4867913722991943,
"learning_rate": 1.936539875401463e-05,
"loss": 1.1273,
"step": 8200
},
{
"epoch": 0.32117014278528033,
"grad_norm": 5.139125347137451,
"learning_rate": 1.935765971442944e-05,
"loss": 1.076,
"step": 8300
},
{
"epoch": 0.3250396625778741,
"grad_norm": 2.705803632736206,
"learning_rate": 1.9349920674844255e-05,
"loss": 1.0684,
"step": 8400
},
{
"epoch": 0.32890918237046785,
"grad_norm": 3.1156651973724365,
"learning_rate": 1.9342181635259065e-05,
"loss": 1.046,
"step": 8500
},
{
"epoch": 0.33277870216306155,
"grad_norm": 2.9857335090637207,
"learning_rate": 1.9334442595673878e-05,
"loss": 1.057,
"step": 8600
},
{
"epoch": 0.3366482219556553,
"grad_norm": 4.149692535400391,
"learning_rate": 1.932670355608869e-05,
"loss": 1.0911,
"step": 8700
},
{
"epoch": 0.34051774174824906,
"grad_norm": 3.3451409339904785,
"learning_rate": 1.9318964516503504e-05,
"loss": 1.0637,
"step": 8800
},
{
"epoch": 0.34438726154084276,
"grad_norm": 4.110273361206055,
"learning_rate": 1.9311225476918317e-05,
"loss": 1.0242,
"step": 8900
},
{
"epoch": 0.3482567813334365,
"grad_norm": 2.951718807220459,
"learning_rate": 1.9303486437333127e-05,
"loss": 1.0712,
"step": 9000
},
{
"epoch": 0.35212630112603027,
"grad_norm": 3.8159780502319336,
"learning_rate": 1.929574739774794e-05,
"loss": 1.0474,
"step": 9100
},
{
"epoch": 0.355995820918624,
"grad_norm": 2.9727439880371094,
"learning_rate": 1.9288008358162753e-05,
"loss": 1.0247,
"step": 9200
},
{
"epoch": 0.35986534071121773,
"grad_norm": 3.576489210128784,
"learning_rate": 1.9280269318577566e-05,
"loss": 0.995,
"step": 9300
},
{
"epoch": 0.3637348605038115,
"grad_norm": 2.537593364715576,
"learning_rate": 1.927253027899238e-05,
"loss": 1.0405,
"step": 9400
},
{
"epoch": 0.36760438029640524,
"grad_norm": 5.178198337554932,
"learning_rate": 1.926479123940719e-05,
"loss": 1.0071,
"step": 9500
},
{
"epoch": 0.37147390008899894,
"grad_norm": 3.5340261459350586,
"learning_rate": 1.9257052199822005e-05,
"loss": 1.0146,
"step": 9600
},
{
"epoch": 0.3753434198815927,
"grad_norm": 4.672996997833252,
"learning_rate": 1.9249313160236815e-05,
"loss": 1.0172,
"step": 9700
},
{
"epoch": 0.37921293967418646,
"grad_norm": 2.453150510787964,
"learning_rate": 1.9241574120651628e-05,
"loss": 0.9778,
"step": 9800
},
{
"epoch": 0.38308245946678016,
"grad_norm": 2.8062591552734375,
"learning_rate": 1.923383508106644e-05,
"loss": 1.0202,
"step": 9900
},
{
"epoch": 0.3869519792593739,
"grad_norm": 3.405714750289917,
"learning_rate": 1.9226096041481254e-05,
"loss": 0.967,
"step": 10000
},
{
"epoch": 0.39082149905196767,
"grad_norm": 2.818267583847046,
"learning_rate": 1.9218357001896068e-05,
"loss": 0.9691,
"step": 10100
},
{
"epoch": 0.39469101884456137,
"grad_norm": 3.632185935974121,
"learning_rate": 1.9210617962310877e-05,
"loss": 0.9607,
"step": 10200
},
{
"epoch": 0.3985605386371551,
"grad_norm": 4.022298812866211,
"learning_rate": 1.920287892272569e-05,
"loss": 0.9908,
"step": 10300
},
{
"epoch": 0.4024300584297489,
"grad_norm": 3.376326322555542,
"learning_rate": 1.9195139883140503e-05,
"loss": 0.9677,
"step": 10400
},
{
"epoch": 0.4062995782223426,
"grad_norm": 4.996184825897217,
"learning_rate": 1.9187400843555317e-05,
"loss": 0.9734,
"step": 10500
},
{
"epoch": 0.41016909801493634,
"grad_norm": 3.2259700298309326,
"learning_rate": 1.917966180397013e-05,
"loss": 0.9614,
"step": 10600
},
{
"epoch": 0.4140386178075301,
"grad_norm": 2.585890531539917,
"learning_rate": 1.917192276438494e-05,
"loss": 0.9764,
"step": 10700
},
{
"epoch": 0.41790813760012385,
"grad_norm": 4.138501167297363,
"learning_rate": 1.9164183724799756e-05,
"loss": 0.9556,
"step": 10800
},
{
"epoch": 0.42177765739271755,
"grad_norm": 3.0705041885375977,
"learning_rate": 1.9156444685214566e-05,
"loss": 0.9427,
"step": 10900
},
{
"epoch": 0.4256471771853113,
"grad_norm": 2.94376802444458,
"learning_rate": 1.914870564562938e-05,
"loss": 0.9299,
"step": 11000
},
{
"epoch": 0.42951669697790507,
"grad_norm": 3.2788572311401367,
"learning_rate": 1.9140966606044192e-05,
"loss": 0.9562,
"step": 11100
},
{
"epoch": 0.43338621677049877,
"grad_norm": 2.8515307903289795,
"learning_rate": 1.9133227566459005e-05,
"loss": 0.9731,
"step": 11200
},
{
"epoch": 0.4372557365630925,
"grad_norm": 4.263568878173828,
"learning_rate": 1.9125488526873818e-05,
"loss": 0.9388,
"step": 11300
},
{
"epoch": 0.4411252563556863,
"grad_norm": 2.5887269973754883,
"learning_rate": 1.9117749487288628e-05,
"loss": 0.943,
"step": 11400
},
{
"epoch": 0.44499477614828,
"grad_norm": 4.111998081207275,
"learning_rate": 1.911001044770344e-05,
"loss": 0.935,
"step": 11500
},
{
"epoch": 0.44886429594087374,
"grad_norm": 3.5757904052734375,
"learning_rate": 1.9102271408118254e-05,
"loss": 0.9166,
"step": 11600
},
{
"epoch": 0.4527338157334675,
"grad_norm": 3.2234575748443604,
"learning_rate": 1.9094532368533067e-05,
"loss": 0.9086,
"step": 11700
},
{
"epoch": 0.4566033355260612,
"grad_norm": 2.5244109630584717,
"learning_rate": 1.908679332894788e-05,
"loss": 0.8732,
"step": 11800
},
{
"epoch": 0.46047285531865495,
"grad_norm": 2.221653699874878,
"learning_rate": 1.907905428936269e-05,
"loss": 0.9143,
"step": 11900
},
{
"epoch": 0.4643423751112487,
"grad_norm": 3.898876428604126,
"learning_rate": 1.9071315249777503e-05,
"loss": 0.9297,
"step": 12000
},
{
"epoch": 0.4682118949038424,
"grad_norm": 3.2549328804016113,
"learning_rate": 1.9063576210192316e-05,
"loss": 0.8753,
"step": 12100
},
{
"epoch": 0.47208141469643616,
"grad_norm": 4.246420383453369,
"learning_rate": 1.905583717060713e-05,
"loss": 0.9222,
"step": 12200
},
{
"epoch": 0.4759509344890299,
"grad_norm": 2.8742761611938477,
"learning_rate": 1.9048098131021942e-05,
"loss": 0.9314,
"step": 12300
},
{
"epoch": 0.4798204542816237,
"grad_norm": 3.1842613220214844,
"learning_rate": 1.9040359091436755e-05,
"loss": 0.9167,
"step": 12400
},
{
"epoch": 0.4836899740742174,
"grad_norm": 2.5616085529327393,
"learning_rate": 1.903262005185157e-05,
"loss": 0.9089,
"step": 12500
},
{
"epoch": 0.48755949386681113,
"grad_norm": 2.4772250652313232,
"learning_rate": 1.9024881012266378e-05,
"loss": 0.8641,
"step": 12600
},
{
"epoch": 0.4914290136594049,
"grad_norm": 2.5282464027404785,
"learning_rate": 1.901714197268119e-05,
"loss": 0.8717,
"step": 12700
},
{
"epoch": 0.4952985334519986,
"grad_norm": 3.940554618835449,
"learning_rate": 1.9009402933096004e-05,
"loss": 0.9078,
"step": 12800
},
{
"epoch": 0.49916805324459235,
"grad_norm": 4.964881420135498,
"learning_rate": 1.9001663893510818e-05,
"loss": 0.8716,
"step": 12900
},
{
"epoch": 0.499980652401037,
"eval_loss": 0.560674250125885,
"eval_runtime": 73.0307,
"eval_samples_per_second": 28.604,
"eval_steps_per_second": 3.588,
"step": 12921
},
{
"epoch": 0.5030375730371861,
"grad_norm": 3.7385573387145996,
"learning_rate": 1.899392485392563e-05,
"loss": 0.8925,
"step": 13000
},
{
"epoch": 0.5069070928297799,
"grad_norm": 2.8294308185577393,
"learning_rate": 1.898618581434044e-05,
"loss": 0.8573,
"step": 13100
},
{
"epoch": 0.5107766126223736,
"grad_norm": 2.6944656372070312,
"learning_rate": 1.8978446774755253e-05,
"loss": 0.8607,
"step": 13200
},
{
"epoch": 0.5146461324149673,
"grad_norm": 3.5395185947418213,
"learning_rate": 1.8970707735170067e-05,
"loss": 0.8417,
"step": 13300
},
{
"epoch": 0.518515652207561,
"grad_norm": 3.0177690982818604,
"learning_rate": 1.896296869558488e-05,
"loss": 0.8779,
"step": 13400
},
{
"epoch": 0.5223851720001548,
"grad_norm": 2.879098653793335,
"learning_rate": 1.8955229655999693e-05,
"loss": 0.8702,
"step": 13500
},
{
"epoch": 0.5262546917927485,
"grad_norm": 3.593395709991455,
"learning_rate": 1.8947490616414506e-05,
"loss": 0.8575,
"step": 13600
},
{
"epoch": 0.5301242115853423,
"grad_norm": 4.0405120849609375,
"learning_rate": 1.8939751576829316e-05,
"loss": 0.8643,
"step": 13700
},
{
"epoch": 0.533993731377936,
"grad_norm": 3.6186561584472656,
"learning_rate": 1.893201253724413e-05,
"loss": 0.8688,
"step": 13800
},
{
"epoch": 0.5378632511705297,
"grad_norm": 3.501770496368408,
"learning_rate": 1.8924273497658942e-05,
"loss": 0.8513,
"step": 13900
},
{
"epoch": 0.5417327709631234,
"grad_norm": 3.8322582244873047,
"learning_rate": 1.8916534458073755e-05,
"loss": 0.8346,
"step": 14000
},
{
"epoch": 0.5456022907557172,
"grad_norm": 4.2960076332092285,
"learning_rate": 1.8908795418488568e-05,
"loss": 0.8712,
"step": 14100
},
{
"epoch": 0.549471810548311,
"grad_norm": 2.9287805557250977,
"learning_rate": 1.890105637890338e-05,
"loss": 0.8124,
"step": 14200
},
{
"epoch": 0.5533413303409047,
"grad_norm": 3.2405707836151123,
"learning_rate": 1.889331733931819e-05,
"loss": 0.8415,
"step": 14300
},
{
"epoch": 0.5572108501334985,
"grad_norm": 3.691862106323242,
"learning_rate": 1.8885578299733004e-05,
"loss": 0.8619,
"step": 14400
},
{
"epoch": 0.5610803699260922,
"grad_norm": 2.882659912109375,
"learning_rate": 1.8877839260147817e-05,
"loss": 0.8455,
"step": 14500
},
{
"epoch": 0.5649498897186859,
"grad_norm": 2.3164443969726562,
"learning_rate": 1.887010022056263e-05,
"loss": 0.8698,
"step": 14600
},
{
"epoch": 0.5688194095112796,
"grad_norm": 3.544872760772705,
"learning_rate": 1.8862361180977443e-05,
"loss": 0.8286,
"step": 14700
},
{
"epoch": 0.5726889293038734,
"grad_norm": 3.035102605819702,
"learning_rate": 1.8854622141392253e-05,
"loss": 0.8389,
"step": 14800
},
{
"epoch": 0.5765584490964671,
"grad_norm": 2.7800323963165283,
"learning_rate": 1.8846883101807066e-05,
"loss": 0.8363,
"step": 14900
},
{
"epoch": 0.5804279688890609,
"grad_norm": 3.638786554336548,
"learning_rate": 1.883914406222188e-05,
"loss": 0.8323,
"step": 15000
},
{
"epoch": 0.5842974886816547,
"grad_norm": 2.910550117492676,
"learning_rate": 1.8831405022636692e-05,
"loss": 0.8237,
"step": 15100
},
{
"epoch": 0.5881670084742483,
"grad_norm": 4.035897254943848,
"learning_rate": 1.8823665983051505e-05,
"loss": 0.8124,
"step": 15200
},
{
"epoch": 0.592036528266842,
"grad_norm": 3.7201926708221436,
"learning_rate": 1.881592694346632e-05,
"loss": 0.8191,
"step": 15300
},
{
"epoch": 0.5959060480594358,
"grad_norm": 2.1671581268310547,
"learning_rate": 1.880818790388113e-05,
"loss": 0.8263,
"step": 15400
},
{
"epoch": 0.5997755678520296,
"grad_norm": 4.754736423492432,
"learning_rate": 1.880044886429594e-05,
"loss": 0.8119,
"step": 15500
},
{
"epoch": 0.6036450876446233,
"grad_norm": 2.688183069229126,
"learning_rate": 1.8792709824710754e-05,
"loss": 0.7925,
"step": 15600
},
{
"epoch": 0.6075146074372171,
"grad_norm": 2.091325283050537,
"learning_rate": 1.8784970785125567e-05,
"loss": 0.8011,
"step": 15700
},
{
"epoch": 0.6113841272298107,
"grad_norm": 3.1910102367401123,
"learning_rate": 1.877723174554038e-05,
"loss": 0.7847,
"step": 15800
},
{
"epoch": 0.6152536470224045,
"grad_norm": 2.0929980278015137,
"learning_rate": 1.8769492705955194e-05,
"loss": 0.8159,
"step": 15900
},
{
"epoch": 0.6191231668149982,
"grad_norm": 3.227985382080078,
"learning_rate": 1.8761753666370003e-05,
"loss": 0.7704,
"step": 16000
},
{
"epoch": 0.622992686607592,
"grad_norm": 3.1534597873687744,
"learning_rate": 1.8754014626784817e-05,
"loss": 0.8024,
"step": 16100
},
{
"epoch": 0.6268622064001858,
"grad_norm": 3.8482093811035156,
"learning_rate": 1.874627558719963e-05,
"loss": 0.7787,
"step": 16200
},
{
"epoch": 0.6307317261927795,
"grad_norm": 3.0121939182281494,
"learning_rate": 1.8738536547614443e-05,
"loss": 0.79,
"step": 16300
},
{
"epoch": 0.6346012459853733,
"grad_norm": 2.6242785453796387,
"learning_rate": 1.8730797508029256e-05,
"loss": 0.7814,
"step": 16400
},
{
"epoch": 0.6384707657779669,
"grad_norm": 5.697305679321289,
"learning_rate": 1.872305846844407e-05,
"loss": 0.7978,
"step": 16500
},
{
"epoch": 0.6423402855705607,
"grad_norm": 2.5282928943634033,
"learning_rate": 1.871531942885888e-05,
"loss": 0.7724,
"step": 16600
},
{
"epoch": 0.6462098053631544,
"grad_norm": 3.4947257041931152,
"learning_rate": 1.8707580389273692e-05,
"loss": 0.7707,
"step": 16700
},
{
"epoch": 0.6500793251557482,
"grad_norm": 3.6135666370391846,
"learning_rate": 1.8699841349688505e-05,
"loss": 0.7615,
"step": 16800
},
{
"epoch": 0.6539488449483419,
"grad_norm": 3.657271385192871,
"learning_rate": 1.8692102310103318e-05,
"loss": 0.7916,
"step": 16900
},
{
"epoch": 0.6578183647409357,
"grad_norm": 2.918696403503418,
"learning_rate": 1.868436327051813e-05,
"loss": 0.7914,
"step": 17000
},
{
"epoch": 0.6616878845335293,
"grad_norm": 2.597545862197876,
"learning_rate": 1.8676624230932944e-05,
"loss": 0.7772,
"step": 17100
},
{
"epoch": 0.6655574043261231,
"grad_norm": 2.5721652507781982,
"learning_rate": 1.8668885191347754e-05,
"loss": 0.8055,
"step": 17200
},
{
"epoch": 0.6694269241187168,
"grad_norm": 2.5288245677948,
"learning_rate": 1.8661146151762567e-05,
"loss": 0.7746,
"step": 17300
},
{
"epoch": 0.6732964439113106,
"grad_norm": 3.3261797428131104,
"learning_rate": 1.865340711217738e-05,
"loss": 0.7615,
"step": 17400
},
{
"epoch": 0.6771659637039044,
"grad_norm": 3.3527183532714844,
"learning_rate": 1.8645668072592193e-05,
"loss": 0.7824,
"step": 17500
},
{
"epoch": 0.6810354834964981,
"grad_norm": 3.0245845317840576,
"learning_rate": 1.8637929033007006e-05,
"loss": 0.7901,
"step": 17600
},
{
"epoch": 0.6849050032890919,
"grad_norm": 4.284315586090088,
"learning_rate": 1.863018999342182e-05,
"loss": 0.7635,
"step": 17700
},
{
"epoch": 0.6887745230816855,
"grad_norm": 2.9628872871398926,
"learning_rate": 1.862245095383663e-05,
"loss": 0.7715,
"step": 17800
},
{
"epoch": 0.6926440428742793,
"grad_norm": 3.209158182144165,
"learning_rate": 1.8614711914251442e-05,
"loss": 0.7549,
"step": 17900
},
{
"epoch": 0.696513562666873,
"grad_norm": 2.8341355323791504,
"learning_rate": 1.8606972874666255e-05,
"loss": 0.745,
"step": 18000
},
{
"epoch": 0.7003830824594668,
"grad_norm": 3.9047954082489014,
"learning_rate": 1.859923383508107e-05,
"loss": 0.7398,
"step": 18100
},
{
"epoch": 0.7042526022520605,
"grad_norm": 2.646315097808838,
"learning_rate": 1.859149479549588e-05,
"loss": 0.7381,
"step": 18200
},
{
"epoch": 0.7081221220446543,
"grad_norm": 2.5596678256988525,
"learning_rate": 1.858375575591069e-05,
"loss": 0.746,
"step": 18300
},
{
"epoch": 0.711991641837248,
"grad_norm": 2.578354597091675,
"learning_rate": 1.8576016716325504e-05,
"loss": 0.7123,
"step": 18400
},
{
"epoch": 0.7158611616298417,
"grad_norm": 2.1996634006500244,
"learning_rate": 1.8568277676740317e-05,
"loss": 0.75,
"step": 18500
},
{
"epoch": 0.7197306814224355,
"grad_norm": 3.1088805198669434,
"learning_rate": 1.856053863715513e-05,
"loss": 0.7563,
"step": 18600
},
{
"epoch": 0.7236002012150292,
"grad_norm": 2.6386542320251465,
"learning_rate": 1.8552799597569944e-05,
"loss": 0.7485,
"step": 18700
},
{
"epoch": 0.727469721007623,
"grad_norm": 3.407057523727417,
"learning_rate": 1.8545060557984757e-05,
"loss": 0.7177,
"step": 18800
},
{
"epoch": 0.7313392408002167,
"grad_norm": 3.8691439628601074,
"learning_rate": 1.853732151839957e-05,
"loss": 0.7687,
"step": 18900
},
{
"epoch": 0.7352087605928105,
"grad_norm": 1.9367045164108276,
"learning_rate": 1.852958247881438e-05,
"loss": 0.7425,
"step": 19000
},
{
"epoch": 0.7390782803854041,
"grad_norm": 6.34794807434082,
"learning_rate": 1.8521843439229193e-05,
"loss": 0.7474,
"step": 19100
},
{
"epoch": 0.7429478001779979,
"grad_norm": 2.6807470321655273,
"learning_rate": 1.8514104399644006e-05,
"loss": 0.7401,
"step": 19200
},
{
"epoch": 0.7468173199705916,
"grad_norm": 1.9673856496810913,
"learning_rate": 1.850636536005882e-05,
"loss": 0.7242,
"step": 19300
},
{
"epoch": 0.7506868397631854,
"grad_norm": 3.6721489429473877,
"learning_rate": 1.8498626320473632e-05,
"loss": 0.7148,
"step": 19400
},
{
"epoch": 0.7545563595557792,
"grad_norm": 2.662858724594116,
"learning_rate": 1.8490887280888442e-05,
"loss": 0.7486,
"step": 19500
},
{
"epoch": 0.7584258793483729,
"grad_norm": 2.0240514278411865,
"learning_rate": 1.8483148241303255e-05,
"loss": 0.7493,
"step": 19600
},
{
"epoch": 0.7622953991409666,
"grad_norm": 2.8386764526367188,
"learning_rate": 1.8475409201718068e-05,
"loss": 0.7272,
"step": 19700
},
{
"epoch": 0.7661649189335603,
"grad_norm": 4.325782299041748,
"learning_rate": 1.846767016213288e-05,
"loss": 0.7411,
"step": 19800
},
{
"epoch": 0.7700344387261541,
"grad_norm": 2.5033533573150635,
"learning_rate": 1.8459931122547694e-05,
"loss": 0.7289,
"step": 19900
},
{
"epoch": 0.7739039585187478,
"grad_norm": 2.9453632831573486,
"learning_rate": 1.8452192082962507e-05,
"loss": 0.7278,
"step": 20000
},
{
"epoch": 0.7777734783113416,
"grad_norm": 2.145141124725342,
"learning_rate": 1.844445304337732e-05,
"loss": 0.7131,
"step": 20100
},
{
"epoch": 0.7816429981039353,
"grad_norm": 4.372563362121582,
"learning_rate": 1.843671400379213e-05,
"loss": 0.6891,
"step": 20200
},
{
"epoch": 0.7855125178965291,
"grad_norm": 2.579414129257202,
"learning_rate": 1.8428974964206943e-05,
"loss": 0.7312,
"step": 20300
},
{
"epoch": 0.7893820376891227,
"grad_norm": 2.5996174812316895,
"learning_rate": 1.8421235924621756e-05,
"loss": 0.7158,
"step": 20400
},
{
"epoch": 0.7932515574817165,
"grad_norm": 2.689061164855957,
"learning_rate": 1.841349688503657e-05,
"loss": 0.7067,
"step": 20500
},
{
"epoch": 0.7971210772743103,
"grad_norm": 3.0453779697418213,
"learning_rate": 1.8405757845451382e-05,
"loss": 0.7022,
"step": 20600
},
{
"epoch": 0.800990597066904,
"grad_norm": 2.452270746231079,
"learning_rate": 1.8398018805866192e-05,
"loss": 0.6891,
"step": 20700
},
{
"epoch": 0.8048601168594978,
"grad_norm": 2.2127304077148438,
"learning_rate": 1.8390279766281005e-05,
"loss": 0.7126,
"step": 20800
},
{
"epoch": 0.8087296366520915,
"grad_norm": 3.050323486328125,
"learning_rate": 1.838254072669582e-05,
"loss": 0.7361,
"step": 20900
},
{
"epoch": 0.8125991564446852,
"grad_norm": 3.5222108364105225,
"learning_rate": 1.837480168711063e-05,
"loss": 0.6958,
"step": 21000
},
{
"epoch": 0.8164686762372789,
"grad_norm": 2.7050528526306152,
"learning_rate": 1.8367062647525445e-05,
"loss": 0.6882,
"step": 21100
},
{
"epoch": 0.8203381960298727,
"grad_norm": 2.8139567375183105,
"learning_rate": 1.8359323607940254e-05,
"loss": 0.6871,
"step": 21200
},
{
"epoch": 0.8242077158224664,
"grad_norm": 2.5572404861450195,
"learning_rate": 1.835158456835507e-05,
"loss": 0.7388,
"step": 21300
},
{
"epoch": 0.8280772356150602,
"grad_norm": 3.583036184310913,
"learning_rate": 1.834384552876988e-05,
"loss": 0.687,
"step": 21400
},
{
"epoch": 0.831946755407654,
"grad_norm": 2.7720303535461426,
"learning_rate": 1.8336106489184694e-05,
"loss": 0.69,
"step": 21500
},
{
"epoch": 0.8358162752002477,
"grad_norm": 2.2616991996765137,
"learning_rate": 1.8328367449599507e-05,
"loss": 0.7008,
"step": 21600
},
{
"epoch": 0.8396857949928414,
"grad_norm": 2.4385766983032227,
"learning_rate": 1.832062841001432e-05,
"loss": 0.7132,
"step": 21700
},
{
"epoch": 0.8435553147854351,
"grad_norm": 3.018012046813965,
"learning_rate": 1.8312889370429133e-05,
"loss": 0.6931,
"step": 21800
},
{
"epoch": 0.8474248345780289,
"grad_norm": 5.071914196014404,
"learning_rate": 1.8305150330843943e-05,
"loss": 0.7222,
"step": 21900
},
{
"epoch": 0.8512943543706226,
"grad_norm": 2.837449312210083,
"learning_rate": 1.8297411291258756e-05,
"loss": 0.6806,
"step": 22000
},
{
"epoch": 0.8551638741632164,
"grad_norm": 2.692207098007202,
"learning_rate": 1.828967225167357e-05,
"loss": 0.6858,
"step": 22100
},
{
"epoch": 0.8590333939558101,
"grad_norm": 5.077603816986084,
"learning_rate": 1.8281933212088382e-05,
"loss": 0.6851,
"step": 22200
},
{
"epoch": 0.8629029137484038,
"grad_norm": 2.468109607696533,
"learning_rate": 1.8274194172503195e-05,
"loss": 0.7078,
"step": 22300
},
{
"epoch": 0.8667724335409975,
"grad_norm": 2.6579232215881348,
"learning_rate": 1.8266455132918005e-05,
"loss": 0.6966,
"step": 22400
},
{
"epoch": 0.8706419533335913,
"grad_norm": 2.528069496154785,
"learning_rate": 1.8258716093332818e-05,
"loss": 0.6702,
"step": 22500
},
{
"epoch": 0.874511473126185,
"grad_norm": 2.341118812561035,
"learning_rate": 1.825097705374763e-05,
"loss": 0.6622,
"step": 22600
},
{
"epoch": 0.8783809929187788,
"grad_norm": 2.7627336978912354,
"learning_rate": 1.8243238014162444e-05,
"loss": 0.6657,
"step": 22700
},
{
"epoch": 0.8822505127113726,
"grad_norm": 3.725227117538452,
"learning_rate": 1.8235498974577257e-05,
"loss": 0.6722,
"step": 22800
},
{
"epoch": 0.8861200325039662,
"grad_norm": 2.631901741027832,
"learning_rate": 1.822775993499207e-05,
"loss": 0.6866,
"step": 22900
},
{
"epoch": 0.88998955229656,
"grad_norm": 2.7313873767852783,
"learning_rate": 1.8220020895406883e-05,
"loss": 0.6865,
"step": 23000
},
{
"epoch": 0.8938590720891537,
"grad_norm": 2.1421585083007812,
"learning_rate": 1.8212281855821693e-05,
"loss": 0.6835,
"step": 23100
},
{
"epoch": 0.8977285918817475,
"grad_norm": 3.489210605621338,
"learning_rate": 1.8204542816236506e-05,
"loss": 0.6839,
"step": 23200
},
{
"epoch": 0.9015981116743412,
"grad_norm": 2.160252809524536,
"learning_rate": 1.819680377665132e-05,
"loss": 0.6474,
"step": 23300
},
{
"epoch": 0.905467631466935,
"grad_norm": 2.6864516735076904,
"learning_rate": 1.8189064737066132e-05,
"loss": 0.685,
"step": 23400
},
{
"epoch": 0.9093371512595287,
"grad_norm": 1.3721705675125122,
"learning_rate": 1.8181325697480946e-05,
"loss": 0.6555,
"step": 23500
},
{
"epoch": 0.9132066710521224,
"grad_norm": 2.154467821121216,
"learning_rate": 1.8173586657895755e-05,
"loss": 0.6683,
"step": 23600
},
{
"epoch": 0.9170761908447161,
"grad_norm": 2.175413131713867,
"learning_rate": 1.816584761831057e-05,
"loss": 0.6489,
"step": 23700
},
{
"epoch": 0.9209457106373099,
"grad_norm": 2.504239082336426,
"learning_rate": 1.815810857872538e-05,
"loss": 0.6364,
"step": 23800
},
{
"epoch": 0.9248152304299037,
"grad_norm": 2.6821277141571045,
"learning_rate": 1.8150369539140195e-05,
"loss": 0.6678,
"step": 23900
},
{
"epoch": 0.9286847502224974,
"grad_norm": 2.505697011947632,
"learning_rate": 1.8142630499555008e-05,
"loss": 0.6755,
"step": 24000
},
{
"epoch": 0.9325542700150912,
"grad_norm": 2.164247989654541,
"learning_rate": 1.8134891459969817e-05,
"loss": 0.6612,
"step": 24100
},
{
"epoch": 0.9364237898076848,
"grad_norm": 1.8794877529144287,
"learning_rate": 1.8127152420384634e-05,
"loss": 0.6422,
"step": 24200
},
{
"epoch": 0.9402933096002786,
"grad_norm": 2.1299638748168945,
"learning_rate": 1.8119413380799444e-05,
"loss": 0.655,
"step": 24300
},
{
"epoch": 0.9441628293928723,
"grad_norm": 2.3705379962921143,
"learning_rate": 1.8111674341214257e-05,
"loss": 0.6435,
"step": 24400
},
{
"epoch": 0.9480323491854661,
"grad_norm": 3.0839450359344482,
"learning_rate": 1.810393530162907e-05,
"loss": 0.7019,
"step": 24500
},
{
"epoch": 0.9519018689780598,
"grad_norm": 1.8064953088760376,
"learning_rate": 1.8096196262043883e-05,
"loss": 0.648,
"step": 24600
},
{
"epoch": 0.9557713887706536,
"grad_norm": 2.46791934967041,
"learning_rate": 1.8088457222458696e-05,
"loss": 0.6398,
"step": 24700
},
{
"epoch": 0.9596409085632474,
"grad_norm": 3.4168615341186523,
"learning_rate": 1.8080718182873506e-05,
"loss": 0.685,
"step": 24800
},
{
"epoch": 0.963510428355841,
"grad_norm": 2.9070920944213867,
"learning_rate": 1.807297914328832e-05,
"loss": 0.6559,
"step": 24900
},
{
"epoch": 0.9673799481484348,
"grad_norm": 2.403107166290283,
"learning_rate": 1.8065240103703132e-05,
"loss": 0.6845,
"step": 25000
},
{
"epoch": 0.9712494679410285,
"grad_norm": 2.1251277923583984,
"learning_rate": 1.8057501064117945e-05,
"loss": 0.6268,
"step": 25100
},
{
"epoch": 0.9751189877336223,
"grad_norm": 3.04496693611145,
"learning_rate": 1.8049762024532758e-05,
"loss": 0.6734,
"step": 25200
},
{
"epoch": 0.978988507526216,
"grad_norm": 4.796090602874756,
"learning_rate": 1.8042022984947568e-05,
"loss": 0.6522,
"step": 25300
},
{
"epoch": 0.9828580273188098,
"grad_norm": 2.2436540126800537,
"learning_rate": 1.8034283945362384e-05,
"loss": 0.6638,
"step": 25400
},
{
"epoch": 0.9867275471114034,
"grad_norm": 2.5630242824554443,
"learning_rate": 1.8026544905777194e-05,
"loss": 0.6387,
"step": 25500
},
{
"epoch": 0.9905970669039972,
"grad_norm": 2.3005053997039795,
"learning_rate": 1.8018805866192007e-05,
"loss": 0.6322,
"step": 25600
},
{
"epoch": 0.9944665866965909,
"grad_norm": 1.7634360790252686,
"learning_rate": 1.801106682660682e-05,
"loss": 0.6459,
"step": 25700
},
{
"epoch": 0.9983361064891847,
"grad_norm": 2.2708933353424072,
"learning_rate": 1.800332778702163e-05,
"loss": 0.6625,
"step": 25800
},
{
"epoch": 0.999961304802074,
"eval_loss": 0.418006956577301,
"eval_runtime": 72.9007,
"eval_samples_per_second": 28.655,
"eval_steps_per_second": 3.594,
"step": 25842
},
{
"epoch": 1.0022056262817784,
"grad_norm": 2.6418962478637695,
"learning_rate": 1.9995511357040595e-05,
"loss": 0.6466,
"step": 25900
},
{
"epoch": 1.0060751460743722,
"grad_norm": 3.1957106590270996,
"learning_rate": 1.9987772317455404e-05,
"loss": 0.6523,
"step": 26000
},
{
"epoch": 1.009944665866966,
"grad_norm": 3.396048069000244,
"learning_rate": 1.9980033277870218e-05,
"loss": 0.6683,
"step": 26100
},
{
"epoch": 1.0138141856595597,
"grad_norm": 2.8312299251556396,
"learning_rate": 1.997229423828503e-05,
"loss": 0.645,
"step": 26200
},
{
"epoch": 1.0176837054521535,
"grad_norm": 3.423259973526001,
"learning_rate": 1.996455519869984e-05,
"loss": 0.6086,
"step": 26300
},
{
"epoch": 1.0215532252447472,
"grad_norm": 2.7330431938171387,
"learning_rate": 1.9956816159114657e-05,
"loss": 0.6612,
"step": 26400
},
{
"epoch": 1.0254227450373408,
"grad_norm": 2.3465893268585205,
"learning_rate": 1.9949077119529467e-05,
"loss": 0.6338,
"step": 26500
},
{
"epoch": 1.0292922648299345,
"grad_norm": 2.6004831790924072,
"learning_rate": 1.994133807994428e-05,
"loss": 0.6199,
"step": 26600
},
{
"epoch": 1.0331617846225283,
"grad_norm": 2.838761806488037,
"learning_rate": 1.9933599040359093e-05,
"loss": 0.6242,
"step": 26700
},
{
"epoch": 1.037031304415122,
"grad_norm": 3.6770079135894775,
"learning_rate": 1.9925860000773906e-05,
"loss": 0.6146,
"step": 26800
},
{
"epoch": 1.0409008242077158,
"grad_norm": 2.4731369018554688,
"learning_rate": 1.991812096118872e-05,
"loss": 0.6218,
"step": 26900
},
{
"epoch": 1.0447703440003095,
"grad_norm": 1.8442339897155762,
"learning_rate": 1.991038192160353e-05,
"loss": 0.6436,
"step": 27000
},
{
"epoch": 1.0486398637929033,
"grad_norm": 3.833077907562256,
"learning_rate": 1.9902642882018345e-05,
"loss": 0.6166,
"step": 27100
},
{
"epoch": 1.052509383585497,
"grad_norm": 3.100231647491455,
"learning_rate": 1.9894903842433155e-05,
"loss": 0.6176,
"step": 27200
},
{
"epoch": 1.0563789033780908,
"grad_norm": 1.8669284582138062,
"learning_rate": 1.9887164802847968e-05,
"loss": 0.6091,
"step": 27300
},
{
"epoch": 1.0602484231706846,
"grad_norm": 1.897378921508789,
"learning_rate": 1.987942576326278e-05,
"loss": 0.652,
"step": 27400
},
{
"epoch": 1.0641179429632783,
"grad_norm": 1.5283644199371338,
"learning_rate": 1.987168672367759e-05,
"loss": 0.6318,
"step": 27500
},
{
"epoch": 1.067987462755872,
"grad_norm": 2.560490131378174,
"learning_rate": 1.9863947684092407e-05,
"loss": 0.6215,
"step": 27600
},
{
"epoch": 1.0718569825484656,
"grad_norm": 2.0089356899261475,
"learning_rate": 1.9856208644507217e-05,
"loss": 0.6024,
"step": 27700
},
{
"epoch": 1.0757265023410594,
"grad_norm": 3.221628427505493,
"learning_rate": 1.984846960492203e-05,
"loss": 0.6451,
"step": 27800
},
{
"epoch": 1.0795960221336531,
"grad_norm": 2.0573599338531494,
"learning_rate": 1.9840730565336843e-05,
"loss": 0.6564,
"step": 27900
},
{
"epoch": 1.0834655419262469,
"grad_norm": 2.207869052886963,
"learning_rate": 1.9832991525751656e-05,
"loss": 0.5918,
"step": 28000
},
{
"epoch": 1.0873350617188406,
"grad_norm": 2.5935580730438232,
"learning_rate": 1.982525248616647e-05,
"loss": 0.6011,
"step": 28100
},
{
"epoch": 1.0912045815114344,
"grad_norm": 3.1886303424835205,
"learning_rate": 1.981751344658128e-05,
"loss": 0.6018,
"step": 28200
},
{
"epoch": 1.0950741013040282,
"grad_norm": 3.0881826877593994,
"learning_rate": 1.9809774406996096e-05,
"loss": 0.6491,
"step": 28300
},
{
"epoch": 1.098943621096622,
"grad_norm": 2.027204751968384,
"learning_rate": 1.9802035367410905e-05,
"loss": 0.584,
"step": 28400
},
{
"epoch": 1.1028131408892157,
"grad_norm": 2.9437708854675293,
"learning_rate": 1.979429632782572e-05,
"loss": 0.599,
"step": 28500
},
{
"epoch": 1.1066826606818094,
"grad_norm": 2.136786460876465,
"learning_rate": 1.978655728824053e-05,
"loss": 0.5996,
"step": 28600
},
{
"epoch": 1.1105521804744032,
"grad_norm": 2.343502998352051,
"learning_rate": 1.977881824865534e-05,
"loss": 0.6006,
"step": 28700
},
{
"epoch": 1.114421700266997,
"grad_norm": 2.804229497909546,
"learning_rate": 1.9771079209070158e-05,
"loss": 0.6317,
"step": 28800
},
{
"epoch": 1.1182912200595907,
"grad_norm": 2.8518316745758057,
"learning_rate": 1.9763340169484968e-05,
"loss": 0.6092,
"step": 28900
},
{
"epoch": 1.1221607398521845,
"grad_norm": 2.5474793910980225,
"learning_rate": 1.975560112989978e-05,
"loss": 0.6016,
"step": 29000
},
{
"epoch": 1.126030259644778,
"grad_norm": 3.378596544265747,
"learning_rate": 1.9747862090314594e-05,
"loss": 0.5828,
"step": 29100
},
{
"epoch": 1.1298997794373717,
"grad_norm": 2.2915539741516113,
"learning_rate": 1.9740123050729403e-05,
"loss": 0.5981,
"step": 29200
},
{
"epoch": 1.1337692992299655,
"grad_norm": 4.275195598602295,
"learning_rate": 1.973238401114422e-05,
"loss": 0.5924,
"step": 29300
},
{
"epoch": 1.1376388190225593,
"grad_norm": 3.948652744293213,
"learning_rate": 1.972464497155903e-05,
"loss": 0.605,
"step": 29400
},
{
"epoch": 1.141508338815153,
"grad_norm": 1.9093629121780396,
"learning_rate": 1.9716905931973843e-05,
"loss": 0.6062,
"step": 29500
},
{
"epoch": 1.1453778586077468,
"grad_norm": 2.5479652881622314,
"learning_rate": 1.9709166892388656e-05,
"loss": 0.5875,
"step": 29600
},
{
"epoch": 1.1492473784003405,
"grad_norm": 1.8700230121612549,
"learning_rate": 1.970142785280347e-05,
"loss": 0.5877,
"step": 29700
},
{
"epoch": 1.1531168981929343,
"grad_norm": 1.7103559970855713,
"learning_rate": 1.9693688813218282e-05,
"loss": 0.5917,
"step": 29800
},
{
"epoch": 1.156986417985528,
"grad_norm": 2.3256266117095947,
"learning_rate": 1.9685949773633092e-05,
"loss": 0.5866,
"step": 29900
},
{
"epoch": 1.1608559377781218,
"grad_norm": 2.2812767028808594,
"learning_rate": 1.9678210734047908e-05,
"loss": 0.5746,
"step": 30000
},
{
"epoch": 1.1647254575707155,
"grad_norm": 1.945586919784546,
"learning_rate": 1.9670471694462718e-05,
"loss": 0.5772,
"step": 30100
},
{
"epoch": 1.1685949773633093,
"grad_norm": 2.1316816806793213,
"learning_rate": 1.966273265487753e-05,
"loss": 0.5692,
"step": 30200
},
{
"epoch": 1.1724644971559028,
"grad_norm": 1.6511510610580444,
"learning_rate": 1.9654993615292344e-05,
"loss": 0.5888,
"step": 30300
},
{
"epoch": 1.1763340169484966,
"grad_norm": 2.1600418090820312,
"learning_rate": 1.9647254575707154e-05,
"loss": 0.5957,
"step": 30400
},
{
"epoch": 1.1802035367410904,
"grad_norm": 2.488708257675171,
"learning_rate": 1.963951553612197e-05,
"loss": 0.5742,
"step": 30500
},
{
"epoch": 1.184073056533684,
"grad_norm": 4.75523042678833,
"learning_rate": 1.963177649653678e-05,
"loss": 0.5512,
"step": 30600
},
{
"epoch": 1.1879425763262779,
"grad_norm": 2.335571050643921,
"learning_rate": 1.9624037456951593e-05,
"loss": 0.5665,
"step": 30700
},
{
"epoch": 1.1918120961188716,
"grad_norm": 2.083780527114868,
"learning_rate": 1.9616298417366406e-05,
"loss": 0.5682,
"step": 30800
},
{
"epoch": 1.1956816159114654,
"grad_norm": 2.2006747722625732,
"learning_rate": 1.960855937778122e-05,
"loss": 0.5837,
"step": 30900
},
{
"epoch": 1.1995511357040591,
"grad_norm": 2.8674488067626953,
"learning_rate": 1.9600820338196033e-05,
"loss": 0.5921,
"step": 31000
},
{
"epoch": 1.203420655496653,
"grad_norm": 1.7168868780136108,
"learning_rate": 1.9593081298610842e-05,
"loss": 0.5833,
"step": 31100
},
{
"epoch": 1.2072901752892466,
"grad_norm": 1.9584147930145264,
"learning_rate": 1.9585342259025655e-05,
"loss": 0.6038,
"step": 31200
},
{
"epoch": 1.2111596950818404,
"grad_norm": 1.6358076333999634,
"learning_rate": 1.957760321944047e-05,
"loss": 0.5733,
"step": 31300
},
{
"epoch": 1.2150292148744342,
"grad_norm": 2.02459454536438,
"learning_rate": 1.956986417985528e-05,
"loss": 0.5727,
"step": 31400
},
{
"epoch": 1.218898734667028,
"grad_norm": 3.0293195247650146,
"learning_rate": 1.9562125140270095e-05,
"loss": 0.5738,
"step": 31500
},
{
"epoch": 1.2227682544596217,
"grad_norm": 1.8469349145889282,
"learning_rate": 1.9554386100684904e-05,
"loss": 0.5921,
"step": 31600
},
{
"epoch": 1.2266377742522152,
"grad_norm": 2.891789436340332,
"learning_rate": 1.954664706109972e-05,
"loss": 0.5722,
"step": 31700
},
{
"epoch": 1.230507294044809,
"grad_norm": 1.9393624067306519,
"learning_rate": 1.953890802151453e-05,
"loss": 0.5969,
"step": 31800
},
{
"epoch": 1.2343768138374027,
"grad_norm": 2.441889524459839,
"learning_rate": 1.9531168981929344e-05,
"loss": 0.5962,
"step": 31900
},
{
"epoch": 1.2382463336299965,
"grad_norm": 2.32602596282959,
"learning_rate": 1.9523429942344157e-05,
"loss": 0.5867,
"step": 32000
},
{
"epoch": 1.2421158534225902,
"grad_norm": 2.894043445587158,
"learning_rate": 1.951569090275897e-05,
"loss": 0.5841,
"step": 32100
},
{
"epoch": 1.245985373215184,
"grad_norm": 1.8625471591949463,
"learning_rate": 1.9507951863173783e-05,
"loss": 0.5541,
"step": 32200
},
{
"epoch": 1.2498548930077777,
"grad_norm": 2.3896780014038086,
"learning_rate": 1.9500212823588593e-05,
"loss": 0.5876,
"step": 32300
},
{
"epoch": 1.2537244128003715,
"grad_norm": 1.956554651260376,
"learning_rate": 1.9492473784003406e-05,
"loss": 0.569,
"step": 32400
},
{
"epoch": 1.2575939325929653,
"grad_norm": 2.6784517765045166,
"learning_rate": 1.948473474441822e-05,
"loss": 0.5661,
"step": 32500
},
{
"epoch": 1.261463452385559,
"grad_norm": 1.7353438138961792,
"learning_rate": 1.9476995704833032e-05,
"loss": 0.5699,
"step": 32600
},
{
"epoch": 1.2653329721781528,
"grad_norm": 2.6139369010925293,
"learning_rate": 1.9469256665247845e-05,
"loss": 0.5857,
"step": 32700
},
{
"epoch": 1.2692024919707463,
"grad_norm": 1.978928804397583,
"learning_rate": 1.9461517625662655e-05,
"loss": 0.568,
"step": 32800
},
{
"epoch": 1.27307201176334,
"grad_norm": 1.856897234916687,
"learning_rate": 1.945377858607747e-05,
"loss": 0.5629,
"step": 32900
},
{
"epoch": 1.2769415315559338,
"grad_norm": 2.1292757987976074,
"learning_rate": 1.944603954649228e-05,
"loss": 0.5509,
"step": 33000
},
{
"epoch": 1.2808110513485276,
"grad_norm": 4.36674690246582,
"learning_rate": 1.9438300506907094e-05,
"loss": 0.5796,
"step": 33100
},
{
"epoch": 1.2846805711411213,
"grad_norm": 2.971160888671875,
"learning_rate": 1.9430561467321907e-05,
"loss": 0.5735,
"step": 33200
},
{
"epoch": 1.288550090933715,
"grad_norm": 2.1826820373535156,
"learning_rate": 1.942282242773672e-05,
"loss": 0.5478,
"step": 33300
},
{
"epoch": 1.2924196107263088,
"grad_norm": 2.4867353439331055,
"learning_rate": 1.9415083388151533e-05,
"loss": 0.5311,
"step": 33400
},
{
"epoch": 1.2962891305189026,
"grad_norm": 3.6826677322387695,
"learning_rate": 1.9407344348566343e-05,
"loss": 0.5687,
"step": 33500
},
{
"epoch": 1.3001586503114964,
"grad_norm": 1.6112810373306274,
"learning_rate": 1.9399605308981156e-05,
"loss": 0.5808,
"step": 33600
},
{
"epoch": 1.3040281701040901,
"grad_norm": 1.9377580881118774,
"learning_rate": 1.939186626939597e-05,
"loss": 0.5936,
"step": 33700
},
{
"epoch": 1.3078976898966839,
"grad_norm": 2.1743500232696533,
"learning_rate": 1.9384127229810783e-05,
"loss": 0.5494,
"step": 33800
},
{
"epoch": 1.3117672096892776,
"grad_norm": 2.9613547325134277,
"learning_rate": 1.9376388190225596e-05,
"loss": 0.5454,
"step": 33900
},
{
"epoch": 1.3156367294818714,
"grad_norm": 1.6620279550552368,
"learning_rate": 1.9368649150640405e-05,
"loss": 0.5754,
"step": 34000
},
{
"epoch": 1.3195062492744651,
"grad_norm": 1.6309536695480347,
"learning_rate": 1.936091011105522e-05,
"loss": 0.5504,
"step": 34100
},
{
"epoch": 1.323375769067059,
"grad_norm": 2.2999184131622314,
"learning_rate": 1.935317107147003e-05,
"loss": 0.5596,
"step": 34200
},
{
"epoch": 1.3272452888596526,
"grad_norm": 1.74318528175354,
"learning_rate": 1.9345432031884845e-05,
"loss": 0.5507,
"step": 34300
},
{
"epoch": 1.3311148086522462,
"grad_norm": 2.797492265701294,
"learning_rate": 1.9337692992299658e-05,
"loss": 0.5815,
"step": 34400
},
{
"epoch": 1.33498432844484,
"grad_norm": 2.596480369567871,
"learning_rate": 1.9329953952714467e-05,
"loss": 0.5624,
"step": 34500
},
{
"epoch": 1.3388538482374337,
"grad_norm": 2.3296291828155518,
"learning_rate": 1.9322214913129284e-05,
"loss": 0.5508,
"step": 34600
},
{
"epoch": 1.3427233680300275,
"grad_norm": 2.2664413452148438,
"learning_rate": 1.9314475873544094e-05,
"loss": 0.5614,
"step": 34700
},
{
"epoch": 1.3465928878226212,
"grad_norm": 2.1926095485687256,
"learning_rate": 1.9306736833958907e-05,
"loss": 0.528,
"step": 34800
},
{
"epoch": 1.350462407615215,
"grad_norm": 3.2874293327331543,
"learning_rate": 1.929899779437372e-05,
"loss": 0.558,
"step": 34900
},
{
"epoch": 1.3543319274078087,
"grad_norm": 3.320584774017334,
"learning_rate": 1.9291258754788533e-05,
"loss": 0.557,
"step": 35000
},
{
"epoch": 1.3582014472004025,
"grad_norm": 1.9394633769989014,
"learning_rate": 1.9283519715203346e-05,
"loss": 0.544,
"step": 35100
},
{
"epoch": 1.3620709669929962,
"grad_norm": 2.928828001022339,
"learning_rate": 1.9275780675618156e-05,
"loss": 0.5797,
"step": 35200
},
{
"epoch": 1.36594048678559,
"grad_norm": 1.899062991142273,
"learning_rate": 1.926804163603297e-05,
"loss": 0.541,
"step": 35300
},
{
"epoch": 1.3698100065781835,
"grad_norm": 1.9945220947265625,
"learning_rate": 1.9260302596447782e-05,
"loss": 0.5584,
"step": 35400
},
{
"epoch": 1.3736795263707773,
"grad_norm": 2.7123208045959473,
"learning_rate": 1.9252563556862595e-05,
"loss": 0.5514,
"step": 35500
},
{
"epoch": 1.377549046163371,
"grad_norm": 1.9963657855987549,
"learning_rate": 1.9244824517277408e-05,
"loss": 0.5278,
"step": 35600
},
{
"epoch": 1.3814185659559648,
"grad_norm": 2.3080806732177734,
"learning_rate": 1.9237085477692218e-05,
"loss": 0.5283,
"step": 35700
},
{
"epoch": 1.3852880857485586,
"grad_norm": 1.7134689092636108,
"learning_rate": 1.922934643810703e-05,
"loss": 0.5618,
"step": 35800
},
{
"epoch": 1.3891576055411523,
"grad_norm": 2.130807876586914,
"learning_rate": 1.9221607398521844e-05,
"loss": 0.5177,
"step": 35900
},
{
"epoch": 1.393027125333746,
"grad_norm": 1.8442771434783936,
"learning_rate": 1.9213868358936657e-05,
"loss": 0.5406,
"step": 36000
},
{
"epoch": 1.3968966451263398,
"grad_norm": 2.7372751235961914,
"learning_rate": 1.920612931935147e-05,
"loss": 0.5312,
"step": 36100
},
{
"epoch": 1.4007661649189336,
"grad_norm": 1.7819504737854004,
"learning_rate": 1.9198390279766283e-05,
"loss": 0.5563,
"step": 36200
},
{
"epoch": 1.4046356847115273,
"grad_norm": 2.3207108974456787,
"learning_rate": 1.9190651240181097e-05,
"loss": 0.5494,
"step": 36300
},
{
"epoch": 1.408505204504121,
"grad_norm": 2.2460434436798096,
"learning_rate": 1.9182912200595906e-05,
"loss": 0.5537,
"step": 36400
},
{
"epoch": 1.4123747242967148,
"grad_norm": 2.508028984069824,
"learning_rate": 1.917517316101072e-05,
"loss": 0.5317,
"step": 36500
},
{
"epoch": 1.4162442440893086,
"grad_norm": 2.5418314933776855,
"learning_rate": 1.9167434121425532e-05,
"loss": 0.5132,
"step": 36600
},
{
"epoch": 1.4201137638819024,
"grad_norm": 2.6335184574127197,
"learning_rate": 1.9159695081840346e-05,
"loss": 0.5628,
"step": 36700
},
{
"epoch": 1.4239832836744961,
"grad_norm": 2.0406432151794434,
"learning_rate": 1.915195604225516e-05,
"loss": 0.5245,
"step": 36800
},
{
"epoch": 1.4278528034670899,
"grad_norm": 1.7628612518310547,
"learning_rate": 1.914421700266997e-05,
"loss": 0.5331,
"step": 36900
},
{
"epoch": 1.4317223232596834,
"grad_norm": 1.38942551612854,
"learning_rate": 1.913647796308478e-05,
"loss": 0.54,
"step": 37000
},
{
"epoch": 1.4355918430522772,
"grad_norm": 2.143207550048828,
"learning_rate": 1.9128738923499595e-05,
"loss": 0.5553,
"step": 37100
},
{
"epoch": 1.439461362844871,
"grad_norm": 3.6318371295928955,
"learning_rate": 1.9120999883914408e-05,
"loss": 0.4981,
"step": 37200
},
{
"epoch": 1.4433308826374647,
"grad_norm": 1.8823308944702148,
"learning_rate": 1.911326084432922e-05,
"loss": 0.5185,
"step": 37300
},
{
"epoch": 1.4472004024300584,
"grad_norm": 2.1262166500091553,
"learning_rate": 1.9105521804744034e-05,
"loss": 0.5126,
"step": 37400
},
{
"epoch": 1.4510699222226522,
"grad_norm": 2.445730686187744,
"learning_rate": 1.9097782765158847e-05,
"loss": 0.5327,
"step": 37500
},
{
"epoch": 1.454939442015246,
"grad_norm": 3.0408647060394287,
"learning_rate": 1.9090043725573657e-05,
"loss": 0.5152,
"step": 37600
},
{
"epoch": 1.4588089618078397,
"grad_norm": 2.2298266887664795,
"learning_rate": 1.908230468598847e-05,
"loss": 0.5477,
"step": 37700
},
{
"epoch": 1.4626784816004335,
"grad_norm": 1.3029440641403198,
"learning_rate": 1.9074565646403283e-05,
"loss": 0.5292,
"step": 37800
},
{
"epoch": 1.466548001393027,
"grad_norm": 2.163135051727295,
"learning_rate": 1.9066826606818096e-05,
"loss": 0.5339,
"step": 37900
},
{
"epoch": 1.4704175211856207,
"grad_norm": 2.660160541534424,
"learning_rate": 1.905908756723291e-05,
"loss": 0.5302,
"step": 38000
},
{
"epoch": 1.4742870409782145,
"grad_norm": 2.235914468765259,
"learning_rate": 1.905134852764772e-05,
"loss": 0.5436,
"step": 38100
},
{
"epoch": 1.4781565607708083,
"grad_norm": 2.123150110244751,
"learning_rate": 1.9043609488062532e-05,
"loss": 0.5442,
"step": 38200
},
{
"epoch": 1.482026080563402,
"grad_norm": 2.917724609375,
"learning_rate": 1.9035870448477345e-05,
"loss": 0.5253,
"step": 38300
},
{
"epoch": 1.4858956003559958,
"grad_norm": 1.8136122226715088,
"learning_rate": 1.9028131408892158e-05,
"loss": 0.5259,
"step": 38400
},
{
"epoch": 1.4897651201485895,
"grad_norm": 1.409419298171997,
"learning_rate": 1.902039236930697e-05,
"loss": 0.5412,
"step": 38500
},
{
"epoch": 1.4936346399411833,
"grad_norm": 1.6381382942199707,
"learning_rate": 1.9012653329721784e-05,
"loss": 0.5253,
"step": 38600
},
{
"epoch": 1.497504159733777,
"grad_norm": 1.4886516332626343,
"learning_rate": 1.9004914290136594e-05,
"loss": 0.505,
"step": 38700
},
{
"epoch": 1.4999419572031112,
"eval_loss": 0.3584839403629303,
"eval_runtime": 75.7902,
"eval_samples_per_second": 27.563,
"eval_steps_per_second": 3.457,
"step": 38763
},
{
"epoch": 1.5013736795263708,
"grad_norm": 4.64426851272583,
"learning_rate": 1.8997175250551407e-05,
"loss": 0.533,
"step": 38800
},
{
"epoch": 1.5052431993189646,
"grad_norm": 2.312692403793335,
"learning_rate": 1.898943621096622e-05,
"loss": 0.506,
"step": 38900
},
{
"epoch": 1.5091127191115583,
"grad_norm": 2.6338064670562744,
"learning_rate": 1.8981697171381033e-05,
"loss": 0.5318,
"step": 39000
},
{
"epoch": 1.512982238904152,
"grad_norm": 2.5822842121124268,
"learning_rate": 1.8973958131795847e-05,
"loss": 0.5192,
"step": 39100
},
{
"epoch": 1.5168517586967458,
"grad_norm": 2.04563570022583,
"learning_rate": 1.896621909221066e-05,
"loss": 0.5187,
"step": 39200
},
{
"epoch": 1.5207212784893396,
"grad_norm": 2.4310240745544434,
"learning_rate": 1.895848005262547e-05,
"loss": 0.5087,
"step": 39300
},
{
"epoch": 1.5245907982819333,
"grad_norm": 2.6476080417633057,
"learning_rate": 1.8950741013040282e-05,
"loss": 0.5202,
"step": 39400
},
{
"epoch": 1.528460318074527,
"grad_norm": 2.638237237930298,
"learning_rate": 1.8943001973455096e-05,
"loss": 0.526,
"step": 39500
},
{
"epoch": 1.5323298378671208,
"grad_norm": 1.1704139709472656,
"learning_rate": 1.893526293386991e-05,
"loss": 0.5215,
"step": 39600
},
{
"epoch": 1.5361993576597144,
"grad_norm": 2.730487108230591,
"learning_rate": 1.8927523894284722e-05,
"loss": 0.5539,
"step": 39700
},
{
"epoch": 1.5400688774523081,
"grad_norm": 2.1526124477386475,
"learning_rate": 1.8919784854699535e-05,
"loss": 0.5102,
"step": 39800
},
{
"epoch": 1.543938397244902,
"grad_norm": 1.5470048189163208,
"learning_rate": 1.8912045815114345e-05,
"loss": 0.5037,
"step": 39900
},
{
"epoch": 1.5478079170374957,
"grad_norm": 1.6453921794891357,
"learning_rate": 1.8904306775529158e-05,
"loss": 0.5125,
"step": 40000
},
{
"epoch": 1.5516774368300894,
"grad_norm": 1.8560413122177124,
"learning_rate": 1.889656773594397e-05,
"loss": 0.519,
"step": 40100
},
{
"epoch": 1.5555469566226832,
"grad_norm": 1.750705361366272,
"learning_rate": 1.8888828696358784e-05,
"loss": 0.5526,
"step": 40200
},
{
"epoch": 1.5594164764152767,
"grad_norm": 1.9431952238082886,
"learning_rate": 1.8881089656773597e-05,
"loss": 0.4997,
"step": 40300
},
{
"epoch": 1.5632859962078705,
"grad_norm": 3.076261043548584,
"learning_rate": 1.887335061718841e-05,
"loss": 0.517,
"step": 40400
},
{
"epoch": 1.5671555160004642,
"grad_norm": 3.281338930130005,
"learning_rate": 1.886561157760322e-05,
"loss": 0.5084,
"step": 40500
},
{
"epoch": 1.571025035793058,
"grad_norm": 2.3248159885406494,
"learning_rate": 1.8857872538018033e-05,
"loss": 0.5265,
"step": 40600
},
{
"epoch": 1.5748945555856517,
"grad_norm": 1.9274684190750122,
"learning_rate": 1.8850133498432846e-05,
"loss": 0.5286,
"step": 40700
},
{
"epoch": 1.5787640753782455,
"grad_norm": 2.3014674186706543,
"learning_rate": 1.884239445884766e-05,
"loss": 0.5212,
"step": 40800
},
{
"epoch": 1.5826335951708392,
"grad_norm": 1.7328417301177979,
"learning_rate": 1.8834655419262472e-05,
"loss": 0.4932,
"step": 40900
},
{
"epoch": 1.586503114963433,
"grad_norm": 1.4878865480422974,
"learning_rate": 1.8826916379677285e-05,
"loss": 0.5002,
"step": 41000
},
{
"epoch": 1.5903726347560267,
"grad_norm": 2.281660556793213,
"learning_rate": 1.8819177340092095e-05,
"loss": 0.5135,
"step": 41100
},
{
"epoch": 1.5942421545486205,
"grad_norm": 2.6435060501098633,
"learning_rate": 1.8811438300506908e-05,
"loss": 0.5195,
"step": 41200
},
{
"epoch": 1.5981116743412143,
"grad_norm": 2.7956855297088623,
"learning_rate": 1.880369926092172e-05,
"loss": 0.5206,
"step": 41300
},
{
"epoch": 1.601981194133808,
"grad_norm": 2.0349161624908447,
"learning_rate": 1.8795960221336534e-05,
"loss": 0.5372,
"step": 41400
},
{
"epoch": 1.6058507139264018,
"grad_norm": 2.1461424827575684,
"learning_rate": 1.8788221181751347e-05,
"loss": 0.4901,
"step": 41500
},
{
"epoch": 1.6097202337189955,
"grad_norm": 1.6793510913848877,
"learning_rate": 1.8780482142166157e-05,
"loss": 0.5153,
"step": 41600
},
{
"epoch": 1.6135897535115893,
"grad_norm": 1.605312466621399,
"learning_rate": 1.877274310258097e-05,
"loss": 0.5136,
"step": 41700
},
{
"epoch": 1.617459273304183,
"grad_norm": 1.3838826417922974,
"learning_rate": 1.8765004062995783e-05,
"loss": 0.5016,
"step": 41800
},
{
"epoch": 1.6213287930967768,
"grad_norm": 1.7461504936218262,
"learning_rate": 1.8757265023410597e-05,
"loss": 0.5174,
"step": 41900
},
{
"epoch": 1.6251983128893706,
"grad_norm": 1.5110478401184082,
"learning_rate": 1.874952598382541e-05,
"loss": 0.4928,
"step": 42000
},
{
"epoch": 1.6290678326819643,
"grad_norm": 4.541932106018066,
"learning_rate": 1.8741786944240223e-05,
"loss": 0.502,
"step": 42100
},
{
"epoch": 1.632937352474558,
"grad_norm": 2.1037495136260986,
"learning_rate": 1.8734047904655032e-05,
"loss": 0.5133,
"step": 42200
},
{
"epoch": 1.6368068722671516,
"grad_norm": 1.9825454950332642,
"learning_rate": 1.8726308865069846e-05,
"loss": 0.4841,
"step": 42300
},
{
"epoch": 1.6406763920597454,
"grad_norm": 1.887780785560608,
"learning_rate": 1.871856982548466e-05,
"loss": 0.5447,
"step": 42400
},
{
"epoch": 1.6445459118523391,
"grad_norm": 2.2040176391601562,
"learning_rate": 1.8710830785899472e-05,
"loss": 0.5181,
"step": 42500
},
{
"epoch": 1.6484154316449329,
"grad_norm": 2.889693260192871,
"learning_rate": 1.8703091746314285e-05,
"loss": 0.4811,
"step": 42600
},
{
"epoch": 1.6522849514375266,
"grad_norm": 1.1301201581954956,
"learning_rate": 1.8695352706729098e-05,
"loss": 0.5074,
"step": 42700
},
{
"epoch": 1.6561544712301204,
"grad_norm": 2.004110097885132,
"learning_rate": 1.8687613667143908e-05,
"loss": 0.4964,
"step": 42800
},
{
"epoch": 1.660023991022714,
"grad_norm": 2.496898889541626,
"learning_rate": 1.867987462755872e-05,
"loss": 0.5143,
"step": 42900
},
{
"epoch": 1.6638935108153077,
"grad_norm": 1.8562402725219727,
"learning_rate": 1.8672135587973534e-05,
"loss": 0.4961,
"step": 43000
},
{
"epoch": 1.6677630306079014,
"grad_norm": 2.016951084136963,
"learning_rate": 1.8664396548388347e-05,
"loss": 0.4953,
"step": 43100
},
{
"epoch": 1.6716325504004952,
"grad_norm": 1.9758374691009521,
"learning_rate": 1.865665750880316e-05,
"loss": 0.4871,
"step": 43200
},
{
"epoch": 1.675502070193089,
"grad_norm": 1.7189594507217407,
"learning_rate": 1.864891846921797e-05,
"loss": 0.5054,
"step": 43300
},
{
"epoch": 1.6793715899856827,
"grad_norm": 2.1537132263183594,
"learning_rate": 1.8641179429632783e-05,
"loss": 0.5048,
"step": 43400
},
{
"epoch": 1.6832411097782765,
"grad_norm": 1.0246325731277466,
"learning_rate": 1.8633440390047596e-05,
"loss": 0.5053,
"step": 43500
},
{
"epoch": 1.6871106295708702,
"grad_norm": 2.4667491912841797,
"learning_rate": 1.862570135046241e-05,
"loss": 0.4947,
"step": 43600
},
{
"epoch": 1.690980149363464,
"grad_norm": 2.4239518642425537,
"learning_rate": 1.8617962310877222e-05,
"loss": 0.4928,
"step": 43700
},
{
"epoch": 1.6948496691560577,
"grad_norm": 2.4990153312683105,
"learning_rate": 1.8610223271292035e-05,
"loss": 0.492,
"step": 43800
},
{
"epoch": 1.6987191889486515,
"grad_norm": 1.7751882076263428,
"learning_rate": 1.860248423170685e-05,
"loss": 0.5012,
"step": 43900
},
{
"epoch": 1.7025887087412452,
"grad_norm": 1.9940294027328491,
"learning_rate": 1.8594745192121658e-05,
"loss": 0.4956,
"step": 44000
},
{
"epoch": 1.706458228533839,
"grad_norm": 1.631549596786499,
"learning_rate": 1.858700615253647e-05,
"loss": 0.5031,
"step": 44100
},
{
"epoch": 1.7103277483264328,
"grad_norm": 2.021336078643799,
"learning_rate": 1.8579267112951284e-05,
"loss": 0.5237,
"step": 44200
},
{
"epoch": 1.7141972681190265,
"grad_norm": 1.925374150276184,
"learning_rate": 1.8571528073366097e-05,
"loss": 0.5043,
"step": 44300
},
{
"epoch": 1.7180667879116203,
"grad_norm": 2.0028488636016846,
"learning_rate": 1.856378903378091e-05,
"loss": 0.4828,
"step": 44400
},
{
"epoch": 1.721936307704214,
"grad_norm": 2.185070037841797,
"learning_rate": 1.855604999419572e-05,
"loss": 0.4947,
"step": 44500
},
{
"epoch": 1.7258058274968078,
"grad_norm": 1.4172900915145874,
"learning_rate": 1.8548310954610533e-05,
"loss": 0.4766,
"step": 44600
},
{
"epoch": 1.7296753472894015,
"grad_norm": 3.3415722846984863,
"learning_rate": 1.8540571915025346e-05,
"loss": 0.4867,
"step": 44700
},
{
"epoch": 1.7335448670819953,
"grad_norm": 3.0978989601135254,
"learning_rate": 1.853283287544016e-05,
"loss": 0.4803,
"step": 44800
},
{
"epoch": 1.7374143868745888,
"grad_norm": 1.9211913347244263,
"learning_rate": 1.8525093835854973e-05,
"loss": 0.4932,
"step": 44900
},
{
"epoch": 1.7412839066671826,
"grad_norm": 2.969252347946167,
"learning_rate": 1.8517354796269786e-05,
"loss": 0.4797,
"step": 45000
},
{
"epoch": 1.7451534264597763,
"grad_norm": 2.3203649520874023,
"learning_rate": 1.85096157566846e-05,
"loss": 0.4946,
"step": 45100
},
{
"epoch": 1.74902294625237,
"grad_norm": 3.472598075866699,
"learning_rate": 1.850187671709941e-05,
"loss": 0.4943,
"step": 45200
},
{
"epoch": 1.7528924660449638,
"grad_norm": 1.84355890750885,
"learning_rate": 1.8494137677514222e-05,
"loss": 0.4862,
"step": 45300
},
{
"epoch": 1.7567619858375576,
"grad_norm": 1.6597728729248047,
"learning_rate": 1.8486398637929035e-05,
"loss": 0.4693,
"step": 45400
},
{
"epoch": 1.7606315056301511,
"grad_norm": 2.034503221511841,
"learning_rate": 1.8478659598343848e-05,
"loss": 0.4865,
"step": 45500
},
{
"epoch": 1.764501025422745,
"grad_norm": 3.27135968208313,
"learning_rate": 1.847092055875866e-05,
"loss": 0.4848,
"step": 45600
},
{
"epoch": 1.7683705452153387,
"grad_norm": 1.4343475103378296,
"learning_rate": 1.846318151917347e-05,
"loss": 0.486,
"step": 45700
},
{
"epoch": 1.7722400650079324,
"grad_norm": 1.9744470119476318,
"learning_rate": 1.8455442479588284e-05,
"loss": 0.4903,
"step": 45800
},
{
"epoch": 1.7761095848005262,
"grad_norm": 1.4793157577514648,
"learning_rate": 1.8447703440003097e-05,
"loss": 0.5093,
"step": 45900
},
{
"epoch": 1.77997910459312,
"grad_norm": 2.11100697517395,
"learning_rate": 1.843996440041791e-05,
"loss": 0.4802,
"step": 46000
},
{
"epoch": 1.7838486243857137,
"grad_norm": 3.300123929977417,
"learning_rate": 1.8432225360832723e-05,
"loss": 0.4801,
"step": 46100
},
{
"epoch": 1.7877181441783074,
"grad_norm": 1.9946368932724,
"learning_rate": 1.8424486321247533e-05,
"loss": 0.4729,
"step": 46200
},
{
"epoch": 1.7915876639709012,
"grad_norm": 1.8352553844451904,
"learning_rate": 1.841674728166235e-05,
"loss": 0.4678,
"step": 46300
},
{
"epoch": 1.795457183763495,
"grad_norm": 1.6375458240509033,
"learning_rate": 1.840900824207716e-05,
"loss": 0.4885,
"step": 46400
},
{
"epoch": 1.7993267035560887,
"grad_norm": 1.59699285030365,
"learning_rate": 1.8401269202491972e-05,
"loss": 0.4748,
"step": 46500
},
{
"epoch": 1.8031962233486825,
"grad_norm": 2.2244873046875,
"learning_rate": 1.8393530162906785e-05,
"loss": 0.4846,
"step": 46600
},
{
"epoch": 1.8070657431412762,
"grad_norm": 0.9236389994621277,
"learning_rate": 1.83857911233216e-05,
"loss": 0.4758,
"step": 46700
},
{
"epoch": 1.81093526293387,
"grad_norm": 1.8354111909866333,
"learning_rate": 1.837805208373641e-05,
"loss": 0.4941,
"step": 46800
},
{
"epoch": 1.8148047827264637,
"grad_norm": 3.5753133296966553,
"learning_rate": 1.837031304415122e-05,
"loss": 0.504,
"step": 46900
},
{
"epoch": 1.8186743025190575,
"grad_norm": 3.344614028930664,
"learning_rate": 1.8362574004566034e-05,
"loss": 0.492,
"step": 47000
},
{
"epoch": 1.8225438223116512,
"grad_norm": 3.517702102661133,
"learning_rate": 1.8354834964980847e-05,
"loss": 0.4926,
"step": 47100
},
{
"epoch": 1.826413342104245,
"grad_norm": 1.7364025115966797,
"learning_rate": 1.834709592539566e-05,
"loss": 0.4879,
"step": 47200
},
{
"epoch": 1.8302828618968388,
"grad_norm": 3.0614891052246094,
"learning_rate": 1.8339356885810474e-05,
"loss": 0.4827,
"step": 47300
},
{
"epoch": 1.8341523816894323,
"grad_norm": 2.4689478874206543,
"learning_rate": 1.8331617846225283e-05,
"loss": 0.4704,
"step": 47400
},
{
"epoch": 1.838021901482026,
"grad_norm": 1.7155258655548096,
"learning_rate": 1.83238788066401e-05,
"loss": 0.4671,
"step": 47500
},
{
"epoch": 1.8418914212746198,
"grad_norm": 6.756774425506592,
"learning_rate": 1.831613976705491e-05,
"loss": 0.4776,
"step": 47600
},
{
"epoch": 1.8457609410672136,
"grad_norm": 2.346327543258667,
"learning_rate": 1.8308400727469723e-05,
"loss": 0.4973,
"step": 47700
},
{
"epoch": 1.8496304608598073,
"grad_norm": 2.0925145149230957,
"learning_rate": 1.8300661687884536e-05,
"loss": 0.4738,
"step": 47800
},
{
"epoch": 1.853499980652401,
"grad_norm": 2.5207793712615967,
"learning_rate": 1.8292922648299345e-05,
"loss": 0.5101,
"step": 47900
},
{
"epoch": 1.8573695004449948,
"grad_norm": 1.3195054531097412,
"learning_rate": 1.8285183608714162e-05,
"loss": 0.4742,
"step": 48000
},
{
"epoch": 1.8612390202375884,
"grad_norm": 3.062666654586792,
"learning_rate": 1.827744456912897e-05,
"loss": 0.4849,
"step": 48100
},
{
"epoch": 1.8651085400301821,
"grad_norm": 2.738952398300171,
"learning_rate": 1.8269705529543785e-05,
"loss": 0.4841,
"step": 48200
},
{
"epoch": 1.8689780598227759,
"grad_norm": 2.114544630050659,
"learning_rate": 1.8261966489958598e-05,
"loss": 0.4789,
"step": 48300
},
{
"epoch": 1.8728475796153696,
"grad_norm": 3.1391451358795166,
"learning_rate": 1.825422745037341e-05,
"loss": 0.4871,
"step": 48400
},
{
"epoch": 1.8767170994079634,
"grad_norm": 1.8246567249298096,
"learning_rate": 1.8246488410788224e-05,
"loss": 0.4748,
"step": 48500
},
{
"epoch": 1.8805866192005571,
"grad_norm": 1.226166009902954,
"learning_rate": 1.8238749371203034e-05,
"loss": 0.4734,
"step": 48600
},
{
"epoch": 1.884456138993151,
"grad_norm": 1.876904010772705,
"learning_rate": 1.8231010331617847e-05,
"loss": 0.4901,
"step": 48700
},
{
"epoch": 1.8883256587857447,
"grad_norm": 1.992308259010315,
"learning_rate": 1.822327129203266e-05,
"loss": 0.4525,
"step": 48800
},
{
"epoch": 1.8921951785783384,
"grad_norm": 1.9081039428710938,
"learning_rate": 1.8215532252447473e-05,
"loss": 0.4649,
"step": 48900
},
{
"epoch": 1.8960646983709322,
"grad_norm": 1.3478918075561523,
"learning_rate": 1.8207793212862286e-05,
"loss": 0.4687,
"step": 49000
},
{
"epoch": 1.899934218163526,
"grad_norm": 1.5510190725326538,
"learning_rate": 1.8200054173277096e-05,
"loss": 0.4593,
"step": 49100
},
{
"epoch": 1.9038037379561197,
"grad_norm": 2.338334798812866,
"learning_rate": 1.8192315133691912e-05,
"loss": 0.4713,
"step": 49200
},
{
"epoch": 1.9076732577487134,
"grad_norm": 1.5012677907943726,
"learning_rate": 1.8184576094106722e-05,
"loss": 0.4682,
"step": 49300
},
{
"epoch": 1.9115427775413072,
"grad_norm": 1.439362645149231,
"learning_rate": 1.8176837054521535e-05,
"loss": 0.4751,
"step": 49400
},
{
"epoch": 1.915412297333901,
"grad_norm": 1.9710556268692017,
"learning_rate": 1.816909801493635e-05,
"loss": 0.4823,
"step": 49500
},
{
"epoch": 1.9192818171264947,
"grad_norm": 1.8966295719146729,
"learning_rate": 1.816135897535116e-05,
"loss": 0.4524,
"step": 49600
},
{
"epoch": 1.9231513369190885,
"grad_norm": 1.6913188695907593,
"learning_rate": 1.8153619935765975e-05,
"loss": 0.458,
"step": 49700
},
{
"epoch": 1.9270208567116822,
"grad_norm": 2.3188533782958984,
"learning_rate": 1.8145880896180784e-05,
"loss": 0.4672,
"step": 49800
},
{
"epoch": 1.930890376504276,
"grad_norm": 1.5947829484939575,
"learning_rate": 1.8138141856595597e-05,
"loss": 0.4703,
"step": 49900
},
{
"epoch": 1.9347598962968695,
"grad_norm": 1.9424033164978027,
"learning_rate": 1.813040281701041e-05,
"loss": 0.4808,
"step": 50000
},
{
"epoch": 1.9386294160894633,
"grad_norm": 1.3641841411590576,
"learning_rate": 1.8122663777425224e-05,
"loss": 0.4624,
"step": 50100
},
{
"epoch": 1.942498935882057,
"grad_norm": 1.4691535234451294,
"learning_rate": 1.8114924737840037e-05,
"loss": 0.4741,
"step": 50200
},
{
"epoch": 1.9463684556746508,
"grad_norm": 0.9599714279174805,
"learning_rate": 1.8107185698254846e-05,
"loss": 0.4379,
"step": 50300
},
{
"epoch": 1.9502379754672445,
"grad_norm": 2.234408140182495,
"learning_rate": 1.8099446658669663e-05,
"loss": 0.4594,
"step": 50400
},
{
"epoch": 1.9541074952598383,
"grad_norm": 1.6762405633926392,
"learning_rate": 1.8091707619084473e-05,
"loss": 0.4767,
"step": 50500
},
{
"epoch": 1.957977015052432,
"grad_norm": 2.0577871799468994,
"learning_rate": 1.8083968579499286e-05,
"loss": 0.4282,
"step": 50600
},
{
"epoch": 1.9618465348450256,
"grad_norm": 1.48188054561615,
"learning_rate": 1.80762295399141e-05,
"loss": 0.4656,
"step": 50700
},
{
"epoch": 1.9657160546376193,
"grad_norm": 2.1647377014160156,
"learning_rate": 1.806849050032891e-05,
"loss": 0.4531,
"step": 50800
},
{
"epoch": 1.969585574430213,
"grad_norm": 1.387678861618042,
"learning_rate": 1.8060751460743725e-05,
"loss": 0.4693,
"step": 50900
},
{
"epoch": 1.9734550942228068,
"grad_norm": 0.9403946995735168,
"learning_rate": 1.8053012421158535e-05,
"loss": 0.4632,
"step": 51000
},
{
"epoch": 1.9773246140154006,
"grad_norm": 2.03277325630188,
"learning_rate": 1.8045273381573348e-05,
"loss": 0.4539,
"step": 51100
},
{
"epoch": 1.9811941338079944,
"grad_norm": 2.1548171043395996,
"learning_rate": 1.803753434198816e-05,
"loss": 0.4661,
"step": 51200
},
{
"epoch": 1.9850636536005881,
"grad_norm": 1.6806986331939697,
"learning_rate": 1.8029795302402974e-05,
"loss": 0.4735,
"step": 51300
},
{
"epoch": 1.9889331733931819,
"grad_norm": 2.099877119064331,
"learning_rate": 1.8022056262817787e-05,
"loss": 0.4638,
"step": 51400
},
{
"epoch": 1.9928026931857756,
"grad_norm": 1.8525362014770508,
"learning_rate": 1.8014317223232597e-05,
"loss": 0.4678,
"step": 51500
},
{
"epoch": 1.9966722129783694,
"grad_norm": 2.842991590499878,
"learning_rate": 1.8006578183647413e-05,
"loss": 0.4516,
"step": 51600
},
{
"epoch": 1.9999226096041482,
"eval_loss": 0.31903380155563354,
"eval_runtime": 75.5346,
"eval_samples_per_second": 27.656,
"eval_steps_per_second": 3.469,
"step": 51684
},
{
"epoch": 2.000541732770963,
"grad_norm": 1.955477237701416,
"learning_rate": 1.999876175366637e-05,
"loss": 0.4118,
"step": 51700
},
{
"epoch": 2.004411252563557,
"grad_norm": 1.9310007095336914,
"learning_rate": 1.9991022714081184e-05,
"loss": 0.4665,
"step": 51800
},
{
"epoch": 2.0082807723561507,
"grad_norm": 1.7999227046966553,
"learning_rate": 1.9983283674495998e-05,
"loss": 0.4763,
"step": 51900
},
{
"epoch": 2.0121502921487444,
"grad_norm": 1.753142237663269,
"learning_rate": 1.9975544634910807e-05,
"loss": 0.4447,
"step": 52000
},
{
"epoch": 2.016019811941338,
"grad_norm": 2.3983778953552246,
"learning_rate": 1.9967805595325624e-05,
"loss": 0.4397,
"step": 52100
},
{
"epoch": 2.019889331733932,
"grad_norm": 2.5045669078826904,
"learning_rate": 1.9960066555740433e-05,
"loss": 0.4474,
"step": 52200
},
{
"epoch": 2.0237588515265257,
"grad_norm": 1.4538400173187256,
"learning_rate": 1.9952327516155247e-05,
"loss": 0.4422,
"step": 52300
},
{
"epoch": 2.0276283713191194,
"grad_norm": 1.072337031364441,
"learning_rate": 1.994458847657006e-05,
"loss": 0.4726,
"step": 52400
},
{
"epoch": 2.031497891111713,
"grad_norm": 1.7496392726898193,
"learning_rate": 1.993684943698487e-05,
"loss": 0.4579,
"step": 52500
},
{
"epoch": 2.035367410904307,
"grad_norm": 3.286799192428589,
"learning_rate": 1.9929110397399686e-05,
"loss": 0.4537,
"step": 52600
},
{
"epoch": 2.0392369306969007,
"grad_norm": 2.7570724487304688,
"learning_rate": 1.9921371357814496e-05,
"loss": 0.4816,
"step": 52700
},
{
"epoch": 2.0431064504894945,
"grad_norm": 2.1981394290924072,
"learning_rate": 1.991363231822931e-05,
"loss": 0.4685,
"step": 52800
},
{
"epoch": 2.0469759702820878,
"grad_norm": 1.7801116704940796,
"learning_rate": 1.9905893278644122e-05,
"loss": 0.4573,
"step": 52900
},
{
"epoch": 2.0508454900746815,
"grad_norm": 4.362318992614746,
"learning_rate": 1.9898154239058935e-05,
"loss": 0.4489,
"step": 53000
},
{
"epoch": 2.0547150098672753,
"grad_norm": 3.178088665008545,
"learning_rate": 1.9890415199473748e-05,
"loss": 0.4531,
"step": 53100
},
{
"epoch": 2.058584529659869,
"grad_norm": 2.0428311824798584,
"learning_rate": 1.9882676159888558e-05,
"loss": 0.4446,
"step": 53200
},
{
"epoch": 2.062454049452463,
"grad_norm": 2.213200569152832,
"learning_rate": 1.9874937120303374e-05,
"loss": 0.4666,
"step": 53300
},
{
"epoch": 2.0663235692450566,
"grad_norm": 2.4708657264709473,
"learning_rate": 1.9867198080718184e-05,
"loss": 0.4441,
"step": 53400
},
{
"epoch": 2.0701930890376503,
"grad_norm": 1.6680482625961304,
"learning_rate": 1.9859459041132997e-05,
"loss": 0.451,
"step": 53500
},
{
"epoch": 2.074062608830244,
"grad_norm": 1.9106837511062622,
"learning_rate": 1.985172000154781e-05,
"loss": 0.4398,
"step": 53600
},
{
"epoch": 2.077932128622838,
"grad_norm": 1.2974762916564941,
"learning_rate": 1.984398096196262e-05,
"loss": 0.4365,
"step": 53700
},
{
"epoch": 2.0818016484154316,
"grad_norm": 2.707754135131836,
"learning_rate": 1.9836241922377436e-05,
"loss": 0.4387,
"step": 53800
},
{
"epoch": 2.0856711682080253,
"grad_norm": 1.648790955543518,
"learning_rate": 1.9828502882792246e-05,
"loss": 0.4734,
"step": 53900
},
{
"epoch": 2.089540688000619,
"grad_norm": 1.6504909992218018,
"learning_rate": 1.982076384320706e-05,
"loss": 0.4239,
"step": 54000
},
{
"epoch": 2.093410207793213,
"grad_norm": 1.8103519678115845,
"learning_rate": 1.9813024803621872e-05,
"loss": 0.4551,
"step": 54100
},
{
"epoch": 2.0972797275858066,
"grad_norm": 1.9078305959701538,
"learning_rate": 1.9805285764036682e-05,
"loss": 0.45,
"step": 54200
},
{
"epoch": 2.1011492473784004,
"grad_norm": 2.079587459564209,
"learning_rate": 1.97975467244515e-05,
"loss": 0.4478,
"step": 54300
},
{
"epoch": 2.105018767170994,
"grad_norm": 1.3618863821029663,
"learning_rate": 1.9789807684866308e-05,
"loss": 0.4285,
"step": 54400
},
{
"epoch": 2.108888286963588,
"grad_norm": 3.3195624351501465,
"learning_rate": 1.978206864528112e-05,
"loss": 0.4545,
"step": 54500
},
{
"epoch": 2.1127578067561816,
"grad_norm": 1.7043200731277466,
"learning_rate": 1.9774329605695934e-05,
"loss": 0.4574,
"step": 54600
},
{
"epoch": 2.1166273265487754,
"grad_norm": 1.7037144899368286,
"learning_rate": 1.9766590566110748e-05,
"loss": 0.436,
"step": 54700
},
{
"epoch": 2.120496846341369,
"grad_norm": 1.7071598768234253,
"learning_rate": 1.975885152652556e-05,
"loss": 0.4453,
"step": 54800
},
{
"epoch": 2.124366366133963,
"grad_norm": 1.7206696271896362,
"learning_rate": 1.975111248694037e-05,
"loss": 0.4679,
"step": 54900
},
{
"epoch": 2.1282358859265567,
"grad_norm": 1.6118768453598022,
"learning_rate": 1.9743373447355187e-05,
"loss": 0.4371,
"step": 55000
},
{
"epoch": 2.1321054057191504,
"grad_norm": 1.6849126815795898,
"learning_rate": 1.9735634407769997e-05,
"loss": 0.4159,
"step": 55100
},
{
"epoch": 2.135974925511744,
"grad_norm": 1.7709155082702637,
"learning_rate": 1.972789536818481e-05,
"loss": 0.4229,
"step": 55200
},
{
"epoch": 2.139844445304338,
"grad_norm": 1.504991888999939,
"learning_rate": 1.9720156328599623e-05,
"loss": 0.4459,
"step": 55300
},
{
"epoch": 2.1437139650969312,
"grad_norm": 2.0437350273132324,
"learning_rate": 1.9712417289014432e-05,
"loss": 0.4507,
"step": 55400
},
{
"epoch": 2.147583484889525,
"grad_norm": 1.811646580696106,
"learning_rate": 1.970467824942925e-05,
"loss": 0.4348,
"step": 55500
},
{
"epoch": 2.1514530046821188,
"grad_norm": 2.6153831481933594,
"learning_rate": 1.969693920984406e-05,
"loss": 0.4296,
"step": 55600
},
{
"epoch": 2.1553225244747125,
"grad_norm": 2.2653005123138428,
"learning_rate": 1.9689200170258872e-05,
"loss": 0.4495,
"step": 55700
},
{
"epoch": 2.1591920442673063,
"grad_norm": 1.3044402599334717,
"learning_rate": 1.9681461130673685e-05,
"loss": 0.4353,
"step": 55800
},
{
"epoch": 2.1630615640599,
"grad_norm": 1.2863844633102417,
"learning_rate": 1.9673722091088498e-05,
"loss": 0.4343,
"step": 55900
},
{
"epoch": 2.1669310838524938,
"grad_norm": 2.1832025051116943,
"learning_rate": 1.966598305150331e-05,
"loss": 0.4521,
"step": 56000
},
{
"epoch": 2.1708006036450875,
"grad_norm": 10.471029281616211,
"learning_rate": 1.965824401191812e-05,
"loss": 0.437,
"step": 56100
},
{
"epoch": 2.1746701234376813,
"grad_norm": 1.3444117307662964,
"learning_rate": 1.9650504972332934e-05,
"loss": 0.4287,
"step": 56200
},
{
"epoch": 2.178539643230275,
"grad_norm": 1.4190716743469238,
"learning_rate": 1.9642765932747747e-05,
"loss": 0.4384,
"step": 56300
},
{
"epoch": 2.182409163022869,
"grad_norm": 2.5630593299865723,
"learning_rate": 1.963502689316256e-05,
"loss": 0.4437,
"step": 56400
},
{
"epoch": 2.1862786828154626,
"grad_norm": 4.250194549560547,
"learning_rate": 1.9627287853577373e-05,
"loss": 0.4253,
"step": 56500
},
{
"epoch": 2.1901482026080563,
"grad_norm": 2.147238254547119,
"learning_rate": 1.9619548813992183e-05,
"loss": 0.4451,
"step": 56600
},
{
"epoch": 2.19401772240065,
"grad_norm": 1.2069705724716187,
"learning_rate": 1.9611809774407e-05,
"loss": 0.431,
"step": 56700
},
{
"epoch": 2.197887242193244,
"grad_norm": 1.8252161741256714,
"learning_rate": 1.960407073482181e-05,
"loss": 0.4358,
"step": 56800
},
{
"epoch": 2.2017567619858376,
"grad_norm": 1.5236437320709229,
"learning_rate": 1.9596331695236622e-05,
"loss": 0.4291,
"step": 56900
},
{
"epoch": 2.2056262817784313,
"grad_norm": 1.3067554235458374,
"learning_rate": 1.9588592655651435e-05,
"loss": 0.4462,
"step": 57000
},
{
"epoch": 2.209495801571025,
"grad_norm": 3.1271252632141113,
"learning_rate": 1.958085361606625e-05,
"loss": 0.4308,
"step": 57100
},
{
"epoch": 2.213365321363619,
"grad_norm": 1.5025999546051025,
"learning_rate": 1.957311457648106e-05,
"loss": 0.4454,
"step": 57200
},
{
"epoch": 2.2172348411562126,
"grad_norm": 2.4508609771728516,
"learning_rate": 1.956537553689587e-05,
"loss": 0.4294,
"step": 57300
},
{
"epoch": 2.2211043609488064,
"grad_norm": 1.9247980117797852,
"learning_rate": 1.9557636497310684e-05,
"loss": 0.4388,
"step": 57400
},
{
"epoch": 2.2249738807414,
"grad_norm": 1.0998092889785767,
"learning_rate": 1.9549897457725497e-05,
"loss": 0.4249,
"step": 57500
},
{
"epoch": 2.228843400533994,
"grad_norm": 2.039179563522339,
"learning_rate": 1.954215841814031e-05,
"loss": 0.4138,
"step": 57600
},
{
"epoch": 2.2327129203265876,
"grad_norm": 1.4328453540802002,
"learning_rate": 1.9534419378555124e-05,
"loss": 0.4485,
"step": 57700
},
{
"epoch": 2.2365824401191814,
"grad_norm": 1.4221556186676025,
"learning_rate": 1.9526680338969933e-05,
"loss": 0.4372,
"step": 57800
},
{
"epoch": 2.240451959911775,
"grad_norm": 1.6141443252563477,
"learning_rate": 1.951894129938475e-05,
"loss": 0.4193,
"step": 57900
},
{
"epoch": 2.244321479704369,
"grad_norm": 1.6460309028625488,
"learning_rate": 1.951120225979956e-05,
"loss": 0.4289,
"step": 58000
},
{
"epoch": 2.2481909994969627,
"grad_norm": 1.906775951385498,
"learning_rate": 1.9503463220214373e-05,
"loss": 0.426,
"step": 58100
},
{
"epoch": 2.252060519289556,
"grad_norm": 1.4007736444473267,
"learning_rate": 1.9495724180629186e-05,
"loss": 0.4237,
"step": 58200
},
{
"epoch": 2.2559300390821497,
"grad_norm": 1.6321462392807007,
"learning_rate": 1.9487985141044e-05,
"loss": 0.442,
"step": 58300
},
{
"epoch": 2.2597995588747435,
"grad_norm": 2.8257906436920166,
"learning_rate": 1.9480246101458812e-05,
"loss": 0.4309,
"step": 58400
},
{
"epoch": 2.2636690786673372,
"grad_norm": 1.8106814622879028,
"learning_rate": 1.9472507061873622e-05,
"loss": 0.4431,
"step": 58500
},
{
"epoch": 2.267538598459931,
"grad_norm": 1.142861247062683,
"learning_rate": 1.9464768022288435e-05,
"loss": 0.415,
"step": 58600
},
{
"epoch": 2.2714081182525248,
"grad_norm": 1.88276207447052,
"learning_rate": 1.9457028982703248e-05,
"loss": 0.4004,
"step": 58700
},
{
"epoch": 2.2752776380451185,
"grad_norm": 1.45915687084198,
"learning_rate": 1.944928994311806e-05,
"loss": 0.4146,
"step": 58800
},
{
"epoch": 2.2791471578377123,
"grad_norm": 1.1579883098602295,
"learning_rate": 1.9441550903532874e-05,
"loss": 0.4161,
"step": 58900
},
{
"epoch": 2.283016677630306,
"grad_norm": 1.4199334383010864,
"learning_rate": 1.9433811863947684e-05,
"loss": 0.4284,
"step": 59000
},
{
"epoch": 2.2868861974229,
"grad_norm": 3.017756938934326,
"learning_rate": 1.9426072824362497e-05,
"loss": 0.4229,
"step": 59100
},
{
"epoch": 2.2907557172154935,
"grad_norm": 1.9444090127944946,
"learning_rate": 1.941833378477731e-05,
"loss": 0.4283,
"step": 59200
},
{
"epoch": 2.2946252370080873,
"grad_norm": 2.231217622756958,
"learning_rate": 1.9410594745192123e-05,
"loss": 0.4295,
"step": 59300
},
{
"epoch": 2.298494756800681,
"grad_norm": 2.147974729537964,
"learning_rate": 1.9402855705606936e-05,
"loss": 0.4158,
"step": 59400
},
{
"epoch": 2.302364276593275,
"grad_norm": 1.0102565288543701,
"learning_rate": 1.939511666602175e-05,
"loss": 0.4279,
"step": 59500
},
{
"epoch": 2.3062337963858686,
"grad_norm": 6.695295333862305,
"learning_rate": 1.9387377626436562e-05,
"loss": 0.4093,
"step": 59600
},
{
"epoch": 2.3101033161784623,
"grad_norm": 1.369629979133606,
"learning_rate": 1.9379638586851372e-05,
"loss": 0.4441,
"step": 59700
},
{
"epoch": 2.313972835971056,
"grad_norm": 1.9461331367492676,
"learning_rate": 1.9371899547266185e-05,
"loss": 0.433,
"step": 59800
},
{
"epoch": 2.31784235576365,
"grad_norm": 1.8734780550003052,
"learning_rate": 1.9364160507681e-05,
"loss": 0.4381,
"step": 59900
},
{
"epoch": 2.3217118755562436,
"grad_norm": 1.843590259552002,
"learning_rate": 1.935642146809581e-05,
"loss": 0.4186,
"step": 60000
},
{
"epoch": 2.3255813953488373,
"grad_norm": 2.5406510829925537,
"learning_rate": 1.9348682428510625e-05,
"loss": 0.4241,
"step": 60100
},
{
"epoch": 2.329450915141431,
"grad_norm": 2.618091344833374,
"learning_rate": 1.9340943388925434e-05,
"loss": 0.4123,
"step": 60200
},
{
"epoch": 2.333320434934025,
"grad_norm": 1.8404749631881714,
"learning_rate": 1.9333204349340247e-05,
"loss": 0.4171,
"step": 60300
},
{
"epoch": 2.3371899547266186,
"grad_norm": 1.55924391746521,
"learning_rate": 1.932546530975506e-05,
"loss": 0.4149,
"step": 60400
},
{
"epoch": 2.341059474519212,
"grad_norm": 2.034311532974243,
"learning_rate": 1.9317726270169874e-05,
"loss": 0.4127,
"step": 60500
},
{
"epoch": 2.3449289943118057,
"grad_norm": 1.4090275764465332,
"learning_rate": 1.9309987230584687e-05,
"loss": 0.3978,
"step": 60600
},
{
"epoch": 2.3487985141043994,
"grad_norm": 2.1726934909820557,
"learning_rate": 1.93022481909995e-05,
"loss": 0.4252,
"step": 60700
},
{
"epoch": 2.352668033896993,
"grad_norm": 4.539790630340576,
"learning_rate": 1.929450915141431e-05,
"loss": 0.4298,
"step": 60800
},
{
"epoch": 2.356537553689587,
"grad_norm": 1.1041001081466675,
"learning_rate": 1.9286770111829123e-05,
"loss": 0.421,
"step": 60900
},
{
"epoch": 2.3604070734821807,
"grad_norm": 1.8974961042404175,
"learning_rate": 1.9279031072243936e-05,
"loss": 0.4353,
"step": 61000
},
{
"epoch": 2.3642765932747745,
"grad_norm": 2.053619384765625,
"learning_rate": 1.927129203265875e-05,
"loss": 0.4134,
"step": 61100
},
{
"epoch": 2.368146113067368,
"grad_norm": 1.9047490358352661,
"learning_rate": 1.9263552993073562e-05,
"loss": 0.4421,
"step": 61200
},
{
"epoch": 2.372015632859962,
"grad_norm": 2.2807693481445312,
"learning_rate": 1.9255813953488375e-05,
"loss": 0.4173,
"step": 61300
},
{
"epoch": 2.3758851526525557,
"grad_norm": 1.544053077697754,
"learning_rate": 1.9248074913903185e-05,
"loss": 0.4289,
"step": 61400
},
{
"epoch": 2.3797546724451495,
"grad_norm": 1.469085693359375,
"learning_rate": 1.9240335874317998e-05,
"loss": 0.4094,
"step": 61500
},
{
"epoch": 2.3836241922377432,
"grad_norm": 1.9199841022491455,
"learning_rate": 1.923259683473281e-05,
"loss": 0.425,
"step": 61600
},
{
"epoch": 2.387493712030337,
"grad_norm": 1.388945460319519,
"learning_rate": 1.9224857795147624e-05,
"loss": 0.4238,
"step": 61700
},
{
"epoch": 2.3913632318229308,
"grad_norm": 1.136607050895691,
"learning_rate": 1.9217118755562437e-05,
"loss": 0.4242,
"step": 61800
},
{
"epoch": 2.3952327516155245,
"grad_norm": 2.213928699493408,
"learning_rate": 1.9209379715977247e-05,
"loss": 0.41,
"step": 61900
},
{
"epoch": 2.3991022714081183,
"grad_norm": 1.337274193763733,
"learning_rate": 1.920164067639206e-05,
"loss": 0.4051,
"step": 62000
},
{
"epoch": 2.402971791200712,
"grad_norm": 1.7968541383743286,
"learning_rate": 1.9193901636806873e-05,
"loss": 0.4264,
"step": 62100
},
{
"epoch": 2.406841310993306,
"grad_norm": 2.299100160598755,
"learning_rate": 1.9186162597221686e-05,
"loss": 0.4316,
"step": 62200
},
{
"epoch": 2.4107108307858995,
"grad_norm": 1.3097466230392456,
"learning_rate": 1.91784235576365e-05,
"loss": 0.416,
"step": 62300
},
{
"epoch": 2.4145803505784933,
"grad_norm": 1.9384897947311401,
"learning_rate": 1.9170684518051312e-05,
"loss": 0.398,
"step": 62400
},
{
"epoch": 2.418449870371087,
"grad_norm": 1.8852757215499878,
"learning_rate": 1.9162945478466126e-05,
"loss": 0.4158,
"step": 62500
},
{
"epoch": 2.422319390163681,
"grad_norm": 3.9488649368286133,
"learning_rate": 1.9155206438880935e-05,
"loss": 0.4113,
"step": 62600
},
{
"epoch": 2.4261889099562746,
"grad_norm": 1.9499768018722534,
"learning_rate": 1.914746739929575e-05,
"loss": 0.4163,
"step": 62700
},
{
"epoch": 2.4300584297488683,
"grad_norm": 1.1540164947509766,
"learning_rate": 1.913972835971056e-05,
"loss": 0.3977,
"step": 62800
},
{
"epoch": 2.433927949541462,
"grad_norm": 2.318495750427246,
"learning_rate": 1.9131989320125375e-05,
"loss": 0.4323,
"step": 62900
},
{
"epoch": 2.437797469334056,
"grad_norm": 1.8283582925796509,
"learning_rate": 1.9124250280540188e-05,
"loss": 0.3982,
"step": 63000
},
{
"epoch": 2.4416669891266496,
"grad_norm": 1.4836108684539795,
"learning_rate": 1.9116511240954997e-05,
"loss": 0.4079,
"step": 63100
},
{
"epoch": 2.4455365089192433,
"grad_norm": 1.6268258094787598,
"learning_rate": 1.910877220136981e-05,
"loss": 0.4099,
"step": 63200
},
{
"epoch": 2.449406028711837,
"grad_norm": 1.655819296836853,
"learning_rate": 1.9101033161784624e-05,
"loss": 0.4139,
"step": 63300
},
{
"epoch": 2.4532755485044304,
"grad_norm": 3.3714959621429443,
"learning_rate": 1.9093294122199437e-05,
"loss": 0.4258,
"step": 63400
},
{
"epoch": 2.457145068297024,
"grad_norm": 1.9959139823913574,
"learning_rate": 1.908555508261425e-05,
"loss": 0.4071,
"step": 63500
},
{
"epoch": 2.461014588089618,
"grad_norm": 1.5244359970092773,
"learning_rate": 1.9077816043029063e-05,
"loss": 0.4129,
"step": 63600
},
{
"epoch": 2.4648841078822117,
"grad_norm": 1.6724839210510254,
"learning_rate": 1.9070077003443873e-05,
"loss": 0.3886,
"step": 63700
},
{
"epoch": 2.4687536276748054,
"grad_norm": 1.5094096660614014,
"learning_rate": 1.9062337963858686e-05,
"loss": 0.3924,
"step": 63800
},
{
"epoch": 2.472623147467399,
"grad_norm": 1.6173244714736938,
"learning_rate": 1.90545989242735e-05,
"loss": 0.4284,
"step": 63900
},
{
"epoch": 2.476492667259993,
"grad_norm": 1.7004306316375732,
"learning_rate": 1.9046859884688312e-05,
"loss": 0.3927,
"step": 64000
},
{
"epoch": 2.4803621870525867,
"grad_norm": 3.754396677017212,
"learning_rate": 1.9039120845103125e-05,
"loss": 0.4147,
"step": 64100
},
{
"epoch": 2.4842317068451805,
"grad_norm": 1.993538737297058,
"learning_rate": 1.9031381805517938e-05,
"loss": 0.4388,
"step": 64200
},
{
"epoch": 2.488101226637774,
"grad_norm": 2.2600507736206055,
"learning_rate": 1.9023642765932748e-05,
"loss": 0.4126,
"step": 64300
},
{
"epoch": 2.491970746430368,
"grad_norm": 2.028188943862915,
"learning_rate": 1.901590372634756e-05,
"loss": 0.4029,
"step": 64400
},
{
"epoch": 2.4958402662229617,
"grad_norm": 1.467639446258545,
"learning_rate": 1.9008164686762374e-05,
"loss": 0.424,
"step": 64500
},
{
"epoch": 2.4997097860155555,
"grad_norm": 1.876083493232727,
"learning_rate": 1.9000425647177187e-05,
"loss": 0.4178,
"step": 64600
},
{
"epoch": 2.4999032620051853,
"eval_loss": 0.28442877531051636,
"eval_runtime": 71.3333,
"eval_samples_per_second": 29.285,
"eval_steps_per_second": 3.673,
"step": 64605
},
{
"epoch": 2.5035793058081492,
"grad_norm": 1.7402297258377075,
"learning_rate": 1.8992686607592e-05,
"loss": 0.4084,
"step": 64700
},
{
"epoch": 2.507448825600743,
"grad_norm": 1.4921330213546753,
"learning_rate": 1.8984947568006813e-05,
"loss": 0.4246,
"step": 64800
},
{
"epoch": 2.5113183453933368,
"grad_norm": 2.096726179122925,
"learning_rate": 1.8977208528421623e-05,
"loss": 0.411,
"step": 64900
},
{
"epoch": 2.5151878651859305,
"grad_norm": 1.0920313596725464,
"learning_rate": 1.8969469488836436e-05,
"loss": 0.416,
"step": 65000
},
{
"epoch": 2.5190573849785243,
"grad_norm": 1.7869759798049927,
"learning_rate": 1.896173044925125e-05,
"loss": 0.4182,
"step": 65100
},
{
"epoch": 2.522926904771118,
"grad_norm": 1.4348342418670654,
"learning_rate": 1.8953991409666062e-05,
"loss": 0.4053,
"step": 65200
},
{
"epoch": 2.526796424563712,
"grad_norm": 1.2327197790145874,
"learning_rate": 1.8946252370080876e-05,
"loss": 0.3838,
"step": 65300
},
{
"epoch": 2.5306659443563055,
"grad_norm": 1.9678161144256592,
"learning_rate": 1.893851333049569e-05,
"loss": 0.4049,
"step": 65400
},
{
"epoch": 2.534535464148899,
"grad_norm": 1.6470235586166382,
"learning_rate": 1.89307742909105e-05,
"loss": 0.4429,
"step": 65500
},
{
"epoch": 2.5384049839414926,
"grad_norm": 1.9904509782791138,
"learning_rate": 1.892303525132531e-05,
"loss": 0.3835,
"step": 65600
},
{
"epoch": 2.5422745037340864,
"grad_norm": 1.6979745626449585,
"learning_rate": 1.8915296211740125e-05,
"loss": 0.4051,
"step": 65700
},
{
"epoch": 2.54614402352668,
"grad_norm": 1.5223089456558228,
"learning_rate": 1.8907557172154938e-05,
"loss": 0.3995,
"step": 65800
},
{
"epoch": 2.550013543319274,
"grad_norm": 1.0474973917007446,
"learning_rate": 1.889981813256975e-05,
"loss": 0.3959,
"step": 65900
},
{
"epoch": 2.5538830631118676,
"grad_norm": 1.2941759824752808,
"learning_rate": 1.8892079092984564e-05,
"loss": 0.4084,
"step": 66000
},
{
"epoch": 2.5577525829044614,
"grad_norm": 2.273611545562744,
"learning_rate": 1.8884340053399374e-05,
"loss": 0.4051,
"step": 66100
},
{
"epoch": 2.561622102697055,
"grad_norm": 2.124966859817505,
"learning_rate": 1.8876601013814187e-05,
"loss": 0.4022,
"step": 66200
},
{
"epoch": 2.565491622489649,
"grad_norm": 2.5645906925201416,
"learning_rate": 1.8868861974229e-05,
"loss": 0.4108,
"step": 66300
},
{
"epoch": 2.5693611422822427,
"grad_norm": 1.5864262580871582,
"learning_rate": 1.8861122934643813e-05,
"loss": 0.402,
"step": 66400
},
{
"epoch": 2.5732306620748364,
"grad_norm": 1.5741071701049805,
"learning_rate": 1.8853383895058626e-05,
"loss": 0.4196,
"step": 66500
},
{
"epoch": 2.57710018186743,
"grad_norm": 1.3481132984161377,
"learning_rate": 1.8845644855473436e-05,
"loss": 0.4125,
"step": 66600
},
{
"epoch": 2.580969701660024,
"grad_norm": 2.292896032333374,
"learning_rate": 1.883790581588825e-05,
"loss": 0.3986,
"step": 66700
},
{
"epoch": 2.5848392214526177,
"grad_norm": 1.835115909576416,
"learning_rate": 1.8830166776303062e-05,
"loss": 0.4114,
"step": 66800
},
{
"epoch": 2.5887087412452114,
"grad_norm": 2.9787259101867676,
"learning_rate": 1.8822427736717875e-05,
"loss": 0.4039,
"step": 66900
},
{
"epoch": 2.592578261037805,
"grad_norm": 1.9280844926834106,
"learning_rate": 1.8814688697132688e-05,
"loss": 0.4032,
"step": 67000
},
{
"epoch": 2.596447780830399,
"grad_norm": 1.6325422525405884,
"learning_rate": 1.88069496575475e-05,
"loss": 0.42,
"step": 67100
},
{
"epoch": 2.6003173006229927,
"grad_norm": 1.9253751039505005,
"learning_rate": 1.8799210617962314e-05,
"loss": 0.4082,
"step": 67200
},
{
"epoch": 2.6041868204155865,
"grad_norm": 2.5450963973999023,
"learning_rate": 1.8791471578377124e-05,
"loss": 0.4055,
"step": 67300
},
{
"epoch": 2.6080563402081802,
"grad_norm": 1.7384142875671387,
"learning_rate": 1.8783732538791937e-05,
"loss": 0.4253,
"step": 67400
},
{
"epoch": 2.611925860000774,
"grad_norm": 2.371424913406372,
"learning_rate": 1.877599349920675e-05,
"loss": 0.4062,
"step": 67500
},
{
"epoch": 2.6157953797933677,
"grad_norm": 1.8545641899108887,
"learning_rate": 1.8768254459621563e-05,
"loss": 0.4301,
"step": 67600
},
{
"epoch": 2.6196648995859615,
"grad_norm": 2.011378765106201,
"learning_rate": 1.8760515420036376e-05,
"loss": 0.4081,
"step": 67700
},
{
"epoch": 2.6235344193785552,
"grad_norm": 1.7614511251449585,
"learning_rate": 1.8752776380451186e-05,
"loss": 0.3925,
"step": 67800
},
{
"epoch": 2.627403939171149,
"grad_norm": 2.2085518836975098,
"learning_rate": 1.8745037340866e-05,
"loss": 0.4054,
"step": 67900
},
{
"epoch": 2.6312734589637428,
"grad_norm": 1.7185721397399902,
"learning_rate": 1.8737298301280812e-05,
"loss": 0.4194,
"step": 68000
},
{
"epoch": 2.6351429787563365,
"grad_norm": 1.3710312843322754,
"learning_rate": 1.8729559261695626e-05,
"loss": 0.3902,
"step": 68100
},
{
"epoch": 2.6390124985489303,
"grad_norm": 2.2787375450134277,
"learning_rate": 1.872182022211044e-05,
"loss": 0.4027,
"step": 68200
},
{
"epoch": 2.642882018341524,
"grad_norm": 1.1309306621551514,
"learning_rate": 1.871408118252525e-05,
"loss": 0.3825,
"step": 68300
},
{
"epoch": 2.646751538134118,
"grad_norm": 1.4846229553222656,
"learning_rate": 1.870634214294006e-05,
"loss": 0.4127,
"step": 68400
},
{
"epoch": 2.6506210579267115,
"grad_norm": 1.4689130783081055,
"learning_rate": 1.8698603103354875e-05,
"loss": 0.4235,
"step": 68500
},
{
"epoch": 2.6544905777193053,
"grad_norm": 2.0043463706970215,
"learning_rate": 1.8690864063769688e-05,
"loss": 0.4108,
"step": 68600
},
{
"epoch": 2.6583600975118986,
"grad_norm": 1.7952332496643066,
"learning_rate": 1.86831250241845e-05,
"loss": 0.4145,
"step": 68700
},
{
"epoch": 2.6622296173044924,
"grad_norm": 1.4483826160430908,
"learning_rate": 1.8675385984599314e-05,
"loss": 0.3894,
"step": 68800
},
{
"epoch": 2.666099137097086,
"grad_norm": 1.6657880544662476,
"learning_rate": 1.8667646945014127e-05,
"loss": 0.3807,
"step": 68900
},
{
"epoch": 2.66996865688968,
"grad_norm": 1.7913455963134766,
"learning_rate": 1.8659907905428937e-05,
"loss": 0.3962,
"step": 69000
},
{
"epoch": 2.6738381766822736,
"grad_norm": 3.517664670944214,
"learning_rate": 1.865216886584375e-05,
"loss": 0.4053,
"step": 69100
},
{
"epoch": 2.6777076964748674,
"grad_norm": 1.5401078462600708,
"learning_rate": 1.8644429826258563e-05,
"loss": 0.4013,
"step": 69200
},
{
"epoch": 2.681577216267461,
"grad_norm": 1.832924485206604,
"learning_rate": 1.8636690786673376e-05,
"loss": 0.4001,
"step": 69300
},
{
"epoch": 2.685446736060055,
"grad_norm": 1.508527398109436,
"learning_rate": 1.862895174708819e-05,
"loss": 0.3918,
"step": 69400
},
{
"epoch": 2.6893162558526487,
"grad_norm": 1.329871654510498,
"learning_rate": 1.8621212707503e-05,
"loss": 0.4053,
"step": 69500
},
{
"epoch": 2.6931857756452424,
"grad_norm": 1.6876091957092285,
"learning_rate": 1.8613473667917812e-05,
"loss": 0.4068,
"step": 69600
},
{
"epoch": 2.697055295437836,
"grad_norm": 2.244474411010742,
"learning_rate": 1.8605734628332625e-05,
"loss": 0.3733,
"step": 69700
},
{
"epoch": 2.70092481523043,
"grad_norm": 1.6640311479568481,
"learning_rate": 1.8597995588747438e-05,
"loss": 0.4118,
"step": 69800
},
{
"epoch": 2.7047943350230237,
"grad_norm": 1.3973504304885864,
"learning_rate": 1.859025654916225e-05,
"loss": 0.3921,
"step": 69900
},
{
"epoch": 2.7086638548156174,
"grad_norm": 1.8491889238357544,
"learning_rate": 1.8582517509577064e-05,
"loss": 0.4114,
"step": 70000
},
{
"epoch": 2.712533374608211,
"grad_norm": 2.136467933654785,
"learning_rate": 1.8574778469991877e-05,
"loss": 0.3907,
"step": 70100
},
{
"epoch": 2.716402894400805,
"grad_norm": 1.5678136348724365,
"learning_rate": 1.8567039430406687e-05,
"loss": 0.3753,
"step": 70200
},
{
"epoch": 2.7202724141933987,
"grad_norm": 2.590696096420288,
"learning_rate": 1.85593003908215e-05,
"loss": 0.378,
"step": 70300
},
{
"epoch": 2.7241419339859925,
"grad_norm": 1.5565811395645142,
"learning_rate": 1.8551561351236313e-05,
"loss": 0.3704,
"step": 70400
},
{
"epoch": 2.7280114537785862,
"grad_norm": 0.8795768618583679,
"learning_rate": 1.8543822311651126e-05,
"loss": 0.4054,
"step": 70500
},
{
"epoch": 2.73188097357118,
"grad_norm": 1.5178683996200562,
"learning_rate": 1.853608327206594e-05,
"loss": 0.3753,
"step": 70600
},
{
"epoch": 2.7357504933637733,
"grad_norm": 1.8956971168518066,
"learning_rate": 1.852834423248075e-05,
"loss": 0.3957,
"step": 70700
},
{
"epoch": 2.739620013156367,
"grad_norm": 2.690990447998047,
"learning_rate": 1.8520605192895562e-05,
"loss": 0.4068,
"step": 70800
},
{
"epoch": 2.743489532948961,
"grad_norm": 2.114319086074829,
"learning_rate": 1.8512866153310375e-05,
"loss": 0.3945,
"step": 70900
},
{
"epoch": 2.7473590527415546,
"grad_norm": 1.9230573177337646,
"learning_rate": 1.850512711372519e-05,
"loss": 0.4104,
"step": 71000
},
{
"epoch": 2.7512285725341483,
"grad_norm": 1.365560531616211,
"learning_rate": 1.8497388074140002e-05,
"loss": 0.3866,
"step": 71100
},
{
"epoch": 2.755098092326742,
"grad_norm": 1.803850769996643,
"learning_rate": 1.848964903455481e-05,
"loss": 0.3915,
"step": 71200
},
{
"epoch": 2.758967612119336,
"grad_norm": 1.6910895109176636,
"learning_rate": 1.8481909994969628e-05,
"loss": 0.416,
"step": 71300
},
{
"epoch": 2.7628371319119296,
"grad_norm": 1.7176426649093628,
"learning_rate": 1.8474170955384438e-05,
"loss": 0.3913,
"step": 71400
},
{
"epoch": 2.7667066517045233,
"grad_norm": 1.2521884441375732,
"learning_rate": 1.846643191579925e-05,
"loss": 0.3802,
"step": 71500
},
{
"epoch": 2.770576171497117,
"grad_norm": 1.5698885917663574,
"learning_rate": 1.8458692876214064e-05,
"loss": 0.3964,
"step": 71600
},
{
"epoch": 2.774445691289711,
"grad_norm": 1.6318507194519043,
"learning_rate": 1.8450953836628877e-05,
"loss": 0.3688,
"step": 71700
},
{
"epoch": 2.7783152110823046,
"grad_norm": 2.5745694637298584,
"learning_rate": 1.844321479704369e-05,
"loss": 0.3796,
"step": 71800
},
{
"epoch": 2.7821847308748984,
"grad_norm": 2.249467134475708,
"learning_rate": 1.84354757574585e-05,
"loss": 0.4085,
"step": 71900
},
{
"epoch": 2.786054250667492,
"grad_norm": 1.1853622198104858,
"learning_rate": 1.8427736717873313e-05,
"loss": 0.3894,
"step": 72000
},
{
"epoch": 2.789923770460086,
"grad_norm": 1.2344344854354858,
"learning_rate": 1.8419997678288126e-05,
"loss": 0.3709,
"step": 72100
},
{
"epoch": 2.7937932902526796,
"grad_norm": 1.520385980606079,
"learning_rate": 1.841225863870294e-05,
"loss": 0.3874,
"step": 72200
},
{
"epoch": 2.7976628100452734,
"grad_norm": 2.7006897926330566,
"learning_rate": 1.8404519599117752e-05,
"loss": 0.3745,
"step": 72300
},
{
"epoch": 2.801532329837867,
"grad_norm": 2.2351534366607666,
"learning_rate": 1.8396780559532562e-05,
"loss": 0.3751,
"step": 72400
},
{
"epoch": 2.805401849630461,
"grad_norm": 2.515935182571411,
"learning_rate": 1.838904151994738e-05,
"loss": 0.3717,
"step": 72500
},
{
"epoch": 2.8092713694230547,
"grad_norm": 1.3921540975570679,
"learning_rate": 1.8381302480362188e-05,
"loss": 0.3742,
"step": 72600
},
{
"epoch": 2.8131408892156484,
"grad_norm": 3.017775058746338,
"learning_rate": 1.8373563440777e-05,
"loss": 0.384,
"step": 72700
},
{
"epoch": 2.817010409008242,
"grad_norm": 2.8901004791259766,
"learning_rate": 1.8365824401191814e-05,
"loss": 0.3757,
"step": 72800
},
{
"epoch": 2.820879928800836,
"grad_norm": 1.0690851211547852,
"learning_rate": 1.8358085361606624e-05,
"loss": 0.3878,
"step": 72900
},
{
"epoch": 2.8247494485934297,
"grad_norm": 1.4159176349639893,
"learning_rate": 1.835034632202144e-05,
"loss": 0.3976,
"step": 73000
},
{
"epoch": 2.8286189683860234,
"grad_norm": 1.8387011289596558,
"learning_rate": 1.834260728243625e-05,
"loss": 0.4264,
"step": 73100
},
{
"epoch": 2.832488488178617,
"grad_norm": 2.336967706680298,
"learning_rate": 1.8334868242851063e-05,
"loss": 0.4015,
"step": 73200
},
{
"epoch": 2.836358007971211,
"grad_norm": 2.210538864135742,
"learning_rate": 1.8327129203265876e-05,
"loss": 0.402,
"step": 73300
},
{
"epoch": 2.8402275277638047,
"grad_norm": 1.7943260669708252,
"learning_rate": 1.831939016368069e-05,
"loss": 0.3829,
"step": 73400
},
{
"epoch": 2.8440970475563985,
"grad_norm": 2.171783447265625,
"learning_rate": 1.8311651124095503e-05,
"loss": 0.3925,
"step": 73500
},
{
"epoch": 2.8479665673489922,
"grad_norm": 1.937455654144287,
"learning_rate": 1.8303912084510312e-05,
"loss": 0.3731,
"step": 73600
},
{
"epoch": 2.851836087141586,
"grad_norm": 2.4677200317382812,
"learning_rate": 1.829617304492513e-05,
"loss": 0.3792,
"step": 73700
},
{
"epoch": 2.8557056069341797,
"grad_norm": 1.6288717985153198,
"learning_rate": 1.828843400533994e-05,
"loss": 0.4079,
"step": 73800
},
{
"epoch": 2.859575126726773,
"grad_norm": 1.2947713136672974,
"learning_rate": 1.828069496575475e-05,
"loss": 0.3677,
"step": 73900
},
{
"epoch": 2.863444646519367,
"grad_norm": 2.0474677085876465,
"learning_rate": 1.8272955926169565e-05,
"loss": 0.3969,
"step": 74000
},
{
"epoch": 2.8673141663119606,
"grad_norm": 0.9555093050003052,
"learning_rate": 1.8265216886584374e-05,
"loss": 0.3791,
"step": 74100
},
{
"epoch": 2.8711836861045543,
"grad_norm": 2.7177960872650146,
"learning_rate": 1.825747784699919e-05,
"loss": 0.3899,
"step": 74200
},
{
"epoch": 2.875053205897148,
"grad_norm": 1.8301888704299927,
"learning_rate": 1.8249738807414e-05,
"loss": 0.3703,
"step": 74300
},
{
"epoch": 2.878922725689742,
"grad_norm": 1.6821845769882202,
"learning_rate": 1.8241999767828814e-05,
"loss": 0.3744,
"step": 74400
},
{
"epoch": 2.8827922454823356,
"grad_norm": 1.0236321687698364,
"learning_rate": 1.8234260728243627e-05,
"loss": 0.3849,
"step": 74500
},
{
"epoch": 2.8866617652749293,
"grad_norm": 1.5947498083114624,
"learning_rate": 1.822652168865844e-05,
"loss": 0.3727,
"step": 74600
},
{
"epoch": 2.890531285067523,
"grad_norm": 2.4842593669891357,
"learning_rate": 1.8218782649073253e-05,
"loss": 0.3939,
"step": 74700
},
{
"epoch": 2.894400804860117,
"grad_norm": 2.366248607635498,
"learning_rate": 1.8211043609488063e-05,
"loss": 0.3731,
"step": 74800
},
{
"epoch": 2.8982703246527106,
"grad_norm": 1.7339088916778564,
"learning_rate": 1.820330456990288e-05,
"loss": 0.404,
"step": 74900
},
{
"epoch": 2.9021398444453044,
"grad_norm": 3.6488988399505615,
"learning_rate": 1.819556553031769e-05,
"loss": 0.3875,
"step": 75000
},
{
"epoch": 2.906009364237898,
"grad_norm": 1.3778159618377686,
"learning_rate": 1.8187826490732502e-05,
"loss": 0.3858,
"step": 75100
},
{
"epoch": 2.909878884030492,
"grad_norm": 1.6373904943466187,
"learning_rate": 1.8180087451147315e-05,
"loss": 0.3784,
"step": 75200
},
{
"epoch": 2.9137484038230856,
"grad_norm": 5.406473636627197,
"learning_rate": 1.8172348411562125e-05,
"loss": 0.3787,
"step": 75300
},
{
"epoch": 2.9176179236156794,
"grad_norm": 1.9427270889282227,
"learning_rate": 1.816460937197694e-05,
"loss": 0.4024,
"step": 75400
},
{
"epoch": 2.921487443408273,
"grad_norm": 2.2424678802490234,
"learning_rate": 1.815687033239175e-05,
"loss": 0.3657,
"step": 75500
},
{
"epoch": 2.925356963200867,
"grad_norm": 1.9070557355880737,
"learning_rate": 1.8149131292806564e-05,
"loss": 0.3687,
"step": 75600
},
{
"epoch": 2.9292264829934607,
"grad_norm": 1.7725392580032349,
"learning_rate": 1.8141392253221377e-05,
"loss": 0.3735,
"step": 75700
},
{
"epoch": 2.933096002786054,
"grad_norm": 1.7941452264785767,
"learning_rate": 1.8133653213636187e-05,
"loss": 0.3861,
"step": 75800
},
{
"epoch": 2.9369655225786477,
"grad_norm": 2.008236885070801,
"learning_rate": 1.8125914174051004e-05,
"loss": 0.378,
"step": 75900
},
{
"epoch": 2.9408350423712415,
"grad_norm": 1.4745265245437622,
"learning_rate": 1.8118175134465813e-05,
"loss": 0.369,
"step": 76000
},
{
"epoch": 2.9447045621638352,
"grad_norm": 1.3864421844482422,
"learning_rate": 1.8110436094880626e-05,
"loss": 0.371,
"step": 76100
},
{
"epoch": 2.948574081956429,
"grad_norm": 2.3478002548217773,
"learning_rate": 1.810269705529544e-05,
"loss": 0.3923,
"step": 76200
},
{
"epoch": 2.9524436017490228,
"grad_norm": 1.6446783542633057,
"learning_rate": 1.8094958015710253e-05,
"loss": 0.3759,
"step": 76300
},
{
"epoch": 2.9563131215416165,
"grad_norm": 6.377575874328613,
"learning_rate": 1.8087218976125066e-05,
"loss": 0.376,
"step": 76400
},
{
"epoch": 2.9601826413342103,
"grad_norm": 1.960789680480957,
"learning_rate": 1.8079479936539875e-05,
"loss": 0.3874,
"step": 76500
},
{
"epoch": 2.964052161126804,
"grad_norm": 1.6182048320770264,
"learning_rate": 1.8071740896954692e-05,
"loss": 0.3949,
"step": 76600
},
{
"epoch": 2.967921680919398,
"grad_norm": 1.5921803712844849,
"learning_rate": 1.80640018573695e-05,
"loss": 0.3579,
"step": 76700
},
{
"epoch": 2.9717912007119915,
"grad_norm": 1.7049123048782349,
"learning_rate": 1.8056262817784315e-05,
"loss": 0.3704,
"step": 76800
},
{
"epoch": 2.9756607205045853,
"grad_norm": 1.4930731058120728,
"learning_rate": 1.8048523778199128e-05,
"loss": 0.357,
"step": 76900
},
{
"epoch": 2.979530240297179,
"grad_norm": 2.8269336223602295,
"learning_rate": 1.8040784738613938e-05,
"loss": 0.3747,
"step": 77000
},
{
"epoch": 2.983399760089773,
"grad_norm": 3.652132987976074,
"learning_rate": 1.8033045699028754e-05,
"loss": 0.3693,
"step": 77100
},
{
"epoch": 2.9872692798823666,
"grad_norm": 1.7305335998535156,
"learning_rate": 1.8025306659443564e-05,
"loss": 0.3689,
"step": 77200
},
{
"epoch": 2.9911387996749603,
"grad_norm": 1.245302438735962,
"learning_rate": 1.8017567619858377e-05,
"loss": 0.3731,
"step": 77300
},
{
"epoch": 2.995008319467554,
"grad_norm": 1.4806208610534668,
"learning_rate": 1.800982858027319e-05,
"loss": 0.3532,
"step": 77400
},
{
"epoch": 2.998877839260148,
"grad_norm": 1.8243787288665771,
"learning_rate": 1.8002089540688003e-05,
"loss": 0.3875,
"step": 77500
},
{
"epoch": 2.9998839144062224,
"eval_loss": 0.2676403522491455,
"eval_runtime": 71.2462,
"eval_samples_per_second": 29.321,
"eval_steps_per_second": 3.677,
"step": 77526
},
{
"epoch": 3.0027473590527416,
"grad_norm": 1.5966713428497314,
"learning_rate": 1.9994273110706964e-05,
"loss": 0.3706,
"step": 77600
},
{
"epoch": 3.0066168788453353,
"grad_norm": 0.9762176275253296,
"learning_rate": 1.9986534071121774e-05,
"loss": 0.3468,
"step": 77700
},
{
"epoch": 3.010486398637929,
"grad_norm": 1.1284077167510986,
"learning_rate": 1.9978795031536587e-05,
"loss": 0.346,
"step": 77800
},
{
"epoch": 3.014355918430523,
"grad_norm": 2.162651538848877,
"learning_rate": 1.99710559919514e-05,
"loss": 0.3852,
"step": 77900
},
{
"epoch": 3.0182254382231166,
"grad_norm": 1.472206711769104,
"learning_rate": 1.9963316952366213e-05,
"loss": 0.3737,
"step": 78000
},
{
"epoch": 3.0220949580157104,
"grad_norm": 1.6894917488098145,
"learning_rate": 1.9955577912781027e-05,
"loss": 0.3638,
"step": 78100
},
{
"epoch": 3.025964477808304,
"grad_norm": 1.6989420652389526,
"learning_rate": 1.9947838873195836e-05,
"loss": 0.3866,
"step": 78200
},
{
"epoch": 3.029833997600898,
"grad_norm": 1.79862642288208,
"learning_rate": 1.994009983361065e-05,
"loss": 0.3874,
"step": 78300
},
{
"epoch": 3.0337035173934916,
"grad_norm": 2.368971586227417,
"learning_rate": 1.9932360794025462e-05,
"loss": 0.3828,
"step": 78400
},
{
"epoch": 3.0375730371860854,
"grad_norm": 1.8010534048080444,
"learning_rate": 1.9924621754440276e-05,
"loss": 0.3659,
"step": 78500
},
{
"epoch": 3.041442556978679,
"grad_norm": 1.9091135263442993,
"learning_rate": 1.991688271485509e-05,
"loss": 0.351,
"step": 78600
},
{
"epoch": 3.0453120767712725,
"grad_norm": 1.708998203277588,
"learning_rate": 1.99091436752699e-05,
"loss": 0.3696,
"step": 78700
},
{
"epoch": 3.0491815965638662,
"grad_norm": 1.80574369430542,
"learning_rate": 1.9901404635684715e-05,
"loss": 0.3939,
"step": 78800
},
{
"epoch": 3.05305111635646,
"grad_norm": 1.2384390830993652,
"learning_rate": 1.9893665596099525e-05,
"loss": 0.3512,
"step": 78900
},
{
"epoch": 3.0569206361490537,
"grad_norm": 2.3889994621276855,
"learning_rate": 1.9885926556514338e-05,
"loss": 0.3601,
"step": 79000
},
{
"epoch": 3.0607901559416475,
"grad_norm": 1.5315697193145752,
"learning_rate": 1.987818751692915e-05,
"loss": 0.3889,
"step": 79100
},
{
"epoch": 3.0646596757342413,
"grad_norm": 3.5246341228485107,
"learning_rate": 1.9870448477343964e-05,
"loss": 0.3785,
"step": 79200
},
{
"epoch": 3.068529195526835,
"grad_norm": 1.7777693271636963,
"learning_rate": 1.9862709437758777e-05,
"loss": 0.3744,
"step": 79300
},
{
"epoch": 3.0723987153194288,
"grad_norm": 1.318174123764038,
"learning_rate": 1.9854970398173587e-05,
"loss": 0.3681,
"step": 79400
},
{
"epoch": 3.0762682351120225,
"grad_norm": 1.5280954837799072,
"learning_rate": 1.98472313585884e-05,
"loss": 0.3844,
"step": 79500
},
{
"epoch": 3.0801377549046163,
"grad_norm": 2.24074649810791,
"learning_rate": 1.9839492319003213e-05,
"loss": 0.3634,
"step": 79600
},
{
"epoch": 3.08400727469721,
"grad_norm": 1.337833285331726,
"learning_rate": 1.9831753279418026e-05,
"loss": 0.3586,
"step": 79700
},
{
"epoch": 3.087876794489804,
"grad_norm": 2.0881550312042236,
"learning_rate": 1.982401423983284e-05,
"loss": 0.3659,
"step": 79800
},
{
"epoch": 3.0917463142823975,
"grad_norm": 1.7096840143203735,
"learning_rate": 1.981627520024765e-05,
"loss": 0.3691,
"step": 79900
},
{
"epoch": 3.0956158340749913,
"grad_norm": 1.1658939123153687,
"learning_rate": 1.9808536160662465e-05,
"loss": 0.3816,
"step": 80000
},
{
"epoch": 3.099485353867585,
"grad_norm": 1.7050002813339233,
"learning_rate": 1.9800797121077275e-05,
"loss": 0.3606,
"step": 80100
},
{
"epoch": 3.103354873660179,
"grad_norm": 1.3548462390899658,
"learning_rate": 1.9793058081492088e-05,
"loss": 0.3597,
"step": 80200
},
{
"epoch": 3.1072243934527726,
"grad_norm": 2.5295116901397705,
"learning_rate": 1.97853190419069e-05,
"loss": 0.3689,
"step": 80300
},
{
"epoch": 3.1110939132453663,
"grad_norm": 1.1832600831985474,
"learning_rate": 1.9777580002321714e-05,
"loss": 0.365,
"step": 80400
},
{
"epoch": 3.11496343303796,
"grad_norm": 1.9550867080688477,
"learning_rate": 1.9769840962736527e-05,
"loss": 0.3754,
"step": 80500
},
{
"epoch": 3.118832952830554,
"grad_norm": 2.029646396636963,
"learning_rate": 1.9762101923151337e-05,
"loss": 0.3537,
"step": 80600
},
{
"epoch": 3.1227024726231476,
"grad_norm": 1.465968370437622,
"learning_rate": 1.975436288356615e-05,
"loss": 0.3779,
"step": 80700
},
{
"epoch": 3.1265719924157414,
"grad_norm": 1.8936628103256226,
"learning_rate": 1.9746623843980963e-05,
"loss": 0.3781,
"step": 80800
},
{
"epoch": 3.130441512208335,
"grad_norm": 2.382840633392334,
"learning_rate": 1.9738884804395777e-05,
"loss": 0.3727,
"step": 80900
},
{
"epoch": 3.134311032000929,
"grad_norm": 2.595386028289795,
"learning_rate": 1.973114576481059e-05,
"loss": 0.3537,
"step": 81000
},
{
"epoch": 3.1381805517935226,
"grad_norm": 2.3016576766967773,
"learning_rate": 1.97234067252254e-05,
"loss": 0.3797,
"step": 81100
},
{
"epoch": 3.142050071586116,
"grad_norm": 1.0764691829681396,
"learning_rate": 1.9715667685640212e-05,
"loss": 0.3753,
"step": 81200
},
{
"epoch": 3.1459195913787097,
"grad_norm": 1.311075210571289,
"learning_rate": 1.9707928646055026e-05,
"loss": 0.3551,
"step": 81300
},
{
"epoch": 3.1497891111713034,
"grad_norm": 1.7169547080993652,
"learning_rate": 1.970018960646984e-05,
"loss": 0.3624,
"step": 81400
},
{
"epoch": 3.153658630963897,
"grad_norm": 1.833234429359436,
"learning_rate": 1.9692450566884652e-05,
"loss": 0.3975,
"step": 81500
},
{
"epoch": 3.157528150756491,
"grad_norm": 1.6446950435638428,
"learning_rate": 1.968471152729946e-05,
"loss": 0.3693,
"step": 81600
},
{
"epoch": 3.1613976705490847,
"grad_norm": 1.723495364189148,
"learning_rate": 1.9676972487714278e-05,
"loss": 0.3572,
"step": 81700
},
{
"epoch": 3.1652671903416785,
"grad_norm": 1.0325422286987305,
"learning_rate": 1.9669233448129088e-05,
"loss": 0.3645,
"step": 81800
},
{
"epoch": 3.1691367101342722,
"grad_norm": 1.537728190422058,
"learning_rate": 1.96614944085439e-05,
"loss": 0.3693,
"step": 81900
},
{
"epoch": 3.173006229926866,
"grad_norm": 1.5777703523635864,
"learning_rate": 1.9653755368958714e-05,
"loss": 0.3552,
"step": 82000
},
{
"epoch": 3.1768757497194597,
"grad_norm": 1.3765802383422852,
"learning_rate": 1.9646016329373527e-05,
"loss": 0.3835,
"step": 82100
},
{
"epoch": 3.1807452695120535,
"grad_norm": 1.3902156352996826,
"learning_rate": 1.963827728978834e-05,
"loss": 0.3577,
"step": 82200
},
{
"epoch": 3.1846147893046473,
"grad_norm": 1.3278142213821411,
"learning_rate": 1.963053825020315e-05,
"loss": 0.3794,
"step": 82300
},
{
"epoch": 3.188484309097241,
"grad_norm": 1.7168638706207275,
"learning_rate": 1.9622799210617963e-05,
"loss": 0.3522,
"step": 82400
},
{
"epoch": 3.1923538288898348,
"grad_norm": 1.9700740575790405,
"learning_rate": 1.9615060171032776e-05,
"loss": 0.3769,
"step": 82500
},
{
"epoch": 3.1962233486824285,
"grad_norm": 1.5577361583709717,
"learning_rate": 1.960732113144759e-05,
"loss": 0.3537,
"step": 82600
},
{
"epoch": 3.2000928684750223,
"grad_norm": 1.9805132150650024,
"learning_rate": 1.9599582091862402e-05,
"loss": 0.3598,
"step": 82700
},
{
"epoch": 3.203962388267616,
"grad_norm": 2.008052110671997,
"learning_rate": 1.9591843052277212e-05,
"loss": 0.3595,
"step": 82800
},
{
"epoch": 3.20783190806021,
"grad_norm": 1.7605400085449219,
"learning_rate": 1.958410401269203e-05,
"loss": 0.3566,
"step": 82900
},
{
"epoch": 3.2117014278528035,
"grad_norm": 1.9822074174880981,
"learning_rate": 1.9576364973106838e-05,
"loss": 0.3541,
"step": 83000
},
{
"epoch": 3.2155709476453973,
"grad_norm": 1.9497880935668945,
"learning_rate": 1.956862593352165e-05,
"loss": 0.3554,
"step": 83100
},
{
"epoch": 3.219440467437991,
"grad_norm": 1.1537199020385742,
"learning_rate": 1.9560886893936464e-05,
"loss": 0.3549,
"step": 83200
},
{
"epoch": 3.223309987230585,
"grad_norm": 1.6270828247070312,
"learning_rate": 1.9553147854351277e-05,
"loss": 0.3398,
"step": 83300
},
{
"epoch": 3.2271795070231786,
"grad_norm": 1.7348685264587402,
"learning_rate": 1.954540881476609e-05,
"loss": 0.3572,
"step": 83400
},
{
"epoch": 3.2310490268157723,
"grad_norm": 1.329280972480774,
"learning_rate": 1.95376697751809e-05,
"loss": 0.3615,
"step": 83500
},
{
"epoch": 3.234918546608366,
"grad_norm": 1.6342438459396362,
"learning_rate": 1.9529930735595713e-05,
"loss": 0.3649,
"step": 83600
},
{
"epoch": 3.23878806640096,
"grad_norm": 1.7982897758483887,
"learning_rate": 1.9522191696010526e-05,
"loss": 0.3553,
"step": 83700
},
{
"epoch": 3.2426575861935536,
"grad_norm": 1.6109760999679565,
"learning_rate": 1.951445265642534e-05,
"loss": 0.378,
"step": 83800
},
{
"epoch": 3.2465271059861474,
"grad_norm": 1.328223466873169,
"learning_rate": 1.9506713616840153e-05,
"loss": 0.3589,
"step": 83900
},
{
"epoch": 3.2503966257787407,
"grad_norm": 1.47650146484375,
"learning_rate": 1.9498974577254962e-05,
"loss": 0.3491,
"step": 84000
},
{
"epoch": 3.2542661455713344,
"grad_norm": 2.0688846111297607,
"learning_rate": 1.9491235537669776e-05,
"loss": 0.3624,
"step": 84100
},
{
"epoch": 3.258135665363928,
"grad_norm": 1.7907928228378296,
"learning_rate": 1.948349649808459e-05,
"loss": 0.3579,
"step": 84200
},
{
"epoch": 3.262005185156522,
"grad_norm": 1.5677344799041748,
"learning_rate": 1.9475757458499402e-05,
"loss": 0.3651,
"step": 84300
},
{
"epoch": 3.2658747049491157,
"grad_norm": 2.2685437202453613,
"learning_rate": 1.9468018418914215e-05,
"loss": 0.3469,
"step": 84400
},
{
"epoch": 3.2697442247417094,
"grad_norm": 1.3307223320007324,
"learning_rate": 1.9460279379329028e-05,
"loss": 0.3548,
"step": 84500
},
{
"epoch": 3.273613744534303,
"grad_norm": 1.2336021661758423,
"learning_rate": 1.945254033974384e-05,
"loss": 0.3604,
"step": 84600
},
{
"epoch": 3.277483264326897,
"grad_norm": 1.3890929222106934,
"learning_rate": 1.944480130015865e-05,
"loss": 0.3435,
"step": 84700
},
{
"epoch": 3.2813527841194907,
"grad_norm": 1.5486915111541748,
"learning_rate": 1.9437062260573464e-05,
"loss": 0.3538,
"step": 84800
},
{
"epoch": 3.2852223039120845,
"grad_norm": 1.2610055208206177,
"learning_rate": 1.9429323220988277e-05,
"loss": 0.3595,
"step": 84900
},
{
"epoch": 3.2890918237046782,
"grad_norm": 1.2226618528366089,
"learning_rate": 1.942158418140309e-05,
"loss": 0.3478,
"step": 85000
},
{
"epoch": 3.292961343497272,
"grad_norm": 2.4010233879089355,
"learning_rate": 1.9413845141817903e-05,
"loss": 0.3735,
"step": 85100
},
{
"epoch": 3.2968308632898657,
"grad_norm": 2.1810591220855713,
"learning_rate": 1.9406106102232713e-05,
"loss": 0.3493,
"step": 85200
},
{
"epoch": 3.3007003830824595,
"grad_norm": 1.3222867250442505,
"learning_rate": 1.9398367062647526e-05,
"loss": 0.3608,
"step": 85300
},
{
"epoch": 3.3045699028750533,
"grad_norm": 2.0267252922058105,
"learning_rate": 1.939062802306234e-05,
"loss": 0.3479,
"step": 85400
},
{
"epoch": 3.308439422667647,
"grad_norm": 2.4806270599365234,
"learning_rate": 1.9382888983477152e-05,
"loss": 0.3658,
"step": 85500
},
{
"epoch": 3.3123089424602408,
"grad_norm": 2.1248300075531006,
"learning_rate": 1.9375149943891965e-05,
"loss": 0.3559,
"step": 85600
},
{
"epoch": 3.3161784622528345,
"grad_norm": 1.243067741394043,
"learning_rate": 1.936741090430678e-05,
"loss": 0.3412,
"step": 85700
},
{
"epoch": 3.3200479820454283,
"grad_norm": 1.4840171337127686,
"learning_rate": 1.9359671864721588e-05,
"loss": 0.3364,
"step": 85800
},
{
"epoch": 3.323917501838022,
"grad_norm": 1.28212571144104,
"learning_rate": 1.93519328251364e-05,
"loss": 0.3495,
"step": 85900
},
{
"epoch": 3.327787021630616,
"grad_norm": 1.3520444631576538,
"learning_rate": 1.9344193785551214e-05,
"loss": 0.3588,
"step": 86000
},
{
"epoch": 3.3316565414232095,
"grad_norm": 1.606806755065918,
"learning_rate": 1.9336454745966027e-05,
"loss": 0.36,
"step": 86100
},
{
"epoch": 3.3355260612158033,
"grad_norm": 1.5933377742767334,
"learning_rate": 1.932871570638084e-05,
"loss": 0.3586,
"step": 86200
},
{
"epoch": 3.3393955810083966,
"grad_norm": 1.4497184753417969,
"learning_rate": 1.9320976666795654e-05,
"loss": 0.3383,
"step": 86300
},
{
"epoch": 3.3432651008009904,
"grad_norm": 1.1339248418807983,
"learning_rate": 1.9313237627210463e-05,
"loss": 0.3399,
"step": 86400
},
{
"epoch": 3.347134620593584,
"grad_norm": 1.4607455730438232,
"learning_rate": 1.9305498587625276e-05,
"loss": 0.3472,
"step": 86500
},
{
"epoch": 3.351004140386178,
"grad_norm": 1.5800549983978271,
"learning_rate": 1.929775954804009e-05,
"loss": 0.3452,
"step": 86600
},
{
"epoch": 3.3548736601787716,
"grad_norm": 1.3513827323913574,
"learning_rate": 1.9290020508454903e-05,
"loss": 0.3577,
"step": 86700
},
{
"epoch": 3.3587431799713654,
"grad_norm": 1.5983587503433228,
"learning_rate": 1.9282281468869716e-05,
"loss": 0.3621,
"step": 86800
},
{
"epoch": 3.362612699763959,
"grad_norm": 1.893060326576233,
"learning_rate": 1.927454242928453e-05,
"loss": 0.3673,
"step": 86900
},
{
"epoch": 3.366482219556553,
"grad_norm": 0.9722900986671448,
"learning_rate": 1.926680338969934e-05,
"loss": 0.3315,
"step": 87000
},
{
"epoch": 3.3703517393491467,
"grad_norm": 1.8283945322036743,
"learning_rate": 1.9259064350114152e-05,
"loss": 0.339,
"step": 87100
},
{
"epoch": 3.3742212591417404,
"grad_norm": 1.7708581686019897,
"learning_rate": 1.9251325310528965e-05,
"loss": 0.3597,
"step": 87200
},
{
"epoch": 3.378090778934334,
"grad_norm": 2.0267698764801025,
"learning_rate": 1.9243586270943778e-05,
"loss": 0.3365,
"step": 87300
},
{
"epoch": 3.381960298726928,
"grad_norm": 1.3400310277938843,
"learning_rate": 1.923584723135859e-05,
"loss": 0.3626,
"step": 87400
},
{
"epoch": 3.3858298185195217,
"grad_norm": 2.713268280029297,
"learning_rate": 1.9228108191773404e-05,
"loss": 0.3576,
"step": 87500
},
{
"epoch": 3.3896993383121155,
"grad_norm": 1.648658037185669,
"learning_rate": 1.9220369152188214e-05,
"loss": 0.3696,
"step": 87600
},
{
"epoch": 3.393568858104709,
"grad_norm": 1.1035487651824951,
"learning_rate": 1.9212630112603027e-05,
"loss": 0.3622,
"step": 87700
},
{
"epoch": 3.397438377897303,
"grad_norm": 1.4534286260604858,
"learning_rate": 1.920489107301784e-05,
"loss": 0.3718,
"step": 87800
},
{
"epoch": 3.4013078976898967,
"grad_norm": 1.3734116554260254,
"learning_rate": 1.9197152033432653e-05,
"loss": 0.3496,
"step": 87900
},
{
"epoch": 3.4051774174824905,
"grad_norm": 1.6333812475204468,
"learning_rate": 1.9189412993847466e-05,
"loss": 0.3555,
"step": 88000
},
{
"epoch": 3.4090469372750842,
"grad_norm": 1.196081519126892,
"learning_rate": 1.9181673954262276e-05,
"loss": 0.3549,
"step": 88100
},
{
"epoch": 3.412916457067678,
"grad_norm": 1.954453468322754,
"learning_rate": 1.917393491467709e-05,
"loss": 0.3231,
"step": 88200
},
{
"epoch": 3.4167859768602717,
"grad_norm": 0.7807307839393616,
"learning_rate": 1.9166195875091902e-05,
"loss": 0.3486,
"step": 88300
},
{
"epoch": 3.4206554966528655,
"grad_norm": 1.2698251008987427,
"learning_rate": 1.9158456835506715e-05,
"loss": 0.3351,
"step": 88400
},
{
"epoch": 3.4245250164454593,
"grad_norm": 1.6529748439788818,
"learning_rate": 1.915071779592153e-05,
"loss": 0.3351,
"step": 88500
},
{
"epoch": 3.428394536238053,
"grad_norm": 3.4594674110412598,
"learning_rate": 1.914297875633634e-05,
"loss": 0.3504,
"step": 88600
},
{
"epoch": 3.4322640560306468,
"grad_norm": 3.723195791244507,
"learning_rate": 1.913523971675115e-05,
"loss": 0.3472,
"step": 88700
},
{
"epoch": 3.4361335758232405,
"grad_norm": 1.7539480924606323,
"learning_rate": 1.9127500677165964e-05,
"loss": 0.3386,
"step": 88800
},
{
"epoch": 3.4400030956158343,
"grad_norm": 2.1020853519439697,
"learning_rate": 1.9119761637580777e-05,
"loss": 0.3599,
"step": 88900
},
{
"epoch": 3.443872615408428,
"grad_norm": 1.5043954849243164,
"learning_rate": 1.911202259799559e-05,
"loss": 0.3386,
"step": 89000
},
{
"epoch": 3.447742135201022,
"grad_norm": 0.9868506193161011,
"learning_rate": 1.9104283558410404e-05,
"loss": 0.3388,
"step": 89100
},
{
"epoch": 3.451611654993615,
"grad_norm": 1.3694320917129517,
"learning_rate": 1.9096544518825217e-05,
"loss": 0.3185,
"step": 89200
},
{
"epoch": 3.455481174786209,
"grad_norm": 1.4770699739456177,
"learning_rate": 1.9088805479240026e-05,
"loss": 0.3386,
"step": 89300
},
{
"epoch": 3.4593506945788026,
"grad_norm": 3.487114191055298,
"learning_rate": 1.908106643965484e-05,
"loss": 0.3497,
"step": 89400
},
{
"epoch": 3.4632202143713964,
"grad_norm": 2.1382334232330322,
"learning_rate": 1.9073327400069653e-05,
"loss": 0.3469,
"step": 89500
},
{
"epoch": 3.46708973416399,
"grad_norm": 3.232499361038208,
"learning_rate": 1.9065588360484466e-05,
"loss": 0.3323,
"step": 89600
},
{
"epoch": 3.470959253956584,
"grad_norm": 1.2111986875534058,
"learning_rate": 1.905784932089928e-05,
"loss": 0.3465,
"step": 89700
},
{
"epoch": 3.4748287737491776,
"grad_norm": 1.2314847707748413,
"learning_rate": 1.9050110281314092e-05,
"loss": 0.3497,
"step": 89800
},
{
"epoch": 3.4786982935417714,
"grad_norm": 3.4358456134796143,
"learning_rate": 1.90423712417289e-05,
"loss": 0.341,
"step": 89900
},
{
"epoch": 3.482567813334365,
"grad_norm": 1.8911181688308716,
"learning_rate": 1.9034632202143715e-05,
"loss": 0.3588,
"step": 90000
},
{
"epoch": 3.486437333126959,
"grad_norm": 1.6309691667556763,
"learning_rate": 1.9026893162558528e-05,
"loss": 0.3457,
"step": 90100
},
{
"epoch": 3.4903068529195527,
"grad_norm": 1.5026049613952637,
"learning_rate": 1.901915412297334e-05,
"loss": 0.3263,
"step": 90200
},
{
"epoch": 3.4941763727121464,
"grad_norm": 1.3562653064727783,
"learning_rate": 1.9011415083388154e-05,
"loss": 0.3566,
"step": 90300
},
{
"epoch": 3.49804589250474,
"grad_norm": 2.2276010513305664,
"learning_rate": 1.9003676043802964e-05,
"loss": 0.3509,
"step": 90400
},
{
"epoch": 3.4998645668072594,
"eval_loss": 0.24299356341362,
"eval_runtime": 73.0718,
"eval_samples_per_second": 28.588,
"eval_steps_per_second": 3.586,
"step": 90447
},
{
"epoch": 3.501915412297334,
"grad_norm": 1.8114732503890991,
"learning_rate": 1.9995898309019853e-05,
"loss": 0.3384,
"step": 90500
},
{
"epoch": 3.5057849320899277,
"grad_norm": 1.3792935609817505,
"learning_rate": 1.9988159269434662e-05,
"loss": 0.3464,
"step": 90600
},
{
"epoch": 3.5096544518825215,
"grad_norm": 2.4080448150634766,
"learning_rate": 1.998042022984948e-05,
"loss": 0.3617,
"step": 90700
},
{
"epoch": 3.513523971675115,
"grad_norm": 2.0266382694244385,
"learning_rate": 1.997268119026429e-05,
"loss": 0.3667,
"step": 90800
},
{
"epoch": 3.517393491467709,
"grad_norm": 4.2909440994262695,
"learning_rate": 1.9964942150679102e-05,
"loss": 0.3303,
"step": 90900
},
{
"epoch": 3.5212630112603027,
"grad_norm": 1.7127686738967896,
"learning_rate": 1.9957203111093915e-05,
"loss": 0.35,
"step": 91000
},
{
"epoch": 3.5251325310528965,
"grad_norm": 1.5974979400634766,
"learning_rate": 1.9949464071508728e-05,
"loss": 0.3499,
"step": 91100
},
{
"epoch": 3.5290020508454902,
"grad_norm": 3.3751542568206787,
"learning_rate": 1.994172503192354e-05,
"loss": 0.3329,
"step": 91200
},
{
"epoch": 3.5328715706380835,
"grad_norm": 1.623238444328308,
"learning_rate": 1.993398599233835e-05,
"loss": 0.3496,
"step": 91300
},
{
"epoch": 3.5367410904306773,
"grad_norm": 1.5852680206298828,
"learning_rate": 1.9926246952753164e-05,
"loss": 0.3524,
"step": 91400
},
{
"epoch": 3.540610610223271,
"grad_norm": 1.1841716766357422,
"learning_rate": 1.9918507913167977e-05,
"loss": 0.3471,
"step": 91500
},
{
"epoch": 3.544480130015865,
"grad_norm": 1.724593162536621,
"learning_rate": 1.991076887358279e-05,
"loss": 0.3323,
"step": 91600
},
{
"epoch": 3.5483496498084586,
"grad_norm": 2.057185173034668,
"learning_rate": 1.9903029833997603e-05,
"loss": 0.3584,
"step": 91700
},
{
"epoch": 3.5522191696010523,
"grad_norm": 1.5378031730651855,
"learning_rate": 1.9895290794412413e-05,
"loss": 0.3505,
"step": 91800
},
{
"epoch": 3.556088689393646,
"grad_norm": 1.5235657691955566,
"learning_rate": 1.988755175482723e-05,
"loss": 0.3278,
"step": 91900
},
{
"epoch": 3.55995820918624,
"grad_norm": 1.6705044507980347,
"learning_rate": 1.987981271524204e-05,
"loss": 0.3475,
"step": 92000
},
{
"epoch": 3.5638277289788336,
"grad_norm": 1.6356074810028076,
"learning_rate": 1.9872073675656852e-05,
"loss": 0.3426,
"step": 92100
},
{
"epoch": 3.5676972487714274,
"grad_norm": 1.408981204032898,
"learning_rate": 1.9864334636071665e-05,
"loss": 0.3311,
"step": 92200
},
{
"epoch": 3.571566768564021,
"grad_norm": 8.708040237426758,
"learning_rate": 1.985659559648648e-05,
"loss": 0.3257,
"step": 92300
},
{
"epoch": 3.575436288356615,
"grad_norm": 2.8553311824798584,
"learning_rate": 1.984885655690129e-05,
"loss": 0.3452,
"step": 92400
},
{
"epoch": 3.5793058081492086,
"grad_norm": 2.367499589920044,
"learning_rate": 1.98411175173161e-05,
"loss": 0.3456,
"step": 92500
},
{
"epoch": 3.5831753279418024,
"grad_norm": 1.9503910541534424,
"learning_rate": 1.9833378477730914e-05,
"loss": 0.3471,
"step": 92600
},
{
"epoch": 3.587044847734396,
"grad_norm": 1.4180583953857422,
"learning_rate": 1.9825639438145727e-05,
"loss": 0.3421,
"step": 92700
},
{
"epoch": 3.59091436752699,
"grad_norm": 0.9737741351127625,
"learning_rate": 1.981790039856054e-05,
"loss": 0.3429,
"step": 92800
},
{
"epoch": 3.5947838873195836,
"grad_norm": 1.2342348098754883,
"learning_rate": 1.9810161358975354e-05,
"loss": 0.3474,
"step": 92900
},
{
"epoch": 3.5986534071121774,
"grad_norm": 3.7432026863098145,
"learning_rate": 1.9802422319390163e-05,
"loss": 0.3437,
"step": 93000
},
{
"epoch": 3.602522926904771,
"grad_norm": 1.9314531087875366,
"learning_rate": 1.9794683279804976e-05,
"loss": 0.3408,
"step": 93100
},
{
"epoch": 3.606392446697365,
"grad_norm": 4.156320571899414,
"learning_rate": 1.978694424021979e-05,
"loss": 0.3414,
"step": 93200
},
{
"epoch": 3.6102619664899587,
"grad_norm": 1.2768079042434692,
"learning_rate": 1.9779205200634603e-05,
"loss": 0.3461,
"step": 93300
},
{
"epoch": 3.6141314862825524,
"grad_norm": 2.0528101921081543,
"learning_rate": 1.9771466161049416e-05,
"loss": 0.3223,
"step": 93400
},
{
"epoch": 3.618001006075146,
"grad_norm": 1.143532633781433,
"learning_rate": 1.976372712146423e-05,
"loss": 0.3473,
"step": 93500
},
{
"epoch": 3.62187052586774,
"grad_norm": 1.0867356061935425,
"learning_rate": 1.9755988081879042e-05,
"loss": 0.3326,
"step": 93600
},
{
"epoch": 3.6257400456603337,
"grad_norm": 2.3322672843933105,
"learning_rate": 1.9748249042293852e-05,
"loss": 0.3331,
"step": 93700
},
{
"epoch": 3.6296095654529275,
"grad_norm": 1.39437997341156,
"learning_rate": 1.9740510002708665e-05,
"loss": 0.334,
"step": 93800
},
{
"epoch": 3.633479085245521,
"grad_norm": 1.3758375644683838,
"learning_rate": 1.9732770963123478e-05,
"loss": 0.3387,
"step": 93900
},
{
"epoch": 3.637348605038115,
"grad_norm": 1.9272472858428955,
"learning_rate": 1.972503192353829e-05,
"loss": 0.3288,
"step": 94000
},
{
"epoch": 3.6412181248307087,
"grad_norm": 1.3517364263534546,
"learning_rate": 1.9717292883953104e-05,
"loss": 0.333,
"step": 94100
},
{
"epoch": 3.6450876446233025,
"grad_norm": 1.0485949516296387,
"learning_rate": 1.9709553844367914e-05,
"loss": 0.3385,
"step": 94200
},
{
"epoch": 3.6489571644158962,
"grad_norm": 1.0853439569473267,
"learning_rate": 1.9701814804782727e-05,
"loss": 0.3322,
"step": 94300
},
{
"epoch": 3.65282668420849,
"grad_norm": 1.4697808027267456,
"learning_rate": 1.969407576519754e-05,
"loss": 0.3259,
"step": 94400
},
{
"epoch": 3.6566962040010833,
"grad_norm": 1.0822653770446777,
"learning_rate": 1.9686336725612353e-05,
"loss": 0.3466,
"step": 94500
},
{
"epoch": 3.660565723793677,
"grad_norm": 1.5661627054214478,
"learning_rate": 1.9678597686027166e-05,
"loss": 0.31,
"step": 94600
},
{
"epoch": 3.664435243586271,
"grad_norm": 1.7367948293685913,
"learning_rate": 1.9670858646441976e-05,
"loss": 0.336,
"step": 94700
},
{
"epoch": 3.6683047633788646,
"grad_norm": 1.3050540685653687,
"learning_rate": 1.9663119606856792e-05,
"loss": 0.3415,
"step": 94800
},
{
"epoch": 3.6721742831714583,
"grad_norm": 1.3506726026535034,
"learning_rate": 1.9655380567271602e-05,
"loss": 0.3496,
"step": 94900
},
{
"epoch": 3.676043802964052,
"grad_norm": 1.4908133745193481,
"learning_rate": 1.9647641527686415e-05,
"loss": 0.3522,
"step": 95000
},
{
"epoch": 3.679913322756646,
"grad_norm": 1.5835528373718262,
"learning_rate": 1.963990248810123e-05,
"loss": 0.3389,
"step": 95100
},
{
"epoch": 3.6837828425492396,
"grad_norm": 2.2646801471710205,
"learning_rate": 1.963216344851604e-05,
"loss": 0.3313,
"step": 95200
},
{
"epoch": 3.6876523623418334,
"grad_norm": 2.0749449729919434,
"learning_rate": 1.9624424408930855e-05,
"loss": 0.3465,
"step": 95300
},
{
"epoch": 3.691521882134427,
"grad_norm": 1.151435136795044,
"learning_rate": 1.9616685369345664e-05,
"loss": 0.328,
"step": 95400
},
{
"epoch": 3.695391401927021,
"grad_norm": 1.1644638776779175,
"learning_rate": 1.9608946329760477e-05,
"loss": 0.3415,
"step": 95500
},
{
"epoch": 3.6992609217196146,
"grad_norm": 1.7427713871002197,
"learning_rate": 1.960120729017529e-05,
"loss": 0.3278,
"step": 95600
},
{
"epoch": 3.7031304415122084,
"grad_norm": 1.5325894355773926,
"learning_rate": 1.9593468250590104e-05,
"loss": 0.3567,
"step": 95700
},
{
"epoch": 3.706999961304802,
"grad_norm": 0.8409464359283447,
"learning_rate": 1.9585729211004917e-05,
"loss": 0.332,
"step": 95800
},
{
"epoch": 3.710869481097396,
"grad_norm": 1.7328728437423706,
"learning_rate": 1.9577990171419726e-05,
"loss": 0.3382,
"step": 95900
},
{
"epoch": 3.7147390008899897,
"grad_norm": 1.5052417516708374,
"learning_rate": 1.957025113183454e-05,
"loss": 0.3389,
"step": 96000
},
{
"epoch": 3.7186085206825834,
"grad_norm": 1.1802027225494385,
"learning_rate": 1.9562512092249353e-05,
"loss": 0.3377,
"step": 96100
},
{
"epoch": 3.722478040475177,
"grad_norm": 1.825426459312439,
"learning_rate": 1.9554773052664166e-05,
"loss": 0.3391,
"step": 96200
},
{
"epoch": 3.726347560267771,
"grad_norm": 1.3100457191467285,
"learning_rate": 1.954703401307898e-05,
"loss": 0.3418,
"step": 96300
},
{
"epoch": 3.7302170800603642,
"grad_norm": 1.3283213376998901,
"learning_rate": 1.9539294973493792e-05,
"loss": 0.3419,
"step": 96400
},
{
"epoch": 3.734086599852958,
"grad_norm": 1.5435948371887207,
"learning_rate": 1.9531555933908605e-05,
"loss": 0.3366,
"step": 96500
},
{
"epoch": 3.7379561196455517,
"grad_norm": 1.3406691551208496,
"learning_rate": 1.9523816894323415e-05,
"loss": 0.3399,
"step": 96600
},
{
"epoch": 3.7418256394381455,
"grad_norm": 1.3712650537490845,
"learning_rate": 1.9516077854738228e-05,
"loss": 0.3419,
"step": 96700
},
{
"epoch": 3.7456951592307393,
"grad_norm": 1.0608057975769043,
"learning_rate": 1.950833881515304e-05,
"loss": 0.3422,
"step": 96800
},
{
"epoch": 3.749564679023333,
"grad_norm": 2.5015125274658203,
"learning_rate": 1.9500599775567854e-05,
"loss": 0.3318,
"step": 96900
},
{
"epoch": 3.7534341988159268,
"grad_norm": 1.6725361347198486,
"learning_rate": 1.9492860735982667e-05,
"loss": 0.3186,
"step": 97000
},
{
"epoch": 3.7573037186085205,
"grad_norm": 0.9464316964149475,
"learning_rate": 1.9485121696397477e-05,
"loss": 0.3397,
"step": 97100
},
{
"epoch": 3.7611732384011143,
"grad_norm": 1.285186529159546,
"learning_rate": 1.947738265681229e-05,
"loss": 0.3324,
"step": 97200
},
{
"epoch": 3.765042758193708,
"grad_norm": 1.267645001411438,
"learning_rate": 1.9469643617227103e-05,
"loss": 0.3242,
"step": 97300
},
{
"epoch": 3.768912277986302,
"grad_norm": 1.1808134317398071,
"learning_rate": 1.9461904577641916e-05,
"loss": 0.3351,
"step": 97400
},
{
"epoch": 3.7727817977788956,
"grad_norm": 1.590160846710205,
"learning_rate": 1.945416553805673e-05,
"loss": 0.3276,
"step": 97500
},
{
"epoch": 3.7766513175714893,
"grad_norm": 1.5932397842407227,
"learning_rate": 1.9446426498471542e-05,
"loss": 0.3622,
"step": 97600
},
{
"epoch": 3.780520837364083,
"grad_norm": 1.2350918054580688,
"learning_rate": 1.9438687458886356e-05,
"loss": 0.3193,
"step": 97700
},
{
"epoch": 3.784390357156677,
"grad_norm": 1.921157956123352,
"learning_rate": 1.9430948419301165e-05,
"loss": 0.3374,
"step": 97800
},
{
"epoch": 3.7882598769492706,
"grad_norm": 1.235912799835205,
"learning_rate": 1.942320937971598e-05,
"loss": 0.3467,
"step": 97900
},
{
"epoch": 3.7921293967418643,
"grad_norm": 1.6335248947143555,
"learning_rate": 1.941547034013079e-05,
"loss": 0.3454,
"step": 98000
},
{
"epoch": 3.795998916534458,
"grad_norm": 1.8079530000686646,
"learning_rate": 1.9407731300545605e-05,
"loss": 0.3497,
"step": 98100
},
{
"epoch": 3.799868436327052,
"grad_norm": 2.2827401161193848,
"learning_rate": 1.9399992260960418e-05,
"loss": 0.3703,
"step": 98200
},
{
"epoch": 3.8037379561196456,
"grad_norm": 1.0421605110168457,
"learning_rate": 1.9392253221375227e-05,
"loss": 0.341,
"step": 98300
},
{
"epoch": 3.8076074759122394,
"grad_norm": 2.6579549312591553,
"learning_rate": 1.938451418179004e-05,
"loss": 0.3506,
"step": 98400
},
{
"epoch": 3.811476995704833,
"grad_norm": 0.8981249332427979,
"learning_rate": 1.9376775142204854e-05,
"loss": 0.317,
"step": 98500
},
{
"epoch": 3.815346515497427,
"grad_norm": 1.5071488618850708,
"learning_rate": 1.9369036102619667e-05,
"loss": 0.3481,
"step": 98600
},
{
"epoch": 3.8192160352900206,
"grad_norm": 2.4668521881103516,
"learning_rate": 1.936129706303448e-05,
"loss": 0.3189,
"step": 98700
},
{
"epoch": 3.8230855550826144,
"grad_norm": 2.102757692337036,
"learning_rate": 1.9353558023449293e-05,
"loss": 0.3246,
"step": 98800
},
{
"epoch": 3.826955074875208,
"grad_norm": 1.3990046977996826,
"learning_rate": 1.9345818983864103e-05,
"loss": 0.3284,
"step": 98900
},
{
"epoch": 3.830824594667802,
"grad_norm": 2.944544792175293,
"learning_rate": 1.9338079944278916e-05,
"loss": 0.3242,
"step": 99000
},
{
"epoch": 3.8346941144603957,
"grad_norm": 1.9801486730575562,
"learning_rate": 1.933034090469373e-05,
"loss": 0.3319,
"step": 99100
},
{
"epoch": 3.8385636342529894,
"grad_norm": 1.5694576501846313,
"learning_rate": 1.9322601865108542e-05,
"loss": 0.3292,
"step": 99200
},
{
"epoch": 3.842433154045583,
"grad_norm": 1.6066193580627441,
"learning_rate": 1.9314862825523355e-05,
"loss": 0.3304,
"step": 99300
},
{
"epoch": 3.846302673838177,
"grad_norm": 1.478926181793213,
"learning_rate": 1.9307123785938168e-05,
"loss": 0.3208,
"step": 99400
},
{
"epoch": 3.8501721936307707,
"grad_norm": 1.1129310131072998,
"learning_rate": 1.9299384746352978e-05,
"loss": 0.3373,
"step": 99500
},
{
"epoch": 3.8540417134233644,
"grad_norm": 3.5530917644500732,
"learning_rate": 1.929164570676779e-05,
"loss": 0.3252,
"step": 99600
},
{
"epoch": 3.8579112332159577,
"grad_norm": 0.8966418504714966,
"learning_rate": 1.9283906667182604e-05,
"loss": 0.3244,
"step": 99700
},
{
"epoch": 3.8617807530085515,
"grad_norm": 2.311257839202881,
"learning_rate": 1.9276167627597417e-05,
"loss": 0.3115,
"step": 99800
},
{
"epoch": 3.8656502728011453,
"grad_norm": 2.062633752822876,
"learning_rate": 1.926842858801223e-05,
"loss": 0.3325,
"step": 99900
},
{
"epoch": 3.869519792593739,
"grad_norm": 1.8347896337509155,
"learning_rate": 1.9260689548427043e-05,
"loss": 0.3271,
"step": 100000
},
{
"epoch": 3.8733893123863328,
"grad_norm": 1.5175038576126099,
"learning_rate": 1.9252950508841853e-05,
"loss": 0.3285,
"step": 100100
},
{
"epoch": 3.8772588321789265,
"grad_norm": 1.3393527269363403,
"learning_rate": 1.9245211469256666e-05,
"loss": 0.3409,
"step": 100200
},
{
"epoch": 3.8811283519715203,
"grad_norm": 1.7215604782104492,
"learning_rate": 1.923747242967148e-05,
"loss": 0.3321,
"step": 100300
},
{
"epoch": 3.884997871764114,
"grad_norm": 1.3278648853302002,
"learning_rate": 1.9229733390086292e-05,
"loss": 0.3166,
"step": 100400
},
{
"epoch": 3.888867391556708,
"grad_norm": 1.59735107421875,
"learning_rate": 1.9221994350501106e-05,
"loss": 0.3215,
"step": 100500
},
{
"epoch": 3.8927369113493016,
"grad_norm": 1.3256441354751587,
"learning_rate": 1.9214255310915915e-05,
"loss": 0.3456,
"step": 100600
},
{
"epoch": 3.8966064311418953,
"grad_norm": 1.9966683387756348,
"learning_rate": 1.920651627133073e-05,
"loss": 0.3012,
"step": 100700
},
{
"epoch": 3.900475950934489,
"grad_norm": 1.6612262725830078,
"learning_rate": 1.919877723174554e-05,
"loss": 0.3588,
"step": 100800
},
{
"epoch": 3.904345470727083,
"grad_norm": 3.342247486114502,
"learning_rate": 1.9191038192160355e-05,
"loss": 0.3205,
"step": 100900
},
{
"epoch": 3.9082149905196766,
"grad_norm": 1.1689525842666626,
"learning_rate": 1.9183299152575168e-05,
"loss": 0.325,
"step": 101000
},
{
"epoch": 3.9120845103122703,
"grad_norm": 1.5586668252944946,
"learning_rate": 1.917556011298998e-05,
"loss": 0.3091,
"step": 101100
},
{
"epoch": 3.915954030104864,
"grad_norm": 1.0798732042312622,
"learning_rate": 1.916782107340479e-05,
"loss": 0.3229,
"step": 101200
},
{
"epoch": 3.919823549897458,
"grad_norm": 1.0493509769439697,
"learning_rate": 1.9160082033819604e-05,
"loss": 0.3039,
"step": 101300
},
{
"epoch": 3.9236930696900516,
"grad_norm": 0.7543585896492004,
"learning_rate": 1.9152342994234417e-05,
"loss": 0.3191,
"step": 101400
},
{
"epoch": 3.9275625894826454,
"grad_norm": 1.2396085262298584,
"learning_rate": 1.914460395464923e-05,
"loss": 0.3338,
"step": 101500
},
{
"epoch": 3.9314321092752387,
"grad_norm": 1.0943130254745483,
"learning_rate": 1.9136864915064043e-05,
"loss": 0.3151,
"step": 101600
},
{
"epoch": 3.9353016290678324,
"grad_norm": 1.3704535961151123,
"learning_rate": 1.9129125875478856e-05,
"loss": 0.3238,
"step": 101700
},
{
"epoch": 3.939171148860426,
"grad_norm": 1.32877516746521,
"learning_rate": 1.9121386835893666e-05,
"loss": 0.3359,
"step": 101800
},
{
"epoch": 3.94304066865302,
"grad_norm": 1.6287816762924194,
"learning_rate": 1.911364779630848e-05,
"loss": 0.333,
"step": 101900
},
{
"epoch": 3.9469101884456137,
"grad_norm": 1.7541491985321045,
"learning_rate": 1.9105908756723292e-05,
"loss": 0.3174,
"step": 102000
},
{
"epoch": 3.9507797082382075,
"grad_norm": 1.1953548192977905,
"learning_rate": 1.9098169717138105e-05,
"loss": 0.3445,
"step": 102100
},
{
"epoch": 3.954649228030801,
"grad_norm": 1.7301913499832153,
"learning_rate": 1.9090430677552918e-05,
"loss": 0.3225,
"step": 102200
},
{
"epoch": 3.958518747823395,
"grad_norm": 1.687058448791504,
"learning_rate": 1.908269163796773e-05,
"loss": 0.339,
"step": 102300
},
{
"epoch": 3.9623882676159887,
"grad_norm": 1.3156592845916748,
"learning_rate": 1.907495259838254e-05,
"loss": 0.3227,
"step": 102400
},
{
"epoch": 3.9662577874085825,
"grad_norm": 1.7830286026000977,
"learning_rate": 1.9067213558797354e-05,
"loss": 0.3241,
"step": 102500
},
{
"epoch": 3.9701273072011762,
"grad_norm": 1.549320936203003,
"learning_rate": 1.9059474519212167e-05,
"loss": 0.3324,
"step": 102600
},
{
"epoch": 3.97399682699377,
"grad_norm": 1.6894500255584717,
"learning_rate": 1.905173547962698e-05,
"loss": 0.3295,
"step": 102700
},
{
"epoch": 3.9778663467863637,
"grad_norm": 4.200815677642822,
"learning_rate": 1.9043996440041793e-05,
"loss": 0.3266,
"step": 102800
},
{
"epoch": 3.9817358665789575,
"grad_norm": 1.9882376194000244,
"learning_rate": 1.9036257400456606e-05,
"loss": 0.3547,
"step": 102900
},
{
"epoch": 3.9856053863715513,
"grad_norm": 1.1047308444976807,
"learning_rate": 1.9028518360871416e-05,
"loss": 0.3249,
"step": 103000
},
{
"epoch": 3.989474906164145,
"grad_norm": 1.6856424808502197,
"learning_rate": 1.902077932128623e-05,
"loss": 0.3001,
"step": 103100
},
{
"epoch": 3.9933444259567388,
"grad_norm": 1.4230420589447021,
"learning_rate": 1.9013040281701042e-05,
"loss": 0.3316,
"step": 103200
},
{
"epoch": 3.9972139457493325,
"grad_norm": 1.5009479522705078,
"learning_rate": 1.9005301242115855e-05,
"loss": 0.318,
"step": 103300
},
{
"epoch": 3.9998452192082965,
"eval_loss": 0.23254971206188202,
"eval_runtime": 69.2697,
"eval_samples_per_second": 30.157,
"eval_steps_per_second": 3.782,
"step": 103368
}
],
"logging_steps": 100,
"max_steps": 258430,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 12921,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.1862160593846272e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}