{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.10006171569635877,
"eval_steps": 500,
"global_step": 1216,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 8.228759514503188e-05,
"grad_norm": 9.532528095057138,
"learning_rate": 5.479452054794521e-08,
"loss": 0.7901,
"step": 1
},
{
"epoch": 0.00016457519029006376,
"grad_norm": 30.026945671831577,
"learning_rate": 1.0958904109589042e-07,
"loss": 2.1253,
"step": 2
},
{
"epoch": 0.00024686278543509563,
"grad_norm": 8.88519815829157,
"learning_rate": 1.6438356164383561e-07,
"loss": 0.7715,
"step": 3
},
{
"epoch": 0.00032915038058012753,
"grad_norm": 29.197616305414858,
"learning_rate": 2.1917808219178084e-07,
"loss": 2.1284,
"step": 4
},
{
"epoch": 0.0004114379757251594,
"grad_norm": 29.892559190290434,
"learning_rate": 2.73972602739726e-07,
"loss": 2.0685,
"step": 5
},
{
"epoch": 0.0004937255708701913,
"grad_norm": 10.567782598278942,
"learning_rate": 3.2876712328767123e-07,
"loss": 0.8122,
"step": 6
},
{
"epoch": 0.0005760131660152232,
"grad_norm": 28.912763215741734,
"learning_rate": 3.835616438356165e-07,
"loss": 2.1056,
"step": 7
},
{
"epoch": 0.0006583007611602551,
"grad_norm": 29.51664131482477,
"learning_rate": 4.383561643835617e-07,
"loss": 2.0418,
"step": 8
},
{
"epoch": 0.000740588356305287,
"grad_norm": 28.30266632286417,
"learning_rate": 4.931506849315068e-07,
"loss": 2.0237,
"step": 9
},
{
"epoch": 0.0008228759514503189,
"grad_norm": 27.46875103243188,
"learning_rate": 5.47945205479452e-07,
"loss": 1.9595,
"step": 10
},
{
"epoch": 0.0009051635465953507,
"grad_norm": 24.865752165641698,
"learning_rate": 6.027397260273974e-07,
"loss": 1.9174,
"step": 11
},
{
"epoch": 0.0009874511417403825,
"grad_norm": 24.328147714809518,
"learning_rate": 6.575342465753425e-07,
"loss": 1.9307,
"step": 12
},
{
"epoch": 0.0010697387368854144,
"grad_norm": 5.5234808874616395,
"learning_rate": 7.123287671232878e-07,
"loss": 0.8138,
"step": 13
},
{
"epoch": 0.0011520263320304463,
"grad_norm": 24.035678143620423,
"learning_rate": 7.67123287671233e-07,
"loss": 1.9803,
"step": 14
},
{
"epoch": 0.0012343139271754782,
"grad_norm": 20.7270429685146,
"learning_rate": 8.219178082191781e-07,
"loss": 1.8216,
"step": 15
},
{
"epoch": 0.0013166015223205101,
"grad_norm": 3.1954913902580597,
"learning_rate": 8.767123287671234e-07,
"loss": 0.7577,
"step": 16
},
{
"epoch": 0.001398889117465542,
"grad_norm": 19.0932823831642,
"learning_rate": 9.315068493150686e-07,
"loss": 1.8765,
"step": 17
},
{
"epoch": 0.001481176712610574,
"grad_norm": 17.783753558169572,
"learning_rate": 9.863013698630137e-07,
"loss": 1.7423,
"step": 18
},
{
"epoch": 0.0015634643077556058,
"grad_norm": 13.929582396803928,
"learning_rate": 1.041095890410959e-06,
"loss": 1.5683,
"step": 19
},
{
"epoch": 0.0016457519029006377,
"grad_norm": 10.860155069125868,
"learning_rate": 1.095890410958904e-06,
"loss": 1.5344,
"step": 20
},
{
"epoch": 0.0017280394980456696,
"grad_norm": 10.868210550382598,
"learning_rate": 1.1506849315068494e-06,
"loss": 1.4788,
"step": 21
},
{
"epoch": 0.0018103270931907015,
"grad_norm": 9.306619668804826,
"learning_rate": 1.2054794520547947e-06,
"loss": 1.4831,
"step": 22
},
{
"epoch": 0.0018926146883357334,
"grad_norm": 2.4601086961337857,
"learning_rate": 1.26027397260274e-06,
"loss": 0.7305,
"step": 23
},
{
"epoch": 0.001974902283480765,
"grad_norm": 7.6886950923134005,
"learning_rate": 1.315068493150685e-06,
"loss": 1.4257,
"step": 24
},
{
"epoch": 0.002057189878625797,
"grad_norm": 6.220708397685521,
"learning_rate": 1.3698630136986302e-06,
"loss": 1.3468,
"step": 25
},
{
"epoch": 0.002139477473770829,
"grad_norm": 4.674476253548759,
"learning_rate": 1.4246575342465755e-06,
"loss": 1.3151,
"step": 26
},
{
"epoch": 0.002221765068915861,
"grad_norm": 3.895214381538298,
"learning_rate": 1.4794520547945206e-06,
"loss": 1.3041,
"step": 27
},
{
"epoch": 0.0023040526640608927,
"grad_norm": 3.527134956076901,
"learning_rate": 1.534246575342466e-06,
"loss": 1.2878,
"step": 28
},
{
"epoch": 0.0023863402592059248,
"grad_norm": 3.5362809667326522,
"learning_rate": 1.5890410958904112e-06,
"loss": 1.2726,
"step": 29
},
{
"epoch": 0.0024686278543509564,
"grad_norm": 2.966450361552696,
"learning_rate": 1.6438356164383561e-06,
"loss": 1.2993,
"step": 30
},
{
"epoch": 0.0025509154494959886,
"grad_norm": 2.458939366346722,
"learning_rate": 1.6986301369863014e-06,
"loss": 1.281,
"step": 31
},
{
"epoch": 0.0026332030446410202,
"grad_norm": 2.535030337573037,
"learning_rate": 1.7534246575342468e-06,
"loss": 1.2708,
"step": 32
},
{
"epoch": 0.0027154906397860524,
"grad_norm": 1.239317382781359,
"learning_rate": 1.808219178082192e-06,
"loss": 0.6648,
"step": 33
},
{
"epoch": 0.002797778234931084,
"grad_norm": 1.1180854196130607,
"learning_rate": 1.8630136986301372e-06,
"loss": 0.6646,
"step": 34
},
{
"epoch": 0.002880065830076116,
"grad_norm": 2.1450564270921646,
"learning_rate": 1.9178082191780823e-06,
"loss": 1.2447,
"step": 35
},
{
"epoch": 0.002962353425221148,
"grad_norm": 1.8049145439148968,
"learning_rate": 1.9726027397260274e-06,
"loss": 1.1815,
"step": 36
},
{
"epoch": 0.00304464102036618,
"grad_norm": 0.795375753210199,
"learning_rate": 2.027397260273973e-06,
"loss": 0.6292,
"step": 37
},
{
"epoch": 0.0031269286155112116,
"grad_norm": 0.7439259016336192,
"learning_rate": 2.082191780821918e-06,
"loss": 0.6468,
"step": 38
},
{
"epoch": 0.0032092162106562437,
"grad_norm": 2.102073236832498,
"learning_rate": 2.1369863013698635e-06,
"loss": 1.1965,
"step": 39
},
{
"epoch": 0.0032915038058012754,
"grad_norm": 1.7507482751861791,
"learning_rate": 2.191780821917808e-06,
"loss": 1.147,
"step": 40
},
{
"epoch": 0.0033737914009463075,
"grad_norm": 2.115499646494852,
"learning_rate": 2.2465753424657537e-06,
"loss": 1.2079,
"step": 41
},
{
"epoch": 0.003456078996091339,
"grad_norm": 1.5822724466961147,
"learning_rate": 2.301369863013699e-06,
"loss": 1.213,
"step": 42
},
{
"epoch": 0.0035383665912363713,
"grad_norm": 0.6843357265370693,
"learning_rate": 2.356164383561644e-06,
"loss": 0.624,
"step": 43
},
{
"epoch": 0.003620654186381403,
"grad_norm": 1.9669305292499641,
"learning_rate": 2.4109589041095894e-06,
"loss": 1.1691,
"step": 44
},
{
"epoch": 0.003702941781526435,
"grad_norm": 4.293989393639943,
"learning_rate": 2.4657534246575345e-06,
"loss": 1.1484,
"step": 45
},
{
"epoch": 0.003785229376671467,
"grad_norm": 1.3873591085798673,
"learning_rate": 2.52054794520548e-06,
"loss": 1.177,
"step": 46
},
{
"epoch": 0.0038675169718164985,
"grad_norm": 3.6561002665760807,
"learning_rate": 2.5753424657534247e-06,
"loss": 1.1469,
"step": 47
},
{
"epoch": 0.00394980456696153,
"grad_norm": 1.5450365482515196,
"learning_rate": 2.63013698630137e-06,
"loss": 1.1521,
"step": 48
},
{
"epoch": 0.004032092162106563,
"grad_norm": 1.5565124011894804,
"learning_rate": 2.6849315068493153e-06,
"loss": 1.1589,
"step": 49
},
{
"epoch": 0.004114379757251594,
"grad_norm": 0.6675144755255817,
"learning_rate": 2.7397260273972604e-06,
"loss": 0.6406,
"step": 50
},
{
"epoch": 0.004196667352396626,
"grad_norm": 1.5292143908928457,
"learning_rate": 2.794520547945206e-06,
"loss": 1.1297,
"step": 51
},
{
"epoch": 0.004278954947541658,
"grad_norm": 0.6502938857874467,
"learning_rate": 2.849315068493151e-06,
"loss": 0.6186,
"step": 52
},
{
"epoch": 0.00436124254268669,
"grad_norm": 1.4333837148693778,
"learning_rate": 2.9041095890410957e-06,
"loss": 1.1303,
"step": 53
},
{
"epoch": 0.004443530137831722,
"grad_norm": 1.4749791593345467,
"learning_rate": 2.9589041095890413e-06,
"loss": 1.1387,
"step": 54
},
{
"epoch": 0.004525817732976754,
"grad_norm": 1.4998339630977238,
"learning_rate": 3.0136986301369864e-06,
"loss": 1.1857,
"step": 55
},
{
"epoch": 0.004608105328121785,
"grad_norm": 1.5507431529256293,
"learning_rate": 3.068493150684932e-06,
"loss": 1.1487,
"step": 56
},
{
"epoch": 0.004690392923266818,
"grad_norm": 1.6348282836598194,
"learning_rate": 3.123287671232877e-06,
"loss": 1.1641,
"step": 57
},
{
"epoch": 0.0047726805184118495,
"grad_norm": 0.5752534532225031,
"learning_rate": 3.1780821917808225e-06,
"loss": 0.5701,
"step": 58
},
{
"epoch": 0.004854968113556881,
"grad_norm": 1.6099812024773308,
"learning_rate": 3.2328767123287676e-06,
"loss": 1.1721,
"step": 59
},
{
"epoch": 0.004937255708701913,
"grad_norm": 0.6408161226805661,
"learning_rate": 3.2876712328767123e-06,
"loss": 0.5998,
"step": 60
},
{
"epoch": 0.0050195433038469454,
"grad_norm": 0.5617271278467075,
"learning_rate": 3.342465753424658e-06,
"loss": 0.6265,
"step": 61
},
{
"epoch": 0.005101830898991977,
"grad_norm": 1.9160395609787255,
"learning_rate": 3.397260273972603e-06,
"loss": 1.1687,
"step": 62
},
{
"epoch": 0.005184118494137009,
"grad_norm": 1.7944962743686514,
"learning_rate": 3.4520547945205484e-06,
"loss": 1.0999,
"step": 63
},
{
"epoch": 0.0052664060892820405,
"grad_norm": 1.6550254402978586,
"learning_rate": 3.5068493150684935e-06,
"loss": 1.1283,
"step": 64
},
{
"epoch": 0.005348693684427073,
"grad_norm": 2.06701106889446,
"learning_rate": 3.5616438356164386e-06,
"loss": 1.1449,
"step": 65
},
{
"epoch": 0.005430981279572105,
"grad_norm": 1.334891505276627,
"learning_rate": 3.616438356164384e-06,
"loss": 1.0978,
"step": 66
},
{
"epoch": 0.005513268874717136,
"grad_norm": 1.809032539584058,
"learning_rate": 3.671232876712329e-06,
"loss": 1.1172,
"step": 67
},
{
"epoch": 0.005595556469862168,
"grad_norm": 0.5631162064075181,
"learning_rate": 3.7260273972602743e-06,
"loss": 0.5793,
"step": 68
},
{
"epoch": 0.0056778440650072,
"grad_norm": 1.6486487445332147,
"learning_rate": 3.7808219178082194e-06,
"loss": 1.0659,
"step": 69
},
{
"epoch": 0.005760131660152232,
"grad_norm": 1.7514518974861626,
"learning_rate": 3.8356164383561645e-06,
"loss": 1.1786,
"step": 70
},
{
"epoch": 0.005842419255297264,
"grad_norm": 2.6958756773092887,
"learning_rate": 3.89041095890411e-06,
"loss": 1.1019,
"step": 71
},
{
"epoch": 0.005924706850442296,
"grad_norm": 1.7803679070531404,
"learning_rate": 3.945205479452055e-06,
"loss": 1.0859,
"step": 72
},
{
"epoch": 0.006006994445587327,
"grad_norm": 1.5059878641321802,
"learning_rate": 4.000000000000001e-06,
"loss": 1.0788,
"step": 73
},
{
"epoch": 0.00608928204073236,
"grad_norm": 1.8716327109844846,
"learning_rate": 4.054794520547946e-06,
"loss": 1.1095,
"step": 74
},
{
"epoch": 0.0061715696358773916,
"grad_norm": 1.5616475319286818,
"learning_rate": 4.109589041095891e-06,
"loss": 1.1278,
"step": 75
},
{
"epoch": 0.006253857231022423,
"grad_norm": 1.493898527453622,
"learning_rate": 4.164383561643836e-06,
"loss": 1.104,
"step": 76
},
{
"epoch": 0.006336144826167455,
"grad_norm": 1.8452837120263397,
"learning_rate": 4.219178082191781e-06,
"loss": 1.1095,
"step": 77
},
{
"epoch": 0.0064184324213124875,
"grad_norm": 1.784319898693149,
"learning_rate": 4.273972602739727e-06,
"loss": 1.0949,
"step": 78
},
{
"epoch": 0.006500720016457519,
"grad_norm": 2.137737098454538,
"learning_rate": 4.328767123287671e-06,
"loss": 1.1302,
"step": 79
},
{
"epoch": 0.006583007611602551,
"grad_norm": 1.5914074135685312,
"learning_rate": 4.383561643835616e-06,
"loss": 1.0916,
"step": 80
},
{
"epoch": 0.0066652952067475825,
"grad_norm": 2.3489068213528266,
"learning_rate": 4.438356164383562e-06,
"loss": 1.0729,
"step": 81
},
{
"epoch": 0.006747582801892615,
"grad_norm": 2.073369039063705,
"learning_rate": 4.493150684931507e-06,
"loss": 1.0892,
"step": 82
},
{
"epoch": 0.006829870397037647,
"grad_norm": 1.8770075428367665,
"learning_rate": 4.5479452054794525e-06,
"loss": 1.1187,
"step": 83
},
{
"epoch": 0.006912157992182678,
"grad_norm": 4.506883747948483,
"learning_rate": 4.602739726027398e-06,
"loss": 1.0762,
"step": 84
},
{
"epoch": 0.00699444558732771,
"grad_norm": 1.7209663187813125,
"learning_rate": 4.657534246575343e-06,
"loss": 1.1226,
"step": 85
},
{
"epoch": 0.007076733182472743,
"grad_norm": 0.6052191270162426,
"learning_rate": 4.712328767123288e-06,
"loss": 0.6055,
"step": 86
},
{
"epoch": 0.007159020777617774,
"grad_norm": 1.7994312730778819,
"learning_rate": 4.767123287671233e-06,
"loss": 1.0967,
"step": 87
},
{
"epoch": 0.007241308372762806,
"grad_norm": 1.9304702595282108,
"learning_rate": 4.821917808219179e-06,
"loss": 1.1492,
"step": 88
},
{
"epoch": 0.007323595967907838,
"grad_norm": 2.088564652992412,
"learning_rate": 4.876712328767124e-06,
"loss": 1.0985,
"step": 89
},
{
"epoch": 0.00740588356305287,
"grad_norm": 1.8604994381662585,
"learning_rate": 4.931506849315069e-06,
"loss": 1.0923,
"step": 90
},
{
"epoch": 0.007488171158197902,
"grad_norm": 0.5594391183994828,
"learning_rate": 4.986301369863014e-06,
"loss": 0.6021,
"step": 91
},
{
"epoch": 0.007570458753342934,
"grad_norm": 1.7905925850647735,
"learning_rate": 5.04109589041096e-06,
"loss": 1.1047,
"step": 92
},
{
"epoch": 0.007652746348487965,
"grad_norm": 2.5829004230758055,
"learning_rate": 5.095890410958904e-06,
"loss": 1.0856,
"step": 93
},
{
"epoch": 0.007735033943632997,
"grad_norm": 2.8109366679812817,
"learning_rate": 5.1506849315068494e-06,
"loss": 1.0906,
"step": 94
},
{
"epoch": 0.00781732153877803,
"grad_norm": 1.9488333893087777,
"learning_rate": 5.2054794520547945e-06,
"loss": 1.1174,
"step": 95
},
{
"epoch": 0.00789960913392306,
"grad_norm": 1.8898489727850725,
"learning_rate": 5.26027397260274e-06,
"loss": 1.0764,
"step": 96
},
{
"epoch": 0.007981896729068093,
"grad_norm": 1.9662220110655733,
"learning_rate": 5.3150684931506856e-06,
"loss": 1.0687,
"step": 97
},
{
"epoch": 0.008064184324213125,
"grad_norm": 2.012210892740288,
"learning_rate": 5.369863013698631e-06,
"loss": 1.0688,
"step": 98
},
{
"epoch": 0.008146471919358156,
"grad_norm": 2.0256582980555145,
"learning_rate": 5.424657534246576e-06,
"loss": 1.0435,
"step": 99
},
{
"epoch": 0.008228759514503189,
"grad_norm": 2.3161294458478228,
"learning_rate": 5.479452054794521e-06,
"loss": 1.1027,
"step": 100
},
{
"epoch": 0.008311047109648221,
"grad_norm": 2.159842764055281,
"learning_rate": 5.534246575342466e-06,
"loss": 1.0223,
"step": 101
},
{
"epoch": 0.008393334704793252,
"grad_norm": 2.7342793057170964,
"learning_rate": 5.589041095890412e-06,
"loss": 1.0485,
"step": 102
},
{
"epoch": 0.008475622299938285,
"grad_norm": 0.6133807544248717,
"learning_rate": 5.643835616438357e-06,
"loss": 0.5933,
"step": 103
},
{
"epoch": 0.008557909895083315,
"grad_norm": 2.0957817610708593,
"learning_rate": 5.698630136986302e-06,
"loss": 1.084,
"step": 104
},
{
"epoch": 0.008640197490228348,
"grad_norm": 3.0607800999765105,
"learning_rate": 5.753424657534246e-06,
"loss": 1.0369,
"step": 105
},
{
"epoch": 0.00872248508537338,
"grad_norm": 2.3550652220766404,
"learning_rate": 5.8082191780821915e-06,
"loss": 1.0785,
"step": 106
},
{
"epoch": 0.008804772680518411,
"grad_norm": 2.885362070393249,
"learning_rate": 5.863013698630137e-06,
"loss": 1.1143,
"step": 107
},
{
"epoch": 0.008887060275663444,
"grad_norm": 2.726344088292101,
"learning_rate": 5.9178082191780825e-06,
"loss": 1.0423,
"step": 108
},
{
"epoch": 0.008969347870808476,
"grad_norm": 2.720421039977678,
"learning_rate": 5.972602739726028e-06,
"loss": 1.0424,
"step": 109
},
{
"epoch": 0.009051635465953507,
"grad_norm": 2.7737084246092043,
"learning_rate": 6.027397260273973e-06,
"loss": 1.0669,
"step": 110
},
{
"epoch": 0.00913392306109854,
"grad_norm": 2.4862795852431696,
"learning_rate": 6.082191780821919e-06,
"loss": 1.0798,
"step": 111
},
{
"epoch": 0.00921621065624357,
"grad_norm": 1.9953691894673529,
"learning_rate": 6.136986301369864e-06,
"loss": 1.0337,
"step": 112
},
{
"epoch": 0.009298498251388603,
"grad_norm": 2.1734409375655908,
"learning_rate": 6.191780821917809e-06,
"loss": 1.0769,
"step": 113
},
{
"epoch": 0.009380785846533636,
"grad_norm": 2.4691052918090457,
"learning_rate": 6.246575342465754e-06,
"loss": 1.0758,
"step": 114
},
{
"epoch": 0.009463073441678667,
"grad_norm": 2.51765809469206,
"learning_rate": 6.301369863013699e-06,
"loss": 1.1065,
"step": 115
},
{
"epoch": 0.009545361036823699,
"grad_norm": 2.3976820917439916,
"learning_rate": 6.356164383561645e-06,
"loss": 1.0454,
"step": 116
},
{
"epoch": 0.00962764863196873,
"grad_norm": 0.5713752667519881,
"learning_rate": 6.41095890410959e-06,
"loss": 0.5767,
"step": 117
},
{
"epoch": 0.009709936227113762,
"grad_norm": 2.9303587471653385,
"learning_rate": 6.465753424657535e-06,
"loss": 1.0596,
"step": 118
},
{
"epoch": 0.009792223822258795,
"grad_norm": 2.625385971373383,
"learning_rate": 6.5205479452054794e-06,
"loss": 1.0694,
"step": 119
},
{
"epoch": 0.009874511417403826,
"grad_norm": 2.6850490082257368,
"learning_rate": 6.5753424657534245e-06,
"loss": 1.0629,
"step": 120
},
{
"epoch": 0.009956799012548858,
"grad_norm": 2.8941680627630575,
"learning_rate": 6.630136986301371e-06,
"loss": 1.0797,
"step": 121
},
{
"epoch": 0.010039086607693891,
"grad_norm": 2.437227451528501,
"learning_rate": 6.684931506849316e-06,
"loss": 1.0446,
"step": 122
},
{
"epoch": 0.010121374202838922,
"grad_norm": 4.2330170384868655,
"learning_rate": 6.739726027397261e-06,
"loss": 1.077,
"step": 123
},
{
"epoch": 0.010203661797983954,
"grad_norm": 3.742681446646284,
"learning_rate": 6.794520547945206e-06,
"loss": 1.0578,
"step": 124
},
{
"epoch": 0.010285949393128985,
"grad_norm": 2.905751102486295,
"learning_rate": 6.849315068493151e-06,
"loss": 1.0397,
"step": 125
},
{
"epoch": 0.010368236988274018,
"grad_norm": 2.248809486049495,
"learning_rate": 6.904109589041097e-06,
"loss": 1.0057,
"step": 126
},
{
"epoch": 0.01045052458341905,
"grad_norm": 2.793469113179832,
"learning_rate": 6.958904109589042e-06,
"loss": 1.0423,
"step": 127
},
{
"epoch": 0.010532812178564081,
"grad_norm": 3.044433211099124,
"learning_rate": 7.013698630136987e-06,
"loss": 1.0519,
"step": 128
},
{
"epoch": 0.010615099773709114,
"grad_norm": 3.453404138683163,
"learning_rate": 7.068493150684932e-06,
"loss": 1.0492,
"step": 129
},
{
"epoch": 0.010697387368854146,
"grad_norm": 3.294896819292345,
"learning_rate": 7.123287671232877e-06,
"loss": 1.0186,
"step": 130
},
{
"epoch": 0.010779674963999177,
"grad_norm": 2.652529510878711,
"learning_rate": 7.178082191780823e-06,
"loss": 1.0481,
"step": 131
},
{
"epoch": 0.01086196255914421,
"grad_norm": 2.5635334133873835,
"learning_rate": 7.232876712328768e-06,
"loss": 1.0189,
"step": 132
},
{
"epoch": 0.01094425015428924,
"grad_norm": 2.310822969570939,
"learning_rate": 7.287671232876713e-06,
"loss": 1.0804,
"step": 133
},
{
"epoch": 0.011026537749434273,
"grad_norm": 2.7939745420750532,
"learning_rate": 7.342465753424658e-06,
"loss": 1.0731,
"step": 134
},
{
"epoch": 0.011108825344579305,
"grad_norm": 10.159052417359996,
"learning_rate": 7.397260273972603e-06,
"loss": 1.0013,
"step": 135
},
{
"epoch": 0.011191112939724336,
"grad_norm": 2.492104076947929,
"learning_rate": 7.452054794520549e-06,
"loss": 1.058,
"step": 136
},
{
"epoch": 0.011273400534869369,
"grad_norm": 2.7323610574219512,
"learning_rate": 7.506849315068494e-06,
"loss": 1.0503,
"step": 137
},
{
"epoch": 0.0113556881300144,
"grad_norm": 2.94667222448598,
"learning_rate": 7.561643835616439e-06,
"loss": 1.0283,
"step": 138
},
{
"epoch": 0.011437975725159432,
"grad_norm": 4.017422542900321,
"learning_rate": 7.616438356164384e-06,
"loss": 1.0883,
"step": 139
},
{
"epoch": 0.011520263320304465,
"grad_norm": 3.6715275879486633,
"learning_rate": 7.671232876712329e-06,
"loss": 1.0536,
"step": 140
},
{
"epoch": 0.011602550915449495,
"grad_norm": 3.0172048685106603,
"learning_rate": 7.726027397260276e-06,
"loss": 1.055,
"step": 141
},
{
"epoch": 0.011684838510594528,
"grad_norm": 3.077620329335805,
"learning_rate": 7.78082191780822e-06,
"loss": 1.0195,
"step": 142
},
{
"epoch": 0.01176712610573956,
"grad_norm": 2.959594926294125,
"learning_rate": 7.835616438356164e-06,
"loss": 1.0369,
"step": 143
},
{
"epoch": 0.011849413700884591,
"grad_norm": 5.2531338908420055,
"learning_rate": 7.89041095890411e-06,
"loss": 1.0524,
"step": 144
},
{
"epoch": 0.011931701296029624,
"grad_norm": 2.9462988063147755,
"learning_rate": 7.945205479452055e-06,
"loss": 1.0258,
"step": 145
},
{
"epoch": 0.012013988891174655,
"grad_norm": 2.835501864556677,
"learning_rate": 8.000000000000001e-06,
"loss": 1.0035,
"step": 146
},
{
"epoch": 0.012096276486319687,
"grad_norm": 3.1002864915340798,
"learning_rate": 8.054794520547946e-06,
"loss": 1.0379,
"step": 147
},
{
"epoch": 0.01217856408146472,
"grad_norm": 2.7184860323108464,
"learning_rate": 8.109589041095892e-06,
"loss": 1.0373,
"step": 148
},
{
"epoch": 0.01226085167660975,
"grad_norm": 3.093424317685046,
"learning_rate": 8.164383561643837e-06,
"loss": 1.0559,
"step": 149
},
{
"epoch": 0.012343139271754783,
"grad_norm": 2.9403313251924064,
"learning_rate": 8.219178082191782e-06,
"loss": 1.0312,
"step": 150
},
{
"epoch": 0.012425426866899816,
"grad_norm": 3.334710236004298,
"learning_rate": 8.273972602739727e-06,
"loss": 1.032,
"step": 151
},
{
"epoch": 0.012507714462044846,
"grad_norm": 3.754339855053731,
"learning_rate": 8.328767123287672e-06,
"loss": 1.007,
"step": 152
},
{
"epoch": 0.012590002057189879,
"grad_norm": 3.468367068790295,
"learning_rate": 8.383561643835617e-06,
"loss": 1.0352,
"step": 153
},
{
"epoch": 0.01267228965233491,
"grad_norm": 3.08946479512089,
"learning_rate": 8.438356164383562e-06,
"loss": 1.0285,
"step": 154
},
{
"epoch": 0.012754577247479942,
"grad_norm": 2.7171722187405463,
"learning_rate": 8.493150684931507e-06,
"loss": 1.0355,
"step": 155
},
{
"epoch": 0.012836864842624975,
"grad_norm": 2.9125857783989955,
"learning_rate": 8.547945205479454e-06,
"loss": 1.0383,
"step": 156
},
{
"epoch": 0.012919152437770006,
"grad_norm": 3.431055558365553,
"learning_rate": 8.602739726027397e-06,
"loss": 0.9858,
"step": 157
},
{
"epoch": 0.013001440032915038,
"grad_norm": 2.5695243675652906,
"learning_rate": 8.657534246575343e-06,
"loss": 1.0257,
"step": 158
},
{
"epoch": 0.013083727628060069,
"grad_norm": 3.1403965108405645,
"learning_rate": 8.712328767123288e-06,
"loss": 1.0161,
"step": 159
},
{
"epoch": 0.013166015223205102,
"grad_norm": 3.0914617102513535,
"learning_rate": 8.767123287671233e-06,
"loss": 1.0126,
"step": 160
},
{
"epoch": 0.013248302818350134,
"grad_norm": 2.974266261740425,
"learning_rate": 8.82191780821918e-06,
"loss": 1.0146,
"step": 161
},
{
"epoch": 0.013330590413495165,
"grad_norm": 4.453619610906972,
"learning_rate": 8.876712328767125e-06,
"loss": 1.01,
"step": 162
},
{
"epoch": 0.013412878008640198,
"grad_norm": 3.3339134633525203,
"learning_rate": 8.93150684931507e-06,
"loss": 1.0164,
"step": 163
},
{
"epoch": 0.01349516560378523,
"grad_norm": 3.096524915506246,
"learning_rate": 8.986301369863015e-06,
"loss": 1.0436,
"step": 164
},
{
"epoch": 0.013577453198930261,
"grad_norm": 0.5714699105064062,
"learning_rate": 9.04109589041096e-06,
"loss": 0.5844,
"step": 165
},
{
"epoch": 0.013659740794075293,
"grad_norm": 3.3053733088978294,
"learning_rate": 9.095890410958905e-06,
"loss": 1.01,
"step": 166
},
{
"epoch": 0.013742028389220324,
"grad_norm": 3.042487650681917,
"learning_rate": 9.15068493150685e-06,
"loss": 1.0258,
"step": 167
},
{
"epoch": 0.013824315984365357,
"grad_norm": 3.0826602321214267,
"learning_rate": 9.205479452054795e-06,
"loss": 1.0152,
"step": 168
},
{
"epoch": 0.01390660357951039,
"grad_norm": 4.049305212778963,
"learning_rate": 9.26027397260274e-06,
"loss": 1.0344,
"step": 169
},
{
"epoch": 0.01398889117465542,
"grad_norm": 2.262878129775452,
"learning_rate": 9.315068493150685e-06,
"loss": 0.9903,
"step": 170
},
{
"epoch": 0.014071178769800453,
"grad_norm": 2.5478144837312904,
"learning_rate": 9.36986301369863e-06,
"loss": 1.0255,
"step": 171
},
{
"epoch": 0.014153466364945485,
"grad_norm": 0.5963923221726043,
"learning_rate": 9.424657534246576e-06,
"loss": 0.5835,
"step": 172
},
{
"epoch": 0.014235753960090516,
"grad_norm": 2.4229291883624775,
"learning_rate": 9.47945205479452e-06,
"loss": 0.9969,
"step": 173
},
{
"epoch": 0.014318041555235549,
"grad_norm": 2.5861485778295563,
"learning_rate": 9.534246575342466e-06,
"loss": 1.0321,
"step": 174
},
{
"epoch": 0.01440032915038058,
"grad_norm": 3.0535728376170868,
"learning_rate": 9.589041095890411e-06,
"loss": 1.0545,
"step": 175
},
{
"epoch": 0.014482616745525612,
"grad_norm": 3.167624134264756,
"learning_rate": 9.643835616438358e-06,
"loss": 1.0212,
"step": 176
},
{
"epoch": 0.014564904340670645,
"grad_norm": 2.532407359117499,
"learning_rate": 9.698630136986303e-06,
"loss": 1.0395,
"step": 177
},
{
"epoch": 0.014647191935815675,
"grad_norm": 3.335905765902237,
"learning_rate": 9.753424657534248e-06,
"loss": 1.0444,
"step": 178
},
{
"epoch": 0.014729479530960708,
"grad_norm": 2.6694368517880376,
"learning_rate": 9.808219178082193e-06,
"loss": 1.0609,
"step": 179
},
{
"epoch": 0.01481176712610574,
"grad_norm": 2.4432476499205946,
"learning_rate": 9.863013698630138e-06,
"loss": 1.028,
"step": 180
},
{
"epoch": 0.014894054721250771,
"grad_norm": 3.074867289580692,
"learning_rate": 9.917808219178083e-06,
"loss": 1.0277,
"step": 181
},
{
"epoch": 0.014976342316395804,
"grad_norm": 2.8234239360995548,
"learning_rate": 9.972602739726028e-06,
"loss": 1.0145,
"step": 182
},
{
"epoch": 0.015058629911540835,
"grad_norm": 2.7243533214462636,
"learning_rate": 1.0027397260273975e-05,
"loss": 0.9962,
"step": 183
},
{
"epoch": 0.015140917506685867,
"grad_norm": 9.268831121545867,
"learning_rate": 1.008219178082192e-05,
"loss": 1.0202,
"step": 184
},
{
"epoch": 0.0152232051018309,
"grad_norm": 0.6032487906705319,
"learning_rate": 1.0136986301369864e-05,
"loss": 0.5914,
"step": 185
},
{
"epoch": 0.01530549269697593,
"grad_norm": 2.446903956621448,
"learning_rate": 1.0191780821917809e-05,
"loss": 1.0332,
"step": 186
},
{
"epoch": 0.015387780292120963,
"grad_norm": 2.9898530283159857,
"learning_rate": 1.0246575342465754e-05,
"loss": 1.0058,
"step": 187
},
{
"epoch": 0.015470067887265994,
"grad_norm": 3.1462756197093147,
"learning_rate": 1.0301369863013699e-05,
"loss": 0.9956,
"step": 188
},
{
"epoch": 0.015552355482411026,
"grad_norm": 2.603677254795289,
"learning_rate": 1.0356164383561644e-05,
"loss": 1.0567,
"step": 189
},
{
"epoch": 0.01563464307755606,
"grad_norm": 2.888609337531178,
"learning_rate": 1.0410958904109589e-05,
"loss": 1.0117,
"step": 190
},
{
"epoch": 0.01571693067270109,
"grad_norm": 3.4481892347405694,
"learning_rate": 1.0465753424657534e-05,
"loss": 1.0312,
"step": 191
},
{
"epoch": 0.01579921826784612,
"grad_norm": 2.723259220748936,
"learning_rate": 1.052054794520548e-05,
"loss": 1.0011,
"step": 192
},
{
"epoch": 0.015881505862991155,
"grad_norm": 2.400388335266181,
"learning_rate": 1.0575342465753426e-05,
"loss": 1.0397,
"step": 193
},
{
"epoch": 0.015963793458136186,
"grad_norm": 2.459799194471057,
"learning_rate": 1.0630136986301371e-05,
"loss": 1.0051,
"step": 194
},
{
"epoch": 0.016046081053281216,
"grad_norm": 2.493367813709158,
"learning_rate": 1.0684931506849316e-05,
"loss": 0.9877,
"step": 195
},
{
"epoch": 0.01612836864842625,
"grad_norm": 2.997365023733453,
"learning_rate": 1.0739726027397261e-05,
"loss": 0.9991,
"step": 196
},
{
"epoch": 0.01621065624357128,
"grad_norm": 3.1534988892754927,
"learning_rate": 1.0794520547945206e-05,
"loss": 1.0088,
"step": 197
},
{
"epoch": 0.016292943838716312,
"grad_norm": 0.7839570400001313,
"learning_rate": 1.0849315068493152e-05,
"loss": 0.5796,
"step": 198
},
{
"epoch": 0.016375231433861347,
"grad_norm": 2.968831135340441,
"learning_rate": 1.0904109589041097e-05,
"loss": 1.0169,
"step": 199
},
{
"epoch": 0.016457519029006377,
"grad_norm": 3.1769343467774736,
"learning_rate": 1.0958904109589042e-05,
"loss": 1.0097,
"step": 200
},
{
"epoch": 0.01653980662415141,
"grad_norm": 2.941876345769733,
"learning_rate": 1.1013698630136987e-05,
"loss": 1.0021,
"step": 201
},
{
"epoch": 0.016622094219296443,
"grad_norm": 3.3680817014108353,
"learning_rate": 1.1068493150684932e-05,
"loss": 1.0218,
"step": 202
},
{
"epoch": 0.016704381814441473,
"grad_norm": 2.908397865551594,
"learning_rate": 1.1123287671232879e-05,
"loss": 0.9939,
"step": 203
},
{
"epoch": 0.016786669409586504,
"grad_norm": 2.822395296594326,
"learning_rate": 1.1178082191780824e-05,
"loss": 1.0172,
"step": 204
},
{
"epoch": 0.016868957004731535,
"grad_norm": 2.758365809402905,
"learning_rate": 1.1232876712328769e-05,
"loss": 1.05,
"step": 205
},
{
"epoch": 0.01695124459987657,
"grad_norm": 2.9222144058188984,
"learning_rate": 1.1287671232876714e-05,
"loss": 1.0073,
"step": 206
},
{
"epoch": 0.0170335321950216,
"grad_norm": 2.7763083571649547,
"learning_rate": 1.1342465753424659e-05,
"loss": 0.9958,
"step": 207
},
{
"epoch": 0.01711581979016663,
"grad_norm": 0.9573751817349475,
"learning_rate": 1.1397260273972604e-05,
"loss": 0.6336,
"step": 208
},
{
"epoch": 0.017198107385311665,
"grad_norm": 3.6768856466236857,
"learning_rate": 1.1452054794520548e-05,
"loss": 0.9839,
"step": 209
},
{
"epoch": 0.017280394980456696,
"grad_norm": 0.6002615125347783,
"learning_rate": 1.1506849315068493e-05,
"loss": 0.5964,
"step": 210
},
{
"epoch": 0.017362682575601727,
"grad_norm": 3.003839522918383,
"learning_rate": 1.1561643835616438e-05,
"loss": 1.0106,
"step": 211
},
{
"epoch": 0.01744497017074676,
"grad_norm": 3.0141237654512305,
"learning_rate": 1.1616438356164383e-05,
"loss": 1.005,
"step": 212
},
{
"epoch": 0.017527257765891792,
"grad_norm": 2.3380796106197583,
"learning_rate": 1.1671232876712331e-05,
"loss": 1.0025,
"step": 213
},
{
"epoch": 0.017609545361036823,
"grad_norm": 2.749317750470713,
"learning_rate": 1.1726027397260275e-05,
"loss": 1.0208,
"step": 214
},
{
"epoch": 0.017691832956181857,
"grad_norm": 2.5174324368341363,
"learning_rate": 1.178082191780822e-05,
"loss": 1.0225,
"step": 215
},
{
"epoch": 0.017774120551326888,
"grad_norm": 2.6939469770631206,
"learning_rate": 1.1835616438356165e-05,
"loss": 1.0181,
"step": 216
},
{
"epoch": 0.01785640814647192,
"grad_norm": 2.7969043874385218,
"learning_rate": 1.189041095890411e-05,
"loss": 1.0321,
"step": 217
},
{
"epoch": 0.017938695741616953,
"grad_norm": 2.130515743950604,
"learning_rate": 1.1945205479452055e-05,
"loss": 0.9939,
"step": 218
},
{
"epoch": 0.018020983336761984,
"grad_norm": 2.8848097718992296,
"learning_rate": 1.2e-05,
"loss": 1.0064,
"step": 219
},
{
"epoch": 0.018103270931907015,
"grad_norm": 1.496463088281579,
"learning_rate": 1.2054794520547945e-05,
"loss": 0.6077,
"step": 220
},
{
"epoch": 0.018185558527052045,
"grad_norm": 3.6292481030110935,
"learning_rate": 1.210958904109589e-05,
"loss": 1.0446,
"step": 221
},
{
"epoch": 0.01826784612219708,
"grad_norm": 2.252792644024641,
"learning_rate": 1.2164383561643837e-05,
"loss": 0.9739,
"step": 222
},
{
"epoch": 0.01835013371734211,
"grad_norm": 2.4478822538483755,
"learning_rate": 1.2219178082191782e-05,
"loss": 1.0131,
"step": 223
},
{
"epoch": 0.01843242131248714,
"grad_norm": 2.559717897830331,
"learning_rate": 1.2273972602739727e-05,
"loss": 1.0394,
"step": 224
},
{
"epoch": 0.018514708907632176,
"grad_norm": 2.869935242686829,
"learning_rate": 1.2328767123287673e-05,
"loss": 0.982,
"step": 225
},
{
"epoch": 0.018596996502777206,
"grad_norm": 2.5009663006221974,
"learning_rate": 1.2383561643835618e-05,
"loss": 1.0108,
"step": 226
},
{
"epoch": 0.018679284097922237,
"grad_norm": 2.9956405565150654,
"learning_rate": 1.2438356164383563e-05,
"loss": 0.9902,
"step": 227
},
{
"epoch": 0.01876157169306727,
"grad_norm": 2.674322004514903,
"learning_rate": 1.2493150684931508e-05,
"loss": 0.9927,
"step": 228
},
{
"epoch": 0.018843859288212302,
"grad_norm": 2.8674094236769583,
"learning_rate": 1.2547945205479453e-05,
"loss": 1.003,
"step": 229
},
{
"epoch": 0.018926146883357333,
"grad_norm": 2.9710081363188703,
"learning_rate": 1.2602739726027398e-05,
"loss": 0.9844,
"step": 230
},
{
"epoch": 0.019008434478502367,
"grad_norm": 2.98201549226896,
"learning_rate": 1.2657534246575343e-05,
"loss": 0.967,
"step": 231
},
{
"epoch": 0.019090722073647398,
"grad_norm": 2.903452559676373,
"learning_rate": 1.271232876712329e-05,
"loss": 1.0102,
"step": 232
},
{
"epoch": 0.01917300966879243,
"grad_norm": 2.5049333400477813,
"learning_rate": 1.2767123287671235e-05,
"loss": 1.0096,
"step": 233
},
{
"epoch": 0.01925529726393746,
"grad_norm": 2.6342420325330522,
"learning_rate": 1.282191780821918e-05,
"loss": 0.9718,
"step": 234
},
{
"epoch": 0.019337584859082494,
"grad_norm": 2.616314817819011,
"learning_rate": 1.2876712328767125e-05,
"loss": 0.9977,
"step": 235
},
{
"epoch": 0.019419872454227525,
"grad_norm": 2.420031810864845,
"learning_rate": 1.293150684931507e-05,
"loss": 1.0117,
"step": 236
},
{
"epoch": 0.019502160049372556,
"grad_norm": 2.9412487319960126,
"learning_rate": 1.2986301369863015e-05,
"loss": 1.0471,
"step": 237
},
{
"epoch": 0.01958444764451759,
"grad_norm": 2.7984406162708906,
"learning_rate": 1.3041095890410959e-05,
"loss": 0.9501,
"step": 238
},
{
"epoch": 0.01966673523966262,
"grad_norm": 4.841561737416111,
"learning_rate": 1.3095890410958904e-05,
"loss": 1.0138,
"step": 239
},
{
"epoch": 0.01974902283480765,
"grad_norm": 2.1778156992905577,
"learning_rate": 1.3150684931506849e-05,
"loss": 1.0101,
"step": 240
},
{
"epoch": 0.019831310429952686,
"grad_norm": 2.67809296527932,
"learning_rate": 1.3205479452054794e-05,
"loss": 0.982,
"step": 241
},
{
"epoch": 0.019913598025097717,
"grad_norm": 2.738306662356033,
"learning_rate": 1.3260273972602743e-05,
"loss": 0.9953,
"step": 242
},
{
"epoch": 0.019995885620242747,
"grad_norm": 3.69258760845872,
"learning_rate": 1.3315068493150686e-05,
"loss": 0.9933,
"step": 243
},
{
"epoch": 0.020078173215387782,
"grad_norm": 3.4285570541743096,
"learning_rate": 1.3369863013698631e-05,
"loss": 0.9891,
"step": 244
},
{
"epoch": 0.020160460810532813,
"grad_norm": 2.1884703037736175,
"learning_rate": 1.3424657534246576e-05,
"loss": 0.9615,
"step": 245
},
{
"epoch": 0.020242748405677843,
"grad_norm": 2.278997433805173,
"learning_rate": 1.3479452054794521e-05,
"loss": 0.9984,
"step": 246
},
{
"epoch": 0.020325036000822878,
"grad_norm": 0.9732502137516167,
"learning_rate": 1.3534246575342466e-05,
"loss": 0.5964,
"step": 247
},
{
"epoch": 0.02040732359596791,
"grad_norm": 4.111007905694721,
"learning_rate": 1.3589041095890412e-05,
"loss": 1.03,
"step": 248
},
{
"epoch": 0.02048961119111294,
"grad_norm": 2.104309544659177,
"learning_rate": 1.3643835616438357e-05,
"loss": 0.9696,
"step": 249
},
{
"epoch": 0.02057189878625797,
"grad_norm": 2.5670779853119665,
"learning_rate": 1.3698630136986302e-05,
"loss": 0.9589,
"step": 250
},
{
"epoch": 0.020654186381403004,
"grad_norm": 2.7898261074191777,
"learning_rate": 1.3753424657534247e-05,
"loss": 1.0084,
"step": 251
},
{
"epoch": 0.020736473976548035,
"grad_norm": 3.2009246830375204,
"learning_rate": 1.3808219178082194e-05,
"loss": 0.9911,
"step": 252
},
{
"epoch": 0.020818761571693066,
"grad_norm": 3.1563797863262777,
"learning_rate": 1.3863013698630139e-05,
"loss": 0.9947,
"step": 253
},
{
"epoch": 0.0209010491668381,
"grad_norm": 3.193090081286074,
"learning_rate": 1.3917808219178084e-05,
"loss": 1.0069,
"step": 254
},
{
"epoch": 0.02098333676198313,
"grad_norm": 5.521797116199944,
"learning_rate": 1.3972602739726029e-05,
"loss": 0.9842,
"step": 255
},
{
"epoch": 0.021065624357128162,
"grad_norm": 1.243014761274919,
"learning_rate": 1.4027397260273974e-05,
"loss": 0.6147,
"step": 256
},
{
"epoch": 0.021147911952273196,
"grad_norm": 3.191364616862045,
"learning_rate": 1.4082191780821919e-05,
"loss": 0.974,
"step": 257
},
{
"epoch": 0.021230199547418227,
"grad_norm": 2.93570172220106,
"learning_rate": 1.4136986301369864e-05,
"loss": 0.9719,
"step": 258
},
{
"epoch": 0.021312487142563258,
"grad_norm": 4.468162617805659,
"learning_rate": 1.419178082191781e-05,
"loss": 0.9904,
"step": 259
},
{
"epoch": 0.021394774737708292,
"grad_norm": 2.2571244653960862,
"learning_rate": 1.4246575342465754e-05,
"loss": 0.9613,
"step": 260
},
{
"epoch": 0.021477062332853323,
"grad_norm": 4.467563699694284,
"learning_rate": 1.43013698630137e-05,
"loss": 0.9944,
"step": 261
},
{
"epoch": 0.021559349927998354,
"grad_norm": 0.68889362412214,
"learning_rate": 1.4356164383561646e-05,
"loss": 0.5789,
"step": 262
},
{
"epoch": 0.021641637523143385,
"grad_norm": 0.6373164384054985,
"learning_rate": 1.4410958904109591e-05,
"loss": 0.5688,
"step": 263
},
{
"epoch": 0.02172392511828842,
"grad_norm": 3.597782460566262,
"learning_rate": 1.4465753424657537e-05,
"loss": 0.9776,
"step": 264
},
{
"epoch": 0.02180621271343345,
"grad_norm": 2.7541673143111347,
"learning_rate": 1.4520547945205482e-05,
"loss": 0.9927,
"step": 265
},
{
"epoch": 0.02188850030857848,
"grad_norm": 0.6805788182804722,
"learning_rate": 1.4575342465753427e-05,
"loss": 0.5971,
"step": 266
},
{
"epoch": 0.021970787903723515,
"grad_norm": 2.725379141853366,
"learning_rate": 1.463013698630137e-05,
"loss": 0.9675,
"step": 267
},
{
"epoch": 0.022053075498868546,
"grad_norm": 4.08013853272879,
"learning_rate": 1.4684931506849315e-05,
"loss": 0.9786,
"step": 268
},
{
"epoch": 0.022135363094013576,
"grad_norm": 2.5492247984913483,
"learning_rate": 1.473972602739726e-05,
"loss": 0.9988,
"step": 269
},
{
"epoch": 0.02221765068915861,
"grad_norm": 3.8860413387854327,
"learning_rate": 1.4794520547945205e-05,
"loss": 0.9697,
"step": 270
},
{
"epoch": 0.02229993828430364,
"grad_norm": 3.0719505820425925,
"learning_rate": 1.484931506849315e-05,
"loss": 0.9778,
"step": 271
},
{
"epoch": 0.022382225879448672,
"grad_norm": 3.065813452275364,
"learning_rate": 1.4904109589041097e-05,
"loss": 1.0114,
"step": 272
},
{
"epoch": 0.022464513474593707,
"grad_norm": 3.119520514603019,
"learning_rate": 1.4958904109589042e-05,
"loss": 1.0143,
"step": 273
},
{
"epoch": 0.022546801069738737,
"grad_norm": 2.8059490672957823,
"learning_rate": 1.5013698630136988e-05,
"loss": 0.9815,
"step": 274
},
{
"epoch": 0.022629088664883768,
"grad_norm": 2.6271007340037706,
"learning_rate": 1.5068493150684933e-05,
"loss": 1.0251,
"step": 275
},
{
"epoch": 0.0227113762600288,
"grad_norm": 3.114887825941429,
"learning_rate": 1.5123287671232878e-05,
"loss": 0.9722,
"step": 276
},
{
"epoch": 0.022793663855173833,
"grad_norm": 3.222134871844559,
"learning_rate": 1.5178082191780823e-05,
"loss": 0.9895,
"step": 277
},
{
"epoch": 0.022875951450318864,
"grad_norm": 0.8596732284566506,
"learning_rate": 1.5232876712328768e-05,
"loss": 0.6421,
"step": 278
},
{
"epoch": 0.022958239045463895,
"grad_norm": 2.688881192050172,
"learning_rate": 1.5287671232876713e-05,
"loss": 0.9709,
"step": 279
},
{
"epoch": 0.02304052664060893,
"grad_norm": 0.5908184070761948,
"learning_rate": 1.5342465753424658e-05,
"loss": 0.5813,
"step": 280
},
{
"epoch": 0.02312281423575396,
"grad_norm": 2.5626042733441565,
"learning_rate": 1.5397260273972603e-05,
"loss": 1.0054,
"step": 281
},
{
"epoch": 0.02320510183089899,
"grad_norm": 0.6319032426639426,
"learning_rate": 1.545205479452055e-05,
"loss": 0.569,
"step": 282
},
{
"epoch": 0.023287389426044025,
"grad_norm": 3.381429029921771,
"learning_rate": 1.5506849315068497e-05,
"loss": 0.9924,
"step": 283
},
{
"epoch": 0.023369677021189056,
"grad_norm": 0.6893518849945868,
"learning_rate": 1.556164383561644e-05,
"loss": 0.5947,
"step": 284
},
{
"epoch": 0.023451964616334087,
"grad_norm": 0.6030322287256665,
"learning_rate": 1.5616438356164384e-05,
"loss": 0.5849,
"step": 285
},
{
"epoch": 0.02353425221147912,
"grad_norm": 2.584371231162671,
"learning_rate": 1.567123287671233e-05,
"loss": 1.0113,
"step": 286
},
{
"epoch": 0.023616539806624152,
"grad_norm": 2.617374246670965,
"learning_rate": 1.5726027397260274e-05,
"loss": 0.9952,
"step": 287
},
{
"epoch": 0.023698827401769183,
"grad_norm": 3.131756380862052,
"learning_rate": 1.578082191780822e-05,
"loss": 0.9978,
"step": 288
},
{
"epoch": 0.023781114996914217,
"grad_norm": 0.7149086621817794,
"learning_rate": 1.5835616438356164e-05,
"loss": 0.6005,
"step": 289
},
{
"epoch": 0.023863402592059248,
"grad_norm": 2.8572031223595804,
"learning_rate": 1.589041095890411e-05,
"loss": 0.9764,
"step": 290
},
{
"epoch": 0.02394569018720428,
"grad_norm": 3.0067656548078525,
"learning_rate": 1.5945205479452054e-05,
"loss": 0.9931,
"step": 291
},
{
"epoch": 0.02402797778234931,
"grad_norm": 2.9396448545767067,
"learning_rate": 1.6000000000000003e-05,
"loss": 1.0167,
"step": 292
},
{
"epoch": 0.024110265377494344,
"grad_norm": 2.551576593689318,
"learning_rate": 1.6054794520547948e-05,
"loss": 0.9652,
"step": 293
},
{
"epoch": 0.024192552972639374,
"grad_norm": 3.4929495312083376,
"learning_rate": 1.6109589041095893e-05,
"loss": 0.9741,
"step": 294
},
{
"epoch": 0.024274840567784405,
"grad_norm": 0.5986861672946895,
"learning_rate": 1.6164383561643838e-05,
"loss": 0.5967,
"step": 295
},
{
"epoch": 0.02435712816292944,
"grad_norm": 2.3369563375899163,
"learning_rate": 1.6219178082191783e-05,
"loss": 0.9541,
"step": 296
},
{
"epoch": 0.02443941575807447,
"grad_norm": 3.115001072277964,
"learning_rate": 1.6273972602739728e-05,
"loss": 1.002,
"step": 297
},
{
"epoch": 0.0245217033532195,
"grad_norm": 3.594307440216849,
"learning_rate": 1.6328767123287673e-05,
"loss": 0.9483,
"step": 298
},
{
"epoch": 0.024603990948364535,
"grad_norm": 2.4315114201324977,
"learning_rate": 1.638356164383562e-05,
"loss": 0.9844,
"step": 299
},
{
"epoch": 0.024686278543509566,
"grad_norm": 3.3312431748162528,
"learning_rate": 1.6438356164383563e-05,
"loss": 1.0031,
"step": 300
},
{
"epoch": 0.024768566138654597,
"grad_norm": 2.7478721222497695,
"learning_rate": 1.649315068493151e-05,
"loss": 0.9942,
"step": 301
},
{
"epoch": 0.02485085373379963,
"grad_norm": 2.7443057694383097,
"learning_rate": 1.6547945205479454e-05,
"loss": 0.9841,
"step": 302
},
{
"epoch": 0.024933141328944662,
"grad_norm": 2.5333469665657797,
"learning_rate": 1.66027397260274e-05,
"loss": 0.9751,
"step": 303
},
{
"epoch": 0.025015428924089693,
"grad_norm": 3.161735273370277,
"learning_rate": 1.6657534246575344e-05,
"loss": 0.9687,
"step": 304
},
{
"epoch": 0.025097716519234724,
"grad_norm": 2.6737823247108183,
"learning_rate": 1.671232876712329e-05,
"loss": 0.9787,
"step": 305
},
{
"epoch": 0.025180004114379758,
"grad_norm": 0.6510425400067263,
"learning_rate": 1.6767123287671234e-05,
"loss": 0.5622,
"step": 306
},
{
"epoch": 0.02526229170952479,
"grad_norm": 4.574909987598007,
"learning_rate": 1.682191780821918e-05,
"loss": 0.9643,
"step": 307
},
{
"epoch": 0.02534457930466982,
"grad_norm": 3.4438804774031935,
"learning_rate": 1.6876712328767124e-05,
"loss": 0.9615,
"step": 308
},
{
"epoch": 0.025426866899814854,
"grad_norm": 2.9285136796976015,
"learning_rate": 1.693150684931507e-05,
"loss": 0.9527,
"step": 309
},
{
"epoch": 0.025509154494959885,
"grad_norm": 2.779888649016243,
"learning_rate": 1.6986301369863014e-05,
"loss": 0.9544,
"step": 310
},
{
"epoch": 0.025591442090104916,
"grad_norm": 2.7248520567063848,
"learning_rate": 1.7041095890410963e-05,
"loss": 0.9473,
"step": 311
},
{
"epoch": 0.02567372968524995,
"grad_norm": 3.5709762174348954,
"learning_rate": 1.7095890410958908e-05,
"loss": 0.9575,
"step": 312
},
{
"epoch": 0.02575601728039498,
"grad_norm": 3.0856327234258827,
"learning_rate": 1.715068493150685e-05,
"loss": 0.9652,
"step": 313
},
{
"epoch": 0.02583830487554001,
"grad_norm": 2.2692448164089343,
"learning_rate": 1.7205479452054795e-05,
"loss": 0.9735,
"step": 314
},
{
"epoch": 0.025920592470685046,
"grad_norm": 5.769054110868784,
"learning_rate": 1.726027397260274e-05,
"loss": 0.9703,
"step": 315
},
{
"epoch": 0.026002880065830077,
"grad_norm": 2.508893910476298,
"learning_rate": 1.7315068493150685e-05,
"loss": 0.944,
"step": 316
},
{
"epoch": 0.026085167660975107,
"grad_norm": 2.8832916992173767,
"learning_rate": 1.736986301369863e-05,
"loss": 0.9646,
"step": 317
},
{
"epoch": 0.026167455256120138,
"grad_norm": 2.919174367177141,
"learning_rate": 1.7424657534246575e-05,
"loss": 0.9642,
"step": 318
},
{
"epoch": 0.026249742851265172,
"grad_norm": 2.3758292544134068,
"learning_rate": 1.747945205479452e-05,
"loss": 0.9819,
"step": 319
},
{
"epoch": 0.026332030446410203,
"grad_norm": 2.8844662683768822,
"learning_rate": 1.7534246575342465e-05,
"loss": 0.9757,
"step": 320
},
{
"epoch": 0.026414318041555234,
"grad_norm": 2.2651505276443964,
"learning_rate": 1.7589041095890414e-05,
"loss": 0.9461,
"step": 321
},
{
"epoch": 0.02649660563670027,
"grad_norm": 3.148064595511082,
"learning_rate": 1.764383561643836e-05,
"loss": 0.9457,
"step": 322
},
{
"epoch": 0.0265788932318453,
"grad_norm": 2.593793697550568,
"learning_rate": 1.7698630136986304e-05,
"loss": 0.9564,
"step": 323
},
{
"epoch": 0.02666118082699033,
"grad_norm": 3.5777764577994637,
"learning_rate": 1.775342465753425e-05,
"loss": 0.9585,
"step": 324
},
{
"epoch": 0.026743468422135364,
"grad_norm": 2.5200344733829434,
"learning_rate": 1.7808219178082194e-05,
"loss": 0.9429,
"step": 325
},
{
"epoch": 0.026825756017280395,
"grad_norm": 0.7344214528472546,
"learning_rate": 1.786301369863014e-05,
"loss": 0.6191,
"step": 326
},
{
"epoch": 0.026908043612425426,
"grad_norm": 3.3825851018048962,
"learning_rate": 1.7917808219178085e-05,
"loss": 0.9739,
"step": 327
},
{
"epoch": 0.02699033120757046,
"grad_norm": 2.4626600175420212,
"learning_rate": 1.797260273972603e-05,
"loss": 0.9813,
"step": 328
},
{
"epoch": 0.02707261880271549,
"grad_norm": 2.604744324101538,
"learning_rate": 1.8027397260273975e-05,
"loss": 0.9605,
"step": 329
},
{
"epoch": 0.027154906397860522,
"grad_norm": 2.3443898191922408,
"learning_rate": 1.808219178082192e-05,
"loss": 0.968,
"step": 330
},
{
"epoch": 0.027237193993005556,
"grad_norm": 2.2972121260527274,
"learning_rate": 1.8136986301369865e-05,
"loss": 0.9636,
"step": 331
},
{
"epoch": 0.027319481588150587,
"grad_norm": 0.6704215743863139,
"learning_rate": 1.819178082191781e-05,
"loss": 0.5832,
"step": 332
},
{
"epoch": 0.027401769183295618,
"grad_norm": 2.5588332490587806,
"learning_rate": 1.8246575342465755e-05,
"loss": 0.967,
"step": 333
},
{
"epoch": 0.02748405677844065,
"grad_norm": 0.5729720504764441,
"learning_rate": 1.83013698630137e-05,
"loss": 0.5796,
"step": 334
},
{
"epoch": 0.027566344373585683,
"grad_norm": 0.536934165288964,
"learning_rate": 1.8356164383561645e-05,
"loss": 0.586,
"step": 335
},
{
"epoch": 0.027648631968730714,
"grad_norm": 2.729927929300927,
"learning_rate": 1.841095890410959e-05,
"loss": 1.0006,
"step": 336
},
{
"epoch": 0.027730919563875744,
"grad_norm": 2.9380300033617193,
"learning_rate": 1.8465753424657535e-05,
"loss": 0.9806,
"step": 337
},
{
"epoch": 0.02781320715902078,
"grad_norm": 3.1871007449922595,
"learning_rate": 1.852054794520548e-05,
"loss": 1.0205,
"step": 338
},
{
"epoch": 0.02789549475416581,
"grad_norm": 2.7551362648970454,
"learning_rate": 1.8575342465753426e-05,
"loss": 0.9843,
"step": 339
},
{
"epoch": 0.02797778234931084,
"grad_norm": 2.341899316621362,
"learning_rate": 1.863013698630137e-05,
"loss": 0.9828,
"step": 340
},
{
"epoch": 0.028060069944455875,
"grad_norm": 3.0041315739517143,
"learning_rate": 1.8684931506849316e-05,
"loss": 0.9599,
"step": 341
},
{
"epoch": 0.028142357539600905,
"grad_norm": 1.098290342373438,
"learning_rate": 1.873972602739726e-05,
"loss": 0.5762,
"step": 342
},
{
"epoch": 0.028224645134745936,
"grad_norm": 2.793401629061216,
"learning_rate": 1.8794520547945206e-05,
"loss": 0.9599,
"step": 343
},
{
"epoch": 0.02830693272989097,
"grad_norm": 3.381992225466734,
"learning_rate": 1.884931506849315e-05,
"loss": 1.0128,
"step": 344
},
{
"epoch": 0.028389220325036,
"grad_norm": 3.0552921674313107,
"learning_rate": 1.8904109589041096e-05,
"loss": 0.9683,
"step": 345
},
{
"epoch": 0.028471507920181032,
"grad_norm": 2.59026883064129,
"learning_rate": 1.895890410958904e-05,
"loss": 0.9361,
"step": 346
},
{
"epoch": 0.028553795515326063,
"grad_norm": 3.0842540515307473,
"learning_rate": 1.9013698630136986e-05,
"loss": 0.9697,
"step": 347
},
{
"epoch": 0.028636083110471097,
"grad_norm": 2.443425049236279,
"learning_rate": 1.906849315068493e-05,
"loss": 0.9183,
"step": 348
},
{
"epoch": 0.028718370705616128,
"grad_norm": 3.127867492745528,
"learning_rate": 1.9123287671232877e-05,
"loss": 0.9601,
"step": 349
},
{
"epoch": 0.02880065830076116,
"grad_norm": 4.402570399866093,
"learning_rate": 1.9178082191780822e-05,
"loss": 0.9303,
"step": 350
},
{
"epoch": 0.028882945895906193,
"grad_norm": 0.8543818428159927,
"learning_rate": 1.923287671232877e-05,
"loss": 0.5988,
"step": 351
},
{
"epoch": 0.028965233491051224,
"grad_norm": 0.7093532126289934,
"learning_rate": 1.9287671232876715e-05,
"loss": 0.5831,
"step": 352
},
{
"epoch": 0.029047521086196255,
"grad_norm": 0.6407564149823172,
"learning_rate": 1.934246575342466e-05,
"loss": 0.577,
"step": 353
},
{
"epoch": 0.02912980868134129,
"grad_norm": 3.390283574742443,
"learning_rate": 1.9397260273972606e-05,
"loss": 0.9609,
"step": 354
},
{
"epoch": 0.02921209627648632,
"grad_norm": 2.53734497566345,
"learning_rate": 1.945205479452055e-05,
"loss": 0.9909,
"step": 355
},
{
"epoch": 0.02929438387163135,
"grad_norm": 1.0115473868573372,
"learning_rate": 1.9506849315068496e-05,
"loss": 0.6035,
"step": 356
},
{
"epoch": 0.029376671466776385,
"grad_norm": 0.8686466035185451,
"learning_rate": 1.956164383561644e-05,
"loss": 0.5971,
"step": 357
},
{
"epoch": 0.029458959061921416,
"grad_norm": 3.039718625814903,
"learning_rate": 1.9616438356164386e-05,
"loss": 0.9912,
"step": 358
},
{
"epoch": 0.029541246657066447,
"grad_norm": 3.1175114788948473,
"learning_rate": 1.967123287671233e-05,
"loss": 0.9866,
"step": 359
},
{
"epoch": 0.02962353425221148,
"grad_norm": 6.758106134116968,
"learning_rate": 1.9726027397260276e-05,
"loss": 0.9847,
"step": 360
},
{
"epoch": 0.02970582184735651,
"grad_norm": 2.589972092841794,
"learning_rate": 1.978082191780822e-05,
"loss": 0.9565,
"step": 361
},
{
"epoch": 0.029788109442501542,
"grad_norm": 1.073769179644345,
"learning_rate": 1.9835616438356166e-05,
"loss": 0.6201,
"step": 362
},
{
"epoch": 0.029870397037646573,
"grad_norm": 2.620541255700163,
"learning_rate": 1.989041095890411e-05,
"loss": 0.9694,
"step": 363
},
{
"epoch": 0.029952684632791608,
"grad_norm": 2.9983273469412,
"learning_rate": 1.9945205479452057e-05,
"loss": 0.9517,
"step": 364
},
{
"epoch": 0.03003497222793664,
"grad_norm": 3.1705127831701176,
"learning_rate": 2e-05,
"loss": 0.9757,
"step": 365
},
{
"epoch": 0.03011725982308167,
"grad_norm": 3.0769206086851493,
"learning_rate": 1.9999999644807997e-05,
"loss": 0.9725,
"step": 366
},
{
"epoch": 0.030199547418226703,
"grad_norm": 2.6381794624352346,
"learning_rate": 1.999999857923201e-05,
"loss": 0.9579,
"step": 367
},
{
"epoch": 0.030281835013371734,
"grad_norm": 2.524417719057271,
"learning_rate": 1.999999680327212e-05,
"loss": 0.9491,
"step": 368
},
{
"epoch": 0.030364122608516765,
"grad_norm": 2.0772737485337958,
"learning_rate": 1.9999994316928445e-05,
"loss": 0.9802,
"step": 369
},
{
"epoch": 0.0304464102036618,
"grad_norm": 0.695305872906948,
"learning_rate": 1.9999991120201172e-05,
"loss": 0.6179,
"step": 370
},
{
"epoch": 0.03052869779880683,
"grad_norm": 2.034367122214282,
"learning_rate": 1.999998721309052e-05,
"loss": 0.9365,
"step": 371
},
{
"epoch": 0.03061098539395186,
"grad_norm": 2.5094859416224096,
"learning_rate": 1.999998259559677e-05,
"loss": 0.9806,
"step": 372
},
{
"epoch": 0.030693272989096895,
"grad_norm": 2.037387180631793,
"learning_rate": 1.9999977267720245e-05,
"loss": 0.9625,
"step": 373
},
{
"epoch": 0.030775560584241926,
"grad_norm": 1.9827245047395246,
"learning_rate": 1.999997122946133e-05,
"loss": 0.996,
"step": 374
},
{
"epoch": 0.030857848179386957,
"grad_norm": 2.000201005705768,
"learning_rate": 1.9999964480820448e-05,
"loss": 0.9247,
"step": 375
},
{
"epoch": 0.030940135774531988,
"grad_norm": 2.237696098262905,
"learning_rate": 1.999995702179809e-05,
"loss": 0.9432,
"step": 376
},
{
"epoch": 0.031022423369677022,
"grad_norm": 2.1572992959011668,
"learning_rate": 1.999994885239477e-05,
"loss": 0.9567,
"step": 377
},
{
"epoch": 0.031104710964822053,
"grad_norm": 2.5949178993773656,
"learning_rate": 1.999993997261108e-05,
"loss": 0.9523,
"step": 378
},
{
"epoch": 0.031186998559967084,
"grad_norm": 4.412522046641788,
"learning_rate": 1.9999930382447644e-05,
"loss": 0.9463,
"step": 379
},
{
"epoch": 0.03126928615511212,
"grad_norm": 4.095975078147534,
"learning_rate": 1.9999920081905148e-05,
"loss": 0.9562,
"step": 380
},
{
"epoch": 0.03135157375025715,
"grad_norm": 0.7238222599759508,
"learning_rate": 1.999990907098432e-05,
"loss": 0.6367,
"step": 381
},
{
"epoch": 0.03143386134540218,
"grad_norm": 2.051737393292375,
"learning_rate": 1.9999897349685948e-05,
"loss": 0.9396,
"step": 382
},
{
"epoch": 0.03151614894054721,
"grad_norm": 3.608873989338571,
"learning_rate": 1.999988491801086e-05,
"loss": 0.9427,
"step": 383
},
{
"epoch": 0.03159843653569224,
"grad_norm": 0.5731166749659096,
"learning_rate": 1.999987177595994e-05,
"loss": 0.6066,
"step": 384
},
{
"epoch": 0.03168072413083728,
"grad_norm": 2.7911800909686244,
"learning_rate": 1.9999857923534117e-05,
"loss": 0.9553,
"step": 385
},
{
"epoch": 0.03176301172598231,
"grad_norm": 0.5640032520210956,
"learning_rate": 1.9999843360734384e-05,
"loss": 0.6089,
"step": 386
},
{
"epoch": 0.03184529932112734,
"grad_norm": 3.218289339029279,
"learning_rate": 1.999982808756177e-05,
"loss": 1.002,
"step": 387
},
{
"epoch": 0.03192758691627237,
"grad_norm": 0.5298496199217386,
"learning_rate": 1.999981210401736e-05,
"loss": 0.6014,
"step": 388
},
{
"epoch": 0.0320098745114174,
"grad_norm": 2.1651032679205544,
"learning_rate": 1.9999795410102288e-05,
"loss": 0.977,
"step": 389
},
{
"epoch": 0.03209216210656243,
"grad_norm": 3.0876660454466336,
"learning_rate": 1.999977800581775e-05,
"loss": 0.954,
"step": 390
},
{
"epoch": 0.03217444970170747,
"grad_norm": 2.8016809296721186,
"learning_rate": 1.999975989116497e-05,
"loss": 0.9773,
"step": 391
},
{
"epoch": 0.0322567372968525,
"grad_norm": 2.2686954346227584,
"learning_rate": 1.999974106614524e-05,
"loss": 0.9284,
"step": 392
},
{
"epoch": 0.03233902489199753,
"grad_norm": 2.848599719139828,
"learning_rate": 1.9999721530759896e-05,
"loss": 0.9666,
"step": 393
},
{
"epoch": 0.03242131248714256,
"grad_norm": 2.5480580332195792,
"learning_rate": 1.9999701285010327e-05,
"loss": 0.9748,
"step": 394
},
{
"epoch": 0.032503600082287594,
"grad_norm": 3.0659568674712587,
"learning_rate": 1.999968032889797e-05,
"loss": 0.9773,
"step": 395
},
{
"epoch": 0.032585887677432625,
"grad_norm": 3.2486686691126607,
"learning_rate": 1.9999658662424318e-05,
"loss": 0.9378,
"step": 396
},
{
"epoch": 0.032668175272577656,
"grad_norm": 2.231555735516029,
"learning_rate": 1.9999636285590903e-05,
"loss": 0.9402,
"step": 397
},
{
"epoch": 0.03275046286772269,
"grad_norm": 7.750954267677904,
"learning_rate": 1.999961319839932e-05,
"loss": 0.9212,
"step": 398
},
{
"epoch": 0.032832750462867724,
"grad_norm": 3.9379616174216747,
"learning_rate": 1.9999589400851208e-05,
"loss": 0.957,
"step": 399
},
{
"epoch": 0.032915038058012755,
"grad_norm": 3.09592161673104,
"learning_rate": 1.9999564892948254e-05,
"loss": 0.9644,
"step": 400
},
{
"epoch": 0.032997325653157786,
"grad_norm": 0.6258510816084707,
"learning_rate": 1.9999539674692206e-05,
"loss": 0.6,
"step": 401
},
{
"epoch": 0.03307961324830282,
"grad_norm": 2.757532242911201,
"learning_rate": 1.9999513746084848e-05,
"loss": 0.9627,
"step": 402
},
{
"epoch": 0.03316190084344785,
"grad_norm": 0.518069489983011,
"learning_rate": 1.999948710712803e-05,
"loss": 0.5736,
"step": 403
},
{
"epoch": 0.033244188438592885,
"grad_norm": 2.7302377830347293,
"learning_rate": 1.9999459757823632e-05,
"loss": 0.9452,
"step": 404
},
{
"epoch": 0.033326476033737916,
"grad_norm": 3.8829507326351678,
"learning_rate": 1.9999431698173614e-05,
"loss": 0.9501,
"step": 405
},
{
"epoch": 0.03340876362888295,
"grad_norm": 3.030860642634053,
"learning_rate": 1.9999402928179953e-05,
"loss": 0.935,
"step": 406
},
{
"epoch": 0.03349105122402798,
"grad_norm": 2.7297517789446735,
"learning_rate": 1.99993734478447e-05,
"loss": 0.9816,
"step": 407
},
{
"epoch": 0.03357333881917301,
"grad_norm": 2.9131211283428864,
"learning_rate": 1.999934325716995e-05,
"loss": 0.953,
"step": 408
},
{
"epoch": 0.03365562641431804,
"grad_norm": 2.8724758175032457,
"learning_rate": 1.999931235615785e-05,
"loss": 0.9543,
"step": 409
},
{
"epoch": 0.03373791400946307,
"grad_norm": 3.8558067751787894,
"learning_rate": 1.999928074481059e-05,
"loss": 0.9024,
"step": 410
},
{
"epoch": 0.03382020160460811,
"grad_norm": 4.890426251595657,
"learning_rate": 1.9999248423130414e-05,
"loss": 0.9557,
"step": 411
},
{
"epoch": 0.03390248919975314,
"grad_norm": 3.9224502088816307,
"learning_rate": 1.9999215391119623e-05,
"loss": 0.9625,
"step": 412
},
{
"epoch": 0.03398477679489817,
"grad_norm": 4.121169405356662,
"learning_rate": 1.9999181648780564e-05,
"loss": 0.9836,
"step": 413
},
{
"epoch": 0.0340670643900432,
"grad_norm": 3.2570143865225365,
"learning_rate": 1.999914719611563e-05,
"loss": 0.9548,
"step": 414
},
{
"epoch": 0.03414935198518823,
"grad_norm": 0.8551591188426197,
"learning_rate": 1.999911203312727e-05,
"loss": 0.6257,
"step": 415
},
{
"epoch": 0.03423163958033326,
"grad_norm": 2.282348243685617,
"learning_rate": 1.9999076159817984e-05,
"loss": 0.9534,
"step": 416
},
{
"epoch": 0.0343139271754783,
"grad_norm": 3.1849388817078417,
"learning_rate": 1.999903957619032e-05,
"loss": 0.9559,
"step": 417
},
{
"epoch": 0.03439621477062333,
"grad_norm": 3.0160267374462744,
"learning_rate": 1.9999002282246877e-05,
"loss": 0.9414,
"step": 418
},
{
"epoch": 0.03447850236576836,
"grad_norm": 2.8630460192439484,
"learning_rate": 1.99989642779903e-05,
"loss": 0.97,
"step": 419
},
{
"epoch": 0.03456078996091339,
"grad_norm": 0.6092993503428186,
"learning_rate": 1.999892556342329e-05,
"loss": 0.5762,
"step": 420
},
{
"epoch": 0.03464307755605842,
"grad_norm": 3.558089457861364,
"learning_rate": 1.9998886138548597e-05,
"loss": 0.9674,
"step": 421
},
{
"epoch": 0.034725365151203454,
"grad_norm": 0.5392883644170888,
"learning_rate": 1.9998846003369028e-05,
"loss": 0.6002,
"step": 422
},
{
"epoch": 0.03480765274634849,
"grad_norm": 2.4265611825364175,
"learning_rate": 1.9998805157887432e-05,
"loss": 0.9469,
"step": 423
},
{
"epoch": 0.03488994034149352,
"grad_norm": 2.5084390180607508,
"learning_rate": 1.9998763602106704e-05,
"loss": 0.9547,
"step": 424
},
{
"epoch": 0.03497222793663855,
"grad_norm": 3.0592802155387284,
"learning_rate": 1.99987213360298e-05,
"loss": 0.9549,
"step": 425
},
{
"epoch": 0.035054515531783584,
"grad_norm": 3.0606106243138353,
"learning_rate": 1.9998678359659726e-05,
"loss": 0.925,
"step": 426
},
{
"epoch": 0.035136803126928615,
"grad_norm": 0.5614840770252022,
"learning_rate": 1.999863467299953e-05,
"loss": 0.6226,
"step": 427
},
{
"epoch": 0.035219090722073645,
"grad_norm": 2.3274481514972636,
"learning_rate": 1.9998590276052318e-05,
"loss": 0.9627,
"step": 428
},
{
"epoch": 0.035301378317218676,
"grad_norm": 0.5247325522573751,
"learning_rate": 1.999854516882124e-05,
"loss": 0.5626,
"step": 429
},
{
"epoch": 0.035383665912363714,
"grad_norm": 2.4963541117374635,
"learning_rate": 1.999849935130951e-05,
"loss": 0.9198,
"step": 430
},
{
"epoch": 0.035465953507508745,
"grad_norm": 2.470517097187284,
"learning_rate": 1.999845282352037e-05,
"loss": 0.9433,
"step": 431
},
{
"epoch": 0.035548241102653776,
"grad_norm": 2.7560008424762183,
"learning_rate": 1.9998405585457134e-05,
"loss": 0.9428,
"step": 432
},
{
"epoch": 0.035630528697798806,
"grad_norm": 2.7637029961336226,
"learning_rate": 1.9998357637123157e-05,
"loss": 0.942,
"step": 433
},
{
"epoch": 0.03571281629294384,
"grad_norm": 2.9100289752309045,
"learning_rate": 1.9998308978521842e-05,
"loss": 0.9457,
"step": 434
},
{
"epoch": 0.03579510388808887,
"grad_norm": 4.313071561196342,
"learning_rate": 1.9998259609656645e-05,
"loss": 0.9367,
"step": 435
},
{
"epoch": 0.035877391483233906,
"grad_norm": 2.9430306639688384,
"learning_rate": 1.999820953053108e-05,
"loss": 0.9292,
"step": 436
},
{
"epoch": 0.03595967907837894,
"grad_norm": 3.336500502830984,
"learning_rate": 1.9998158741148695e-05,
"loss": 0.9517,
"step": 437
},
{
"epoch": 0.03604196667352397,
"grad_norm": 2.830315148432978,
"learning_rate": 1.99981072415131e-05,
"loss": 0.9619,
"step": 438
},
{
"epoch": 0.036124254268669,
"grad_norm": 2.9628110908182506,
"learning_rate": 1.9998055031627964e-05,
"loss": 0.9342,
"step": 439
},
{
"epoch": 0.03620654186381403,
"grad_norm": 5.046468138436623,
"learning_rate": 1.9998002111496986e-05,
"loss": 0.9577,
"step": 440
},
{
"epoch": 0.03628882945895906,
"grad_norm": 3.1781915402537324,
"learning_rate": 1.9997948481123925e-05,
"loss": 0.9275,
"step": 441
},
{
"epoch": 0.03637111705410409,
"grad_norm": 3.291481831836819,
"learning_rate": 1.9997894140512595e-05,
"loss": 0.9504,
"step": 442
},
{
"epoch": 0.03645340464924913,
"grad_norm": 3.1084220240196254,
"learning_rate": 1.9997839089666854e-05,
"loss": 0.9236,
"step": 443
},
{
"epoch": 0.03653569224439416,
"grad_norm": 3.1887037749162093,
"learning_rate": 1.9997783328590613e-05,
"loss": 0.8855,
"step": 444
},
{
"epoch": 0.03661797983953919,
"grad_norm": 3.305256714504642,
"learning_rate": 1.9997726857287834e-05,
"loss": 0.9552,
"step": 445
},
{
"epoch": 0.03670026743468422,
"grad_norm": 4.754531864085289,
"learning_rate": 1.9997669675762528e-05,
"loss": 0.9504,
"step": 446
},
{
"epoch": 0.03678255502982925,
"grad_norm": 2.474649426046985,
"learning_rate": 1.9997611784018754e-05,
"loss": 0.9518,
"step": 447
},
{
"epoch": 0.03686484262497428,
"grad_norm": 2.880288649426941,
"learning_rate": 1.9997553182060633e-05,
"loss": 0.8702,
"step": 448
},
{
"epoch": 0.03694713022011932,
"grad_norm": 2.9619541365703976,
"learning_rate": 1.999749386989232e-05,
"loss": 0.948,
"step": 449
},
{
"epoch": 0.03702941781526435,
"grad_norm": 3.0040457692945552,
"learning_rate": 1.999743384751803e-05,
"loss": 0.9161,
"step": 450
},
{
"epoch": 0.03711170541040938,
"grad_norm": 0.6917840645754628,
"learning_rate": 1.999737311494203e-05,
"loss": 0.5999,
"step": 451
},
{
"epoch": 0.03719399300555441,
"grad_norm": 2.500969399378362,
"learning_rate": 1.9997311672168632e-05,
"loss": 0.9321,
"step": 452
},
{
"epoch": 0.037276280600699443,
"grad_norm": 3.4756867592830076,
"learning_rate": 1.99972495192022e-05,
"loss": 0.9468,
"step": 453
},
{
"epoch": 0.037358568195844474,
"grad_norm": 2.4507954914499974,
"learning_rate": 1.9997186656047154e-05,
"loss": 0.9367,
"step": 454
},
{
"epoch": 0.037440855790989505,
"grad_norm": 2.3319357748120066,
"learning_rate": 1.9997123082707954e-05,
"loss": 0.9506,
"step": 455
},
{
"epoch": 0.03752314338613454,
"grad_norm": 2.4614553831803896,
"learning_rate": 1.999705879918912e-05,
"loss": 0.9812,
"step": 456
},
{
"epoch": 0.037605430981279574,
"grad_norm": 2.7421103733102665,
"learning_rate": 1.999699380549521e-05,
"loss": 0.975,
"step": 457
},
{
"epoch": 0.037687718576424604,
"grad_norm": 3.193134683800622,
"learning_rate": 1.9996928101630853e-05,
"loss": 0.9462,
"step": 458
},
{
"epoch": 0.037770006171569635,
"grad_norm": 2.4788434065823353,
"learning_rate": 1.999686168760071e-05,
"loss": 0.9442,
"step": 459
},
{
"epoch": 0.037852293766714666,
"grad_norm": 2.67715161966991,
"learning_rate": 1.99967945634095e-05,
"loss": 0.9497,
"step": 460
},
{
"epoch": 0.0379345813618597,
"grad_norm": 2.8286753306256234,
"learning_rate": 1.9996726729061995e-05,
"loss": 0.9371,
"step": 461
},
{
"epoch": 0.038016868957004735,
"grad_norm": 2.494636914608068,
"learning_rate": 1.999665818456301e-05,
"loss": 0.9369,
"step": 462
},
{
"epoch": 0.038099156552149765,
"grad_norm": 3.3684641604813312,
"learning_rate": 1.9996588929917413e-05,
"loss": 0.9167,
"step": 463
},
{
"epoch": 0.038181444147294796,
"grad_norm": 2.8300347810651836,
"learning_rate": 1.9996518965130126e-05,
"loss": 0.96,
"step": 464
},
{
"epoch": 0.03826373174243983,
"grad_norm": 2.7216914732590634,
"learning_rate": 1.9996448290206117e-05,
"loss": 0.9587,
"step": 465
},
{
"epoch": 0.03834601933758486,
"grad_norm": 2.8897584926398223,
"learning_rate": 1.999637690515041e-05,
"loss": 0.9424,
"step": 466
},
{
"epoch": 0.03842830693272989,
"grad_norm": 2.6782745713753364,
"learning_rate": 1.9996304809968074e-05,
"loss": 0.9421,
"step": 467
},
{
"epoch": 0.03851059452787492,
"grad_norm": 0.8391702922649521,
"learning_rate": 1.9996232004664232e-05,
"loss": 0.6291,
"step": 468
},
{
"epoch": 0.03859288212301996,
"grad_norm": 2.9110538284406213,
"learning_rate": 1.9996158489244054e-05,
"loss": 0.9548,
"step": 469
},
{
"epoch": 0.03867516971816499,
"grad_norm": 2.9735024191976813,
"learning_rate": 1.9996084263712764e-05,
"loss": 0.9397,
"step": 470
},
{
"epoch": 0.03875745731331002,
"grad_norm": 2.459802449779267,
"learning_rate": 1.9996009328075635e-05,
"loss": 0.9516,
"step": 471
},
{
"epoch": 0.03883974490845505,
"grad_norm": 1.4795476906818943,
"learning_rate": 1.999593368233799e-05,
"loss": 0.6175,
"step": 472
},
{
"epoch": 0.03892203250360008,
"grad_norm": 2.7329559825050844,
"learning_rate": 1.9995857326505202e-05,
"loss": 0.9279,
"step": 473
},
{
"epoch": 0.03900432009874511,
"grad_norm": 2.7310837617231307,
"learning_rate": 1.999578026058269e-05,
"loss": 0.9325,
"step": 474
},
{
"epoch": 0.03908660769389015,
"grad_norm": 3.580150174543716,
"learning_rate": 1.999570248457594e-05,
"loss": 0.9403,
"step": 475
},
{
"epoch": 0.03916889528903518,
"grad_norm": 3.518367412394758,
"learning_rate": 1.9995623998490473e-05,
"loss": 0.9346,
"step": 476
},
{
"epoch": 0.03925118288418021,
"grad_norm": 2.1655004063703167,
"learning_rate": 1.999554480233186e-05,
"loss": 0.9294,
"step": 477
},
{
"epoch": 0.03933347047932524,
"grad_norm": 2.857429287491222,
"learning_rate": 1.9995464896105727e-05,
"loss": 0.9201,
"step": 478
},
{
"epoch": 0.03941575807447027,
"grad_norm": 2.3230944603500094,
"learning_rate": 1.999538427981776e-05,
"loss": 0.9172,
"step": 479
},
{
"epoch": 0.0394980456696153,
"grad_norm": 2.686091492583088,
"learning_rate": 1.9995302953473673e-05,
"loss": 0.7009,
"step": 480
},
{
"epoch": 0.039580333264760334,
"grad_norm": 2.5370139223659445,
"learning_rate": 1.999522091707925e-05,
"loss": 0.9547,
"step": 481
},
{
"epoch": 0.03966262085990537,
"grad_norm": 2.9114624346952787,
"learning_rate": 1.9995138170640322e-05,
"loss": 0.9309,
"step": 482
},
{
"epoch": 0.0397449084550504,
"grad_norm": 2.636772148383987,
"learning_rate": 1.9995054714162757e-05,
"loss": 0.9224,
"step": 483
},
{
"epoch": 0.03982719605019543,
"grad_norm": 2.3887969483327005,
"learning_rate": 1.9994970547652495e-05,
"loss": 0.9509,
"step": 484
},
{
"epoch": 0.039909483645340464,
"grad_norm": 2.9497130431080256,
"learning_rate": 1.9994885671115506e-05,
"loss": 0.9693,
"step": 485
},
{
"epoch": 0.039991771240485495,
"grad_norm": 2.225873777913106,
"learning_rate": 1.9994800084557826e-05,
"loss": 0.9382,
"step": 486
},
{
"epoch": 0.040074058835630526,
"grad_norm": 3.015548118510522,
"learning_rate": 1.9994713787985534e-05,
"loss": 0.9084,
"step": 487
},
{
"epoch": 0.040156346430775564,
"grad_norm": 3.2147762822609787,
"learning_rate": 1.9994626781404754e-05,
"loss": 0.9432,
"step": 488
},
{
"epoch": 0.040238634025920594,
"grad_norm": 2.732749831828487,
"learning_rate": 1.9994539064821676e-05,
"loss": 0.9493,
"step": 489
},
{
"epoch": 0.040320921621065625,
"grad_norm": 2.718095114325169,
"learning_rate": 1.9994450638242524e-05,
"loss": 0.6999,
"step": 490
},
{
"epoch": 0.040403209216210656,
"grad_norm": 1.192110613853859,
"learning_rate": 1.9994361501673586e-05,
"loss": 0.606,
"step": 491
},
{
"epoch": 0.04048549681135569,
"grad_norm": 2.6545275290481523,
"learning_rate": 1.9994271655121187e-05,
"loss": 0.9562,
"step": 492
},
{
"epoch": 0.04056778440650072,
"grad_norm": 2.6306786770452217,
"learning_rate": 1.999418109859171e-05,
"loss": 0.932,
"step": 493
},
{
"epoch": 0.040650072001645755,
"grad_norm": 0.7723300623794189,
"learning_rate": 1.99940898320916e-05,
"loss": 0.6167,
"step": 494
},
{
"epoch": 0.040732359596790786,
"grad_norm": 3.4539680548732075,
"learning_rate": 1.9993997855627323e-05,
"loss": 0.9547,
"step": 495
},
{
"epoch": 0.04081464719193582,
"grad_norm": 8.174151834055909,
"learning_rate": 1.9993905169205425e-05,
"loss": 0.9532,
"step": 496
},
{
"epoch": 0.04089693478708085,
"grad_norm": 2.4333462034983517,
"learning_rate": 1.9993811772832487e-05,
"loss": 0.9201,
"step": 497
},
{
"epoch": 0.04097922238222588,
"grad_norm": 2.621241890180304,
"learning_rate": 1.9993717666515143e-05,
"loss": 0.9336,
"step": 498
},
{
"epoch": 0.04106150997737091,
"grad_norm": 2.8830815398438308,
"learning_rate": 1.999362285026008e-05,
"loss": 0.9254,
"step": 499
},
{
"epoch": 0.04114379757251594,
"grad_norm": 3.0315366250694136,
"learning_rate": 1.9993527324074028e-05,
"loss": 0.9272,
"step": 500
},
{
"epoch": 0.04122608516766098,
"grad_norm": 2.657554413096405,
"learning_rate": 1.999343108796378e-05,
"loss": 0.9462,
"step": 501
},
{
"epoch": 0.04130837276280601,
"grad_norm": 2.905472644448609,
"learning_rate": 1.999333414193617e-05,
"loss": 0.9034,
"step": 502
},
{
"epoch": 0.04139066035795104,
"grad_norm": 3.925086807406567,
"learning_rate": 1.9993236485998085e-05,
"loss": 0.9315,
"step": 503
},
{
"epoch": 0.04147294795309607,
"grad_norm": 3.0313048521155146,
"learning_rate": 1.999313812015646e-05,
"loss": 0.9535,
"step": 504
},
{
"epoch": 0.0415552355482411,
"grad_norm": 2.962993951360446,
"learning_rate": 1.9993039044418286e-05,
"loss": 0.9309,
"step": 505
},
{
"epoch": 0.04163752314338613,
"grad_norm": 0.6779011051688715,
"learning_rate": 1.99929392587906e-05,
"loss": 0.5869,
"step": 506
},
{
"epoch": 0.04171981073853117,
"grad_norm": 2.579639640184937,
"learning_rate": 1.9992838763280488e-05,
"loss": 0.9118,
"step": 507
},
{
"epoch": 0.0418020983336762,
"grad_norm": 2.1450772300859655,
"learning_rate": 1.9992737557895093e-05,
"loss": 0.932,
"step": 508
},
{
"epoch": 0.04188438592882123,
"grad_norm": 2.4058977622816977,
"learning_rate": 1.9992635642641605e-05,
"loss": 0.9301,
"step": 509
},
{
"epoch": 0.04196667352396626,
"grad_norm": 2.4723871593300584,
"learning_rate": 1.999253301752726e-05,
"loss": 0.9362,
"step": 510
},
{
"epoch": 0.04204896111911129,
"grad_norm": 2.7787980954607616,
"learning_rate": 1.999242968255935e-05,
"loss": 0.949,
"step": 511
},
{
"epoch": 0.042131248714256324,
"grad_norm": 2.7091957078534783,
"learning_rate": 1.9992325637745214e-05,
"loss": 0.8939,
"step": 512
},
{
"epoch": 0.042213536309401355,
"grad_norm": 3.104398485557938,
"learning_rate": 1.9992220883092247e-05,
"loss": 0.9201,
"step": 513
},
{
"epoch": 0.04229582390454639,
"grad_norm": 2.688893801232366,
"learning_rate": 1.9992115418607886e-05,
"loss": 0.9314,
"step": 514
},
{
"epoch": 0.04237811149969142,
"grad_norm": 0.6175757936794599,
"learning_rate": 1.999200924429963e-05,
"loss": 0.5823,
"step": 515
},
{
"epoch": 0.042460399094836454,
"grad_norm": 2.134638530502557,
"learning_rate": 1.9991902360175017e-05,
"loss": 0.8988,
"step": 516
},
{
"epoch": 0.042542686689981485,
"grad_norm": 2.660777130272323,
"learning_rate": 1.9991794766241638e-05,
"loss": 0.9058,
"step": 517
},
{
"epoch": 0.042624974285126516,
"grad_norm": 2.519959303045957,
"learning_rate": 1.9991686462507137e-05,
"loss": 0.9157,
"step": 518
},
{
"epoch": 0.042707261880271546,
"grad_norm": 0.5033254525320345,
"learning_rate": 1.9991577448979213e-05,
"loss": 0.5637,
"step": 519
},
{
"epoch": 0.042789549475416584,
"grad_norm": 2.3638963921206777,
"learning_rate": 1.9991467725665604e-05,
"loss": 0.9532,
"step": 520
},
{
"epoch": 0.042871837070561615,
"grad_norm": 2.760667379358993,
"learning_rate": 1.9991357292574106e-05,
"loss": 0.9194,
"step": 521
},
{
"epoch": 0.042954124665706646,
"grad_norm": 2.285449190484726,
"learning_rate": 1.9991246149712564e-05,
"loss": 0.854,
"step": 522
},
{
"epoch": 0.04303641226085168,
"grad_norm": 2.9222709070685315,
"learning_rate": 1.9991134297088877e-05,
"loss": 0.9534,
"step": 523
},
{
"epoch": 0.04311869985599671,
"grad_norm": 3.1630611007009355,
"learning_rate": 1.9991021734710988e-05,
"loss": 0.9505,
"step": 524
},
{
"epoch": 0.04320098745114174,
"grad_norm": 3.174869013367673,
"learning_rate": 1.999090846258689e-05,
"loss": 0.964,
"step": 525
},
{
"epoch": 0.04328327504628677,
"grad_norm": 2.4328576962151693,
"learning_rate": 1.9990794480724634e-05,
"loss": 0.9084,
"step": 526
},
{
"epoch": 0.04336556264143181,
"grad_norm": 0.5700103881605539,
"learning_rate": 1.9990679789132317e-05,
"loss": 0.5734,
"step": 527
},
{
"epoch": 0.04344785023657684,
"grad_norm": 2.392627489613796,
"learning_rate": 1.9990564387818087e-05,
"loss": 0.916,
"step": 528
},
{
"epoch": 0.04353013783172187,
"grad_norm": 3.2074775648239453,
"learning_rate": 1.999044827679014e-05,
"loss": 0.9095,
"step": 529
},
{
"epoch": 0.0436124254268669,
"grad_norm": 3.140601191667111,
"learning_rate": 1.999033145605672e-05,
"loss": 0.904,
"step": 530
},
{
"epoch": 0.04369471302201193,
"grad_norm": 2.3743918081273505,
"learning_rate": 1.9990213925626135e-05,
"loss": 0.9173,
"step": 531
},
{
"epoch": 0.04377700061715696,
"grad_norm": 2.803625633325397,
"learning_rate": 1.999009568550673e-05,
"loss": 0.9425,
"step": 532
},
{
"epoch": 0.043859288212302,
"grad_norm": 2.624304052527756,
"learning_rate": 1.9989976735706903e-05,
"loss": 0.8778,
"step": 533
},
{
"epoch": 0.04394157580744703,
"grad_norm": 3.611007788459353,
"learning_rate": 1.9989857076235105e-05,
"loss": 0.9454,
"step": 534
},
{
"epoch": 0.04402386340259206,
"grad_norm": 3.0477796789876885,
"learning_rate": 1.9989736707099836e-05,
"loss": 0.9301,
"step": 535
},
{
"epoch": 0.04410615099773709,
"grad_norm": 3.661229035903915,
"learning_rate": 1.998961562830965e-05,
"loss": 0.9234,
"step": 536
},
{
"epoch": 0.04418843859288212,
"grad_norm": 3.014314493078093,
"learning_rate": 1.9989493839873144e-05,
"loss": 0.9205,
"step": 537
},
{
"epoch": 0.04427072618802715,
"grad_norm": 3.1607667446866348,
"learning_rate": 1.998937134179897e-05,
"loss": 0.9184,
"step": 538
},
{
"epoch": 0.044353013783172184,
"grad_norm": 0.5679302245778807,
"learning_rate": 1.9989248134095835e-05,
"loss": 0.5808,
"step": 539
},
{
"epoch": 0.04443530137831722,
"grad_norm": 3.4927267069905827,
"learning_rate": 1.9989124216772486e-05,
"loss": 0.9068,
"step": 540
},
{
"epoch": 0.04451758897346225,
"grad_norm": 3.2792902354283524,
"learning_rate": 1.9988999589837727e-05,
"loss": 0.9441,
"step": 541
},
{
"epoch": 0.04459987656860728,
"grad_norm": 3.2813608886269465,
"learning_rate": 1.9988874253300415e-05,
"loss": 0.9135,
"step": 542
},
{
"epoch": 0.044682164163752314,
"grad_norm": 3.6532563430030387,
"learning_rate": 1.9988748207169448e-05,
"loss": 0.9124,
"step": 543
},
{
"epoch": 0.044764451758897345,
"grad_norm": 3.0411510483789708,
"learning_rate": 1.9988621451453783e-05,
"loss": 0.9437,
"step": 544
},
{
"epoch": 0.044846739354042375,
"grad_norm": 2.947067350806481,
"learning_rate": 1.9988493986162426e-05,
"loss": 0.9377,
"step": 545
},
{
"epoch": 0.04492902694918741,
"grad_norm": 3.733984375480931,
"learning_rate": 1.9988365811304434e-05,
"loss": 0.9302,
"step": 546
},
{
"epoch": 0.045011314544332444,
"grad_norm": 0.5973399530190582,
"learning_rate": 1.99882369268889e-05,
"loss": 0.5985,
"step": 547
},
{
"epoch": 0.045093602139477475,
"grad_norm": 3.1946558451893483,
"learning_rate": 1.9988107332924997e-05,
"loss": 0.9306,
"step": 548
},
{
"epoch": 0.045175889734622506,
"grad_norm": 3.0518182224655184,
"learning_rate": 1.998797702942192e-05,
"loss": 0.9238,
"step": 549
},
{
"epoch": 0.045258177329767536,
"grad_norm": 0.5186994011171457,
"learning_rate": 1.9987846016388927e-05,
"loss": 0.5534,
"step": 550
},
{
"epoch": 0.04534046492491257,
"grad_norm": 2.9538180602678072,
"learning_rate": 1.9987714293835326e-05,
"loss": 0.9131,
"step": 551
},
{
"epoch": 0.0454227525200576,
"grad_norm": 3.583039419798021,
"learning_rate": 1.9987581861770476e-05,
"loss": 0.931,
"step": 552
},
{
"epoch": 0.045505040115202636,
"grad_norm": 3.872167117824797,
"learning_rate": 1.9987448720203783e-05,
"loss": 0.9149,
"step": 553
},
{
"epoch": 0.045587327710347667,
"grad_norm": 0.5153323660807152,
"learning_rate": 1.9987314869144704e-05,
"loss": 0.5707,
"step": 554
},
{
"epoch": 0.0456696153054927,
"grad_norm": 3.2458016621373162,
"learning_rate": 1.9987180308602752e-05,
"loss": 0.9481,
"step": 555
},
{
"epoch": 0.04575190290063773,
"grad_norm": 0.5131089745749331,
"learning_rate": 1.998704503858748e-05,
"loss": 0.6107,
"step": 556
},
{
"epoch": 0.04583419049578276,
"grad_norm": 3.826718669936501,
"learning_rate": 1.99869090591085e-05,
"loss": 0.9334,
"step": 557
},
{
"epoch": 0.04591647809092779,
"grad_norm": 2.808877894852513,
"learning_rate": 1.9986772370175475e-05,
"loss": 0.9313,
"step": 558
},
{
"epoch": 0.04599876568607283,
"grad_norm": 3.429756806838896,
"learning_rate": 1.998663497179811e-05,
"loss": 0.9041,
"step": 559
},
{
"epoch": 0.04608105328121786,
"grad_norm": 3.927553685701978,
"learning_rate": 1.998649686398617e-05,
"loss": 0.9229,
"step": 560
},
{
"epoch": 0.04616334087636289,
"grad_norm": 4.358404357254217,
"learning_rate": 1.9986358046749463e-05,
"loss": 0.9453,
"step": 561
},
{
"epoch": 0.04624562847150792,
"grad_norm": 0.6974205247527027,
"learning_rate": 1.998621852009785e-05,
"loss": 0.582,
"step": 562
},
{
"epoch": 0.04632791606665295,
"grad_norm": 2.8790199811794213,
"learning_rate": 1.9986078284041245e-05,
"loss": 0.9073,
"step": 563
},
{
"epoch": 0.04641020366179798,
"grad_norm": 3.1507198941552343,
"learning_rate": 1.998593733858961e-05,
"loss": 0.9285,
"step": 564
},
{
"epoch": 0.04649249125694301,
"grad_norm": 3.3010925203438757,
"learning_rate": 1.9985795683752955e-05,
"loss": 0.8975,
"step": 565
},
{
"epoch": 0.04657477885208805,
"grad_norm": 2.4173724120050277,
"learning_rate": 1.9985653319541345e-05,
"loss": 0.9211,
"step": 566
},
{
"epoch": 0.04665706644723308,
"grad_norm": 3.219239778661617,
"learning_rate": 1.9985510245964894e-05,
"loss": 0.9414,
"step": 567
},
{
"epoch": 0.04673935404237811,
"grad_norm": 4.702680418398121,
"learning_rate": 1.9985366463033763e-05,
"loss": 0.8886,
"step": 568
},
{
"epoch": 0.04682164163752314,
"grad_norm": 2.946137626961066,
"learning_rate": 1.9985221970758166e-05,
"loss": 0.907,
"step": 569
},
{
"epoch": 0.04690392923266817,
"grad_norm": 3.1637086789258224,
"learning_rate": 1.9985076769148373e-05,
"loss": 0.9063,
"step": 570
},
{
"epoch": 0.046986216827813204,
"grad_norm": 2.7457117180469286,
"learning_rate": 1.9984930858214695e-05,
"loss": 0.9163,
"step": 571
},
{
"epoch": 0.04706850442295824,
"grad_norm": 2.8795617581547597,
"learning_rate": 1.9984784237967495e-05,
"loss": 0.9272,
"step": 572
},
{
"epoch": 0.04715079201810327,
"grad_norm": 3.539552457926088,
"learning_rate": 1.998463690841719e-05,
"loss": 0.9254,
"step": 573
},
{
"epoch": 0.047233079613248304,
"grad_norm": 2.590893854876316,
"learning_rate": 1.998448886957425e-05,
"loss": 0.9135,
"step": 574
},
{
"epoch": 0.047315367208393334,
"grad_norm": 3.385121747004568,
"learning_rate": 1.9984340121449187e-05,
"loss": 0.898,
"step": 575
},
{
"epoch": 0.047397654803538365,
"grad_norm": 2.8668381053066248,
"learning_rate": 1.998419066405257e-05,
"loss": 0.9111,
"step": 576
},
{
"epoch": 0.047479942398683396,
"grad_norm": 0.5561294337589316,
"learning_rate": 1.9984040497395016e-05,
"loss": 0.6026,
"step": 577
},
{
"epoch": 0.047562229993828434,
"grad_norm": 2.7790207529975683,
"learning_rate": 1.9983889621487193e-05,
"loss": 0.8813,
"step": 578
},
{
"epoch": 0.047644517588973465,
"grad_norm": 2.929493346002011,
"learning_rate": 1.9983738036339818e-05,
"loss": 0.934,
"step": 579
},
{
"epoch": 0.047726805184118495,
"grad_norm": 2.6432622003873294,
"learning_rate": 1.9983585741963655e-05,
"loss": 0.935,
"step": 580
},
{
"epoch": 0.047809092779263526,
"grad_norm": 2.343596103466015,
"learning_rate": 1.998343273836953e-05,
"loss": 0.8885,
"step": 581
},
{
"epoch": 0.04789138037440856,
"grad_norm": 2.6377392327317355,
"learning_rate": 1.998327902556831e-05,
"loss": 0.9195,
"step": 582
},
{
"epoch": 0.04797366796955359,
"grad_norm": 0.5734849677326599,
"learning_rate": 1.9983124603570915e-05,
"loss": 0.5804,
"step": 583
},
{
"epoch": 0.04805595556469862,
"grad_norm": 2.359098397716237,
"learning_rate": 1.9982969472388313e-05,
"loss": 0.9154,
"step": 584
},
{
"epoch": 0.048138243159843656,
"grad_norm": 3.07285660000184,
"learning_rate": 1.9982813632031526e-05,
"loss": 0.9293,
"step": 585
},
{
"epoch": 0.04822053075498869,
"grad_norm": 3.145177565014435,
"learning_rate": 1.9982657082511624e-05,
"loss": 0.909,
"step": 586
},
{
"epoch": 0.04830281835013372,
"grad_norm": 2.4460324686547,
"learning_rate": 1.9982499823839726e-05,
"loss": 0.9172,
"step": 587
},
{
"epoch": 0.04838510594527875,
"grad_norm": 2.7860695223687335,
"learning_rate": 1.9982341856027006e-05,
"loss": 0.8962,
"step": 588
},
{
"epoch": 0.04846739354042378,
"grad_norm": 2.5003193611135126,
"learning_rate": 1.9982183179084683e-05,
"loss": 0.9523,
"step": 589
},
{
"epoch": 0.04854968113556881,
"grad_norm": 0.5728078039718163,
"learning_rate": 1.998202379302403e-05,
"loss": 0.5939,
"step": 590
},
{
"epoch": 0.04863196873071385,
"grad_norm": 2.513890686672487,
"learning_rate": 1.9981863697856376e-05,
"loss": 0.9027,
"step": 591
},
{
"epoch": 0.04871425632585888,
"grad_norm": 6.401109317568734,
"learning_rate": 1.9981702893593086e-05,
"loss": 0.9041,
"step": 592
},
{
"epoch": 0.04879654392100391,
"grad_norm": 0.526955304818451,
"learning_rate": 1.9981541380245586e-05,
"loss": 0.6109,
"step": 593
},
{
"epoch": 0.04887883151614894,
"grad_norm": 0.5280472746795982,
"learning_rate": 1.9981379157825346e-05,
"loss": 0.5801,
"step": 594
},
{
"epoch": 0.04896111911129397,
"grad_norm": 2.831289529507686,
"learning_rate": 1.99812162263439e-05,
"loss": 0.9296,
"step": 595
},
{
"epoch": 0.049043406706439,
"grad_norm": 2.5183731275746637,
"learning_rate": 1.998105258581281e-05,
"loss": 0.9373,
"step": 596
},
{
"epoch": 0.04912569430158403,
"grad_norm": 2.290556291606923,
"learning_rate": 1.998088823624371e-05,
"loss": 0.9339,
"step": 597
},
{
"epoch": 0.04920798189672907,
"grad_norm": 2.9827790643550065,
"learning_rate": 1.998072317764827e-05,
"loss": 0.9341,
"step": 598
},
{
"epoch": 0.0492902694918741,
"grad_norm": 3.9980040686222535,
"learning_rate": 1.998055741003822e-05,
"loss": 0.9428,
"step": 599
},
{
"epoch": 0.04937255708701913,
"grad_norm": 2.9421068715344125,
"learning_rate": 1.998039093342533e-05,
"loss": 0.9183,
"step": 600
},
{
"epoch": 0.04945484468216416,
"grad_norm": 2.3512621164999654,
"learning_rate": 1.998022374782143e-05,
"loss": 0.9139,
"step": 601
},
{
"epoch": 0.049537132277309194,
"grad_norm": 2.8922341692853863,
"learning_rate": 1.9980055853238394e-05,
"loss": 0.8847,
"step": 602
},
{
"epoch": 0.049619419872454225,
"grad_norm": 2.5544870335833916,
"learning_rate": 1.9979887249688158e-05,
"loss": 0.9322,
"step": 603
},
{
"epoch": 0.04970170746759926,
"grad_norm": 2.3713588179833427,
"learning_rate": 1.9979717937182685e-05,
"loss": 0.8953,
"step": 604
},
{
"epoch": 0.04978399506274429,
"grad_norm": 2.567195793905517,
"learning_rate": 1.9979547915734014e-05,
"loss": 0.9287,
"step": 605
},
{
"epoch": 0.049866282657889324,
"grad_norm": 2.116439796262553,
"learning_rate": 1.997937718535422e-05,
"loss": 0.9122,
"step": 606
},
{
"epoch": 0.049948570253034355,
"grad_norm": 2.6728583449200967,
"learning_rate": 1.9979205746055426e-05,
"loss": 0.9409,
"step": 607
},
{
"epoch": 0.050030857848179386,
"grad_norm": 2.9303321533796147,
"learning_rate": 1.9979033597849817e-05,
"loss": 0.877,
"step": 608
},
{
"epoch": 0.05011314544332442,
"grad_norm": 2.6453736009345103,
"learning_rate": 1.9978860740749618e-05,
"loss": 0.9264,
"step": 609
},
{
"epoch": 0.05019543303846945,
"grad_norm": 0.6463475109604742,
"learning_rate": 1.9978687174767115e-05,
"loss": 0.6037,
"step": 610
},
{
"epoch": 0.050277720633614485,
"grad_norm": 2.1568723876857514,
"learning_rate": 1.9978512899914632e-05,
"loss": 0.9291,
"step": 611
},
{
"epoch": 0.050360008228759516,
"grad_norm": 2.779974581309181,
"learning_rate": 1.997833791620455e-05,
"loss": 0.9487,
"step": 612
},
{
"epoch": 0.05044229582390455,
"grad_norm": 2.6541794961423726,
"learning_rate": 1.9978162223649303e-05,
"loss": 0.9314,
"step": 613
},
{
"epoch": 0.05052458341904958,
"grad_norm": 2.204822617972563,
"learning_rate": 1.9977985822261367e-05,
"loss": 0.9195,
"step": 614
},
{
"epoch": 0.05060687101419461,
"grad_norm": 2.528877153941993,
"learning_rate": 1.9977808712053276e-05,
"loss": 0.925,
"step": 615
},
{
"epoch": 0.05068915860933964,
"grad_norm": 2.89407673046398,
"learning_rate": 1.9977630893037613e-05,
"loss": 0.9164,
"step": 616
},
{
"epoch": 0.05077144620448468,
"grad_norm": 2.8147196835709924,
"learning_rate": 1.9977452365227005e-05,
"loss": 0.9109,
"step": 617
},
{
"epoch": 0.05085373379962971,
"grad_norm": 2.8624190313017697,
"learning_rate": 1.997727312863414e-05,
"loss": 0.9227,
"step": 618
},
{
"epoch": 0.05093602139477474,
"grad_norm": 2.6853591545801243,
"learning_rate": 1.9977093183271746e-05,
"loss": 0.9043,
"step": 619
},
{
"epoch": 0.05101830898991977,
"grad_norm": 2.847809177384018,
"learning_rate": 1.997691252915261e-05,
"loss": 0.8797,
"step": 620
},
{
"epoch": 0.0511005965850648,
"grad_norm": 2.5413962256979477,
"learning_rate": 1.9976731166289565e-05,
"loss": 0.888,
"step": 621
},
{
"epoch": 0.05118288418020983,
"grad_norm": 2.4434297876428768,
"learning_rate": 1.997654909469549e-05,
"loss": 0.9193,
"step": 622
},
{
"epoch": 0.05126517177535486,
"grad_norm": 2.554334961124947,
"learning_rate": 1.9976366314383323e-05,
"loss": 0.945,
"step": 623
},
{
"epoch": 0.0513474593704999,
"grad_norm": 3.0606359366025155,
"learning_rate": 1.9976182825366052e-05,
"loss": 0.9018,
"step": 624
},
{
"epoch": 0.05142974696564493,
"grad_norm": 2.7602463387503877,
"learning_rate": 1.9975998627656704e-05,
"loss": 0.9572,
"step": 625
},
{
"epoch": 0.05151203456078996,
"grad_norm": 2.645779738054759,
"learning_rate": 1.997581372126837e-05,
"loss": 0.8986,
"step": 626
},
{
"epoch": 0.05159432215593499,
"grad_norm": 2.3004786981907808,
"learning_rate": 1.997562810621418e-05,
"loss": 0.9378,
"step": 627
},
{
"epoch": 0.05167660975108002,
"grad_norm": 3.0529134410232954,
"learning_rate": 1.9975441782507327e-05,
"loss": 0.9374,
"step": 628
},
{
"epoch": 0.051758897346225054,
"grad_norm": 6.366982443959264,
"learning_rate": 1.997525475016104e-05,
"loss": 0.9572,
"step": 629
},
{
"epoch": 0.05184118494137009,
"grad_norm": 7.143057307651942,
"learning_rate": 1.9975067009188608e-05,
"loss": 0.9368,
"step": 630
},
{
"epoch": 0.05192347253651512,
"grad_norm": 2.486114121904295,
"learning_rate": 1.997487855960337e-05,
"loss": 0.8618,
"step": 631
},
{
"epoch": 0.05200576013166015,
"grad_norm": 2.909503733964849,
"learning_rate": 1.9974689401418712e-05,
"loss": 0.8998,
"step": 632
},
{
"epoch": 0.052088047726805184,
"grad_norm": 2.506345699862428,
"learning_rate": 1.9974499534648068e-05,
"loss": 0.9119,
"step": 633
},
{
"epoch": 0.052170335321950215,
"grad_norm": 0.5966023669088316,
"learning_rate": 1.9974308959304933e-05,
"loss": 0.5656,
"step": 634
},
{
"epoch": 0.052252622917095246,
"grad_norm": 2.9205909740125784,
"learning_rate": 1.997411767540284e-05,
"loss": 0.9109,
"step": 635
},
{
"epoch": 0.052334910512240276,
"grad_norm": 2.2641759973862534,
"learning_rate": 1.9973925682955378e-05,
"loss": 0.9023,
"step": 636
},
{
"epoch": 0.052417198107385314,
"grad_norm": 2.4641130571954086,
"learning_rate": 1.9973732981976188e-05,
"loss": 0.909,
"step": 637
},
{
"epoch": 0.052499485702530345,
"grad_norm": 2.2247912270982195,
"learning_rate": 1.9973539572478955e-05,
"loss": 0.9111,
"step": 638
},
{
"epoch": 0.052581773297675376,
"grad_norm": 2.182850954981328,
"learning_rate": 1.9973345454477422e-05,
"loss": 0.885,
"step": 639
},
{
"epoch": 0.05266406089282041,
"grad_norm": 0.5616279149900174,
"learning_rate": 1.997315062798538e-05,
"loss": 0.5634,
"step": 640
},
{
"epoch": 0.05274634848796544,
"grad_norm": 2.1709200144119287,
"learning_rate": 1.9972955093016662e-05,
"loss": 0.9021,
"step": 641
},
{
"epoch": 0.05282863608311047,
"grad_norm": 3.0243470611887853,
"learning_rate": 1.9972758849585167e-05,
"loss": 0.923,
"step": 642
},
{
"epoch": 0.052910923678255506,
"grad_norm": 0.5181983481216014,
"learning_rate": 1.9972561897704832e-05,
"loss": 0.589,
"step": 643
},
{
"epoch": 0.05299321127340054,
"grad_norm": 2.3618384003718904,
"learning_rate": 1.997236423738965e-05,
"loss": 0.8893,
"step": 644
},
{
"epoch": 0.05307549886854557,
"grad_norm": 2.83302899205139,
"learning_rate": 1.997216586865366e-05,
"loss": 0.9056,
"step": 645
},
{
"epoch": 0.0531577864636906,
"grad_norm": 2.1524435897397756,
"learning_rate": 1.9971966791510952e-05,
"loss": 0.8875,
"step": 646
},
{
"epoch": 0.05324007405883563,
"grad_norm": 0.5403616002875096,
"learning_rate": 1.9971767005975676e-05,
"loss": 0.5864,
"step": 647
},
{
"epoch": 0.05332236165398066,
"grad_norm": 3.032727501630103,
"learning_rate": 1.9971566512062016e-05,
"loss": 0.9269,
"step": 648
},
{
"epoch": 0.0534046492491257,
"grad_norm": 2.677613120586094,
"learning_rate": 1.9971365309784222e-05,
"loss": 0.9319,
"step": 649
},
{
"epoch": 0.05348693684427073,
"grad_norm": 2.7527601762070626,
"learning_rate": 1.9971163399156577e-05,
"loss": 0.911,
"step": 650
},
{
"epoch": 0.05356922443941576,
"grad_norm": 2.456807133771137,
"learning_rate": 1.9970960780193435e-05,
"loss": 0.9274,
"step": 651
},
{
"epoch": 0.05365151203456079,
"grad_norm": 0.5512339745238304,
"learning_rate": 1.9970757452909185e-05,
"loss": 0.5999,
"step": 652
},
{
"epoch": 0.05373379962970582,
"grad_norm": 3.3078302086877454,
"learning_rate": 1.997055341731827e-05,
"loss": 0.9161,
"step": 653
},
{
"epoch": 0.05381608722485085,
"grad_norm": 1.9567891820560834,
"learning_rate": 1.9970348673435187e-05,
"loss": 0.8954,
"step": 654
},
{
"epoch": 0.05389837481999588,
"grad_norm": 2.4558167849951027,
"learning_rate": 1.9970143221274477e-05,
"loss": 0.9041,
"step": 655
},
{
"epoch": 0.05398066241514092,
"grad_norm": 2.6700615275845214,
"learning_rate": 1.996993706085074e-05,
"loss": 0.9406,
"step": 656
},
{
"epoch": 0.05406295001028595,
"grad_norm": 2.47054592661293,
"learning_rate": 1.9969730192178618e-05,
"loss": 0.9075,
"step": 657
},
{
"epoch": 0.05414523760543098,
"grad_norm": 2.527986443897195,
"learning_rate": 1.9969522615272806e-05,
"loss": 0.9012,
"step": 658
},
{
"epoch": 0.05422752520057601,
"grad_norm": 0.5565334590513972,
"learning_rate": 1.9969314330148056e-05,
"loss": 0.5587,
"step": 659
},
{
"epoch": 0.054309812795721044,
"grad_norm": 1.8601076711624556,
"learning_rate": 1.9969105336819154e-05,
"loss": 0.8991,
"step": 660
},
{
"epoch": 0.054392100390866074,
"grad_norm": 2.0210809868042356,
"learning_rate": 1.9968895635300956e-05,
"loss": 0.9302,
"step": 661
},
{
"epoch": 0.05447438798601111,
"grad_norm": 2.1871429796039363,
"learning_rate": 1.9968685225608353e-05,
"loss": 0.8719,
"step": 662
},
{
"epoch": 0.05455667558115614,
"grad_norm": 2.699275991596056,
"learning_rate": 1.9968474107756295e-05,
"loss": 0.9107,
"step": 663
},
{
"epoch": 0.054638963176301174,
"grad_norm": 2.921814293546767,
"learning_rate": 1.996826228175978e-05,
"loss": 0.9124,
"step": 664
},
{
"epoch": 0.054721250771446205,
"grad_norm": 2.9121454433336917,
"learning_rate": 1.9968049747633848e-05,
"loss": 0.8872,
"step": 665
},
{
"epoch": 0.054803538366591235,
"grad_norm": 4.665109966003875,
"learning_rate": 1.996783650539361e-05,
"loss": 0.9337,
"step": 666
},
{
"epoch": 0.054885825961736266,
"grad_norm": 2.2334882062761814,
"learning_rate": 1.9967622555054204e-05,
"loss": 0.9249,
"step": 667
},
{
"epoch": 0.0549681135568813,
"grad_norm": 1.8093225226331142,
"learning_rate": 1.9967407896630837e-05,
"loss": 0.8666,
"step": 668
},
{
"epoch": 0.055050401152026335,
"grad_norm": 0.5652676807003993,
"learning_rate": 1.996719253013875e-05,
"loss": 0.5961,
"step": 669
},
{
"epoch": 0.055132688747171366,
"grad_norm": 0.5100457321950321,
"learning_rate": 1.9966976455593247e-05,
"loss": 0.5618,
"step": 670
},
{
"epoch": 0.055214976342316396,
"grad_norm": 2.773850609378529,
"learning_rate": 1.9966759673009677e-05,
"loss": 0.9275,
"step": 671
},
{
"epoch": 0.05529726393746143,
"grad_norm": 2.5443256480658296,
"learning_rate": 1.9966542182403437e-05,
"loss": 0.9077,
"step": 672
},
{
"epoch": 0.05537955153260646,
"grad_norm": 3.282011580384134,
"learning_rate": 1.9966323983789983e-05,
"loss": 0.921,
"step": 673
},
{
"epoch": 0.05546183912775149,
"grad_norm": 2.2203588190464885,
"learning_rate": 1.996610507718481e-05,
"loss": 0.8988,
"step": 674
},
{
"epoch": 0.05554412672289653,
"grad_norm": 4.790143157081725,
"learning_rate": 1.996588546260347e-05,
"loss": 0.9526,
"step": 675
},
{
"epoch": 0.05562641431804156,
"grad_norm": 2.092143807841506,
"learning_rate": 1.9965665140061565e-05,
"loss": 0.915,
"step": 676
},
{
"epoch": 0.05570870191318659,
"grad_norm": 1.9784649465852888,
"learning_rate": 1.9965444109574744e-05,
"loss": 0.905,
"step": 677
},
{
"epoch": 0.05579098950833162,
"grad_norm": 2.7843501048163217,
"learning_rate": 1.9965222371158718e-05,
"loss": 0.8951,
"step": 678
},
{
"epoch": 0.05587327710347665,
"grad_norm": 2.6331805589786383,
"learning_rate": 1.9964999924829224e-05,
"loss": 0.8614,
"step": 679
},
{
"epoch": 0.05595556469862168,
"grad_norm": 0.7467735870885243,
"learning_rate": 1.9964776770602078e-05,
"loss": 0.6063,
"step": 680
},
{
"epoch": 0.05603785229376671,
"grad_norm": 2.680536053721946,
"learning_rate": 1.9964552908493123e-05,
"loss": 0.8782,
"step": 681
},
{
"epoch": 0.05612013988891175,
"grad_norm": 3.49552823109986,
"learning_rate": 1.9964328338518264e-05,
"loss": 0.902,
"step": 682
},
{
"epoch": 0.05620242748405678,
"grad_norm": 2.120123047682193,
"learning_rate": 1.996410306069346e-05,
"loss": 0.9496,
"step": 683
},
{
"epoch": 0.05628471507920181,
"grad_norm": 1.937156037107827,
"learning_rate": 1.9963877075034706e-05,
"loss": 0.8875,
"step": 684
},
{
"epoch": 0.05636700267434684,
"grad_norm": 2.4742509534066754,
"learning_rate": 1.9963650381558063e-05,
"loss": 0.9192,
"step": 685
},
{
"epoch": 0.05644929026949187,
"grad_norm": 2.3426169694208903,
"learning_rate": 1.996342298027963e-05,
"loss": 0.9481,
"step": 686
},
{
"epoch": 0.0565315778646369,
"grad_norm": 2.1543307158741434,
"learning_rate": 1.9963194871215557e-05,
"loss": 0.8948,
"step": 687
},
{
"epoch": 0.05661386545978194,
"grad_norm": 1.7721734117310426,
"learning_rate": 1.9962966054382062e-05,
"loss": 0.8769,
"step": 688
},
{
"epoch": 0.05669615305492697,
"grad_norm": 2.637184520870366,
"learning_rate": 1.9962736529795388e-05,
"loss": 0.9305,
"step": 689
},
{
"epoch": 0.056778440650072,
"grad_norm": 2.5552424968357306,
"learning_rate": 1.9962506297471846e-05,
"loss": 0.9011,
"step": 690
},
{
"epoch": 0.05686072824521703,
"grad_norm": 2.1091093097631797,
"learning_rate": 1.9962275357427787e-05,
"loss": 0.9153,
"step": 691
},
{
"epoch": 0.056943015840362064,
"grad_norm": 3.8893843496883775,
"learning_rate": 1.996204370967962e-05,
"loss": 0.9516,
"step": 692
},
{
"epoch": 0.057025303435507095,
"grad_norm": 0.6989567675386245,
"learning_rate": 1.9961811354243798e-05,
"loss": 0.6088,
"step": 693
},
{
"epoch": 0.057107591030652126,
"grad_norm": 3.0703220705587326,
"learning_rate": 1.9961578291136834e-05,
"loss": 0.9468,
"step": 694
},
{
"epoch": 0.057189878625797164,
"grad_norm": 0.5452905698296876,
"learning_rate": 1.9961344520375276e-05,
"loss": 0.5795,
"step": 695
},
{
"epoch": 0.057272166220942194,
"grad_norm": 3.477621910759164,
"learning_rate": 1.9961110041975732e-05,
"loss": 0.9586,
"step": 696
},
{
"epoch": 0.057354453816087225,
"grad_norm": 3.5385882928206454,
"learning_rate": 1.9960874855954863e-05,
"loss": 0.9508,
"step": 697
},
{
"epoch": 0.057436741411232256,
"grad_norm": 2.6972731084205437,
"learning_rate": 1.996063896232938e-05,
"loss": 0.9313,
"step": 698
},
{
"epoch": 0.05751902900637729,
"grad_norm": 0.6344603977192381,
"learning_rate": 1.9960402361116026e-05,
"loss": 0.6044,
"step": 699
},
{
"epoch": 0.05760131660152232,
"grad_norm": 5.571545453742246,
"learning_rate": 1.996016505233162e-05,
"loss": 0.92,
"step": 700
},
{
"epoch": 0.057683604196667355,
"grad_norm": 2.859612009759652,
"learning_rate": 1.9959927035993017e-05,
"loss": 0.897,
"step": 701
},
{
"epoch": 0.057765891791812386,
"grad_norm": 2.426187536557682,
"learning_rate": 1.9959688312117128e-05,
"loss": 0.9305,
"step": 702
},
{
"epoch": 0.05784817938695742,
"grad_norm": 2.7388965530788,
"learning_rate": 1.995944888072091e-05,
"loss": 0.9145,
"step": 703
},
{
"epoch": 0.05793046698210245,
"grad_norm": 2.776291815110774,
"learning_rate": 1.995920874182137e-05,
"loss": 0.9075,
"step": 704
},
{
"epoch": 0.05801275457724748,
"grad_norm": 2.575679639237728,
"learning_rate": 1.995896789543557e-05,
"loss": 0.9045,
"step": 705
},
{
"epoch": 0.05809504217239251,
"grad_norm": 3.5403132152741263,
"learning_rate": 1.9958726341580615e-05,
"loss": 0.913,
"step": 706
},
{
"epoch": 0.05817732976753754,
"grad_norm": 2.58072580176139,
"learning_rate": 1.995848408027367e-05,
"loss": 0.9229,
"step": 707
},
{
"epoch": 0.05825961736268258,
"grad_norm": 2.5124996774654473,
"learning_rate": 1.9958241111531942e-05,
"loss": 0.9126,
"step": 708
},
{
"epoch": 0.05834190495782761,
"grad_norm": 2.36119565147592,
"learning_rate": 1.995799743537269e-05,
"loss": 0.9066,
"step": 709
},
{
"epoch": 0.05842419255297264,
"grad_norm": 3.2376572469679847,
"learning_rate": 1.9957753051813228e-05,
"loss": 0.9107,
"step": 710
},
{
"epoch": 0.05850648014811767,
"grad_norm": 0.5718002254539629,
"learning_rate": 1.9957507960870908e-05,
"loss": 0.5838,
"step": 711
},
{
"epoch": 0.0585887677432627,
"grad_norm": 2.9835296928097765,
"learning_rate": 1.9957262162563155e-05,
"loss": 0.9062,
"step": 712
},
{
"epoch": 0.05867105533840773,
"grad_norm": 2.312335655498833,
"learning_rate": 1.9957015656907417e-05,
"loss": 0.9331,
"step": 713
},
{
"epoch": 0.05875334293355277,
"grad_norm": 2.3792417930038168,
"learning_rate": 1.9956768443921214e-05,
"loss": 0.9371,
"step": 714
},
{
"epoch": 0.0588356305286978,
"grad_norm": 3.0747711781753955,
"learning_rate": 1.99565205236221e-05,
"loss": 0.9245,
"step": 715
},
{
"epoch": 0.05891791812384283,
"grad_norm": 2.469147337654409,
"learning_rate": 1.9956271896027696e-05,
"loss": 0.9053,
"step": 716
},
{
"epoch": 0.05900020571898786,
"grad_norm": 4.677348829502867,
"learning_rate": 1.9956022561155655e-05,
"loss": 0.9316,
"step": 717
},
{
"epoch": 0.05908249331413289,
"grad_norm": 2.574073344258724,
"learning_rate": 1.9955772519023694e-05,
"loss": 0.9144,
"step": 718
},
{
"epoch": 0.059164780909277924,
"grad_norm": 0.6010291838312377,
"learning_rate": 1.995552176964958e-05,
"loss": 0.5969,
"step": 719
},
{
"epoch": 0.05924706850442296,
"grad_norm": 0.48362592184616704,
"learning_rate": 1.9955270313051115e-05,
"loss": 0.6105,
"step": 720
},
{
"epoch": 0.05932935609956799,
"grad_norm": 4.6846130266410935,
"learning_rate": 1.995501814924617e-05,
"loss": 0.9146,
"step": 721
},
{
"epoch": 0.05941164369471302,
"grad_norm": 2.577204170673208,
"learning_rate": 1.9954765278252656e-05,
"loss": 0.9073,
"step": 722
},
{
"epoch": 0.059493931289858054,
"grad_norm": 4.7923802267754985,
"learning_rate": 1.995451170008854e-05,
"loss": 0.9192,
"step": 723
},
{
"epoch": 0.059576218885003085,
"grad_norm": 3.637556402050712,
"learning_rate": 1.995425741477183e-05,
"loss": 0.8916,
"step": 724
},
{
"epoch": 0.059658506480148116,
"grad_norm": 3.318312481516906,
"learning_rate": 1.9954002422320593e-05,
"loss": 0.8979,
"step": 725
},
{
"epoch": 0.05974079407529315,
"grad_norm": 2.2896767162285476,
"learning_rate": 1.9953746722752944e-05,
"loss": 0.9078,
"step": 726
},
{
"epoch": 0.059823081670438184,
"grad_norm": 2.4261610228532433,
"learning_rate": 1.9953490316087045e-05,
"loss": 0.9094,
"step": 727
},
{
"epoch": 0.059905369265583215,
"grad_norm": 3.5742603087267533,
"learning_rate": 1.9953233202341115e-05,
"loss": 0.9668,
"step": 728
},
{
"epoch": 0.059987656860728246,
"grad_norm": 3.646866686252275,
"learning_rate": 1.995297538153341e-05,
"loss": 0.9081,
"step": 729
},
{
"epoch": 0.06006994445587328,
"grad_norm": 3.5756298093016134,
"learning_rate": 1.9952716853682258e-05,
"loss": 0.932,
"step": 730
},
{
"epoch": 0.06015223205101831,
"grad_norm": 2.461737210935374,
"learning_rate": 1.9952457618806016e-05,
"loss": 0.9161,
"step": 731
},
{
"epoch": 0.06023451964616334,
"grad_norm": 2.9435688364135038,
"learning_rate": 1.99521976769231e-05,
"loss": 0.8791,
"step": 732
},
{
"epoch": 0.060316807241308376,
"grad_norm": 3.752079579941048,
"learning_rate": 1.995193702805198e-05,
"loss": 0.8864,
"step": 733
},
{
"epoch": 0.06039909483645341,
"grad_norm": 4.53396790098707,
"learning_rate": 1.9951675672211163e-05,
"loss": 0.8929,
"step": 734
},
{
"epoch": 0.06048138243159844,
"grad_norm": 4.961620647630342,
"learning_rate": 1.9951413609419225e-05,
"loss": 0.8536,
"step": 735
},
{
"epoch": 0.06056367002674347,
"grad_norm": 3.891304133200799,
"learning_rate": 1.995115083969478e-05,
"loss": 0.8944,
"step": 736
},
{
"epoch": 0.0606459576218885,
"grad_norm": 2.712319861053012,
"learning_rate": 1.9950887363056495e-05,
"loss": 0.9206,
"step": 737
},
{
"epoch": 0.06072824521703353,
"grad_norm": 4.223019111124196,
"learning_rate": 1.9950623179523085e-05,
"loss": 0.9025,
"step": 738
},
{
"epoch": 0.06081053281217856,
"grad_norm": 5.016232013409377,
"learning_rate": 1.9950358289113317e-05,
"loss": 0.8815,
"step": 739
},
{
"epoch": 0.0608928204073236,
"grad_norm": 2.6897434242049694,
"learning_rate": 1.995009269184601e-05,
"loss": 0.8836,
"step": 740
},
{
"epoch": 0.06097510800246863,
"grad_norm": 0.7568433896575619,
"learning_rate": 1.994982638774003e-05,
"loss": 0.5993,
"step": 741
},
{
"epoch": 0.06105739559761366,
"grad_norm": 2.553452324246678,
"learning_rate": 1.9949559376814296e-05,
"loss": 0.8986,
"step": 742
},
{
"epoch": 0.06113968319275869,
"grad_norm": 0.5018812785768227,
"learning_rate": 1.9949291659087776e-05,
"loss": 0.5597,
"step": 743
},
{
"epoch": 0.06122197078790372,
"grad_norm": 2.4064235706469,
"learning_rate": 1.994902323457949e-05,
"loss": 0.8943,
"step": 744
},
{
"epoch": 0.06130425838304875,
"grad_norm": 2.295948111702661,
"learning_rate": 1.9948754103308504e-05,
"loss": 0.8668,
"step": 745
},
{
"epoch": 0.06138654597819379,
"grad_norm": 0.6531820015601002,
"learning_rate": 1.9948484265293934e-05,
"loss": 0.5944,
"step": 746
},
{
"epoch": 0.06146883357333882,
"grad_norm": 2.488686897667554,
"learning_rate": 1.9948213720554955e-05,
"loss": 0.8939,
"step": 747
},
{
"epoch": 0.06155112116848385,
"grad_norm": 2.2478829073807867,
"learning_rate": 1.994794246911078e-05,
"loss": 0.878,
"step": 748
},
{
"epoch": 0.06163340876362888,
"grad_norm": 3.21297658438237,
"learning_rate": 1.9947670510980686e-05,
"loss": 0.9367,
"step": 749
},
{
"epoch": 0.061715696358773914,
"grad_norm": 2.5032219143064296,
"learning_rate": 1.9947397846183986e-05,
"loss": 0.909,
"step": 750
},
{
"epoch": 0.061797983953918945,
"grad_norm": 2.3821398027611367,
"learning_rate": 1.9947124474740052e-05,
"loss": 0.8767,
"step": 751
},
{
"epoch": 0.061880271549063975,
"grad_norm": 4.029427101966951,
"learning_rate": 1.99468503966683e-05,
"loss": 0.8618,
"step": 752
},
{
"epoch": 0.06196255914420901,
"grad_norm": 2.404778806152705,
"learning_rate": 1.9946575611988207e-05,
"loss": 0.9047,
"step": 753
},
{
"epoch": 0.062044846739354044,
"grad_norm": 2.962612526189809,
"learning_rate": 1.9946300120719287e-05,
"loss": 0.889,
"step": 754
},
{
"epoch": 0.062127134334499075,
"grad_norm": 2.5437765511188695,
"learning_rate": 1.994602392288112e-05,
"loss": 0.9399,
"step": 755
},
{
"epoch": 0.062209421929644106,
"grad_norm": 0.5539735241167393,
"learning_rate": 1.9945747018493314e-05,
"loss": 0.5963,
"step": 756
},
{
"epoch": 0.062291709524789136,
"grad_norm": 3.1779858985642817,
"learning_rate": 1.9945469407575543e-05,
"loss": 0.876,
"step": 757
},
{
"epoch": 0.06237399711993417,
"grad_norm": 2.687485842671492,
"learning_rate": 1.9945191090147537e-05,
"loss": 0.9022,
"step": 758
},
{
"epoch": 0.062456284715079205,
"grad_norm": 2.9422463927653766,
"learning_rate": 1.9944912066229058e-05,
"loss": 0.8956,
"step": 759
},
{
"epoch": 0.06253857231022424,
"grad_norm": 4.157936413648122,
"learning_rate": 1.9944632335839927e-05,
"loss": 0.9138,
"step": 760
},
{
"epoch": 0.06262085990536927,
"grad_norm": 0.48567249965915693,
"learning_rate": 1.9944351899000026e-05,
"loss": 0.5563,
"step": 761
},
{
"epoch": 0.0627031475005143,
"grad_norm": 2.7821820465506,
"learning_rate": 1.9944070755729266e-05,
"loss": 0.9122,
"step": 762
},
{
"epoch": 0.06278543509565933,
"grad_norm": 2.65823773191475,
"learning_rate": 1.9943788906047624e-05,
"loss": 0.9009,
"step": 763
},
{
"epoch": 0.06286772269080436,
"grad_norm": 0.4745158162176376,
"learning_rate": 1.9943506349975118e-05,
"loss": 0.5845,
"step": 764
},
{
"epoch": 0.06295001028594939,
"grad_norm": 4.304541123505603,
"learning_rate": 1.9943223087531824e-05,
"loss": 0.911,
"step": 765
},
{
"epoch": 0.06303229788109442,
"grad_norm": 2.599121308286042,
"learning_rate": 1.9942939118737866e-05,
"loss": 0.9082,
"step": 766
},
{
"epoch": 0.06311458547623945,
"grad_norm": 2.661380985142305,
"learning_rate": 1.9942654443613413e-05,
"loss": 0.889,
"step": 767
},
{
"epoch": 0.06319687307138448,
"grad_norm": 2.7289869422777406,
"learning_rate": 1.994236906217869e-05,
"loss": 0.8807,
"step": 768
},
{
"epoch": 0.06327916066652953,
"grad_norm": 3.552184676009908,
"learning_rate": 1.9942082974453968e-05,
"loss": 0.8869,
"step": 769
},
{
"epoch": 0.06336144826167456,
"grad_norm": 3.3116779659066222,
"learning_rate": 1.994179618045957e-05,
"loss": 0.886,
"step": 770
},
{
"epoch": 0.06344373585681959,
"grad_norm": 2.733151926112565,
"learning_rate": 1.9941508680215874e-05,
"loss": 0.878,
"step": 771
},
{
"epoch": 0.06352602345196462,
"grad_norm": 3.689575278866226,
"learning_rate": 1.9941220473743297e-05,
"loss": 0.9012,
"step": 772
},
{
"epoch": 0.06360831104710965,
"grad_norm": 3.6509278934675344,
"learning_rate": 1.994093156106232e-05,
"loss": 0.8859,
"step": 773
},
{
"epoch": 0.06369059864225468,
"grad_norm": 3.4408763078150373,
"learning_rate": 1.9940641942193462e-05,
"loss": 0.9895,
"step": 774
},
{
"epoch": 0.06377288623739971,
"grad_norm": 3.356367722166113,
"learning_rate": 1.9940351617157298e-05,
"loss": 0.9321,
"step": 775
},
{
"epoch": 0.06385517383254474,
"grad_norm": 2.6685489053310905,
"learning_rate": 1.994006058597445e-05,
"loss": 0.871,
"step": 776
},
{
"epoch": 0.06393746142768977,
"grad_norm": 2.1000398415565447,
"learning_rate": 1.99397688486656e-05,
"loss": 0.8799,
"step": 777
},
{
"epoch": 0.0640197490228348,
"grad_norm": 2.1292877692214462,
"learning_rate": 1.9939476405251464e-05,
"loss": 0.8955,
"step": 778
},
{
"epoch": 0.06410203661797984,
"grad_norm": 3.4132241841166073,
"learning_rate": 1.9939183255752817e-05,
"loss": 0.8757,
"step": 779
},
{
"epoch": 0.06418432421312487,
"grad_norm": 2.62487277122737,
"learning_rate": 1.9938889400190494e-05,
"loss": 0.8884,
"step": 780
},
{
"epoch": 0.0642666118082699,
"grad_norm": 2.044302329571613,
"learning_rate": 1.993859483858536e-05,
"loss": 0.9023,
"step": 781
},
{
"epoch": 0.06434889940341494,
"grad_norm": 0.5567547220538414,
"learning_rate": 1.993829957095834e-05,
"loss": 0.5694,
"step": 782
},
{
"epoch": 0.06443118699855997,
"grad_norm": 0.48731474493235843,
"learning_rate": 1.9938003597330415e-05,
"loss": 0.5764,
"step": 783
},
{
"epoch": 0.064513474593705,
"grad_norm": 2.335128235917664,
"learning_rate": 1.9937706917722607e-05,
"loss": 0.9091,
"step": 784
},
{
"epoch": 0.06459576218885003,
"grad_norm": 2.6840226763995383,
"learning_rate": 1.9937409532155992e-05,
"loss": 0.8881,
"step": 785
},
{
"epoch": 0.06467804978399506,
"grad_norm": 2.3949102024541653,
"learning_rate": 1.99371114406517e-05,
"loss": 0.9183,
"step": 786
},
{
"epoch": 0.0647603373791401,
"grad_norm": 2.6216703824274488,
"learning_rate": 1.99368126432309e-05,
"loss": 0.9207,
"step": 787
},
{
"epoch": 0.06484262497428513,
"grad_norm": 2.614435269135524,
"learning_rate": 1.993651313991482e-05,
"loss": 0.9145,
"step": 788
},
{
"epoch": 0.06492491256943016,
"grad_norm": 1.9122678315195296,
"learning_rate": 1.9936212930724742e-05,
"loss": 0.8829,
"step": 789
},
{
"epoch": 0.06500720016457519,
"grad_norm": 0.5913835221535177,
"learning_rate": 1.9935912015681984e-05,
"loss": 0.6145,
"step": 790
},
{
"epoch": 0.06508948775972022,
"grad_norm": 2.528199419410872,
"learning_rate": 1.993561039480793e-05,
"loss": 0.8655,
"step": 791
},
{
"epoch": 0.06517177535486525,
"grad_norm": 3.3798538121747326,
"learning_rate": 1.9935308068124e-05,
"loss": 0.9251,
"step": 792
},
{
"epoch": 0.06525406295001028,
"grad_norm": 2.6588327121370194,
"learning_rate": 1.9935005035651676e-05,
"loss": 0.8983,
"step": 793
},
{
"epoch": 0.06533635054515531,
"grad_norm": 0.5232567113259947,
"learning_rate": 1.9934701297412482e-05,
"loss": 0.578,
"step": 794
},
{
"epoch": 0.06541863814030036,
"grad_norm": 4.752300485944965,
"learning_rate": 1.9934396853427998e-05,
"loss": 0.8953,
"step": 795
},
{
"epoch": 0.06550092573544539,
"grad_norm": 2.2269507955655987,
"learning_rate": 1.9934091703719846e-05,
"loss": 0.9245,
"step": 796
},
{
"epoch": 0.06558321333059042,
"grad_norm": 3.122445969674065,
"learning_rate": 1.9933785848309708e-05,
"loss": 0.8914,
"step": 797
},
{
"epoch": 0.06566550092573545,
"grad_norm": 3.1204724551293426,
"learning_rate": 1.9933479287219312e-05,
"loss": 0.9287,
"step": 798
},
{
"epoch": 0.06574778852088048,
"grad_norm": 14.479758337139925,
"learning_rate": 1.9933172020470433e-05,
"loss": 0.8677,
"step": 799
},
{
"epoch": 0.06583007611602551,
"grad_norm": 2.1224285416282953,
"learning_rate": 1.99328640480849e-05,
"loss": 0.8755,
"step": 800
},
{
"epoch": 0.06591236371117054,
"grad_norm": 2.487164087508179,
"learning_rate": 1.9932555370084588e-05,
"loss": 0.8775,
"step": 801
},
{
"epoch": 0.06599465130631557,
"grad_norm": 0.5728404010402629,
"learning_rate": 1.9932245986491425e-05,
"loss": 0.5477,
"step": 802
},
{
"epoch": 0.0660769389014606,
"grad_norm": 3.245446623126787,
"learning_rate": 1.9931935897327396e-05,
"loss": 0.9005,
"step": 803
},
{
"epoch": 0.06615922649660563,
"grad_norm": 2.5198170754823237,
"learning_rate": 1.9931625102614524e-05,
"loss": 0.9251,
"step": 804
},
{
"epoch": 0.06624151409175066,
"grad_norm": 2.7124091417439447,
"learning_rate": 1.9931313602374886e-05,
"loss": 0.9043,
"step": 805
},
{
"epoch": 0.0663238016868957,
"grad_norm": 2.295917945326921,
"learning_rate": 1.9931001396630613e-05,
"loss": 0.9037,
"step": 806
},
{
"epoch": 0.06640608928204073,
"grad_norm": 2.5595180677086176,
"learning_rate": 1.9930688485403885e-05,
"loss": 0.8916,
"step": 807
},
{
"epoch": 0.06648837687718577,
"grad_norm": 2.54401264532517,
"learning_rate": 1.993037486871693e-05,
"loss": 0.8865,
"step": 808
},
{
"epoch": 0.0665706644723308,
"grad_norm": 2.7644346282703567,
"learning_rate": 1.993006054659202e-05,
"loss": 0.875,
"step": 809
},
{
"epoch": 0.06665295206747583,
"grad_norm": 2.145314542653547,
"learning_rate": 1.9929745519051497e-05,
"loss": 0.9358,
"step": 810
},
{
"epoch": 0.06673523966262086,
"grad_norm": 3.2713117109960583,
"learning_rate": 1.9929429786117724e-05,
"loss": 0.8777,
"step": 811
},
{
"epoch": 0.0668175272577659,
"grad_norm": 0.5829653015669467,
"learning_rate": 1.9929113347813145e-05,
"loss": 0.5366,
"step": 812
},
{
"epoch": 0.06689981485291092,
"grad_norm": 2.4233464969419516,
"learning_rate": 1.992879620416023e-05,
"loss": 0.9099,
"step": 813
},
{
"epoch": 0.06698210244805596,
"grad_norm": 2.7021068296091624,
"learning_rate": 1.9928478355181512e-05,
"loss": 0.9092,
"step": 814
},
{
"epoch": 0.06706439004320099,
"grad_norm": 2.522776219516862,
"learning_rate": 1.992815980089957e-05,
"loss": 0.9024,
"step": 815
},
{
"epoch": 0.06714667763834602,
"grad_norm": 2.232284370603574,
"learning_rate": 1.9927840541337037e-05,
"loss": 0.9233,
"step": 816
},
{
"epoch": 0.06722896523349105,
"grad_norm": 2.9343145896014255,
"learning_rate": 1.9927520576516587e-05,
"loss": 0.9312,
"step": 817
},
{
"epoch": 0.06731125282863608,
"grad_norm": 3.3222486630048764,
"learning_rate": 1.9927199906460947e-05,
"loss": 0.8681,
"step": 818
},
{
"epoch": 0.06739354042378111,
"grad_norm": 2.1225744897957153,
"learning_rate": 1.9926878531192908e-05,
"loss": 0.8916,
"step": 819
},
{
"epoch": 0.06747582801892614,
"grad_norm": 5.166258547080567,
"learning_rate": 1.992655645073529e-05,
"loss": 0.9153,
"step": 820
},
{
"epoch": 0.06755811561407118,
"grad_norm": 3.2639889220707077,
"learning_rate": 1.992623366511098e-05,
"loss": 0.8715,
"step": 821
},
{
"epoch": 0.06764040320921622,
"grad_norm": 4.714497016717951,
"learning_rate": 1.9925910174342907e-05,
"loss": 0.8723,
"step": 822
},
{
"epoch": 0.06772269080436125,
"grad_norm": 2.5352280280058315,
"learning_rate": 1.9925585978454043e-05,
"loss": 0.9045,
"step": 823
},
{
"epoch": 0.06780497839950628,
"grad_norm": 3.485579632575649,
"learning_rate": 1.992526107746743e-05,
"loss": 0.8797,
"step": 824
},
{
"epoch": 0.06788726599465131,
"grad_norm": 12.454695730191421,
"learning_rate": 1.992493547140614e-05,
"loss": 0.8755,
"step": 825
},
{
"epoch": 0.06796955358979634,
"grad_norm": 0.5679287848373274,
"learning_rate": 1.9924609160293308e-05,
"loss": 0.5737,
"step": 826
},
{
"epoch": 0.06805184118494137,
"grad_norm": 6.733588252523935,
"learning_rate": 1.9924282144152115e-05,
"loss": 0.8607,
"step": 827
},
{
"epoch": 0.0681341287800864,
"grad_norm": 2.8353728427421965,
"learning_rate": 1.9923954423005786e-05,
"loss": 0.8658,
"step": 828
},
{
"epoch": 0.06821641637523143,
"grad_norm": 2.226675047912921,
"learning_rate": 1.9923625996877607e-05,
"loss": 0.8908,
"step": 829
},
{
"epoch": 0.06829870397037646,
"grad_norm": 2.090011013197403,
"learning_rate": 1.9923296865790907e-05,
"loss": 0.9027,
"step": 830
},
{
"epoch": 0.06838099156552149,
"grad_norm": 2.4269097740027687,
"learning_rate": 1.992296702976907e-05,
"loss": 0.8743,
"step": 831
},
{
"epoch": 0.06846327916066652,
"grad_norm": 2.4454075613373174,
"learning_rate": 1.9922636488835528e-05,
"loss": 0.9188,
"step": 832
},
{
"epoch": 0.06854556675581157,
"grad_norm": 2.708156376904729,
"learning_rate": 1.992230524301375e-05,
"loss": 0.8753,
"step": 833
},
{
"epoch": 0.0686278543509566,
"grad_norm": 6.9289687760917955,
"learning_rate": 1.9921973292327285e-05,
"loss": 0.8714,
"step": 834
},
{
"epoch": 0.06871014194610163,
"grad_norm": 2.833475838520833,
"learning_rate": 1.9921640636799697e-05,
"loss": 0.878,
"step": 835
},
{
"epoch": 0.06879242954124666,
"grad_norm": 0.6390100760660502,
"learning_rate": 1.992130727645463e-05,
"loss": 0.5892,
"step": 836
},
{
"epoch": 0.06887471713639169,
"grad_norm": 3.503075844449775,
"learning_rate": 1.992097321131576e-05,
"loss": 0.9134,
"step": 837
},
{
"epoch": 0.06895700473153672,
"grad_norm": 2.928003367939948,
"learning_rate": 1.992063844140682e-05,
"loss": 0.916,
"step": 838
},
{
"epoch": 0.06903929232668175,
"grad_norm": 2.79325002366026,
"learning_rate": 1.992030296675159e-05,
"loss": 0.8767,
"step": 839
},
{
"epoch": 0.06912157992182678,
"grad_norm": 2.312184411585912,
"learning_rate": 1.9919966787373902e-05,
"loss": 0.9053,
"step": 840
},
{
"epoch": 0.06920386751697181,
"grad_norm": 2.9138317208293594,
"learning_rate": 1.991962990329764e-05,
"loss": 0.9005,
"step": 841
},
{
"epoch": 0.06928615511211685,
"grad_norm": 2.418947503313838,
"learning_rate": 1.991929231454673e-05,
"loss": 0.8876,
"step": 842
},
{
"epoch": 0.06936844270726188,
"grad_norm": 2.746227734046784,
"learning_rate": 1.9918954021145162e-05,
"loss": 0.9174,
"step": 843
},
{
"epoch": 0.06945073030240691,
"grad_norm": 4.054877897574317,
"learning_rate": 1.991861502311696e-05,
"loss": 0.8785,
"step": 844
},
{
"epoch": 0.06953301789755194,
"grad_norm": 3.3645447414769856,
"learning_rate": 1.9918275320486212e-05,
"loss": 0.8885,
"step": 845
},
{
"epoch": 0.06961530549269698,
"grad_norm": 0.6257651466469342,
"learning_rate": 1.9917934913277047e-05,
"loss": 0.5679,
"step": 846
},
{
"epoch": 0.06969759308784201,
"grad_norm": 2.9579632903454987,
"learning_rate": 1.9917593801513645e-05,
"loss": 0.8892,
"step": 847
},
{
"epoch": 0.06977988068298704,
"grad_norm": 2.3255674692633703,
"learning_rate": 1.991725198522024e-05,
"loss": 0.8969,
"step": 848
},
{
"epoch": 0.06986216827813208,
"grad_norm": 1.8812338541653777,
"learning_rate": 1.9916909464421118e-05,
"loss": 0.84,
"step": 849
},
{
"epoch": 0.0699444558732771,
"grad_norm": 4.348093261520783,
"learning_rate": 1.9916566239140605e-05,
"loss": 0.9035,
"step": 850
},
{
"epoch": 0.07002674346842214,
"grad_norm": 2.2375985456191003,
"learning_rate": 1.9916222309403085e-05,
"loss": 0.8754,
"step": 851
},
{
"epoch": 0.07010903106356717,
"grad_norm": 3.613200403801302,
"learning_rate": 1.9915877675232992e-05,
"loss": 0.8815,
"step": 852
},
{
"epoch": 0.0701913186587122,
"grad_norm": 3.839543987455212,
"learning_rate": 1.9915532336654807e-05,
"loss": 0.9072,
"step": 853
},
{
"epoch": 0.07027360625385723,
"grad_norm": 2.105567560984786,
"learning_rate": 1.991518629369306e-05,
"loss": 0.896,
"step": 854
},
{
"epoch": 0.07035589384900226,
"grad_norm": 2.267537355899574,
"learning_rate": 1.9914839546372336e-05,
"loss": 0.9158,
"step": 855
},
{
"epoch": 0.07043818144414729,
"grad_norm": 3.589047414435187,
"learning_rate": 1.991449209471727e-05,
"loss": 0.8734,
"step": 856
},
{
"epoch": 0.07052046903929232,
"grad_norm": 3.1819343869570536,
"learning_rate": 1.991414393875254e-05,
"loss": 0.9089,
"step": 857
},
{
"epoch": 0.07060275663443735,
"grad_norm": 2.5055069972264503,
"learning_rate": 1.991379507850288e-05,
"loss": 0.8681,
"step": 858
},
{
"epoch": 0.0706850442295824,
"grad_norm": 2.545062208600291,
"learning_rate": 1.991344551399307e-05,
"loss": 0.8835,
"step": 859
},
{
"epoch": 0.07076733182472743,
"grad_norm": 2.8423181256983487,
"learning_rate": 1.9913095245247948e-05,
"loss": 0.8855,
"step": 860
},
{
"epoch": 0.07084961941987246,
"grad_norm": 2.623939420394984,
"learning_rate": 1.9912744272292392e-05,
"loss": 0.8912,
"step": 861
},
{
"epoch": 0.07093190701501749,
"grad_norm": 2.456776383887346,
"learning_rate": 1.9912392595151336e-05,
"loss": 0.9026,
"step": 862
},
{
"epoch": 0.07101419461016252,
"grad_norm": 2.7531225878969177,
"learning_rate": 1.9912040213849762e-05,
"loss": 0.8875,
"step": 863
},
{
"epoch": 0.07109648220530755,
"grad_norm": 4.481796954208249,
"learning_rate": 1.9911687128412708e-05,
"loss": 0.8636,
"step": 864
},
{
"epoch": 0.07117876980045258,
"grad_norm": 2.545397332779262,
"learning_rate": 1.9911333338865245e-05,
"loss": 0.8803,
"step": 865
},
{
"epoch": 0.07126105739559761,
"grad_norm": 3.045980428767302,
"learning_rate": 1.9910978845232517e-05,
"loss": 0.9035,
"step": 866
},
{
"epoch": 0.07134334499074264,
"grad_norm": 3.6871914250355715,
"learning_rate": 1.9910623647539702e-05,
"loss": 0.8666,
"step": 867
},
{
"epoch": 0.07142563258588767,
"grad_norm": 2.116550202268351,
"learning_rate": 1.991026774581203e-05,
"loss": 0.9031,
"step": 868
},
{
"epoch": 0.0715079201810327,
"grad_norm": 2.532009330642646,
"learning_rate": 1.9909911140074788e-05,
"loss": 0.8661,
"step": 869
},
{
"epoch": 0.07159020777617774,
"grad_norm": 3.33485917673071,
"learning_rate": 1.9909553830353308e-05,
"loss": 0.8776,
"step": 870
},
{
"epoch": 0.07167249537132277,
"grad_norm": 2.3439342371747167,
"learning_rate": 1.990919581667297e-05,
"loss": 0.9151,
"step": 871
},
{
"epoch": 0.07175478296646781,
"grad_norm": 2.488600787006511,
"learning_rate": 1.9908837099059212e-05,
"loss": 0.9165,
"step": 872
},
{
"epoch": 0.07183707056161284,
"grad_norm": 3.95670742389146,
"learning_rate": 1.990847767753751e-05,
"loss": 0.8659,
"step": 873
},
{
"epoch": 0.07191935815675787,
"grad_norm": 0.5947750160477462,
"learning_rate": 1.99081175521334e-05,
"loss": 0.5886,
"step": 874
},
{
"epoch": 0.0720016457519029,
"grad_norm": 2.033586754058639,
"learning_rate": 1.9907756722872465e-05,
"loss": 0.8897,
"step": 875
},
{
"epoch": 0.07208393334704793,
"grad_norm": 3.346298659721499,
"learning_rate": 1.9907395189780335e-05,
"loss": 0.902,
"step": 876
},
{
"epoch": 0.07216622094219297,
"grad_norm": 3.004056249927372,
"learning_rate": 1.9907032952882703e-05,
"loss": 0.8715,
"step": 877
},
{
"epoch": 0.072248508537338,
"grad_norm": 5.4098932917643285,
"learning_rate": 1.9906670012205286e-05,
"loss": 0.8866,
"step": 878
},
{
"epoch": 0.07233079613248303,
"grad_norm": 6.828654192266096,
"learning_rate": 1.990630636777388e-05,
"loss": 0.8689,
"step": 879
},
{
"epoch": 0.07241308372762806,
"grad_norm": 2.6337207605941737,
"learning_rate": 1.9905942019614312e-05,
"loss": 0.8647,
"step": 880
},
{
"epoch": 0.07249537132277309,
"grad_norm": 0.5235737963953581,
"learning_rate": 1.990557696775246e-05,
"loss": 0.5661,
"step": 881
},
{
"epoch": 0.07257765891791812,
"grad_norm": 11.548238836629363,
"learning_rate": 1.9905211212214266e-05,
"loss": 0.9294,
"step": 882
},
{
"epoch": 0.07265994651306315,
"grad_norm": 5.489164212385315,
"learning_rate": 1.990484475302571e-05,
"loss": 0.8685,
"step": 883
},
{
"epoch": 0.07274223410820818,
"grad_norm": 7.88390924258145,
"learning_rate": 1.990447759021282e-05,
"loss": 0.874,
"step": 884
},
{
"epoch": 0.07282452170335323,
"grad_norm": 4.299200684634295,
"learning_rate": 1.9904109723801684e-05,
"loss": 0.9146,
"step": 885
},
{
"epoch": 0.07290680929849826,
"grad_norm": 6.21170690266594,
"learning_rate": 1.990374115381843e-05,
"loss": 0.8728,
"step": 886
},
{
"epoch": 0.07298909689364329,
"grad_norm": 4.563438990093578,
"learning_rate": 1.9903371880289247e-05,
"loss": 0.8747,
"step": 887
},
{
"epoch": 0.07307138448878832,
"grad_norm": 3.6273703961737187,
"learning_rate": 1.990300190324036e-05,
"loss": 0.9008,
"step": 888
},
{
"epoch": 0.07315367208393335,
"grad_norm": 7.441233530871766,
"learning_rate": 1.9902631222698057e-05,
"loss": 0.9141,
"step": 889
},
{
"epoch": 0.07323595967907838,
"grad_norm": 4.82833921873659,
"learning_rate": 1.990225983868867e-05,
"loss": 0.9339,
"step": 890
},
{
"epoch": 0.07331824727422341,
"grad_norm": 5.887738980648113,
"learning_rate": 1.9901887751238577e-05,
"loss": 0.8799,
"step": 891
},
{
"epoch": 0.07340053486936844,
"grad_norm": 2.5245499693701072,
"learning_rate": 1.9901514960374217e-05,
"loss": 0.8835,
"step": 892
},
{
"epoch": 0.07348282246451347,
"grad_norm": 6.763974106441189,
"learning_rate": 1.990114146612207e-05,
"loss": 0.891,
"step": 893
},
{
"epoch": 0.0735651100596585,
"grad_norm": 2.8844071869365835,
"learning_rate": 1.9900767268508666e-05,
"loss": 0.9097,
"step": 894
},
{
"epoch": 0.07364739765480353,
"grad_norm": 5.440132687337712,
"learning_rate": 1.9900392367560588e-05,
"loss": 0.8831,
"step": 895
},
{
"epoch": 0.07372968524994856,
"grad_norm": 3.745407109325051,
"learning_rate": 1.9900016763304472e-05,
"loss": 0.8805,
"step": 896
},
{
"epoch": 0.0738119728450936,
"grad_norm": 4.288740968099518,
"learning_rate": 1.9899640455766997e-05,
"loss": 0.8891,
"step": 897
},
{
"epoch": 0.07389426044023864,
"grad_norm": 2.755838421562454,
"learning_rate": 1.9899263444974894e-05,
"loss": 0.8973,
"step": 898
},
{
"epoch": 0.07397654803538367,
"grad_norm": 2.63866374184814,
"learning_rate": 1.9898885730954948e-05,
"loss": 0.8418,
"step": 899
},
{
"epoch": 0.0740588356305287,
"grad_norm": 3.0901321494386598,
"learning_rate": 1.9898507313733995e-05,
"loss": 0.8614,
"step": 900
},
{
"epoch": 0.07414112322567373,
"grad_norm": 2.754917360078824,
"learning_rate": 1.9898128193338907e-05,
"loss": 0.8964,
"step": 901
},
{
"epoch": 0.07422341082081876,
"grad_norm": 2.4717700343085163,
"learning_rate": 1.9897748369796627e-05,
"loss": 0.8793,
"step": 902
},
{
"epoch": 0.0743056984159638,
"grad_norm": 2.2819538240312585,
"learning_rate": 1.989736784313413e-05,
"loss": 0.9086,
"step": 903
},
{
"epoch": 0.07438798601110883,
"grad_norm": 2.7031870546344385,
"learning_rate": 1.989698661337845e-05,
"loss": 0.8601,
"step": 904
},
{
"epoch": 0.07447027360625386,
"grad_norm": 2.2788277737039757,
"learning_rate": 1.9896604680556664e-05,
"loss": 0.8464,
"step": 905
},
{
"epoch": 0.07455256120139889,
"grad_norm": 2.0567769102378954,
"learning_rate": 1.9896222044695914e-05,
"loss": 0.8807,
"step": 906
},
{
"epoch": 0.07463484879654392,
"grad_norm": 2.384203325674513,
"learning_rate": 1.9895838705823377e-05,
"loss": 0.8923,
"step": 907
},
{
"epoch": 0.07471713639168895,
"grad_norm": 2.0967277384590535,
"learning_rate": 1.989545466396628e-05,
"loss": 0.8793,
"step": 908
},
{
"epoch": 0.07479942398683398,
"grad_norm": 9.442852725541027,
"learning_rate": 1.9895069919151915e-05,
"loss": 0.8965,
"step": 909
},
{
"epoch": 0.07488171158197901,
"grad_norm": 5.109761027664979,
"learning_rate": 1.9894684471407605e-05,
"loss": 0.8983,
"step": 910
},
{
"epoch": 0.07496399917712405,
"grad_norm": 2.2367018687313185,
"learning_rate": 1.9894298320760733e-05,
"loss": 0.8879,
"step": 911
},
{
"epoch": 0.07504628677226909,
"grad_norm": 2.6873708972425656,
"learning_rate": 1.989391146723873e-05,
"loss": 0.8975,
"step": 912
},
{
"epoch": 0.07512857436741412,
"grad_norm": 0.5656242706848698,
"learning_rate": 1.9893523910869085e-05,
"loss": 0.617,
"step": 913
},
{
"epoch": 0.07521086196255915,
"grad_norm": 3.9316911134297814,
"learning_rate": 1.989313565167932e-05,
"loss": 0.9385,
"step": 914
},
{
"epoch": 0.07529314955770418,
"grad_norm": 2.783913423475105,
"learning_rate": 1.9892746689697024e-05,
"loss": 0.898,
"step": 915
},
{
"epoch": 0.07537543715284921,
"grad_norm": 4.235687618463353,
"learning_rate": 1.989235702494982e-05,
"loss": 0.8539,
"step": 916
},
{
"epoch": 0.07545772474799424,
"grad_norm": 2.387819568149409,
"learning_rate": 1.9891966657465397e-05,
"loss": 0.8369,
"step": 917
},
{
"epoch": 0.07554001234313927,
"grad_norm": 3.6947231383398424,
"learning_rate": 1.989157558727148e-05,
"loss": 0.8834,
"step": 918
},
{
"epoch": 0.0756222999382843,
"grad_norm": 2.604963394831731,
"learning_rate": 1.989118381439585e-05,
"loss": 0.9019,
"step": 919
},
{
"epoch": 0.07570458753342933,
"grad_norm": 0.5332477363950743,
"learning_rate": 1.9890791338866344e-05,
"loss": 0.5771,
"step": 920
},
{
"epoch": 0.07578687512857436,
"grad_norm": 3.2104258542562953,
"learning_rate": 1.9890398160710837e-05,
"loss": 0.9337,
"step": 921
},
{
"epoch": 0.0758691627237194,
"grad_norm": 0.48633325822320617,
"learning_rate": 1.9890004279957266e-05,
"loss": 0.5602,
"step": 922
},
{
"epoch": 0.07595145031886442,
"grad_norm": 12.835475358323716,
"learning_rate": 1.9889609696633606e-05,
"loss": 0.8553,
"step": 923
},
{
"epoch": 0.07603373791400947,
"grad_norm": 3.2124511867282037,
"learning_rate": 1.9889214410767887e-05,
"loss": 0.8674,
"step": 924
},
{
"epoch": 0.0761160255091545,
"grad_norm": 2.904116877033008,
"learning_rate": 1.9888818422388193e-05,
"loss": 0.8747,
"step": 925
},
{
"epoch": 0.07619831310429953,
"grad_norm": 3.157871788078832,
"learning_rate": 1.9888421731522656e-05,
"loss": 0.8891,
"step": 926
},
{
"epoch": 0.07628060069944456,
"grad_norm": 2.3718730999123547,
"learning_rate": 1.9888024338199448e-05,
"loss": 0.8993,
"step": 927
},
{
"epoch": 0.07636288829458959,
"grad_norm": 2.4565769064213723,
"learning_rate": 1.988762624244681e-05,
"loss": 0.9013,
"step": 928
},
{
"epoch": 0.07644517588973462,
"grad_norm": 2.540968098318489,
"learning_rate": 1.988722744429301e-05,
"loss": 0.8633,
"step": 929
},
{
"epoch": 0.07652746348487965,
"grad_norm": 3.56518007003656,
"learning_rate": 1.988682794376639e-05,
"loss": 0.8882,
"step": 930
},
{
"epoch": 0.07660975108002469,
"grad_norm": 2.176182910474906,
"learning_rate": 1.9886427740895325e-05,
"loss": 0.9149,
"step": 931
},
{
"epoch": 0.07669203867516972,
"grad_norm": 0.5807290241092793,
"learning_rate": 1.9886026835708242e-05,
"loss": 0.5897,
"step": 932
},
{
"epoch": 0.07677432627031475,
"grad_norm": 0.5568253540494434,
"learning_rate": 1.9885625228233624e-05,
"loss": 0.5944,
"step": 933
},
{
"epoch": 0.07685661386545978,
"grad_norm": 0.46307351633355415,
"learning_rate": 1.9885222918499998e-05,
"loss": 0.5687,
"step": 934
},
{
"epoch": 0.07693890146060481,
"grad_norm": 2.21686936101954,
"learning_rate": 1.9884819906535946e-05,
"loss": 0.899,
"step": 935
},
{
"epoch": 0.07702118905574984,
"grad_norm": 2.7051990886793758,
"learning_rate": 1.9884416192370096e-05,
"loss": 0.9015,
"step": 936
},
{
"epoch": 0.07710347665089488,
"grad_norm": 2.1375647901334385,
"learning_rate": 1.988401177603113e-05,
"loss": 0.9001,
"step": 937
},
{
"epoch": 0.07718576424603991,
"grad_norm": 4.132265546672556,
"learning_rate": 1.988360665754777e-05,
"loss": 0.8908,
"step": 938
},
{
"epoch": 0.07726805184118495,
"grad_norm": 2.1359019957192533,
"learning_rate": 1.9883200836948803e-05,
"loss": 0.8717,
"step": 939
},
{
"epoch": 0.07735033943632998,
"grad_norm": 3.9513646854514386,
"learning_rate": 1.9882794314263053e-05,
"loss": 0.8718,
"step": 940
},
{
"epoch": 0.07743262703147501,
"grad_norm": 2.321609974282721,
"learning_rate": 1.9882387089519398e-05,
"loss": 0.869,
"step": 941
},
{
"epoch": 0.07751491462662004,
"grad_norm": 3.70309268916697,
"learning_rate": 1.9881979162746772e-05,
"loss": 0.8649,
"step": 942
},
{
"epoch": 0.07759720222176507,
"grad_norm": 3.361767416529052,
"learning_rate": 1.9881570533974148e-05,
"loss": 0.8683,
"step": 943
},
{
"epoch": 0.0776794898169101,
"grad_norm": 3.4179325921845036,
"learning_rate": 1.988116120323056e-05,
"loss": 0.8963,
"step": 944
},
{
"epoch": 0.07776177741205513,
"grad_norm": 3.021751145368183,
"learning_rate": 1.988075117054508e-05,
"loss": 0.8746,
"step": 945
},
{
"epoch": 0.07784406500720016,
"grad_norm": 3.5878829514900974,
"learning_rate": 1.9880340435946837e-05,
"loss": 0.8516,
"step": 946
},
{
"epoch": 0.07792635260234519,
"grad_norm": 1.920072678794743,
"learning_rate": 1.9879928999465016e-05,
"loss": 0.8937,
"step": 947
},
{
"epoch": 0.07800864019749022,
"grad_norm": 2.2091268186489796,
"learning_rate": 1.9879516861128835e-05,
"loss": 0.8475,
"step": 948
},
{
"epoch": 0.07809092779263525,
"grad_norm": 2.2168445139505644,
"learning_rate": 1.9879104020967577e-05,
"loss": 0.8633,
"step": 949
},
{
"epoch": 0.0781732153877803,
"grad_norm": 1.0323698606460356,
"learning_rate": 1.9878690479010568e-05,
"loss": 0.6111,
"step": 950
},
{
"epoch": 0.07825550298292533,
"grad_norm": 2.682420816107399,
"learning_rate": 1.987827623528719e-05,
"loss": 0.9341,
"step": 951
},
{
"epoch": 0.07833779057807036,
"grad_norm": 0.6240540448167275,
"learning_rate": 1.987786128982686e-05,
"loss": 0.5523,
"step": 952
},
{
"epoch": 0.07842007817321539,
"grad_norm": 3.6752862094905905,
"learning_rate": 1.9877445642659066e-05,
"loss": 0.9273,
"step": 953
},
{
"epoch": 0.07850236576836042,
"grad_norm": 2.3734201750601858,
"learning_rate": 1.987702929381333e-05,
"loss": 0.8919,
"step": 954
},
{
"epoch": 0.07858465336350545,
"grad_norm": 0.7387548503010232,
"learning_rate": 1.9876612243319228e-05,
"loss": 0.5746,
"step": 955
},
{
"epoch": 0.07866694095865048,
"grad_norm": 0.6959735516945202,
"learning_rate": 1.9876194491206388e-05,
"loss": 0.5751,
"step": 956
},
{
"epoch": 0.07874922855379551,
"grad_norm": 2.1882974936345394,
"learning_rate": 1.9875776037504482e-05,
"loss": 0.9006,
"step": 957
},
{
"epoch": 0.07883151614894054,
"grad_norm": 2.341847998608011,
"learning_rate": 1.9875356882243245e-05,
"loss": 0.9041,
"step": 958
},
{
"epoch": 0.07891380374408558,
"grad_norm": 2.1628210206575433,
"learning_rate": 1.9874937025452445e-05,
"loss": 0.8883,
"step": 959
},
{
"epoch": 0.0789960913392306,
"grad_norm": 2.8510221399462483,
"learning_rate": 1.9874516467161914e-05,
"loss": 0.9231,
"step": 960
},
{
"epoch": 0.07907837893437564,
"grad_norm": 4.694838855869676,
"learning_rate": 1.9874095207401526e-05,
"loss": 0.9156,
"step": 961
},
{
"epoch": 0.07916066652952067,
"grad_norm": 2.877307386668155,
"learning_rate": 1.98736732462012e-05,
"loss": 0.8686,
"step": 962
},
{
"epoch": 0.07924295412466571,
"grad_norm": 2.581259841624273,
"learning_rate": 1.9873250583590923e-05,
"loss": 0.9125,
"step": 963
},
{
"epoch": 0.07932524171981074,
"grad_norm": 2.3158798477006037,
"learning_rate": 1.9872827219600716e-05,
"loss": 0.8926,
"step": 964
},
{
"epoch": 0.07940752931495577,
"grad_norm": 3.0098712265326784,
"learning_rate": 1.987240315426065e-05,
"loss": 0.8758,
"step": 965
},
{
"epoch": 0.0794898169101008,
"grad_norm": 3.1422180864323233,
"learning_rate": 1.987197838760085e-05,
"loss": 0.8908,
"step": 966
},
{
"epoch": 0.07957210450524584,
"grad_norm": 0.9645131727703571,
"learning_rate": 1.9871552919651494e-05,
"loss": 0.6045,
"step": 967
},
{
"epoch": 0.07965439210039087,
"grad_norm": 3.56520313826412,
"learning_rate": 1.9871126750442807e-05,
"loss": 0.8696,
"step": 968
},
{
"epoch": 0.0797366796955359,
"grad_norm": 2.0059409411059113,
"learning_rate": 1.9870699880005063e-05,
"loss": 0.8799,
"step": 969
},
{
"epoch": 0.07981896729068093,
"grad_norm": 4.983123742682501,
"learning_rate": 1.9870272308368584e-05,
"loss": 0.8693,
"step": 970
},
{
"epoch": 0.07990125488582596,
"grad_norm": 2.1182309366583474,
"learning_rate": 1.9869844035563747e-05,
"loss": 0.8649,
"step": 971
},
{
"epoch": 0.07998354248097099,
"grad_norm": 2.157976641839583,
"learning_rate": 1.986941506162097e-05,
"loss": 0.8844,
"step": 972
},
{
"epoch": 0.08006583007611602,
"grad_norm": 3.1179516322271117,
"learning_rate": 1.9868985386570734e-05,
"loss": 0.8702,
"step": 973
},
{
"epoch": 0.08014811767126105,
"grad_norm": 2.1804704549093246,
"learning_rate": 1.986855501044356e-05,
"loss": 0.8963,
"step": 974
},
{
"epoch": 0.08023040526640608,
"grad_norm": 2.825665735780858,
"learning_rate": 1.986812393327002e-05,
"loss": 0.9028,
"step": 975
},
{
"epoch": 0.08031269286155113,
"grad_norm": 2.7064578154820276,
"learning_rate": 1.9867692155080736e-05,
"loss": 0.8922,
"step": 976
},
{
"epoch": 0.08039498045669616,
"grad_norm": 4.940848988099329,
"learning_rate": 1.9867259675906383e-05,
"loss": 0.9096,
"step": 977
},
{
"epoch": 0.08047726805184119,
"grad_norm": 3.7159663449631943,
"learning_rate": 1.9866826495777683e-05,
"loss": 0.8946,
"step": 978
},
{
"epoch": 0.08055955564698622,
"grad_norm": 4.235722900766384,
"learning_rate": 1.9866392614725408e-05,
"loss": 0.8844,
"step": 979
},
{
"epoch": 0.08064184324213125,
"grad_norm": 2.5725805077545796,
"learning_rate": 1.9865958032780383e-05,
"loss": 0.8849,
"step": 980
},
{
"epoch": 0.08072413083727628,
"grad_norm": 3.2900229009140367,
"learning_rate": 1.986552274997348e-05,
"loss": 0.8712,
"step": 981
},
{
"epoch": 0.08080641843242131,
"grad_norm": 2.7018112393037206,
"learning_rate": 1.986508676633561e-05,
"loss": 0.881,
"step": 982
},
{
"epoch": 0.08088870602756634,
"grad_norm": 3.2565064868257356,
"learning_rate": 1.986465008189776e-05,
"loss": 0.8741,
"step": 983
},
{
"epoch": 0.08097099362271137,
"grad_norm": 2.977427479800942,
"learning_rate": 1.986421269669094e-05,
"loss": 0.864,
"step": 984
},
{
"epoch": 0.0810532812178564,
"grad_norm": 2.8391838913702734,
"learning_rate": 1.986377461074623e-05,
"loss": 0.8777,
"step": 985
},
{
"epoch": 0.08113556881300144,
"grad_norm": 2.228144074432828,
"learning_rate": 1.9863335824094742e-05,
"loss": 0.8873,
"step": 986
},
{
"epoch": 0.08121785640814647,
"grad_norm": 2.6153835393886444,
"learning_rate": 1.9862896336767654e-05,
"loss": 0.8565,
"step": 987
},
{
"epoch": 0.08130014400329151,
"grad_norm": 2.469488378896095,
"learning_rate": 1.9862456148796182e-05,
"loss": 0.9062,
"step": 988
},
{
"epoch": 0.08138243159843654,
"grad_norm": 0.9008951474609029,
"learning_rate": 1.98620152602116e-05,
"loss": 0.5855,
"step": 989
},
{
"epoch": 0.08146471919358157,
"grad_norm": 3.1010964992276335,
"learning_rate": 1.986157367104522e-05,
"loss": 0.8901,
"step": 990
},
{
"epoch": 0.0815470067887266,
"grad_norm": 2.745575020455269,
"learning_rate": 1.9861131381328422e-05,
"loss": 0.8992,
"step": 991
},
{
"epoch": 0.08162929438387163,
"grad_norm": 2.319333762749616,
"learning_rate": 1.9860688391092623e-05,
"loss": 0.8489,
"step": 992
},
{
"epoch": 0.08171158197901666,
"grad_norm": 1.8701951574677815,
"learning_rate": 1.9860244700369288e-05,
"loss": 0.8895,
"step": 993
},
{
"epoch": 0.0817938695741617,
"grad_norm": 2.4973895580746928,
"learning_rate": 1.985980030918994e-05,
"loss": 0.8414,
"step": 994
},
{
"epoch": 0.08187615716930673,
"grad_norm": 2.542292639884159,
"learning_rate": 1.9859355217586144e-05,
"loss": 0.8865,
"step": 995
},
{
"epoch": 0.08195844476445176,
"grad_norm": 0.5992255264191748,
"learning_rate": 1.9858909425589524e-05,
"loss": 0.5575,
"step": 996
},
{
"epoch": 0.08204073235959679,
"grad_norm": 2.143472686925439,
"learning_rate": 1.9858462933231742e-05,
"loss": 0.8543,
"step": 997
},
{
"epoch": 0.08212301995474182,
"grad_norm": 2.49083696229216,
"learning_rate": 1.9858015740544524e-05,
"loss": 0.8961,
"step": 998
},
{
"epoch": 0.08220530754988685,
"grad_norm": 5.032363107017064,
"learning_rate": 1.985756784755963e-05,
"loss": 0.869,
"step": 999
},
{
"epoch": 0.08228759514503188,
"grad_norm": 3.456646347683982,
"learning_rate": 1.9857119254308885e-05,
"loss": 0.868,
"step": 1000
},
{
"epoch": 0.08236988274017693,
"grad_norm": 3.7630419410589755,
"learning_rate": 1.9856669960824147e-05,
"loss": 0.9249,
"step": 1001
},
{
"epoch": 0.08245217033532196,
"grad_norm": 3.1625549709552994,
"learning_rate": 1.985621996713734e-05,
"loss": 0.8869,
"step": 1002
},
{
"epoch": 0.08253445793046699,
"grad_norm": 3.881507636381793,
"learning_rate": 1.985576927328043e-05,
"loss": 0.888,
"step": 1003
},
{
"epoch": 0.08261674552561202,
"grad_norm": 2.544247409259161,
"learning_rate": 1.9855317879285434e-05,
"loss": 0.8715,
"step": 1004
},
{
"epoch": 0.08269903312075705,
"grad_norm": 2.5279916413903583,
"learning_rate": 1.9854865785184417e-05,
"loss": 0.8849,
"step": 1005
},
{
"epoch": 0.08278132071590208,
"grad_norm": 3.4196695037594576,
"learning_rate": 1.9854412991009494e-05,
"loss": 0.8364,
"step": 1006
},
{
"epoch": 0.08286360831104711,
"grad_norm": 2.759961086631554,
"learning_rate": 1.985395949679283e-05,
"loss": 0.854,
"step": 1007
},
{
"epoch": 0.08294589590619214,
"grad_norm": 0.5731316878529051,
"learning_rate": 1.9853505302566646e-05,
"loss": 0.6152,
"step": 1008
},
{
"epoch": 0.08302818350133717,
"grad_norm": 2.9549671685361525,
"learning_rate": 1.98530504083632e-05,
"loss": 0.861,
"step": 1009
},
{
"epoch": 0.0831104710964822,
"grad_norm": 2.3193711696281025,
"learning_rate": 1.9852594814214812e-05,
"loss": 0.865,
"step": 1010
},
{
"epoch": 0.08319275869162723,
"grad_norm": 3.0076758009209636,
"learning_rate": 1.9852138520153846e-05,
"loss": 0.8852,
"step": 1011
},
{
"epoch": 0.08327504628677226,
"grad_norm": 2.732008977686221,
"learning_rate": 1.9851681526212716e-05,
"loss": 0.8928,
"step": 1012
},
{
"epoch": 0.0833573338819173,
"grad_norm": 2.37950207279815,
"learning_rate": 1.9851223832423886e-05,
"loss": 0.8617,
"step": 1013
},
{
"epoch": 0.08343962147706234,
"grad_norm": 2.464424002675186,
"learning_rate": 1.985076543881987e-05,
"loss": 0.8625,
"step": 1014
},
{
"epoch": 0.08352190907220737,
"grad_norm": 2.9080302916718015,
"learning_rate": 1.985030634543323e-05,
"loss": 0.8832,
"step": 1015
},
{
"epoch": 0.0836041966673524,
"grad_norm": 2.6287476224799655,
"learning_rate": 1.984984655229658e-05,
"loss": 0.8728,
"step": 1016
},
{
"epoch": 0.08368648426249743,
"grad_norm": 2.5936175763493052,
"learning_rate": 1.9849386059442585e-05,
"loss": 0.8678,
"step": 1017
},
{
"epoch": 0.08376877185764246,
"grad_norm": 2.3604963235792904,
"learning_rate": 1.9848924866903955e-05,
"loss": 0.8783,
"step": 1018
},
{
"epoch": 0.0838510594527875,
"grad_norm": 0.5341112663835049,
"learning_rate": 1.984846297471345e-05,
"loss": 0.605,
"step": 1019
},
{
"epoch": 0.08393334704793252,
"grad_norm": 2.9860218730439057,
"learning_rate": 1.984800038290389e-05,
"loss": 0.8525,
"step": 1020
},
{
"epoch": 0.08401563464307756,
"grad_norm": 2.4630212214875025,
"learning_rate": 1.9847537091508134e-05,
"loss": 0.8825,
"step": 1021
},
{
"epoch": 0.08409792223822259,
"grad_norm": 2.424908485494412,
"learning_rate": 1.984707310055909e-05,
"loss": 0.891,
"step": 1022
},
{
"epoch": 0.08418020983336762,
"grad_norm": 2.886480910540036,
"learning_rate": 1.984660841008972e-05,
"loss": 0.8935,
"step": 1023
},
{
"epoch": 0.08426249742851265,
"grad_norm": 2.4246756718684384,
"learning_rate": 1.9846143020133035e-05,
"loss": 0.8679,
"step": 1024
},
{
"epoch": 0.08434478502365768,
"grad_norm": 4.020038177987053,
"learning_rate": 1.98456769307221e-05,
"loss": 0.8191,
"step": 1025
},
{
"epoch": 0.08442707261880271,
"grad_norm": 2.6823999549769795,
"learning_rate": 1.9845210141890018e-05,
"loss": 0.8618,
"step": 1026
},
{
"epoch": 0.08450936021394775,
"grad_norm": 2.2350487266641035,
"learning_rate": 1.9844742653669953e-05,
"loss": 0.8595,
"step": 1027
},
{
"epoch": 0.08459164780909278,
"grad_norm": 4.977761117586025,
"learning_rate": 1.9844274466095117e-05,
"loss": 0.8516,
"step": 1028
},
{
"epoch": 0.08467393540423782,
"grad_norm": 3.31805191100729,
"learning_rate": 1.9843805579198766e-05,
"loss": 0.8636,
"step": 1029
},
{
"epoch": 0.08475622299938285,
"grad_norm": 2.5881873279624648,
"learning_rate": 1.9843335993014206e-05,
"loss": 0.8667,
"step": 1030
},
{
"epoch": 0.08483851059452788,
"grad_norm": 3.9560157884462,
"learning_rate": 1.98428657075748e-05,
"loss": 0.8799,
"step": 1031
},
{
"epoch": 0.08492079818967291,
"grad_norm": 2.5965271671259753,
"learning_rate": 1.984239472291396e-05,
"loss": 0.8714,
"step": 1032
},
{
"epoch": 0.08500308578481794,
"grad_norm": 2.9384162786300094,
"learning_rate": 1.9841923039065136e-05,
"loss": 0.8784,
"step": 1033
},
{
"epoch": 0.08508537337996297,
"grad_norm": 4.575841979886102,
"learning_rate": 1.984145065606184e-05,
"loss": 0.871,
"step": 1034
},
{
"epoch": 0.085167660975108,
"grad_norm": 2.6762798398130205,
"learning_rate": 1.984097757393763e-05,
"loss": 0.8884,
"step": 1035
},
{
"epoch": 0.08524994857025303,
"grad_norm": 2.3317749715867757,
"learning_rate": 1.9840503792726107e-05,
"loss": 0.8582,
"step": 1036
},
{
"epoch": 0.08533223616539806,
"grad_norm": 2.5192408862448925,
"learning_rate": 1.9840029312460936e-05,
"loss": 0.8987,
"step": 1037
},
{
"epoch": 0.08541452376054309,
"grad_norm": 3.0314447963476954,
"learning_rate": 1.9839554133175815e-05,
"loss": 0.9115,
"step": 1038
},
{
"epoch": 0.08549681135568812,
"grad_norm": 2.718611923577393,
"learning_rate": 1.983907825490451e-05,
"loss": 0.8768,
"step": 1039
},
{
"epoch": 0.08557909895083317,
"grad_norm": 3.2506331598038063,
"learning_rate": 1.9838601677680818e-05,
"loss": 0.8892,
"step": 1040
},
{
"epoch": 0.0856613865459782,
"grad_norm": 2.8785960552339844,
"learning_rate": 1.9838124401538596e-05,
"loss": 0.8762,
"step": 1041
},
{
"epoch": 0.08574367414112323,
"grad_norm": 3.255205364224761,
"learning_rate": 1.9837646426511755e-05,
"loss": 0.8878,
"step": 1042
},
{
"epoch": 0.08582596173626826,
"grad_norm": 2.152447959926313,
"learning_rate": 1.9837167752634243e-05,
"loss": 0.8939,
"step": 1043
},
{
"epoch": 0.08590824933141329,
"grad_norm": 6.038167525170103,
"learning_rate": 1.983668837994006e-05,
"loss": 0.854,
"step": 1044
},
{
"epoch": 0.08599053692655832,
"grad_norm": 2.4872882270608296,
"learning_rate": 1.983620830846327e-05,
"loss": 0.865,
"step": 1045
},
{
"epoch": 0.08607282452170335,
"grad_norm": 5.0878964623293905,
"learning_rate": 1.9835727538237977e-05,
"loss": 0.8848,
"step": 1046
},
{
"epoch": 0.08615511211684838,
"grad_norm": 0.5466809522376739,
"learning_rate": 1.9835246069298325e-05,
"loss": 0.5879,
"step": 1047
},
{
"epoch": 0.08623739971199341,
"grad_norm": 2.8930059060138134,
"learning_rate": 1.9834763901678523e-05,
"loss": 0.9032,
"step": 1048
},
{
"epoch": 0.08631968730713845,
"grad_norm": 3.481150201855255,
"learning_rate": 1.983428103541282e-05,
"loss": 0.895,
"step": 1049
},
{
"epoch": 0.08640197490228348,
"grad_norm": 2.2668611618771806,
"learning_rate": 1.983379747053552e-05,
"loss": 0.8841,
"step": 1050
},
{
"epoch": 0.08648426249742851,
"grad_norm": 0.5012767267519984,
"learning_rate": 1.9833313207080976e-05,
"loss": 0.5584,
"step": 1051
},
{
"epoch": 0.08656655009257354,
"grad_norm": 4.03230401593853,
"learning_rate": 1.983282824508359e-05,
"loss": 0.8722,
"step": 1052
},
{
"epoch": 0.08664883768771858,
"grad_norm": 3.2238027639613662,
"learning_rate": 1.9832342584577808e-05,
"loss": 0.9061,
"step": 1053
},
{
"epoch": 0.08673112528286361,
"grad_norm": 2.5875473888993827,
"learning_rate": 1.9831856225598134e-05,
"loss": 0.8655,
"step": 1054
},
{
"epoch": 0.08681341287800864,
"grad_norm": 2.9531227295823435,
"learning_rate": 1.9831369168179116e-05,
"loss": 0.9014,
"step": 1055
},
{
"epoch": 0.08689570047315368,
"grad_norm": 3.2403950768604273,
"learning_rate": 1.9830881412355356e-05,
"loss": 0.8802,
"step": 1056
},
{
"epoch": 0.0869779880682987,
"grad_norm": 2.6421330385224406,
"learning_rate": 1.9830392958161505e-05,
"loss": 0.8624,
"step": 1057
},
{
"epoch": 0.08706027566344374,
"grad_norm": 2.796247945415367,
"learning_rate": 1.9829903805632257e-05,
"loss": 0.8465,
"step": 1058
},
{
"epoch": 0.08714256325858877,
"grad_norm": 0.5356691167104551,
"learning_rate": 1.982941395480236e-05,
"loss": 0.5749,
"step": 1059
},
{
"epoch": 0.0872248508537338,
"grad_norm": 2.543782162970702,
"learning_rate": 1.9828923405706622e-05,
"loss": 0.8651,
"step": 1060
},
{
"epoch": 0.08730713844887883,
"grad_norm": 5.052374438346327,
"learning_rate": 1.982843215837988e-05,
"loss": 0.8556,
"step": 1061
},
{
"epoch": 0.08738942604402386,
"grad_norm": 2.709282429422679,
"learning_rate": 1.9827940212857038e-05,
"loss": 0.8739,
"step": 1062
},
{
"epoch": 0.08747171363916889,
"grad_norm": 12.014153200069254,
"learning_rate": 1.982744756917304e-05,
"loss": 0.8685,
"step": 1063
},
{
"epoch": 0.08755400123431392,
"grad_norm": 4.7874082941622875,
"learning_rate": 1.9826954227362883e-05,
"loss": 0.8968,
"step": 1064
},
{
"epoch": 0.08763628882945895,
"grad_norm": 3.094799934600602,
"learning_rate": 1.9826460187461616e-05,
"loss": 0.8678,
"step": 1065
},
{
"epoch": 0.087718576424604,
"grad_norm": 2.2422659009449664,
"learning_rate": 1.982596544950433e-05,
"loss": 0.8764,
"step": 1066
},
{
"epoch": 0.08780086401974903,
"grad_norm": 3.436687255418153,
"learning_rate": 1.982547001352617e-05,
"loss": 0.8516,
"step": 1067
},
{
"epoch": 0.08788315161489406,
"grad_norm": 0.4947838359746663,
"learning_rate": 1.982497387956234e-05,
"loss": 0.5591,
"step": 1068
},
{
"epoch": 0.08796543921003909,
"grad_norm": 2.6289534390817098,
"learning_rate": 1.9824477047648073e-05,
"loss": 0.8481,
"step": 1069
},
{
"epoch": 0.08804772680518412,
"grad_norm": 0.4837575812403313,
"learning_rate": 1.9823979517818672e-05,
"loss": 0.5778,
"step": 1070
},
{
"epoch": 0.08813001440032915,
"grad_norm": 3.538024856422455,
"learning_rate": 1.9823481290109478e-05,
"loss": 0.8619,
"step": 1071
},
{
"epoch": 0.08821230199547418,
"grad_norm": 4.321407175482124,
"learning_rate": 1.982298236455588e-05,
"loss": 0.8846,
"step": 1072
},
{
"epoch": 0.08829458959061921,
"grad_norm": 3.616450253072054,
"learning_rate": 1.9822482741193324e-05,
"loss": 0.8856,
"step": 1073
},
{
"epoch": 0.08837687718576424,
"grad_norm": 4.473435045577941,
"learning_rate": 1.9821982420057308e-05,
"loss": 0.8608,
"step": 1074
},
{
"epoch": 0.08845916478090927,
"grad_norm": 0.5344599795616546,
"learning_rate": 1.9821481401183364e-05,
"loss": 0.5741,
"step": 1075
},
{
"epoch": 0.0885414523760543,
"grad_norm": 3.608389298386541,
"learning_rate": 1.982097968460709e-05,
"loss": 0.8832,
"step": 1076
},
{
"epoch": 0.08862373997119934,
"grad_norm": 4.223422665021111,
"learning_rate": 1.9820477270364123e-05,
"loss": 0.8854,
"step": 1077
},
{
"epoch": 0.08870602756634437,
"grad_norm": 3.236757188788279,
"learning_rate": 1.981997415849016e-05,
"loss": 0.8727,
"step": 1078
},
{
"epoch": 0.08878831516148941,
"grad_norm": 0.5297374533084104,
"learning_rate": 1.9819470349020936e-05,
"loss": 0.5883,
"step": 1079
},
{
"epoch": 0.08887060275663444,
"grad_norm": 2.8725890412006656,
"learning_rate": 1.9818965841992243e-05,
"loss": 0.8719,
"step": 1080
},
{
"epoch": 0.08895289035177947,
"grad_norm": 0.4917914943060142,
"learning_rate": 1.9818460637439917e-05,
"loss": 0.5497,
"step": 1081
},
{
"epoch": 0.0890351779469245,
"grad_norm": 3.666129989863918,
"learning_rate": 1.9817954735399853e-05,
"loss": 0.855,
"step": 1082
},
{
"epoch": 0.08911746554206953,
"grad_norm": 3.667558282780085,
"learning_rate": 1.9817448135907984e-05,
"loss": 0.8618,
"step": 1083
},
{
"epoch": 0.08919975313721457,
"grad_norm": 2.8134358753083597,
"learning_rate": 1.9816940839000303e-05,
"loss": 0.8639,
"step": 1084
},
{
"epoch": 0.0892820407323596,
"grad_norm": 3.8554001706730907,
"learning_rate": 1.981643284471284e-05,
"loss": 0.8449,
"step": 1085
},
{
"epoch": 0.08936432832750463,
"grad_norm": 3.767364747903415,
"learning_rate": 1.981592415308169e-05,
"loss": 0.8549,
"step": 1086
},
{
"epoch": 0.08944661592264966,
"grad_norm": 2.8398571302805453,
"learning_rate": 1.9815414764142986e-05,
"loss": 0.8735,
"step": 1087
},
{
"epoch": 0.08952890351779469,
"grad_norm": 2.980261363247237,
"learning_rate": 1.9814904677932912e-05,
"loss": 0.8725,
"step": 1088
},
{
"epoch": 0.08961119111293972,
"grad_norm": 3.7219107197197916,
"learning_rate": 1.9814393894487713e-05,
"loss": 0.9151,
"step": 1089
},
{
"epoch": 0.08969347870808475,
"grad_norm": 4.035211371174713,
"learning_rate": 1.981388241384366e-05,
"loss": 0.8825,
"step": 1090
},
{
"epoch": 0.08977576630322978,
"grad_norm": 3.053085785512212,
"learning_rate": 1.9813370236037098e-05,
"loss": 0.8497,
"step": 1091
},
{
"epoch": 0.08985805389837483,
"grad_norm": 0.5368604454434628,
"learning_rate": 1.981285736110441e-05,
"loss": 0.5812,
"step": 1092
},
{
"epoch": 0.08994034149351986,
"grad_norm": 4.355844807027429,
"learning_rate": 1.981234378908203e-05,
"loss": 0.8887,
"step": 1093
},
{
"epoch": 0.09002262908866489,
"grad_norm": 2.649968557975437,
"learning_rate": 1.9811829520006433e-05,
"loss": 0.8415,
"step": 1094
},
{
"epoch": 0.09010491668380992,
"grad_norm": 3.4417587859008214,
"learning_rate": 1.9811314553914166e-05,
"loss": 0.8685,
"step": 1095
},
{
"epoch": 0.09018720427895495,
"grad_norm": 0.48295286929932113,
"learning_rate": 1.98107988908418e-05,
"loss": 0.5608,
"step": 1096
},
{
"epoch": 0.09026949187409998,
"grad_norm": 4.948234702126818,
"learning_rate": 1.981028253082597e-05,
"loss": 0.8638,
"step": 1097
},
{
"epoch": 0.09035177946924501,
"grad_norm": 2.8257336957776733,
"learning_rate": 1.9809765473903362e-05,
"loss": 0.8402,
"step": 1098
},
{
"epoch": 0.09043406706439004,
"grad_norm": 0.48328014205289604,
"learning_rate": 1.98092477201107e-05,
"loss": 0.5797,
"step": 1099
},
{
"epoch": 0.09051635465953507,
"grad_norm": 3.1346349138814418,
"learning_rate": 1.980872926948477e-05,
"loss": 0.8675,
"step": 1100
},
{
"epoch": 0.0905986422546801,
"grad_norm": 2.707381646623277,
"learning_rate": 1.9808210122062396e-05,
"loss": 0.8588,
"step": 1101
},
{
"epoch": 0.09068092984982513,
"grad_norm": 0.4754150829561111,
"learning_rate": 1.9807690277880464e-05,
"loss": 0.5962,
"step": 1102
},
{
"epoch": 0.09076321744497017,
"grad_norm": 3.2149488041323946,
"learning_rate": 1.98071697369759e-05,
"loss": 0.849,
"step": 1103
},
{
"epoch": 0.0908455050401152,
"grad_norm": 3.1468421046064887,
"learning_rate": 1.9806648499385678e-05,
"loss": 0.8525,
"step": 1104
},
{
"epoch": 0.09092779263526024,
"grad_norm": 3.011551334891878,
"learning_rate": 1.9806126565146835e-05,
"loss": 0.862,
"step": 1105
},
{
"epoch": 0.09101008023040527,
"grad_norm": 3.7542041127163235,
"learning_rate": 1.980560393429644e-05,
"loss": 0.878,
"step": 1106
},
{
"epoch": 0.0910923678255503,
"grad_norm": 3.924675309445745,
"learning_rate": 1.9805080606871625e-05,
"loss": 0.8932,
"step": 1107
},
{
"epoch": 0.09117465542069533,
"grad_norm": 3.149434195229172,
"learning_rate": 1.980455658290956e-05,
"loss": 0.8968,
"step": 1108
},
{
"epoch": 0.09125694301584036,
"grad_norm": 0.4528941005660691,
"learning_rate": 1.9804031862447483e-05,
"loss": 0.5658,
"step": 1109
},
{
"epoch": 0.0913392306109854,
"grad_norm": 3.2710296854560688,
"learning_rate": 1.9803506445522658e-05,
"loss": 0.8739,
"step": 1110
},
{
"epoch": 0.09142151820613043,
"grad_norm": 0.48322757491755364,
"learning_rate": 1.9802980332172415e-05,
"loss": 0.592,
"step": 1111
},
{
"epoch": 0.09150380580127546,
"grad_norm": 3.600092282955291,
"learning_rate": 1.9802453522434123e-05,
"loss": 0.8524,
"step": 1112
},
{
"epoch": 0.09158609339642049,
"grad_norm": 3.7142303319750773,
"learning_rate": 1.980192601634521e-05,
"loss": 0.8811,
"step": 1113
},
{
"epoch": 0.09166838099156552,
"grad_norm": 3.133621188104266,
"learning_rate": 1.9801397813943156e-05,
"loss": 0.8937,
"step": 1114
},
{
"epoch": 0.09175066858671055,
"grad_norm": 5.265940334189566,
"learning_rate": 1.980086891526547e-05,
"loss": 0.8761,
"step": 1115
},
{
"epoch": 0.09183295618185558,
"grad_norm": 0.5062751751465183,
"learning_rate": 1.9800339320349732e-05,
"loss": 0.5516,
"step": 1116
},
{
"epoch": 0.09191524377700061,
"grad_norm": 3.772473804543901,
"learning_rate": 1.9799809029233558e-05,
"loss": 0.8375,
"step": 1117
},
{
"epoch": 0.09199753137214566,
"grad_norm": 3.8490743801526803,
"learning_rate": 1.9799278041954628e-05,
"loss": 0.877,
"step": 1118
},
{
"epoch": 0.09207981896729069,
"grad_norm": 3.5820410192444174,
"learning_rate": 1.9798746358550656e-05,
"loss": 0.8833,
"step": 1119
},
{
"epoch": 0.09216210656243572,
"grad_norm": 8.839295550642253,
"learning_rate": 1.9798213979059412e-05,
"loss": 0.8553,
"step": 1120
},
{
"epoch": 0.09224439415758075,
"grad_norm": 3.7706882959014205,
"learning_rate": 1.979768090351872e-05,
"loss": 0.8564,
"step": 1121
},
{
"epoch": 0.09232668175272578,
"grad_norm": 4.312690219016083,
"learning_rate": 1.9797147131966445e-05,
"loss": 0.8605,
"step": 1122
},
{
"epoch": 0.09240896934787081,
"grad_norm": 6.342821693734463,
"learning_rate": 1.9796612664440503e-05,
"loss": 0.8863,
"step": 1123
},
{
"epoch": 0.09249125694301584,
"grad_norm": 3.480039566309057,
"learning_rate": 1.979607750097887e-05,
"loss": 0.8676,
"step": 1124
},
{
"epoch": 0.09257354453816087,
"grad_norm": 0.5209974485249531,
"learning_rate": 1.9795541641619552e-05,
"loss": 0.6128,
"step": 1125
},
{
"epoch": 0.0926558321333059,
"grad_norm": 3.0644541451290106,
"learning_rate": 1.9795005086400623e-05,
"loss": 0.8596,
"step": 1126
},
{
"epoch": 0.09273811972845093,
"grad_norm": 4.0339545836639585,
"learning_rate": 1.9794467835360198e-05,
"loss": 0.8956,
"step": 1127
},
{
"epoch": 0.09282040732359596,
"grad_norm": 3.606396064787203,
"learning_rate": 1.9793929888536443e-05,
"loss": 0.8446,
"step": 1128
},
{
"epoch": 0.092902694918741,
"grad_norm": 3.266963278351553,
"learning_rate": 1.979339124596757e-05,
"loss": 0.8804,
"step": 1129
},
{
"epoch": 0.09298498251388602,
"grad_norm": 4.171351560316691,
"learning_rate": 1.9792851907691847e-05,
"loss": 0.8764,
"step": 1130
},
{
"epoch": 0.09306727010903107,
"grad_norm": 3.1333885189366066,
"learning_rate": 1.9792311873747584e-05,
"loss": 0.8882,
"step": 1131
},
{
"epoch": 0.0931495577041761,
"grad_norm": 4.115748009743592,
"learning_rate": 1.9791771144173146e-05,
"loss": 0.8693,
"step": 1132
},
{
"epoch": 0.09323184529932113,
"grad_norm": 4.248749716560056,
"learning_rate": 1.9791229719006947e-05,
"loss": 0.866,
"step": 1133
},
{
"epoch": 0.09331413289446616,
"grad_norm": 0.5602770220421947,
"learning_rate": 1.979068759828745e-05,
"loss": 0.5729,
"step": 1134
},
{
"epoch": 0.09339642048961119,
"grad_norm": 3.208526975104471,
"learning_rate": 1.979014478205316e-05,
"loss": 0.8447,
"step": 1135
},
{
"epoch": 0.09347870808475622,
"grad_norm": 3.837179354794119,
"learning_rate": 1.978960127034264e-05,
"loss": 0.8395,
"step": 1136
},
{
"epoch": 0.09356099567990125,
"grad_norm": 4.22608442690413,
"learning_rate": 1.9789057063194505e-05,
"loss": 0.8345,
"step": 1137
},
{
"epoch": 0.09364328327504629,
"grad_norm": 4.512917248957414,
"learning_rate": 1.978851216064741e-05,
"loss": 0.8755,
"step": 1138
},
{
"epoch": 0.09372557087019132,
"grad_norm": 4.485181370046995,
"learning_rate": 1.978796656274007e-05,
"loss": 0.9001,
"step": 1139
},
{
"epoch": 0.09380785846533635,
"grad_norm": 4.311526149543538,
"learning_rate": 1.978742026951123e-05,
"loss": 0.8147,
"step": 1140
},
{
"epoch": 0.09389014606048138,
"grad_norm": 3.400869370992463,
"learning_rate": 1.9786873280999716e-05,
"loss": 0.8458,
"step": 1141
},
{
"epoch": 0.09397243365562641,
"grad_norm": 3.484007931145798,
"learning_rate": 1.978632559724437e-05,
"loss": 0.8396,
"step": 1142
},
{
"epoch": 0.09405472125077145,
"grad_norm": 5.974225023368629,
"learning_rate": 1.9785777218284107e-05,
"loss": 0.8544,
"step": 1143
},
{
"epoch": 0.09413700884591648,
"grad_norm": 4.758176933846711,
"learning_rate": 1.978522814415788e-05,
"loss": 0.8738,
"step": 1144
},
{
"epoch": 0.09421929644106151,
"grad_norm": 4.054376339470337,
"learning_rate": 1.9784678374904694e-05,
"loss": 0.8647,
"step": 1145
},
{
"epoch": 0.09430158403620655,
"grad_norm": 3.254256033254886,
"learning_rate": 1.9784127910563606e-05,
"loss": 0.8353,
"step": 1146
},
{
"epoch": 0.09438387163135158,
"grad_norm": 0.5816738083728531,
"learning_rate": 1.978357675117372e-05,
"loss": 0.5812,
"step": 1147
},
{
"epoch": 0.09446615922649661,
"grad_norm": 0.49793035339456754,
"learning_rate": 1.9783024896774187e-05,
"loss": 0.5791,
"step": 1148
},
{
"epoch": 0.09454844682164164,
"grad_norm": 4.179537892792988,
"learning_rate": 1.9782472347404206e-05,
"loss": 0.8907,
"step": 1149
},
{
"epoch": 0.09463073441678667,
"grad_norm": 4.067029184300302,
"learning_rate": 1.978191910310304e-05,
"loss": 0.8541,
"step": 1150
},
{
"epoch": 0.0947130220119317,
"grad_norm": 4.248345665782451,
"learning_rate": 1.9781365163909984e-05,
"loss": 0.8632,
"step": 1151
},
{
"epoch": 0.09479530960707673,
"grad_norm": 6.439138971096778,
"learning_rate": 1.978081052986439e-05,
"loss": 0.8629,
"step": 1152
},
{
"epoch": 0.09487759720222176,
"grad_norm": 6.71298685938902,
"learning_rate": 1.9780255201005656e-05,
"loss": 0.8549,
"step": 1153
},
{
"epoch": 0.09495988479736679,
"grad_norm": 3.967437431624442,
"learning_rate": 1.9779699177373236e-05,
"loss": 0.8732,
"step": 1154
},
{
"epoch": 0.09504217239251182,
"grad_norm": 0.8392360999561069,
"learning_rate": 1.9779142459006626e-05,
"loss": 0.5872,
"step": 1155
},
{
"epoch": 0.09512445998765687,
"grad_norm": 4.657178845971167,
"learning_rate": 1.9778585045945374e-05,
"loss": 0.8495,
"step": 1156
},
{
"epoch": 0.0952067475828019,
"grad_norm": 4.123727952348605,
"learning_rate": 1.977802693822908e-05,
"loss": 0.9142,
"step": 1157
},
{
"epoch": 0.09528903517794693,
"grad_norm": 0.5860758553236142,
"learning_rate": 1.9777468135897387e-05,
"loss": 0.5549,
"step": 1158
},
{
"epoch": 0.09537132277309196,
"grad_norm": 0.5401053295003246,
"learning_rate": 1.9776908638989996e-05,
"loss": 0.5801,
"step": 1159
},
{
"epoch": 0.09545361036823699,
"grad_norm": 0.5496816005625466,
"learning_rate": 1.9776348447546653e-05,
"loss": 0.5839,
"step": 1160
},
{
"epoch": 0.09553589796338202,
"grad_norm": 6.020685438337091,
"learning_rate": 1.977578756160715e-05,
"loss": 0.866,
"step": 1161
},
{
"epoch": 0.09561818555852705,
"grad_norm": 2.792057637957128,
"learning_rate": 1.9775225981211333e-05,
"loss": 0.8638,
"step": 1162
},
{
"epoch": 0.09570047315367208,
"grad_norm": 0.5553177375677683,
"learning_rate": 1.9774663706399092e-05,
"loss": 0.5612,
"step": 1163
},
{
"epoch": 0.09578276074881711,
"grad_norm": 5.245834669495098,
"learning_rate": 1.9774100737210376e-05,
"loss": 0.8688,
"step": 1164
},
{
"epoch": 0.09586504834396214,
"grad_norm": 3.5768926302294344,
"learning_rate": 1.977353707368518e-05,
"loss": 0.897,
"step": 1165
},
{
"epoch": 0.09594733593910718,
"grad_norm": 3.381007087662086,
"learning_rate": 1.9772972715863534e-05,
"loss": 0.8956,
"step": 1166
},
{
"epoch": 0.0960296235342522,
"grad_norm": 4.24711216964703,
"learning_rate": 1.9772407663785538e-05,
"loss": 0.8546,
"step": 1167
},
{
"epoch": 0.09611191112939724,
"grad_norm": 0.5978826180005935,
"learning_rate": 1.977184191749133e-05,
"loss": 0.5658,
"step": 1168
},
{
"epoch": 0.09619419872454228,
"grad_norm": 5.6864731543708285,
"learning_rate": 1.9771275477021102e-05,
"loss": 0.8573,
"step": 1169
},
{
"epoch": 0.09627648631968731,
"grad_norm": 0.5306016735606011,
"learning_rate": 1.9770708342415087e-05,
"loss": 0.5443,
"step": 1170
},
{
"epoch": 0.09635877391483234,
"grad_norm": 3.4108513712835733,
"learning_rate": 1.9770140513713582e-05,
"loss": 0.9162,
"step": 1171
},
{
"epoch": 0.09644106150997737,
"grad_norm": 3.0240876250486775,
"learning_rate": 1.976957199095692e-05,
"loss": 0.8959,
"step": 1172
},
{
"epoch": 0.0965233491051224,
"grad_norm": 4.329264160111276,
"learning_rate": 1.9769002774185483e-05,
"loss": 0.8581,
"step": 1173
},
{
"epoch": 0.09660563670026744,
"grad_norm": 2.8538371301611045,
"learning_rate": 1.9768432863439714e-05,
"loss": 0.8472,
"step": 1174
},
{
"epoch": 0.09668792429541247,
"grad_norm": 4.192529144078922,
"learning_rate": 1.97678622587601e-05,
"loss": 0.8697,
"step": 1175
},
{
"epoch": 0.0967702118905575,
"grad_norm": 3.729038589656874,
"learning_rate": 1.976729096018717e-05,
"loss": 0.8319,
"step": 1176
},
{
"epoch": 0.09685249948570253,
"grad_norm": 0.6437788103093597,
"learning_rate": 1.976671896776151e-05,
"loss": 0.5736,
"step": 1177
},
{
"epoch": 0.09693478708084756,
"grad_norm": 3.9035454070115017,
"learning_rate": 1.9766146281523753e-05,
"loss": 0.8874,
"step": 1178
},
{
"epoch": 0.09701707467599259,
"grad_norm": 3.819713897204886,
"learning_rate": 1.9765572901514583e-05,
"loss": 0.8422,
"step": 1179
},
{
"epoch": 0.09709936227113762,
"grad_norm": 5.277006488684462,
"learning_rate": 1.9764998827774734e-05,
"loss": 0.8849,
"step": 1180
},
{
"epoch": 0.09718164986628265,
"grad_norm": 5.189466257849834,
"learning_rate": 1.9764424060344988e-05,
"loss": 0.8612,
"step": 1181
},
{
"epoch": 0.0972639374614277,
"grad_norm": 3.4415909778873743,
"learning_rate": 1.9763848599266168e-05,
"loss": 0.8649,
"step": 1182
},
{
"epoch": 0.09734622505657273,
"grad_norm": 3.5762421871051,
"learning_rate": 1.976327244457916e-05,
"loss": 0.8643,
"step": 1183
},
{
"epoch": 0.09742851265171776,
"grad_norm": 2.9475630534612116,
"learning_rate": 1.976269559632489e-05,
"loss": 0.8756,
"step": 1184
},
{
"epoch": 0.09751080024686279,
"grad_norm": 2.865959286407617,
"learning_rate": 1.976211805454434e-05,
"loss": 0.8317,
"step": 1185
},
{
"epoch": 0.09759308784200782,
"grad_norm": 0.5278838170529865,
"learning_rate": 1.976153981927853e-05,
"loss": 0.5707,
"step": 1186
},
{
"epoch": 0.09767537543715285,
"grad_norm": 0.5151202226322995,
"learning_rate": 1.976096089056855e-05,
"loss": 0.5589,
"step": 1187
},
{
"epoch": 0.09775766303229788,
"grad_norm": 5.474549135950859,
"learning_rate": 1.9760381268455515e-05,
"loss": 0.8707,
"step": 1188
},
{
"epoch": 0.09783995062744291,
"grad_norm": 2.886942130305931,
"learning_rate": 1.9759800952980604e-05,
"loss": 0.8764,
"step": 1189
},
{
"epoch": 0.09792223822258794,
"grad_norm": 3.5448856849038015,
"learning_rate": 1.9759219944185045e-05,
"loss": 0.8546,
"step": 1190
},
{
"epoch": 0.09800452581773297,
"grad_norm": 2.3163053463145022,
"learning_rate": 1.9758638242110105e-05,
"loss": 0.827,
"step": 1191
},
{
"epoch": 0.098086813412878,
"grad_norm": 3.2678753876711903,
"learning_rate": 1.9758055846797113e-05,
"loss": 0.8456,
"step": 1192
},
{
"epoch": 0.09816910100802304,
"grad_norm": 4.046087494412628,
"learning_rate": 1.9757472758287437e-05,
"loss": 0.8565,
"step": 1193
},
{
"epoch": 0.09825138860316807,
"grad_norm": 5.312871548189173,
"learning_rate": 1.9756888976622504e-05,
"loss": 0.8316,
"step": 1194
},
{
"epoch": 0.09833367619831311,
"grad_norm": 3.5965506794172035,
"learning_rate": 1.9756304501843782e-05,
"loss": 0.8479,
"step": 1195
},
{
"epoch": 0.09841596379345814,
"grad_norm": 4.869038156703397,
"learning_rate": 1.975571933399279e-05,
"loss": 0.8957,
"step": 1196
},
{
"epoch": 0.09849825138860317,
"grad_norm": 5.073504198475643,
"learning_rate": 1.9755133473111097e-05,
"loss": 0.8748,
"step": 1197
},
{
"epoch": 0.0985805389837482,
"grad_norm": 4.129896753535656,
"learning_rate": 1.9754546919240325e-05,
"loss": 0.8624,
"step": 1198
},
{
"epoch": 0.09866282657889323,
"grad_norm": 0.75499109894716,
"learning_rate": 1.975395967242214e-05,
"loss": 0.5753,
"step": 1199
},
{
"epoch": 0.09874511417403826,
"grad_norm": 4.926214741317277,
"learning_rate": 1.9753371732698255e-05,
"loss": 0.8514,
"step": 1200
},
{
"epoch": 0.0988274017691833,
"grad_norm": 4.113995566064139,
"learning_rate": 1.9752783100110443e-05,
"loss": 0.8735,
"step": 1201
},
{
"epoch": 0.09890968936432833,
"grad_norm": 0.5883860438611207,
"learning_rate": 1.975219377470052e-05,
"loss": 0.6035,
"step": 1202
},
{
"epoch": 0.09899197695947336,
"grad_norm": 3.3466076308514863,
"learning_rate": 1.9751603756510344e-05,
"loss": 0.8769,
"step": 1203
},
{
"epoch": 0.09907426455461839,
"grad_norm": 0.47595350765066086,
"learning_rate": 1.9751013045581835e-05,
"loss": 0.5663,
"step": 1204
},
{
"epoch": 0.09915655214976342,
"grad_norm": 3.4049170080353615,
"learning_rate": 1.975042164195695e-05,
"loss": 0.8363,
"step": 1205
},
{
"epoch": 0.09923883974490845,
"grad_norm": 3.7661200169302327,
"learning_rate": 1.974982954567771e-05,
"loss": 0.8437,
"step": 1206
},
{
"epoch": 0.09932112734005348,
"grad_norm": 3.6094210284619286,
"learning_rate": 1.9749236756786167e-05,
"loss": 0.861,
"step": 1207
},
{
"epoch": 0.09940341493519853,
"grad_norm": 3.145969814243711,
"learning_rate": 1.9748643275324438e-05,
"loss": 0.8454,
"step": 1208
},
{
"epoch": 0.09948570253034356,
"grad_norm": 3.6067880218861568,
"learning_rate": 1.9748049101334684e-05,
"loss": 0.8682,
"step": 1209
},
{
"epoch": 0.09956799012548859,
"grad_norm": 3.0185050449291984,
"learning_rate": 1.974745423485911e-05,
"loss": 0.8708,
"step": 1210
},
{
"epoch": 0.09965027772063362,
"grad_norm": 3.128449103884966,
"learning_rate": 1.9746858675939974e-05,
"loss": 0.8594,
"step": 1211
},
{
"epoch": 0.09973256531577865,
"grad_norm": 0.6028578588325906,
"learning_rate": 1.9746262424619585e-05,
"loss": 0.6006,
"step": 1212
},
{
"epoch": 0.09981485291092368,
"grad_norm": 0.5378805528352323,
"learning_rate": 1.9745665480940304e-05,
"loss": 0.5702,
"step": 1213
},
{
"epoch": 0.09989714050606871,
"grad_norm": 2.9709104250769025,
"learning_rate": 1.974506784494453e-05,
"loss": 0.8769,
"step": 1214
},
{
"epoch": 0.09997942810121374,
"grad_norm": 3.5710834059738983,
"learning_rate": 1.974446951667472e-05,
"loss": 0.8524,
"step": 1215
},
{
"epoch": 0.10006171569635877,
"grad_norm": 3.564453597862319,
"learning_rate": 1.9743870496173385e-05,
"loss": 0.8602,
"step": 1216
}
],
"logging_steps": 1.0,
"max_steps": 12152,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 608,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2834347550703616.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}