Training in progress, step 1000, checkpoint
ef48e09 verified
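The log below is the trainer state JSON saved with this checkpoint (the file the Hugging Face Trainer writes as trainer_state.json inside each checkpoint directory). As an illustrative sketch only, assuming that filename and the keys visible in the log (log_history, step, loss, grad_norm, eval_loss, with eval_steps = 200), the records can be scanned to surface the degenerate spans, i.e. the zero grad_norm / zero loss batches and the NaN eval_loss entries:

import json
import math

# Assumed path: trainer_state.json inside the checkpoint directory.
with open("trainer_state.json") as f:
    state = json.load(f)  # Python's json parser accepts the NaN literals present in this file

for entry in state["log_history"]:
    step = entry["step"]
    if "eval_loss" in entry:
        # Evaluation records appear every eval_steps (200 here) and carry eval_loss.
        note = "  <- NaN eval loss" if math.isnan(entry["eval_loss"]) else ""
        print(f"step {step:4d}  eval_loss={entry['eval_loss']}{note}")
    elif entry.get("grad_norm") == 0.0 and entry.get("loss") == 0.0:
        # Training records where both grad_norm and loss collapsed to zero.
        print(f"step {step:4d}  grad_norm and loss are 0.0")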
{
"best_metric": NaN,
"best_model_checkpoint": "miner_id_24/checkpoint-200",
"epoch": 0.5675368898978433,
"eval_steps": 200,
"global_step": 1000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0005675368898978433,
"grad_norm": 4.242635250091553,
"learning_rate": 1e-05,
"loss": 2.0895,
"step": 1
},
{
"epoch": 0.0011350737797956867,
"grad_norm": 3.8852005004882812,
"learning_rate": 2e-05,
"loss": 2.5238,
"step": 2
},
{
"epoch": 0.00170261066969353,
"grad_norm": 4.170104026794434,
"learning_rate": 3e-05,
"loss": 2.7007,
"step": 3
},
{
"epoch": 0.0022701475595913734,
"grad_norm": 4.369339942932129,
"learning_rate": 4e-05,
"loss": 2.7838,
"step": 4
},
{
"epoch": 0.0028376844494892167,
"grad_norm": 4.925931453704834,
"learning_rate": 5e-05,
"loss": 2.8797,
"step": 5
},
{
"epoch": 0.00340522133938706,
"grad_norm": 5.264482021331787,
"learning_rate": 6e-05,
"loss": 2.971,
"step": 6
},
{
"epoch": 0.003972758229284903,
"grad_norm": 5.356241703033447,
"learning_rate": 7e-05,
"loss": 3.1023,
"step": 7
},
{
"epoch": 0.004540295119182747,
"grad_norm": 5.722445964813232,
"learning_rate": 8e-05,
"loss": 2.7219,
"step": 8
},
{
"epoch": 0.00510783200908059,
"grad_norm": 10.222125053405762,
"learning_rate": 9e-05,
"loss": 2.7866,
"step": 9
},
{
"epoch": 0.0056753688989784334,
"grad_norm": 6.646042346954346,
"learning_rate": 0.0001,
"loss": 2.4618,
"step": 10
},
{
"epoch": 0.006242905788876277,
"grad_norm": 8.881997108459473,
"learning_rate": 9.999974825027756e-05,
"loss": 2.2149,
"step": 11
},
{
"epoch": 0.00681044267877412,
"grad_norm": 7.798340797424316,
"learning_rate": 9.999899300364532e-05,
"loss": 1.9245,
"step": 12
},
{
"epoch": 0.0073779795686719635,
"grad_norm": 8.679018020629883,
"learning_rate": 9.999773426770865e-05,
"loss": 1.4955,
"step": 13
},
{
"epoch": 0.007945516458569807,
"grad_norm": 7.172414302825928,
"learning_rate": 9.999597205514297e-05,
"loss": 1.0912,
"step": 14
},
{
"epoch": 0.00851305334846765,
"grad_norm": 9.331069946289062,
"learning_rate": 9.999370638369377e-05,
"loss": 0.9289,
"step": 15
},
{
"epoch": 0.009080590238365494,
"grad_norm": 9.002270698547363,
"learning_rate": 9.99909372761763e-05,
"loss": 0.657,
"step": 16
},
{
"epoch": 0.009648127128263337,
"grad_norm": 9.825034141540527,
"learning_rate": 9.998766476047547e-05,
"loss": 0.6728,
"step": 17
},
{
"epoch": 0.01021566401816118,
"grad_norm": 14.936773300170898,
"learning_rate": 9.998388886954547e-05,
"loss": 0.6303,
"step": 18
},
{
"epoch": 0.010783200908059024,
"grad_norm": 14.347192764282227,
"learning_rate": 9.997960964140947e-05,
"loss": 0.6379,
"step": 19
},
{
"epoch": 0.011350737797956867,
"grad_norm": 12.535562515258789,
"learning_rate": 9.997482711915927e-05,
"loss": 0.5843,
"step": 20
},
{
"epoch": 0.01191827468785471,
"grad_norm": 12.070816993713379,
"learning_rate": 9.99695413509548e-05,
"loss": 0.5889,
"step": 21
},
{
"epoch": 0.012485811577752554,
"grad_norm": 10.580400466918945,
"learning_rate": 9.996375239002369e-05,
"loss": 0.601,
"step": 22
},
{
"epoch": 0.013053348467650397,
"grad_norm": 8.476105690002441,
"learning_rate": 9.995746029466071e-05,
"loss": 0.6079,
"step": 23
},
{
"epoch": 0.01362088535754824,
"grad_norm": 9.052068710327148,
"learning_rate": 9.99506651282272e-05,
"loss": 0.6068,
"step": 24
},
{
"epoch": 0.014188422247446084,
"grad_norm": 7.772611141204834,
"learning_rate": 9.99433669591504e-05,
"loss": 0.7142,
"step": 25
},
{
"epoch": 0.014755959137343927,
"grad_norm": 5.843691349029541,
"learning_rate": 9.993556586092281e-05,
"loss": 0.4146,
"step": 26
},
{
"epoch": 0.01532349602724177,
"grad_norm": 6.3077392578125,
"learning_rate": 9.992726191210138e-05,
"loss": 0.4692,
"step": 27
},
{
"epoch": 0.015891032917139614,
"grad_norm": 5.899518013000488,
"learning_rate": 9.991845519630678e-05,
"loss": 0.3898,
"step": 28
},
{
"epoch": 0.016458569807037457,
"grad_norm": 7.513155460357666,
"learning_rate": 9.990914580222257e-05,
"loss": 0.4173,
"step": 29
},
{
"epoch": 0.0170261066969353,
"grad_norm": 7.608649253845215,
"learning_rate": 9.989933382359422e-05,
"loss": 0.3552,
"step": 30
},
{
"epoch": 0.017593643586833144,
"grad_norm": 6.533905029296875,
"learning_rate": 9.988901935922826e-05,
"loss": 0.3371,
"step": 31
},
{
"epoch": 0.018161180476730987,
"grad_norm": 5.706025123596191,
"learning_rate": 9.987820251299122e-05,
"loss": 0.2471,
"step": 32
},
{
"epoch": 0.01872871736662883,
"grad_norm": 4.540302276611328,
"learning_rate": 9.986688339380862e-05,
"loss": 0.2083,
"step": 33
},
{
"epoch": 0.019296254256526674,
"grad_norm": 3.610647439956665,
"learning_rate": 9.985506211566388e-05,
"loss": 0.1795,
"step": 34
},
{
"epoch": 0.019863791146424517,
"grad_norm": 5.012790203094482,
"learning_rate": 9.984273879759713e-05,
"loss": 0.2031,
"step": 35
},
{
"epoch": 0.02043132803632236,
"grad_norm": 3.4943277835845947,
"learning_rate": 9.982991356370404e-05,
"loss": 0.1355,
"step": 36
},
{
"epoch": 0.020998864926220204,
"grad_norm": 4.174023628234863,
"learning_rate": 9.981658654313457e-05,
"loss": 0.1841,
"step": 37
},
{
"epoch": 0.021566401816118047,
"grad_norm": 2.6815176010131836,
"learning_rate": 9.98027578700917e-05,
"loss": 0.1052,
"step": 38
},
{
"epoch": 0.02213393870601589,
"grad_norm": 1.8553539514541626,
"learning_rate": 9.978842768382998e-05,
"loss": 0.0862,
"step": 39
},
{
"epoch": 0.022701475595913734,
"grad_norm": 2.5569417476654053,
"learning_rate": 9.977359612865423e-05,
"loss": 0.0708,
"step": 40
},
{
"epoch": 0.023269012485811577,
"grad_norm": 4.1771979331970215,
"learning_rate": 9.975826335391808e-05,
"loss": 0.0879,
"step": 41
},
{
"epoch": 0.02383654937570942,
"grad_norm": 3.163653612136841,
"learning_rate": 9.974242951402235e-05,
"loss": 0.1033,
"step": 42
},
{
"epoch": 0.024404086265607264,
"grad_norm": 4.081658840179443,
"learning_rate": 9.972609476841367e-05,
"loss": 0.1103,
"step": 43
},
{
"epoch": 0.024971623155505107,
"grad_norm": 5.564581871032715,
"learning_rate": 9.970925928158274e-05,
"loss": 0.1136,
"step": 44
},
{
"epoch": 0.02553916004540295,
"grad_norm": 86.91041564941406,
"learning_rate": 9.969192322306271e-05,
"loss": 1.3437,
"step": 45
},
{
"epoch": 0.026106696935300794,
"grad_norm": 0.0,
"learning_rate": 9.967408676742751e-05,
"loss": 0.0,
"step": 46
},
{
"epoch": 0.026674233825198637,
"grad_norm": 0.0,
"learning_rate": 9.965575009429006e-05,
"loss": 0.0,
"step": 47
},
{
"epoch": 0.02724177071509648,
"grad_norm": 0.0,
"learning_rate": 9.963691338830044e-05,
"loss": 0.0,
"step": 48
},
{
"epoch": 0.027809307604994324,
"grad_norm": 0.0,
"learning_rate": 9.961757683914406e-05,
"loss": 0.0,
"step": 49
},
{
"epoch": 0.028376844494892167,
"grad_norm": 0.0,
"learning_rate": 9.959774064153977e-05,
"loss": 0.0,
"step": 50
},
{
"epoch": 0.02894438138479001,
"grad_norm": 13.452136993408203,
"learning_rate": 9.957740499523787e-05,
"loss": 1.2541,
"step": 51
},
{
"epoch": 0.029511918274687854,
"grad_norm": 10.72777271270752,
"learning_rate": 9.955657010501806e-05,
"loss": 1.1451,
"step": 52
},
{
"epoch": 0.030079455164585697,
"grad_norm": 5.7461466789245605,
"learning_rate": 9.953523618068749e-05,
"loss": 0.7454,
"step": 53
},
{
"epoch": 0.03064699205448354,
"grad_norm": 4.224442481994629,
"learning_rate": 9.951340343707852e-05,
"loss": 0.6189,
"step": 54
},
{
"epoch": 0.031214528944381384,
"grad_norm": 5.231300354003906,
"learning_rate": 9.949107209404665e-05,
"loss": 0.5754,
"step": 55
},
{
"epoch": 0.03178206583427923,
"grad_norm": 4.194768905639648,
"learning_rate": 9.946824237646824e-05,
"loss": 0.4918,
"step": 56
},
{
"epoch": 0.03234960272417707,
"grad_norm": 3.1545193195343018,
"learning_rate": 9.944491451423828e-05,
"loss": 0.3889,
"step": 57
},
{
"epoch": 0.032917139614074914,
"grad_norm": 4.240617275238037,
"learning_rate": 9.942108874226811e-05,
"loss": 0.383,
"step": 58
},
{
"epoch": 0.03348467650397276,
"grad_norm": 3.6889731884002686,
"learning_rate": 9.939676530048301e-05,
"loss": 0.3308,
"step": 59
},
{
"epoch": 0.0340522133938706,
"grad_norm": 7.476080894470215,
"learning_rate": 9.937194443381972e-05,
"loss": 0.2755,
"step": 60
},
{
"epoch": 0.034619750283768444,
"grad_norm": 3.6329305171966553,
"learning_rate": 9.934662639222412e-05,
"loss": 0.2908,
"step": 61
},
{
"epoch": 0.03518728717366629,
"grad_norm": 3.839686870574951,
"learning_rate": 9.93208114306486e-05,
"loss": 0.317,
"step": 62
},
{
"epoch": 0.03575482406356413,
"grad_norm": 2.5644800662994385,
"learning_rate": 9.929449980904952e-05,
"loss": 0.2524,
"step": 63
},
{
"epoch": 0.036322360953461974,
"grad_norm": 2.268704652786255,
"learning_rate": 9.926769179238466e-05,
"loss": 0.2686,
"step": 64
},
{
"epoch": 0.03688989784335982,
"grad_norm": 3.147296905517578,
"learning_rate": 9.924038765061042e-05,
"loss": 0.2311,
"step": 65
},
{
"epoch": 0.03745743473325766,
"grad_norm": 2.1133084297180176,
"learning_rate": 9.921258765867919e-05,
"loss": 0.2072,
"step": 66
},
{
"epoch": 0.038024971623155504,
"grad_norm": 1.422222375869751,
"learning_rate": 9.918429209653662e-05,
"loss": 0.2128,
"step": 67
},
{
"epoch": 0.03859250851305335,
"grad_norm": 1.157293438911438,
"learning_rate": 9.915550124911866e-05,
"loss": 0.1427,
"step": 68
},
{
"epoch": 0.03916004540295119,
"grad_norm": 1.4492322206497192,
"learning_rate": 9.912621540634887e-05,
"loss": 0.1704,
"step": 69
},
{
"epoch": 0.039727582292849034,
"grad_norm": 1.1632133722305298,
"learning_rate": 9.909643486313533e-05,
"loss": 0.1299,
"step": 70
},
{
"epoch": 0.04029511918274688,
"grad_norm": 3.0194833278656006,
"learning_rate": 9.90661599193678e-05,
"loss": 0.1088,
"step": 71
},
{
"epoch": 0.04086265607264472,
"grad_norm": 1.3391764163970947,
"learning_rate": 9.903539087991462e-05,
"loss": 0.1105,
"step": 72
},
{
"epoch": 0.041430192962542564,
"grad_norm": 1.761568307876587,
"learning_rate": 9.900412805461967e-05,
"loss": 0.1449,
"step": 73
},
{
"epoch": 0.04199772985244041,
"grad_norm": 1.8852134943008423,
"learning_rate": 9.897237175829926e-05,
"loss": 0.1381,
"step": 74
},
{
"epoch": 0.04256526674233825,
"grad_norm": 1.8318125009536743,
"learning_rate": 9.894012231073894e-05,
"loss": 0.0953,
"step": 75
},
{
"epoch": 0.043132803632236094,
"grad_norm": 1.9700140953063965,
"learning_rate": 9.890738003669029e-05,
"loss": 0.1002,
"step": 76
},
{
"epoch": 0.04370034052213394,
"grad_norm": 2.201488971710205,
"learning_rate": 9.887414526586763e-05,
"loss": 0.0935,
"step": 77
},
{
"epoch": 0.04426787741203178,
"grad_norm": 2.130439281463623,
"learning_rate": 9.884041833294476e-05,
"loss": 0.1128,
"step": 78
},
{
"epoch": 0.044835414301929624,
"grad_norm": 1.2699000835418701,
"learning_rate": 9.880619957755151e-05,
"loss": 0.0704,
"step": 79
},
{
"epoch": 0.04540295119182747,
"grad_norm": 1.2597098350524902,
"learning_rate": 9.877148934427037e-05,
"loss": 0.0913,
"step": 80
},
{
"epoch": 0.04597048808172531,
"grad_norm": 1.9284838438034058,
"learning_rate": 9.873628798263296e-05,
"loss": 0.1002,
"step": 81
},
{
"epoch": 0.046538024971623154,
"grad_norm": 2.915311574935913,
"learning_rate": 9.870059584711668e-05,
"loss": 0.0878,
"step": 82
},
{
"epoch": 0.047105561861521,
"grad_norm": 2.436211347579956,
"learning_rate": 9.866441329714088e-05,
"loss": 0.0733,
"step": 83
},
{
"epoch": 0.04767309875141884,
"grad_norm": 1.696310043334961,
"learning_rate": 9.862774069706346e-05,
"loss": 0.0852,
"step": 84
},
{
"epoch": 0.048240635641316684,
"grad_norm": 0.9896736145019531,
"learning_rate": 9.859057841617709e-05,
"loss": 0.0163,
"step": 85
},
{
"epoch": 0.04880817253121453,
"grad_norm": 1.1014189720153809,
"learning_rate": 9.855292682870551e-05,
"loss": 0.0783,
"step": 86
},
{
"epoch": 0.04937570942111237,
"grad_norm": 1.2181459665298462,
"learning_rate": 9.851478631379982e-05,
"loss": 0.0454,
"step": 87
},
{
"epoch": 0.049943246311010214,
"grad_norm": 1.8675448894500732,
"learning_rate": 9.847615725553456e-05,
"loss": 0.062,
"step": 88
},
{
"epoch": 0.05051078320090806,
"grad_norm": 2.457249641418457,
"learning_rate": 9.843704004290392e-05,
"loss": 0.0158,
"step": 89
},
{
"epoch": 0.0510783200908059,
"grad_norm": 1.7994331121444702,
"learning_rate": 9.839743506981782e-05,
"loss": 0.0463,
"step": 90
},
{
"epoch": 0.051645856980703744,
"grad_norm": 1.6931531429290771,
"learning_rate": 9.835734273509786e-05,
"loss": 0.0308,
"step": 91
},
{
"epoch": 0.05221339387060159,
"grad_norm": 0.2792555093765259,
"learning_rate": 9.831676344247342e-05,
"loss": 0.0043,
"step": 92
},
{
"epoch": 0.05278093076049943,
"grad_norm": 1.1043611764907837,
"learning_rate": 9.827569760057755e-05,
"loss": 0.0266,
"step": 93
},
{
"epoch": 0.053348467650397274,
"grad_norm": 2.844864845275879,
"learning_rate": 9.82341456229428e-05,
"loss": 0.0724,
"step": 94
},
{
"epoch": 0.05391600454029512,
"grad_norm": 47.320552825927734,
"learning_rate": 9.819210792799712e-05,
"loss": 0.9477,
"step": 95
},
{
"epoch": 0.05448354143019296,
"grad_norm": 0.0,
"learning_rate": 9.814958493905963e-05,
"loss": 0.0,
"step": 96
},
{
"epoch": 0.055051078320090804,
"grad_norm": 8.552749633789062,
"learning_rate": 9.810657708433637e-05,
"loss": 0.0776,
"step": 97
},
{
"epoch": 0.05561861520998865,
"grad_norm": 0.0,
"learning_rate": 9.806308479691595e-05,
"loss": 0.0,
"step": 98
},
{
"epoch": 0.05618615209988649,
"grad_norm": 0.0,
"learning_rate": 9.801910851476523e-05,
"loss": 0.0,
"step": 99
},
{
"epoch": 0.056753688989784334,
"grad_norm": 0.0,
"learning_rate": 9.797464868072488e-05,
"loss": 0.0,
"step": 100
},
{
"epoch": 0.05732122587968218,
"grad_norm": 7.161002159118652,
"learning_rate": 9.792970574250493e-05,
"loss": 0.57,
"step": 101
},
{
"epoch": 0.05788876276958002,
"grad_norm": 5.865129470825195,
"learning_rate": 9.788428015268027e-05,
"loss": 0.4715,
"step": 102
},
{
"epoch": 0.058456299659477864,
"grad_norm": 4.487433433532715,
"learning_rate": 9.783837236868609e-05,
"loss": 0.3068,
"step": 103
},
{
"epoch": 0.05902383654937571,
"grad_norm": 4.0235209465026855,
"learning_rate": 9.779198285281325e-05,
"loss": 0.233,
"step": 104
},
{
"epoch": 0.05959137343927355,
"grad_norm": 5.06863260269165,
"learning_rate": 9.77451120722037e-05,
"loss": 0.2298,
"step": 105
},
{
"epoch": 0.060158910329171394,
"grad_norm": 2.5704989433288574,
"learning_rate": 9.769776049884563e-05,
"loss": 0.156,
"step": 106
},
{
"epoch": 0.06072644721906924,
"grad_norm": 3.1615333557128906,
"learning_rate": 9.764992860956889e-05,
"loss": 0.2069,
"step": 107
},
{
"epoch": 0.06129398410896708,
"grad_norm": 2.513225555419922,
"learning_rate": 9.760161688604008e-05,
"loss": 0.129,
"step": 108
},
{
"epoch": 0.061861520998864925,
"grad_norm": 2.5690817832946777,
"learning_rate": 9.755282581475769e-05,
"loss": 0.1146,
"step": 109
},
{
"epoch": 0.06242905788876277,
"grad_norm": 3.713148593902588,
"learning_rate": 9.750355588704727e-05,
"loss": 0.1187,
"step": 110
},
{
"epoch": 0.06299659477866061,
"grad_norm": 2.496454954147339,
"learning_rate": 9.745380759905647e-05,
"loss": 0.1376,
"step": 111
},
{
"epoch": 0.06356413166855845,
"grad_norm": 1.6230063438415527,
"learning_rate": 9.740358145174998e-05,
"loss": 0.0658,
"step": 112
},
{
"epoch": 0.0641316685584563,
"grad_norm": 1.4402037858963013,
"learning_rate": 9.735287795090455e-05,
"loss": 0.067,
"step": 113
},
{
"epoch": 0.06469920544835414,
"grad_norm": 1.7551097869873047,
"learning_rate": 9.730169760710386e-05,
"loss": 0.0546,
"step": 114
},
{
"epoch": 0.06526674233825198,
"grad_norm": 9.121005058288574,
"learning_rate": 9.725004093573342e-05,
"loss": 0.0719,
"step": 115
},
{
"epoch": 0.06583427922814983,
"grad_norm": 1.0054181814193726,
"learning_rate": 9.719790845697533e-05,
"loss": 0.04,
"step": 116
},
{
"epoch": 0.06640181611804767,
"grad_norm": 1.0197290182113647,
"learning_rate": 9.714530069580309e-05,
"loss": 0.0465,
"step": 117
},
{
"epoch": 0.06696935300794551,
"grad_norm": 1.8132661581039429,
"learning_rate": 9.709221818197624e-05,
"loss": 0.0315,
"step": 118
},
{
"epoch": 0.06753688989784336,
"grad_norm": 1.3299674987792969,
"learning_rate": 9.703866145003511e-05,
"loss": 0.0777,
"step": 119
},
{
"epoch": 0.0681044267877412,
"grad_norm": 0.7436254024505615,
"learning_rate": 9.698463103929542e-05,
"loss": 0.0274,
"step": 120
},
{
"epoch": 0.06867196367763904,
"grad_norm": 0.8951746821403503,
"learning_rate": 9.693012749384279e-05,
"loss": 0.0349,
"step": 121
},
{
"epoch": 0.06923950056753689,
"grad_norm": 1.018670678138733,
"learning_rate": 9.687515136252731e-05,
"loss": 0.0209,
"step": 122
},
{
"epoch": 0.06980703745743473,
"grad_norm": 0.5818009972572327,
"learning_rate": 9.681970319895803e-05,
"loss": 0.0151,
"step": 123
},
{
"epoch": 0.07037457434733257,
"grad_norm": 0.49394237995147705,
"learning_rate": 9.676378356149734e-05,
"loss": 0.0149,
"step": 124
},
{
"epoch": 0.07094211123723042,
"grad_norm": 1.2261513471603394,
"learning_rate": 9.670739301325534e-05,
"loss": 0.0279,
"step": 125
},
{
"epoch": 0.07150964812712826,
"grad_norm": 0.8551621437072754,
"learning_rate": 9.665053212208426e-05,
"loss": 0.0307,
"step": 126
},
{
"epoch": 0.0720771850170261,
"grad_norm": 0.7473931312561035,
"learning_rate": 9.659320146057262e-05,
"loss": 0.0258,
"step": 127
},
{
"epoch": 0.07264472190692395,
"grad_norm": 1.058668851852417,
"learning_rate": 9.653540160603956e-05,
"loss": 0.0333,
"step": 128
},
{
"epoch": 0.07321225879682179,
"grad_norm": 0.4712604284286499,
"learning_rate": 9.647713314052896e-05,
"loss": 0.0194,
"step": 129
},
{
"epoch": 0.07377979568671963,
"grad_norm": 0.554572343826294,
"learning_rate": 9.641839665080363e-05,
"loss": 0.012,
"step": 130
},
{
"epoch": 0.07434733257661748,
"grad_norm": 0.6603901386260986,
"learning_rate": 9.635919272833938e-05,
"loss": 0.0223,
"step": 131
},
{
"epoch": 0.07491486946651532,
"grad_norm": 0.8694583773612976,
"learning_rate": 9.629952196931901e-05,
"loss": 0.0455,
"step": 132
},
{
"epoch": 0.07548240635641316,
"grad_norm": 1.202289342880249,
"learning_rate": 9.623938497462646e-05,
"loss": 0.0438,
"step": 133
},
{
"epoch": 0.07604994324631101,
"grad_norm": 3.9090631008148193,
"learning_rate": 9.617878234984055e-05,
"loss": 0.0531,
"step": 134
},
{
"epoch": 0.07661748013620885,
"grad_norm": 1.0893713235855103,
"learning_rate": 9.611771470522908e-05,
"loss": 0.0357,
"step": 135
},
{
"epoch": 0.0771850170261067,
"grad_norm": 0.5505892634391785,
"learning_rate": 9.60561826557425e-05,
"loss": 0.0116,
"step": 136
},
{
"epoch": 0.07775255391600454,
"grad_norm": 3.2899348735809326,
"learning_rate": 9.599418682100793e-05,
"loss": 0.028,
"step": 137
},
{
"epoch": 0.07832009080590238,
"grad_norm": 1.5124907493591309,
"learning_rate": 9.593172782532268e-05,
"loss": 0.0374,
"step": 138
},
{
"epoch": 0.07888762769580022,
"grad_norm": 1.1038376092910767,
"learning_rate": 9.586880629764817e-05,
"loss": 0.0433,
"step": 139
},
{
"epoch": 0.07945516458569807,
"grad_norm": 0.2565675675868988,
"learning_rate": 9.580542287160348e-05,
"loss": 0.0037,
"step": 140
},
{
"epoch": 0.08002270147559591,
"grad_norm": 1.869051456451416,
"learning_rate": 9.574157818545901e-05,
"loss": 0.0232,
"step": 141
},
{
"epoch": 0.08059023836549375,
"grad_norm": 0.21874603629112244,
"learning_rate": 9.567727288213005e-05,
"loss": 0.004,
"step": 142
},
{
"epoch": 0.0811577752553916,
"grad_norm": 0.47795429825782776,
"learning_rate": 9.561250760917027e-05,
"loss": 0.0062,
"step": 143
},
{
"epoch": 0.08172531214528944,
"grad_norm": 1.0321537256240845,
"learning_rate": 9.554728301876526e-05,
"loss": 0.0152,
"step": 144
},
{
"epoch": 0.08229284903518728,
"grad_norm": 10.375513076782227,
"learning_rate": 9.548159976772592e-05,
"loss": 0.5914,
"step": 145
},
{
"epoch": 0.08286038592508513,
"grad_norm": 10.878011703491211,
"learning_rate": 9.541545851748186e-05,
"loss": 0.0679,
"step": 146
},
{
"epoch": 0.08342792281498297,
"grad_norm": 0.0,
"learning_rate": 9.534885993407474e-05,
"loss": 0.0,
"step": 147
},
{
"epoch": 0.08399545970488081,
"grad_norm": 0.0,
"learning_rate": 9.528180468815155e-05,
"loss": 0.0,
"step": 148
},
{
"epoch": 0.08456299659477866,
"grad_norm": 0.0,
"learning_rate": 9.521429345495787e-05,
"loss": 0.0,
"step": 149
},
{
"epoch": 0.0851305334846765,
"grad_norm": 0.0,
"learning_rate": 9.514632691433107e-05,
"loss": 0.0,
"step": 150
},
{
"epoch": 0.08569807037457434,
"grad_norm": 5.398491859436035,
"learning_rate": 9.507790575069347e-05,
"loss": 0.2644,
"step": 151
},
{
"epoch": 0.08626560726447219,
"grad_norm": 3.6026408672332764,
"learning_rate": 9.50090306530454e-05,
"loss": 0.1911,
"step": 152
},
{
"epoch": 0.08683314415437003,
"grad_norm": 3.1517863273620605,
"learning_rate": 9.493970231495835e-05,
"loss": 0.0834,
"step": 153
},
{
"epoch": 0.08740068104426787,
"grad_norm": 3.0677297115325928,
"learning_rate": 9.486992143456792e-05,
"loss": 0.0744,
"step": 154
},
{
"epoch": 0.08796821793416572,
"grad_norm": 2.8431897163391113,
"learning_rate": 9.479968871456679e-05,
"loss": 0.0717,
"step": 155
},
{
"epoch": 0.08853575482406356,
"grad_norm": 1.704907774925232,
"learning_rate": 9.472900486219769e-05,
"loss": 0.0569,
"step": 156
},
{
"epoch": 0.0891032917139614,
"grad_norm": 1.7260456085205078,
"learning_rate": 9.46578705892462e-05,
"loss": 0.0341,
"step": 157
},
{
"epoch": 0.08967082860385925,
"grad_norm": 2.4934730529785156,
"learning_rate": 9.458628661203367e-05,
"loss": 0.0329,
"step": 158
},
{
"epoch": 0.09023836549375709,
"grad_norm": 2.195416212081909,
"learning_rate": 9.451425365140996e-05,
"loss": 0.0476,
"step": 159
},
{
"epoch": 0.09080590238365494,
"grad_norm": 0.4546680748462677,
"learning_rate": 9.444177243274618e-05,
"loss": 0.0112,
"step": 160
},
{
"epoch": 0.09137343927355278,
"grad_norm": 1.1639807224273682,
"learning_rate": 9.43688436859274e-05,
"loss": 0.0313,
"step": 161
},
{
"epoch": 0.09194097616345062,
"grad_norm": 2.623199462890625,
"learning_rate": 9.429546814534529e-05,
"loss": 0.0568,
"step": 162
},
{
"epoch": 0.09250851305334847,
"grad_norm": 0.5077713131904602,
"learning_rate": 9.422164654989072e-05,
"loss": 0.0125,
"step": 163
},
{
"epoch": 0.09307604994324631,
"grad_norm": 0.37310367822647095,
"learning_rate": 9.414737964294636e-05,
"loss": 0.0067,
"step": 164
},
{
"epoch": 0.09364358683314415,
"grad_norm": 1.5596776008605957,
"learning_rate": 9.407266817237911e-05,
"loss": 0.0392,
"step": 165
},
{
"epoch": 0.094211123723042,
"grad_norm": 1.5760366916656494,
"learning_rate": 9.399751289053267e-05,
"loss": 0.0296,
"step": 166
},
{
"epoch": 0.09477866061293984,
"grad_norm": 0.7743924856185913,
"learning_rate": 9.392191455421988e-05,
"loss": 0.0112,
"step": 167
},
{
"epoch": 0.09534619750283768,
"grad_norm": 0.9516783356666565,
"learning_rate": 9.384587392471515e-05,
"loss": 0.0109,
"step": 168
},
{
"epoch": 0.09591373439273553,
"grad_norm": 0.824831485748291,
"learning_rate": 9.376939176774679e-05,
"loss": 0.0133,
"step": 169
},
{
"epoch": 0.09648127128263337,
"grad_norm": 0.7576391696929932,
"learning_rate": 9.369246885348926e-05,
"loss": 0.0229,
"step": 170
},
{
"epoch": 0.09704880817253121,
"grad_norm": 0.7438343167304993,
"learning_rate": 9.361510595655545e-05,
"loss": 0.0148,
"step": 171
},
{
"epoch": 0.09761634506242906,
"grad_norm": 1.1500699520111084,
"learning_rate": 9.353730385598887e-05,
"loss": 0.0842,
"step": 172
},
{
"epoch": 0.0981838819523269,
"grad_norm": 0.8265398740768433,
"learning_rate": 9.345906333525581e-05,
"loss": 0.0161,
"step": 173
},
{
"epoch": 0.09875141884222474,
"grad_norm": 0.7592639923095703,
"learning_rate": 9.338038518223747e-05,
"loss": 0.0276,
"step": 174
},
{
"epoch": 0.09931895573212259,
"grad_norm": 0.9068871736526489,
"learning_rate": 9.330127018922194e-05,
"loss": 0.0173,
"step": 175
},
{
"epoch": 0.09988649262202043,
"grad_norm": 3.255566120147705,
"learning_rate": 9.322171915289635e-05,
"loss": 0.0371,
"step": 176
},
{
"epoch": 0.10045402951191827,
"grad_norm": 0.607418954372406,
"learning_rate": 9.314173287433873e-05,
"loss": 0.0184,
"step": 177
},
{
"epoch": 0.10102156640181612,
"grad_norm": 1.0031543970108032,
"learning_rate": 9.306131215901003e-05,
"loss": 0.0184,
"step": 178
},
{
"epoch": 0.10158910329171396,
"grad_norm": 0.9904316663742065,
"learning_rate": 9.298045781674596e-05,
"loss": 0.0403,
"step": 179
},
{
"epoch": 0.1021566401816118,
"grad_norm": 0.6840120553970337,
"learning_rate": 9.289917066174886e-05,
"loss": 0.012,
"step": 180
},
{
"epoch": 0.10272417707150965,
"grad_norm": 1.294670820236206,
"learning_rate": 9.281745151257946e-05,
"loss": 0.0204,
"step": 181
},
{
"epoch": 0.10329171396140749,
"grad_norm": 0.5927445888519287,
"learning_rate": 9.273530119214868e-05,
"loss": 0.0166,
"step": 182
},
{
"epoch": 0.10385925085130533,
"grad_norm": 1.159449815750122,
"learning_rate": 9.265272052770936e-05,
"loss": 0.0258,
"step": 183
},
{
"epoch": 0.10442678774120318,
"grad_norm": 0.5215921401977539,
"learning_rate": 9.256971035084785e-05,
"loss": 0.0121,
"step": 184
},
{
"epoch": 0.10499432463110102,
"grad_norm": 0.537041962146759,
"learning_rate": 9.248627149747573e-05,
"loss": 0.0125,
"step": 185
},
{
"epoch": 0.10556186152099886,
"grad_norm": 0.3171516954898834,
"learning_rate": 9.24024048078213e-05,
"loss": 0.0058,
"step": 186
},
{
"epoch": 0.1061293984108967,
"grad_norm": 0.13635091483592987,
"learning_rate": 9.231811112642121e-05,
"loss": 0.0028,
"step": 187
},
{
"epoch": 0.10669693530079455,
"grad_norm": 0.38101646304130554,
"learning_rate": 9.223339130211192e-05,
"loss": 0.0174,
"step": 188
},
{
"epoch": 0.10726447219069239,
"grad_norm": 0.6654653549194336,
"learning_rate": 9.214824618802109e-05,
"loss": 0.0101,
"step": 189
},
{
"epoch": 0.10783200908059024,
"grad_norm": 0.3335312008857727,
"learning_rate": 9.206267664155907e-05,
"loss": 0.0065,
"step": 190
},
{
"epoch": 0.10839954597048808,
"grad_norm": 0.4047980308532715,
"learning_rate": 9.197668352441025e-05,
"loss": 0.0035,
"step": 191
},
{
"epoch": 0.10896708286038592,
"grad_norm": 1.2880914211273193,
"learning_rate": 9.189026770252436e-05,
"loss": 0.0283,
"step": 192
},
{
"epoch": 0.10953461975028377,
"grad_norm": 0.276368111371994,
"learning_rate": 9.18034300461078e-05,
"loss": 0.0044,
"step": 193
},
{
"epoch": 0.11010215664018161,
"grad_norm": 0.5553678274154663,
"learning_rate": 9.171617142961477e-05,
"loss": 0.0107,
"step": 194
},
{
"epoch": 0.11066969353007945,
"grad_norm": 2.4395854473114014,
"learning_rate": 9.162849273173857e-05,
"loss": 0.0229,
"step": 195
},
{
"epoch": 0.1112372304199773,
"grad_norm": 0.0,
"learning_rate": 9.154039483540273e-05,
"loss": 0.0,
"step": 196
},
{
"epoch": 0.11180476730987514,
"grad_norm": 0.0,
"learning_rate": 9.145187862775209e-05,
"loss": 0.0,
"step": 197
},
{
"epoch": 0.11237230419977298,
"grad_norm": 0.0,
"learning_rate": 9.136294500014386e-05,
"loss": 0.0,
"step": 198
},
{
"epoch": 0.11293984108967083,
"grad_norm": 0.0,
"learning_rate": 9.12735948481387e-05,
"loss": 0.0,
"step": 199
},
{
"epoch": 0.11350737797956867,
"grad_norm": 0.0,
"learning_rate": 9.118382907149165e-05,
"loss": 0.0,
"step": 200
},
{
"epoch": 0.11350737797956867,
"eval_loss": NaN,
"eval_runtime": 106.2144,
"eval_samples_per_second": 27.943,
"eval_steps_per_second": 6.986,
"step": 200
},
{
"epoch": 0.11407491486946651,
"grad_norm": 2.6148464679718018,
"learning_rate": 9.109364857414306e-05,
"loss": 0.1986,
"step": 201
},
{
"epoch": 0.11464245175936436,
"grad_norm": 1.466818928718567,
"learning_rate": 9.100305426420956e-05,
"loss": 0.0807,
"step": 202
},
{
"epoch": 0.1152099886492622,
"grad_norm": 0.8603935241699219,
"learning_rate": 9.091204705397484e-05,
"loss": 0.0535,
"step": 203
},
{
"epoch": 0.11577752553916004,
"grad_norm": 1.8044847249984741,
"learning_rate": 9.082062785988049e-05,
"loss": 0.0691,
"step": 204
},
{
"epoch": 0.11634506242905789,
"grad_norm": 0.43121302127838135,
"learning_rate": 9.072879760251679e-05,
"loss": 0.0214,
"step": 205
},
{
"epoch": 0.11691259931895573,
"grad_norm": 0.8256431221961975,
"learning_rate": 9.06365572066134e-05,
"loss": 0.0738,
"step": 206
},
{
"epoch": 0.11748013620885357,
"grad_norm": 0.938698410987854,
"learning_rate": 9.05439076010301e-05,
"loss": 0.0192,
"step": 207
},
{
"epoch": 0.11804767309875142,
"grad_norm": 0.551313579082489,
"learning_rate": 9.045084971874738e-05,
"loss": 0.0205,
"step": 208
},
{
"epoch": 0.11861520998864926,
"grad_norm": 1.3714677095413208,
"learning_rate": 9.035738449685707e-05,
"loss": 0.0479,
"step": 209
},
{
"epoch": 0.1191827468785471,
"grad_norm": 0.35658133029937744,
"learning_rate": 9.026351287655294e-05,
"loss": 0.0098,
"step": 210
},
{
"epoch": 0.11975028376844495,
"grad_norm": 0.6199024319648743,
"learning_rate": 9.016923580312113e-05,
"loss": 0.036,
"step": 211
},
{
"epoch": 0.12031782065834279,
"grad_norm": 0.315612256526947,
"learning_rate": 9.007455422593077e-05,
"loss": 0.0065,
"step": 212
},
{
"epoch": 0.12088535754824063,
"grad_norm": 0.4234760105609894,
"learning_rate": 8.997946909842425e-05,
"loss": 0.0294,
"step": 213
},
{
"epoch": 0.12145289443813848,
"grad_norm": 0.6859511137008667,
"learning_rate": 8.988398137810777e-05,
"loss": 0.0263,
"step": 214
},
{
"epoch": 0.12202043132803632,
"grad_norm": 0.41570374369621277,
"learning_rate": 8.978809202654162e-05,
"loss": 0.0102,
"step": 215
},
{
"epoch": 0.12258796821793416,
"grad_norm": 0.5462591052055359,
"learning_rate": 8.969180200933047e-05,
"loss": 0.027,
"step": 216
},
{
"epoch": 0.123155505107832,
"grad_norm": 1.4283958673477173,
"learning_rate": 8.959511229611376e-05,
"loss": 0.0177,
"step": 217
},
{
"epoch": 0.12372304199772985,
"grad_norm": 0.2500029504299164,
"learning_rate": 8.949802386055581e-05,
"loss": 0.0027,
"step": 218
},
{
"epoch": 0.12429057888762769,
"grad_norm": 0.452014297246933,
"learning_rate": 8.940053768033609e-05,
"loss": 0.0182,
"step": 219
},
{
"epoch": 0.12485811577752554,
"grad_norm": 0.896853506565094,
"learning_rate": 8.930265473713938e-05,
"loss": 0.0119,
"step": 220
},
{
"epoch": 0.1254256526674234,
"grad_norm": 0.2784510552883148,
"learning_rate": 8.92043760166458e-05,
"loss": 0.0081,
"step": 221
},
{
"epoch": 0.12599318955732122,
"grad_norm": 0.6255968809127808,
"learning_rate": 8.910570250852097e-05,
"loss": 0.0148,
"step": 222
},
{
"epoch": 0.12656072644721908,
"grad_norm": 0.7261110544204712,
"learning_rate": 8.900663520640604e-05,
"loss": 0.022,
"step": 223
},
{
"epoch": 0.1271282633371169,
"grad_norm": 1.0769401788711548,
"learning_rate": 8.890717510790763e-05,
"loss": 0.0061,
"step": 224
},
{
"epoch": 0.12769580022701477,
"grad_norm": 0.48627111315727234,
"learning_rate": 8.880732321458784e-05,
"loss": 0.0277,
"step": 225
},
{
"epoch": 0.1282633371169126,
"grad_norm": 0.23434560000896454,
"learning_rate": 8.870708053195413e-05,
"loss": 0.0031,
"step": 226
},
{
"epoch": 0.12883087400681045,
"grad_norm": 0.5444976091384888,
"learning_rate": 8.860644806944918e-05,
"loss": 0.0096,
"step": 227
},
{
"epoch": 0.12939841089670828,
"grad_norm": 1.0603594779968262,
"learning_rate": 8.850542684044078e-05,
"loss": 0.0314,
"step": 228
},
{
"epoch": 0.12996594778660614,
"grad_norm": 0.5144844651222229,
"learning_rate": 8.840401786221159e-05,
"loss": 0.013,
"step": 229
},
{
"epoch": 0.13053348467650397,
"grad_norm": 0.1403108835220337,
"learning_rate": 8.83022221559489e-05,
"loss": 0.0024,
"step": 230
},
{
"epoch": 0.13110102156640183,
"grad_norm": 0.24111324548721313,
"learning_rate": 8.820004074673433e-05,
"loss": 0.0034,
"step": 231
},
{
"epoch": 0.13166855845629966,
"grad_norm": 0.789040744304657,
"learning_rate": 8.809747466353356e-05,
"loss": 0.0295,
"step": 232
},
{
"epoch": 0.1322360953461975,
"grad_norm": 0.7863220572471619,
"learning_rate": 8.799452493918585e-05,
"loss": 0.0139,
"step": 233
},
{
"epoch": 0.13280363223609534,
"grad_norm": 0.5174320340156555,
"learning_rate": 8.789119261039385e-05,
"loss": 0.0097,
"step": 234
},
{
"epoch": 0.1333711691259932,
"grad_norm": 0.23707064986228943,
"learning_rate": 8.778747871771292e-05,
"loss": 0.0029,
"step": 235
},
{
"epoch": 0.13393870601589103,
"grad_norm": 0.20634303987026215,
"learning_rate": 8.768338430554082e-05,
"loss": 0.0036,
"step": 236
},
{
"epoch": 0.1345062429057889,
"grad_norm": 0.34142839908599854,
"learning_rate": 8.757891042210714e-05,
"loss": 0.0041,
"step": 237
},
{
"epoch": 0.13507377979568672,
"grad_norm": 0.3984658420085907,
"learning_rate": 8.74740581194627e-05,
"loss": 0.0062,
"step": 238
},
{
"epoch": 0.13564131668558457,
"grad_norm": 0.39278241991996765,
"learning_rate": 8.736882845346906e-05,
"loss": 0.011,
"step": 239
},
{
"epoch": 0.1362088535754824,
"grad_norm": 0.49513405561447144,
"learning_rate": 8.726322248378775e-05,
"loss": 0.0138,
"step": 240
},
{
"epoch": 0.13677639046538026,
"grad_norm": 0.1306513100862503,
"learning_rate": 8.715724127386972e-05,
"loss": 0.0009,
"step": 241
},
{
"epoch": 0.1373439273552781,
"grad_norm": 0.04676857590675354,
"learning_rate": 8.705088589094459e-05,
"loss": 0.0007,
"step": 242
},
{
"epoch": 0.13791146424517595,
"grad_norm": 0.1314803659915924,
"learning_rate": 8.694415740600988e-05,
"loss": 0.0018,
"step": 243
},
{
"epoch": 0.13847900113507378,
"grad_norm": 0.1374429613351822,
"learning_rate": 8.683705689382024e-05,
"loss": 0.0011,
"step": 244
},
{
"epoch": 0.13904653802497163,
"grad_norm": 0.0954061895608902,
"learning_rate": 8.672958543287666e-05,
"loss": 0.0008,
"step": 245
},
{
"epoch": 0.13961407491486946,
"grad_norm": 0.0,
"learning_rate": 8.662174410541555e-05,
"loss": 0.0,
"step": 246
},
{
"epoch": 0.14018161180476732,
"grad_norm": 0.0,
"learning_rate": 8.651353399739787e-05,
"loss": 0.0,
"step": 247
},
{
"epoch": 0.14074914869466515,
"grad_norm": 0.0,
"learning_rate": 8.640495619849821e-05,
"loss": 0.0,
"step": 248
},
{
"epoch": 0.141316685584563,
"grad_norm": 0.0,
"learning_rate": 8.629601180209381e-05,
"loss": 0.0,
"step": 249
},
{
"epoch": 0.14188422247446084,
"grad_norm": 0.0,
"learning_rate": 8.618670190525352e-05,
"loss": 0.0,
"step": 250
},
{
"epoch": 0.1424517593643587,
"grad_norm": 5.090747356414795,
"learning_rate": 8.607702760872678e-05,
"loss": 0.3383,
"step": 251
},
{
"epoch": 0.14301929625425652,
"grad_norm": 3.2074856758117676,
"learning_rate": 8.596699001693255e-05,
"loss": 0.1635,
"step": 252
},
{
"epoch": 0.14358683314415438,
"grad_norm": 1.3404688835144043,
"learning_rate": 8.585659023794818e-05,
"loss": 0.0633,
"step": 253
},
{
"epoch": 0.1441543700340522,
"grad_norm": 0.9414183497428894,
"learning_rate": 8.574582938349817e-05,
"loss": 0.0453,
"step": 254
},
{
"epoch": 0.14472190692395007,
"grad_norm": 1.2090741395950317,
"learning_rate": 8.563470856894316e-05,
"loss": 0.0521,
"step": 255
},
{
"epoch": 0.1452894438138479,
"grad_norm": 0.47880247235298157,
"learning_rate": 8.552322891326846e-05,
"loss": 0.0178,
"step": 256
},
{
"epoch": 0.14585698070374575,
"grad_norm": 0.9624150991439819,
"learning_rate": 8.541139153907296e-05,
"loss": 0.0165,
"step": 257
},
{
"epoch": 0.14642451759364358,
"grad_norm": 0.33472761511802673,
"learning_rate": 8.529919757255783e-05,
"loss": 0.0167,
"step": 258
},
{
"epoch": 0.14699205448354144,
"grad_norm": 0.44358474016189575,
"learning_rate": 8.518664814351502e-05,
"loss": 0.0156,
"step": 259
},
{
"epoch": 0.14755959137343927,
"grad_norm": 0.7658271789550781,
"learning_rate": 8.507374438531607e-05,
"loss": 0.0382,
"step": 260
},
{
"epoch": 0.14812712826333713,
"grad_norm": 0.30857032537460327,
"learning_rate": 8.496048743490053e-05,
"loss": 0.0075,
"step": 261
},
{
"epoch": 0.14869466515323496,
"grad_norm": 1.9148088693618774,
"learning_rate": 8.484687843276469e-05,
"loss": 0.0156,
"step": 262
},
{
"epoch": 0.1492622020431328,
"grad_norm": 0.8631612658500671,
"learning_rate": 8.473291852294987e-05,
"loss": 0.0441,
"step": 263
},
{
"epoch": 0.14982973893303064,
"grad_norm": 0.5475025773048401,
"learning_rate": 8.461860885303114e-05,
"loss": 0.0406,
"step": 264
},
{
"epoch": 0.1503972758229285,
"grad_norm": 0.12846483290195465,
"learning_rate": 8.450395057410561e-05,
"loss": 0.003,
"step": 265
},
{
"epoch": 0.15096481271282633,
"grad_norm": 0.275508850812912,
"learning_rate": 8.438894484078086e-05,
"loss": 0.0133,
"step": 266
},
{
"epoch": 0.1515323496027242,
"grad_norm": 0.5227663516998291,
"learning_rate": 8.427359281116334e-05,
"loss": 0.0177,
"step": 267
},
{
"epoch": 0.15209988649262202,
"grad_norm": 0.6141021847724915,
"learning_rate": 8.415789564684673e-05,
"loss": 0.0131,
"step": 268
},
{
"epoch": 0.15266742338251987,
"grad_norm": 0.04998692125082016,
"learning_rate": 8.404185451290018e-05,
"loss": 0.0013,
"step": 269
},
{
"epoch": 0.1532349602724177,
"grad_norm": 0.46891334652900696,
"learning_rate": 8.392547057785661e-05,
"loss": 0.0273,
"step": 270
},
{
"epoch": 0.15380249716231556,
"grad_norm": 0.5121367573738098,
"learning_rate": 8.380874501370097e-05,
"loss": 0.019,
"step": 271
},
{
"epoch": 0.1543700340522134,
"grad_norm": 0.12058154493570328,
"learning_rate": 8.369167899585841e-05,
"loss": 0.0022,
"step": 272
},
{
"epoch": 0.15493757094211125,
"grad_norm": 0.31559038162231445,
"learning_rate": 8.357427370318239e-05,
"loss": 0.0164,
"step": 273
},
{
"epoch": 0.15550510783200908,
"grad_norm": 0.7393925189971924,
"learning_rate": 8.345653031794292e-05,
"loss": 0.0179,
"step": 274
},
{
"epoch": 0.15607264472190693,
"grad_norm": 0.4647248089313507,
"learning_rate": 8.333845002581458e-05,
"loss": 0.008,
"step": 275
},
{
"epoch": 0.15664018161180476,
"grad_norm": 0.21942315995693207,
"learning_rate": 8.322003401586462e-05,
"loss": 0.0028,
"step": 276
},
{
"epoch": 0.15720771850170262,
"grad_norm": 0.4762805104255676,
"learning_rate": 8.310128348054094e-05,
"loss": 0.0254,
"step": 277
},
{
"epoch": 0.15777525539160045,
"grad_norm": 0.3500935435295105,
"learning_rate": 8.298219961566009e-05,
"loss": 0.0085,
"step": 278
},
{
"epoch": 0.1583427922814983,
"grad_norm": 0.3653341233730316,
"learning_rate": 8.286278362039528e-05,
"loss": 0.0074,
"step": 279
},
{
"epoch": 0.15891032917139614,
"grad_norm": 0.2545139193534851,
"learning_rate": 8.274303669726426e-05,
"loss": 0.0049,
"step": 280
},
{
"epoch": 0.159477866061294,
"grad_norm": 0.0680074691772461,
"learning_rate": 8.262296005211721e-05,
"loss": 0.0014,
"step": 281
},
{
"epoch": 0.16004540295119182,
"grad_norm": 0.2128506749868393,
"learning_rate": 8.250255489412463e-05,
"loss": 0.0049,
"step": 282
},
{
"epoch": 0.16061293984108968,
"grad_norm": 0.0370691753923893,
"learning_rate": 8.238182243576512e-05,
"loss": 0.0008,
"step": 283
},
{
"epoch": 0.1611804767309875,
"grad_norm": 0.05422423034906387,
"learning_rate": 8.226076389281316e-05,
"loss": 0.0009,
"step": 284
},
{
"epoch": 0.16174801362088537,
"grad_norm": 0.25359049439430237,
"learning_rate": 8.213938048432697e-05,
"loss": 0.002,
"step": 285
},
{
"epoch": 0.1623155505107832,
"grad_norm": 0.12178357690572739,
"learning_rate": 8.201767343263612e-05,
"loss": 0.0022,
"step": 286
},
{
"epoch": 0.16288308740068105,
"grad_norm": 0.06412477791309357,
"learning_rate": 8.189564396332928e-05,
"loss": 0.0012,
"step": 287
},
{
"epoch": 0.16345062429057888,
"grad_norm": 0.04413165897130966,
"learning_rate": 8.177329330524182e-05,
"loss": 0.0009,
"step": 288
},
{
"epoch": 0.16401816118047674,
"grad_norm": 0.1375110000371933,
"learning_rate": 8.165062269044353e-05,
"loss": 0.0019,
"step": 289
},
{
"epoch": 0.16458569807037457,
"grad_norm": 0.04058424010872841,
"learning_rate": 8.152763335422613e-05,
"loss": 0.0006,
"step": 290
},
{
"epoch": 0.16515323496027243,
"grad_norm": 0.46348121762275696,
"learning_rate": 8.140432653509089e-05,
"loss": 0.0038,
"step": 291
},
{
"epoch": 0.16572077185017026,
"grad_norm": 0.18360164761543274,
"learning_rate": 8.128070347473609e-05,
"loss": 0.0018,
"step": 292
},
{
"epoch": 0.1662883087400681,
"grad_norm": 0.6381713151931763,
"learning_rate": 8.115676541804456e-05,
"loss": 0.0069,
"step": 293
},
{
"epoch": 0.16685584562996594,
"grad_norm": 0.11540421843528748,
"learning_rate": 8.103251361307119e-05,
"loss": 0.0011,
"step": 294
},
{
"epoch": 0.1674233825198638,
"grad_norm": 0.5324001908302307,
"learning_rate": 8.090794931103026e-05,
"loss": 0.0035,
"step": 295
},
{
"epoch": 0.16799091940976163,
"grad_norm": 0.0,
"learning_rate": 8.07830737662829e-05,
"loss": 0.0,
"step": 296
},
{
"epoch": 0.1685584562996595,
"grad_norm": 0.12677313387393951,
"learning_rate": 8.065788823632451e-05,
"loss": 0.0018,
"step": 297
},
{
"epoch": 0.16912599318955732,
"grad_norm": 0.0,
"learning_rate": 8.053239398177191e-05,
"loss": 0.0,
"step": 298
},
{
"epoch": 0.16969353007945517,
"grad_norm": 0.0,
"learning_rate": 8.04065922663509e-05,
"loss": 0.0,
"step": 299
},
{
"epoch": 0.170261066969353,
"grad_norm": 0.0,
"learning_rate": 8.028048435688333e-05,
"loss": 0.0,
"step": 300
},
{
"epoch": 0.17082860385925086,
"grad_norm": 3.0016751289367676,
"learning_rate": 8.015407152327448e-05,
"loss": 0.2288,
"step": 301
},
{
"epoch": 0.1713961407491487,
"grad_norm": 1.9297741651535034,
"learning_rate": 8.002735503850016e-05,
"loss": 0.0538,
"step": 302
},
{
"epoch": 0.17196367763904655,
"grad_norm": 0.6649733185768127,
"learning_rate": 7.990033617859396e-05,
"loss": 0.0204,
"step": 303
},
{
"epoch": 0.17253121452894438,
"grad_norm": 1.4802879095077515,
"learning_rate": 7.97730162226344e-05,
"loss": 0.0355,
"step": 304
},
{
"epoch": 0.17309875141884223,
"grad_norm": 1.2510706186294556,
"learning_rate": 7.964539645273204e-05,
"loss": 0.0712,
"step": 305
},
{
"epoch": 0.17366628830874006,
"grad_norm": 0.45705509185791016,
"learning_rate": 7.95174781540165e-05,
"loss": 0.0259,
"step": 306
},
{
"epoch": 0.17423382519863792,
"grad_norm": 0.5345933437347412,
"learning_rate": 7.938926261462366e-05,
"loss": 0.02,
"step": 307
},
{
"epoch": 0.17480136208853575,
"grad_norm": 0.3882739841938019,
"learning_rate": 7.926075112568259e-05,
"loss": 0.014,
"step": 308
},
{
"epoch": 0.1753688989784336,
"grad_norm": 0.07783171534538269,
"learning_rate": 7.913194498130252e-05,
"loss": 0.0029,
"step": 309
},
{
"epoch": 0.17593643586833144,
"grad_norm": 0.39135614037513733,
"learning_rate": 7.900284547855991e-05,
"loss": 0.0127,
"step": 310
},
{
"epoch": 0.1765039727582293,
"grad_norm": 0.6026532053947449,
"learning_rate": 7.887345391748533e-05,
"loss": 0.003,
"step": 311
},
{
"epoch": 0.17707150964812712,
"grad_norm": 0.4667406380176544,
"learning_rate": 7.874377160105036e-05,
"loss": 0.008,
"step": 312
},
{
"epoch": 0.17763904653802498,
"grad_norm": 0.4185452461242676,
"learning_rate": 7.861379983515449e-05,
"loss": 0.0121,
"step": 313
},
{
"epoch": 0.1782065834279228,
"grad_norm": 0.28240469098091125,
"learning_rate": 7.848353992861195e-05,
"loss": 0.007,
"step": 314
},
{
"epoch": 0.17877412031782067,
"grad_norm": 0.2165137082338333,
"learning_rate": 7.835299319313853e-05,
"loss": 0.0057,
"step": 315
},
{
"epoch": 0.1793416572077185,
"grad_norm": 0.4501195549964905,
"learning_rate": 7.822216094333847e-05,
"loss": 0.016,
"step": 316
},
{
"epoch": 0.17990919409761635,
"grad_norm": 0.21665939688682556,
"learning_rate": 7.809104449669101e-05,
"loss": 0.0053,
"step": 317
},
{
"epoch": 0.18047673098751418,
"grad_norm": 0.45562973618507385,
"learning_rate": 7.795964517353735e-05,
"loss": 0.0399,
"step": 318
},
{
"epoch": 0.18104426787741204,
"grad_norm": 0.2064024657011032,
"learning_rate": 7.78279642970672e-05,
"loss": 0.0033,
"step": 319
},
{
"epoch": 0.18161180476730987,
"grad_norm": 0.06030461564660072,
"learning_rate": 7.769600319330552e-05,
"loss": 0.0015,
"step": 320
},
{
"epoch": 0.18217934165720773,
"grad_norm": 0.12329499423503876,
"learning_rate": 7.756376319109917e-05,
"loss": 0.002,
"step": 321
},
{
"epoch": 0.18274687854710556,
"grad_norm": 0.09843454509973526,
"learning_rate": 7.74312456221035e-05,
"loss": 0.0024,
"step": 322
},
{
"epoch": 0.18331441543700341,
"grad_norm": 0.3691561222076416,
"learning_rate": 7.729845182076895e-05,
"loss": 0.0087,
"step": 323
},
{
"epoch": 0.18388195232690124,
"grad_norm": 0.18092408776283264,
"learning_rate": 7.716538312432766e-05,
"loss": 0.0042,
"step": 324
},
{
"epoch": 0.1844494892167991,
"grad_norm": 0.2996932566165924,
"learning_rate": 7.703204087277988e-05,
"loss": 0.0221,
"step": 325
},
{
"epoch": 0.18501702610669693,
"grad_norm": 0.12783432006835938,
"learning_rate": 7.689842640888063e-05,
"loss": 0.0033,
"step": 326
},
{
"epoch": 0.1855845629965948,
"grad_norm": 0.26556456089019775,
"learning_rate": 7.676454107812607e-05,
"loss": 0.0056,
"step": 327
},
{
"epoch": 0.18615209988649262,
"grad_norm": 0.1686052829027176,
"learning_rate": 7.663038622873999e-05,
"loss": 0.0029,
"step": 328
},
{
"epoch": 0.18671963677639047,
"grad_norm": 0.04450399801135063,
"learning_rate": 7.649596321166024e-05,
"loss": 0.0007,
"step": 329
},
{
"epoch": 0.1872871736662883,
"grad_norm": 0.2206043303012848,
"learning_rate": 7.636127338052512e-05,
"loss": 0.0019,
"step": 330
},
{
"epoch": 0.18785471055618616,
"grad_norm": 0.050599176436662674,
"learning_rate": 7.622631809165973e-05,
"loss": 0.0006,
"step": 331
},
{
"epoch": 0.188422247446084,
"grad_norm": 1.3266581296920776,
"learning_rate": 7.60910987040623e-05,
"loss": 0.0502,
"step": 332
},
{
"epoch": 0.18898978433598185,
"grad_norm": 0.012006393633782864,
"learning_rate": 7.595561657939061e-05,
"loss": 0.0004,
"step": 333
},
{
"epoch": 0.18955732122587968,
"grad_norm": 0.24034585058689117,
"learning_rate": 7.58198730819481e-05,
"loss": 0.0033,
"step": 334
},
{
"epoch": 0.19012485811577753,
"grad_norm": 0.378845751285553,
"learning_rate": 7.568386957867033e-05,
"loss": 0.0189,
"step": 335
},
{
"epoch": 0.19069239500567536,
"grad_norm": 0.019944118335843086,
"learning_rate": 7.554760743911103e-05,
"loss": 0.0004,
"step": 336
},
{
"epoch": 0.19125993189557322,
"grad_norm": 1.7960922718048096,
"learning_rate": 7.541108803542846e-05,
"loss": 0.0138,
"step": 337
},
{
"epoch": 0.19182746878547105,
"grad_norm": 0.015294855460524559,
"learning_rate": 7.52743127423715e-05,
"loss": 0.0004,
"step": 338
},
{
"epoch": 0.1923950056753689,
"grad_norm": 0.08567917346954346,
"learning_rate": 7.51372829372658e-05,
"loss": 0.0013,
"step": 339
},
{
"epoch": 0.19296254256526674,
"grad_norm": 0.07724782824516296,
"learning_rate": 7.500000000000001e-05,
"loss": 0.0008,
"step": 340
},
{
"epoch": 0.1935300794551646,
"grad_norm": 0.009706157259643078,
"learning_rate": 7.486246531301177e-05,
"loss": 0.0003,
"step": 341
},
{
"epoch": 0.19409761634506242,
"grad_norm": 0.04667770117521286,
"learning_rate": 7.472468026127385e-05,
"loss": 0.0004,
"step": 342
},
{
"epoch": 0.19466515323496028,
"grad_norm": 0.01700473390519619,
"learning_rate": 7.45866462322802e-05,
"loss": 0.0004,
"step": 343
},
{
"epoch": 0.1952326901248581,
"grad_norm": 0.6124288439750671,
"learning_rate": 7.444836461603195e-05,
"loss": 0.0087,
"step": 344
},
{
"epoch": 0.19580022701475597,
"grad_norm": 0.4518308937549591,
"learning_rate": 7.430983680502344e-05,
"loss": 0.0028,
"step": 345
},
{
"epoch": 0.1963677639046538,
"grad_norm": 0.0,
"learning_rate": 7.417106419422819e-05,
"loss": 0.0,
"step": 346
},
{
"epoch": 0.19693530079455165,
"grad_norm": 0.0,
"learning_rate": 7.403204818108487e-05,
"loss": 0.0,
"step": 347
},
{
"epoch": 0.19750283768444948,
"grad_norm": 0.14064697921276093,
"learning_rate": 7.389279016548316e-05,
"loss": 0.0015,
"step": 348
},
{
"epoch": 0.19807037457434734,
"grad_norm": 0.0,
"learning_rate": 7.375329154974975e-05,
"loss": 0.0,
"step": 349
},
{
"epoch": 0.19863791146424517,
"grad_norm": 0.0,
"learning_rate": 7.361355373863414e-05,
"loss": 0.0,
"step": 350
},
{
"epoch": 0.19920544835414303,
"grad_norm": 2.25301194190979,
"learning_rate": 7.347357813929454e-05,
"loss": 0.0754,
"step": 351
},
{
"epoch": 0.19977298524404086,
"grad_norm": 1.7841074466705322,
"learning_rate": 7.333336616128369e-05,
"loss": 0.051,
"step": 352
},
{
"epoch": 0.20034052213393871,
"grad_norm": 0.9712184071540833,
"learning_rate": 7.319291921653464e-05,
"loss": 0.016,
"step": 353
},
{
"epoch": 0.20090805902383654,
"grad_norm": 0.5902833342552185,
"learning_rate": 7.305223871934657e-05,
"loss": 0.0106,
"step": 354
},
{
"epoch": 0.2014755959137344,
"grad_norm": 0.2831375002861023,
"learning_rate": 7.291132608637052e-05,
"loss": 0.0077,
"step": 355
},
{
"epoch": 0.20204313280363223,
"grad_norm": 0.5281710624694824,
"learning_rate": 7.277018273659517e-05,
"loss": 0.0236,
"step": 356
},
{
"epoch": 0.2026106696935301,
"grad_norm": 0.7202128767967224,
"learning_rate": 7.262881009133242e-05,
"loss": 0.0256,
"step": 357
},
{
"epoch": 0.20317820658342792,
"grad_norm": 0.2578269839286804,
"learning_rate": 7.24872095742033e-05,
"loss": 0.0061,
"step": 358
},
{
"epoch": 0.20374574347332577,
"grad_norm": 0.21329842507839203,
"learning_rate": 7.23453826111234e-05,
"loss": 0.0029,
"step": 359
},
{
"epoch": 0.2043132803632236,
"grad_norm": 0.3269996643066406,
"learning_rate": 7.220333063028872e-05,
"loss": 0.0042,
"step": 360
},
{
"epoch": 0.20488081725312146,
"grad_norm": 0.5432631373405457,
"learning_rate": 7.206105506216106e-05,
"loss": 0.0272,
"step": 361
},
{
"epoch": 0.2054483541430193,
"grad_norm": 1.9401220083236694,
"learning_rate": 7.191855733945387e-05,
"loss": 0.0163,
"step": 362
},
{
"epoch": 0.20601589103291715,
"grad_norm": 0.24572275578975677,
"learning_rate": 7.177583889711762e-05,
"loss": 0.006,
"step": 363
},
{
"epoch": 0.20658342792281498,
"grad_norm": 0.13575679063796997,
"learning_rate": 7.163290117232542e-05,
"loss": 0.0026,
"step": 364
},
{
"epoch": 0.20715096481271283,
"grad_norm": 0.4401944577693939,
"learning_rate": 7.148974560445859e-05,
"loss": 0.0187,
"step": 365
},
{
"epoch": 0.20771850170261066,
"grad_norm": 0.45200228691101074,
"learning_rate": 7.13463736350921e-05,
"loss": 0.0083,
"step": 366
},
{
"epoch": 0.20828603859250852,
"grad_norm": 0.5528292655944824,
"learning_rate": 7.120278670798009e-05,
"loss": 0.0092,
"step": 367
},
{
"epoch": 0.20885357548240635,
"grad_norm": 0.5644862651824951,
"learning_rate": 7.105898626904134e-05,
"loss": 0.0147,
"step": 368
},
{
"epoch": 0.2094211123723042,
"grad_norm": 0.7960838675498962,
"learning_rate": 7.091497376634464e-05,
"loss": 0.0252,
"step": 369
},
{
"epoch": 0.20998864926220204,
"grad_norm": 0.26013273000717163,
"learning_rate": 7.077075065009433e-05,
"loss": 0.003,
"step": 370
},
{
"epoch": 0.2105561861520999,
"grad_norm": 0.44845283031463623,
"learning_rate": 7.062631837261557e-05,
"loss": 0.005,
"step": 371
},
{
"epoch": 0.21112372304199772,
"grad_norm": 0.48455584049224854,
"learning_rate": 7.048167838833977e-05,
"loss": 0.0084,
"step": 372
},
{
"epoch": 0.21169125993189558,
"grad_norm": 0.48875439167022705,
"learning_rate": 7.033683215379002e-05,
"loss": 0.0034,
"step": 373
},
{
"epoch": 0.2122587968217934,
"grad_norm": 1.5464515686035156,
"learning_rate": 7.019178112756624e-05,
"loss": 0.0422,
"step": 374
},
{
"epoch": 0.21282633371169127,
"grad_norm": 0.08226021379232407,
"learning_rate": 7.004652677033068e-05,
"loss": 0.0014,
"step": 375
},
{
"epoch": 0.2133938706015891,
"grad_norm": 0.9428783655166626,
"learning_rate": 6.990107054479312e-05,
"loss": 0.0085,
"step": 376
},
{
"epoch": 0.21396140749148695,
"grad_norm": 0.5083039999008179,
"learning_rate": 6.97554139156961e-05,
"loss": 0.0054,
"step": 377
},
{
"epoch": 0.21452894438138478,
"grad_norm": 0.7152851819992065,
"learning_rate": 6.960955834980028e-05,
"loss": 0.0118,
"step": 378
},
{
"epoch": 0.21509648127128264,
"grad_norm": 0.7423697113990784,
"learning_rate": 6.946350531586959e-05,
"loss": 0.0196,
"step": 379
},
{
"epoch": 0.21566401816118047,
"grad_norm": 0.19148842990398407,
"learning_rate": 6.931725628465643e-05,
"loss": 0.0029,
"step": 380
},
{
"epoch": 0.21623155505107833,
"grad_norm": 0.16525211930274963,
"learning_rate": 6.917081272888697e-05,
"loss": 0.0018,
"step": 381
},
{
"epoch": 0.21679909194097616,
"grad_norm": 0.7403731942176819,
"learning_rate": 6.902417612324615e-05,
"loss": 0.0156,
"step": 382
},
{
"epoch": 0.21736662883087401,
"grad_norm": 0.7522996068000793,
"learning_rate": 6.8877347944363e-05,
"loss": 0.0106,
"step": 383
},
{
"epoch": 0.21793416572077184,
"grad_norm": 0.35488778352737427,
"learning_rate": 6.873032967079561e-05,
"loss": 0.0266,
"step": 384
},
{
"epoch": 0.2185017026106697,
"grad_norm": 0.5787685513496399,
"learning_rate": 6.858312278301637e-05,
"loss": 0.0044,
"step": 385
},
{
"epoch": 0.21906923950056753,
"grad_norm": 0.5935757756233215,
"learning_rate": 6.843572876339705e-05,
"loss": 0.0101,
"step": 386
},
{
"epoch": 0.2196367763904654,
"grad_norm": 0.0742402896285057,
"learning_rate": 6.828814909619373e-05,
"loss": 0.0006,
"step": 387
},
{
"epoch": 0.22020431328036322,
"grad_norm": 0.08480936288833618,
"learning_rate": 6.814038526753205e-05,
"loss": 0.0014,
"step": 388
},
{
"epoch": 0.22077185017026107,
"grad_norm": 0.20840072631835938,
"learning_rate": 6.799243876539212e-05,
"loss": 0.0017,
"step": 389
},
{
"epoch": 0.2213393870601589,
"grad_norm": 0.050797827541828156,
"learning_rate": 6.784431107959359e-05,
"loss": 0.0007,
"step": 390
},
{
"epoch": 0.22190692395005676,
"grad_norm": 0.0777006596326828,
"learning_rate": 6.769600370178059e-05,
"loss": 0.0013,
"step": 391
},
{
"epoch": 0.2224744608399546,
"grad_norm": 0.04812987521290779,
"learning_rate": 6.754751812540679e-05,
"loss": 0.0008,
"step": 392
},
{
"epoch": 0.22304199772985245,
"grad_norm": 0.07565217465162277,
"learning_rate": 6.739885584572026e-05,
"loss": 0.0006,
"step": 393
},
{
"epoch": 0.22360953461975028,
"grad_norm": 0.03945764899253845,
"learning_rate": 6.725001835974853e-05,
"loss": 0.0005,
"step": 394
},
{
"epoch": 0.22417707150964813,
"grad_norm": 1.1225963830947876,
"learning_rate": 6.710100716628344e-05,
"loss": 0.0077,
"step": 395
},
{
"epoch": 0.22474460839954596,
"grad_norm": 0.24701376259326935,
"learning_rate": 6.695182376586603e-05,
"loss": 0.0024,
"step": 396
},
{
"epoch": 0.22531214528944382,
"grad_norm": 0.0,
"learning_rate": 6.680246966077151e-05,
"loss": 0.0,
"step": 397
},
{
"epoch": 0.22587968217934165,
"grad_norm": 0.0,
"learning_rate": 6.665294635499404e-05,
"loss": 0.0,
"step": 398
},
{
"epoch": 0.2264472190692395,
"grad_norm": 0.0,
"learning_rate": 6.650325535423167e-05,
"loss": 0.0,
"step": 399
},
{
"epoch": 0.22701475595913734,
"grad_norm": 0.0,
"learning_rate": 6.635339816587109e-05,
"loss": 0.0,
"step": 400
},
{
"epoch": 0.22701475595913734,
"eval_loss": NaN,
"eval_runtime": 106.1666,
"eval_samples_per_second": 27.956,
"eval_steps_per_second": 6.989,
"step": 400
},
{
"epoch": 0.2275822928490352,
"grad_norm": 1.7875468730926514,
"learning_rate": 6.620337629897254e-05,
"loss": 0.1379,
"step": 401
},
{
"epoch": 0.22814982973893302,
"grad_norm": 1.6237751245498657,
"learning_rate": 6.605319126425454e-05,
"loss": 0.02,
"step": 402
},
{
"epoch": 0.22871736662883088,
"grad_norm": 0.16924582421779633,
"learning_rate": 6.590284457407876e-05,
"loss": 0.0052,
"step": 403
},
{
"epoch": 0.2292849035187287,
"grad_norm": 0.3651062250137329,
"learning_rate": 6.575233774243465e-05,
"loss": 0.0265,
"step": 404
},
{
"epoch": 0.22985244040862657,
"grad_norm": 0.13029779493808746,
"learning_rate": 6.560167228492436e-05,
"loss": 0.0032,
"step": 405
},
{
"epoch": 0.2304199772985244,
"grad_norm": 0.11845195293426514,
"learning_rate": 6.545084971874738e-05,
"loss": 0.0039,
"step": 406
},
{
"epoch": 0.23098751418842225,
"grad_norm": 0.344365656375885,
"learning_rate": 6.529987156268526e-05,
"loss": 0.0043,
"step": 407
},
{
"epoch": 0.23155505107832008,
"grad_norm": 0.8041085004806519,
"learning_rate": 6.514873933708638e-05,
"loss": 0.0077,
"step": 408
},
{
"epoch": 0.23212258796821794,
"grad_norm": 0.05168134719133377,
"learning_rate": 6.499745456385054e-05,
"loss": 0.0016,
"step": 409
},
{
"epoch": 0.23269012485811577,
"grad_norm": 0.27078044414520264,
"learning_rate": 6.484601876641375e-05,
"loss": 0.0126,
"step": 410
},
{
"epoch": 0.23325766174801363,
"grad_norm": 0.28879401087760925,
"learning_rate": 6.46944334697328e-05,
"loss": 0.0115,
"step": 411
},
{
"epoch": 0.23382519863791146,
"grad_norm": 0.9801868200302124,
"learning_rate": 6.454270020026995e-05,
"loss": 0.0165,
"step": 412
},
{
"epoch": 0.23439273552780931,
"grad_norm": 0.28464144468307495,
"learning_rate": 6.439082048597755e-05,
"loss": 0.0039,
"step": 413
},
{
"epoch": 0.23496027241770714,
"grad_norm": 0.07441152632236481,
"learning_rate": 6.423879585628261e-05,
"loss": 0.0013,
"step": 414
},
{
"epoch": 0.235527809307605,
"grad_norm": 0.4811006188392639,
"learning_rate": 6.408662784207149e-05,
"loss": 0.0168,
"step": 415
},
{
"epoch": 0.23609534619750283,
"grad_norm": 0.15365412831306458,
"learning_rate": 6.39343179756744e-05,
"loss": 0.0032,
"step": 416
},
{
"epoch": 0.2366628830874007,
"grad_norm": 0.1440768986940384,
"learning_rate": 6.378186779084995e-05,
"loss": 0.0026,
"step": 417
},
{
"epoch": 0.23723041997729852,
"grad_norm": 0.12057225406169891,
"learning_rate": 6.36292788227699e-05,
"loss": 0.0019,
"step": 418
},
{
"epoch": 0.23779795686719638,
"grad_norm": 0.41369637846946716,
"learning_rate": 6.34765526080034e-05,
"loss": 0.0114,
"step": 419
},
{
"epoch": 0.2383654937570942,
"grad_norm": 0.06439776718616486,
"learning_rate": 6.332369068450174e-05,
"loss": 0.0015,
"step": 420
},
{
"epoch": 0.23893303064699206,
"grad_norm": 0.05166054517030716,
"learning_rate": 6.317069459158284e-05,
"loss": 0.0012,
"step": 421
},
{
"epoch": 0.2395005675368899,
"grad_norm": 0.040280867367982864,
"learning_rate": 6.30175658699156e-05,
"loss": 0.0011,
"step": 422
},
{
"epoch": 0.24006810442678775,
"grad_norm": 0.2597777843475342,
"learning_rate": 6.286430606150459e-05,
"loss": 0.0034,
"step": 423
},
{
"epoch": 0.24063564131668558,
"grad_norm": 0.34109053015708923,
"learning_rate": 6.271091670967436e-05,
"loss": 0.0055,
"step": 424
},
{
"epoch": 0.24120317820658344,
"grad_norm": 0.6494819521903992,
"learning_rate": 6.255739935905396e-05,
"loss": 0.0092,
"step": 425
},
{
"epoch": 0.24177071509648126,
"grad_norm": 0.6631916165351868,
"learning_rate": 6.240375555556145e-05,
"loss": 0.016,
"step": 426
},
{
"epoch": 0.24233825198637912,
"grad_norm": 0.23462681472301483,
"learning_rate": 6.22499868463882e-05,
"loss": 0.0034,
"step": 427
},
{
"epoch": 0.24290578887627695,
"grad_norm": 0.03203234449028969,
"learning_rate": 6.209609477998338e-05,
"loss": 0.0007,
"step": 428
},
{
"epoch": 0.2434733257661748,
"grad_norm": 0.08686164021492004,
"learning_rate": 6.194208090603844e-05,
"loss": 0.0013,
"step": 429
},
{
"epoch": 0.24404086265607264,
"grad_norm": 0.2361176759004593,
"learning_rate": 6.178794677547137e-05,
"loss": 0.0025,
"step": 430
},
{
"epoch": 0.2446083995459705,
"grad_norm": 0.30464842915534973,
"learning_rate": 6.163369394041111e-05,
"loss": 0.0026,
"step": 431
},
{
"epoch": 0.24517593643586832,
"grad_norm": 1.3310155868530273,
"learning_rate": 6.147932395418205e-05,
"loss": 0.0677,
"step": 432
},
{
"epoch": 0.24574347332576618,
"grad_norm": 0.006984487175941467,
"learning_rate": 6.132483837128823e-05,
"loss": 0.0002,
"step": 433
},
{
"epoch": 0.246311010215664,
"grad_norm": 0.7241768836975098,
"learning_rate": 6.117023874739772e-05,
"loss": 0.0213,
"step": 434
},
{
"epoch": 0.24687854710556187,
"grad_norm": 0.06802449375391006,
"learning_rate": 6.1015526639327035e-05,
"loss": 0.0007,
"step": 435
},
{
"epoch": 0.2474460839954597,
"grad_norm": 0.22861234843730927,
"learning_rate": 6.0860703605025395e-05,
"loss": 0.0027,
"step": 436
},
{
"epoch": 0.24801362088535756,
"grad_norm": 0.09795883297920227,
"learning_rate": 6.0705771203559024e-05,
"loss": 0.0008,
"step": 437
},
{
"epoch": 0.24858115777525538,
"grad_norm": 0.9199258685112,
"learning_rate": 6.05507309950955e-05,
"loss": 0.0459,
"step": 438
},
{
"epoch": 0.24914869466515324,
"grad_norm": 1.980196475982666,
"learning_rate": 6.0395584540887963e-05,
"loss": 0.0015,
"step": 439
},
{
"epoch": 0.24971623155505107,
"grad_norm": 0.2596800625324249,
"learning_rate": 6.024033340325954e-05,
"loss": 0.0033,
"step": 440
},
{
"epoch": 0.25028376844494893,
"grad_norm": 0.9777965545654297,
"learning_rate": 6.008497914558744e-05,
"loss": 0.017,
"step": 441
},
{
"epoch": 0.2508513053348468,
"grad_norm": 0.36498022079467773,
"learning_rate": 5.992952333228728e-05,
"loss": 0.0042,
"step": 442
},
{
"epoch": 0.2514188422247446,
"grad_norm": 0.022741034626960754,
"learning_rate": 5.9773967528797414e-05,
"loss": 0.0004,
"step": 443
},
{
"epoch": 0.25198637911464244,
"grad_norm": 0.27772486209869385,
"learning_rate": 5.9618313301563055e-05,
"loss": 0.0032,
"step": 444
},
{
"epoch": 0.2525539160045403,
"grad_norm": 0.0808940976858139,
"learning_rate": 5.946256221802051e-05,
"loss": 0.0002,
"step": 445
},
{
"epoch": 0.25312145289443816,
"grad_norm": 0.27334359288215637,
"learning_rate": 5.9306715846581506e-05,
"loss": 0.0004,
"step": 446
},
{
"epoch": 0.25368898978433596,
"grad_norm": 0.0,
"learning_rate": 5.915077575661723e-05,
"loss": 0.0,
"step": 447
},
{
"epoch": 0.2542565266742338,
"grad_norm": 0.0,
"learning_rate": 5.8994743518442694e-05,
"loss": 0.0,
"step": 448
},
{
"epoch": 0.2548240635641317,
"grad_norm": 0.0,
"learning_rate": 5.8838620703300784e-05,
"loss": 0.0,
"step": 449
},
{
"epoch": 0.25539160045402953,
"grad_norm": 0.0,
"learning_rate": 5.868240888334653e-05,
"loss": 0.0,
"step": 450
},
{
"epoch": 0.25595913734392733,
"grad_norm": 1.7918459177017212,
"learning_rate": 5.85261096316312e-05,
"loss": 0.1334,
"step": 451
},
{
"epoch": 0.2565266742338252,
"grad_norm": 1.2250529527664185,
"learning_rate": 5.836972452208654e-05,
"loss": 0.0459,
"step": 452
},
{
"epoch": 0.25709421112372305,
"grad_norm": 0.15675216913223267,
"learning_rate": 5.821325512950886e-05,
"loss": 0.0036,
"step": 453
},
{
"epoch": 0.2576617480136209,
"grad_norm": 0.18163926899433136,
"learning_rate": 5.805670302954321e-05,
"loss": 0.0018,
"step": 454
},
{
"epoch": 0.2582292849035187,
"grad_norm": 0.2342452108860016,
"learning_rate": 5.79000697986675e-05,
"loss": 0.0119,
"step": 455
},
{
"epoch": 0.25879682179341656,
"grad_norm": 0.7253749370574951,
"learning_rate": 5.7743357014176624e-05,
"loss": 0.0193,
"step": 456
},
{
"epoch": 0.2593643586833144,
"grad_norm": 0.45949435234069824,
"learning_rate": 5.7586566254166583e-05,
"loss": 0.0124,
"step": 457
},
{
"epoch": 0.2599318955732123,
"grad_norm": 0.05333389714360237,
"learning_rate": 5.7429699097518585e-05,
"loss": 0.0021,
"step": 458
},
{
"epoch": 0.2604994324631101,
"grad_norm": 0.22825823724269867,
"learning_rate": 5.7272757123883184e-05,
"loss": 0.0032,
"step": 459
},
{
"epoch": 0.26106696935300794,
"grad_norm": 0.35596853494644165,
"learning_rate": 5.7115741913664264e-05,
"loss": 0.0059,
"step": 460
},
{
"epoch": 0.2616345062429058,
"grad_norm": 0.4787977635860443,
"learning_rate": 5.695865504800327e-05,
"loss": 0.0123,
"step": 461
},
{
"epoch": 0.26220204313280365,
"grad_norm": 0.15637889504432678,
"learning_rate": 5.680149810876322e-05,
"loss": 0.0022,
"step": 462
},
{
"epoch": 0.26276958002270145,
"grad_norm": 0.1934584677219391,
"learning_rate": 5.664427267851271e-05,
"loss": 0.0058,
"step": 463
},
{
"epoch": 0.2633371169125993,
"grad_norm": 0.08029989898204803,
"learning_rate": 5.6486980340510086e-05,
"loss": 0.0021,
"step": 464
},
{
"epoch": 0.26390465380249717,
"grad_norm": 0.21124523878097534,
"learning_rate": 5.6329622678687463e-05,
"loss": 0.0061,
"step": 465
},
{
"epoch": 0.264472190692395,
"grad_norm": 0.05409558117389679,
"learning_rate": 5.617220127763474e-05,
"loss": 0.0013,
"step": 466
},
{
"epoch": 0.26503972758229283,
"grad_norm": 0.5069450736045837,
"learning_rate": 5.601471772258368e-05,
"loss": 0.0084,
"step": 467
},
{
"epoch": 0.2656072644721907,
"grad_norm": 0.30205589532852173,
"learning_rate": 5.585717359939192e-05,
"loss": 0.0114,
"step": 468
},
{
"epoch": 0.26617480136208854,
"grad_norm": 0.13869255781173706,
"learning_rate": 5.569957049452703e-05,
"loss": 0.0011,
"step": 469
},
{
"epoch": 0.2667423382519864,
"grad_norm": 0.048545245081186295,
"learning_rate": 5.5541909995050554e-05,
"loss": 0.001,
"step": 470
},
{
"epoch": 0.2673098751418842,
"grad_norm": 0.01697002351284027,
"learning_rate": 5.538419368860196e-05,
"loss": 0.0006,
"step": 471
},
{
"epoch": 0.26787741203178206,
"grad_norm": 0.3081216812133789,
"learning_rate": 5.522642316338268e-05,
"loss": 0.0097,
"step": 472
},
{
"epoch": 0.2684449489216799,
"grad_norm": 0.1498224139213562,
"learning_rate": 5.506860000814017e-05,
"loss": 0.0024,
"step": 473
},
{
"epoch": 0.2690124858115778,
"grad_norm": 0.039433132857084274,
"learning_rate": 5.4910725812151864e-05,
"loss": 0.0007,
"step": 474
},
{
"epoch": 0.2695800227014756,
"grad_norm": 0.0603884682059288,
"learning_rate": 5.475280216520913e-05,
"loss": 0.0012,
"step": 475
},
{
"epoch": 0.27014755959137343,
"grad_norm": 0.8099268078804016,
"learning_rate": 5.4594830657601384e-05,
"loss": 0.0319,
"step": 476
},
{
"epoch": 0.2707150964812713,
"grad_norm": 0.23851421475410461,
"learning_rate": 5.443681288009991e-05,
"loss": 0.007,
"step": 477
},
{
"epoch": 0.27128263337116915,
"grad_norm": 0.09010318666696548,
"learning_rate": 5.427875042394199e-05,
"loss": 0.0015,
"step": 478
},
{
"epoch": 0.27185017026106695,
"grad_norm": 0.29702991247177124,
"learning_rate": 5.412064488081482e-05,
"loss": 0.0017,
"step": 479
},
{
"epoch": 0.2724177071509648,
"grad_norm": 0.46306195855140686,
"learning_rate": 5.396249784283942e-05,
"loss": 0.0063,
"step": 480
},
{
"epoch": 0.27298524404086266,
"grad_norm": 0.40036702156066895,
"learning_rate": 5.3804310902554754e-05,
"loss": 0.0078,
"step": 481
},
{
"epoch": 0.2735527809307605,
"grad_norm": 0.014721478335559368,
"learning_rate": 5.364608565290155e-05,
"loss": 0.0003,
"step": 482
},
{
"epoch": 0.2741203178206583,
"grad_norm": 0.43365278840065,
"learning_rate": 5.348782368720626e-05,
"loss": 0.0272,
"step": 483
},
{
"epoch": 0.2746878547105562,
"grad_norm": 0.8489099144935608,
"learning_rate": 5.3329526599165204e-05,
"loss": 0.0232,
"step": 484
},
{
"epoch": 0.27525539160045404,
"grad_norm": 0.22350762784481049,
"learning_rate": 5.317119598282823e-05,
"loss": 0.005,
"step": 485
},
{
"epoch": 0.2758229284903519,
"grad_norm": 0.028759444132447243,
"learning_rate": 5.301283343258293e-05,
"loss": 0.0007,
"step": 486
},
{
"epoch": 0.2763904653802497,
"grad_norm": 0.0732959434390068,
"learning_rate": 5.2854440543138406e-05,
"loss": 0.0015,
"step": 487
},
{
"epoch": 0.27695800227014755,
"grad_norm": 0.19646039605140686,
"learning_rate": 5.2696018909509306e-05,
"loss": 0.0036,
"step": 488
},
{
"epoch": 0.2775255391600454,
"grad_norm": 0.19537141919136047,
"learning_rate": 5.253757012699972e-05,
"loss": 0.0038,
"step": 489
},
{
"epoch": 0.27809307604994327,
"grad_norm": 0.18001757562160492,
"learning_rate": 5.2379095791187124e-05,
"loss": 0.0029,
"step": 490
},
{
"epoch": 0.27866061293984107,
"grad_norm": 0.05839482694864273,
"learning_rate": 5.2220597497906307e-05,
"loss": 0.0013,
"step": 491
},
{
"epoch": 0.2792281498297389,
"grad_norm": 0.0680989921092987,
"learning_rate": 5.2062076843233366e-05,
"loss": 0.0016,
"step": 492
},
{
"epoch": 0.2797956867196368,
"grad_norm": 0.03857843577861786,
"learning_rate": 5.1903535423469505e-05,
"loss": 0.0009,
"step": 493
},
{
"epoch": 0.28036322360953464,
"grad_norm": 0.2856152355670929,
"learning_rate": 5.174497483512506e-05,
"loss": 0.0027,
"step": 494
},
{
"epoch": 0.28093076049943244,
"grad_norm": 0.0,
"learning_rate": 5.158639667490339e-05,
"loss": 0.0,
"step": 495
},
{
"epoch": 0.2814982973893303,
"grad_norm": 0.0,
"learning_rate": 5.142780253968481e-05,
"loss": 0.0,
"step": 496
},
{
"epoch": 0.28206583427922816,
"grad_norm": 0.0,
"learning_rate": 5.126919402651052e-05,
"loss": 0.0,
"step": 497
},
{
"epoch": 0.282633371169126,
"grad_norm": 0.0,
"learning_rate": 5.1110572732566475e-05,
"loss": 0.0,
"step": 498
},
{
"epoch": 0.2832009080590238,
"grad_norm": 0.0,
"learning_rate": 5.095194025516733e-05,
"loss": 0.0,
"step": 499
},
{
"epoch": 0.28376844494892167,
"grad_norm": 0.0,
"learning_rate": 5.0793298191740404e-05,
"loss": 0.0,
"step": 500
},
{
"epoch": 0.28433598183881953,
"grad_norm": 3.819697856903076,
"learning_rate": 5.063464813980948e-05,
"loss": 0.271,
"step": 501
},
{
"epoch": 0.2849035187287174,
"grad_norm": 0.8054677844047546,
"learning_rate": 5.047599169697884e-05,
"loss": 0.0246,
"step": 502
},
{
"epoch": 0.2854710556186152,
"grad_norm": 0.3912200629711151,
"learning_rate": 5.03173304609171e-05,
"loss": 0.0167,
"step": 503
},
{
"epoch": 0.28603859250851305,
"grad_norm": 0.3080594539642334,
"learning_rate": 5.015866602934112e-05,
"loss": 0.0099,
"step": 504
},
{
"epoch": 0.2866061293984109,
"grad_norm": 0.6711567044258118,
"learning_rate": 5e-05,
"loss": 0.0091,
"step": 505
},
{
"epoch": 0.28717366628830876,
"grad_norm": 0.41851040720939636,
"learning_rate": 4.984133397065889e-05,
"loss": 0.0136,
"step": 506
},
{
"epoch": 0.28774120317820656,
"grad_norm": 0.15193375945091248,
"learning_rate": 4.968266953908292e-05,
"loss": 0.0029,
"step": 507
},
{
"epoch": 0.2883087400681044,
"grad_norm": 0.27360770106315613,
"learning_rate": 4.952400830302117e-05,
"loss": 0.0049,
"step": 508
},
{
"epoch": 0.2888762769580023,
"grad_norm": 0.06797119230031967,
"learning_rate": 4.9365351860190526e-05,
"loss": 0.0012,
"step": 509
},
{
"epoch": 0.28944381384790013,
"grad_norm": 0.42943570017814636,
"learning_rate": 4.92067018082596e-05,
"loss": 0.007,
"step": 510
},
{
"epoch": 0.29001135073779793,
"grad_norm": 0.41933485865592957,
"learning_rate": 4.9048059744832666e-05,
"loss": 0.024,
"step": 511
},
{
"epoch": 0.2905788876276958,
"grad_norm": 0.07279060781002045,
"learning_rate": 4.888942726743353e-05,
"loss": 0.0014,
"step": 512
},
{
"epoch": 0.29114642451759365,
"grad_norm": 0.022339163348078728,
"learning_rate": 4.8730805973489476e-05,
"loss": 0.0007,
"step": 513
},
{
"epoch": 0.2917139614074915,
"grad_norm": 0.033492498099803925,
"learning_rate": 4.85721974603152e-05,
"loss": 0.0011,
"step": 514
},
{
"epoch": 0.2922814982973893,
"grad_norm": 0.15747897326946259,
"learning_rate": 4.841360332509663e-05,
"loss": 0.0023,
"step": 515
},
{
"epoch": 0.29284903518728717,
"grad_norm": 0.4097544252872467,
"learning_rate": 4.825502516487497e-05,
"loss": 0.0324,
"step": 516
},
{
"epoch": 0.293416572077185,
"grad_norm": 0.39055225253105164,
"learning_rate": 4.8096464576530507e-05,
"loss": 0.0043,
"step": 517
},
{
"epoch": 0.2939841089670829,
"grad_norm": 0.011669347062706947,
"learning_rate": 4.7937923156766646e-05,
"loss": 0.0005,
"step": 518
},
{
"epoch": 0.2945516458569807,
"grad_norm": 0.15535907447338104,
"learning_rate": 4.77794025020937e-05,
"loss": 0.0019,
"step": 519
},
{
"epoch": 0.29511918274687854,
"grad_norm": 0.042138513177633286,
"learning_rate": 4.762090420881289e-05,
"loss": 0.0009,
"step": 520
},
{
"epoch": 0.2956867196367764,
"grad_norm": 0.24504394829273224,
"learning_rate": 4.7462429873000295e-05,
"loss": 0.0047,
"step": 521
},
{
"epoch": 0.29625425652667425,
"grad_norm": 0.08995606750249863,
"learning_rate": 4.730398109049071e-05,
"loss": 0.0018,
"step": 522
},
{
"epoch": 0.29682179341657206,
"grad_norm": 0.020904161036014557,
"learning_rate": 4.71455594568616e-05,
"loss": 0.0004,
"step": 523
},
{
"epoch": 0.2973893303064699,
"grad_norm": 0.028654640540480614,
"learning_rate": 4.698716656741708e-05,
"loss": 0.0007,
"step": 524
},
{
"epoch": 0.29795686719636777,
"grad_norm": 0.05848681926727295,
"learning_rate": 4.6828804017171776e-05,
"loss": 0.0009,
"step": 525
},
{
"epoch": 0.2985244040862656,
"grad_norm": 0.3643423020839691,
"learning_rate": 4.667047340083481e-05,
"loss": 0.0033,
"step": 526
},
{
"epoch": 0.29909194097616343,
"grad_norm": 0.12494904547929764,
"learning_rate": 4.6512176312793736e-05,
"loss": 0.0018,
"step": 527
},
{
"epoch": 0.2996594778660613,
"grad_norm": 0.03870720416307449,
"learning_rate": 4.635391434709847e-05,
"loss": 0.0008,
"step": 528
},
{
"epoch": 0.30022701475595914,
"grad_norm": 0.8066434264183044,
"learning_rate": 4.619568909744524e-05,
"loss": 0.0234,
"step": 529
},
{
"epoch": 0.300794551645857,
"grad_norm": 0.04658526927232742,
"learning_rate": 4.603750215716057e-05,
"loss": 0.0006,
"step": 530
},
{
"epoch": 0.3013620885357548,
"grad_norm": 0.362132728099823,
"learning_rate": 4.587935511918521e-05,
"loss": 0.04,
"step": 531
},
{
"epoch": 0.30192962542565266,
"grad_norm": 0.06521368026733398,
"learning_rate": 4.5721249576058027e-05,
"loss": 0.0009,
"step": 532
},
{
"epoch": 0.3024971623155505,
"grad_norm": 0.029287142679095268,
"learning_rate": 4.5563187119900104e-05,
"loss": 0.0005,
"step": 533
},
{
"epoch": 0.3030646992054484,
"grad_norm": 0.1905515044927597,
"learning_rate": 4.5405169342398634e-05,
"loss": 0.0018,
"step": 534
},
{
"epoch": 0.3036322360953462,
"grad_norm": 0.3800831735134125,
"learning_rate": 4.5247197834790876e-05,
"loss": 0.0213,
"step": 535
},
{
"epoch": 0.30419977298524403,
"grad_norm": 0.02702210657298565,
"learning_rate": 4.508927418784815e-05,
"loss": 0.0005,
"step": 536
},
{
"epoch": 0.3047673098751419,
"grad_norm": 0.021253688260912895,
"learning_rate": 4.493139999185983e-05,
"loss": 0.0004,
"step": 537
},
{
"epoch": 0.30533484676503975,
"grad_norm": 0.019332874566316605,
"learning_rate": 4.477357683661734e-05,
"loss": 0.0004,
"step": 538
},
{
"epoch": 0.30590238365493755,
"grad_norm": 0.20063234865665436,
"learning_rate": 4.461580631139805e-05,
"loss": 0.0033,
"step": 539
},
{
"epoch": 0.3064699205448354,
"grad_norm": 0.031181707978248596,
"learning_rate": 4.445809000494946e-05,
"loss": 0.0005,
"step": 540
},
{
"epoch": 0.30703745743473326,
"grad_norm": 0.6691449284553528,
"learning_rate": 4.4300429505472976e-05,
"loss": 0.0271,
"step": 541
},
{
"epoch": 0.3076049943246311,
"grad_norm": 0.009593677707016468,
"learning_rate": 4.4142826400608086e-05,
"loss": 0.0002,
"step": 542
},
{
"epoch": 0.3081725312145289,
"grad_norm": 0.009516783058643341,
"learning_rate": 4.398528227741633e-05,
"loss": 0.0003,
"step": 543
},
{
"epoch": 0.3087400681044268,
"grad_norm": 0.05990798771381378,
"learning_rate": 4.3827798722365264e-05,
"loss": 0.001,
"step": 544
},
{
"epoch": 0.30930760499432464,
"grad_norm": 0.0,
"learning_rate": 4.3670377321312535e-05,
"loss": 0.0,
"step": 545
},
{
"epoch": 0.3098751418842225,
"grad_norm": 0.0,
"learning_rate": 4.351301965948991e-05,
"loss": 0.0,
"step": 546
},
{
"epoch": 0.3104426787741203,
"grad_norm": 0.0,
"learning_rate": 4.33557273214873e-05,
"loss": 0.0,
"step": 547
},
{
"epoch": 0.31101021566401815,
"grad_norm": 0.0,
"learning_rate": 4.3198501891236804e-05,
"loss": 0.0,
"step": 548
},
{
"epoch": 0.311577752553916,
"grad_norm": 0.0,
"learning_rate": 4.3041344951996746e-05,
"loss": 0.0,
"step": 549
},
{
"epoch": 0.31214528944381387,
"grad_norm": 0.0,
"learning_rate": 4.288425808633575e-05,
"loss": 0.0,
"step": 550
},
{
"epoch": 0.31271282633371167,
"grad_norm": 0.8620555400848389,
"learning_rate": 4.272724287611684e-05,
"loss": 0.0271,
"step": 551
},
{
"epoch": 0.3132803632236095,
"grad_norm": 0.25762560963630676,
"learning_rate": 4.2570300902481426e-05,
"loss": 0.0042,
"step": 552
},
{
"epoch": 0.3138479001135074,
"grad_norm": 0.15001147985458374,
"learning_rate": 4.241343374583343e-05,
"loss": 0.0047,
"step": 553
},
{
"epoch": 0.31441543700340524,
"grad_norm": 0.2072601318359375,
"learning_rate": 4.2256642985823395e-05,
"loss": 0.0076,
"step": 554
},
{
"epoch": 0.31498297389330304,
"grad_norm": 0.19115956127643585,
"learning_rate": 4.20999302013325e-05,
"loss": 0.004,
"step": 555
},
{
"epoch": 0.3155505107832009,
"grad_norm": 0.19318291544914246,
"learning_rate": 4.19432969704568e-05,
"loss": 0.0067,
"step": 556
},
{
"epoch": 0.31611804767309876,
"grad_norm": 0.0738341435790062,
"learning_rate": 4.178674487049116e-05,
"loss": 0.0015,
"step": 557
},
{
"epoch": 0.3166855845629966,
"grad_norm": 0.3997495174407959,
"learning_rate": 4.163027547791347e-05,
"loss": 0.013,
"step": 558
},
{
"epoch": 0.3172531214528944,
"grad_norm": 0.10490912199020386,
"learning_rate": 4.147389036836881e-05,
"loss": 0.002,
"step": 559
},
{
"epoch": 0.3178206583427923,
"grad_norm": 0.3020564615726471,
"learning_rate": 4.131759111665349e-05,
"loss": 0.0184,
"step": 560
},
{
"epoch": 0.31838819523269013,
"grad_norm": 0.1874658763408661,
"learning_rate": 4.116137929669921e-05,
"loss": 0.0056,
"step": 561
},
{
"epoch": 0.318955732122588,
"grad_norm": 0.1652912199497223,
"learning_rate": 4.100525648155731e-05,
"loss": 0.0029,
"step": 562
},
{
"epoch": 0.3195232690124858,
"grad_norm": 0.2240392118692398,
"learning_rate": 4.084922424338277e-05,
"loss": 0.0084,
"step": 563
},
{
"epoch": 0.32009080590238365,
"grad_norm": 0.3925991654396057,
"learning_rate": 4.06932841534185e-05,
"loss": 0.0035,
"step": 564
},
{
"epoch": 0.3206583427922815,
"grad_norm": 0.23100757598876953,
"learning_rate": 4.0537437781979506e-05,
"loss": 0.0054,
"step": 565
},
{
"epoch": 0.32122587968217936,
"grad_norm": 0.05905711650848389,
"learning_rate": 4.038168669843697e-05,
"loss": 0.0007,
"step": 566
},
{
"epoch": 0.32179341657207716,
"grad_norm": 0.26876482367515564,
"learning_rate": 4.0226032471202604e-05,
"loss": 0.0166,
"step": 567
},
{
"epoch": 0.322360953461975,
"grad_norm": 0.5311969518661499,
"learning_rate": 4.007047666771274e-05,
"loss": 0.0041,
"step": 568
},
{
"epoch": 0.3229284903518729,
"grad_norm": 0.0217901561409235,
"learning_rate": 3.991502085441259e-05,
"loss": 0.0006,
"step": 569
},
{
"epoch": 0.32349602724177073,
"grad_norm": 0.04868500307202339,
"learning_rate": 3.9759666596740476e-05,
"loss": 0.0009,
"step": 570
},
{
"epoch": 0.32406356413166854,
"grad_norm": 0.01934129185974598,
"learning_rate": 3.960441545911204e-05,
"loss": 0.0005,
"step": 571
},
{
"epoch": 0.3246311010215664,
"grad_norm": 0.3002278208732605,
"learning_rate": 3.944926900490452e-05,
"loss": 0.006,
"step": 572
},
{
"epoch": 0.32519863791146425,
"grad_norm": 0.08359342068433762,
"learning_rate": 3.929422879644099e-05,
"loss": 0.0008,
"step": 573
},
{
"epoch": 0.3257661748013621,
"grad_norm": 0.05703277885913849,
"learning_rate": 3.913929639497462e-05,
"loss": 0.0006,
"step": 574
},
{
"epoch": 0.3263337116912599,
"grad_norm": 0.1300325244665146,
"learning_rate": 3.898447336067297e-05,
"loss": 0.0007,
"step": 575
},
{
"epoch": 0.32690124858115777,
"grad_norm": 0.2558203935623169,
"learning_rate": 3.882976125260229e-05,
"loss": 0.0116,
"step": 576
},
{
"epoch": 0.3274687854710556,
"grad_norm": 0.12041133642196655,
"learning_rate": 3.8675161628711776e-05,
"loss": 0.0024,
"step": 577
},
{
"epoch": 0.3280363223609535,
"grad_norm": 0.01596921868622303,
"learning_rate": 3.852067604581794e-05,
"loss": 0.0005,
"step": 578
},
{
"epoch": 0.3286038592508513,
"grad_norm": 0.35162538290023804,
"learning_rate": 3.836630605958888e-05,
"loss": 0.017,
"step": 579
},
{
"epoch": 0.32917139614074914,
"grad_norm": 0.43673884868621826,
"learning_rate": 3.821205322452863e-05,
"loss": 0.0116,
"step": 580
},
{
"epoch": 0.329738933030647,
"grad_norm": 0.9133800268173218,
"learning_rate": 3.8057919093961553e-05,
"loss": 0.0122,
"step": 581
},
{
"epoch": 0.33030646992054485,
"grad_norm": 0.024000134319067,
"learning_rate": 3.790390522001662e-05,
"loss": 0.0007,
"step": 582
},
{
"epoch": 0.33087400681044266,
"grad_norm": 0.934701681137085,
"learning_rate": 3.775001315361183e-05,
"loss": 0.0058,
"step": 583
},
{
"epoch": 0.3314415437003405,
"grad_norm": 0.904114842414856,
"learning_rate": 3.759624444443858e-05,
"loss": 0.036,
"step": 584
},
{
"epoch": 0.33200908059023837,
"grad_norm": 0.07346749305725098,
"learning_rate": 3.744260064094604e-05,
"loss": 0.0009,
"step": 585
},
{
"epoch": 0.3325766174801362,
"grad_norm": 0.03927430883049965,
"learning_rate": 3.728908329032567e-05,
"loss": 0.0008,
"step": 586
},
{
"epoch": 0.33314415437003403,
"grad_norm": 0.42941439151763916,
"learning_rate": 3.713569393849543e-05,
"loss": 0.015,
"step": 587
},
{
"epoch": 0.3337116912599319,
"grad_norm": 0.03521761670708656,
"learning_rate": 3.69824341300844e-05,
"loss": 0.0008,
"step": 588
},
{
"epoch": 0.33427922814982974,
"grad_norm": 0.019192036241292953,
"learning_rate": 3.6829305408417166e-05,
"loss": 0.0005,
"step": 589
},
{
"epoch": 0.3348467650397276,
"grad_norm": 0.07493746280670166,
"learning_rate": 3.6676309315498256e-05,
"loss": 0.0011,
"step": 590
},
{
"epoch": 0.3354143019296254,
"grad_norm": 0.43895918130874634,
"learning_rate": 3.6523447391996614e-05,
"loss": 0.0059,
"step": 591
},
{
"epoch": 0.33598183881952326,
"grad_norm": 0.6594648361206055,
"learning_rate": 3.6370721177230116e-05,
"loss": 0.0177,
"step": 592
},
{
"epoch": 0.3365493757094211,
"grad_norm": 0.12148375064134598,
"learning_rate": 3.6218132209150045e-05,
"loss": 0.0016,
"step": 593
},
{
"epoch": 0.337116912599319,
"grad_norm": 0.033791981637477875,
"learning_rate": 3.606568202432562e-05,
"loss": 0.0006,
"step": 594
},
{
"epoch": 0.3376844494892168,
"grad_norm": 0.0,
"learning_rate": 3.591337215792852e-05,
"loss": 0.0,
"step": 595
},
{
"epoch": 0.33825198637911463,
"grad_norm": 0.12862201035022736,
"learning_rate": 3.5761204143717385e-05,
"loss": 0.0021,
"step": 596
},
{
"epoch": 0.3388195232690125,
"grad_norm": 0.05550703406333923,
"learning_rate": 3.560917951402245e-05,
"loss": 0.0005,
"step": 597
},
{
"epoch": 0.33938706015891035,
"grad_norm": 0.0,
"learning_rate": 3.545729979973005e-05,
"loss": 0.0,
"step": 598
},
{
"epoch": 0.33995459704880815,
"grad_norm": 0.0,
"learning_rate": 3.530556653026721e-05,
"loss": 0.0,
"step": 599
},
{
"epoch": 0.340522133938706,
"grad_norm": 0.0,
"learning_rate": 3.515398123358627e-05,
"loss": 0.0,
"step": 600
},
{
"epoch": 0.340522133938706,
"eval_loss": NaN,
"eval_runtime": 107.34,
"eval_samples_per_second": 27.65,
"eval_steps_per_second": 6.913,
"step": 600
},
{
"epoch": 0.34108967082860386,
"grad_norm": 0.9571943283081055,
"learning_rate": 3.5002545436149474e-05,
"loss": 0.0473,
"step": 601
},
{
"epoch": 0.3416572077185017,
"grad_norm": 0.480672150850296,
"learning_rate": 3.485126066291364e-05,
"loss": 0.0109,
"step": 602
},
{
"epoch": 0.3422247446083995,
"grad_norm": 0.6047912836074829,
"learning_rate": 3.470012843731476e-05,
"loss": 0.0188,
"step": 603
},
{
"epoch": 0.3427922814982974,
"grad_norm": 0.02252427488565445,
"learning_rate": 3.4549150281252636e-05,
"loss": 0.0008,
"step": 604
},
{
"epoch": 0.34335981838819524,
"grad_norm": 0.1912216693162918,
"learning_rate": 3.439832771507565e-05,
"loss": 0.0036,
"step": 605
},
{
"epoch": 0.3439273552780931,
"grad_norm": 0.18382948637008667,
"learning_rate": 3.424766225756537e-05,
"loss": 0.0077,
"step": 606
},
{
"epoch": 0.3444948921679909,
"grad_norm": 0.10974773019552231,
"learning_rate": 3.4097155425921254e-05,
"loss": 0.0018,
"step": 607
},
{
"epoch": 0.34506242905788875,
"grad_norm": 0.32770535349845886,
"learning_rate": 3.394680873574546e-05,
"loss": 0.0061,
"step": 608
},
{
"epoch": 0.3456299659477866,
"grad_norm": 0.389616996049881,
"learning_rate": 3.3796623701027476e-05,
"loss": 0.0097,
"step": 609
},
{
"epoch": 0.34619750283768447,
"grad_norm": 0.11117340624332428,
"learning_rate": 3.364660183412892e-05,
"loss": 0.0024,
"step": 610
},
{
"epoch": 0.34676503972758227,
"grad_norm": 0.11243616044521332,
"learning_rate": 3.349674464576834e-05,
"loss": 0.0022,
"step": 611
},
{
"epoch": 0.3473325766174801,
"grad_norm": 0.05497328191995621,
"learning_rate": 3.334705364500596e-05,
"loss": 0.0012,
"step": 612
},
{
"epoch": 0.347900113507378,
"grad_norm": 0.03620595484972,
"learning_rate": 3.3197530339228487e-05,
"loss": 0.0011,
"step": 613
},
{
"epoch": 0.34846765039727584,
"grad_norm": 0.029850907623767853,
"learning_rate": 3.304817623413397e-05,
"loss": 0.0007,
"step": 614
},
{
"epoch": 0.34903518728717364,
"grad_norm": 0.4595206677913666,
"learning_rate": 3.289899283371657e-05,
"loss": 0.0065,
"step": 615
},
{
"epoch": 0.3496027241770715,
"grad_norm": 0.3553248941898346,
"learning_rate": 3.274998164025148e-05,
"loss": 0.006,
"step": 616
},
{
"epoch": 0.35017026106696936,
"grad_norm": 0.06678071618080139,
"learning_rate": 3.260114415427975e-05,
"loss": 0.0012,
"step": 617
},
{
"epoch": 0.3507377979568672,
"grad_norm": 0.08883315324783325,
"learning_rate": 3.2452481874593234e-05,
"loss": 0.0022,
"step": 618
},
{
"epoch": 0.351305334846765,
"grad_norm": 0.04673172906041145,
"learning_rate": 3.230399629821942e-05,
"loss": 0.001,
"step": 619
},
{
"epoch": 0.3518728717366629,
"grad_norm": 0.03680291026830673,
"learning_rate": 3.215568892040641e-05,
"loss": 0.0009,
"step": 620
},
{
"epoch": 0.35244040862656073,
"grad_norm": 0.08393888175487518,
"learning_rate": 3.200756123460788e-05,
"loss": 0.0013,
"step": 621
},
{
"epoch": 0.3530079455164586,
"grad_norm": 0.3266814649105072,
"learning_rate": 3.1859614732467954e-05,
"loss": 0.0146,
"step": 622
},
{
"epoch": 0.3535754824063564,
"grad_norm": 0.38569343090057373,
"learning_rate": 3.171185090380628e-05,
"loss": 0.0099,
"step": 623
},
{
"epoch": 0.35414301929625425,
"grad_norm": 0.02289619669318199,
"learning_rate": 3.156427123660297e-05,
"loss": 0.0005,
"step": 624
},
{
"epoch": 0.3547105561861521,
"grad_norm": 0.3161522448062897,
"learning_rate": 3.141687721698363e-05,
"loss": 0.0036,
"step": 625
},
{
"epoch": 0.35527809307604996,
"grad_norm": 0.30018478631973267,
"learning_rate": 3.12696703292044e-05,
"loss": 0.0065,
"step": 626
},
{
"epoch": 0.35584562996594776,
"grad_norm": 0.41302579641342163,
"learning_rate": 3.1122652055637015e-05,
"loss": 0.0059,
"step": 627
},
{
"epoch": 0.3564131668558456,
"grad_norm": 0.5455114245414734,
"learning_rate": 3.097582387675385e-05,
"loss": 0.0085,
"step": 628
},
{
"epoch": 0.3569807037457435,
"grad_norm": 0.028173979371786118,
"learning_rate": 3.082918727111304e-05,
"loss": 0.0006,
"step": 629
},
{
"epoch": 0.35754824063564133,
"grad_norm": 0.035780180245637894,
"learning_rate": 3.0682743715343564e-05,
"loss": 0.0005,
"step": 630
},
{
"epoch": 0.35811577752553914,
"grad_norm": 0.031802088022232056,
"learning_rate": 3.053649468413043e-05,
"loss": 0.0008,
"step": 631
},
{
"epoch": 0.358683314415437,
"grad_norm": 0.5345308184623718,
"learning_rate": 3.0390441650199724e-05,
"loss": 0.0111,
"step": 632
},
{
"epoch": 0.35925085130533485,
"grad_norm": 0.2168167382478714,
"learning_rate": 3.0244586084303905e-05,
"loss": 0.0022,
"step": 633
},
{
"epoch": 0.3598183881952327,
"grad_norm": 0.36197105050086975,
"learning_rate": 3.0098929455206904e-05,
"loss": 0.003,
"step": 634
},
{
"epoch": 0.3603859250851305,
"grad_norm": 0.31182852387428284,
"learning_rate": 2.9953473229669328e-05,
"loss": 0.0045,
"step": 635
},
{
"epoch": 0.36095346197502837,
"grad_norm": 0.013852439820766449,
"learning_rate": 2.9808218872433767e-05,
"loss": 0.0003,
"step": 636
},
{
"epoch": 0.3615209988649262,
"grad_norm": 0.09768744558095932,
"learning_rate": 2.9663167846209998e-05,
"loss": 0.0016,
"step": 637
},
{
"epoch": 0.3620885357548241,
"grad_norm": 0.8393372297286987,
"learning_rate": 2.9518321611660237e-05,
"loss": 0.0051,
"step": 638
},
{
"epoch": 0.3626560726447219,
"grad_norm": 0.03129115700721741,
"learning_rate": 2.9373681627384447e-05,
"loss": 0.0006,
"step": 639
},
{
"epoch": 0.36322360953461974,
"grad_norm": 0.20783281326293945,
"learning_rate": 2.9229249349905684e-05,
"loss": 0.0013,
"step": 640
},
{
"epoch": 0.3637911464245176,
"grad_norm": 0.42036134004592896,
"learning_rate": 2.9085026233655365e-05,
"loss": 0.0089,
"step": 641
},
{
"epoch": 0.36435868331441545,
"grad_norm": 0.18196117877960205,
"learning_rate": 2.894101373095867e-05,
"loss": 0.002,
"step": 642
},
{
"epoch": 0.36492622020431326,
"grad_norm": 0.008144269697368145,
"learning_rate": 2.8797213292019926e-05,
"loss": 0.0002,
"step": 643
},
{
"epoch": 0.3654937570942111,
"grad_norm": 4.53141450881958,
"learning_rate": 2.8653626364907917e-05,
"loss": 0.0394,
"step": 644
},
{
"epoch": 0.36606129398410897,
"grad_norm": 0.011452808044850826,
"learning_rate": 2.851025439554142e-05,
"loss": 0.0002,
"step": 645
},
{
"epoch": 0.36662883087400683,
"grad_norm": 0.0,
"learning_rate": 2.8367098827674578e-05,
"loss": 0.0,
"step": 646
},
{
"epoch": 0.36719636776390463,
"grad_norm": 0.0,
"learning_rate": 2.8224161102882397e-05,
"loss": 0.0,
"step": 647
},
{
"epoch": 0.3677639046538025,
"grad_norm": 0.11076200008392334,
"learning_rate": 2.8081442660546125e-05,
"loss": 0.0003,
"step": 648
},
{
"epoch": 0.36833144154370034,
"grad_norm": 0.0,
"learning_rate": 2.7938944937838923e-05,
"loss": 0.0,
"step": 649
},
{
"epoch": 0.3688989784335982,
"grad_norm": 0.0,
"learning_rate": 2.7796669369711294e-05,
"loss": 0.0,
"step": 650
},
{
"epoch": 0.369466515323496,
"grad_norm": 0.6546966433525085,
"learning_rate": 2.7654617388876615e-05,
"loss": 0.0211,
"step": 651
},
{
"epoch": 0.37003405221339386,
"grad_norm": 0.1961146891117096,
"learning_rate": 2.7512790425796718e-05,
"loss": 0.004,
"step": 652
},
{
"epoch": 0.3706015891032917,
"grad_norm": 0.7561377882957458,
"learning_rate": 2.7371189908667604e-05,
"loss": 0.0221,
"step": 653
},
{
"epoch": 0.3711691259931896,
"grad_norm": 0.2948407530784607,
"learning_rate": 2.7229817263404866e-05,
"loss": 0.0067,
"step": 654
},
{
"epoch": 0.3717366628830874,
"grad_norm": 0.0565456859767437,
"learning_rate": 2.708867391362948e-05,
"loss": 0.0011,
"step": 655
},
{
"epoch": 0.37230419977298523,
"grad_norm": 0.13881909847259521,
"learning_rate": 2.694776128065345e-05,
"loss": 0.0015,
"step": 656
},
{
"epoch": 0.3728717366628831,
"grad_norm": 0.3793433904647827,
"learning_rate": 2.6807080783465376e-05,
"loss": 0.0015,
"step": 657
},
{
"epoch": 0.37343927355278095,
"grad_norm": 0.03849627077579498,
"learning_rate": 2.6666633838716314e-05,
"loss": 0.0009,
"step": 658
},
{
"epoch": 0.37400681044267875,
"grad_norm": 0.5067241787910461,
"learning_rate": 2.6526421860705473e-05,
"loss": 0.0078,
"step": 659
},
{
"epoch": 0.3745743473325766,
"grad_norm": 0.2140672206878662,
"learning_rate": 2.638644626136587e-05,
"loss": 0.0036,
"step": 660
},
{
"epoch": 0.37514188422247446,
"grad_norm": 0.051408469676971436,
"learning_rate": 2.6246708450250256e-05,
"loss": 0.0011,
"step": 661
},
{
"epoch": 0.3757094211123723,
"grad_norm": 0.3721614480018616,
"learning_rate": 2.6107209834516854e-05,
"loss": 0.0084,
"step": 662
},
{
"epoch": 0.3762769580022701,
"grad_norm": 0.21189674735069275,
"learning_rate": 2.596795181891514e-05,
"loss": 0.0024,
"step": 663
},
{
"epoch": 0.376844494892168,
"grad_norm": 0.46903058886528015,
"learning_rate": 2.5828935805771802e-05,
"loss": 0.0079,
"step": 664
},
{
"epoch": 0.37741203178206584,
"grad_norm": 0.03936934471130371,
"learning_rate": 2.5690163194976575e-05,
"loss": 0.0007,
"step": 665
},
{
"epoch": 0.3779795686719637,
"grad_norm": 0.034673310816287994,
"learning_rate": 2.5551635383968065e-05,
"loss": 0.0006,
"step": 666
},
{
"epoch": 0.3785471055618615,
"grad_norm": 0.01133895106613636,
"learning_rate": 2.5413353767719805e-05,
"loss": 0.0004,
"step": 667
},
{
"epoch": 0.37911464245175935,
"grad_norm": 0.48205289244651794,
"learning_rate": 2.5275319738726165e-05,
"loss": 0.0069,
"step": 668
},
{
"epoch": 0.3796821793416572,
"grad_norm": 0.24426761269569397,
"learning_rate": 2.513753468698826e-05,
"loss": 0.0019,
"step": 669
},
{
"epoch": 0.38024971623155507,
"grad_norm": 0.03907699137926102,
"learning_rate": 2.500000000000001e-05,
"loss": 0.0003,
"step": 670
},
{
"epoch": 0.38081725312145287,
"grad_norm": 0.2291565239429474,
"learning_rate": 2.486271706273421e-05,
"loss": 0.0106,
"step": 671
},
{
"epoch": 0.3813847900113507,
"grad_norm": 0.019995173439383507,
"learning_rate": 2.4725687257628534e-05,
"loss": 0.0004,
"step": 672
},
{
"epoch": 0.3819523269012486,
"grad_norm": 0.10260719805955887,
"learning_rate": 2.4588911964571553e-05,
"loss": 0.0011,
"step": 673
},
{
"epoch": 0.38251986379114644,
"grad_norm": 0.024873068556189537,
"learning_rate": 2.4452392560888976e-05,
"loss": 0.0004,
"step": 674
},
{
"epoch": 0.38308740068104424,
"grad_norm": 0.5002231001853943,
"learning_rate": 2.4316130421329697e-05,
"loss": 0.0108,
"step": 675
},
{
"epoch": 0.3836549375709421,
"grad_norm": 0.037127118557691574,
"learning_rate": 2.418012691805191e-05,
"loss": 0.0005,
"step": 676
},
{
"epoch": 0.38422247446083996,
"grad_norm": 0.2599027752876282,
"learning_rate": 2.4044383420609406e-05,
"loss": 0.0083,
"step": 677
},
{
"epoch": 0.3847900113507378,
"grad_norm": 0.018559547141194344,
"learning_rate": 2.3908901295937713e-05,
"loss": 0.0004,
"step": 678
},
{
"epoch": 0.3853575482406356,
"grad_norm": 0.13668963313102722,
"learning_rate": 2.3773681908340284e-05,
"loss": 0.0018,
"step": 679
},
{
"epoch": 0.3859250851305335,
"grad_norm": 0.21998494863510132,
"learning_rate": 2.363872661947488e-05,
"loss": 0.0011,
"step": 680
},
{
"epoch": 0.38649262202043133,
"grad_norm": 0.12420105934143066,
"learning_rate": 2.350403678833976e-05,
"loss": 0.0014,
"step": 681
},
{
"epoch": 0.3870601589103292,
"grad_norm": 0.2006537914276123,
"learning_rate": 2.336961377126001e-05,
"loss": 0.0045,
"step": 682
},
{
"epoch": 0.387627695800227,
"grad_norm": 0.09202957153320312,
"learning_rate": 2.3235458921873925e-05,
"loss": 0.0011,
"step": 683
},
{
"epoch": 0.38819523269012485,
"grad_norm": 0.1019575372338295,
"learning_rate": 2.310157359111938e-05,
"loss": 0.0009,
"step": 684
},
{
"epoch": 0.3887627695800227,
"grad_norm": 0.01464917603880167,
"learning_rate": 2.296795912722014e-05,
"loss": 0.0003,
"step": 685
},
{
"epoch": 0.38933030646992056,
"grad_norm": 0.024914277717471123,
"learning_rate": 2.283461687567236e-05,
"loss": 0.0004,
"step": 686
},
{
"epoch": 0.38989784335981836,
"grad_norm": 0.03489803895354271,
"learning_rate": 2.2701548179231048e-05,
"loss": 0.0005,
"step": 687
},
{
"epoch": 0.3904653802497162,
"grad_norm": 0.0056303925812244415,
"learning_rate": 2.2568754377896516e-05,
"loss": 0.0002,
"step": 688
},
{
"epoch": 0.3910329171396141,
"grad_norm": 1.0861475467681885,
"learning_rate": 2.2436236808900844e-05,
"loss": 0.0469,
"step": 689
},
{
"epoch": 0.39160045402951194,
"grad_norm": 0.7234563231468201,
"learning_rate": 2.2303996806694488e-05,
"loss": 0.0229,
"step": 690
},
{
"epoch": 0.39216799091940974,
"grad_norm": 0.01756235770881176,
"learning_rate": 2.2172035702932825e-05,
"loss": 0.0004,
"step": 691
},
{
"epoch": 0.3927355278093076,
"grad_norm": 0.007982458919286728,
"learning_rate": 2.2040354826462668e-05,
"loss": 0.0002,
"step": 692
},
{
"epoch": 0.39330306469920545,
"grad_norm": 0.004965408705174923,
"learning_rate": 2.1908955503308993e-05,
"loss": 0.0001,
"step": 693
},
{
"epoch": 0.3938706015891033,
"grad_norm": 1.1320465803146362,
"learning_rate": 2.1777839056661554e-05,
"loss": 0.0237,
"step": 694
},
{
"epoch": 0.3944381384790011,
"grad_norm": 0.0,
"learning_rate": 2.164700680686147e-05,
"loss": 0.0,
"step": 695
},
{
"epoch": 0.39500567536889897,
"grad_norm": 0.05166594311594963,
"learning_rate": 2.1516460071388062e-05,
"loss": 0.0004,
"step": 696
},
{
"epoch": 0.3955732122587968,
"grad_norm": 0.013484718278050423,
"learning_rate": 2.1386200164845526e-05,
"loss": 0.0002,
"step": 697
},
{
"epoch": 0.3961407491486947,
"grad_norm": 26.556129455566406,
"learning_rate": 2.125622839894964e-05,
"loss": 0.4464,
"step": 698
},
{
"epoch": 0.3967082860385925,
"grad_norm": 0.0,
"learning_rate": 2.1126546082514664e-05,
"loss": 0.0,
"step": 699
},
{
"epoch": 0.39727582292849034,
"grad_norm": 0.0,
"learning_rate": 2.09971545214401e-05,
"loss": 0.0,
"step": 700
},
{
"epoch": 0.3978433598183882,
"grad_norm": 1.003913402557373,
"learning_rate": 2.086805501869749e-05,
"loss": 0.0426,
"step": 701
},
{
"epoch": 0.39841089670828606,
"grad_norm": 0.2675124406814575,
"learning_rate": 2.073924887431744e-05,
"loss": 0.0221,
"step": 702
},
{
"epoch": 0.39897843359818386,
"grad_norm": 0.4695562720298767,
"learning_rate": 2.061073738537635e-05,
"loss": 0.0276,
"step": 703
},
{
"epoch": 0.3995459704880817,
"grad_norm": 0.8297376036643982,
"learning_rate": 2.048252184598352e-05,
"loss": 0.0142,
"step": 704
},
{
"epoch": 0.40011350737797957,
"grad_norm": 0.12075504660606384,
"learning_rate": 2.0354603547267985e-05,
"loss": 0.0023,
"step": 705
},
{
"epoch": 0.40068104426787743,
"grad_norm": 0.36436015367507935,
"learning_rate": 2.0226983777365604e-05,
"loss": 0.0067,
"step": 706
},
{
"epoch": 0.40124858115777523,
"grad_norm": 0.21536526083946228,
"learning_rate": 2.0099663821406056e-05,
"loss": 0.0066,
"step": 707
},
{
"epoch": 0.4018161180476731,
"grad_norm": 0.1440303772687912,
"learning_rate": 1.9972644961499854e-05,
"loss": 0.0027,
"step": 708
},
{
"epoch": 0.40238365493757094,
"grad_norm": 0.31195855140686035,
"learning_rate": 1.9845928476725524e-05,
"loss": 0.0039,
"step": 709
},
{
"epoch": 0.4029511918274688,
"grad_norm": 0.19405895471572876,
"learning_rate": 1.9719515643116674e-05,
"loss": 0.0015,
"step": 710
},
{
"epoch": 0.4035187287173666,
"grad_norm": 0.118756964802742,
"learning_rate": 1.959340773364911e-05,
"loss": 0.0014,
"step": 711
},
{
"epoch": 0.40408626560726446,
"grad_norm": 0.651408851146698,
"learning_rate": 1.946760601822809e-05,
"loss": 0.0064,
"step": 712
},
{
"epoch": 0.4046538024971623,
"grad_norm": 0.09903181344270706,
"learning_rate": 1.9342111763675512e-05,
"loss": 0.0012,
"step": 713
},
{
"epoch": 0.4052213393870602,
"grad_norm": 0.09159818291664124,
"learning_rate": 1.9216926233717085e-05,
"loss": 0.0012,
"step": 714
},
{
"epoch": 0.405788876276958,
"grad_norm": 0.4659949839115143,
"learning_rate": 1.9092050688969738e-05,
"loss": 0.0086,
"step": 715
},
{
"epoch": 0.40635641316685583,
"grad_norm": 0.0276198647916317,
"learning_rate": 1.8967486386928817e-05,
"loss": 0.0006,
"step": 716
},
{
"epoch": 0.4069239500567537,
"grad_norm": 0.11385304480791092,
"learning_rate": 1.8843234581955442e-05,
"loss": 0.0015,
"step": 717
},
{
"epoch": 0.40749148694665155,
"grad_norm": 0.30209067463874817,
"learning_rate": 1.8719296525263922e-05,
"loss": 0.0127,
"step": 718
},
{
"epoch": 0.40805902383654935,
"grad_norm": 0.24259290099143982,
"learning_rate": 1.859567346490913e-05,
"loss": 0.0012,
"step": 719
},
{
"epoch": 0.4086265607264472,
"grad_norm": 0.2819889783859253,
"learning_rate": 1.847236664577389e-05,
"loss": 0.0059,
"step": 720
},
{
"epoch": 0.40919409761634506,
"grad_norm": 0.5837430953979492,
"learning_rate": 1.8349377309556486e-05,
"loss": 0.0041,
"step": 721
},
{
"epoch": 0.4097616345062429,
"grad_norm": 0.05538428574800491,
"learning_rate": 1.8226706694758195e-05,
"loss": 0.001,
"step": 722
},
{
"epoch": 0.4103291713961407,
"grad_norm": 0.08718933165073395,
"learning_rate": 1.810435603667075e-05,
"loss": 0.0009,
"step": 723
},
{
"epoch": 0.4108967082860386,
"grad_norm": 0.09461364895105362,
"learning_rate": 1.7982326567363888e-05,
"loss": 0.0017,
"step": 724
},
{
"epoch": 0.41146424517593644,
"grad_norm": 0.43470796942710876,
"learning_rate": 1.7860619515673033e-05,
"loss": 0.0176,
"step": 725
},
{
"epoch": 0.4120317820658343,
"grad_norm": 0.04546342045068741,
"learning_rate": 1.773923610718686e-05,
"loss": 0.0009,
"step": 726
},
{
"epoch": 0.4125993189557321,
"grad_norm": 0.014890948310494423,
"learning_rate": 1.7618177564234905e-05,
"loss": 0.0003,
"step": 727
},
{
"epoch": 0.41316685584562995,
"grad_norm": 0.030182119458913803,
"learning_rate": 1.7497445105875377e-05,
"loss": 0.0005,
"step": 728
},
{
"epoch": 0.4137343927355278,
"grad_norm": 0.05278665944933891,
"learning_rate": 1.73770399478828e-05,
"loss": 0.0008,
"step": 729
},
{
"epoch": 0.41430192962542567,
"grad_norm": 0.4548901915550232,
"learning_rate": 1.725696330273575e-05,
"loss": 0.0118,
"step": 730
},
{
"epoch": 0.41486946651532347,
"grad_norm": 0.33028581738471985,
"learning_rate": 1.7137216379604727e-05,
"loss": 0.0071,
"step": 731
},
{
"epoch": 0.41543700340522133,
"grad_norm": 0.0861010029911995,
"learning_rate": 1.7017800384339928e-05,
"loss": 0.001,
"step": 732
},
{
"epoch": 0.4160045402951192,
"grad_norm": 0.438575804233551,
"learning_rate": 1.6898716519459074e-05,
"loss": 0.0071,
"step": 733
},
{
"epoch": 0.41657207718501704,
"grad_norm": 0.05357427895069122,
"learning_rate": 1.6779965984135377e-05,
"loss": 0.0006,
"step": 734
},
{
"epoch": 0.41713961407491484,
"grad_norm": 0.022530531510710716,
"learning_rate": 1.6661549974185424e-05,
"loss": 0.0003,
"step": 735
},
{
"epoch": 0.4177071509648127,
"grad_norm": 0.4733809232711792,
"learning_rate": 1.6543469682057106e-05,
"loss": 0.0127,
"step": 736
},
{
"epoch": 0.41827468785471056,
"grad_norm": 0.12539038062095642,
"learning_rate": 1.6425726296817633e-05,
"loss": 0.0014,
"step": 737
},
{
"epoch": 0.4188422247446084,
"grad_norm": 0.4548875689506531,
"learning_rate": 1.6308321004141607e-05,
"loss": 0.0076,
"step": 738
},
{
"epoch": 0.4194097616345062,
"grad_norm": 0.0097389817237854,
"learning_rate": 1.619125498629904e-05,
"loss": 0.0002,
"step": 739
},
{
"epoch": 0.4199772985244041,
"grad_norm": 0.019004186615347862,
"learning_rate": 1.60745294221434e-05,
"loss": 0.0004,
"step": 740
},
{
"epoch": 0.42054483541430193,
"grad_norm": 0.03138939291238785,
"learning_rate": 1.595814548709983e-05,
"loss": 0.0004,
"step": 741
},
{
"epoch": 0.4211123723041998,
"grad_norm": 0.5367324948310852,
"learning_rate": 1.5842104353153287e-05,
"loss": 0.0249,
"step": 742
},
{
"epoch": 0.4216799091940976,
"grad_norm": 1.0344882011413574,
"learning_rate": 1.5726407188836673e-05,
"loss": 0.0335,
"step": 743
},
{
"epoch": 0.42224744608399545,
"grad_norm": 0.0143516156822443,
"learning_rate": 1.5611055159219152e-05,
"loss": 0.0003,
"step": 744
},
{
"epoch": 0.4228149829738933,
"grad_norm": 31.227991104125977,
"learning_rate": 1.549604942589441e-05,
"loss": 1.1122,
"step": 745
},
{
"epoch": 0.42338251986379116,
"grad_norm": 0.10639174282550812,
"learning_rate": 1.5381391146968866e-05,
"loss": 0.0009,
"step": 746
},
{
"epoch": 0.42395005675368896,
"grad_norm": 0.05225389450788498,
"learning_rate": 1.526708147705013e-05,
"loss": 0.0003,
"step": 747
},
{
"epoch": 0.4245175936435868,
"grad_norm": 0.0,
"learning_rate": 1.5153121567235335e-05,
"loss": 0.0,
"step": 748
},
{
"epoch": 0.4250851305334847,
"grad_norm": 0.0,
"learning_rate": 1.5039512565099467e-05,
"loss": 0.0,
"step": 749
},
{
"epoch": 0.42565266742338254,
"grad_norm": 0.0,
"learning_rate": 1.4926255614683932e-05,
"loss": 0.0,
"step": 750
},
{
"epoch": 0.42622020431328034,
"grad_norm": 0.5881960988044739,
"learning_rate": 1.481335185648498e-05,
"loss": 0.0209,
"step": 751
},
{
"epoch": 0.4267877412031782,
"grad_norm": 0.46018141508102417,
"learning_rate": 1.4700802427442179e-05,
"loss": 0.009,
"step": 752
},
{
"epoch": 0.42735527809307605,
"grad_norm": 0.40768754482269287,
"learning_rate": 1.458860846092705e-05,
"loss": 0.0032,
"step": 753
},
{
"epoch": 0.4279228149829739,
"grad_norm": 0.2659337520599365,
"learning_rate": 1.4476771086731567e-05,
"loss": 0.0061,
"step": 754
},
{
"epoch": 0.4284903518728717,
"grad_norm": 0.10703348368406296,
"learning_rate": 1.4365291431056871e-05,
"loss": 0.0017,
"step": 755
},
{
"epoch": 0.42905788876276957,
"grad_norm": 0.3611052930355072,
"learning_rate": 1.4254170616501827e-05,
"loss": 0.0034,
"step": 756
},
{
"epoch": 0.4296254256526674,
"grad_norm": 0.05121847242116928,
"learning_rate": 1.414340976205183e-05,
"loss": 0.001,
"step": 757
},
{
"epoch": 0.4301929625425653,
"grad_norm": 0.31806862354278564,
"learning_rate": 1.4033009983067452e-05,
"loss": 0.0059,
"step": 758
},
{
"epoch": 0.4307604994324631,
"grad_norm": 0.05238351970911026,
"learning_rate": 1.3922972391273226e-05,
"loss": 0.0011,
"step": 759
},
{
"epoch": 0.43132803632236094,
"grad_norm": 0.17556647956371307,
"learning_rate": 1.3813298094746491e-05,
"loss": 0.0029,
"step": 760
},
{
"epoch": 0.4318955732122588,
"grad_norm": 0.1977948248386383,
"learning_rate": 1.3703988197906209e-05,
"loss": 0.0043,
"step": 761
},
{
"epoch": 0.43246311010215666,
"grad_norm": 0.058601368218660355,
"learning_rate": 1.3595043801501794e-05,
"loss": 0.0013,
"step": 762
},
{
"epoch": 0.43303064699205446,
"grad_norm": 0.2709505558013916,
"learning_rate": 1.3486466002602133e-05,
"loss": 0.0022,
"step": 763
},
{
"epoch": 0.4335981838819523,
"grad_norm": 0.04408566281199455,
"learning_rate": 1.3378255894584463e-05,
"loss": 0.0008,
"step": 764
},
{
"epoch": 0.43416572077185017,
"grad_norm": 0.034973569214344025,
"learning_rate": 1.327041456712334e-05,
"loss": 0.0006,
"step": 765
},
{
"epoch": 0.43473325766174803,
"grad_norm": 0.32786574959754944,
"learning_rate": 1.3162943106179749e-05,
"loss": 0.018,
"step": 766
},
{
"epoch": 0.43530079455164583,
"grad_norm": 0.05485441908240318,
"learning_rate": 1.3055842593990131e-05,
"loss": 0.0005,
"step": 767
},
{
"epoch": 0.4358683314415437,
"grad_norm": 0.07898583263158798,
"learning_rate": 1.2949114109055415e-05,
"loss": 0.0013,
"step": 768
},
{
"epoch": 0.43643586833144155,
"grad_norm": 0.03237922489643097,
"learning_rate": 1.2842758726130283e-05,
"loss": 0.0007,
"step": 769
},
{
"epoch": 0.4370034052213394,
"grad_norm": 0.05610362067818642,
"learning_rate": 1.2736777516212266e-05,
"loss": 0.0008,
"step": 770
},
{
"epoch": 0.4375709421112372,
"grad_norm": 0.24216708540916443,
"learning_rate": 1.2631171546530968e-05,
"loss": 0.0037,
"step": 771
},
{
"epoch": 0.43813847900113506,
"grad_norm": 0.07961627095937729,
"learning_rate": 1.2525941880537307e-05,
"loss": 0.0013,
"step": 772
},
{
"epoch": 0.4387060158910329,
"grad_norm": 0.11050142347812653,
"learning_rate": 1.2421089577892869e-05,
"loss": 0.0012,
"step": 773
},
{
"epoch": 0.4392735527809308,
"grad_norm": 0.04272003099322319,
"learning_rate": 1.2316615694459189e-05,
"loss": 0.0006,
"step": 774
},
{
"epoch": 0.4398410896708286,
"grad_norm": 0.02150142751634121,
"learning_rate": 1.2212521282287092e-05,
"loss": 0.0004,
"step": 775
},
{
"epoch": 0.44040862656072643,
"grad_norm": 0.08354512602090836,
"learning_rate": 1.2108807389606158e-05,
"loss": 0.001,
"step": 776
},
{
"epoch": 0.4409761634506243,
"grad_norm": 0.1265098601579666,
"learning_rate": 1.2005475060814159e-05,
"loss": 0.0018,
"step": 777
},
{
"epoch": 0.44154370034052215,
"grad_norm": 0.18211375176906586,
"learning_rate": 1.1902525336466464e-05,
"loss": 0.002,
"step": 778
},
{
"epoch": 0.44211123723041995,
"grad_norm": 0.017222406342625618,
"learning_rate": 1.1799959253265668e-05,
"loss": 0.0004,
"step": 779
},
{
"epoch": 0.4426787741203178,
"grad_norm": 0.030056394636631012,
"learning_rate": 1.1697777844051105e-05,
"loss": 0.0007,
"step": 780
},
{
"epoch": 0.44324631101021567,
"grad_norm": 0.2576983571052551,
"learning_rate": 1.1595982137788403e-05,
"loss": 0.002,
"step": 781
},
{
"epoch": 0.4438138479001135,
"grad_norm": 0.20658354461193085,
"learning_rate": 1.1494573159559213e-05,
"loss": 0.0021,
"step": 782
},
{
"epoch": 0.4443813847900113,
"grad_norm": 0.324457049369812,
"learning_rate": 1.1393551930550828e-05,
"loss": 0.0023,
"step": 783
},
{
"epoch": 0.4449489216799092,
"grad_norm": 0.2382335364818573,
"learning_rate": 1.1292919468045877e-05,
"loss": 0.0017,
"step": 784
},
{
"epoch": 0.44551645856980704,
"grad_norm": 0.3122727572917938,
"learning_rate": 1.1192676785412154e-05,
"loss": 0.0041,
"step": 785
},
{
"epoch": 0.4460839954597049,
"grad_norm": 0.06047174334526062,
"learning_rate": 1.1092824892092373e-05,
"loss": 0.0011,
"step": 786
},
{
"epoch": 0.4466515323496027,
"grad_norm": 0.12172012776136398,
"learning_rate": 1.099336479359398e-05,
"loss": 0.002,
"step": 787
},
{
"epoch": 0.44721906923950056,
"grad_norm": 0.05569892004132271,
"learning_rate": 1.0894297491479045e-05,
"loss": 0.0008,
"step": 788
},
{
"epoch": 0.4477866061293984,
"grad_norm": 0.02484039030969143,
"learning_rate": 1.0795623983354215e-05,
"loss": 0.0004,
"step": 789
},
{
"epoch": 0.44835414301929627,
"grad_norm": 0.0289757139980793,
"learning_rate": 1.0697345262860636e-05,
"loss": 0.0005,
"step": 790
},
{
"epoch": 0.44892167990919407,
"grad_norm": 0.025877099484205246,
"learning_rate": 1.0599462319663905e-05,
"loss": 0.0004,
"step": 791
},
{
"epoch": 0.44948921679909193,
"grad_norm": 0.011032159440219402,
"learning_rate": 1.0501976139444191e-05,
"loss": 0.0002,
"step": 792
},
{
"epoch": 0.4500567536889898,
"grad_norm": 0.008858336135745049,
"learning_rate": 1.0404887703886251e-05,
"loss": 0.0001,
"step": 793
},
{
"epoch": 0.45062429057888764,
"grad_norm": 0.022364582866430283,
"learning_rate": 1.0308197990669538e-05,
"loss": 0.0003,
"step": 794
},
{
"epoch": 0.45119182746878544,
"grad_norm": 0.0,
"learning_rate": 1.021190797345839e-05,
"loss": 0.0,
"step": 795
},
{
"epoch": 0.4517593643586833,
"grad_norm": 0.1561431884765625,
"learning_rate": 1.0116018621892237e-05,
"loss": 0.0011,
"step": 796
},
{
"epoch": 0.45232690124858116,
"grad_norm": 0.0,
"learning_rate": 1.0020530901575754e-05,
"loss": 0.0,
"step": 797
},
{
"epoch": 0.452894438138479,
"grad_norm": 0.0,
"learning_rate": 9.92544577406923e-06,
"loss": 0.0,
"step": 798
},
{
"epoch": 0.4534619750283768,
"grad_norm": 0.0,
"learning_rate": 9.830764196878872e-06,
"loss": 0.0,
"step": 799
},
{
"epoch": 0.4540295119182747,
"grad_norm": 0.0,
"learning_rate": 9.73648712344707e-06,
"loss": 0.0,
"step": 800
},
{
"epoch": 0.4540295119182747,
"eval_loss": NaN,
"eval_runtime": 105.9039,
"eval_samples_per_second": 28.025,
"eval_steps_per_second": 7.006,
"step": 800
},
{
"epoch": 0.45459704880817253,
"grad_norm": 0.623717725276947,
"learning_rate": 9.642615503142926e-06,
"loss": 0.0176,
"step": 801
},
{
"epoch": 0.4551645856980704,
"grad_norm": 0.24405649304389954,
"learning_rate": 9.549150281252633e-06,
"loss": 0.0043,
"step": 802
},
{
"epoch": 0.4557321225879682,
"grad_norm": 0.04640405625104904,
"learning_rate": 9.456092398969902e-06,
"loss": 0.0013,
"step": 803
},
{
"epoch": 0.45629965947786605,
"grad_norm": 0.018343493342399597,
"learning_rate": 9.363442793386606e-06,
"loss": 0.0005,
"step": 804
},
{
"epoch": 0.4568671963677639,
"grad_norm": 0.1869243085384369,
"learning_rate": 9.271202397483215e-06,
"loss": 0.0109,
"step": 805
},
{
"epoch": 0.45743473325766176,
"grad_norm": 1.0756665468215942,
"learning_rate": 9.179372140119525e-06,
"loss": 0.0142,
"step": 806
},
{
"epoch": 0.45800227014755956,
"grad_norm": 0.23151446878910065,
"learning_rate": 9.087952946025175e-06,
"loss": 0.0147,
"step": 807
},
{
"epoch": 0.4585698070374574,
"grad_norm": 0.01738697662949562,
"learning_rate": 8.996945735790447e-06,
"loss": 0.0004,
"step": 808
},
{
"epoch": 0.4591373439273553,
"grad_norm": 0.45961514115333557,
"learning_rate": 8.906351425856952e-06,
"loss": 0.0045,
"step": 809
},
{
"epoch": 0.45970488081725314,
"grad_norm": 0.3801596760749817,
"learning_rate": 8.816170928508365e-06,
"loss": 0.009,
"step": 810
},
{
"epoch": 0.46027241770715094,
"grad_norm": 0.03849168121814728,
"learning_rate": 8.7264051518613e-06,
"loss": 0.0008,
"step": 811
},
{
"epoch": 0.4608399545970488,
"grad_norm": 0.2326851785182953,
"learning_rate": 8.637054999856148e-06,
"loss": 0.0093,
"step": 812
},
{
"epoch": 0.46140749148694665,
"grad_norm": 0.048841919749975204,
"learning_rate": 8.548121372247918e-06,
"loss": 0.0007,
"step": 813
},
{
"epoch": 0.4619750283768445,
"grad_norm": 0.02097911760210991,
"learning_rate": 8.459605164597267e-06,
"loss": 0.0005,
"step": 814
},
{
"epoch": 0.4625425652667423,
"grad_norm": 0.3414818346500397,
"learning_rate": 8.371507268261437e-06,
"loss": 0.0023,
"step": 815
},
{
"epoch": 0.46311010215664017,
"grad_norm": 0.08118417859077454,
"learning_rate": 8.283828570385238e-06,
"loss": 0.0008,
"step": 816
},
{
"epoch": 0.463677639046538,
"grad_norm": 0.41794729232788086,
"learning_rate": 8.196569953892202e-06,
"loss": 0.0112,
"step": 817
},
{
"epoch": 0.4642451759364359,
"grad_norm": 0.3035317063331604,
"learning_rate": 8.109732297475635e-06,
"loss": 0.0123,
"step": 818
},
{
"epoch": 0.4648127128263337,
"grad_norm": 0.03364351764321327,
"learning_rate": 8.023316475589754e-06,
"loss": 0.0007,
"step": 819
},
{
"epoch": 0.46538024971623154,
"grad_norm": 0.48411476612091064,
"learning_rate": 7.937323358440935e-06,
"loss": 0.0161,
"step": 820
},
{
"epoch": 0.4659477866061294,
"grad_norm": 0.020044121891260147,
"learning_rate": 7.851753811978924e-06,
"loss": 0.0005,
"step": 821
},
{
"epoch": 0.46651532349602726,
"grad_norm": 0.011223547160625458,
"learning_rate": 7.766608697888095e-06,
"loss": 0.0002,
"step": 822
},
{
"epoch": 0.46708286038592506,
"grad_norm": 0.16546539962291718,
"learning_rate": 7.681888873578786e-06,
"loss": 0.0025,
"step": 823
},
{
"epoch": 0.4676503972758229,
"grad_norm": 0.009762106463313103,
"learning_rate": 7.597595192178702e-06,
"loss": 0.0003,
"step": 824
},
{
"epoch": 0.4682179341657208,
"grad_norm": 0.023298079147934914,
"learning_rate": 7.513728502524286e-06,
"loss": 0.0004,
"step": 825
},
{
"epoch": 0.46878547105561863,
"grad_norm": 0.18507546186447144,
"learning_rate": 7.430289649152156e-06,
"loss": 0.0015,
"step": 826
},
{
"epoch": 0.46935300794551643,
"grad_norm": 0.37739697098731995,
"learning_rate": 7.347279472290647e-06,
"loss": 0.0096,
"step": 827
},
{
"epoch": 0.4699205448354143,
"grad_norm": 0.10507706552743912,
"learning_rate": 7.264698807851328e-06,
"loss": 0.0015,
"step": 828
},
{
"epoch": 0.47048808172531215,
"grad_norm": 0.049794506281614304,
"learning_rate": 7.182548487420554e-06,
"loss": 0.0009,
"step": 829
},
{
"epoch": 0.47105561861521,
"grad_norm": 0.061873581260442734,
"learning_rate": 7.100829338251147e-06,
"loss": 0.001,
"step": 830
},
{
"epoch": 0.4716231555051078,
"grad_norm": 0.02593647502362728,
"learning_rate": 7.019542183254046e-06,
"loss": 0.0004,
"step": 831
},
{
"epoch": 0.47219069239500566,
"grad_norm": 0.6185386776924133,
"learning_rate": 6.9386878409899715e-06,
"loss": 0.0073,
"step": 832
},
{
"epoch": 0.4727582292849035,
"grad_norm": 0.666622519493103,
"learning_rate": 6.858267125661272e-06,
"loss": 0.0093,
"step": 833
},
{
"epoch": 0.4733257661748014,
"grad_norm": 0.05342670530080795,
"learning_rate": 6.778280847103669e-06,
"loss": 0.0008,
"step": 834
},
{
"epoch": 0.4738933030646992,
"grad_norm": 1.0183546543121338,
"learning_rate": 6.698729810778065e-06,
"loss": 0.0078,
"step": 835
},
{
"epoch": 0.47446083995459704,
"grad_norm": 0.1871764212846756,
"learning_rate": 6.619614817762537e-06,
"loss": 0.0014,
"step": 836
},
{
"epoch": 0.4750283768444949,
"grad_norm": 0.19295842945575714,
"learning_rate": 6.540936664744196e-06,
"loss": 0.002,
"step": 837
},
{
"epoch": 0.47559591373439275,
"grad_norm": 0.7019402384757996,
"learning_rate": 6.462696144011149e-06,
"loss": 0.0207,
"step": 838
},
{
"epoch": 0.47616345062429055,
"grad_norm": 0.02959679253399372,
"learning_rate": 6.384894043444567e-06,
"loss": 0.0004,
"step": 839
},
{
"epoch": 0.4767309875141884,
"grad_norm": 0.1982329785823822,
"learning_rate": 6.3075311465107535e-06,
"loss": 0.0008,
"step": 840
},
{
"epoch": 0.47729852440408627,
"grad_norm": 0.03872201591730118,
"learning_rate": 6.230608232253227e-06,
"loss": 0.0004,
"step": 841
},
{
"epoch": 0.4778660612939841,
"grad_norm": 0.08221829682588577,
"learning_rate": 6.154126075284855e-06,
"loss": 0.0006,
"step": 842
},
{
"epoch": 0.4784335981838819,
"grad_norm": 0.007916197180747986,
"learning_rate": 6.078085445780129e-06,
"loss": 0.0002,
"step": 843
},
{
"epoch": 0.4790011350737798,
"grad_norm": 0.008437985554337502,
"learning_rate": 6.002487109467347e-06,
"loss": 0.0002,
"step": 844
},
{
"epoch": 0.47956867196367764,
"grad_norm": 0.005791019182652235,
"learning_rate": 5.927331827620903e-06,
"loss": 0.0001,
"step": 845
},
{
"epoch": 0.4801362088535755,
"grad_norm": 0.0,
"learning_rate": 5.852620357053651e-06,
"loss": 0.0,
"step": 846
},
{
"epoch": 0.4807037457434733,
"grad_norm": 0.0,
"learning_rate": 5.778353450109286e-06,
"loss": 0.0,
"step": 847
},
{
"epoch": 0.48127128263337116,
"grad_norm": 0.0,
"learning_rate": 5.704531854654721e-06,
"loss": 0.0,
"step": 848
},
{
"epoch": 0.481838819523269,
"grad_norm": 0.0,
"learning_rate": 5.631156314072605e-06,
"loss": 0.0,
"step": 849
},
{
"epoch": 0.48240635641316687,
"grad_norm": 0.0,
"learning_rate": 5.558227567253832e-06,
"loss": 0.0,
"step": 850
},
{
"epoch": 0.48297389330306467,
"grad_norm": 0.3478143811225891,
"learning_rate": 5.485746348590048e-06,
"loss": 0.0098,
"step": 851
},
{
"epoch": 0.48354143019296253,
"grad_norm": 0.38309186697006226,
"learning_rate": 5.413713387966329e-06,
"loss": 0.0101,
"step": 852
},
{
"epoch": 0.4841089670828604,
"grad_norm": 0.28978919982910156,
"learning_rate": 5.34212941075381e-06,
"loss": 0.0048,
"step": 853
},
{
"epoch": 0.48467650397275824,
"grad_norm": 0.016624854877591133,
"learning_rate": 5.270995137802315e-06,
"loss": 0.0004,
"step": 854
},
{
"epoch": 0.48524404086265605,
"grad_norm": 0.03391743823885918,
"learning_rate": 5.200311285433213e-06,
"loss": 0.0006,
"step": 855
},
{
"epoch": 0.4858115777525539,
"grad_norm": 0.1863984912633896,
"learning_rate": 5.13007856543209e-06,
"loss": 0.0104,
"step": 856
},
{
"epoch": 0.48637911464245176,
"grad_norm": 0.01976456306874752,
"learning_rate": 5.060297685041659e-06,
"loss": 0.0004,
"step": 857
},
{
"epoch": 0.4869466515323496,
"grad_norm": 0.21336333453655243,
"learning_rate": 4.99096934695461e-06,
"loss": 0.0115,
"step": 858
},
{
"epoch": 0.4875141884222474,
"grad_norm": 0.0573849081993103,
"learning_rate": 4.922094249306558e-06,
"loss": 0.0007,
"step": 859
},
{
"epoch": 0.4880817253121453,
"grad_norm": 0.0516238808631897,
"learning_rate": 4.853673085668947e-06,
"loss": 0.0008,
"step": 860
},
{
"epoch": 0.48864926220204313,
"grad_norm": 0.548546314239502,
"learning_rate": 4.78570654504214e-06,
"loss": 0.0216,
"step": 861
},
{
"epoch": 0.489216799091941,
"grad_norm": 0.03412328287959099,
"learning_rate": 4.7181953118484556e-06,
"loss": 0.0006,
"step": 862
},
{
"epoch": 0.4897843359818388,
"grad_norm": 0.03500758484005928,
"learning_rate": 4.651140065925269e-06,
"loss": 0.0008,
"step": 863
},
{
"epoch": 0.49035187287173665,
"grad_norm": 0.019669918343424797,
"learning_rate": 4.58454148251814e-06,
"loss": 0.0004,
"step": 864
},
{
"epoch": 0.4909194097616345,
"grad_norm": 0.3114485442638397,
"learning_rate": 4.5184002322740785e-06,
"loss": 0.0028,
"step": 865
},
{
"epoch": 0.49148694665153236,
"grad_norm": 0.5113534331321716,
"learning_rate": 4.452716981234744e-06,
"loss": 0.0185,
"step": 866
},
{
"epoch": 0.49205448354143017,
"grad_norm": 0.07297579944133759,
"learning_rate": 4.387492390829734e-06,
"loss": 0.0009,
"step": 867
},
{
"epoch": 0.492622020431328,
"grad_norm": 0.4376738667488098,
"learning_rate": 4.322727117869951e-06,
"loss": 0.0091,
"step": 868
},
{
"epoch": 0.4931895573212259,
"grad_norm": 0.011865437030792236,
"learning_rate": 4.258421814540992e-06,
"loss": 0.0002,
"step": 869
},
{
"epoch": 0.49375709421112374,
"grad_norm": 0.16000713407993317,
"learning_rate": 4.19457712839652e-06,
"loss": 0.0013,
"step": 870
},
{
"epoch": 0.49432463110102154,
"grad_norm": 0.15423361957073212,
"learning_rate": 4.131193702351827e-06,
"loss": 0.0024,
"step": 871
},
{
"epoch": 0.4948921679909194,
"grad_norm": 0.022781820967793465,
"learning_rate": 4.068272174677335e-06,
"loss": 0.0004,
"step": 872
},
{
"epoch": 0.49545970488081725,
"grad_norm": 0.025359636172652245,
"learning_rate": 4.005813178992091e-06,
"loss": 0.0005,
"step": 873
},
{
"epoch": 0.4960272417707151,
"grad_norm": 0.029715919867157936,
"learning_rate": 3.9438173442575e-06,
"loss": 0.0006,
"step": 874
},
{
"epoch": 0.4965947786606129,
"grad_norm": 0.019626963883638382,
"learning_rate": 3.8822852947709375e-06,
"loss": 0.0003,
"step": 875
},
{
"epoch": 0.49716231555051077,
"grad_norm": 0.05726097524166107,
"learning_rate": 3.821217650159453e-06,
"loss": 0.0008,
"step": 876
},
{
"epoch": 0.4977298524404086,
"grad_norm": 0.2366546243429184,
"learning_rate": 3.760615025373543e-06,
"loss": 0.0064,
"step": 877
},
{
"epoch": 0.4982973893303065,
"grad_norm": 0.10293308645486832,
"learning_rate": 3.700478030680987e-06,
"loss": 0.0014,
"step": 878
},
{
"epoch": 0.4988649262202043,
"grad_norm": 0.01754389889538288,
"learning_rate": 3.6408072716606346e-06,
"loss": 0.0003,
"step": 879
},
{
"epoch": 0.49943246311010214,
"grad_norm": 0.8322834968566895,
"learning_rate": 3.581603349196372e-06,
"loss": 0.0213,
"step": 880
},
{
"epoch": 0.5,
"grad_norm": 0.2963551878929138,
"learning_rate": 3.522866859471047e-06,
"loss": 0.0038,
"step": 881
},
{
"epoch": 0.5005675368898979,
"grad_norm": 0.47917166352272034,
"learning_rate": 3.4645983939604496e-06,
"loss": 0.008,
"step": 882
},
{
"epoch": 0.5011350737797957,
"grad_norm": 0.04047662764787674,
"learning_rate": 3.406798539427386e-06,
"loss": 0.0004,
"step": 883
},
{
"epoch": 0.5017026106696936,
"grad_norm": 0.03451113775372505,
"learning_rate": 3.349467877915746e-06,
"loss": 0.0002,
"step": 884
},
{
"epoch": 0.5022701475595914,
"grad_norm": 0.04126652702689171,
"learning_rate": 3.2926069867446675e-06,
"loss": 0.0005,
"step": 885
},
{
"epoch": 0.5028376844494892,
"grad_norm": 0.5223488211631775,
"learning_rate": 3.2362164385026706e-06,
"loss": 0.0093,
"step": 886
},
{
"epoch": 0.503405221339387,
"grad_norm": 0.06192615255713463,
"learning_rate": 3.180296801041971e-06,
"loss": 0.0007,
"step": 887
},
{
"epoch": 0.5039727582292849,
"grad_norm": 0.022352147847414017,
"learning_rate": 3.1248486374726883e-06,
"loss": 0.0003,
"step": 888
},
{
"epoch": 0.5045402951191827,
"grad_norm": 0.3611301779747009,
"learning_rate": 3.069872506157212e-06,
"loss": 0.0028,
"step": 889
},
{
"epoch": 0.5051078320090806,
"grad_norm": 0.4576888978481293,
"learning_rate": 3.0153689607045845e-06,
"loss": 0.0058,
"step": 890
},
{
"epoch": 0.5056753688989785,
"grad_norm": 0.22780518233776093,
"learning_rate": 2.961338549964893e-06,
"loss": 0.0019,
"step": 891
},
{
"epoch": 0.5062429057888763,
"grad_norm": 0.05007459968328476,
"learning_rate": 2.9077818180237693e-06,
"loss": 0.0005,
"step": 892
},
{
"epoch": 0.5068104426787742,
"grad_norm": 0.012189110741019249,
"learning_rate": 2.8546993041969173e-06,
"loss": 0.0002,
"step": 893
},
{
"epoch": 0.5073779795686719,
"grad_norm": 0.009090066887438297,
"learning_rate": 2.802091543024671e-06,
"loss": 0.0002,
"step": 894
},
{
"epoch": 0.5079455164585698,
"grad_norm": 0.6339737176895142,
"learning_rate": 2.7499590642665774e-06,
"loss": 0.0025,
"step": 895
},
{
"epoch": 0.5085130533484676,
"grad_norm": 0.06957720965147018,
"learning_rate": 2.6983023928961404e-06,
"loss": 0.0005,
"step": 896
},
{
"epoch": 0.5090805902383655,
"grad_norm": 7.535338878631592,
"learning_rate": 2.647122049095463e-06,
"loss": 0.0468,
"step": 897
},
{
"epoch": 0.5096481271282634,
"grad_norm": 0.0,
"learning_rate": 2.596418548250029e-06,
"loss": 0.0,
"step": 898
},
{
"epoch": 0.5102156640181612,
"grad_norm": 0.0,
"learning_rate": 2.546192400943537e-06,
"loss": 0.0,
"step": 899
},
{
"epoch": 0.5107832009080591,
"grad_norm": 0.0,
"learning_rate": 2.496444112952734e-06,
"loss": 0.0,
"step": 900
},
{
"epoch": 0.5113507377979569,
"grad_norm": 0.33791133761405945,
"learning_rate": 2.4471741852423237e-06,
"loss": 0.01,
"step": 901
},
{
"epoch": 0.5119182746878547,
"grad_norm": 0.10187462717294693,
"learning_rate": 2.3983831139599287e-06,
"loss": 0.002,
"step": 902
},
{
"epoch": 0.5124858115777525,
"grad_norm": 0.02221851982176304,
"learning_rate": 2.3500713904311024e-06,
"loss": 0.0007,
"step": 903
},
{
"epoch": 0.5130533484676504,
"grad_norm": 0.014832521788775921,
"learning_rate": 2.3022395011543686e-06,
"loss": 0.0005,
"step": 904
},
{
"epoch": 0.5136208853575482,
"grad_norm": 0.26342910528182983,
"learning_rate": 2.2548879277963064e-06,
"loss": 0.0045,
"step": 905
},
{
"epoch": 0.5141884222474461,
"grad_norm": 0.08771803975105286,
"learning_rate": 2.208017147186736e-06,
"loss": 0.0009,
"step": 906
},
{
"epoch": 0.514755959137344,
"grad_norm": 0.2147466242313385,
"learning_rate": 2.161627631313923e-06,
"loss": 0.001,
"step": 907
},
{
"epoch": 0.5153234960272418,
"grad_norm": 0.09128167480230331,
"learning_rate": 2.1157198473197414e-06,
"loss": 0.0013,
"step": 908
},
{
"epoch": 0.5158910329171397,
"grad_norm": 0.01900799199938774,
"learning_rate": 2.070294257495081e-06,
"loss": 0.0004,
"step": 909
},
{
"epoch": 0.5164585698070374,
"grad_norm": 0.040807388722896576,
"learning_rate": 2.0253513192751373e-06,
"loss": 0.0006,
"step": 910
},
{
"epoch": 0.5170261066969353,
"grad_norm": 0.19579406082630157,
"learning_rate": 1.9808914852347813e-06,
"loss": 0.0024,
"step": 911
},
{
"epoch": 0.5175936435868331,
"grad_norm": 0.22570084035396576,
"learning_rate": 1.9369152030840556e-06,
"loss": 0.011,
"step": 912
},
{
"epoch": 0.518161180476731,
"grad_norm": 0.16357550024986267,
"learning_rate": 1.8934229156636452e-06,
"loss": 0.0018,
"step": 913
},
{
"epoch": 0.5187287173666288,
"grad_norm": 0.024271734058856964,
"learning_rate": 1.8504150609403858e-06,
"loss": 0.0004,
"step": 914
},
{
"epoch": 0.5192962542565267,
"grad_norm": 0.4943500757217407,
"learning_rate": 1.807892072002898e-06,
"loss": 0.0099,
"step": 915
},
{
"epoch": 0.5198637911464246,
"grad_norm": 0.22430936992168427,
"learning_rate": 1.7658543770572189e-06,
"loss": 0.0062,
"step": 916
},
{
"epoch": 0.5204313280363224,
"grad_norm": 0.02454625442624092,
"learning_rate": 1.724302399422456e-06,
"loss": 0.0006,
"step": 917
},
{
"epoch": 0.5209988649262202,
"grad_norm": 0.11323986947536469,
"learning_rate": 1.6832365575265741e-06,
"loss": 0.0014,
"step": 918
},
{
"epoch": 0.521566401816118,
"grad_norm": 0.03107013925909996,
"learning_rate": 1.6426572649021476e-06,
"loss": 0.0006,
"step": 919
},
{
"epoch": 0.5221339387060159,
"grad_norm": 0.04345984384417534,
"learning_rate": 1.6025649301821876e-06,
"loss": 0.0004,
"step": 920
},
{
"epoch": 0.5227014755959137,
"grad_norm": 0.4001345932483673,
"learning_rate": 1.5629599570960718e-06,
"loss": 0.0115,
"step": 921
},
{
"epoch": 0.5232690124858116,
"grad_norm": 0.09491916000843048,
"learning_rate": 1.523842744465437e-06,
"loss": 0.0006,
"step": 922
},
{
"epoch": 0.5238365493757094,
"grad_norm": 0.17167732119560242,
"learning_rate": 1.4852136862001764e-06,
"loss": 0.0021,
"step": 923
},
{
"epoch": 0.5244040862656073,
"grad_norm": 0.09016118198633194,
"learning_rate": 1.4470731712944884e-06,
"loss": 0.0004,
"step": 924
},
{
"epoch": 0.5249716231555052,
"grad_norm": 0.10207764804363251,
"learning_rate": 1.4094215838229176e-06,
"loss": 0.001,
"step": 925
},
{
"epoch": 0.5255391600454029,
"grad_norm": 0.009659104980528355,
"learning_rate": 1.372259302936546e-06,
"loss": 0.0002,
"step": 926
},
{
"epoch": 0.5261066969353008,
"grad_norm": 0.022223835811018944,
"learning_rate": 1.3355867028591208e-06,
"loss": 0.0004,
"step": 927
},
{
"epoch": 0.5266742338251986,
"grad_norm": 0.3852534890174866,
"learning_rate": 1.2994041528833266e-06,
"loss": 0.0176,
"step": 928
},
{
"epoch": 0.5272417707150965,
"grad_norm": 0.032568030059337616,
"learning_rate": 1.2637120173670358e-06,
"loss": 0.0005,
"step": 929
},
{
"epoch": 0.5278093076049943,
"grad_norm": 0.12185148894786835,
"learning_rate": 1.2285106557296477e-06,
"loss": 0.0014,
"step": 930
},
{
"epoch": 0.5283768444948922,
"grad_norm": 0.610008716583252,
"learning_rate": 1.1938004224484988e-06,
"loss": 0.0051,
"step": 931
},
{
"epoch": 0.52894438138479,
"grad_norm": 0.06961622089147568,
"learning_rate": 1.1595816670552428e-06,
"loss": 0.0011,
"step": 932
},
{
"epoch": 0.5295119182746879,
"grad_norm": 0.055191271007061005,
"learning_rate": 1.1258547341323699e-06,
"loss": 0.0005,
"step": 933
},
{
"epoch": 0.5300794551645857,
"grad_norm": 0.43228384852409363,
"learning_rate": 1.0926199633097157e-06,
"loss": 0.0173,
"step": 934
},
{
"epoch": 0.5306469920544835,
"grad_norm": 0.4304471015930176,
"learning_rate": 1.0598776892610685e-06,
"loss": 0.0156,
"step": 935
},
{
"epoch": 0.5312145289443814,
"grad_norm": 0.02435118891298771,
"learning_rate": 1.02762824170074e-06,
"loss": 0.0003,
"step": 936
},
{
"epoch": 0.5317820658342792,
"grad_norm": 0.18826240301132202,
"learning_rate": 9.958719453803278e-07,
"loss": 0.002,
"step": 937
},
{
"epoch": 0.5323496027241771,
"grad_norm": 0.5460970401763916,
"learning_rate": 9.646091200853802e-07,
"loss": 0.0294,
"step": 938
},
{
"epoch": 0.5329171396140749,
"grad_norm": 0.013325286097824574,
"learning_rate": 9.338400806321978e-07,
"loss": 0.0002,
"step": 939
},
{
"epoch": 0.5334846765039728,
"grad_norm": 0.010715479031205177,
"learning_rate": 9.035651368646648e-07,
"loss": 0.0002,
"step": 940
},
{
"epoch": 0.5340522133938707,
"grad_norm": 0.05249933898448944,
"learning_rate": 8.737845936511335e-07,
"loss": 0.0007,
"step": 941
},
{
"epoch": 0.5346197502837684,
"grad_norm": 0.6176870465278625,
"learning_rate": 8.444987508813451e-07,
"loss": 0.0114,
"step": 942
},
{
"epoch": 0.5351872871736663,
"grad_norm": 0.05264892801642418,
"learning_rate": 8.157079034633974e-07,
"loss": 0.0003,
"step": 943
},
{
"epoch": 0.5357548240635641,
"grad_norm": 0.022312408313155174,
"learning_rate": 7.874123413208145e-07,
"loss": 0.0002,
"step": 944
},
{
"epoch": 0.536322360953462,
"grad_norm": 0.0,
"learning_rate": 7.596123493895991e-07,
"loss": 0.0,
"step": 945
},
{
"epoch": 0.5368898978433598,
"grad_norm": 0.0,
"learning_rate": 7.323082076153509e-07,
"loss": 0.0,
"step": 946
},
{
"epoch": 0.5374574347332577,
"grad_norm": 0.0,
"learning_rate": 7.055001909504755e-07,
"loss": 0.0,
"step": 947
},
{
"epoch": 0.5380249716231555,
"grad_norm": 0.0,
"learning_rate": 6.791885693514133e-07,
"loss": 0.0,
"step": 948
},
{
"epoch": 0.5385925085130534,
"grad_norm": 29.0640926361084,
"learning_rate": 6.533736077758868e-07,
"loss": 0.3484,
"step": 949
},
{
"epoch": 0.5391600454029511,
"grad_norm": 0.0,
"learning_rate": 6.280555661802856e-07,
"loss": 0.0,
"step": 950
},
{
"epoch": 0.539727582292849,
"grad_norm": 0.7819038033485413,
"learning_rate": 6.032346995169968e-07,
"loss": 0.0194,
"step": 951
},
{
"epoch": 0.5402951191827469,
"grad_norm": 0.15347006916999817,
"learning_rate": 5.78911257731879e-07,
"loss": 0.0101,
"step": 952
},
{
"epoch": 0.5408626560726447,
"grad_norm": 0.2514842748641968,
"learning_rate": 5.550854857617193e-07,
"loss": 0.0147,
"step": 953
},
{
"epoch": 0.5414301929625426,
"grad_norm": 0.018654726445674896,
"learning_rate": 5.317576235317756e-07,
"loss": 0.0004,
"step": 954
},
{
"epoch": 0.5419977298524404,
"grad_norm": 0.017752651125192642,
"learning_rate": 5.089279059533658e-07,
"loss": 0.0005,
"step": 955
},
{
"epoch": 0.5425652667423383,
"grad_norm": 0.010392699390649796,
"learning_rate": 4.865965629214819e-07,
"loss": 0.0003,
"step": 956
},
{
"epoch": 0.5431328036322361,
"grad_norm": 0.183840811252594,
"learning_rate": 4.647638193125137e-07,
"loss": 0.009,
"step": 957
},
{
"epoch": 0.5437003405221339,
"grad_norm": 0.22836080193519592,
"learning_rate": 4.434298949819449e-07,
"loss": 0.0039,
"step": 958
},
{
"epoch": 0.5442678774120318,
"grad_norm": 0.19406422972679138,
"learning_rate": 4.2259500476214407e-07,
"loss": 0.0091,
"step": 959
},
{
"epoch": 0.5448354143019296,
"grad_norm": 0.015072612091898918,
"learning_rate": 4.02259358460233e-07,
"loss": 0.0004,
"step": 960
},
{
"epoch": 0.5454029511918275,
"grad_norm": 0.09216684103012085,
"learning_rate": 3.824231608559492e-07,
"loss": 0.0006,
"step": 961
},
{
"epoch": 0.5459704880817253,
"grad_norm": 0.35321223735809326,
"learning_rate": 3.630866116995757e-07,
"loss": 0.006,
"step": 962
},
{
"epoch": 0.5465380249716232,
"grad_norm": 0.1484595090150833,
"learning_rate": 3.4424990570994797e-07,
"loss": 0.0039,
"step": 963
},
{
"epoch": 0.547105561861521,
"grad_norm": 0.25822874903678894,
"learning_rate": 3.2591323257248893e-07,
"loss": 0.0036,
"step": 964
},
{
"epoch": 0.5476730987514189,
"grad_norm": 0.11979275941848755,
"learning_rate": 3.080767769372939e-07,
"loss": 0.0012,
"step": 965
},
{
"epoch": 0.5482406356413166,
"grad_norm": 0.013929195702075958,
"learning_rate": 2.907407184172706e-07,
"loss": 0.0003,
"step": 966
},
{
"epoch": 0.5488081725312145,
"grad_norm": 0.017956310883164406,
"learning_rate": 2.7390523158633554e-07,
"loss": 0.0004,
"step": 967
},
{
"epoch": 0.5493757094211124,
"grad_norm": 0.010877071879804134,
"learning_rate": 2.5757048597765396e-07,
"loss": 0.0003,
"step": 968
},
{
"epoch": 0.5499432463110102,
"grad_norm": 0.016699308529496193,
"learning_rate": 2.4173664608193593e-07,
"loss": 0.0004,
"step": 969
},
{
"epoch": 0.5505107832009081,
"grad_norm": 0.014117494225502014,
"learning_rate": 2.2640387134577058e-07,
"loss": 0.0003,
"step": 970
},
{
"epoch": 0.5510783200908059,
"grad_norm": 0.3465084135532379,
"learning_rate": 2.1157231617002783e-07,
"loss": 0.0134,
"step": 971
},
{
"epoch": 0.5516458569807038,
"grad_norm": 0.098577119410038,
"learning_rate": 1.9724212990830938e-07,
"loss": 0.0015,
"step": 972
},
{
"epoch": 0.5522133938706016,
"grad_norm": 0.008477923460304737,
"learning_rate": 1.8341345686543332e-07,
"loss": 0.0002,
"step": 973
},
{
"epoch": 0.5527809307604994,
"grad_norm": 0.31904277205467224,
"learning_rate": 1.7008643629596866e-07,
"loss": 0.0125,
"step": 974
},
{
"epoch": 0.5533484676503972,
"grad_norm": 0.02196822501718998,
"learning_rate": 1.5726120240288634e-07,
"loss": 0.0005,
"step": 975
},
{
"epoch": 0.5539160045402951,
"grad_norm": 0.03046775981783867,
"learning_rate": 1.449378843361271e-07,
"loss": 0.0004,
"step": 976
},
{
"epoch": 0.554483541430193,
"grad_norm": 0.021513327956199646,
"learning_rate": 1.3311660619138578e-07,
"loss": 0.0004,
"step": 977
},
{
"epoch": 0.5550510783200908,
"grad_norm": 0.01769246906042099,
"learning_rate": 1.2179748700879012e-07,
"loss": 0.0004,
"step": 978
},
{
"epoch": 0.5556186152099887,
"grad_norm": 0.12877629697322845,
"learning_rate": 1.109806407717462e-07,
"loss": 0.0014,
"step": 979
},
{
"epoch": 0.5561861520998865,
"grad_norm": 0.3173483610153198,
"learning_rate": 1.0066617640578368e-07,
"loss": 0.0027,
"step": 980
},
{
"epoch": 0.5567536889897844,
"grad_norm": 0.045826442539691925,
"learning_rate": 9.085419777743465e-08,
"loss": 0.0007,
"step": 981
},
{
"epoch": 0.5573212258796821,
"grad_norm": 0.01739875227212906,
"learning_rate": 8.15448036932176e-08,
"loss": 0.0002,
"step": 982
},
{
"epoch": 0.55788876276958,
"grad_norm": 0.04034648835659027,
"learning_rate": 7.273808789862724e-08,
"loss": 0.0005,
"step": 983
},
{
"epoch": 0.5584562996594779,
"grad_norm": 0.02119840681552887,
"learning_rate": 6.443413907720186e-08,
"loss": 0.0003,
"step": 984
},
{
"epoch": 0.5590238365493757,
"grad_norm": 0.006225614342838526,
"learning_rate": 5.663304084960186e-08,
"loss": 0.0001,
"step": 985
},
{
"epoch": 0.5595913734392736,
"grad_norm": 0.07903040200471878,
"learning_rate": 4.933487177280482e-08,
"loss": 0.0005,
"step": 986
},
{
"epoch": 0.5601589103291714,
"grad_norm": 0.5586972832679749,
"learning_rate": 4.253970533929508e-08,
"loss": 0.011,
"step": 987
},
{
"epoch": 0.5607264472190693,
"grad_norm": 0.9101510047912598,
"learning_rate": 3.624760997631982e-08,
"loss": 0.0241,
"step": 988
},
{
"epoch": 0.5612939841089671,
"grad_norm": 0.04058132693171501,
"learning_rate": 3.04586490452119e-08,
"loss": 0.0004,
"step": 989
},
{
"epoch": 0.5618615209988649,
"grad_norm": 0.017602860927581787,
"learning_rate": 2.5172880840745873e-08,
"loss": 0.0003,
"step": 990
},
{
"epoch": 0.5624290578887627,
"grad_norm": 0.008799027651548386,
"learning_rate": 2.0390358590538504e-08,
"loss": 0.0002,
"step": 991
},
{
"epoch": 0.5629965947786606,
"grad_norm": 0.022075001150369644,
"learning_rate": 1.6111130454543598e-08,
"loss": 0.0003,
"step": 992
},
{
"epoch": 0.5635641316685585,
"grad_norm": 0.00847385823726654,
"learning_rate": 1.2335239524541299e-08,
"loss": 0.0002,
"step": 993
},
{
"epoch": 0.5641316685584563,
"grad_norm": 0.039732273668050766,
"learning_rate": 9.06272382371065e-09,
"loss": 0.0004,
"step": 994
},
{
"epoch": 0.5646992054483542,
"grad_norm": 0.022346949204802513,
"learning_rate": 6.293616306246586e-09,
"loss": 0.0003,
"step": 995
},
{
"epoch": 0.565266742338252,
"grad_norm": 0.0,
"learning_rate": 4.0279448570323954e-09,
"loss": 0.0,
"step": 996
},
{
"epoch": 0.5658342792281499,
"grad_norm": 0.0,
"learning_rate": 2.265732291356626e-09,
"loss": 0.0,
"step": 997
},
{
"epoch": 0.5664018161180476,
"grad_norm": 0.0,
"learning_rate": 1.0069963546743832e-09,
"loss": 0.0,
"step": 998
},
{
"epoch": 0.5669693530079455,
"grad_norm": 0.0,
"learning_rate": 2.5174972244634833e-10,
"loss": 0.0,
"step": 999
},
{
"epoch": 0.5675368898978433,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 0.0,
"step": 1000
},
{
"epoch": 0.5675368898978433,
"eval_loss": NaN,
"eval_runtime": 105.9662,
"eval_samples_per_second": 28.009,
"eval_steps_per_second": 7.002,
"step": 1000
}
],
"logging_steps": 1,
"max_steps": 1000,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 200,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 4
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.3001540599808e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}