{
"best_metric": 0.3927477017364658,
"best_model_checkpoint": "/media/cse/HDD/Shawon/shawon/MY DATA/VideoMAE_WLASL_2000_200_epochs_p20_SR_8_kinetics/checkpoint-28580",
"epoch": 35.005000699888015,
"eval_steps": 500,
"global_step": 64305,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0002799552071668533,
"grad_norm": 20.226421356201172,
"learning_rate": 1.3997760358342666e-07,
"loss": 30.4518,
"step": 100
},
{
"epoch": 0.0005599104143337066,
"grad_norm": 24.225967407226562,
"learning_rate": 2.799552071668533e-07,
"loss": 30.4528,
"step": 200
},
{
"epoch": 0.0008398656215005599,
"grad_norm": 20.26617431640625,
"learning_rate": 4.1993281075028e-07,
"loss": 30.5309,
"step": 300
},
{
"epoch": 0.0011198208286674132,
"grad_norm": 18.84654426574707,
"learning_rate": 5.599104143337066e-07,
"loss": 30.4432,
"step": 400
},
{
"epoch": 0.0013997760358342665,
"grad_norm": 19.313968658447266,
"learning_rate": 6.998880179171333e-07,
"loss": 30.476,
"step": 500
},
{
"epoch": 0.0016797312430011197,
"grad_norm": 19.507740020751953,
"learning_rate": 8.3986562150056e-07,
"loss": 30.4617,
"step": 600
},
{
"epoch": 0.001959686450167973,
"grad_norm": 18.0329647064209,
"learning_rate": 9.798432250839866e-07,
"loss": 30.4795,
"step": 700
},
{
"epoch": 0.0022396416573348264,
"grad_norm": 17.514469146728516,
"learning_rate": 1.1198208286674133e-06,
"loss": 30.4514,
"step": 800
},
{
"epoch": 0.0025195968645016797,
"grad_norm": 18.40747833251953,
"learning_rate": 1.25979843225084e-06,
"loss": 30.4258,
"step": 900
},
{
"epoch": 0.002799552071668533,
"grad_norm": 15.273112297058105,
"learning_rate": 1.3997760358342666e-06,
"loss": 30.4209,
"step": 1000
},
{
"epoch": 0.003079507278835386,
"grad_norm": 16.360700607299805,
"learning_rate": 1.5397536394176933e-06,
"loss": 30.4484,
"step": 1100
},
{
"epoch": 0.0033594624860022394,
"grad_norm": 14.241786003112793,
"learning_rate": 1.67973124300112e-06,
"loss": 30.4332,
"step": 1200
},
{
"epoch": 0.003639417693169093,
"grad_norm": 13.431915283203125,
"learning_rate": 1.8197088465845464e-06,
"loss": 30.4436,
"step": 1300
},
{
"epoch": 0.003919372900335946,
"grad_norm": 12.783411026000977,
"learning_rate": 1.9596864501679732e-06,
"loss": 30.436,
"step": 1400
},
{
"epoch": 0.0041993281075028,
"grad_norm": 12.723618507385254,
"learning_rate": 2.0996640537514e-06,
"loss": 30.4174,
"step": 1500
},
{
"epoch": 0.004479283314669653,
"grad_norm": 14.043514251708984,
"learning_rate": 2.2396416573348266e-06,
"loss": 30.4404,
"step": 1600
},
{
"epoch": 0.004759238521836506,
"grad_norm": 11.425649642944336,
"learning_rate": 2.379619260918253e-06,
"loss": 30.4419,
"step": 1700
},
{
"epoch": 0.005,
"eval_accuracy": 0.0012768130745658835,
"eval_f1": 0.0003286915125900798,
"eval_loss": 7.599897861480713,
"eval_precision": 0.0003945151684672421,
"eval_recall": 0.0012768130745658835,
"eval_runtime": 160.9217,
"eval_samples_per_second": 24.335,
"eval_steps_per_second": 12.167,
"eval_top_10_accuracy": 0.007660878447395302,
"eval_top_1_accuracy": 0.0012768130745658835,
"eval_top_5_accuracy": 0.0040858018386108275,
"step": 1786
},
{
"epoch": 1.0000384938409854,
"grad_norm": 11.807793617248535,
"learning_rate": 2.5181970884658456e-06,
"loss": 30.4173,
"step": 1800
},
{
"epoch": 1.0003184490481523,
"grad_norm": 12.485947608947754,
"learning_rate": 2.6581746920492722e-06,
"loss": 30.3847,
"step": 1900
},
{
"epoch": 1.000598404255319,
"grad_norm": 12.61637020111084,
"learning_rate": 2.798152295632699e-06,
"loss": 30.3864,
"step": 2000
},
{
"epoch": 1.000878359462486,
"grad_norm": 14.853839874267578,
"learning_rate": 2.9381298992161256e-06,
"loss": 30.3646,
"step": 2100
},
{
"epoch": 1.0011583146696528,
"grad_norm": 13.900428771972656,
"learning_rate": 3.0781075027995522e-06,
"loss": 30.3906,
"step": 2200
},
{
"epoch": 1.0014382698768196,
"grad_norm": 15.140420913696289,
"learning_rate": 3.218085106382979e-06,
"loss": 30.38,
"step": 2300
},
{
"epoch": 1.0017182250839867,
"grad_norm": 13.726295471191406,
"learning_rate": 3.358062709966406e-06,
"loss": 30.387,
"step": 2400
},
{
"epoch": 1.0019981802911535,
"grad_norm": 14.190909385681152,
"learning_rate": 3.498040313549832e-06,
"loss": 30.3637,
"step": 2500
},
{
"epoch": 1.0022781354983203,
"grad_norm": 14.675251960754395,
"learning_rate": 3.638017917133259e-06,
"loss": 30.3778,
"step": 2600
},
{
"epoch": 1.0025580907054872,
"grad_norm": 12.957648277282715,
"learning_rate": 3.777995520716685e-06,
"loss": 30.3976,
"step": 2700
},
{
"epoch": 1.002838045912654,
"grad_norm": 12.860355377197266,
"learning_rate": 3.917973124300112e-06,
"loss": 30.3693,
"step": 2800
},
{
"epoch": 1.0031180011198209,
"grad_norm": 14.768610954284668,
"learning_rate": 4.057950727883539e-06,
"loss": 30.3588,
"step": 2900
},
{
"epoch": 1.0033979563269877,
"grad_norm": 16.286272048950195,
"learning_rate": 4.1979283314669655e-06,
"loss": 30.3603,
"step": 3000
},
{
"epoch": 1.0036779115341545,
"grad_norm": 18.51091766357422,
"learning_rate": 4.337905935050392e-06,
"loss": 30.3436,
"step": 3100
},
{
"epoch": 1.0039578667413214,
"grad_norm": 19.337949752807617,
"learning_rate": 4.477883538633819e-06,
"loss": 30.3448,
"step": 3200
},
{
"epoch": 1.0042378219484882,
"grad_norm": 19.713043212890625,
"learning_rate": 4.6178611422172455e-06,
"loss": 30.3127,
"step": 3300
},
{
"epoch": 1.004517777155655,
"grad_norm": 20.15603256225586,
"learning_rate": 4.757838745800672e-06,
"loss": 30.3277,
"step": 3400
},
{
"epoch": 1.0047977323628219,
"grad_norm": 20.172250747680664,
"learning_rate": 4.897816349384099e-06,
"loss": 30.3231,
"step": 3500
},
{
"epoch": 1.0049993001119821,
"eval_accuracy": 0.001787538304392237,
"eval_f1": 0.0005964394765241968,
"eval_loss": 7.570636749267578,
"eval_precision": 0.0006831536678159338,
"eval_recall": 0.001787538304392237,
"eval_runtime": 116.0933,
"eval_samples_per_second": 33.731,
"eval_steps_per_second": 16.866,
"eval_top_10_accuracy": 0.01174668028600613,
"eval_top_1_accuracy": 0.001787538304392237,
"eval_top_5_accuracy": 0.007660878447395302,
"step": 3572
},
{
"epoch": 2.000076987681971,
"grad_norm": 22.496971130371094,
"learning_rate": 5.0377939529675254e-06,
"loss": 30.2773,
"step": 3600
},
{
"epoch": 2.000356942889138,
"grad_norm": 29.170331954956055,
"learning_rate": 5.177771556550952e-06,
"loss": 30.1604,
"step": 3700
},
{
"epoch": 2.0006368980963045,
"grad_norm": 33.307071685791016,
"learning_rate": 5.3163493840985445e-06,
"loss": 30.077,
"step": 3800
},
{
"epoch": 2.0009168533034716,
"grad_norm": 29.411121368408203,
"learning_rate": 5.456326987681971e-06,
"loss": 30.0944,
"step": 3900
},
{
"epoch": 2.001196808510638,
"grad_norm": 35.26206588745117,
"learning_rate": 5.596304591265398e-06,
"loss": 30.0089,
"step": 4000
},
{
"epoch": 2.0014767637178053,
"grad_norm": 38.77720260620117,
"learning_rate": 5.7362821948488245e-06,
"loss": 29.9731,
"step": 4100
},
{
"epoch": 2.001756718924972,
"grad_norm": 33.86006164550781,
"learning_rate": 5.876259798432251e-06,
"loss": 30.0306,
"step": 4200
},
{
"epoch": 2.002036674132139,
"grad_norm": 38.09397888183594,
"learning_rate": 6.016237402015678e-06,
"loss": 29.8864,
"step": 4300
},
{
"epoch": 2.0023166293393055,
"grad_norm": 39.75291442871094,
"learning_rate": 6.1562150055991044e-06,
"loss": 29.8617,
"step": 4400
},
{
"epoch": 2.0025965845464726,
"grad_norm": 41.37080764770508,
"learning_rate": 6.294792833146697e-06,
"loss": 29.9367,
"step": 4500
},
{
"epoch": 2.002876539753639,
"grad_norm": 37.2781982421875,
"learning_rate": 6.434770436730124e-06,
"loss": 29.9061,
"step": 4600
},
{
"epoch": 2.0031564949608063,
"grad_norm": 41.08019256591797,
"learning_rate": 6.57474804031355e-06,
"loss": 29.779,
"step": 4700
},
{
"epoch": 2.0034364501679733,
"grad_norm": 38.2447624206543,
"learning_rate": 6.714725643896977e-06,
"loss": 29.893,
"step": 4800
},
{
"epoch": 2.00371640537514,
"grad_norm": 42.33695983886719,
"learning_rate": 6.854703247480404e-06,
"loss": 29.6515,
"step": 4900
},
{
"epoch": 2.003996360582307,
"grad_norm": 40.0054817199707,
"learning_rate": 6.99468085106383e-06,
"loss": 29.7127,
"step": 5000
},
{
"epoch": 2.0042763157894736,
"grad_norm": 39.20315170288086,
"learning_rate": 7.134658454647257e-06,
"loss": 29.6478,
"step": 5100
},
{
"epoch": 2.0045562709966407,
"grad_norm": 39.36686325073242,
"learning_rate": 7.274636058230684e-06,
"loss": 29.506,
"step": 5200
},
{
"epoch": 2.0048362262038073,
"grad_norm": 43.698448181152344,
"learning_rate": 7.41461366181411e-06,
"loss": 29.5546,
"step": 5300
},
{
"epoch": 2.0049986002239644,
"eval_accuracy": 0.007405515832482125,
"eval_f1": 0.0019352484689869383,
"eval_loss": 7.384550094604492,
"eval_precision": 0.0016670227951139786,
"eval_recall": 0.007405515832482125,
"eval_runtime": 115.3473,
"eval_samples_per_second": 33.95,
"eval_steps_per_second": 16.975,
"eval_top_10_accuracy": 0.04162410623084781,
"eval_top_1_accuracy": 0.007405515832482125,
"eval_top_5_accuracy": 0.02400408580183861,
"step": 5358
},
{
"epoch": 3.0001154815229563,
"grad_norm": 41.90152359008789,
"learning_rate": 7.554591265397537e-06,
"loss": 29.4201,
"step": 5400
},
{
"epoch": 3.0003954367301233,
"grad_norm": 40.79745864868164,
"learning_rate": 7.694568868980963e-06,
"loss": 28.9946,
"step": 5500
},
{
"epoch": 3.00067539193729,
"grad_norm": 42.561485290527344,
"learning_rate": 7.83454647256439e-06,
"loss": 28.9452,
"step": 5600
},
{
"epoch": 3.000955347144457,
"grad_norm": 43.85258865356445,
"learning_rate": 7.974524076147817e-06,
"loss": 28.9244,
"step": 5700
},
{
"epoch": 3.0012353023516236,
"grad_norm": 42.8238639831543,
"learning_rate": 8.114501679731243e-06,
"loss": 28.8971,
"step": 5800
},
{
"epoch": 3.0015152575587907,
"grad_norm": 42.562843322753906,
"learning_rate": 8.25447928331467e-06,
"loss": 28.8098,
"step": 5900
},
{
"epoch": 3.0017952127659573,
"grad_norm": 49.49258041381836,
"learning_rate": 8.394456886898098e-06,
"loss": 28.7524,
"step": 6000
},
{
"epoch": 3.0020751679731243,
"grad_norm": 42.201019287109375,
"learning_rate": 8.534434490481523e-06,
"loss": 28.8442,
"step": 6100
},
{
"epoch": 3.002355123180291,
"grad_norm": 40.5860595703125,
"learning_rate": 8.674412094064951e-06,
"loss": 28.6063,
"step": 6200
},
{
"epoch": 3.002635078387458,
"grad_norm": 42.844871520996094,
"learning_rate": 8.814389697648377e-06,
"loss": 28.6592,
"step": 6300
},
{
"epoch": 3.0029150335946246,
"grad_norm": 39.86319351196289,
"learning_rate": 8.954367301231802e-06,
"loss": 28.5535,
"step": 6400
},
{
"epoch": 3.0031949888017917,
"grad_norm": 46.02956771850586,
"learning_rate": 9.09434490481523e-06,
"loss": 28.6243,
"step": 6500
},
{
"epoch": 3.0034749440089588,
"grad_norm": 44.78327560424805,
"learning_rate": 9.232922732362822e-06,
"loss": 28.456,
"step": 6600
},
{
"epoch": 3.0037548992161254,
"grad_norm": 45.030609130859375,
"learning_rate": 9.37290033594625e-06,
"loss": 28.4532,
"step": 6700
},
{
"epoch": 3.0040348544232924,
"grad_norm": 42.88111877441406,
"learning_rate": 9.512877939529676e-06,
"loss": 28.4727,
"step": 6800
},
{
"epoch": 3.004314809630459,
"grad_norm": 40.01094436645508,
"learning_rate": 9.652855543113103e-06,
"loss": 28.3569,
"step": 6900
},
{
"epoch": 3.004594764837626,
"grad_norm": 41.569541931152344,
"learning_rate": 9.792833146696529e-06,
"loss": 28.2736,
"step": 7000
},
{
"epoch": 3.0048747200447927,
"grad_norm": 41.992950439453125,
"learning_rate": 9.932810750279955e-06,
"loss": 28.1321,
"step": 7100
},
{
"epoch": 3.005000699888018,
"eval_accuracy": 0.02400408580183861,
"eval_f1": 0.008749887799994431,
"eval_loss": 7.097176551818848,
"eval_precision": 0.008811658496728353,
"eval_recall": 0.02400408580183861,
"eval_runtime": 116.5095,
"eval_samples_per_second": 33.611,
"eval_steps_per_second": 16.806,
"eval_top_10_accuracy": 0.10929519918283963,
"eval_top_1_accuracy": 0.02400408580183861,
"eval_top_5_accuracy": 0.07073544433094994,
"step": 7145
},
{
"epoch": 4.000153975363942,
"grad_norm": 40.83894729614258,
"learning_rate": 1.0072788353863382e-05,
"loss": 27.8419,
"step": 7200
},
{
"epoch": 4.000433930571108,
"grad_norm": 40.60577392578125,
"learning_rate": 1.0212765957446808e-05,
"loss": 27.5321,
"step": 7300
},
{
"epoch": 4.000713885778276,
"grad_norm": 44.68207931518555,
"learning_rate": 1.0352743561030236e-05,
"loss": 27.5769,
"step": 7400
},
{
"epoch": 4.000993840985442,
"grad_norm": 43.748348236083984,
"learning_rate": 1.0492721164613663e-05,
"loss": 27.4798,
"step": 7500
},
{
"epoch": 4.001273796192609,
"grad_norm": 40.99502182006836,
"learning_rate": 1.0632698768197089e-05,
"loss": 27.3829,
"step": 7600
},
{
"epoch": 4.001553751399776,
"grad_norm": 42.76571273803711,
"learning_rate": 1.0772676371780516e-05,
"loss": 27.3276,
"step": 7700
},
{
"epoch": 4.001833706606943,
"grad_norm": 45.32221221923828,
"learning_rate": 1.0912653975363942e-05,
"loss": 27.2542,
"step": 7800
},
{
"epoch": 4.00211366181411,
"grad_norm": 44.121070861816406,
"learning_rate": 1.1052631578947368e-05,
"loss": 27.3298,
"step": 7900
},
{
"epoch": 4.002393617021276,
"grad_norm": 42.33357238769531,
"learning_rate": 1.1192609182530796e-05,
"loss": 27.1349,
"step": 8000
},
{
"epoch": 4.002673572228444,
"grad_norm": 41.43338394165039,
"learning_rate": 1.1332586786114223e-05,
"loss": 27.1831,
"step": 8100
},
{
"epoch": 4.0029535274356105,
"grad_norm": 43.54603958129883,
"learning_rate": 1.1472564389697649e-05,
"loss": 26.9691,
"step": 8200
},
{
"epoch": 4.003233482642777,
"grad_norm": 40.624549865722656,
"learning_rate": 1.1612541993281076e-05,
"loss": 27.0156,
"step": 8300
},
{
"epoch": 4.003513437849944,
"grad_norm": 43.285614013671875,
"learning_rate": 1.1752519596864502e-05,
"loss": 27.0116,
"step": 8400
},
{
"epoch": 4.003793393057111,
"grad_norm": 40.69886779785156,
"learning_rate": 1.1892497200447928e-05,
"loss": 26.9703,
"step": 8500
},
{
"epoch": 4.004073348264278,
"grad_norm": 40.920372009277344,
"learning_rate": 1.2032474804031356e-05,
"loss": 26.9118,
"step": 8600
},
{
"epoch": 4.0043533034714445,
"grad_norm": 43.02329635620117,
"learning_rate": 1.2172452407614781e-05,
"loss": 26.81,
"step": 8700
},
{
"epoch": 4.004633258678611,
"grad_norm": 46.108150482177734,
"learning_rate": 1.2312430011198209e-05,
"loss": 26.6285,
"step": 8800
},
{
"epoch": 4.004913213885779,
"grad_norm": 43.06632995605469,
"learning_rate": 1.2452407614781636e-05,
"loss": 26.7987,
"step": 8900
},
{
"epoch": 4.005,
"eval_accuracy": 0.04647599591419816,
"eval_f1": 0.01843357325169779,
"eval_loss": 6.765562534332275,
"eval_precision": 0.01696919088530181,
"eval_recall": 0.04647599591419816,
"eval_runtime": 114.7714,
"eval_samples_per_second": 34.12,
"eval_steps_per_second": 17.06,
"eval_top_10_accuracy": 0.19126659856996936,
"eval_top_1_accuracy": 0.04647599591419816,
"eval_top_5_accuracy": 0.127170582226762,
"step": 8931
},
{
"epoch": 5.0001924692049275,
"grad_norm": 38.85215377807617,
"learning_rate": 1.2590985442329229e-05,
"loss": 26.275,
"step": 9000
},
{
"epoch": 5.000472424412094,
"grad_norm": 44.39185333251953,
"learning_rate": 1.2730963045912656e-05,
"loss": 25.8789,
"step": 9100
},
{
"epoch": 5.000752379619261,
"grad_norm": 42.62602996826172,
"learning_rate": 1.287094064949608e-05,
"loss": 25.7782,
"step": 9200
},
{
"epoch": 5.001032334826427,
"grad_norm": 41.51466751098633,
"learning_rate": 1.3010918253079508e-05,
"loss": 25.8335,
"step": 9300
},
{
"epoch": 5.001312290033595,
"grad_norm": 42.93080139160156,
"learning_rate": 1.3150895856662934e-05,
"loss": 25.7338,
"step": 9400
},
{
"epoch": 5.0015922452407615,
"grad_norm": 42.483642578125,
"learning_rate": 1.3290873460246361e-05,
"loss": 25.7467,
"step": 9500
},
{
"epoch": 5.001872200447928,
"grad_norm": 45.41472625732422,
"learning_rate": 1.3430851063829789e-05,
"loss": 25.5684,
"step": 9600
},
{
"epoch": 5.002152155655095,
"grad_norm": 42.98210144042969,
"learning_rate": 1.3570828667413216e-05,
"loss": 25.6864,
"step": 9700
},
{
"epoch": 5.002432110862262,
"grad_norm": 48.234275817871094,
"learning_rate": 1.371080627099664e-05,
"loss": 25.4836,
"step": 9800
},
{
"epoch": 5.002712066069429,
"grad_norm": 43.54291915893555,
"learning_rate": 1.3850783874580068e-05,
"loss": 25.404,
"step": 9900
},
{
"epoch": 5.0029920212765955,
"grad_norm": 44.19446563720703,
"learning_rate": 1.3990761478163494e-05,
"loss": 25.3612,
"step": 10000
},
{
"epoch": 5.003271976483763,
"grad_norm": 40.334381103515625,
"learning_rate": 1.4130739081746921e-05,
"loss": 25.5031,
"step": 10100
},
{
"epoch": 5.00355193169093,
"grad_norm": 44.043067932128906,
"learning_rate": 1.4270716685330349e-05,
"loss": 25.2018,
"step": 10200
},
{
"epoch": 5.003831886898096,
"grad_norm": 39.68805694580078,
"learning_rate": 1.4410694288913776e-05,
"loss": 25.3125,
"step": 10300
},
{
"epoch": 5.004111842105263,
"grad_norm": 45.52452850341797,
"learning_rate": 1.45506718924972e-05,
"loss": 25.1784,
"step": 10400
},
{
"epoch": 5.00439179731243,
"grad_norm": 43.4058723449707,
"learning_rate": 1.4690649496080628e-05,
"loss": 25.1405,
"step": 10500
},
{
"epoch": 5.004671752519597,
"grad_norm": 43.045166015625,
"learning_rate": 1.4830627099664055e-05,
"loss": 25.0181,
"step": 10600
},
{
"epoch": 5.004951707726764,
"grad_norm": 44.4958381652832,
"learning_rate": 1.4970604703247481e-05,
"loss": 24.7006,
"step": 10700
},
{
"epoch": 5.004999300111982,
"eval_accuracy": 0.0702247191011236,
"eval_f1": 0.03296463425661127,
"eval_loss": 6.407422065734863,
"eval_precision": 0.030481743763272673,
"eval_recall": 0.0702247191011236,
"eval_runtime": 112.2695,
"eval_samples_per_second": 34.88,
"eval_steps_per_second": 17.44,
"eval_top_10_accuracy": 0.28651685393258425,
"eval_top_1_accuracy": 0.0702247191011236,
"eval_top_5_accuracy": 0.19816138917262513,
"step": 10717
},
{
"epoch": 6.0002309630459125,
"grad_norm": 46.9362907409668,
"learning_rate": 1.5110582306830909e-05,
"loss": 24.0997,
"step": 10800
},
{
"epoch": 6.000510918253079,
"grad_norm": 40.902610778808594,
"learning_rate": 1.5250559910414333e-05,
"loss": 24.1422,
"step": 10900
},
{
"epoch": 6.000790873460247,
"grad_norm": 44.164756774902344,
"learning_rate": 1.5389137737961927e-05,
"loss": 23.8297,
"step": 11000
},
{
"epoch": 6.001070828667413,
"grad_norm": 46.19126510620117,
"learning_rate": 1.5529115341545353e-05,
"loss": 23.9123,
"step": 11100
},
{
"epoch": 6.00135078387458,
"grad_norm": 42.22810363769531,
"learning_rate": 1.566909294512878e-05,
"loss": 23.7447,
"step": 11200
},
{
"epoch": 6.0016307390817465,
"grad_norm": 43.56826400756836,
"learning_rate": 1.5809070548712208e-05,
"loss": 23.7598,
"step": 11300
},
{
"epoch": 6.001910694288914,
"grad_norm": 44.37257385253906,
"learning_rate": 1.5949048152295633e-05,
"loss": 23.6209,
"step": 11400
},
{
"epoch": 6.002190649496081,
"grad_norm": 47.96746063232422,
"learning_rate": 1.6089025755879063e-05,
"loss": 23.9762,
"step": 11500
},
{
"epoch": 6.002470604703247,
"grad_norm": 46.42436599731445,
"learning_rate": 1.6229003359462485e-05,
"loss": 23.7592,
"step": 11600
},
{
"epoch": 6.002750559910415,
"grad_norm": 41.59648895263672,
"learning_rate": 1.6368980963045914e-05,
"loss": 23.4793,
"step": 11700
},
{
"epoch": 6.003030515117581,
"grad_norm": 43.254478454589844,
"learning_rate": 1.650895856662934e-05,
"loss": 23.5082,
"step": 11800
},
{
"epoch": 6.003310470324748,
"grad_norm": 42.162628173828125,
"learning_rate": 1.6648936170212766e-05,
"loss": 23.494,
"step": 11900
},
{
"epoch": 6.003590425531915,
"grad_norm": 47.11122512817383,
"learning_rate": 1.6788913773796195e-05,
"loss": 23.5767,
"step": 12000
},
{
"epoch": 6.003870380739082,
"grad_norm": 43.33349609375,
"learning_rate": 1.6928891377379617e-05,
"loss": 23.3579,
"step": 12100
},
{
"epoch": 6.004150335946249,
"grad_norm": 44.804664611816406,
"learning_rate": 1.7068868980963047e-05,
"loss": 23.2691,
"step": 12200
},
{
"epoch": 6.004430291153415,
"grad_norm": 43.124515533447266,
"learning_rate": 1.7208846584546473e-05,
"loss": 23.1194,
"step": 12300
},
{
"epoch": 6.004710246360582,
"grad_norm": 43.7430419921875,
"learning_rate": 1.7348824188129902e-05,
"loss": 23.0302,
"step": 12400
},
{
"epoch": 6.004990201567749,
"grad_norm": 52.04240798950195,
"learning_rate": 1.7488801791713328e-05,
"loss": 22.9951,
"step": 12500
},
{
"epoch": 6.004998600223964,
"eval_accuracy": 0.10342185903983657,
"eval_f1": 0.05378757190292093,
"eval_loss": 6.031359672546387,
"eval_precision": 0.05112106716669837,
"eval_recall": 0.10342185903983657,
"eval_runtime": 116.7973,
"eval_samples_per_second": 33.528,
"eval_steps_per_second": 16.764,
"eval_top_10_accuracy": 0.368488253319714,
"eval_top_1_accuracy": 0.10342185903983657,
"eval_top_5_accuracy": 0.26940755873340144,
"step": 12503
},
{
"epoch": 7.000269456886898,
"grad_norm": 44.359230041503906,
"learning_rate": 1.7628779395296753e-05,
"loss": 22.1196,
"step": 12600
},
{
"epoch": 7.000549412094065,
"grad_norm": 48.224517822265625,
"learning_rate": 1.776875699888018e-05,
"loss": 22.1125,
"step": 12700
},
{
"epoch": 7.000829367301232,
"grad_norm": 48.176475524902344,
"learning_rate": 1.7908734602463605e-05,
"loss": 21.6989,
"step": 12800
},
{
"epoch": 7.001109322508398,
"grad_norm": 46.10124588012695,
"learning_rate": 1.8048712206047034e-05,
"loss": 21.7023,
"step": 12900
},
{
"epoch": 7.001389277715566,
"grad_norm": 48.56136703491211,
"learning_rate": 1.818868980963046e-05,
"loss": 21.6049,
"step": 13000
},
{
"epoch": 7.001669232922732,
"grad_norm": 45.572750091552734,
"learning_rate": 1.8328667413213886e-05,
"loss": 21.7628,
"step": 13100
},
{
"epoch": 7.001949188129899,
"grad_norm": 46.16699981689453,
"learning_rate": 1.846724524076148e-05,
"loss": 21.8182,
"step": 13200
},
{
"epoch": 7.0022291433370665,
"grad_norm": 45.37190628051758,
"learning_rate": 1.8607222844344906e-05,
"loss": 21.6372,
"step": 13300
},
{
"epoch": 7.002509098544233,
"grad_norm": 48.9770622253418,
"learning_rate": 1.874720044792833e-05,
"loss": 21.6686,
"step": 13400
},
{
"epoch": 7.0027890537514,
"grad_norm": 45.02006530761719,
"learning_rate": 1.8887178051511757e-05,
"loss": 21.6113,
"step": 13500
},
{
"epoch": 7.003069008958566,
"grad_norm": 42.47629928588867,
"learning_rate": 1.9027155655095186e-05,
"loss": 21.3825,
"step": 13600
},
{
"epoch": 7.003348964165734,
"grad_norm": 49.70724868774414,
"learning_rate": 1.9167133258678612e-05,
"loss": 21.0877,
"step": 13700
},
{
"epoch": 7.0036289193729,
"grad_norm": 45.634117126464844,
"learning_rate": 1.930711086226204e-05,
"loss": 21.2323,
"step": 13800
},
{
"epoch": 7.003908874580067,
"grad_norm": 48.9789924621582,
"learning_rate": 1.9447088465845464e-05,
"loss": 20.915,
"step": 13900
},
{
"epoch": 7.004188829787234,
"grad_norm": 46.66007614135742,
"learning_rate": 1.9587066069428893e-05,
"loss": 20.9474,
"step": 14000
},
{
"epoch": 7.004468784994401,
"grad_norm": 48.38516616821289,
"learning_rate": 1.972704367301232e-05,
"loss": 21.0617,
"step": 14100
},
{
"epoch": 7.004748740201568,
"grad_norm": 52.5923957824707,
"learning_rate": 1.9867021276595745e-05,
"loss": 21.0796,
"step": 14200
},
{
"epoch": 7.005000699888018,
"eval_accuracy": 0.14555669050051073,
"eval_f1": 0.0839986463173031,
"eval_loss": 5.593417644500732,
"eval_precision": 0.08039314901923203,
"eval_recall": 0.14555669050051073,
"eval_runtime": 115.0353,
"eval_samples_per_second": 34.042,
"eval_steps_per_second": 17.021,
"eval_top_10_accuracy": 0.45965270684371806,
"eval_top_1_accuracy": 0.14555669050051073,
"eval_top_5_accuracy": 0.3455056179775281,
"step": 14290
},
{
"epoch": 8.000027995520716,
"grad_norm": 50.10724639892578,
"learning_rate": 2.0006998880179174e-05,
"loss": 20.794,
"step": 14300
},
{
"epoch": 8.000307950727883,
"grad_norm": 47.62452697753906,
"learning_rate": 2.01469764837626e-05,
"loss": 19.6639,
"step": 14400
},
{
"epoch": 8.00058790593505,
"grad_norm": 52.315673828125,
"learning_rate": 2.0286954087346026e-05,
"loss": 19.4833,
"step": 14500
},
{
"epoch": 8.000867861142217,
"grad_norm": 47.47260665893555,
"learning_rate": 2.042693169092945e-05,
"loss": 19.5356,
"step": 14600
},
{
"epoch": 8.001147816349384,
"grad_norm": 49.96697235107422,
"learning_rate": 2.0566909294512877e-05,
"loss": 19.4718,
"step": 14700
},
{
"epoch": 8.001427771556552,
"grad_norm": 52.33249282836914,
"learning_rate": 2.0706886898096306e-05,
"loss": 19.4401,
"step": 14800
},
{
"epoch": 8.001707726763717,
"grad_norm": 51.28528594970703,
"learning_rate": 2.0846864501679732e-05,
"loss": 19.3228,
"step": 14900
},
{
"epoch": 8.001987681970885,
"grad_norm": 55.38609313964844,
"learning_rate": 2.0986842105263158e-05,
"loss": 19.431,
"step": 15000
},
{
"epoch": 8.002267637178052,
"grad_norm": 44.709232330322266,
"learning_rate": 2.1126819708846584e-05,
"loss": 19.0222,
"step": 15100
},
{
"epoch": 8.002547592385218,
"grad_norm": 50.02477264404297,
"learning_rate": 2.1266797312430013e-05,
"loss": 19.1266,
"step": 15200
},
{
"epoch": 8.002827547592386,
"grad_norm": 53.93327331542969,
"learning_rate": 2.1405375139977604e-05,
"loss": 19.0785,
"step": 15300
},
{
"epoch": 8.003107502799551,
"grad_norm": 47.67805862426758,
"learning_rate": 2.1545352743561033e-05,
"loss": 19.1712,
"step": 15400
},
{
"epoch": 8.003387458006719,
"grad_norm": 51.58271789550781,
"learning_rate": 2.168533034714446e-05,
"loss": 19.1443,
"step": 15500
},
{
"epoch": 8.003667413213886,
"grad_norm": 50.68724822998047,
"learning_rate": 2.1825307950727885e-05,
"loss": 18.8917,
"step": 15600
},
{
"epoch": 8.003947368421052,
"grad_norm": 56.362327575683594,
"learning_rate": 2.196528555431131e-05,
"loss": 18.6252,
"step": 15700
},
{
"epoch": 8.00422732362822,
"grad_norm": 50.97798538208008,
"learning_rate": 2.2105263157894736e-05,
"loss": 18.6619,
"step": 15800
},
{
"epoch": 8.004507278835387,
"grad_norm": 47.420658111572266,
"learning_rate": 2.2245240761478165e-05,
"loss": 18.7196,
"step": 15900
},
{
"epoch": 8.004787234042553,
"grad_norm": 49.1622428894043,
"learning_rate": 2.238521836506159e-05,
"loss": 18.8279,
"step": 16000
},
{
"epoch": 8.005,
"eval_accuracy": 0.17288049029622063,
"eval_f1": 0.10595764407765323,
"eval_loss": 5.153491020202637,
"eval_precision": 0.10202419282977301,
"eval_recall": 0.17288049029622063,
"eval_runtime": 118.0289,
"eval_samples_per_second": 33.178,
"eval_steps_per_second": 16.589,
"eval_top_10_accuracy": 0.5298774259448417,
"eval_top_1_accuracy": 0.17288049029622063,
"eval_top_5_accuracy": 0.4090909090909091,
"step": 16076
},
{
"epoch": 9.000066489361702,
"grad_norm": 55.131221771240234,
"learning_rate": 2.252519596864502e-05,
"loss": 18.3135,
"step": 16100
},
{
"epoch": 9.00034644456887,
"grad_norm": 49.48210144042969,
"learning_rate": 2.2665173572228446e-05,
"loss": 17.3571,
"step": 16200
},
{
"epoch": 9.000626399776035,
"grad_norm": 48.50625228881836,
"learning_rate": 2.280515117581187e-05,
"loss": 17.119,
"step": 16300
},
{
"epoch": 9.000906354983202,
"grad_norm": 51.08823013305664,
"learning_rate": 2.2945128779395298e-05,
"loss": 17.2323,
"step": 16400
},
{
"epoch": 9.00118631019037,
"grad_norm": 47.95245361328125,
"learning_rate": 2.3085106382978724e-05,
"loss": 17.0125,
"step": 16500
},
{
"epoch": 9.001466265397536,
"grad_norm": 54.29073715209961,
"learning_rate": 2.3225083986562153e-05,
"loss": 16.7668,
"step": 16600
},
{
"epoch": 9.001746220604703,
"grad_norm": 48.46099090576172,
"learning_rate": 2.336506159014558e-05,
"loss": 16.9132,
"step": 16700
},
{
"epoch": 9.00202617581187,
"grad_norm": 56.089088439941406,
"learning_rate": 2.3505039193729004e-05,
"loss": 16.6505,
"step": 16800
},
{
"epoch": 9.002306131019036,
"grad_norm": 51.676971435546875,
"learning_rate": 2.364501679731243e-05,
"loss": 16.1727,
"step": 16900
},
{
"epoch": 9.002586086226204,
"grad_norm": 47.286293029785156,
"learning_rate": 2.3784994400895856e-05,
"loss": 16.6123,
"step": 17000
},
{
"epoch": 9.002866041433371,
"grad_norm": 50.159095764160156,
"learning_rate": 2.3924972004479285e-05,
"loss": 16.6695,
"step": 17100
},
{
"epoch": 9.003145996640537,
"grad_norm": 54.44445037841797,
"learning_rate": 2.406494960806271e-05,
"loss": 16.6419,
"step": 17200
},
{
"epoch": 9.003425951847705,
"grad_norm": 51.42140579223633,
"learning_rate": 2.420492721164614e-05,
"loss": 16.5902,
"step": 17300
},
{
"epoch": 9.00370590705487,
"grad_norm": 66.35865783691406,
"learning_rate": 2.4344904815229563e-05,
"loss": 16.2157,
"step": 17400
},
{
"epoch": 9.003985862262038,
"grad_norm": 56.74697494506836,
"learning_rate": 2.4484882418812992e-05,
"loss": 16.3898,
"step": 17500
},
{
"epoch": 9.004265817469205,
"grad_norm": 63.02745819091797,
"learning_rate": 2.4624860022396418e-05,
"loss": 16.0808,
"step": 17600
},
{
"epoch": 9.004545772676371,
"grad_norm": 54.27665328979492,
"learning_rate": 2.4764837625979844e-05,
"loss": 15.9748,
"step": 17700
},
{
"epoch": 9.004825727883539,
"grad_norm": 51.662681579589844,
"learning_rate": 2.4904815229563273e-05,
"loss": 16.0168,
"step": 17800
},
{
"epoch": 9.004999300111981,
"eval_accuracy": 0.2270173646578141,
"eval_f1": 0.15350337193318386,
"eval_loss": 4.687228679656982,
"eval_precision": 0.14829049623881735,
"eval_recall": 0.2270173646578141,
"eval_runtime": 116.4833,
"eval_samples_per_second": 33.619,
"eval_steps_per_second": 16.809,
"eval_top_10_accuracy": 0.6123595505617978,
"eval_top_1_accuracy": 0.2270173646578141,
"eval_top_5_accuracy": 0.486976506639428,
"step": 17862
},
{
"epoch": 10.000104983202688,
"grad_norm": 55.48484802246094,
"learning_rate": 2.5043393057110863e-05,
"loss": 15.4913,
"step": 17900
},
{
"epoch": 10.000384938409855,
"grad_norm": 54.82772445678711,
"learning_rate": 2.518337066069429e-05,
"loss": 14.5521,
"step": 18000
},
{
"epoch": 10.00066489361702,
"grad_norm": 57.67549514770508,
"learning_rate": 2.532334826427772e-05,
"loss": 14.5236,
"step": 18100
},
{
"epoch": 10.000944848824188,
"grad_norm": 58.726463317871094,
"learning_rate": 2.5463325867861144e-05,
"loss": 14.3662,
"step": 18200
},
{
"epoch": 10.001224804031356,
"grad_norm": 53.089969635009766,
"learning_rate": 2.5603303471444567e-05,
"loss": 14.2559,
"step": 18300
},
{
"epoch": 10.001504759238522,
"grad_norm": 57.39337921142578,
"learning_rate": 2.5743281075027996e-05,
"loss": 14.4143,
"step": 18400
},
{
"epoch": 10.001784714445689,
"grad_norm": 56.75421905517578,
"learning_rate": 2.588325867861142e-05,
"loss": 14.2805,
"step": 18500
},
{
"epoch": 10.002064669652855,
"grad_norm": 52.258358001708984,
"learning_rate": 2.602323628219485e-05,
"loss": 14.1713,
"step": 18600
},
{
"epoch": 10.002344624860022,
"grad_norm": 47.68207550048828,
"learning_rate": 2.6163213885778277e-05,
"loss": 13.8049,
"step": 18700
},
{
"epoch": 10.00262458006719,
"grad_norm": 52.4375,
"learning_rate": 2.6303191489361706e-05,
"loss": 13.8381,
"step": 18800
},
{
"epoch": 10.002904535274356,
"grad_norm": 52.6395378112793,
"learning_rate": 2.644316909294513e-05,
"loss": 13.5344,
"step": 18900
},
{
"epoch": 10.003184490481523,
"grad_norm": 59.29886245727539,
"learning_rate": 2.6583146696528554e-05,
"loss": 13.7621,
"step": 19000
},
{
"epoch": 10.00346444568869,
"grad_norm": 49.460113525390625,
"learning_rate": 2.6723124300111983e-05,
"loss": 13.7878,
"step": 19100
},
{
"epoch": 10.003744400895856,
"grad_norm": 56.663902282714844,
"learning_rate": 2.686310190369541e-05,
"loss": 13.856,
"step": 19200
},
{
"epoch": 10.004024356103024,
"grad_norm": 58.83359909057617,
"learning_rate": 2.700307950727884e-05,
"loss": 13.6796,
"step": 19300
},
{
"epoch": 10.00430431131019,
"grad_norm": 60.854942321777344,
"learning_rate": 2.7143057110862264e-05,
"loss": 13.3986,
"step": 19400
},
{
"epoch": 10.004584266517357,
"grad_norm": 48.193965911865234,
"learning_rate": 2.7283034714445687e-05,
"loss": 13.606,
"step": 19500
},
{
"epoch": 10.004864221724524,
"grad_norm": 57.63452911376953,
"learning_rate": 2.742301231802912e-05,
"loss": 13.4662,
"step": 19600
},
{
"epoch": 10.004998600223963,
"eval_accuracy": 0.27068437180796734,
"eval_f1": 0.19264529608572983,
"eval_loss": 4.217360019683838,
"eval_precision": 0.18252703447090352,
"eval_recall": 0.27068437180796734,
"eval_runtime": 119.985,
"eval_samples_per_second": 32.637,
"eval_steps_per_second": 16.319,
"eval_top_10_accuracy": 0.6802860061287027,
"eval_top_1_accuracy": 0.2704290091930541,
"eval_top_5_accuracy": 0.5584780388151175,
"step": 19648
},
{
"epoch": 11.000143477043673,
"grad_norm": 45.823238372802734,
"learning_rate": 2.756298992161254e-05,
"loss": 12.3464,
"step": 19700
},
{
"epoch": 11.00042343225084,
"grad_norm": 63.88374328613281,
"learning_rate": 2.770296752519597e-05,
"loss": 11.6032,
"step": 19800
},
{
"epoch": 11.000703387458007,
"grad_norm": 64.50922393798828,
"learning_rate": 2.7841545352743565e-05,
"loss": 11.5978,
"step": 19900
},
{
"epoch": 11.000983342665174,
"grad_norm": 58.88920211791992,
"learning_rate": 2.7981522956326987e-05,
"loss": 11.3596,
"step": 20000
},
{
"epoch": 11.00126329787234,
"grad_norm": 56.350189208984375,
"learning_rate": 2.8121500559910413e-05,
"loss": 11.7526,
"step": 20100
},
{
"epoch": 11.001543253079507,
"grad_norm": 56.470664978027344,
"learning_rate": 2.8261478163493842e-05,
"loss": 11.2894,
"step": 20200
},
{
"epoch": 11.001823208286675,
"grad_norm": 69.60688781738281,
"learning_rate": 2.8401455767077268e-05,
"loss": 11.3725,
"step": 20300
},
{
"epoch": 11.00210316349384,
"grad_norm": 52.4871711730957,
"learning_rate": 2.8541433370660697e-05,
"loss": 11.4314,
"step": 20400
},
{
"epoch": 11.002383118701008,
"grad_norm": 55.38204574584961,
"learning_rate": 2.8681410974244123e-05,
"loss": 11.2979,
"step": 20500
},
{
"epoch": 11.002663073908174,
"grad_norm": 75.11087036132812,
"learning_rate": 2.8821388577827552e-05,
"loss": 11.306,
"step": 20600
},
{
"epoch": 11.002943029115341,
"grad_norm": 57.3330078125,
"learning_rate": 2.8961366181410975e-05,
"loss": 11.0933,
"step": 20700
},
{
"epoch": 11.003222984322509,
"grad_norm": 54.95749282836914,
"learning_rate": 2.91013437849944e-05,
"loss": 11.3495,
"step": 20800
},
{
"epoch": 11.003502939529675,
"grad_norm": 64.22066497802734,
"learning_rate": 2.924132138857783e-05,
"loss": 11.4248,
"step": 20900
},
{
"epoch": 11.003782894736842,
"grad_norm": 59.43482208251953,
"learning_rate": 2.9381298992161256e-05,
"loss": 11.1135,
"step": 21000
},
{
"epoch": 11.00406284994401,
"grad_norm": 53.853492736816406,
"learning_rate": 2.9521276595744685e-05,
"loss": 11.0782,
"step": 21100
},
{
"epoch": 11.004342805151175,
"grad_norm": 63.04794692993164,
"learning_rate": 2.966125419932811e-05,
"loss": 10.8959,
"step": 21200
},
{
"epoch": 11.004622760358343,
"grad_norm": 59.868072509765625,
"learning_rate": 2.97998320268757e-05,
"loss": 11.1934,
"step": 21300
},
{
"epoch": 11.00490271556551,
"grad_norm": 62.26585006713867,
"learning_rate": 2.9939809630459127e-05,
"loss": 10.8825,
"step": 21400
},
{
"epoch": 11.005000699888019,
"eval_accuracy": 0.3166496424923391,
"eval_f1": 0.24054554724380248,
"eval_loss": 3.7936577796936035,
"eval_precision": 0.2340502427789698,
"eval_recall": 0.3166496424923391,
"eval_runtime": 113.6495,
"eval_samples_per_second": 34.457,
"eval_steps_per_second": 17.228,
"eval_top_10_accuracy": 0.7288049029622063,
"eval_top_1_accuracy": 0.3158835546475996,
"eval_top_5_accuracy": 0.6182328907048008,
"step": 21435
},
{
"epoch": 12.00018197088466,
"grad_norm": 61.41596984863281,
"learning_rate": 3.0079787234042556e-05,
"loss": 9.9124,
"step": 21500
},
{
"epoch": 12.000461926091825,
"grad_norm": 44.31859588623047,
"learning_rate": 3.021976483762598e-05,
"loss": 9.0461,
"step": 21600
},
{
"epoch": 12.000741881298993,
"grad_norm": 52.90311813354492,
"learning_rate": 3.035974244120941e-05,
"loss": 8.8258,
"step": 21700
},
{
"epoch": 12.001021836506158,
"grad_norm": 54.84385299682617,
"learning_rate": 3.0499720044792834e-05,
"loss": 8.6913,
"step": 21800
},
{
"epoch": 12.001301791713326,
"grad_norm": 56.61054611206055,
"learning_rate": 3.063969764837626e-05,
"loss": 9.0271,
"step": 21900
},
{
"epoch": 12.001581746920493,
"grad_norm": 68.57939147949219,
"learning_rate": 3.077967525195969e-05,
"loss": 9.0834,
"step": 22000
},
{
"epoch": 12.001861702127659,
"grad_norm": 42.94862747192383,
"learning_rate": 3.091965285554311e-05,
"loss": 8.8857,
"step": 22100
},
{
"epoch": 12.002141657334827,
"grad_norm": 58.98466491699219,
"learning_rate": 3.1059630459126544e-05,
"loss": 8.98,
"step": 22200
},
{
"epoch": 12.002421612541994,
"grad_norm": 46.89601135253906,
"learning_rate": 3.119960806270997e-05,
"loss": 8.6856,
"step": 22300
},
{
"epoch": 12.00270156774916,
"grad_norm": 60.492645263671875,
"learning_rate": 3.1339585666293395e-05,
"loss": 8.4309,
"step": 22400
},
{
"epoch": 12.002981522956327,
"grad_norm": 55.55988693237305,
"learning_rate": 3.147956326987682e-05,
"loss": 9.0933,
"step": 22500
},
{
"epoch": 12.003261478163493,
"grad_norm": 59.575557708740234,
"learning_rate": 3.161954087346025e-05,
"loss": 8.7039,
"step": 22600
},
{
"epoch": 12.00354143337066,
"grad_norm": 66.46175384521484,
"learning_rate": 3.175951847704367e-05,
"loss": 8.8195,
"step": 22700
},
{
"epoch": 12.003821388577828,
"grad_norm": 57.742042541503906,
"learning_rate": 3.18994960806271e-05,
"loss": 8.4553,
"step": 22800
},
{
"epoch": 12.004101343784994,
"grad_norm": 56.0836181640625,
"learning_rate": 3.203947368421053e-05,
"loss": 8.6364,
"step": 22900
},
{
"epoch": 12.004381298992161,
"grad_norm": 75.34262084960938,
"learning_rate": 3.217945128779396e-05,
"loss": 8.6335,
"step": 23000
},
{
"epoch": 12.004661254199329,
"grad_norm": 60.73629379272461,
"learning_rate": 3.2319428891377376e-05,
"loss": 8.6328,
"step": 23100
},
{
"epoch": 12.004941209406494,
"grad_norm": 46.728450775146484,
"learning_rate": 3.245940649496081e-05,
"loss": 8.493,
"step": 23200
},
{
"epoch": 12.005,
"eval_accuracy": 0.3406537282941777,
"eval_f1": 0.27469889034445055,
"eval_loss": 3.452850103378296,
"eval_precision": 0.26851533056393956,
"eval_recall": 0.3406537282941777,
"eval_runtime": 114.9185,
"eval_samples_per_second": 34.076,
"eval_steps_per_second": 17.038,
"eval_top_10_accuracy": 0.7602145045965271,
"eval_top_1_accuracy": 0.34014300306435136,
"eval_top_5_accuracy": 0.6618998978549541,
"step": 23221
},
{
"epoch": 13.000220464725643,
"grad_norm": 60.434505462646484,
"learning_rate": 3.2599384098544234e-05,
"loss": 7.3664,
"step": 23300
},
{
"epoch": 13.000500419932811,
"grad_norm": 44.0378532409668,
"learning_rate": 3.273936170212766e-05,
"loss": 6.6296,
"step": 23400
},
{
"epoch": 13.000780375139978,
"grad_norm": 50.515316009521484,
"learning_rate": 3.2879339305711086e-05,
"loss": 6.7951,
"step": 23500
},
{
"epoch": 13.001060330347144,
"grad_norm": 60.511566162109375,
"learning_rate": 3.301931690929451e-05,
"loss": 6.8428,
"step": 23600
},
{
"epoch": 13.001340285554312,
"grad_norm": 50.42853546142578,
"learning_rate": 3.3159294512877944e-05,
"loss": 6.8744,
"step": 23700
},
{
"epoch": 13.001620240761477,
"grad_norm": 53.49345016479492,
"learning_rate": 3.3299272116461363e-05,
"loss": 6.7632,
"step": 23800
},
{
"epoch": 13.001900195968645,
"grad_norm": 61.3757438659668,
"learning_rate": 3.3439249720044796e-05,
"loss": 6.7297,
"step": 23900
},
{
"epoch": 13.002180151175812,
"grad_norm": 53.99778747558594,
"learning_rate": 3.357922732362822e-05,
"loss": 6.6704,
"step": 24000
},
{
"epoch": 13.002460106382978,
"grad_norm": 55.42316818237305,
"learning_rate": 3.371920492721165e-05,
"loss": 6.6194,
"step": 24100
},
{
"epoch": 13.002740061590146,
"grad_norm": 52.400909423828125,
"learning_rate": 3.3859182530795074e-05,
"loss": 6.4215,
"step": 24200
},
{
"epoch": 13.003020016797313,
"grad_norm": 52.664127349853516,
"learning_rate": 3.39991601343785e-05,
"loss": 6.8347,
"step": 24300
},
{
"epoch": 13.003299972004479,
"grad_norm": 54.587646484375,
"learning_rate": 3.413913773796193e-05,
"loss": 6.3553,
"step": 24400
},
{
"epoch": 13.003579927211646,
"grad_norm": 57.914894104003906,
"learning_rate": 3.427911534154535e-05,
"loss": 6.9907,
"step": 24500
},
{
"epoch": 13.003859882418814,
"grad_norm": 53.43699645996094,
"learning_rate": 3.4419092945128784e-05,
"loss": 6.2836,
"step": 24600
},
{
"epoch": 13.00413983762598,
"grad_norm": 51.49525833129883,
"learning_rate": 3.455907054871221e-05,
"loss": 6.4987,
"step": 24700
},
{
"epoch": 13.004419792833147,
"grad_norm": 66.59857177734375,
"learning_rate": 3.4699048152295635e-05,
"loss": 6.4633,
"step": 24800
},
{
"epoch": 13.004699748040313,
"grad_norm": 59.3377685546875,
"learning_rate": 3.483902575587906e-05,
"loss": 6.4775,
"step": 24900
},
{
"epoch": 13.00497970324748,
"grad_norm": 53.3898811340332,
"learning_rate": 3.497900335946249e-05,
"loss": 6.3864,
"step": 25000
},
{
"epoch": 13.004999300111981,
"eval_accuracy": 0.3631256384065373,
"eval_f1": 0.3063358878242328,
"eval_loss": 3.142681837081909,
"eval_precision": 0.3058190177087762,
"eval_recall": 0.3631256384065373,
"eval_runtime": 117.2715,
"eval_samples_per_second": 33.393,
"eval_steps_per_second": 16.696,
"eval_top_10_accuracy": 0.7852400408580184,
"eval_top_1_accuracy": 0.3631256384065373,
"eval_top_5_accuracy": 0.6973953013278856,
"step": 25007
},
{
"epoch": 14.00025895856663,
"grad_norm": 56.198509216308594,
"learning_rate": 3.511898096304592e-05,
"loss": 4.7395,
"step": 25100
},
{
"epoch": 14.000538913773797,
"grad_norm": 34.69329833984375,
"learning_rate": 3.525895856662934e-05,
"loss": 4.5337,
"step": 25200
},
{
"epoch": 14.000818868980963,
"grad_norm": 40.5659065246582,
"learning_rate": 3.5398936170212764e-05,
"loss": 4.9053,
"step": 25300
},
{
"epoch": 14.00109882418813,
"grad_norm": 54.99299240112305,
"learning_rate": 3.553751399776036e-05,
"loss": 4.9332,
"step": 25400
},
{
"epoch": 14.001378779395298,
"grad_norm": 65.95709228515625,
"learning_rate": 3.567749160134379e-05,
"loss": 4.9058,
"step": 25500
},
{
"epoch": 14.001658734602463,
"grad_norm": 64.16633605957031,
"learning_rate": 3.581746920492721e-05,
"loss": 4.9348,
"step": 25600
},
{
"epoch": 14.00193868980963,
"grad_norm": 61.57122039794922,
"learning_rate": 3.595744680851064e-05,
"loss": 4.7432,
"step": 25700
},
{
"epoch": 14.002218645016796,
"grad_norm": 47.70122528076172,
"learning_rate": 3.609742441209407e-05,
"loss": 4.7825,
"step": 25800
},
{
"epoch": 14.002498600223964,
"grad_norm": 42.792518615722656,
"learning_rate": 3.623740201567749e-05,
"loss": 4.7859,
"step": 25900
},
{
"epoch": 14.002778555431131,
"grad_norm": 54.17988204956055,
"learning_rate": 3.637737961926092e-05,
"loss": 4.8806,
"step": 26000
},
{
"epoch": 14.003058510638297,
"grad_norm": 55.54975509643555,
"learning_rate": 3.6517357222844346e-05,
"loss": 5.0199,
"step": 26100
},
{
"epoch": 14.003338465845465,
"grad_norm": 53.45595932006836,
"learning_rate": 3.665733482642777e-05,
"loss": 4.8493,
"step": 26200
},
{
"epoch": 14.003618421052632,
"grad_norm": 59.02440643310547,
"learning_rate": 3.67973124300112e-05,
"loss": 5.0224,
"step": 26300
},
{
"epoch": 14.003898376259798,
"grad_norm": 51.76025390625,
"learning_rate": 3.693729003359463e-05,
"loss": 4.7993,
"step": 26400
},
{
"epoch": 14.004178331466965,
"grad_norm": 57.27727508544922,
"learning_rate": 3.7077267637178056e-05,
"loss": 4.6562,
"step": 26500
},
{
"epoch": 14.004458286674133,
"grad_norm": 57.676143646240234,
"learning_rate": 3.7217245240761475e-05,
"loss": 4.848,
"step": 26600
},
{
"epoch": 14.004738241881299,
"grad_norm": 42.8398551940918,
"learning_rate": 3.735722284434491e-05,
"loss": 4.8598,
"step": 26700
},
{
"epoch": 14.004998600223963,
"eval_accuracy": 0.3672114402451481,
"eval_f1": 0.31953414391226087,
"eval_loss": 2.9818193912506104,
"eval_precision": 0.32370179182577785,
"eval_recall": 0.3672114402451481,
"eval_runtime": 116.3065,
"eval_samples_per_second": 33.67,
"eval_steps_per_second": 16.835,
"eval_top_10_accuracy": 0.8008171603677222,
"eval_top_1_accuracy": 0.3669560776302349,
"eval_top_5_accuracy": 0.6996935648621042,
"step": 26793
},
{
"epoch": 15.000017497200448,
"grad_norm": 55.68752670288086,
"learning_rate": 3.749720044792833e-05,
"loss": 4.6942,
"step": 26800
},
{
"epoch": 15.000297452407615,
"grad_norm": 47.778018951416016,
"learning_rate": 3.763717805151176e-05,
"loss": 3.488,
"step": 26900
},
{
"epoch": 15.00057740761478,
"grad_norm": 50.85312271118164,
"learning_rate": 3.7777155655095185e-05,
"loss": 3.2244,
"step": 27000
},
{
"epoch": 15.000857362821948,
"grad_norm": 51.211734771728516,
"learning_rate": 3.791713325867861e-05,
"loss": 3.3155,
"step": 27100
},
{
"epoch": 15.001137318029116,
"grad_norm": 39.02397918701172,
"learning_rate": 3.805711086226204e-05,
"loss": 3.47,
"step": 27200
},
{
"epoch": 15.001417273236282,
"grad_norm": 55.1838264465332,
"learning_rate": 3.819708846584546e-05,
"loss": 3.4093,
"step": 27300
},
{
"epoch": 15.001697228443449,
"grad_norm": 53.85865783691406,
"learning_rate": 3.8335666293393056e-05,
"loss": 3.3777,
"step": 27400
},
{
"epoch": 15.001977183650617,
"grad_norm": 52.40718460083008,
"learning_rate": 3.847564389697649e-05,
"loss": 3.7357,
"step": 27500
},
{
"epoch": 15.002257138857782,
"grad_norm": 42.77007293701172,
"learning_rate": 3.8615621500559915e-05,
"loss": 3.5572,
"step": 27600
},
{
"epoch": 15.00253709406495,
"grad_norm": 75.84447479248047,
"learning_rate": 3.8755599104143334e-05,
"loss": 3.65,
"step": 27700
},
{
"epoch": 15.002817049272117,
"grad_norm": 39.93196105957031,
"learning_rate": 3.8895576707726766e-05,
"loss": 3.3187,
"step": 27800
},
{
"epoch": 15.003097004479283,
"grad_norm": 55.67115020751953,
"learning_rate": 3.903555431131019e-05,
"loss": 3.3455,
"step": 27900
},
{
"epoch": 15.00337695968645,
"grad_norm": 32.676544189453125,
"learning_rate": 3.917553191489362e-05,
"loss": 3.6407,
"step": 28000
},
{
"epoch": 15.003656914893616,
"grad_norm": 51.30035400390625,
"learning_rate": 3.9315509518477044e-05,
"loss": 3.6281,
"step": 28100
},
{
"epoch": 15.003936870100784,
"grad_norm": 77.08026123046875,
"learning_rate": 3.9455487122060476e-05,
"loss": 3.4781,
"step": 28200
},
{
"epoch": 15.004216825307951,
"grad_norm": 41.604736328125,
"learning_rate": 3.95954647256439e-05,
"loss": 3.4542,
"step": 28300
},
{
"epoch": 15.004496780515117,
"grad_norm": 39.74190902709961,
"learning_rate": 3.973544232922732e-05,
"loss": 3.5986,
"step": 28400
},
{
"epoch": 15.004776735722285,
"grad_norm": 44.98125076293945,
"learning_rate": 3.9875419932810754e-05,
"loss": 3.5894,
"step": 28500
},
{
"epoch": 15.005000699888019,
"eval_accuracy": 0.3927477017364658,
"eval_f1": 0.34565750681985735,
"eval_loss": 2.7899587154388428,
"eval_precision": 0.349307388786449,
"eval_recall": 0.3927477017364658,
"eval_runtime": 114.0889,
"eval_samples_per_second": 34.324,
"eval_steps_per_second": 17.162,
"eval_top_10_accuracy": 0.8207354443309499,
"eval_top_1_accuracy": 0.3924923391215526,
"eval_top_5_accuracy": 0.7359550561797753,
"step": 28580
},
{
"epoch": 16.00005599104143,
"grad_norm": 45.47757339477539,
"learning_rate": 4.001539753639418e-05,
"loss": 3.3691,
"step": 28600
},
{
"epoch": 16.0003359462486,
"grad_norm": 52.39859390258789,
"learning_rate": 4.0155375139977605e-05,
"loss": 2.2598,
"step": 28700
},
{
"epoch": 16.000615901455767,
"grad_norm": 42.28738784790039,
"learning_rate": 4.029535274356103e-05,
"loss": 2.4562,
"step": 28800
},
{
"epoch": 16.000895856662932,
"grad_norm": 49.735939025878906,
"learning_rate": 4.043533034714446e-05,
"loss": 2.5948,
"step": 28900
},
{
"epoch": 16.0011758118701,
"grad_norm": 54.75900650024414,
"learning_rate": 4.057530795072789e-05,
"loss": 2.5475,
"step": 29000
},
{
"epoch": 16.001455767077267,
"grad_norm": 54.90763473510742,
"learning_rate": 4.071528555431131e-05,
"loss": 2.519,
"step": 29100
},
{
"epoch": 16.001735722284433,
"grad_norm": 61.103111267089844,
"learning_rate": 4.085526315789474e-05,
"loss": 2.4507,
"step": 29200
},
{
"epoch": 16.002015677491602,
"grad_norm": 47.821693420410156,
"learning_rate": 4.099524076147817e-05,
"loss": 2.5811,
"step": 29300
},
{
"epoch": 16.002295632698768,
"grad_norm": 44.895851135253906,
"learning_rate": 4.113521836506159e-05,
"loss": 2.6464,
"step": 29400
},
{
"epoch": 16.002575587905934,
"grad_norm": 24.25828742980957,
"learning_rate": 4.127379619260918e-05,
"loss": 2.5047,
"step": 29500
},
{
"epoch": 16.002855543113103,
"grad_norm": 54.69134521484375,
"learning_rate": 4.141377379619261e-05,
"loss": 2.6585,
"step": 29600
},
{
"epoch": 16.00313549832027,
"grad_norm": 61.3412971496582,
"learning_rate": 4.155375139977604e-05,
"loss": 2.6409,
"step": 29700
},
{
"epoch": 16.003415453527435,
"grad_norm": 52.66725540161133,
"learning_rate": 4.1693729003359464e-05,
"loss": 2.7054,
"step": 29800
},
{
"epoch": 16.003695408734604,
"grad_norm": 53.61050033569336,
"learning_rate": 4.183370660694289e-05,
"loss": 2.6963,
"step": 29900
},
{
"epoch": 16.00397536394177,
"grad_norm": 53.02050018310547,
"learning_rate": 4.1973684210526316e-05,
"loss": 2.6771,
"step": 30000
},
{
"epoch": 16.004255319148935,
"grad_norm": 61.56930160522461,
"learning_rate": 4.211366181410974e-05,
"loss": 2.7266,
"step": 30100
},
{
"epoch": 16.004535274356105,
"grad_norm": 52.07630920410156,
"learning_rate": 4.225363941769317e-05,
"loss": 2.7177,
"step": 30200
},
{
"epoch": 16.00481522956327,
"grad_norm": 40.4132080078125,
"learning_rate": 4.23936170212766e-05,
"loss": 2.7053,
"step": 30300
},
{
"epoch": 16.005,
"eval_accuracy": 0.39198161389172625,
"eval_f1": 0.348649355978263,
"eval_loss": 2.7045156955718994,
"eval_precision": 0.3579045415485661,
"eval_recall": 0.39198161389172625,
"eval_runtime": 115.0059,
"eval_samples_per_second": 34.05,
"eval_steps_per_second": 17.025,
"eval_top_10_accuracy": 0.8296731358529111,
"eval_top_1_accuracy": 0.39198161389172625,
"eval_top_5_accuracy": 0.7402962206332993,
"step": 30366
},
{
"epoch": 17.000094484882418,
"grad_norm": 53.307334899902344,
"learning_rate": 4.2532194848824194e-05,
"loss": 2.7488,
"step": 30400
},
{
"epoch": 17.000374440089587,
"grad_norm": 52.3875732421875,
"learning_rate": 4.267217245240761e-05,
"loss": 1.6991,
"step": 30500
},
{
"epoch": 17.000654395296753,
"grad_norm": 13.349233627319336,
"learning_rate": 4.2812150055991046e-05,
"loss": 1.7715,
"step": 30600
},
{
"epoch": 17.00093435050392,
"grad_norm": 50.092220306396484,
"learning_rate": 4.295212765957447e-05,
"loss": 1.8627,
"step": 30700
},
{
"epoch": 17.001214305711088,
"grad_norm": 41.946617126464844,
"learning_rate": 4.30921052631579e-05,
"loss": 1.8379,
"step": 30800
},
{
"epoch": 17.001494260918253,
"grad_norm": 31.524587631225586,
"learning_rate": 4.323208286674132e-05,
"loss": 1.7017,
"step": 30900
},
{
"epoch": 17.00177421612542,
"grad_norm": 27.594568252563477,
"learning_rate": 4.337206047032475e-05,
"loss": 1.9809,
"step": 31000
},
{
"epoch": 17.00205417133259,
"grad_norm": 35.60299301147461,
"learning_rate": 4.351203807390818e-05,
"loss": 2.1175,
"step": 31100
},
{
"epoch": 17.002334126539754,
"grad_norm": 53.827823638916016,
"learning_rate": 4.36520156774916e-05,
"loss": 2.0042,
"step": 31200
},
{
"epoch": 17.00261408174692,
"grad_norm": 49.536380767822266,
"learning_rate": 4.379199328107503e-05,
"loss": 1.9332,
"step": 31300
},
{
"epoch": 17.00289403695409,
"grad_norm": 31.038576126098633,
"learning_rate": 4.393197088465846e-05,
"loss": 2.1337,
"step": 31400
},
{
"epoch": 17.003173992161255,
"grad_norm": 62.93170928955078,
"learning_rate": 4.4071948488241885e-05,
"loss": 2.1531,
"step": 31500
},
{
"epoch": 17.00345394736842,
"grad_norm": 66.991455078125,
"learning_rate": 4.421192609182531e-05,
"loss": 2.003,
"step": 31600
},
{
"epoch": 17.003733902575586,
"grad_norm": 44.7550048828125,
"learning_rate": 4.435190369540874e-05,
"loss": 2.3327,
"step": 31700
},
{
"epoch": 17.004013857782756,
"grad_norm": 79.54408264160156,
"learning_rate": 4.449188129899216e-05,
"loss": 2.3132,
"step": 31800
},
{
"epoch": 17.00429381298992,
"grad_norm": 57.92308807373047,
"learning_rate": 4.463185890257559e-05,
"loss": 2.4269,
"step": 31900
},
{
"epoch": 17.004573768197087,
"grad_norm": 36.562435150146484,
"learning_rate": 4.4771836506159014e-05,
"loss": 2.1766,
"step": 32000
},
{
"epoch": 17.004853723404256,
"grad_norm": 45.46358871459961,
"learning_rate": 4.491181410974245e-05,
"loss": 2.0517,
"step": 32100
},
{
"epoch": 17.004999300111983,
"eval_accuracy": 0.3878958120531154,
"eval_f1": 0.34810126375123673,
"eval_loss": 2.733853816986084,
"eval_precision": 0.35983602993306774,
"eval_recall": 0.3878958120531154,
"eval_runtime": 118.3652,
"eval_samples_per_second": 33.084,
"eval_steps_per_second": 16.542,
"eval_top_10_accuracy": 0.8204800817160368,
"eval_top_1_accuracy": 0.3884065372829418,
"eval_top_5_accuracy": 0.7344228804902962,
"step": 32152
},
{
"epoch": 18.000132978723403,
"grad_norm": 26.470172882080078,
"learning_rate": 4.505179171332587e-05,
"loss": 1.7708,
"step": 32200
},
{
"epoch": 18.000412933930573,
"grad_norm": 28.779521942138672,
"learning_rate": 4.51917693169093e-05,
"loss": 1.3716,
"step": 32300
},
{
"epoch": 18.00069288913774,
"grad_norm": 63.082313537597656,
"learning_rate": 4.5331746920492724e-05,
"loss": 1.5498,
"step": 32400
},
{
"epoch": 18.000972844344904,
"grad_norm": 43.071693420410156,
"learning_rate": 4.547172452407615e-05,
"loss": 1.6103,
"step": 32500
},
{
"epoch": 18.00125279955207,
"grad_norm": 46.50571823120117,
"learning_rate": 4.5611702127659576e-05,
"loss": 1.493,
"step": 32600
},
{
"epoch": 18.00153275475924,
"grad_norm": 76.01538848876953,
"learning_rate": 4.5751679731243e-05,
"loss": 1.5295,
"step": 32700
},
{
"epoch": 18.001812709966405,
"grad_norm": 11.543736457824707,
"learning_rate": 4.5891657334826434e-05,
"loss": 1.8705,
"step": 32800
},
{
"epoch": 18.00209266517357,
"grad_norm": 39.0593147277832,
"learning_rate": 4.603163493840986e-05,
"loss": 1.6423,
"step": 32900
},
{
"epoch": 18.00237262038074,
"grad_norm": 46.54063415527344,
"learning_rate": 4.617161254199328e-05,
"loss": 1.6499,
"step": 33000
},
{
"epoch": 18.002652575587906,
"grad_norm": 45.89179992675781,
"learning_rate": 4.631159014557671e-05,
"loss": 1.7307,
"step": 33100
},
{
"epoch": 18.00293253079507,
"grad_norm": 37.720703125,
"learning_rate": 4.645156774916014e-05,
"loss": 1.8645,
"step": 33200
},
{
"epoch": 18.00321248600224,
"grad_norm": 37.555179595947266,
"learning_rate": 4.659154535274356e-05,
"loss": 1.7513,
"step": 33300
},
{
"epoch": 18.003492441209406,
"grad_norm": 30.639272689819336,
"learning_rate": 4.673152295632699e-05,
"loss": 1.8245,
"step": 33400
},
{
"epoch": 18.003772396416572,
"grad_norm": 41.111690521240234,
"learning_rate": 4.6871500559910415e-05,
"loss": 1.7562,
"step": 33500
},
{
"epoch": 18.00405235162374,
"grad_norm": 27.824993133544922,
"learning_rate": 4.701147816349384e-05,
"loss": 1.7368,
"step": 33600
},
{
"epoch": 18.004332306830907,
"grad_norm": 11.345212936401367,
"learning_rate": 4.7151455767077266e-05,
"loss": 1.8328,
"step": 33700
},
{
"epoch": 18.004612262038073,
"grad_norm": 53.33864974975586,
"learning_rate": 4.72914333706607e-05,
"loss": 2.1876,
"step": 33800
},
{
"epoch": 18.004892217245242,
"grad_norm": 58.86853790283203,
"learning_rate": 4.7431410974244125e-05,
"loss": 1.9862,
"step": 33900
},
{
"epoch": 18.004998600223963,
"eval_accuracy": 0.3817671092951992,
"eval_f1": 0.34462121977443727,
"eval_loss": 2.774939775466919,
"eval_precision": 0.3614003286181529,
"eval_recall": 0.3817671092951992,
"eval_runtime": 118.5272,
"eval_samples_per_second": 33.039,
"eval_steps_per_second": 16.519,
"eval_top_10_accuracy": 0.8171603677221655,
"eval_top_1_accuracy": 0.38202247191011235,
"eval_top_5_accuracy": 0.7285495403472931,
"step": 33938
},
{
"epoch": 19.00017147256439,
"grad_norm": 31.951032638549805,
"learning_rate": 4.757138857782755e-05,
"loss": 1.6611,
"step": 34000
},
{
"epoch": 19.000451427771555,
"grad_norm": 17.33244514465332,
"learning_rate": 4.7711366181410976e-05,
"loss": 1.0391,
"step": 34100
},
{
"epoch": 19.000731382978724,
"grad_norm": 46.239593505859375,
"learning_rate": 4.78513437849944e-05,
"loss": 1.2688,
"step": 34200
},
{
"epoch": 19.00101133818589,
"grad_norm": 26.96588706970215,
"learning_rate": 4.799132138857783e-05,
"loss": 1.4947,
"step": 34300
},
{
"epoch": 19.001291293393056,
"grad_norm": 34.70318603515625,
"learning_rate": 4.812989921612542e-05,
"loss": 1.3438,
"step": 34400
},
{
"epoch": 19.001571248600225,
"grad_norm": 14.275991439819336,
"learning_rate": 4.826987681970885e-05,
"loss": 1.5921,
"step": 34500
},
{
"epoch": 19.00185120380739,
"grad_norm": 20.836231231689453,
"learning_rate": 4.840985442329228e-05,
"loss": 1.4238,
"step": 34600
},
{
"epoch": 19.002131159014557,
"grad_norm": 45.50111389160156,
"learning_rate": 4.85498320268757e-05,
"loss": 1.4567,
"step": 34700
},
{
"epoch": 19.002411114221726,
"grad_norm": 17.043527603149414,
"learning_rate": 4.8689809630459125e-05,
"loss": 1.5576,
"step": 34800
},
{
"epoch": 19.00269106942889,
"grad_norm": 71.65300750732422,
"learning_rate": 4.882978723404256e-05,
"loss": 1.5049,
"step": 34900
},
{
"epoch": 19.002971024636057,
"grad_norm": 50.4085807800293,
"learning_rate": 4.8969764837625984e-05,
"loss": 1.671,
"step": 35000
},
{
"epoch": 19.003250979843227,
"grad_norm": 41.303321838378906,
"learning_rate": 4.910974244120941e-05,
"loss": 1.5439,
"step": 35100
},
{
"epoch": 19.003530935050392,
"grad_norm": 61.70454406738281,
"learning_rate": 4.9249720044792835e-05,
"loss": 1.4943,
"step": 35200
},
{
"epoch": 19.003810890257558,
"grad_norm": 37.11140060424805,
"learning_rate": 4.938969764837626e-05,
"loss": 1.9324,
"step": 35300
},
{
"epoch": 19.004090845464727,
"grad_norm": 22.8467960357666,
"learning_rate": 4.952967525195969e-05,
"loss": 1.5832,
"step": 35400
},
{
"epoch": 19.004370800671893,
"grad_norm": 24.005002975463867,
"learning_rate": 4.966965285554311e-05,
"loss": 1.7288,
"step": 35500
},
{
"epoch": 19.00465075587906,
"grad_norm": 68.68001556396484,
"learning_rate": 4.9809630459126545e-05,
"loss": 1.8915,
"step": 35600
},
{
"epoch": 19.004930711086224,
"grad_norm": 40.64888000488281,
"learning_rate": 4.994960806270997e-05,
"loss": 1.9352,
"step": 35700
},
{
"epoch": 19.00500069988802,
"eval_accuracy": 0.36338100102145043,
"eval_f1": 0.32556302652063634,
"eval_loss": 2.815650701522827,
"eval_precision": 0.3396402371887561,
"eval_recall": 0.36338100102145043,
"eval_runtime": 119.1107,
"eval_samples_per_second": 32.877,
"eval_steps_per_second": 16.438,
"eval_top_10_accuracy": 0.8156281920326864,
"eval_top_1_accuracy": 0.36338100102145043,
"eval_top_5_accuracy": 0.7134831460674157,
"step": 35725
},
{
"epoch": 20.000209966405375,
"grad_norm": 37.2190055847168,
"learning_rate": 4.9990046037078515e-05,
"loss": 1.304,
"step": 35800
},
{
"epoch": 20.00048992161254,
"grad_norm": 18.588062286376953,
"learning_rate": 4.997449297001369e-05,
"loss": 1.2178,
"step": 35900
},
{
"epoch": 20.00076987681971,
"grad_norm": 46.44628143310547,
"learning_rate": 4.9958939902948865e-05,
"loss": 1.2133,
"step": 36000
},
{
"epoch": 20.001049832026876,
"grad_norm": 17.273088455200195,
"learning_rate": 4.994338683588404e-05,
"loss": 1.1894,
"step": 36100
},
{
"epoch": 20.00132978723404,
"grad_norm": 52.89341735839844,
"learning_rate": 4.9927833768819216e-05,
"loss": 1.1865,
"step": 36200
},
{
"epoch": 20.00160974244121,
"grad_norm": 35.168704986572266,
"learning_rate": 4.991228070175439e-05,
"loss": 1.4459,
"step": 36300
},
{
"epoch": 20.001889697648377,
"grad_norm": 21.019437789916992,
"learning_rate": 4.989672763468956e-05,
"loss": 1.3725,
"step": 36400
},
{
"epoch": 20.002169652855542,
"grad_norm": 30.197154998779297,
"learning_rate": 4.988133009829539e-05,
"loss": 1.3246,
"step": 36500
},
{
"epoch": 20.00244960806271,
"grad_norm": 76.85697174072266,
"learning_rate": 4.986577703123056e-05,
"loss": 1.3171,
"step": 36600
},
{
"epoch": 20.002729563269877,
"grad_norm": 29.73351287841797,
"learning_rate": 4.985022396416574e-05,
"loss": 1.3113,
"step": 36700
},
{
"epoch": 20.003009518477043,
"grad_norm": 50.711971282958984,
"learning_rate": 4.983467089710091e-05,
"loss": 1.4899,
"step": 36800
},
{
"epoch": 20.00328947368421,
"grad_norm": 34.413719177246094,
"learning_rate": 4.9819117830036085e-05,
"loss": 1.6135,
"step": 36900
},
{
"epoch": 20.003569428891378,
"grad_norm": 51.75621032714844,
"learning_rate": 4.980356476297126e-05,
"loss": 1.4968,
"step": 37000
},
{
"epoch": 20.003849384098544,
"grad_norm": 28.784854888916016,
"learning_rate": 4.9788011695906435e-05,
"loss": 1.5756,
"step": 37100
},
{
"epoch": 20.00412933930571,
"grad_norm": 15.39171314239502,
"learning_rate": 4.9772458628841614e-05,
"loss": 1.3599,
"step": 37200
},
{
"epoch": 20.00440929451288,
"grad_norm": 43.16920852661133,
"learning_rate": 4.9756905561776786e-05,
"loss": 1.3928,
"step": 37300
},
{
"epoch": 20.004689249720045,
"grad_norm": 63.47052001953125,
"learning_rate": 4.974135249471196e-05,
"loss": 1.671,
"step": 37400
},
{
"epoch": 20.00496920492721,
"grad_norm": 30.383426666259766,
"learning_rate": 4.9725799427647136e-05,
"loss": 1.6389,
"step": 37500
},
{
"epoch": 20.005,
"eval_accuracy": 0.3799795709908069,
"eval_f1": 0.3403035730650944,
"eval_loss": 2.796839475631714,
"eval_precision": 0.35319952859534065,
"eval_recall": 0.3799795709908069,
"eval_runtime": 116.2913,
"eval_samples_per_second": 33.674,
"eval_steps_per_second": 16.837,
"eval_top_10_accuracy": 0.8143513789581205,
"eval_top_1_accuracy": 0.3799795709908069,
"eval_top_5_accuracy": 0.7247191011235955,
"step": 37511
},
{
"epoch": 21.00024846024636,
"grad_norm": 7.234004497528076,
"learning_rate": 4.971024636058231e-05,
"loss": 0.9637,
"step": 37600
},
{
"epoch": 21.000528415453527,
"grad_norm": 46.028507232666016,
"learning_rate": 4.969469329351749e-05,
"loss": 0.8584,
"step": 37700
},
{
"epoch": 21.000808370660696,
"grad_norm": 19.228715896606445,
"learning_rate": 4.967914022645266e-05,
"loss": 1.1736,
"step": 37800
},
{
"epoch": 21.001088325867862,
"grad_norm": 38.96889114379883,
"learning_rate": 4.966358715938783e-05,
"loss": 1.135,
"step": 37900
},
{
"epoch": 21.001368281075028,
"grad_norm": 61.55207824707031,
"learning_rate": 4.964803409232301e-05,
"loss": 1.1689,
"step": 38000
},
{
"epoch": 21.001648236282193,
"grad_norm": 24.2206974029541,
"learning_rate": 4.963248102525818e-05,
"loss": 1.0731,
"step": 38100
},
{
"epoch": 21.001928191489363,
"grad_norm": 26.989789962768555,
"learning_rate": 4.961692795819336e-05,
"loss": 1.0659,
"step": 38200
},
{
"epoch": 21.00220814669653,
"grad_norm": 29.301572799682617,
"learning_rate": 4.9601530421799184e-05,
"loss": 1.2649,
"step": 38300
},
{
"epoch": 21.002488101903694,
"grad_norm": 41.62022018432617,
"learning_rate": 4.9585977354734355e-05,
"loss": 1.2247,
"step": 38400
},
{
"epoch": 21.002768057110863,
"grad_norm": 24.48542022705078,
"learning_rate": 4.9570424287669534e-05,
"loss": 1.2741,
"step": 38500
},
{
"epoch": 21.00304801231803,
"grad_norm": 72.86893463134766,
"learning_rate": 4.9554871220604706e-05,
"loss": 1.4002,
"step": 38600
},
{
"epoch": 21.003327967525195,
"grad_norm": 70.68368530273438,
"learning_rate": 4.9539318153539885e-05,
"loss": 1.5305,
"step": 38700
},
{
"epoch": 21.003607922732364,
"grad_norm": 43.46640396118164,
"learning_rate": 4.9523765086475057e-05,
"loss": 1.29,
"step": 38800
},
{
"epoch": 21.00388787793953,
"grad_norm": 63.277320861816406,
"learning_rate": 4.9508212019410235e-05,
"loss": 1.4255,
"step": 38900
},
{
"epoch": 21.004167833146695,
"grad_norm": 35.36826705932617,
"learning_rate": 4.949265895234541e-05,
"loss": 1.3306,
"step": 39000
},
{
"epoch": 21.004447788353865,
"grad_norm": 46.617313385009766,
"learning_rate": 4.947710588528058e-05,
"loss": 1.4394,
"step": 39100
},
{
"epoch": 21.00472774356103,
"grad_norm": 19.11472511291504,
"learning_rate": 4.946155281821576e-05,
"loss": 1.4166,
"step": 39200
},
{
"epoch": 21.004999300111983,
"eval_accuracy": 0.37410623084780387,
"eval_f1": 0.3358086003770376,
"eval_loss": 2.8413541316986084,
"eval_precision": 0.34984856960086785,
"eval_recall": 0.37410623084780387,
"eval_runtime": 115.7697,
"eval_samples_per_second": 33.826,
"eval_steps_per_second": 16.913,
"eval_top_10_accuracy": 0.8066905005107252,
"eval_top_1_accuracy": 0.3738508682328907,
"eval_top_5_accuracy": 0.7132277834525026,
"step": 39297
},
{
"epoch": 22.000006998880178,
"grad_norm": 12.816142082214355,
"learning_rate": 4.944599975115093e-05,
"loss": 1.6177,
"step": 39300
},
{
"epoch": 22.000286954087347,
"grad_norm": 17.605751037597656,
"learning_rate": 4.943044668408611e-05,
"loss": 0.7587,
"step": 39400
},
{
"epoch": 22.000566909294513,
"grad_norm": 73.58073425292969,
"learning_rate": 4.941489361702128e-05,
"loss": 0.9073,
"step": 39500
},
{
"epoch": 22.00084686450168,
"grad_norm": 69.34747314453125,
"learning_rate": 4.939934054995645e-05,
"loss": 0.9082,
"step": 39600
},
{
"epoch": 22.001126819708848,
"grad_norm": 40.95439147949219,
"learning_rate": 4.938378748289163e-05,
"loss": 0.9475,
"step": 39700
},
{
"epoch": 22.001406774916013,
"grad_norm": 37.23710632324219,
"learning_rate": 4.93682344158268e-05,
"loss": 1.0648,
"step": 39800
},
{
"epoch": 22.00168673012318,
"grad_norm": 45.12014389038086,
"learning_rate": 4.9352681348761975e-05,
"loss": 1.1583,
"step": 39900
},
{
"epoch": 22.00196668533035,
"grad_norm": 21.451051712036133,
"learning_rate": 4.933712828169715e-05,
"loss": 1.1223,
"step": 40000
},
{
"epoch": 22.002246640537514,
"grad_norm": 6.596524238586426,
"learning_rate": 4.9321575214632325e-05,
"loss": 1.2806,
"step": 40100
},
{
"epoch": 22.00252659574468,
"grad_norm": 40.44515609741211,
"learning_rate": 4.9306022147567504e-05,
"loss": 1.22,
"step": 40200
},
{
"epoch": 22.00280655095185,
"grad_norm": 47.3612060546875,
"learning_rate": 4.9290469080502676e-05,
"loss": 1.3957,
"step": 40300
},
{
"epoch": 22.003086506159015,
"grad_norm": 9.846660614013672,
"learning_rate": 4.927491601343785e-05,
"loss": 1.2558,
"step": 40400
},
{
"epoch": 22.00336646136618,
"grad_norm": 10.630510330200195,
"learning_rate": 4.9259362946373026e-05,
"loss": 1.2827,
"step": 40500
},
{
"epoch": 22.00364641657335,
"grad_norm": 25.466978073120117,
"learning_rate": 4.92438098793082e-05,
"loss": 1.2335,
"step": 40600
},
{
"epoch": 22.003926371780516,
"grad_norm": 46.57871627807617,
"learning_rate": 4.922825681224338e-05,
"loss": 1.2621,
"step": 40700
},
{
"epoch": 22.00420632698768,
"grad_norm": 38.56644821166992,
"learning_rate": 4.921270374517855e-05,
"loss": 1.3509,
"step": 40800
},
{
"epoch": 22.00448628219485,
"grad_norm": 53.1245002746582,
"learning_rate": 4.919715067811372e-05,
"loss": 1.3637,
"step": 40900
},
{
"epoch": 22.004766237402016,
"grad_norm": 92.77754974365234,
"learning_rate": 4.91815976110489e-05,
"loss": 1.3113,
"step": 41000
},
{
"epoch": 22.004998600223963,
"eval_accuracy": 0.3669560776302349,
"eval_f1": 0.33397568836792535,
"eval_loss": 2.911128282546997,
"eval_precision": 0.35075591826740965,
"eval_recall": 0.3669560776302349,
"eval_runtime": 117.1252,
"eval_samples_per_second": 33.434,
"eval_steps_per_second": 16.717,
"eval_top_10_accuracy": 0.8041368743615934,
"eval_top_1_accuracy": 0.36670071501532175,
"eval_top_5_accuracy": 0.7032686414708886,
"step": 41083
},
{
"epoch": 23.000045492721163,
"grad_norm": 67.09935760498047,
"learning_rate": 4.916604454398407e-05,
"loss": 1.4924,
"step": 41100
},
{
"epoch": 23.000325447928333,
"grad_norm": 65.86184692382812,
"learning_rate": 4.915049147691925e-05,
"loss": 0.8501,
"step": 41200
},
{
"epoch": 23.0006054031355,
"grad_norm": 27.802021026611328,
"learning_rate": 4.913493840985443e-05,
"loss": 0.6177,
"step": 41300
},
{
"epoch": 23.000885358342664,
"grad_norm": 53.507080078125,
"learning_rate": 4.91193853427896e-05,
"loss": 0.921,
"step": 41400
},
{
"epoch": 23.001165313549834,
"grad_norm": 34.499412536621094,
"learning_rate": 4.910383227572478e-05,
"loss": 0.9269,
"step": 41500
},
{
"epoch": 23.001445268757,
"grad_norm": 19.095460891723633,
"learning_rate": 4.908827920865995e-05,
"loss": 0.9606,
"step": 41600
},
{
"epoch": 23.001725223964165,
"grad_norm": 18.78254508972168,
"learning_rate": 4.907272614159513e-05,
"loss": 1.0296,
"step": 41700
},
{
"epoch": 23.002005179171334,
"grad_norm": 51.5577392578125,
"learning_rate": 4.90571730745303e-05,
"loss": 1.0291,
"step": 41800
},
{
"epoch": 23.0022851343785,
"grad_norm": 5.804263114929199,
"learning_rate": 4.9041620007465474e-05,
"loss": 1.1576,
"step": 41900
},
{
"epoch": 23.002565089585666,
"grad_norm": 36.027217864990234,
"learning_rate": 4.902606694040065e-05,
"loss": 1.1973,
"step": 42000
},
{
"epoch": 23.00284504479283,
"grad_norm": 15.984498023986816,
"learning_rate": 4.9010513873335824e-05,
"loss": 1.2031,
"step": 42100
},
{
"epoch": 23.003125,
"grad_norm": 45.6442985534668,
"learning_rate": 4.8994960806271e-05,
"loss": 1.1533,
"step": 42200
},
{
"epoch": 23.003404955207166,
"grad_norm": 49.245113372802734,
"learning_rate": 4.897956326987682e-05,
"loss": 1.239,
"step": 42300
},
{
"epoch": 23.003684910414332,
"grad_norm": 2.6122968196868896,
"learning_rate": 4.8964010202812e-05,
"loss": 1.0613,
"step": 42400
},
{
"epoch": 23.0039648656215,
"grad_norm": 63.001007080078125,
"learning_rate": 4.894845713574717e-05,
"loss": 0.9579,
"step": 42500
},
{
"epoch": 23.004244820828667,
"grad_norm": 47.75780487060547,
"learning_rate": 4.893290406868234e-05,
"loss": 1.444,
"step": 42600
},
{
"epoch": 23.004524776035833,
"grad_norm": 26.68849754333496,
"learning_rate": 4.891735100161752e-05,
"loss": 1.2768,
"step": 42700
},
{
"epoch": 23.004804731243002,
"grad_norm": 31.364978790283203,
"learning_rate": 4.890179793455269e-05,
"loss": 1.4698,
"step": 42800
},
{
"epoch": 23.00500069988802,
"eval_accuracy": 0.36772216547497444,
"eval_f1": 0.32961532849173303,
"eval_loss": 2.9281723499298096,
"eval_precision": 0.34658649431376704,
"eval_recall": 0.36772216547497444,
"eval_runtime": 114.379,
"eval_samples_per_second": 34.237,
"eval_steps_per_second": 17.119,
"eval_top_10_accuracy": 0.7946884576098059,
"eval_top_1_accuracy": 0.3674668028600613,
"eval_top_5_accuracy": 0.7070990806945863,
"step": 42870
},
{
"epoch": 24.00008398656215,
"grad_norm": 29.290653228759766,
"learning_rate": 4.888624486748787e-05,
"loss": 1.3773,
"step": 42900
},
{
"epoch": 24.00036394176932,
"grad_norm": 7.684507369995117,
"learning_rate": 4.8870847331093695e-05,
"loss": 0.7986,
"step": 43000
},
{
"epoch": 24.000643896976484,
"grad_norm": 3.6387572288513184,
"learning_rate": 4.885529426402887e-05,
"loss": 0.889,
"step": 43100
},
{
"epoch": 24.00092385218365,
"grad_norm": 16.366535186767578,
"learning_rate": 4.8839741196964046e-05,
"loss": 0.7856,
"step": 43200
},
{
"epoch": 24.001203807390816,
"grad_norm": 41.7845573425293,
"learning_rate": 4.882418812989922e-05,
"loss": 0.9726,
"step": 43300
},
{
"epoch": 24.001483762597985,
"grad_norm": 15.093663215637207,
"learning_rate": 4.8808635062834396e-05,
"loss": 0.8313,
"step": 43400
},
{
"epoch": 24.00176371780515,
"grad_norm": 53.5677490234375,
"learning_rate": 4.879308199576957e-05,
"loss": 1.0816,
"step": 43500
},
{
"epoch": 24.002043673012317,
"grad_norm": 54.367271423339844,
"learning_rate": 4.877752892870474e-05,
"loss": 1.0475,
"step": 43600
},
{
"epoch": 24.002323628219486,
"grad_norm": 41.49662399291992,
"learning_rate": 4.876197586163992e-05,
"loss": 0.8916,
"step": 43700
},
{
"epoch": 24.00260358342665,
"grad_norm": 16.519433975219727,
"learning_rate": 4.874642279457509e-05,
"loss": 0.8659,
"step": 43800
},
{
"epoch": 24.002883538633817,
"grad_norm": 61.23699188232422,
"learning_rate": 4.873086972751027e-05,
"loss": 1.2549,
"step": 43900
},
{
"epoch": 24.003163493840987,
"grad_norm": 10.837386131286621,
"learning_rate": 4.871531666044544e-05,
"loss": 1.1515,
"step": 44000
},
{
"epoch": 24.003443449048152,
"grad_norm": 27.969383239746094,
"learning_rate": 4.869976359338061e-05,
"loss": 1.1793,
"step": 44100
},
{
"epoch": 24.003723404255318,
"grad_norm": 50.31840133666992,
"learning_rate": 4.868421052631579e-05,
"loss": 1.3247,
"step": 44200
},
{
"epoch": 24.004003359462487,
"grad_norm": 11.990558624267578,
"learning_rate": 4.8668657459250964e-05,
"loss": 1.1745,
"step": 44300
},
{
"epoch": 24.004283314669653,
"grad_norm": 46.31821060180664,
"learning_rate": 4.865310439218614e-05,
"loss": 1.1745,
"step": 44400
},
{
"epoch": 24.00456326987682,
"grad_norm": 8.647761344909668,
"learning_rate": 4.8637551325121314e-05,
"loss": 1.1952,
"step": 44500
},
{
"epoch": 24.004843225083988,
"grad_norm": 9.188798904418945,
"learning_rate": 4.8621998258056486e-05,
"loss": 1.1594,
"step": 44600
},
{
"epoch": 24.005,
"eval_accuracy": 0.38993871297242083,
"eval_f1": 0.35323001094121087,
"eval_loss": 2.9185523986816406,
"eval_precision": 0.36621209910280395,
"eval_recall": 0.38993871297242083,
"eval_runtime": 116.0344,
"eval_samples_per_second": 33.749,
"eval_steps_per_second": 16.874,
"eval_top_10_accuracy": 0.7987742594484167,
"eval_top_1_accuracy": 0.38993871297242083,
"eval_top_5_accuracy": 0.7124616956077631,
"step": 44656
},
{
"epoch": 25.000122480403135,
"grad_norm": 7.127791404724121,
"learning_rate": 4.8606445190991665e-05,
"loss": 0.9656,
"step": 44700
},
{
"epoch": 25.0004024356103,
"grad_norm": 6.424502372741699,
"learning_rate": 4.859089212392684e-05,
"loss": 0.8375,
"step": 44800
},
{
"epoch": 25.00068239081747,
"grad_norm": 56.99434280395508,
"learning_rate": 4.8575339056862015e-05,
"loss": 0.7758,
"step": 44900
},
{
"epoch": 25.000962346024636,
"grad_norm": 29.912485122680664,
"learning_rate": 4.8559785989797194e-05,
"loss": 0.7439,
"step": 45000
},
{
"epoch": 25.0012423012318,
"grad_norm": 35.43897247314453,
"learning_rate": 4.8544232922732366e-05,
"loss": 0.7085,
"step": 45100
},
{
"epoch": 25.00152225643897,
"grad_norm": 9.219779968261719,
"learning_rate": 4.8528679855667544e-05,
"loss": 0.9102,
"step": 45200
},
{
"epoch": 25.001802211646137,
"grad_norm": 4.801015853881836,
"learning_rate": 4.8513126788602716e-05,
"loss": 1.014,
"step": 45300
},
{
"epoch": 25.002082166853302,
"grad_norm": 43.83305740356445,
"learning_rate": 4.8497573721537895e-05,
"loss": 0.8622,
"step": 45400
},
{
"epoch": 25.00236212206047,
"grad_norm": 66.25143432617188,
"learning_rate": 4.848202065447307e-05,
"loss": 0.9431,
"step": 45500
},
{
"epoch": 25.002642077267637,
"grad_norm": 29.633420944213867,
"learning_rate": 4.846646758740824e-05,
"loss": 1.0897,
"step": 45600
},
{
"epoch": 25.002922032474803,
"grad_norm": 33.82661819458008,
"learning_rate": 4.845091452034342e-05,
"loss": 1.117,
"step": 45700
},
{
"epoch": 25.003201987681972,
"grad_norm": 46.27246856689453,
"learning_rate": 4.843536145327859e-05,
"loss": 1.2575,
"step": 45800
},
{
"epoch": 25.003481942889138,
"grad_norm": 18.312467575073242,
"learning_rate": 4.841980838621377e-05,
"loss": 1.2425,
"step": 45900
},
{
"epoch": 25.003761898096304,
"grad_norm": 28.17131233215332,
"learning_rate": 4.840425531914894e-05,
"loss": 1.2341,
"step": 46000
},
{
"epoch": 25.004041853303473,
"grad_norm": 28.144180297851562,
"learning_rate": 4.838870225208411e-05,
"loss": 1.2243,
"step": 46100
},
{
"epoch": 25.00432180851064,
"grad_norm": 16.919340133666992,
"learning_rate": 4.837314918501929e-05,
"loss": 0.9636,
"step": 46200
},
{
"epoch": 25.004601763717805,
"grad_norm": 5.783956527709961,
"learning_rate": 4.835759611795446e-05,
"loss": 1.0324,
"step": 46300
},
{
"epoch": 25.00488171892497,
"grad_norm": 9.570374488830566,
"learning_rate": 4.834204305088964e-05,
"loss": 0.8815,
"step": 46400
},
{
"epoch": 25.004999300111983,
"eval_accuracy": 0.3825331971399387,
"eval_f1": 0.34693060387008295,
"eval_loss": 3.0209546089172363,
"eval_precision": 0.3639688225745427,
"eval_recall": 0.3825331971399387,
"eval_runtime": 115.6367,
"eval_samples_per_second": 33.865,
"eval_steps_per_second": 16.932,
"eval_top_10_accuracy": 0.7964759959141982,
"eval_top_1_accuracy": 0.3827885597548519,
"eval_top_5_accuracy": 0.7053115423901941,
"step": 46442
},
{
"epoch": 26.00016097424412,
"grad_norm": 16.35182762145996,
"learning_rate": 4.832648998382481e-05,
"loss": 0.7362,
"step": 46500
},
{
"epoch": 26.000440929451287,
"grad_norm": 34.7078742980957,
"learning_rate": 4.8310936916759985e-05,
"loss": 0.8688,
"step": 46600
},
{
"epoch": 26.000720884658456,
"grad_norm": 2.940587043762207,
"learning_rate": 4.8295383849695164e-05,
"loss": 0.8609,
"step": 46700
},
{
"epoch": 26.001000839865622,
"grad_norm": 35.10813903808594,
"learning_rate": 4.8279830782630336e-05,
"loss": 0.6674,
"step": 46800
},
{
"epoch": 26.001280795072788,
"grad_norm": 4.418734550476074,
"learning_rate": 4.8264277715565514e-05,
"loss": 0.856,
"step": 46900
},
{
"epoch": 26.001560750279957,
"grad_norm": 37.9604377746582,
"learning_rate": 4.824888017917134e-05,
"loss": 0.7902,
"step": 47000
},
{
"epoch": 26.001840705487123,
"grad_norm": 32.70397186279297,
"learning_rate": 4.823332711210651e-05,
"loss": 0.7813,
"step": 47100
},
{
"epoch": 26.00212066069429,
"grad_norm": 35.73707962036133,
"learning_rate": 4.821777404504169e-05,
"loss": 0.9995,
"step": 47200
},
{
"epoch": 26.002400615901454,
"grad_norm": 9.47586727142334,
"learning_rate": 4.820222097797686e-05,
"loss": 1.0135,
"step": 47300
},
{
"epoch": 26.002680571108623,
"grad_norm": 37.69544982910156,
"learning_rate": 4.818666791091204e-05,
"loss": 0.9796,
"step": 47400
},
{
"epoch": 26.00296052631579,
"grad_norm": 10.6676025390625,
"learning_rate": 4.817111484384721e-05,
"loss": 1.3322,
"step": 47500
},
{
"epoch": 26.003240481522955,
"grad_norm": 40.86244201660156,
"learning_rate": 4.815556177678238e-05,
"loss": 1.1818,
"step": 47600
},
{
"epoch": 26.003520436730124,
"grad_norm": 34.827247619628906,
"learning_rate": 4.814000870971756e-05,
"loss": 0.8761,
"step": 47700
},
{
"epoch": 26.00380039193729,
"grad_norm": 23.81336784362793,
"learning_rate": 4.812445564265273e-05,
"loss": 1.1277,
"step": 47800
},
{
"epoch": 26.004080347144455,
"grad_norm": 44.09092330932617,
"learning_rate": 4.810890257558791e-05,
"loss": 1.4091,
"step": 47900
},
{
"epoch": 26.004360302351625,
"grad_norm": 36.06610107421875,
"learning_rate": 4.8093349508523084e-05,
"loss": 1.1976,
"step": 48000
},
{
"epoch": 26.00464025755879,
"grad_norm": 2.117263078689575,
"learning_rate": 4.8077796441458256e-05,
"loss": 1.1112,
"step": 48100
},
{
"epoch": 26.004920212765956,
"grad_norm": 9.393363952636719,
"learning_rate": 4.806239890506408e-05,
"loss": 1.348,
"step": 48200
},
{
"epoch": 26.004998600223963,
"eval_accuracy": 0.377170582226762,
"eval_f1": 0.33966766691485795,
"eval_loss": 3.026718854904175,
"eval_precision": 0.3537270321042027,
"eval_recall": 0.377170582226762,
"eval_runtime": 117.8101,
"eval_samples_per_second": 33.24,
"eval_steps_per_second": 16.62,
"eval_top_10_accuracy": 0.797752808988764,
"eval_top_1_accuracy": 0.377170582226762,
"eval_top_5_accuracy": 0.7073544433094995,
"step": 48228
},
{
"epoch": 27.000199468085107,
"grad_norm": 48.105777740478516,
"learning_rate": 4.804684583799926e-05,
"loss": 0.9049,
"step": 48300
},
{
"epoch": 27.000479423292273,
"grad_norm": 39.231109619140625,
"learning_rate": 4.803129277093443e-05,
"loss": 0.8953,
"step": 48400
},
{
"epoch": 27.00075937849944,
"grad_norm": 2.2942845821380615,
"learning_rate": 4.801573970386961e-05,
"loss": 0.7185,
"step": 48500
},
{
"epoch": 27.001039333706608,
"grad_norm": 0.9364669322967529,
"learning_rate": 4.800018663680478e-05,
"loss": 0.7835,
"step": 48600
},
{
"epoch": 27.001319288913773,
"grad_norm": 40.069271087646484,
"learning_rate": 4.798463356973996e-05,
"loss": 0.8621,
"step": 48700
},
{
"epoch": 27.00159924412094,
"grad_norm": 11.61902141571045,
"learning_rate": 4.796908050267513e-05,
"loss": 0.9434,
"step": 48800
},
{
"epoch": 27.00187919932811,
"grad_norm": 3.5938429832458496,
"learning_rate": 4.795352743561031e-05,
"loss": 0.9811,
"step": 48900
},
{
"epoch": 27.002159154535274,
"grad_norm": 47.871726989746094,
"learning_rate": 4.793797436854548e-05,
"loss": 0.9615,
"step": 49000
},
{
"epoch": 27.00243910974244,
"grad_norm": 22.639846801757812,
"learning_rate": 4.7922421301480654e-05,
"loss": 1.0804,
"step": 49100
},
{
"epoch": 27.00271906494961,
"grad_norm": 53.62788772583008,
"learning_rate": 4.790686823441583e-05,
"loss": 0.9659,
"step": 49200
},
{
"epoch": 27.002999020156775,
"grad_norm": 18.483760833740234,
"learning_rate": 4.7891315167351004e-05,
"loss": 0.9848,
"step": 49300
},
{
"epoch": 27.00327897536394,
"grad_norm": 30.66340446472168,
"learning_rate": 4.787576210028618e-05,
"loss": 1.042,
"step": 49400
},
{
"epoch": 27.00355893057111,
"grad_norm": 29.91460418701172,
"learning_rate": 4.7860209033221355e-05,
"loss": 0.9379,
"step": 49500
},
{
"epoch": 27.003838885778276,
"grad_norm": 17.159460067749023,
"learning_rate": 4.784465596615653e-05,
"loss": 1.1205,
"step": 49600
},
{
"epoch": 27.00411884098544,
"grad_norm": 46.86989974975586,
"learning_rate": 4.7829102899091705e-05,
"loss": 1.08,
"step": 49700
},
{
"epoch": 27.00439879619261,
"grad_norm": 13.506775856018066,
"learning_rate": 4.781354983202688e-05,
"loss": 1.2232,
"step": 49800
},
{
"epoch": 27.004678751399776,
"grad_norm": 33.25643539428711,
"learning_rate": 4.7797996764962056e-05,
"loss": 1.1331,
"step": 49900
},
{
"epoch": 27.004958706606942,
"grad_norm": 47.94986343383789,
"learning_rate": 4.778244369789723e-05,
"loss": 1.0531,
"step": 50000
},
{
"epoch": 27.00500069988802,
"eval_accuracy": 0.38074565883554645,
"eval_f1": 0.3463921527059934,
"eval_loss": 3.005545139312744,
"eval_precision": 0.36555650965763326,
"eval_recall": 0.38074565883554645,
"eval_runtime": 114.6503,
"eval_samples_per_second": 34.156,
"eval_steps_per_second": 17.078,
"eval_top_10_accuracy": 0.8107763023493361,
"eval_top_1_accuracy": 0.3804902962206333,
"eval_top_5_accuracy": 0.719611848825332,
"step": 50015
},
{
"epoch": 28.000237961926093,
"grad_norm": 6.364796161651611,
"learning_rate": 4.77668906308324e-05,
"loss": 0.5291,
"step": 50100
},
{
"epoch": 28.00051791713326,
"grad_norm": 45.56444549560547,
"learning_rate": 4.775133756376758e-05,
"loss": 0.7383,
"step": 50200
},
{
"epoch": 28.000797872340424,
"grad_norm": 2.9990179538726807,
"learning_rate": 4.773578449670275e-05,
"loss": 0.6026,
"step": 50300
},
{
"epoch": 28.001077827547594,
"grad_norm": 44.893760681152344,
"learning_rate": 4.772023142963793e-05,
"loss": 0.6877,
"step": 50400
},
{
"epoch": 28.00135778275476,
"grad_norm": 4.229214191436768,
"learning_rate": 4.77046783625731e-05,
"loss": 0.9156,
"step": 50500
},
{
"epoch": 28.001637737961925,
"grad_norm": 3.375356912612915,
"learning_rate": 4.768912529550827e-05,
"loss": 0.8209,
"step": 50600
},
{
"epoch": 28.001917693169094,
"grad_norm": 68.21538543701172,
"learning_rate": 4.767357222844345e-05,
"loss": 0.8576,
"step": 50700
},
{
"epoch": 28.00219764837626,
"grad_norm": 36.52906799316406,
"learning_rate": 4.765801916137862e-05,
"loss": 0.8978,
"step": 50800
},
{
"epoch": 28.002477603583426,
"grad_norm": 46.26045227050781,
"learning_rate": 4.76424660943138e-05,
"loss": 1.1163,
"step": 50900
},
{
"epoch": 28.002757558790595,
"grad_norm": 21.620988845825195,
"learning_rate": 4.7626913027248974e-05,
"loss": 0.9089,
"step": 51000
},
{
"epoch": 28.00303751399776,
"grad_norm": 1.2796549797058105,
"learning_rate": 4.7611359960184146e-05,
"loss": 1.1172,
"step": 51100
},
{
"epoch": 28.003317469204926,
"grad_norm": 28.369571685791016,
"learning_rate": 4.7595806893119324e-05,
"loss": 0.9379,
"step": 51200
},
{
"epoch": 28.003597424412096,
"grad_norm": 71.3536148071289,
"learning_rate": 4.7580253826054496e-05,
"loss": 1.0674,
"step": 51300
},
{
"epoch": 28.00387737961926,
"grad_norm": 32.85470199584961,
"learning_rate": 4.7564700758989675e-05,
"loss": 1.1896,
"step": 51400
},
{
"epoch": 28.004157334826427,
"grad_norm": 26.999794006347656,
"learning_rate": 4.754914769192485e-05,
"loss": 0.8049,
"step": 51500
},
{
"epoch": 28.004437290033593,
"grad_norm": 6.688983917236328,
"learning_rate": 4.7533594624860026e-05,
"loss": 1.221,
"step": 51600
},
{
"epoch": 28.004717245240762,
"grad_norm": 31.977493286132812,
"learning_rate": 4.7518041557795204e-05,
"loss": 1.0372,
"step": 51700
},
{
"epoch": 28.004997200447928,
"grad_norm": 45.08998107910156,
"learning_rate": 4.7502488490730376e-05,
"loss": 1.2516,
"step": 51800
},
{
"epoch": 28.005,
"eval_accuracy": 0.3631256384065373,
"eval_f1": 0.3243790610322335,
"eval_loss": 3.1701900959014893,
"eval_precision": 0.3353089935827423,
"eval_recall": 0.3631256384065373,
"eval_runtime": 114.3388,
"eval_samples_per_second": 34.249,
"eval_steps_per_second": 17.125,
"eval_top_10_accuracy": 0.7883043922369765,
"eval_top_1_accuracy": 0.36338100102145043,
"eval_top_5_accuracy": 0.6841164453524005,
"step": 51801
},
{
"epoch": 29.00027645576708,
"grad_norm": 24.97677230834961,
"learning_rate": 4.7486935423665555e-05,
"loss": 0.6765,
"step": 51900
},
{
"epoch": 29.000556410974244,
"grad_norm": 2.3787806034088135,
"learning_rate": 4.747138235660073e-05,
"loss": 0.7211,
"step": 52000
},
{
"epoch": 29.00083636618141,
"grad_norm": 36.792728424072266,
"learning_rate": 4.74558292895359e-05,
"loss": 0.9286,
"step": 52100
},
{
"epoch": 29.00111632138858,
"grad_norm": 39.00687026977539,
"learning_rate": 4.744043175314172e-05,
"loss": 0.7545,
"step": 52200
},
{
"epoch": 29.001396276595745,
"grad_norm": 6.44840669631958,
"learning_rate": 4.7424878686076894e-05,
"loss": 0.6815,
"step": 52300
},
{
"epoch": 29.00167623180291,
"grad_norm": 21.892784118652344,
"learning_rate": 4.740932561901207e-05,
"loss": 0.7581,
"step": 52400
},
{
"epoch": 29.00195618701008,
"grad_norm": 53.59468078613281,
"learning_rate": 4.7393772551947245e-05,
"loss": 0.9388,
"step": 52500
},
{
"epoch": 29.002236142217246,
"grad_norm": 19.31954002380371,
"learning_rate": 4.737821948488242e-05,
"loss": 0.7623,
"step": 52600
},
{
"epoch": 29.00251609742441,
"grad_norm": 56.00475311279297,
"learning_rate": 4.7362666417817595e-05,
"loss": 1.1146,
"step": 52700
},
{
"epoch": 29.002796052631577,
"grad_norm": 69.8625259399414,
"learning_rate": 4.734711335075277e-05,
"loss": 0.8305,
"step": 52800
},
{
"epoch": 29.003076007838747,
"grad_norm": 15.013059616088867,
"learning_rate": 4.7331560283687946e-05,
"loss": 0.9888,
"step": 52900
},
{
"epoch": 29.003355963045912,
"grad_norm": 45.95860290527344,
"learning_rate": 4.731600721662312e-05,
"loss": 0.9892,
"step": 53000
},
{
"epoch": 29.003635918253078,
"grad_norm": 8.254734992980957,
"learning_rate": 4.7300454149558296e-05,
"loss": 0.9714,
"step": 53100
},
{
"epoch": 29.003915873460247,
"grad_norm": 32.338558197021484,
"learning_rate": 4.728490108249347e-05,
"loss": 0.9985,
"step": 53200
},
{
"epoch": 29.004195828667413,
"grad_norm": 43.390541076660156,
"learning_rate": 4.726934801542865e-05,
"loss": 0.948,
"step": 53300
},
{
"epoch": 29.00447578387458,
"grad_norm": 3.346212148666382,
"learning_rate": 4.725379494836382e-05,
"loss": 0.9203,
"step": 53400
},
{
"epoch": 29.004755739081748,
"grad_norm": 53.539886474609375,
"learning_rate": 4.7238241881299e-05,
"loss": 1.2142,
"step": 53500
},
{
"epoch": 29.004999300111983,
"eval_accuracy": 0.37461695607763024,
"eval_f1": 0.34094689004673684,
"eval_loss": 3.153719902038574,
"eval_precision": 0.3627320351592568,
"eval_recall": 0.37461695607763024,
"eval_runtime": 112.1729,
"eval_samples_per_second": 34.91,
"eval_steps_per_second": 17.455,
"eval_top_10_accuracy": 0.7972420837589377,
"eval_top_1_accuracy": 0.3743615934627171,
"eval_top_5_accuracy": 0.6948416751787538,
"step": 53587
},
{
"epoch": 30.000034994400895,
"grad_norm": 1.2987140417099,
"learning_rate": 4.722268881423417e-05,
"loss": 0.994,
"step": 53600
},
{
"epoch": 30.00031494960806,
"grad_norm": 40.64413070678711,
"learning_rate": 4.720713574716935e-05,
"loss": 0.5028,
"step": 53700
},
{
"epoch": 30.00059490481523,
"grad_norm": 5.865800857543945,
"learning_rate": 4.719158268010452e-05,
"loss": 0.7444,
"step": 53800
},
{
"epoch": 30.000874860022396,
"grad_norm": 7.504943370819092,
"learning_rate": 4.71760296130397e-05,
"loss": 0.828,
"step": 53900
},
{
"epoch": 30.00115481522956,
"grad_norm": 7.175074100494385,
"learning_rate": 4.716047654597487e-05,
"loss": 0.6409,
"step": 54000
},
{
"epoch": 30.00143477043673,
"grad_norm": 3.8528194427490234,
"learning_rate": 4.714492347891004e-05,
"loss": 0.7132,
"step": 54100
},
{
"epoch": 30.001714725643897,
"grad_norm": 1.616517424583435,
"learning_rate": 4.7129525942515866e-05,
"loss": 0.7694,
"step": 54200
},
{
"epoch": 30.001994680851062,
"grad_norm": 58.55043411254883,
"learning_rate": 4.711397287545104e-05,
"loss": 1.0431,
"step": 54300
},
{
"epoch": 30.00227463605823,
"grad_norm": 5.023679733276367,
"learning_rate": 4.709841980838622e-05,
"loss": 0.6762,
"step": 54400
},
{
"epoch": 30.002554591265397,
"grad_norm": 21.790382385253906,
"learning_rate": 4.708286674132139e-05,
"loss": 1.0054,
"step": 54500
},
{
"epoch": 30.002834546472563,
"grad_norm": 21.34770965576172,
"learning_rate": 4.706731367425656e-05,
"loss": 0.8381,
"step": 54600
},
{
"epoch": 30.003114501679732,
"grad_norm": 35.34033966064453,
"learning_rate": 4.705176060719174e-05,
"loss": 0.8623,
"step": 54700
},
{
"epoch": 30.003394456886898,
"grad_norm": 45.18914031982422,
"learning_rate": 4.703620754012691e-05,
"loss": 1.0882,
"step": 54800
},
{
"epoch": 30.003674412094064,
"grad_norm": 4.911450386047363,
"learning_rate": 4.702065447306209e-05,
"loss": 0.9956,
"step": 54900
},
{
"epoch": 30.003954367301233,
"grad_norm": 4.490671157836914,
"learning_rate": 4.700510140599726e-05,
"loss": 0.9534,
"step": 55000
},
{
"epoch": 30.0042343225084,
"grad_norm": 0.5849565267562866,
"learning_rate": 4.698954833893244e-05,
"loss": 0.7995,
"step": 55100
},
{
"epoch": 30.004514277715565,
"grad_norm": 1.6884137392044067,
"learning_rate": 4.697399527186762e-05,
"loss": 1.029,
"step": 55200
},
{
"epoch": 30.004794232922734,
"grad_norm": 44.37993240356445,
"learning_rate": 4.695844220480279e-05,
"loss": 1.1783,
"step": 55300
},
{
"epoch": 30.004998600223963,
"eval_accuracy": 0.3659346271705822,
"eval_f1": 0.327783869178149,
"eval_loss": 3.2329089641571045,
"eval_precision": 0.3399942520026789,
"eval_recall": 0.3659346271705822,
"eval_runtime": 111.6292,
"eval_samples_per_second": 35.08,
"eval_steps_per_second": 17.54,
"eval_top_10_accuracy": 0.7916241062308478,
"eval_top_1_accuracy": 0.3659346271705822,
"eval_top_5_accuracy": 0.695097037793667,
"step": 55373
},
{
"epoch": 31.00007348824188,
"grad_norm": 0.8294526934623718,
"learning_rate": 4.694288913773797e-05,
"loss": 0.7754,
"step": 55400
},
{
"epoch": 31.000353443449047,
"grad_norm": 38.75194549560547,
"learning_rate": 4.692733607067314e-05,
"loss": 0.635,
"step": 55500
},
{
"epoch": 31.000633398656216,
"grad_norm": 9.357181549072266,
"learning_rate": 4.6911783003608313e-05,
"loss": 0.5596,
"step": 55600
},
{
"epoch": 31.000913353863382,
"grad_norm": 7.658017158508301,
"learning_rate": 4.689622993654349e-05,
"loss": 0.6122,
"step": 55700
},
{
"epoch": 31.001193309070548,
"grad_norm": 6.124042510986328,
"learning_rate": 4.6880676869478664e-05,
"loss": 0.8859,
"step": 55800
},
{
"epoch": 31.001473264277717,
"grad_norm": 2.957393169403076,
"learning_rate": 4.686512380241384e-05,
"loss": 0.6045,
"step": 55900
},
{
"epoch": 31.001753219484883,
"grad_norm": 11.278263092041016,
"learning_rate": 4.6849570735349015e-05,
"loss": 0.8514,
"step": 56000
},
{
"epoch": 31.00203317469205,
"grad_norm": 32.86345291137695,
"learning_rate": 4.6834017668284186e-05,
"loss": 0.9362,
"step": 56100
},
{
"epoch": 31.002313129899218,
"grad_norm": 37.377540588378906,
"learning_rate": 4.681862013189001e-05,
"loss": 1.0054,
"step": 56200
},
{
"epoch": 31.002593085106383,
"grad_norm": 8.889802932739258,
"learning_rate": 4.680306706482518e-05,
"loss": 0.8472,
"step": 56300
},
{
"epoch": 31.00287304031355,
"grad_norm": 9.699048042297363,
"learning_rate": 4.678751399776036e-05,
"loss": 0.7464,
"step": 56400
},
{
"epoch": 31.00315299552072,
"grad_norm": 62.45649719238281,
"learning_rate": 4.677196093069553e-05,
"loss": 0.892,
"step": 56500
},
{
"epoch": 31.003432950727884,
"grad_norm": 6.620049476623535,
"learning_rate": 4.675640786363071e-05,
"loss": 0.7575,
"step": 56600
},
{
"epoch": 31.00371290593505,
"grad_norm": 0.34919270873069763,
"learning_rate": 4.674085479656588e-05,
"loss": 1.1288,
"step": 56700
},
{
"epoch": 31.003992861142216,
"grad_norm": 73.19112396240234,
"learning_rate": 4.672530172950106e-05,
"loss": 1.1417,
"step": 56800
},
{
"epoch": 31.004272816349385,
"grad_norm": 35.101932525634766,
"learning_rate": 4.6709748662436234e-05,
"loss": 1.1342,
"step": 56900
},
{
"epoch": 31.00455277155655,
"grad_norm": 4.475014686584473,
"learning_rate": 4.669419559537141e-05,
"loss": 1.2289,
"step": 57000
},
{
"epoch": 31.004832726763716,
"grad_norm": 40.93881607055664,
"learning_rate": 4.6678642528306584e-05,
"loss": 1.2075,
"step": 57100
},
{
"epoch": 31.00500069988802,
"eval_accuracy": 0.36925434116445355,
"eval_f1": 0.3347721266424357,
"eval_loss": 3.2250540256500244,
"eval_precision": 0.3521287566422908,
"eval_recall": 0.36925434116445355,
"eval_runtime": 117.4887,
"eval_samples_per_second": 33.331,
"eval_steps_per_second": 16.665,
"eval_top_10_accuracy": 0.7880490296220634,
"eval_top_1_accuracy": 0.3695097037793667,
"eval_top_5_accuracy": 0.6971399387129724,
"step": 57160
},
{
"epoch": 32.00011198208286,
"grad_norm": 1.2258549928665161,
"learning_rate": 4.666308946124176e-05,
"loss": 0.689,
"step": 57200
},
{
"epoch": 32.00039193729003,
"grad_norm": 4.694971084594727,
"learning_rate": 4.6647536394176935e-05,
"loss": 0.5123,
"step": 57300
},
{
"epoch": 32.0006718924972,
"grad_norm": 60.558834075927734,
"learning_rate": 4.6631983327112114e-05,
"loss": 0.4678,
"step": 57400
},
{
"epoch": 32.000951847704364,
"grad_norm": 21.36684799194336,
"learning_rate": 4.6616430260047285e-05,
"loss": 0.6802,
"step": 57500
},
{
"epoch": 32.00123180291153,
"grad_norm": 2.4743974208831787,
"learning_rate": 4.660087719298246e-05,
"loss": 0.7395,
"step": 57600
},
{
"epoch": 32.0015117581187,
"grad_norm": 11.220129013061523,
"learning_rate": 4.6585324125917636e-05,
"loss": 0.7529,
"step": 57700
},
{
"epoch": 32.001791713325865,
"grad_norm": 33.19414138793945,
"learning_rate": 4.656977105885281e-05,
"loss": 0.8596,
"step": 57800
},
{
"epoch": 32.002071668533034,
"grad_norm": 16.343273162841797,
"learning_rate": 4.6554217991787987e-05,
"loss": 0.6243,
"step": 57900
},
{
"epoch": 32.0023516237402,
"grad_norm": 36.327816009521484,
"learning_rate": 4.653866492472316e-05,
"loss": 0.9409,
"step": 58000
},
{
"epoch": 32.002631578947366,
"grad_norm": 41.437198638916016,
"learning_rate": 4.652311185765833e-05,
"loss": 0.8562,
"step": 58100
},
{
"epoch": 32.002911534154535,
"grad_norm": 47.712486267089844,
"learning_rate": 4.6507714321264154e-05,
"loss": 0.504,
"step": 58200
},
{
"epoch": 32.003191489361704,
"grad_norm": 36.536251068115234,
"learning_rate": 4.6492161254199326e-05,
"loss": 1.0219,
"step": 58300
},
{
"epoch": 32.003471444568866,
"grad_norm": 24.974342346191406,
"learning_rate": 4.6476608187134505e-05,
"loss": 1.0403,
"step": 58400
},
{
"epoch": 32.003751399776036,
"grad_norm": 37.766822814941406,
"learning_rate": 4.6461055120069676e-05,
"loss": 0.9055,
"step": 58500
},
{
"epoch": 32.004031354983205,
"grad_norm": 5.955631732940674,
"learning_rate": 4.6445502053004855e-05,
"loss": 0.9022,
"step": 58600
},
{
"epoch": 32.00431131019037,
"grad_norm": 36.062278747558594,
"learning_rate": 4.643010451661068e-05,
"loss": 1.2884,
"step": 58700
},
{
"epoch": 32.004591265397536,
"grad_norm": 42.342796325683594,
"learning_rate": 4.641455144954585e-05,
"loss": 1.1833,
"step": 58800
},
{
"epoch": 32.004871220604706,
"grad_norm": 44.055450439453125,
"learning_rate": 4.639899838248103e-05,
"loss": 1.1369,
"step": 58900
},
{
"epoch": 32.005,
"eval_accuracy": 0.34601634320735447,
"eval_f1": 0.3114462019084082,
"eval_loss": 3.3421857357025146,
"eval_precision": 0.3287510207134824,
"eval_recall": 0.34601634320735447,
"eval_runtime": 114.9634,
"eval_samples_per_second": 34.063,
"eval_steps_per_second": 17.032,
"eval_top_10_accuracy": 0.7737487231869254,
"eval_top_1_accuracy": 0.34576098059244126,
"eval_top_5_accuracy": 0.6802860061287027,
"step": 58946
},
{
"epoch": 33.00015047592385,
"grad_norm": 3.8549163341522217,
"learning_rate": 4.63834453154162e-05,
"loss": 0.8772,
"step": 59000
},
{
"epoch": 33.00043043113102,
"grad_norm": 1.2819772958755493,
"learning_rate": 4.636789224835137e-05,
"loss": 0.5074,
"step": 59100
},
{
"epoch": 33.000710386338184,
"grad_norm": 69.6226806640625,
"learning_rate": 4.635233918128655e-05,
"loss": 0.8393,
"step": 59200
},
{
"epoch": 33.000990341545354,
"grad_norm": 13.237231254577637,
"learning_rate": 4.6336786114221724e-05,
"loss": 0.7097,
"step": 59300
},
{
"epoch": 33.00127029675252,
"grad_norm": 24.04904556274414,
"learning_rate": 4.63212330471569e-05,
"loss": 0.8112,
"step": 59400
},
{
"epoch": 33.001550251959685,
"grad_norm": 36.736419677734375,
"learning_rate": 4.6305679980092074e-05,
"loss": 0.8325,
"step": 59500
},
{
"epoch": 33.001830207166854,
"grad_norm": 1.0379083156585693,
"learning_rate": 4.6290126913027246e-05,
"loss": 0.8391,
"step": 59600
},
{
"epoch": 33.002110162374024,
"grad_norm": 30.109365463256836,
"learning_rate": 4.6274573845962425e-05,
"loss": 0.8687,
"step": 59700
},
{
"epoch": 33.002390117581186,
"grad_norm": 6.3692731857299805,
"learning_rate": 4.62590207788976e-05,
"loss": 0.9515,
"step": 59800
},
{
"epoch": 33.002670072788355,
"grad_norm": 0.17226508259773254,
"learning_rate": 4.6243467711832775e-05,
"loss": 0.7252,
"step": 59900
},
{
"epoch": 33.00295002799552,
"grad_norm": 2.027529716491699,
"learning_rate": 4.622791464476795e-05,
"loss": 0.7717,
"step": 60000
},
{
"epoch": 33.00322998320269,
"grad_norm": 29.886762619018555,
"learning_rate": 4.6212361577703126e-05,
"loss": 1.0916,
"step": 60100
},
{
"epoch": 33.003509938409856,
"grad_norm": 4.087385177612305,
"learning_rate": 4.61968085106383e-05,
"loss": 0.8275,
"step": 60200
},
{
"epoch": 33.00378989361702,
"grad_norm": 12.894125938415527,
"learning_rate": 4.6181255443573477e-05,
"loss": 0.8847,
"step": 60300
},
{
"epoch": 33.00406984882419,
"grad_norm": 44.40382385253906,
"learning_rate": 4.616570237650865e-05,
"loss": 0.9739,
"step": 60400
},
{
"epoch": 33.00434980403136,
"grad_norm": 7.521993160247803,
"learning_rate": 4.615014930944383e-05,
"loss": 0.9827,
"step": 60500
},
{
"epoch": 33.00462975923852,
"grad_norm": 2.905057430267334,
"learning_rate": 4.6134596242379e-05,
"loss": 1.1328,
"step": 60600
},
{
"epoch": 33.00490971444569,
"grad_norm": 9.729948997497559,
"learning_rate": 4.611904317531418e-05,
"loss": 1.1948,
"step": 60700
},
{
"epoch": 33.00499930011198,
"eval_accuracy": 0.36338100102145043,
"eval_f1": 0.3240467396278912,
"eval_loss": 3.34759521484375,
"eval_precision": 0.33750582212022456,
"eval_recall": 0.36338100102145043,
"eval_runtime": 118.2099,
"eval_samples_per_second": 33.128,
"eval_steps_per_second": 16.564,
"eval_top_10_accuracy": 0.7734933605720122,
"eval_top_1_accuracy": 0.36338100102145043,
"eval_top_5_accuracy": 0.6726251276813074,
"step": 60732
},
{
"epoch": 34.000188969764835,
"grad_norm": 1.6296502351760864,
"learning_rate": 4.610349010824935e-05,
"loss": 0.7557,
"step": 60800
},
{
"epoch": 34.000468924972004,
"grad_norm": 5.730977535247803,
"learning_rate": 4.608793704118453e-05,
"loss": 0.6354,
"step": 60900
},
{
"epoch": 34.000748880179174,
"grad_norm": 3.109783172607422,
"learning_rate": 4.60723839741197e-05,
"loss": 0.6765,
"step": 61000
},
{
"epoch": 34.001028835386336,
"grad_norm": 0.2169467806816101,
"learning_rate": 4.605683090705487e-05,
"loss": 0.6485,
"step": 61100
},
{
"epoch": 34.001308790593505,
"grad_norm": 12.250176429748535,
"learning_rate": 4.604127783999005e-05,
"loss": 0.9218,
"step": 61200
},
{
"epoch": 34.001588745800674,
"grad_norm": 28.112674713134766,
"learning_rate": 4.602572477292522e-05,
"loss": 0.6046,
"step": 61300
},
{
"epoch": 34.00186870100784,
"grad_norm": 0.6601081490516663,
"learning_rate": 4.60101717058604e-05,
"loss": 0.8272,
"step": 61400
},
{
"epoch": 34.002148656215006,
"grad_norm": 10.42507266998291,
"learning_rate": 4.599461863879557e-05,
"loss": 0.8769,
"step": 61500
},
{
"epoch": 34.002428611422175,
"grad_norm": 42.463401794433594,
"learning_rate": 4.597922110240139e-05,
"loss": 0.9232,
"step": 61600
},
{
"epoch": 34.00270856662934,
"grad_norm": 0.37827855348587036,
"learning_rate": 4.596366803533657e-05,
"loss": 0.809,
"step": 61700
},
{
"epoch": 34.00298852183651,
"grad_norm": 60.71481704711914,
"learning_rate": 4.594811496827174e-05,
"loss": 1.0087,
"step": 61800
},
{
"epoch": 34.003268477043676,
"grad_norm": 30.099050521850586,
"learning_rate": 4.593256190120692e-05,
"loss": 0.7332,
"step": 61900
},
{
"epoch": 34.00354843225084,
"grad_norm": 37.70903396606445,
"learning_rate": 4.591700883414209e-05,
"loss": 0.9752,
"step": 62000
},
{
"epoch": 34.00382838745801,
"grad_norm": 36.76481246948242,
"learning_rate": 4.590145576707727e-05,
"loss": 1.3231,
"step": 62100
},
{
"epoch": 34.00410834266518,
"grad_norm": 24.240903854370117,
"learning_rate": 4.588590270001245e-05,
"loss": 0.7914,
"step": 62200
},
{
"epoch": 34.00438829787234,
"grad_norm": 1.8038338422775269,
"learning_rate": 4.587034963294762e-05,
"loss": 0.9946,
"step": 62300
},
{
"epoch": 34.00466825307951,
"grad_norm": 27.41489028930664,
"learning_rate": 4.58547965658828e-05,
"loss": 0.9436,
"step": 62400
},
{
"epoch": 34.00494820828668,
"grad_norm": 73.32023620605469,
"learning_rate": 4.583924349881797e-05,
"loss": 1.0164,
"step": 62500
},
{
"epoch": 34.00499860022396,
"eval_accuracy": 0.3687436159346272,
"eval_f1": 0.33117920128900724,
"eval_loss": 3.318777322769165,
"eval_precision": 0.3433794202052629,
"eval_recall": 0.3687436159346272,
"eval_runtime": 115.7117,
"eval_samples_per_second": 33.843,
"eval_steps_per_second": 16.921,
"eval_top_10_accuracy": 0.7854954034729316,
"eval_top_1_accuracy": 0.3687436159346272,
"eval_top_5_accuracy": 0.6968845760980592,
"step": 62518
},
{
"epoch": 35.000227463605825,
"grad_norm": 17.592552185058594,
"learning_rate": 4.582369043175314e-05,
"loss": 0.6981,
"step": 62600
},
{
"epoch": 35.00050741881299,
"grad_norm": 34.496055603027344,
"learning_rate": 4.580813736468832e-05,
"loss": 0.5374,
"step": 62700
},
{
"epoch": 35.000787374020156,
"grad_norm": 48.30421829223633,
"learning_rate": 4.5792584297623494e-05,
"loss": 0.7976,
"step": 62800
},
{
"epoch": 35.001067329227325,
"grad_norm": 36.826202392578125,
"learning_rate": 4.577703123055867e-05,
"loss": 0.6289,
"step": 62900
},
{
"epoch": 35.00134728443449,
"grad_norm": 31.521106719970703,
"learning_rate": 4.5761478163493844e-05,
"loss": 0.7489,
"step": 63000
},
{
"epoch": 35.00162723964166,
"grad_norm": 0.38109827041625977,
"learning_rate": 4.5745925096429016e-05,
"loss": 0.6528,
"step": 63100
},
{
"epoch": 35.001907194848826,
"grad_norm": 0.5344606041908264,
"learning_rate": 4.5730372029364195e-05,
"loss": 0.4075,
"step": 63200
},
{
"epoch": 35.00218715005599,
"grad_norm": 1.7213187217712402,
"learning_rate": 4.5714818962299367e-05,
"loss": 0.9644,
"step": 63300
},
{
"epoch": 35.00246710526316,
"grad_norm": 0.6551365852355957,
"learning_rate": 4.5699265895234545e-05,
"loss": 0.8157,
"step": 63400
},
{
"epoch": 35.00274706047033,
"grad_norm": 27.040754318237305,
"learning_rate": 4.568371282816972e-05,
"loss": 0.797,
"step": 63500
},
{
"epoch": 35.00302701567749,
"grad_norm": 10.12726879119873,
"learning_rate": 4.566815976110489e-05,
"loss": 0.9778,
"step": 63600
},
{
"epoch": 35.00330697088466,
"grad_norm": 43.65861892700195,
"learning_rate": 4.565260669404007e-05,
"loss": 0.9172,
"step": 63700
},
{
"epoch": 35.00358692609183,
"grad_norm": 23.98292350769043,
"learning_rate": 4.563705362697524e-05,
"loss": 0.8713,
"step": 63800
},
{
"epoch": 35.00386688129899,
"grad_norm": 35.00185012817383,
"learning_rate": 4.562150055991042e-05,
"loss": 0.87,
"step": 63900
},
{
"epoch": 35.00414683650616,
"grad_norm": 1.3145835399627686,
"learning_rate": 4.560594749284559e-05,
"loss": 0.9978,
"step": 64000
},
{
"epoch": 35.00442679171333,
"grad_norm": 0.8550679683685303,
"learning_rate": 4.559039442578076e-05,
"loss": 1.0991,
"step": 64100
},
{
"epoch": 35.00470674692049,
"grad_norm": 7.469823360443115,
"learning_rate": 4.557484135871594e-05,
"loss": 1.0481,
"step": 64200
},
{
"epoch": 35.00498670212766,
"grad_norm": 8.4567232131958,
"learning_rate": 4.555928829165111e-05,
"loss": 0.8987,
"step": 64300
},
{
"epoch": 35.005000699888015,
"eval_accuracy": 0.3718079673135853,
"eval_f1": 0.33662947681078426,
"eval_loss": 3.3163700103759766,
"eval_precision": 0.35664473422389664,
"eval_recall": 0.3718079673135853,
"eval_runtime": 118.3976,
"eval_samples_per_second": 33.075,
"eval_steps_per_second": 16.538,
"eval_top_10_accuracy": 0.7911133810010215,
"eval_top_1_accuracy": 0.37206332992849844,
"eval_top_5_accuracy": 0.6986721144024515,
"step": 64305
},
{
"epoch": 35.005000699888015,
"step": 64305,
"total_flos": 6.524018866222793e+20,
"train_loss": 9.042802423402353,
"train_runtime": 31106.056,
"train_samples_per_second": 91.866,
"train_steps_per_second": 11.483
}
],
"logging_steps": 100,
"max_steps": 357200,
"num_input_tokens_seen": 0,
"num_train_epochs": 9223372036854775807,
"save_steps": 500,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 20,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 20
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6.524018866222793e+20,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}