{
"best_metric": 0.9376445510679586,
"best_model_checkpoint": "./phobert_results_v2/checkpoint-12145",
"epoch": 7.0,
"eval_steps": 500,
"global_step": 12145,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02881844380403458,
"grad_norm": 4.723288059234619,
"learning_rate": 1.994236311239193e-05,
"loss": 1.9047,
"step": 50
},
{
"epoch": 0.05763688760806916,
"grad_norm": 32.87104034423828,
"learning_rate": 1.9884726224783863e-05,
"loss": 1.7269,
"step": 100
},
{
"epoch": 0.08645533141210375,
"grad_norm": 5.206538677215576,
"learning_rate": 1.9827089337175795e-05,
"loss": 1.6865,
"step": 150
},
{
"epoch": 0.11527377521613832,
"grad_norm": 52.5133056640625,
"learning_rate": 1.9769452449567724e-05,
"loss": 1.633,
"step": 200
},
{
"epoch": 0.1440922190201729,
"grad_norm": 12.493169784545898,
"learning_rate": 1.9711815561959656e-05,
"loss": 1.553,
"step": 250
},
{
"epoch": 0.1729106628242075,
"grad_norm": 7.675703048706055,
"learning_rate": 1.9654178674351588e-05,
"loss": 1.4131,
"step": 300
},
{
"epoch": 0.2017291066282421,
"grad_norm": 5.984454154968262,
"learning_rate": 1.9596541786743517e-05,
"loss": 1.3606,
"step": 350
},
{
"epoch": 0.23054755043227665,
"grad_norm": 9.403727531433105,
"learning_rate": 1.953890489913545e-05,
"loss": 1.2202,
"step": 400
},
{
"epoch": 0.25936599423631124,
"grad_norm": 7.890481472015381,
"learning_rate": 1.9481268011527378e-05,
"loss": 1.186,
"step": 450
},
{
"epoch": 0.2881844380403458,
"grad_norm": 12.102771759033203,
"learning_rate": 1.942363112391931e-05,
"loss": 1.0865,
"step": 500
},
{
"epoch": 0.3170028818443804,
"grad_norm": 17.669885635375977,
"learning_rate": 1.936599423631124e-05,
"loss": 1.124,
"step": 550
},
{
"epoch": 0.345821325648415,
"grad_norm": 10.14146614074707,
"learning_rate": 1.930835734870317e-05,
"loss": 1.1307,
"step": 600
},
{
"epoch": 0.3746397694524496,
"grad_norm": 9.363780975341797,
"learning_rate": 1.9250720461095104e-05,
"loss": 1.0582,
"step": 650
},
{
"epoch": 0.4034582132564842,
"grad_norm": 22.122907638549805,
"learning_rate": 1.9193083573487033e-05,
"loss": 1.007,
"step": 700
},
{
"epoch": 0.4322766570605187,
"grad_norm": 22.921249389648438,
"learning_rate": 1.9135446685878965e-05,
"loss": 1.0046,
"step": 750
},
{
"epoch": 0.4610951008645533,
"grad_norm": 13.880660057067871,
"learning_rate": 1.9077809798270894e-05,
"loss": 0.9786,
"step": 800
},
{
"epoch": 0.4899135446685879,
"grad_norm": 13.75207805633545,
"learning_rate": 1.9020172910662826e-05,
"loss": 0.9166,
"step": 850
},
{
"epoch": 0.5187319884726225,
"grad_norm": 15.701948165893555,
"learning_rate": 1.8962536023054755e-05,
"loss": 0.8942,
"step": 900
},
{
"epoch": 0.547550432276657,
"grad_norm": 13.106974601745605,
"learning_rate": 1.8904899135446687e-05,
"loss": 0.8905,
"step": 950
},
{
"epoch": 0.5763688760806917,
"grad_norm": 13.854475021362305,
"learning_rate": 1.884726224783862e-05,
"loss": 0.9089,
"step": 1000
},
{
"epoch": 0.6051873198847262,
"grad_norm": 14.22427749633789,
"learning_rate": 1.878962536023055e-05,
"loss": 0.8688,
"step": 1050
},
{
"epoch": 0.6340057636887608,
"grad_norm": 14.088136672973633,
"learning_rate": 1.873198847262248e-05,
"loss": 0.859,
"step": 1100
},
{
"epoch": 0.6628242074927954,
"grad_norm": 15.746428489685059,
"learning_rate": 1.867435158501441e-05,
"loss": 0.8356,
"step": 1150
},
{
"epoch": 0.69164265129683,
"grad_norm": 10.937832832336426,
"learning_rate": 1.861671469740634e-05,
"loss": 0.8699,
"step": 1200
},
{
"epoch": 0.7204610951008645,
"grad_norm": 14.396600723266602,
"learning_rate": 1.855907780979827e-05,
"loss": 0.7789,
"step": 1250
},
{
"epoch": 0.7492795389048992,
"grad_norm": 11.801300048828125,
"learning_rate": 1.8501440922190203e-05,
"loss": 0.8713,
"step": 1300
},
{
"epoch": 0.7780979827089337,
"grad_norm": 10.435481071472168,
"learning_rate": 1.8443804034582135e-05,
"loss": 0.7961,
"step": 1350
},
{
"epoch": 0.8069164265129684,
"grad_norm": 11.2293062210083,
"learning_rate": 1.8386167146974067e-05,
"loss": 0.8397,
"step": 1400
},
{
"epoch": 0.8357348703170029,
"grad_norm": 12.809613227844238,
"learning_rate": 1.8328530259365996e-05,
"loss": 0.8297,
"step": 1450
},
{
"epoch": 0.8645533141210374,
"grad_norm": 14.791847229003906,
"learning_rate": 1.8270893371757928e-05,
"loss": 0.7709,
"step": 1500
},
{
"epoch": 0.8933717579250721,
"grad_norm": 14.587478637695312,
"learning_rate": 1.8213256484149857e-05,
"loss": 0.7552,
"step": 1550
},
{
"epoch": 0.9221902017291066,
"grad_norm": 16.79636001586914,
"learning_rate": 1.815561959654179e-05,
"loss": 0.7937,
"step": 1600
},
{
"epoch": 0.9510086455331412,
"grad_norm": 8.510680198669434,
"learning_rate": 1.8097982708933718e-05,
"loss": 0.7181,
"step": 1650
},
{
"epoch": 0.9798270893371758,
"grad_norm": 20.466522216796875,
"learning_rate": 1.804034582132565e-05,
"loss": 0.712,
"step": 1700
},
{
"epoch": 1.0,
"eval_accuracy": 0.7955036748811067,
"eval_f1_macro": 0.795530535937033,
"eval_f1_weighted": 0.795536244762841,
"eval_loss": 0.6264312267303467,
"eval_precision_macro": 0.7978076489526655,
"eval_precision_weighted": 0.797819457255313,
"eval_recall_macro": 0.7955041514180622,
"eval_recall_weighted": 0.7955036748811067,
"eval_runtime": 29.7702,
"eval_samples_per_second": 233.086,
"eval_steps_per_second": 14.578,
"step": 1735
},
{
"epoch": 1.0086455331412103,
"grad_norm": 17.28911018371582,
"learning_rate": 1.7982708933717582e-05,
"loss": 0.6512,
"step": 1750
},
{
"epoch": 1.037463976945245,
"grad_norm": 28.79306411743164,
"learning_rate": 1.792507204610951e-05,
"loss": 0.6036,
"step": 1800
},
{
"epoch": 1.0662824207492796,
"grad_norm": 13.444112777709961,
"learning_rate": 1.7867435158501444e-05,
"loss": 0.552,
"step": 1850
},
{
"epoch": 1.0951008645533142,
"grad_norm": 22.068620681762695,
"learning_rate": 1.7809798270893372e-05,
"loss": 0.577,
"step": 1900
},
{
"epoch": 1.1239193083573487,
"grad_norm": 13.942420959472656,
"learning_rate": 1.7752161383285305e-05,
"loss": 0.5573,
"step": 1950
},
{
"epoch": 1.1527377521613833,
"grad_norm": 13.652227401733398,
"learning_rate": 1.7694524495677234e-05,
"loss": 0.5401,
"step": 2000
},
{
"epoch": 1.181556195965418,
"grad_norm": 22.252243041992188,
"learning_rate": 1.7636887608069166e-05,
"loss": 0.558,
"step": 2050
},
{
"epoch": 1.2103746397694524,
"grad_norm": 17.13372802734375,
"learning_rate": 1.7579250720461095e-05,
"loss": 0.514,
"step": 2100
},
{
"epoch": 1.239193083573487,
"grad_norm": 17.164276123046875,
"learning_rate": 1.7521613832853027e-05,
"loss": 0.5137,
"step": 2150
},
{
"epoch": 1.2680115273775217,
"grad_norm": 14.449616432189941,
"learning_rate": 1.746397694524496e-05,
"loss": 0.5,
"step": 2200
},
{
"epoch": 1.2968299711815563,
"grad_norm": 11.61601734161377,
"learning_rate": 1.7406340057636888e-05,
"loss": 0.5071,
"step": 2250
},
{
"epoch": 1.3256484149855907,
"grad_norm": 20.6407527923584,
"learning_rate": 1.734870317002882e-05,
"loss": 0.4803,
"step": 2300
},
{
"epoch": 1.3544668587896254,
"grad_norm": 18.60149383544922,
"learning_rate": 1.729106628242075e-05,
"loss": 0.4967,
"step": 2350
},
{
"epoch": 1.38328530259366,
"grad_norm": 22.12944793701172,
"learning_rate": 1.723342939481268e-05,
"loss": 0.4652,
"step": 2400
},
{
"epoch": 1.4121037463976944,
"grad_norm": 18.460689544677734,
"learning_rate": 1.717579250720461e-05,
"loss": 0.4417,
"step": 2450
},
{
"epoch": 1.440922190201729,
"grad_norm": 19.452363967895508,
"learning_rate": 1.7118155619596542e-05,
"loss": 0.5082,
"step": 2500
},
{
"epoch": 1.4697406340057637,
"grad_norm": 41.46109390258789,
"learning_rate": 1.7060518731988475e-05,
"loss": 0.463,
"step": 2550
},
{
"epoch": 1.4985590778097984,
"grad_norm": 25.584379196166992,
"learning_rate": 1.7002881844380407e-05,
"loss": 0.4788,
"step": 2600
},
{
"epoch": 1.527377521613833,
"grad_norm": 16.92909049987793,
"learning_rate": 1.6945244956772336e-05,
"loss": 0.4752,
"step": 2650
},
{
"epoch": 1.5561959654178674,
"grad_norm": 10.342531204223633,
"learning_rate": 1.6887608069164268e-05,
"loss": 0.4848,
"step": 2700
},
{
"epoch": 1.585014409221902,
"grad_norm": 15.596243858337402,
"learning_rate": 1.6829971181556197e-05,
"loss": 0.5158,
"step": 2750
},
{
"epoch": 1.6138328530259365,
"grad_norm": 17.031354904174805,
"learning_rate": 1.6772334293948126e-05,
"loss": 0.4577,
"step": 2800
},
{
"epoch": 1.6426512968299711,
"grad_norm": 13.550045013427734,
"learning_rate": 1.6714697406340058e-05,
"loss": 0.4947,
"step": 2850
},
{
"epoch": 1.6714697406340058,
"grad_norm": 10.102880477905273,
"learning_rate": 1.665706051873199e-05,
"loss": 0.4681,
"step": 2900
},
{
"epoch": 1.7002881844380404,
"grad_norm": 10.968811988830566,
"learning_rate": 1.6599423631123922e-05,
"loss": 0.452,
"step": 2950
},
{
"epoch": 1.729106628242075,
"grad_norm": 4.670314311981201,
"learning_rate": 1.654178674351585e-05,
"loss": 0.4346,
"step": 3000
},
{
"epoch": 1.7579250720461095,
"grad_norm": 28.008899688720703,
"learning_rate": 1.6484149855907783e-05,
"loss": 0.4404,
"step": 3050
},
{
"epoch": 1.7867435158501441,
"grad_norm": 27.77347183227539,
"learning_rate": 1.6426512968299712e-05,
"loss": 0.4547,
"step": 3100
},
{
"epoch": 1.8155619596541785,
"grad_norm": 21.949289321899414,
"learning_rate": 1.6368876080691644e-05,
"loss": 0.4924,
"step": 3150
},
{
"epoch": 1.8443804034582132,
"grad_norm": 2.8766281604766846,
"learning_rate": 1.6311239193083573e-05,
"loss": 0.3776,
"step": 3200
},
{
"epoch": 1.8731988472622478,
"grad_norm": 38.812625885009766,
"learning_rate": 1.6253602305475506e-05,
"loss": 0.4176,
"step": 3250
},
{
"epoch": 1.9020172910662825,
"grad_norm": 24.132482528686523,
"learning_rate": 1.6195965417867438e-05,
"loss": 0.4407,
"step": 3300
},
{
"epoch": 1.9308357348703171,
"grad_norm": 19.72426414489746,
"learning_rate": 1.613832853025937e-05,
"loss": 0.4503,
"step": 3350
},
{
"epoch": 1.9596541786743515,
"grad_norm": 16.077899932861328,
"learning_rate": 1.60806916426513e-05,
"loss": 0.4244,
"step": 3400
},
{
"epoch": 1.9884726224783862,
"grad_norm": 21.258529663085938,
"learning_rate": 1.6023054755043228e-05,
"loss": 0.4161,
"step": 3450
},
{
"epoch": 2.0,
"eval_accuracy": 0.87101887880098,
"eval_f1_macro": 0.8705600204306078,
"eval_f1_weighted": 0.8705702821418305,
"eval_loss": 0.43536442518234253,
"eval_precision_macro": 0.8749786652431168,
"eval_precision_weighted": 0.8749977247467932,
"eval_recall_macro": 0.8710178763247105,
"eval_recall_weighted": 0.87101887880098,
"eval_runtime": 29.9836,
"eval_samples_per_second": 231.426,
"eval_steps_per_second": 14.475,
"step": 3470
},
{
"epoch": 2.0172910662824206,
"grad_norm": 13.94642448425293,
"learning_rate": 1.596541786743516e-05,
"loss": 0.321,
"step": 3500
},
{
"epoch": 2.0461095100864553,
"grad_norm": 17.44521713256836,
"learning_rate": 1.590778097982709e-05,
"loss": 0.2707,
"step": 3550
},
{
"epoch": 2.07492795389049,
"grad_norm": 12.439199447631836,
"learning_rate": 1.585014409221902e-05,
"loss": 0.2606,
"step": 3600
},
{
"epoch": 2.1037463976945245,
"grad_norm": 10.950318336486816,
"learning_rate": 1.5792507204610953e-05,
"loss": 0.2601,
"step": 3650
},
{
"epoch": 2.132564841498559,
"grad_norm": 20.581911087036133,
"learning_rate": 1.5734870317002882e-05,
"loss": 0.265,
"step": 3700
},
{
"epoch": 2.161383285302594,
"grad_norm": 9.320990562438965,
"learning_rate": 1.5677233429394814e-05,
"loss": 0.297,
"step": 3750
},
{
"epoch": 2.1902017291066285,
"grad_norm": 20.260753631591797,
"learning_rate": 1.5619596541786747e-05,
"loss": 0.3121,
"step": 3800
},
{
"epoch": 2.2190201729106627,
"grad_norm": 55.92860412597656,
"learning_rate": 1.5561959654178675e-05,
"loss": 0.2295,
"step": 3850
},
{
"epoch": 2.2478386167146973,
"grad_norm": 9.637039184570312,
"learning_rate": 1.5504322766570608e-05,
"loss": 0.2558,
"step": 3900
},
{
"epoch": 2.276657060518732,
"grad_norm": 13.105185508728027,
"learning_rate": 1.5446685878962537e-05,
"loss": 0.2432,
"step": 3950
},
{
"epoch": 2.3054755043227666,
"grad_norm": 17.233076095581055,
"learning_rate": 1.538904899135447e-05,
"loss": 0.2817,
"step": 4000
},
{
"epoch": 2.3342939481268012,
"grad_norm": 14.211281776428223,
"learning_rate": 1.5331412103746398e-05,
"loss": 0.2663,
"step": 4050
},
{
"epoch": 2.363112391930836,
"grad_norm": 30.84634780883789,
"learning_rate": 1.527377521613833e-05,
"loss": 0.3084,
"step": 4100
},
{
"epoch": 2.39193083573487,
"grad_norm": 29.224945068359375,
"learning_rate": 1.521613832853026e-05,
"loss": 0.3109,
"step": 4150
},
{
"epoch": 2.4207492795389047,
"grad_norm": 3.674872398376465,
"learning_rate": 1.5158501440922191e-05,
"loss": 0.2346,
"step": 4200
},
{
"epoch": 2.4495677233429394,
"grad_norm": 5.084238052368164,
"learning_rate": 1.5100864553314123e-05,
"loss": 0.2227,
"step": 4250
},
{
"epoch": 2.478386167146974,
"grad_norm": 21.61268424987793,
"learning_rate": 1.5043227665706052e-05,
"loss": 0.2426,
"step": 4300
},
{
"epoch": 2.5072046109510087,
"grad_norm": 12.604729652404785,
"learning_rate": 1.4985590778097984e-05,
"loss": 0.325,
"step": 4350
},
{
"epoch": 2.5360230547550433,
"grad_norm": 25.42458152770996,
"learning_rate": 1.4927953890489915e-05,
"loss": 0.3394,
"step": 4400
},
{
"epoch": 2.564841498559078,
"grad_norm": 9.900081634521484,
"learning_rate": 1.4870317002881847e-05,
"loss": 0.2565,
"step": 4450
},
{
"epoch": 2.5936599423631126,
"grad_norm": 14.56777572631836,
"learning_rate": 1.4812680115273776e-05,
"loss": 0.2542,
"step": 4500
},
{
"epoch": 2.6224783861671472,
"grad_norm": 27.645551681518555,
"learning_rate": 1.4755043227665706e-05,
"loss": 0.2753,
"step": 4550
},
{
"epoch": 2.6512968299711814,
"grad_norm": 24.675256729125977,
"learning_rate": 1.4697406340057639e-05,
"loss": 0.2648,
"step": 4600
},
{
"epoch": 2.680115273775216,
"grad_norm": 19.5800838470459,
"learning_rate": 1.4639769452449568e-05,
"loss": 0.2441,
"step": 4650
},
{
"epoch": 2.7089337175792507,
"grad_norm": 32.76830291748047,
"learning_rate": 1.45821325648415e-05,
"loss": 0.2748,
"step": 4700
},
{
"epoch": 2.7377521613832854,
"grad_norm": 9.661020278930664,
"learning_rate": 1.452449567723343e-05,
"loss": 0.2721,
"step": 4750
},
{
"epoch": 2.76657060518732,
"grad_norm": 25.79545021057129,
"learning_rate": 1.4466858789625363e-05,
"loss": 0.2743,
"step": 4800
},
{
"epoch": 2.795389048991354,
"grad_norm": 30.077226638793945,
"learning_rate": 1.4409221902017291e-05,
"loss": 0.2314,
"step": 4850
},
{
"epoch": 2.824207492795389,
"grad_norm": 11.516585350036621,
"learning_rate": 1.4351585014409224e-05,
"loss": 0.2098,
"step": 4900
},
{
"epoch": 2.8530259365994235,
"grad_norm": 12.071518898010254,
"learning_rate": 1.4293948126801154e-05,
"loss": 0.2447,
"step": 4950
},
{
"epoch": 2.881844380403458,
"grad_norm": 1.786498785018921,
"learning_rate": 1.4236311239193086e-05,
"loss": 0.2374,
"step": 5000
},
{
"epoch": 2.910662824207493,
"grad_norm": 31.069623947143555,
"learning_rate": 1.4178674351585015e-05,
"loss": 0.2667,
"step": 5050
},
{
"epoch": 2.9394812680115274,
"grad_norm": 2.980510950088501,
"learning_rate": 1.4121037463976946e-05,
"loss": 0.28,
"step": 5100
},
{
"epoch": 2.968299711815562,
"grad_norm": 18.496265411376953,
"learning_rate": 1.4063400576368878e-05,
"loss": 0.1831,
"step": 5150
},
{
"epoch": 2.9971181556195967,
"grad_norm": 33.23817825317383,
"learning_rate": 1.4005763688760807e-05,
"loss": 0.3026,
"step": 5200
},
{
"epoch": 3.0,
"eval_accuracy": 0.9050295431618389,
"eval_f1_macro": 0.9049667283676557,
"eval_f1_weighted": 0.9049779714917217,
"eval_loss": 0.3607948124408722,
"eval_precision_macro": 0.9065337153272169,
"eval_precision_weighted": 0.9065450713204785,
"eval_recall_macro": 0.9050188737810804,
"eval_recall_weighted": 0.9050295431618389,
"eval_runtime": 29.9421,
"eval_samples_per_second": 231.747,
"eval_steps_per_second": 14.495,
"step": 5205
},
{
"epoch": 3.025936599423631,
"grad_norm": 14.13509750366211,
"learning_rate": 1.3948126801152739e-05,
"loss": 0.1617,
"step": 5250
},
{
"epoch": 3.0547550432276656,
"grad_norm": 0.5927883982658386,
"learning_rate": 1.389048991354467e-05,
"loss": 0.1693,
"step": 5300
},
{
"epoch": 3.0835734870317,
"grad_norm": 14.065008163452148,
"learning_rate": 1.3832853025936602e-05,
"loss": 0.128,
"step": 5350
},
{
"epoch": 3.112391930835735,
"grad_norm": 40.97966003417969,
"learning_rate": 1.377521613832853e-05,
"loss": 0.1558,
"step": 5400
},
{
"epoch": 3.1412103746397695,
"grad_norm": 10.36765193939209,
"learning_rate": 1.3717579250720463e-05,
"loss": 0.207,
"step": 5450
},
{
"epoch": 3.170028818443804,
"grad_norm": 0.5478718876838684,
"learning_rate": 1.3659942363112394e-05,
"loss": 0.1802,
"step": 5500
},
{
"epoch": 3.1988472622478388,
"grad_norm": 0.6302068829536438,
"learning_rate": 1.3602305475504324e-05,
"loss": 0.14,
"step": 5550
},
{
"epoch": 3.227665706051873,
"grad_norm": 2.4789652824401855,
"learning_rate": 1.3544668587896255e-05,
"loss": 0.1137,
"step": 5600
},
{
"epoch": 3.2564841498559076,
"grad_norm": 22.161422729492188,
"learning_rate": 1.3487031700288185e-05,
"loss": 0.2182,
"step": 5650
},
{
"epoch": 3.2853025936599423,
"grad_norm": 28.67848014831543,
"learning_rate": 1.3429394812680117e-05,
"loss": 0.1741,
"step": 5700
},
{
"epoch": 3.314121037463977,
"grad_norm": 13.24758243560791,
"learning_rate": 1.3371757925072046e-05,
"loss": 0.2205,
"step": 5750
},
{
"epoch": 3.3429394812680115,
"grad_norm": 0.6006250977516174,
"learning_rate": 1.3314121037463979e-05,
"loss": 0.1419,
"step": 5800
},
{
"epoch": 3.371757925072046,
"grad_norm": 4.931090831756592,
"learning_rate": 1.3256484149855909e-05,
"loss": 0.1466,
"step": 5850
},
{
"epoch": 3.400576368876081,
"grad_norm": 17.918506622314453,
"learning_rate": 1.319884726224784e-05,
"loss": 0.1437,
"step": 5900
},
{
"epoch": 3.4293948126801155,
"grad_norm": 18.44112205505371,
"learning_rate": 1.314121037463977e-05,
"loss": 0.1923,
"step": 5950
},
{
"epoch": 3.4582132564841497,
"grad_norm": 36.937828063964844,
"learning_rate": 1.3083573487031702e-05,
"loss": 0.2021,
"step": 6000
},
{
"epoch": 3.4870317002881843,
"grad_norm": 0.9251816868782043,
"learning_rate": 1.3025936599423631e-05,
"loss": 0.2166,
"step": 6050
},
{
"epoch": 3.515850144092219,
"grad_norm": 29.732206344604492,
"learning_rate": 1.2968299711815563e-05,
"loss": 0.1772,
"step": 6100
},
{
"epoch": 3.5446685878962536,
"grad_norm": 8.258246421813965,
"learning_rate": 1.2910662824207494e-05,
"loss": 0.2043,
"step": 6150
},
{
"epoch": 3.5734870317002883,
"grad_norm": 0.8275717496871948,
"learning_rate": 1.2853025936599423e-05,
"loss": 0.1502,
"step": 6200
},
{
"epoch": 3.602305475504323,
"grad_norm": 27.115209579467773,
"learning_rate": 1.2795389048991355e-05,
"loss": 0.1733,
"step": 6250
},
{
"epoch": 3.631123919308357,
"grad_norm": 0.7643899321556091,
"learning_rate": 1.2737752161383286e-05,
"loss": 0.1349,
"step": 6300
},
{
"epoch": 3.6599423631123917,
"grad_norm": 33.05510330200195,
"learning_rate": 1.2680115273775218e-05,
"loss": 0.1526,
"step": 6350
},
{
"epoch": 3.6887608069164264,
"grad_norm": 13.939464569091797,
"learning_rate": 1.2622478386167147e-05,
"loss": 0.1616,
"step": 6400
},
{
"epoch": 3.717579250720461,
"grad_norm": 2.4451797008514404,
"learning_rate": 1.2564841498559079e-05,
"loss": 0.1751,
"step": 6450
},
{
"epoch": 3.7463976945244957,
"grad_norm": 17.99618148803711,
"learning_rate": 1.250720461095101e-05,
"loss": 0.2061,
"step": 6500
},
{
"epoch": 3.7752161383285303,
"grad_norm": 19.82054328918457,
"learning_rate": 1.2449567723342942e-05,
"loss": 0.2168,
"step": 6550
},
{
"epoch": 3.804034582132565,
"grad_norm": 17.200483322143555,
"learning_rate": 1.239193083573487e-05,
"loss": 0.1849,
"step": 6600
},
{
"epoch": 3.8328530259365996,
"grad_norm": 9.288723945617676,
"learning_rate": 1.2334293948126803e-05,
"loss": 0.1902,
"step": 6650
},
{
"epoch": 3.8616714697406342,
"grad_norm": 52.95684051513672,
"learning_rate": 1.2276657060518733e-05,
"loss": 0.1566,
"step": 6700
},
{
"epoch": 3.8904899135446684,
"grad_norm": 22.813720703125,
"learning_rate": 1.2219020172910662e-05,
"loss": 0.1212,
"step": 6750
},
{
"epoch": 3.919308357348703,
"grad_norm": 3.377829074859619,
"learning_rate": 1.2161383285302594e-05,
"loss": 0.1147,
"step": 6800
},
{
"epoch": 3.9481268011527377,
"grad_norm": 0.6791939735412598,
"learning_rate": 1.2103746397694525e-05,
"loss": 0.1678,
"step": 6850
},
{
"epoch": 3.9769452449567724,
"grad_norm": 12.582691192626953,
"learning_rate": 1.2046109510086457e-05,
"loss": 0.2679,
"step": 6900
},
{
"epoch": 4.0,
"eval_accuracy": 0.9279435077100446,
"eval_f1_macro": 0.927584474052213,
"eval_f1_weighted": 0.927592833052565,
"eval_loss": 0.35773107409477234,
"eval_precision_macro": 0.9276877579070416,
"eval_precision_weighted": 0.9276960791160526,
"eval_recall_macro": 0.9279352304379973,
"eval_recall_weighted": 0.9279435077100446,
"eval_runtime": 30.2758,
"eval_samples_per_second": 229.193,
"eval_steps_per_second": 14.335,
"step": 6940
},
{
"epoch": 4.005763688760807,
"grad_norm": 0.19622278213500977,
"learning_rate": 1.1988472622478386e-05,
"loss": 0.1063,
"step": 6950
},
{
"epoch": 4.034582132564841,
"grad_norm": 2.3306796550750732,
"learning_rate": 1.1930835734870318e-05,
"loss": 0.1007,
"step": 7000
},
{
"epoch": 4.063400576368876,
"grad_norm": 36.256927490234375,
"learning_rate": 1.1873198847262249e-05,
"loss": 0.0781,
"step": 7050
},
{
"epoch": 4.0922190201729105,
"grad_norm": 13.904011726379395,
"learning_rate": 1.1815561959654181e-05,
"loss": 0.1152,
"step": 7100
},
{
"epoch": 4.121037463976945,
"grad_norm": 2.325575828552246,
"learning_rate": 1.175792507204611e-05,
"loss": 0.1033,
"step": 7150
},
{
"epoch": 4.14985590778098,
"grad_norm": 0.9815200567245483,
"learning_rate": 1.1700288184438042e-05,
"loss": 0.0964,
"step": 7200
},
{
"epoch": 4.178674351585014,
"grad_norm": 32.72802734375,
"learning_rate": 1.1642651296829973e-05,
"loss": 0.1008,
"step": 7250
},
{
"epoch": 4.207492795389049,
"grad_norm": 19.83048439025879,
"learning_rate": 1.1585014409221902e-05,
"loss": 0.1044,
"step": 7300
},
{
"epoch": 4.236311239193084,
"grad_norm": 22.580406188964844,
"learning_rate": 1.1527377521613834e-05,
"loss": 0.1528,
"step": 7350
},
{
"epoch": 4.265129682997118,
"grad_norm": 2.180345296859741,
"learning_rate": 1.1469740634005764e-05,
"loss": 0.1219,
"step": 7400
},
{
"epoch": 4.293948126801153,
"grad_norm": 8.74466609954834,
"learning_rate": 1.1412103746397697e-05,
"loss": 0.0763,
"step": 7450
},
{
"epoch": 4.322766570605188,
"grad_norm": 0.5323246717453003,
"learning_rate": 1.1354466858789625e-05,
"loss": 0.0875,
"step": 7500
},
{
"epoch": 4.351585014409222,
"grad_norm": 0.2620614171028137,
"learning_rate": 1.1296829971181558e-05,
"loss": 0.1046,
"step": 7550
},
{
"epoch": 4.380403458213257,
"grad_norm": 7.299178123474121,
"learning_rate": 1.1239193083573488e-05,
"loss": 0.1218,
"step": 7600
},
{
"epoch": 4.409221902017291,
"grad_norm": 12.042703628540039,
"learning_rate": 1.1181556195965419e-05,
"loss": 0.1083,
"step": 7650
},
{
"epoch": 4.438040345821325,
"grad_norm": 0.6515465378761292,
"learning_rate": 1.112391930835735e-05,
"loss": 0.0987,
"step": 7700
},
{
"epoch": 4.46685878962536,
"grad_norm": 4.6855316162109375,
"learning_rate": 1.1066282420749282e-05,
"loss": 0.0983,
"step": 7750
},
{
"epoch": 4.495677233429395,
"grad_norm": 23.7063045501709,
"learning_rate": 1.100864553314121e-05,
"loss": 0.0989,
"step": 7800
},
{
"epoch": 4.524495677233429,
"grad_norm": 1.5553531646728516,
"learning_rate": 1.0951008645533141e-05,
"loss": 0.1309,
"step": 7850
},
{
"epoch": 4.553314121037464,
"grad_norm": 32.84361267089844,
"learning_rate": 1.0893371757925073e-05,
"loss": 0.1097,
"step": 7900
},
{
"epoch": 4.582132564841499,
"grad_norm": 70.86231231689453,
"learning_rate": 1.0835734870317004e-05,
"loss": 0.1408,
"step": 7950
},
{
"epoch": 4.610951008645533,
"grad_norm": 0.31812503933906555,
"learning_rate": 1.0778097982708934e-05,
"loss": 0.1226,
"step": 8000
},
{
"epoch": 4.639769452449568,
"grad_norm": 39.376468658447266,
"learning_rate": 1.0720461095100865e-05,
"loss": 0.1727,
"step": 8050
},
{
"epoch": 4.6685878962536025,
"grad_norm": 28.209671020507812,
"learning_rate": 1.0662824207492797e-05,
"loss": 0.1426,
"step": 8100
},
{
"epoch": 4.697406340057637,
"grad_norm": 30.422863006591797,
"learning_rate": 1.0605187319884726e-05,
"loss": 0.1224,
"step": 8150
},
{
"epoch": 4.726224783861672,
"grad_norm": 0.9933204054832458,
"learning_rate": 1.0547550432276658e-05,
"loss": 0.0922,
"step": 8200
},
{
"epoch": 4.755043227665706,
"grad_norm": 0.3882824182510376,
"learning_rate": 1.0489913544668589e-05,
"loss": 0.1163,
"step": 8250
},
{
"epoch": 4.78386167146974,
"grad_norm": 0.04565083980560303,
"learning_rate": 1.0432276657060521e-05,
"loss": 0.1135,
"step": 8300
},
{
"epoch": 4.812680115273775,
"grad_norm": 34.51498794555664,
"learning_rate": 1.037463976945245e-05,
"loss": 0.0817,
"step": 8350
},
{
"epoch": 4.8414985590778095,
"grad_norm": 23.297779083251953,
"learning_rate": 1.031700288184438e-05,
"loss": 0.1033,
"step": 8400
},
{
"epoch": 4.870317002881844,
"grad_norm": 0.41480186581611633,
"learning_rate": 1.0259365994236313e-05,
"loss": 0.107,
"step": 8450
},
{
"epoch": 4.899135446685879,
"grad_norm": 49.01393508911133,
"learning_rate": 1.0201729106628241e-05,
"loss": 0.1095,
"step": 8500
},
{
"epoch": 4.927953890489913,
"grad_norm": 0.21958515048027039,
"learning_rate": 1.0144092219020174e-05,
"loss": 0.0695,
"step": 8550
},
{
"epoch": 4.956772334293948,
"grad_norm": 0.6442630290985107,
"learning_rate": 1.0086455331412104e-05,
"loss": 0.0979,
"step": 8600
},
{
"epoch": 4.985590778097983,
"grad_norm": 0.04263289272785187,
"learning_rate": 1.0028818443804036e-05,
"loss": 0.1149,
"step": 8650
},
{
"epoch": 5.0,
"eval_accuracy": 0.933852140077821,
"eval_f1_macro": 0.9335300711231832,
"eval_f1_weighted": 0.9335363129664682,
"eval_loss": 0.3687053918838501,
"eval_precision_macro": 0.9336924382488527,
"eval_precision_weighted": 0.9336961352032257,
"eval_recall_macro": 0.9338433865620074,
"eval_recall_weighted": 0.933852140077821,
"eval_runtime": 29.8558,
"eval_samples_per_second": 232.417,
"eval_steps_per_second": 14.537,
"step": 8675
},
{
"epoch": 5.014409221902017,
"grad_norm": 4.232170581817627,
"learning_rate": 9.971181556195965e-06,
"loss": 0.1073,
"step": 8700
},
{
"epoch": 5.043227665706052,
"grad_norm": 83.36505126953125,
"learning_rate": 9.913544668587897e-06,
"loss": 0.0736,
"step": 8750
},
{
"epoch": 5.072046109510087,
"grad_norm": 0.28978821635246277,
"learning_rate": 9.855907780979828e-06,
"loss": 0.0804,
"step": 8800
},
{
"epoch": 5.100864553314121,
"grad_norm": 11.236791610717773,
"learning_rate": 9.798270893371759e-06,
"loss": 0.0989,
"step": 8850
},
{
"epoch": 5.129682997118156,
"grad_norm": 0.07707870006561279,
"learning_rate": 9.740634005763689e-06,
"loss": 0.0519,
"step": 8900
},
{
"epoch": 5.1585014409221905,
"grad_norm": 0.45296791195869446,
"learning_rate": 9.68299711815562e-06,
"loss": 0.0403,
"step": 8950
},
{
"epoch": 5.187319884726225,
"grad_norm": 46.82713317871094,
"learning_rate": 9.625360230547552e-06,
"loss": 0.0895,
"step": 9000
},
{
"epoch": 5.216138328530259,
"grad_norm": 6.646805286407471,
"learning_rate": 9.567723342939482e-06,
"loss": 0.0419,
"step": 9050
},
{
"epoch": 5.244956772334294,
"grad_norm": 21.99289321899414,
"learning_rate": 9.510086455331413e-06,
"loss": 0.0573,
"step": 9100
},
{
"epoch": 5.273775216138328,
"grad_norm": 39.594390869140625,
"learning_rate": 9.452449567723344e-06,
"loss": 0.0585,
"step": 9150
},
{
"epoch": 5.302593659942363,
"grad_norm": 18.201231002807617,
"learning_rate": 9.394812680115276e-06,
"loss": 0.1113,
"step": 9200
},
{
"epoch": 5.3314121037463975,
"grad_norm": 30.270816802978516,
"learning_rate": 9.337175792507205e-06,
"loss": 0.0531,
"step": 9250
},
{
"epoch": 5.360230547550432,
"grad_norm": 42.12540817260742,
"learning_rate": 9.279538904899135e-06,
"loss": 0.0975,
"step": 9300
},
{
"epoch": 5.389048991354467,
"grad_norm": 0.4828750193119049,
"learning_rate": 9.221902017291067e-06,
"loss": 0.0636,
"step": 9350
},
{
"epoch": 5.417867435158501,
"grad_norm": 21.690710067749023,
"learning_rate": 9.164265129682998e-06,
"loss": 0.042,
"step": 9400
},
{
"epoch": 5.446685878962536,
"grad_norm": 43.906761169433594,
"learning_rate": 9.106628242074928e-06,
"loss": 0.0887,
"step": 9450
},
{
"epoch": 5.475504322766571,
"grad_norm": 37.99407958984375,
"learning_rate": 9.048991354466859e-06,
"loss": 0.0444,
"step": 9500
},
{
"epoch": 5.504322766570605,
"grad_norm": 0.9924225807189941,
"learning_rate": 8.991354466858791e-06,
"loss": 0.0893,
"step": 9550
},
{
"epoch": 5.53314121037464,
"grad_norm": 0.024848056957125664,
"learning_rate": 8.933717579250722e-06,
"loss": 0.0701,
"step": 9600
},
{
"epoch": 5.561959654178675,
"grad_norm": 0.013018026947975159,
"learning_rate": 8.876080691642652e-06,
"loss": 0.1031,
"step": 9650
},
{
"epoch": 5.590778097982709,
"grad_norm": 0.36339619755744934,
"learning_rate": 8.818443804034583e-06,
"loss": 0.0422,
"step": 9700
},
{
"epoch": 5.619596541786743,
"grad_norm": 0.5731251835823059,
"learning_rate": 8.760806916426513e-06,
"loss": 0.0666,
"step": 9750
},
{
"epoch": 5.648414985590778,
"grad_norm": 3.643000841140747,
"learning_rate": 8.703170028818444e-06,
"loss": 0.0817,
"step": 9800
},
{
"epoch": 5.677233429394812,
"grad_norm": 0.039307739585638046,
"learning_rate": 8.645533141210375e-06,
"loss": 0.1151,
"step": 9850
},
{
"epoch": 5.706051873198847,
"grad_norm": 0.2764396667480469,
"learning_rate": 8.587896253602305e-06,
"loss": 0.1021,
"step": 9900
},
{
"epoch": 5.734870317002882,
"grad_norm": 46.3472785949707,
"learning_rate": 8.530259365994237e-06,
"loss": 0.0848,
"step": 9950
},
{
"epoch": 5.763688760806916,
"grad_norm": 0.2657397389411926,
"learning_rate": 8.472622478386168e-06,
"loss": 0.0904,
"step": 10000
},
{
"epoch": 5.792507204610951,
"grad_norm": 30.730506896972656,
"learning_rate": 8.414985590778098e-06,
"loss": 0.0715,
"step": 10050
},
{
"epoch": 5.821325648414986,
"grad_norm": 2.388108253479004,
"learning_rate": 8.357348703170029e-06,
"loss": 0.0474,
"step": 10100
},
{
"epoch": 5.85014409221902,
"grad_norm": 0.06589208543300629,
"learning_rate": 8.299711815561961e-06,
"loss": 0.0718,
"step": 10150
},
{
"epoch": 5.878962536023055,
"grad_norm": 0.017373552545905113,
"learning_rate": 8.242074927953892e-06,
"loss": 0.0851,
"step": 10200
},
{
"epoch": 5.9077809798270895,
"grad_norm": 1.8382971286773682,
"learning_rate": 8.184438040345822e-06,
"loss": 0.0771,
"step": 10250
},
{
"epoch": 5.936599423631124,
"grad_norm": 0.5171680450439453,
"learning_rate": 8.126801152737753e-06,
"loss": 0.0352,
"step": 10300
},
{
"epoch": 5.965417867435159,
"grad_norm": 0.052714597433805466,
"learning_rate": 8.069164265129685e-06,
"loss": 0.0541,
"step": 10350
},
{
"epoch": 5.994236311239193,
"grad_norm": 55.4918327331543,
"learning_rate": 8.011527377521614e-06,
"loss": 0.0491,
"step": 10400
},
{
"epoch": 6.0,
"eval_accuracy": 0.9348609309698804,
"eval_f1_macro": 0.9339829958075209,
"eval_f1_weighted": 0.9339902153482186,
"eval_loss": 0.4194980561733246,
"eval_precision_macro": 0.9352763632278608,
"eval_precision_weighted": 0.9352802017792717,
"eval_recall_macro": 0.9348508698098554,
"eval_recall_weighted": 0.9348609309698804,
"eval_runtime": 29.9694,
"eval_samples_per_second": 231.536,
"eval_steps_per_second": 14.481,
"step": 10410
},
{
"epoch": 6.023054755043228,
"grad_norm": 0.13180404901504517,
"learning_rate": 7.953890489913544e-06,
"loss": 0.0896,
"step": 10450
},
{
"epoch": 6.051873198847262,
"grad_norm": 14.752634048461914,
"learning_rate": 7.896253602305477e-06,
"loss": 0.0528,
"step": 10500
},
{
"epoch": 6.0806916426512965,
"grad_norm": 0.12015581876039505,
"learning_rate": 7.838616714697407e-06,
"loss": 0.0762,
"step": 10550
},
{
"epoch": 6.109510086455331,
"grad_norm": 0.8792430758476257,
"learning_rate": 7.780979827089338e-06,
"loss": 0.0199,
"step": 10600
},
{
"epoch": 6.138328530259366,
"grad_norm": 0.15485620498657227,
"learning_rate": 7.723342939481268e-06,
"loss": 0.0448,
"step": 10650
},
{
"epoch": 6.1671469740634,
"grad_norm": 24.48517417907715,
"learning_rate": 7.665706051873199e-06,
"loss": 0.0477,
"step": 10700
},
{
"epoch": 6.195965417867435,
"grad_norm": 0.01565726287662983,
"learning_rate": 7.60806916426513e-06,
"loss": 0.0477,
"step": 10750
},
{
"epoch": 6.22478386167147,
"grad_norm": 0.06371276825666428,
"learning_rate": 7.550432276657062e-06,
"loss": 0.072,
"step": 10800
},
{
"epoch": 6.253602305475504,
"grad_norm": 0.036789000034332275,
"learning_rate": 7.492795389048992e-06,
"loss": 0.024,
"step": 10850
},
{
"epoch": 6.282420749279539,
"grad_norm": 10.600634574890137,
"learning_rate": 7.4351585014409235e-06,
"loss": 0.0583,
"step": 10900
},
{
"epoch": 6.311239193083574,
"grad_norm": 0.00421316921710968,
"learning_rate": 7.377521613832853e-06,
"loss": 0.0416,
"step": 10950
},
{
"epoch": 6.340057636887608,
"grad_norm": 43.42197799682617,
"learning_rate": 7.319884726224784e-06,
"loss": 0.0642,
"step": 11000
},
{
"epoch": 6.368876080691643,
"grad_norm": 0.05901940539479256,
"learning_rate": 7.262247838616715e-06,
"loss": 0.0511,
"step": 11050
},
{
"epoch": 6.3976945244956775,
"grad_norm": 0.022665705531835556,
"learning_rate": 7.204610951008646e-06,
"loss": 0.0076,
"step": 11100
},
{
"epoch": 6.426512968299712,
"grad_norm": 1.6211119890213013,
"learning_rate": 7.146974063400577e-06,
"loss": 0.0401,
"step": 11150
},
{
"epoch": 6.455331412103746,
"grad_norm": 0.006085489876568317,
"learning_rate": 7.089337175792508e-06,
"loss": 0.0732,
"step": 11200
},
{
"epoch": 6.484149855907781,
"grad_norm": 44.77919006347656,
"learning_rate": 7.031700288184439e-06,
"loss": 0.0634,
"step": 11250
},
{
"epoch": 6.512968299711815,
"grad_norm": 13.231744766235352,
"learning_rate": 6.9740634005763696e-06,
"loss": 0.0429,
"step": 11300
},
{
"epoch": 6.54178674351585,
"grad_norm": 0.025533217936754227,
"learning_rate": 6.916426512968301e-06,
"loss": 0.0301,
"step": 11350
},
{
"epoch": 6.5706051873198845,
"grad_norm": 0.014192778617143631,
"learning_rate": 6.8587896253602315e-06,
"loss": 0.0588,
"step": 11400
},
{
"epoch": 6.599423631123919,
"grad_norm": 0.12580570578575134,
"learning_rate": 6.801152737752162e-06,
"loss": 0.0463,
"step": 11450
},
{
"epoch": 6.628242074927954,
"grad_norm": 27.74668312072754,
"learning_rate": 6.743515850144093e-06,
"loss": 0.0801,
"step": 11500
},
{
"epoch": 6.6570605187319885,
"grad_norm": 0.016943486407399178,
"learning_rate": 6.685878962536023e-06,
"loss": 0.0563,
"step": 11550
},
{
"epoch": 6.685878962536023,
"grad_norm": 0.41584333777427673,
"learning_rate": 6.6282420749279545e-06,
"loss": 0.043,
"step": 11600
},
{
"epoch": 6.714697406340058,
"grad_norm": 0.15631648898124695,
"learning_rate": 6.570605187319885e-06,
"loss": 0.0479,
"step": 11650
},
{
"epoch": 6.743515850144092,
"grad_norm": 0.560581386089325,
"learning_rate": 6.512968299711816e-06,
"loss": 0.0374,
"step": 11700
},
{
"epoch": 6.772334293948127,
"grad_norm": 0.1623823046684265,
"learning_rate": 6.455331412103747e-06,
"loss": 0.0402,
"step": 11750
},
{
"epoch": 6.801152737752162,
"grad_norm": 0.0034744683653116226,
"learning_rate": 6.3976945244956775e-06,
"loss": 0.0653,
"step": 11800
},
{
"epoch": 6.829971181556196,
"grad_norm": 11.196998596191406,
"learning_rate": 6.340057636887609e-06,
"loss": 0.0453,
"step": 11850
},
{
"epoch": 6.858789625360231,
"grad_norm": 11.29255199432373,
"learning_rate": 6.2824207492795395e-06,
"loss": 0.0774,
"step": 11900
},
{
"epoch": 6.887608069164266,
"grad_norm": 0.0062851207330822945,
"learning_rate": 6.224783861671471e-06,
"loss": 0.0354,
"step": 11950
},
{
"epoch": 6.916426512968299,
"grad_norm": 0.030905550345778465,
"learning_rate": 6.167146974063401e-06,
"loss": 0.0353,
"step": 12000
},
{
"epoch": 6.945244956772334,
"grad_norm": 0.2567192018032074,
"learning_rate": 6.109510086455331e-06,
"loss": 0.0248,
"step": 12050
},
{
"epoch": 6.974063400576369,
"grad_norm": 0.03146115690469742,
"learning_rate": 6.0518731988472625e-06,
"loss": 0.052,
"step": 12100
},
{
"epoch": 7.0,
"eval_accuracy": 0.9383196425997983,
"eval_f1_macro": 0.9376382344287736,
"eval_f1_weighted": 0.9376445510679586,
"eval_loss": 0.4325847625732422,
"eval_precision_macro": 0.938292936865987,
"eval_precision_weighted": 0.9382956472792613,
"eval_recall_macro": 0.9383099973494167,
"eval_recall_weighted": 0.9383196425997983,
"eval_runtime": 30.0017,
"eval_samples_per_second": 231.287,
"eval_steps_per_second": 14.466,
"step": 12145
}
],
"logging_steps": 50,
"max_steps": 17350,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"total_flos": 2.55585712669056e+16,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}