mobilevit-xx-small-v2024-10-22 / trainer_state.json
nemik's picture
End of training
f814d59 verified
{
"best_metric": 0.1725098043680191,
"best_model_checkpoint": "frost-mobile-apple/mobilevit-xx-small-v2024-10-22/checkpoint-1500",
"epoch": 30.0,
"eval_steps": 100,
"global_step": 1710,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.17543859649122806,
"grad_norm": 0.2625730037689209,
"learning_rate": 1.1695906432748537e-05,
"loss": 0.6928,
"step": 10
},
{
"epoch": 0.3508771929824561,
"grad_norm": 0.2961116135120392,
"learning_rate": 2.3391812865497074e-05,
"loss": 0.6936,
"step": 20
},
{
"epoch": 0.5263157894736842,
"grad_norm": 0.24333663284778595,
"learning_rate": 3.508771929824561e-05,
"loss": 0.6917,
"step": 30
},
{
"epoch": 0.7017543859649122,
"grad_norm": 0.2218523770570755,
"learning_rate": 4.678362573099415e-05,
"loss": 0.6887,
"step": 40
},
{
"epoch": 0.8771929824561403,
"grad_norm": 0.23965124785900116,
"learning_rate": 5.847953216374269e-05,
"loss": 0.685,
"step": 50
},
{
"epoch": 1.0526315789473684,
"grad_norm": 0.23081418871879578,
"learning_rate": 7.017543859649122e-05,
"loss": 0.6815,
"step": 60
},
{
"epoch": 1.2280701754385965,
"grad_norm": 0.23212119936943054,
"learning_rate": 8.187134502923976e-05,
"loss": 0.676,
"step": 70
},
{
"epoch": 1.4035087719298245,
"grad_norm": 0.2775309383869171,
"learning_rate": 9.35672514619883e-05,
"loss": 0.6711,
"step": 80
},
{
"epoch": 1.5789473684210527,
"grad_norm": 0.38230618834495544,
"learning_rate": 0.00010526315789473685,
"loss": 0.6617,
"step": 90
},
{
"epoch": 1.7543859649122808,
"grad_norm": 0.29050251841545105,
"learning_rate": 0.00011695906432748539,
"loss": 0.6549,
"step": 100
},
{
"epoch": 1.7543859649122808,
"eval_accuracy": 0.82,
"eval_f1": 0.6260387811634349,
"eval_loss": 0.6288657784461975,
"eval_precision": 0.5191424196018377,
"eval_recall": 0.7883720930232558,
"eval_runtime": 2.6915,
"eval_samples_per_second": 83.597,
"eval_steps_per_second": 10.775,
"step": 100
},
{
"epoch": 1.9298245614035088,
"grad_norm": 0.3310299217700958,
"learning_rate": 0.0001286549707602339,
"loss": 0.6389,
"step": 110
},
{
"epoch": 2.1052631578947367,
"grad_norm": 0.35385212302207947,
"learning_rate": 0.00014035087719298245,
"loss": 0.6276,
"step": 120
},
{
"epoch": 2.280701754385965,
"grad_norm": 0.31887122988700867,
"learning_rate": 0.00015204678362573098,
"loss": 0.6068,
"step": 130
},
{
"epoch": 2.456140350877193,
"grad_norm": 0.38656044006347656,
"learning_rate": 0.00016374269005847952,
"loss": 0.5876,
"step": 140
},
{
"epoch": 2.6315789473684212,
"grad_norm": 0.40553656220436096,
"learning_rate": 0.00017543859649122806,
"loss": 0.5782,
"step": 150
},
{
"epoch": 2.807017543859649,
"grad_norm": 0.5055739879608154,
"learning_rate": 0.0001871345029239766,
"loss": 0.546,
"step": 160
},
{
"epoch": 2.982456140350877,
"grad_norm": 0.6473321318626404,
"learning_rate": 0.00019883040935672513,
"loss": 0.5322,
"step": 170
},
{
"epoch": 3.1578947368421053,
"grad_norm": 0.5542100667953491,
"learning_rate": 0.00019883040935672513,
"loss": 0.5081,
"step": 180
},
{
"epoch": 3.3333333333333335,
"grad_norm": 0.525965690612793,
"learning_rate": 0.00019753086419753085,
"loss": 0.4906,
"step": 190
},
{
"epoch": 3.5087719298245617,
"grad_norm": 0.6686927676200867,
"learning_rate": 0.00019623131903833657,
"loss": 0.4616,
"step": 200
},
{
"epoch": 3.5087719298245617,
"eval_accuracy": 0.8866666666666667,
"eval_f1": 0.7295864262990456,
"eval_loss": 0.41918542981147766,
"eval_precision": 0.6705653021442495,
"eval_recall": 0.8,
"eval_runtime": 2.7897,
"eval_samples_per_second": 80.654,
"eval_steps_per_second": 10.395,
"step": 200
},
{
"epoch": 3.6842105263157894,
"grad_norm": 1.5412182807922363,
"learning_rate": 0.0001949317738791423,
"loss": 0.4506,
"step": 210
},
{
"epoch": 3.8596491228070176,
"grad_norm": 0.4173012375831604,
"learning_rate": 0.00019363222871994802,
"loss": 0.4371,
"step": 220
},
{
"epoch": 4.035087719298246,
"grad_norm": 0.42248570919036865,
"learning_rate": 0.00019233268356075374,
"loss": 0.4064,
"step": 230
},
{
"epoch": 4.2105263157894735,
"grad_norm": 0.5491617918014526,
"learning_rate": 0.00019103313840155946,
"loss": 0.3724,
"step": 240
},
{
"epoch": 4.385964912280702,
"grad_norm": 0.35062703490257263,
"learning_rate": 0.00018973359324236518,
"loss": 0.3671,
"step": 250
},
{
"epoch": 4.56140350877193,
"grad_norm": 0.40491071343421936,
"learning_rate": 0.0001884340480831709,
"loss": 0.3683,
"step": 260
},
{
"epoch": 4.7368421052631575,
"grad_norm": 0.9965174794197083,
"learning_rate": 0.0001871345029239766,
"loss": 0.3402,
"step": 270
},
{
"epoch": 4.912280701754386,
"grad_norm": 0.7184051275253296,
"learning_rate": 0.00018583495776478232,
"loss": 0.3348,
"step": 280
},
{
"epoch": 5.087719298245614,
"grad_norm": 1.8915038108825684,
"learning_rate": 0.00018453541260558804,
"loss": 0.32,
"step": 290
},
{
"epoch": 5.2631578947368425,
"grad_norm": 0.5761589407920837,
"learning_rate": 0.00018323586744639376,
"loss": 0.3101,
"step": 300
},
{
"epoch": 5.2631578947368425,
"eval_accuracy": 0.9035555555555556,
"eval_f1": 0.7317676143386898,
"eval_loss": 0.30708780884742737,
"eval_precision": 0.7810026385224275,
"eval_recall": 0.6883720930232559,
"eval_runtime": 2.8811,
"eval_samples_per_second": 78.095,
"eval_steps_per_second": 10.066,
"step": 300
},
{
"epoch": 5.43859649122807,
"grad_norm": 1.1592423915863037,
"learning_rate": 0.00018193632228719948,
"loss": 0.3258,
"step": 310
},
{
"epoch": 5.614035087719298,
"grad_norm": 0.8307028412818909,
"learning_rate": 0.0001806367771280052,
"loss": 0.3149,
"step": 320
},
{
"epoch": 5.7894736842105265,
"grad_norm": 0.9469823837280273,
"learning_rate": 0.00017933723196881092,
"loss": 0.3033,
"step": 330
},
{
"epoch": 5.964912280701754,
"grad_norm": 2.199500322341919,
"learning_rate": 0.00017803768680961664,
"loss": 0.3164,
"step": 340
},
{
"epoch": 6.140350877192983,
"grad_norm": 0.6772398948669434,
"learning_rate": 0.00017673814165042236,
"loss": 0.2806,
"step": 350
},
{
"epoch": 6.315789473684211,
"grad_norm": 0.4862241744995117,
"learning_rate": 0.00017543859649122806,
"loss": 0.2817,
"step": 360
},
{
"epoch": 6.491228070175438,
"grad_norm": 1.2349482774734497,
"learning_rate": 0.00017413905133203378,
"loss": 0.288,
"step": 370
},
{
"epoch": 6.666666666666667,
"grad_norm": 2.9781813621520996,
"learning_rate": 0.00017296946068875895,
"loss": 0.3039,
"step": 380
},
{
"epoch": 6.842105263157895,
"grad_norm": 0.7632750272750854,
"learning_rate": 0.00017166991552956468,
"loss": 0.2836,
"step": 390
},
{
"epoch": 7.017543859649122,
"grad_norm": 1.2420198917388916,
"learning_rate": 0.00017037037037037037,
"loss": 0.2932,
"step": 400
},
{
"epoch": 7.017543859649122,
"eval_accuracy": 0.908,
"eval_f1": 0.7460122699386503,
"eval_loss": 0.24856920540332794,
"eval_precision": 0.7896103896103897,
"eval_recall": 0.7069767441860465,
"eval_runtime": 2.8347,
"eval_samples_per_second": 79.373,
"eval_steps_per_second": 10.23,
"step": 400
},
{
"epoch": 7.192982456140351,
"grad_norm": 0.8554529547691345,
"learning_rate": 0.0001690708252111761,
"loss": 0.2583,
"step": 410
},
{
"epoch": 7.368421052631579,
"grad_norm": 0.5736662745475769,
"learning_rate": 0.0001677712800519818,
"loss": 0.2809,
"step": 420
},
{
"epoch": 7.543859649122807,
"grad_norm": 0.7552086114883423,
"learning_rate": 0.00016647173489278753,
"loss": 0.2774,
"step": 430
},
{
"epoch": 7.719298245614035,
"grad_norm": 0.6094131469726562,
"learning_rate": 0.00016517218973359325,
"loss": 0.2771,
"step": 440
},
{
"epoch": 7.894736842105263,
"grad_norm": 0.5392113924026489,
"learning_rate": 0.00016387264457439898,
"loss": 0.2755,
"step": 450
},
{
"epoch": 8.070175438596491,
"grad_norm": 0.4927959740161896,
"learning_rate": 0.0001625730994152047,
"loss": 0.2572,
"step": 460
},
{
"epoch": 8.24561403508772,
"grad_norm": 0.9484465718269348,
"learning_rate": 0.00016127355425601042,
"loss": 0.2354,
"step": 470
},
{
"epoch": 8.421052631578947,
"grad_norm": 0.71286940574646,
"learning_rate": 0.0001599740090968161,
"loss": 0.2611,
"step": 480
},
{
"epoch": 8.596491228070175,
"grad_norm": 1.9641995429992676,
"learning_rate": 0.00015867446393762183,
"loss": 0.2547,
"step": 490
},
{
"epoch": 8.771929824561404,
"grad_norm": 1.1893583536148071,
"learning_rate": 0.00015737491877842755,
"loss": 0.2652,
"step": 500
},
{
"epoch": 8.771929824561404,
"eval_accuracy": 0.9137777777777778,
"eval_f1": 0.7673860911270983,
"eval_loss": 0.22792504727840424,
"eval_precision": 0.7920792079207921,
"eval_recall": 0.7441860465116279,
"eval_runtime": 1.8141,
"eval_samples_per_second": 124.03,
"eval_steps_per_second": 15.986,
"step": 500
},
{
"epoch": 8.947368421052632,
"grad_norm": 1.0071460008621216,
"learning_rate": 0.00015607537361923327,
"loss": 0.244,
"step": 510
},
{
"epoch": 9.12280701754386,
"grad_norm": 1.22650146484375,
"learning_rate": 0.000154775828460039,
"loss": 0.2377,
"step": 520
},
{
"epoch": 9.298245614035087,
"grad_norm": 2.428567886352539,
"learning_rate": 0.00015347628330084472,
"loss": 0.2494,
"step": 530
},
{
"epoch": 9.473684210526315,
"grad_norm": 1.8254860639572144,
"learning_rate": 0.00015217673814165044,
"loss": 0.2603,
"step": 540
},
{
"epoch": 9.649122807017545,
"grad_norm": 0.6592786908149719,
"learning_rate": 0.00015087719298245616,
"loss": 0.2597,
"step": 550
},
{
"epoch": 9.824561403508772,
"grad_norm": 1.3194756507873535,
"learning_rate": 0.00014957764782326188,
"loss": 0.2313,
"step": 560
},
{
"epoch": 10.0,
"grad_norm": 2.1871612071990967,
"learning_rate": 0.00014827810266406757,
"loss": 0.282,
"step": 570
},
{
"epoch": 10.175438596491228,
"grad_norm": 0.759860098361969,
"learning_rate": 0.0001469785575048733,
"loss": 0.2319,
"step": 580
},
{
"epoch": 10.350877192982455,
"grad_norm": 1.447387933731079,
"learning_rate": 0.00014567901234567902,
"loss": 0.2457,
"step": 590
},
{
"epoch": 10.526315789473685,
"grad_norm": 0.9954220056533813,
"learning_rate": 0.00014437946718648474,
"loss": 0.2253,
"step": 600
},
{
"epoch": 10.526315789473685,
"eval_accuracy": 0.9217777777777778,
"eval_f1": 0.7858880778588808,
"eval_loss": 0.21004962921142578,
"eval_precision": 0.8239795918367347,
"eval_recall": 0.7511627906976744,
"eval_runtime": 4.2925,
"eval_samples_per_second": 52.417,
"eval_steps_per_second": 6.756,
"step": 600
},
{
"epoch": 10.701754385964913,
"grad_norm": 0.715815544128418,
"learning_rate": 0.00014307992202729046,
"loss": 0.2391,
"step": 610
},
{
"epoch": 10.87719298245614,
"grad_norm": 0.6449007391929626,
"learning_rate": 0.00014178037686809618,
"loss": 0.2516,
"step": 620
},
{
"epoch": 11.052631578947368,
"grad_norm": 0.9613096117973328,
"learning_rate": 0.0001404808317089019,
"loss": 0.2157,
"step": 630
},
{
"epoch": 11.228070175438596,
"grad_norm": 2.206623077392578,
"learning_rate": 0.00013918128654970762,
"loss": 0.2365,
"step": 640
},
{
"epoch": 11.403508771929825,
"grad_norm": 1.8694980144500732,
"learning_rate": 0.00013788174139051334,
"loss": 0.2263,
"step": 650
},
{
"epoch": 11.578947368421053,
"grad_norm": 0.7060205340385437,
"learning_rate": 0.00013658219623131904,
"loss": 0.2173,
"step": 660
},
{
"epoch": 11.75438596491228,
"grad_norm": 0.8581671714782715,
"learning_rate": 0.00013528265107212476,
"loss": 0.2204,
"step": 670
},
{
"epoch": 11.929824561403509,
"grad_norm": 1.196590781211853,
"learning_rate": 0.00013398310591293048,
"loss": 0.2519,
"step": 680
},
{
"epoch": 12.105263157894736,
"grad_norm": 1.0726817846298218,
"learning_rate": 0.0001326835607537362,
"loss": 0.2184,
"step": 690
},
{
"epoch": 12.280701754385966,
"grad_norm": 0.6241493821144104,
"learning_rate": 0.00013138401559454192,
"loss": 0.2257,
"step": 700
},
{
"epoch": 12.280701754385966,
"eval_accuracy": 0.9248888888888889,
"eval_f1": 0.8018757327080891,
"eval_loss": 0.19510744512081146,
"eval_precision": 0.8085106382978723,
"eval_recall": 0.7953488372093023,
"eval_runtime": 2.9139,
"eval_samples_per_second": 77.217,
"eval_steps_per_second": 9.952,
"step": 700
},
{
"epoch": 12.456140350877194,
"grad_norm": 1.382541298866272,
"learning_rate": 0.00013008447043534764,
"loss": 0.217,
"step": 710
},
{
"epoch": 12.631578947368421,
"grad_norm": 0.7372106909751892,
"learning_rate": 0.00012878492527615336,
"loss": 0.2209,
"step": 720
},
{
"epoch": 12.807017543859649,
"grad_norm": 1.3437495231628418,
"learning_rate": 0.00012748538011695908,
"loss": 0.2215,
"step": 730
},
{
"epoch": 12.982456140350877,
"grad_norm": 0.8328105807304382,
"learning_rate": 0.0001261858349577648,
"loss": 0.247,
"step": 740
},
{
"epoch": 13.157894736842104,
"grad_norm": 1.166037917137146,
"learning_rate": 0.0001248862897985705,
"loss": 0.2362,
"step": 750
},
{
"epoch": 13.333333333333334,
"grad_norm": 1.8687838315963745,
"learning_rate": 0.00012358674463937622,
"loss": 0.2247,
"step": 760
},
{
"epoch": 13.508771929824562,
"grad_norm": 1.2782139778137207,
"learning_rate": 0.00012228719948018194,
"loss": 0.2134,
"step": 770
},
{
"epoch": 13.68421052631579,
"grad_norm": 1.114933967590332,
"learning_rate": 0.00012098765432098766,
"loss": 0.1965,
"step": 780
},
{
"epoch": 13.859649122807017,
"grad_norm": 1.7937145233154297,
"learning_rate": 0.00011968810916179338,
"loss": 0.2124,
"step": 790
},
{
"epoch": 14.035087719298245,
"grad_norm": 1.6698014736175537,
"learning_rate": 0.0001183885640025991,
"loss": 0.2468,
"step": 800
},
{
"epoch": 14.035087719298245,
"eval_accuracy": 0.9306666666666666,
"eval_f1": 0.8198614318706697,
"eval_loss": 0.19064003229141235,
"eval_precision": 0.8142201834862385,
"eval_recall": 0.8255813953488372,
"eval_runtime": 2.8315,
"eval_samples_per_second": 79.464,
"eval_steps_per_second": 10.242,
"step": 800
},
{
"epoch": 14.210526315789474,
"grad_norm": 0.6950424313545227,
"learning_rate": 0.00011708901884340481,
"loss": 0.2004,
"step": 810
},
{
"epoch": 14.385964912280702,
"grad_norm": 1.5043634176254272,
"learning_rate": 0.00011578947368421053,
"loss": 0.2317,
"step": 820
},
{
"epoch": 14.56140350877193,
"grad_norm": 1.2491843700408936,
"learning_rate": 0.00011448992852501626,
"loss": 0.2027,
"step": 830
},
{
"epoch": 14.736842105263158,
"grad_norm": 0.6502349376678467,
"learning_rate": 0.00011319038336582198,
"loss": 0.2112,
"step": 840
},
{
"epoch": 14.912280701754385,
"grad_norm": 0.40061789751052856,
"learning_rate": 0.0001118908382066277,
"loss": 0.1756,
"step": 850
},
{
"epoch": 15.087719298245615,
"grad_norm": 2.8378994464874268,
"learning_rate": 0.0001105912930474334,
"loss": 0.2216,
"step": 860
},
{
"epoch": 15.263157894736842,
"grad_norm": 1.7187498807907104,
"learning_rate": 0.00010929174788823913,
"loss": 0.2072,
"step": 870
},
{
"epoch": 15.43859649122807,
"grad_norm": 1.774376392364502,
"learning_rate": 0.00010799220272904485,
"loss": 0.239,
"step": 880
},
{
"epoch": 15.614035087719298,
"grad_norm": 1.5812989473342896,
"learning_rate": 0.00010669265756985057,
"loss": 0.2191,
"step": 890
},
{
"epoch": 15.789473684210526,
"grad_norm": 0.9877386689186096,
"learning_rate": 0.00010539311241065628,
"loss": 0.1796,
"step": 900
},
{
"epoch": 15.789473684210526,
"eval_accuracy": 0.9275555555555556,
"eval_f1": 0.81199538638985,
"eval_loss": 0.19485591351985931,
"eval_precision": 0.8054919908466819,
"eval_recall": 0.8186046511627907,
"eval_runtime": 1.8216,
"eval_samples_per_second": 123.517,
"eval_steps_per_second": 15.92,
"step": 900
},
{
"epoch": 15.964912280701755,
"grad_norm": 1.059669017791748,
"learning_rate": 0.000104093567251462,
"loss": 0.1838,
"step": 910
},
{
"epoch": 16.140350877192983,
"grad_norm": 1.4218086004257202,
"learning_rate": 0.00010279402209226772,
"loss": 0.2281,
"step": 920
},
{
"epoch": 16.31578947368421,
"grad_norm": 1.2070213556289673,
"learning_rate": 0.00010149447693307344,
"loss": 0.1997,
"step": 930
},
{
"epoch": 16.49122807017544,
"grad_norm": 2.351250410079956,
"learning_rate": 0.00010019493177387915,
"loss": 0.1843,
"step": 940
},
{
"epoch": 16.666666666666668,
"grad_norm": 0.8852570056915283,
"learning_rate": 9.889538661468485e-05,
"loss": 0.2357,
"step": 950
},
{
"epoch": 16.842105263157894,
"grad_norm": 2.0466091632843018,
"learning_rate": 9.759584145549058e-05,
"loss": 0.2277,
"step": 960
},
{
"epoch": 17.017543859649123,
"grad_norm": 5.798379898071289,
"learning_rate": 9.62962962962963e-05,
"loss": 0.2246,
"step": 970
},
{
"epoch": 17.19298245614035,
"grad_norm": 1.6754958629608154,
"learning_rate": 9.499675113710202e-05,
"loss": 0.1904,
"step": 980
},
{
"epoch": 17.36842105263158,
"grad_norm": 0.6962611675262451,
"learning_rate": 9.369720597790773e-05,
"loss": 0.202,
"step": 990
},
{
"epoch": 17.54385964912281,
"grad_norm": 0.5351881384849548,
"learning_rate": 9.239766081871345e-05,
"loss": 0.1888,
"step": 1000
},
{
"epoch": 17.54385964912281,
"eval_accuracy": 0.9306666666666666,
"eval_f1": 0.8177570093457944,
"eval_loss": 0.18066002428531647,
"eval_precision": 0.8215962441314554,
"eval_recall": 0.813953488372093,
"eval_runtime": 1.8596,
"eval_samples_per_second": 120.996,
"eval_steps_per_second": 15.595,
"step": 1000
},
{
"epoch": 17.719298245614034,
"grad_norm": 1.2162110805511475,
"learning_rate": 9.109811565951917e-05,
"loss": 0.1789,
"step": 1010
},
{
"epoch": 17.894736842105264,
"grad_norm": 1.2040334939956665,
"learning_rate": 8.979857050032489e-05,
"loss": 0.2109,
"step": 1020
},
{
"epoch": 18.07017543859649,
"grad_norm": 0.8599823117256165,
"learning_rate": 8.849902534113061e-05,
"loss": 0.2113,
"step": 1030
},
{
"epoch": 18.24561403508772,
"grad_norm": 1.0291296243667603,
"learning_rate": 8.719948018193632e-05,
"loss": 0.1981,
"step": 1040
},
{
"epoch": 18.42105263157895,
"grad_norm": 3.214996576309204,
"learning_rate": 8.589993502274204e-05,
"loss": 0.1903,
"step": 1050
},
{
"epoch": 18.596491228070175,
"grad_norm": 1.1698780059814453,
"learning_rate": 8.460038986354776e-05,
"loss": 0.192,
"step": 1060
},
{
"epoch": 18.771929824561404,
"grad_norm": 3.0040793418884277,
"learning_rate": 8.330084470435348e-05,
"loss": 0.2062,
"step": 1070
},
{
"epoch": 18.94736842105263,
"grad_norm": 1.365694522857666,
"learning_rate": 8.200129954515919e-05,
"loss": 0.1885,
"step": 1080
},
{
"epoch": 19.12280701754386,
"grad_norm": 0.5183665156364441,
"learning_rate": 8.070175438596491e-05,
"loss": 0.2089,
"step": 1090
},
{
"epoch": 19.29824561403509,
"grad_norm": 0.6474595069885254,
"learning_rate": 7.940220922677063e-05,
"loss": 0.202,
"step": 1100
},
{
"epoch": 19.29824561403509,
"eval_accuracy": 0.9342222222222222,
"eval_f1": 0.8287037037037037,
"eval_loss": 0.1772110015153885,
"eval_precision": 0.8248847926267281,
"eval_recall": 0.8325581395348837,
"eval_runtime": 2.2017,
"eval_samples_per_second": 102.193,
"eval_steps_per_second": 13.171,
"step": 1100
},
{
"epoch": 19.473684210526315,
"grad_norm": 0.7569323778152466,
"learning_rate": 7.810266406757635e-05,
"loss": 0.2037,
"step": 1110
},
{
"epoch": 19.649122807017545,
"grad_norm": 1.068310260772705,
"learning_rate": 7.680311890838207e-05,
"loss": 0.1842,
"step": 1120
},
{
"epoch": 19.82456140350877,
"grad_norm": 1.1388903856277466,
"learning_rate": 7.550357374918778e-05,
"loss": 0.2057,
"step": 1130
},
{
"epoch": 20.0,
"grad_norm": 5.906609535217285,
"learning_rate": 7.42040285899935e-05,
"loss": 0.2088,
"step": 1140
},
{
"epoch": 20.17543859649123,
"grad_norm": 0.9702988862991333,
"learning_rate": 7.290448343079922e-05,
"loss": 0.1939,
"step": 1150
},
{
"epoch": 20.350877192982455,
"grad_norm": 3.627027988433838,
"learning_rate": 7.160493827160494e-05,
"loss": 0.1953,
"step": 1160
},
{
"epoch": 20.526315789473685,
"grad_norm": 1.11257004737854,
"learning_rate": 7.030539311241065e-05,
"loss": 0.1908,
"step": 1170
},
{
"epoch": 20.70175438596491,
"grad_norm": 1.626079797744751,
"learning_rate": 6.900584795321637e-05,
"loss": 0.2011,
"step": 1180
},
{
"epoch": 20.87719298245614,
"grad_norm": 1.8711522817611694,
"learning_rate": 6.770630279402209e-05,
"loss": 0.2106,
"step": 1190
},
{
"epoch": 21.05263157894737,
"grad_norm": 2.9188010692596436,
"learning_rate": 6.640675763482781e-05,
"loss": 0.1824,
"step": 1200
},
{
"epoch": 21.05263157894737,
"eval_accuracy": 0.9275555555555556,
"eval_f1": 0.8080094228504122,
"eval_loss": 0.18258829414844513,
"eval_precision": 0.8186157517899761,
"eval_recall": 0.7976744186046512,
"eval_runtime": 1.8368,
"eval_samples_per_second": 122.496,
"eval_steps_per_second": 15.788,
"step": 1200
},
{
"epoch": 21.228070175438596,
"grad_norm": 1.2156211137771606,
"learning_rate": 6.510721247563352e-05,
"loss": 0.1926,
"step": 1210
},
{
"epoch": 21.403508771929825,
"grad_norm": 0.5184522271156311,
"learning_rate": 6.380766731643924e-05,
"loss": 0.1695,
"step": 1220
},
{
"epoch": 21.57894736842105,
"grad_norm": 1.8020312786102295,
"learning_rate": 6.250812215724496e-05,
"loss": 0.2222,
"step": 1230
},
{
"epoch": 21.75438596491228,
"grad_norm": 1.808860421180725,
"learning_rate": 6.120857699805068e-05,
"loss": 0.2026,
"step": 1240
},
{
"epoch": 21.92982456140351,
"grad_norm": 0.5891908407211304,
"learning_rate": 5.99090318388564e-05,
"loss": 0.1861,
"step": 1250
},
{
"epoch": 22.105263157894736,
"grad_norm": 0.7829120755195618,
"learning_rate": 5.860948667966212e-05,
"loss": 0.1911,
"step": 1260
},
{
"epoch": 22.280701754385966,
"grad_norm": 0.8304038643836975,
"learning_rate": 5.7309941520467835e-05,
"loss": 0.1595,
"step": 1270
},
{
"epoch": 22.45614035087719,
"grad_norm": 0.9477715492248535,
"learning_rate": 5.6010396361273556e-05,
"loss": 0.2207,
"step": 1280
},
{
"epoch": 22.63157894736842,
"grad_norm": 1.6679517030715942,
"learning_rate": 5.471085120207927e-05,
"loss": 0.1885,
"step": 1290
},
{
"epoch": 22.80701754385965,
"grad_norm": 1.945037603378296,
"learning_rate": 5.341130604288499e-05,
"loss": 0.1808,
"step": 1300
},
{
"epoch": 22.80701754385965,
"eval_accuracy": 0.9346666666666666,
"eval_f1": 0.8296639629200464,
"eval_loss": 0.16815528273582458,
"eval_precision": 0.8267898383371824,
"eval_recall": 0.8325581395348837,
"eval_runtime": 2.4881,
"eval_samples_per_second": 90.429,
"eval_steps_per_second": 11.655,
"step": 1300
},
{
"epoch": 22.982456140350877,
"grad_norm": 0.7030972242355347,
"learning_rate": 5.2111760883690706e-05,
"loss": 0.1717,
"step": 1310
},
{
"epoch": 23.157894736842106,
"grad_norm": 1.0622111558914185,
"learning_rate": 5.081221572449643e-05,
"loss": 0.2162,
"step": 1320
},
{
"epoch": 23.333333333333332,
"grad_norm": 1.3687249422073364,
"learning_rate": 4.951267056530214e-05,
"loss": 0.197,
"step": 1330
},
{
"epoch": 23.50877192982456,
"grad_norm": 1.218827724456787,
"learning_rate": 4.821312540610786e-05,
"loss": 0.1811,
"step": 1340
},
{
"epoch": 23.68421052631579,
"grad_norm": 3.9379024505615234,
"learning_rate": 4.691358024691358e-05,
"loss": 0.1896,
"step": 1350
},
{
"epoch": 23.859649122807017,
"grad_norm": 0.9299766421318054,
"learning_rate": 4.56140350877193e-05,
"loss": 0.1663,
"step": 1360
},
{
"epoch": 24.035087719298247,
"grad_norm": 4.373446941375732,
"learning_rate": 4.431448992852502e-05,
"loss": 0.1883,
"step": 1370
},
{
"epoch": 24.210526315789473,
"grad_norm": 1.0416285991668701,
"learning_rate": 4.301494476933073e-05,
"loss": 0.1884,
"step": 1380
},
{
"epoch": 24.385964912280702,
"grad_norm": 1.9816950559616089,
"learning_rate": 4.1715399610136454e-05,
"loss": 0.1949,
"step": 1390
},
{
"epoch": 24.56140350877193,
"grad_norm": 1.6888455152511597,
"learning_rate": 4.041585445094217e-05,
"loss": 0.1792,
"step": 1400
},
{
"epoch": 24.56140350877193,
"eval_accuracy": 0.9364444444444444,
"eval_f1": 0.8323563892145369,
"eval_loss": 0.16882646083831787,
"eval_precision": 0.8392434988179669,
"eval_recall": 0.8255813953488372,
"eval_runtime": 1.8209,
"eval_samples_per_second": 123.562,
"eval_steps_per_second": 15.926,
"step": 1400
},
{
"epoch": 24.736842105263158,
"grad_norm": 0.6522326469421387,
"learning_rate": 3.911630929174789e-05,
"loss": 0.149,
"step": 1410
},
{
"epoch": 24.912280701754387,
"grad_norm": 1.053612470626831,
"learning_rate": 3.7816764132553604e-05,
"loss": 0.1983,
"step": 1420
},
{
"epoch": 25.087719298245613,
"grad_norm": 1.025525689125061,
"learning_rate": 3.664717348927875e-05,
"loss": 0.1973,
"step": 1430
},
{
"epoch": 25.263157894736842,
"grad_norm": 2.1537649631500244,
"learning_rate": 3.534762833008447e-05,
"loss": 0.1797,
"step": 1440
},
{
"epoch": 25.43859649122807,
"grad_norm": 2.6327617168426514,
"learning_rate": 3.404808317089019e-05,
"loss": 0.1693,
"step": 1450
},
{
"epoch": 25.614035087719298,
"grad_norm": 1.1369807720184326,
"learning_rate": 3.274853801169591e-05,
"loss": 0.1826,
"step": 1460
},
{
"epoch": 25.789473684210527,
"grad_norm": 2.0842247009277344,
"learning_rate": 3.1448992852501624e-05,
"loss": 0.1778,
"step": 1470
},
{
"epoch": 25.964912280701753,
"grad_norm": 0.8993640542030334,
"learning_rate": 3.014944769330734e-05,
"loss": 0.1688,
"step": 1480
},
{
"epoch": 26.140350877192983,
"grad_norm": 0.9640088677406311,
"learning_rate": 2.8979857050032487e-05,
"loss": 0.2139,
"step": 1490
},
{
"epoch": 26.31578947368421,
"grad_norm": 1.134974718093872,
"learning_rate": 2.7680311890838205e-05,
"loss": 0.1852,
"step": 1500
},
{
"epoch": 26.31578947368421,
"eval_accuracy": 0.9337777777777778,
"eval_f1": 0.826945412311266,
"eval_loss": 0.1725098043680191,
"eval_precision": 0.8259860788863109,
"eval_recall": 0.827906976744186,
"eval_runtime": 1.8397,
"eval_samples_per_second": 122.305,
"eval_steps_per_second": 15.764,
"step": 1500
},
{
"epoch": 26.49122807017544,
"grad_norm": 0.3995600640773773,
"learning_rate": 2.6380766731643926e-05,
"loss": 0.1703,
"step": 1510
},
{
"epoch": 26.666666666666668,
"grad_norm": 1.8065487146377563,
"learning_rate": 2.5081221572449644e-05,
"loss": 0.2017,
"step": 1520
},
{
"epoch": 26.842105263157894,
"grad_norm": 2.3725926876068115,
"learning_rate": 2.378167641325536e-05,
"loss": 0.1926,
"step": 1530
},
{
"epoch": 27.017543859649123,
"grad_norm": 1.9128490686416626,
"learning_rate": 2.248213125406108e-05,
"loss": 0.1771,
"step": 1540
},
{
"epoch": 27.19298245614035,
"grad_norm": 1.2254141569137573,
"learning_rate": 2.1182586094866797e-05,
"loss": 0.1791,
"step": 1550
},
{
"epoch": 27.36842105263158,
"grad_norm": 1.3266674280166626,
"learning_rate": 1.9883040935672515e-05,
"loss": 0.1671,
"step": 1560
},
{
"epoch": 27.54385964912281,
"grad_norm": 1.2818776369094849,
"learning_rate": 1.8583495776478232e-05,
"loss": 0.17,
"step": 1570
},
{
"epoch": 27.719298245614034,
"grad_norm": 1.0659555196762085,
"learning_rate": 1.728395061728395e-05,
"loss": 0.1712,
"step": 1580
},
{
"epoch": 27.894736842105264,
"grad_norm": 1.0451716184616089,
"learning_rate": 1.5984405458089668e-05,
"loss": 0.1854,
"step": 1590
},
{
"epoch": 28.07017543859649,
"grad_norm": 2.3844401836395264,
"learning_rate": 1.4684860298895387e-05,
"loss": 0.177,
"step": 1600
},
{
"epoch": 28.07017543859649,
"eval_accuracy": 0.9351111111111111,
"eval_f1": 0.8282352941176471,
"eval_loss": 0.16903221607208252,
"eval_precision": 0.8380952380952381,
"eval_recall": 0.8186046511627907,
"eval_runtime": 2.2907,
"eval_samples_per_second": 98.225,
"eval_steps_per_second": 12.66,
"step": 1600
},
{
"epoch": 28.24561403508772,
"grad_norm": 1.8458149433135986,
"learning_rate": 1.3385315139701105e-05,
"loss": 0.2091,
"step": 1610
},
{
"epoch": 28.42105263157895,
"grad_norm": 0.7621822953224182,
"learning_rate": 1.2085769980506823e-05,
"loss": 0.1626,
"step": 1620
},
{
"epoch": 28.596491228070175,
"grad_norm": 0.9533030986785889,
"learning_rate": 1.078622482131254e-05,
"loss": 0.1872,
"step": 1630
},
{
"epoch": 28.771929824561404,
"grad_norm": 1.495856761932373,
"learning_rate": 9.486679662118258e-06,
"loss": 0.1816,
"step": 1640
},
{
"epoch": 28.94736842105263,
"grad_norm": 1.397376537322998,
"learning_rate": 8.187134502923977e-06,
"loss": 0.1821,
"step": 1650
},
{
"epoch": 29.12280701754386,
"grad_norm": 2.07928729057312,
"learning_rate": 6.887589343729694e-06,
"loss": 0.1801,
"step": 1660
},
{
"epoch": 29.29824561403509,
"grad_norm": 1.2872428894042969,
"learning_rate": 5.588044184535413e-06,
"loss": 0.1835,
"step": 1670
},
{
"epoch": 29.473684210526315,
"grad_norm": 0.40397679805755615,
"learning_rate": 4.2884990253411305e-06,
"loss": 0.1597,
"step": 1680
},
{
"epoch": 29.649122807017545,
"grad_norm": 1.12138032913208,
"learning_rate": 2.9889538661468487e-06,
"loss": 0.1771,
"step": 1690
},
{
"epoch": 29.82456140350877,
"grad_norm": 1.8918460607528687,
"learning_rate": 1.6894087069525666e-06,
"loss": 0.1857,
"step": 1700
},
{
"epoch": 29.82456140350877,
"eval_accuracy": 0.9297777777777778,
"eval_f1": 0.8175519630484989,
"eval_loss": 0.17081834375858307,
"eval_precision": 0.8119266055045872,
"eval_recall": 0.8232558139534883,
"eval_runtime": 1.7795,
"eval_samples_per_second": 126.443,
"eval_steps_per_second": 16.297,
"step": 1700
},
{
"epoch": 30.0,
"grad_norm": 1.7220489978790283,
"learning_rate": 3.898635477582846e-07,
"loss": 0.166,
"step": 1710
},
{
"epoch": 30.0,
"step": 1710,
"total_flos": 3.8465920659456e+16,
"train_loss": 0.2702594916025797,
"train_runtime": 403.8981,
"train_samples_per_second": 66.849,
"train_steps_per_second": 4.234
}
],
"logging_steps": 10,
"max_steps": 1710,
"num_input_tokens_seen": 0,
"num_train_epochs": 30,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.8465920659456e+16,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}