all-observation-type / trainer_state.json
{
"best_metric": 0.007714101579040289,
"best_model_checkpoint": "./all-observation-type/checkpoint-2500",
"epoch": 40.0,
"eval_steps": 100,
"global_step": 3440,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.11627906976744186,
"grad_norm": 0.09591829776763916,
"learning_rate": 0.0001994186046511628,
"loss": 0.6339,
"step": 10
},
{
"epoch": 0.23255813953488372,
"grad_norm": 0.09048111736774445,
"learning_rate": 0.0001988372093023256,
"loss": 0.5034,
"step": 20
},
{
"epoch": 0.3488372093023256,
"grad_norm": 0.08638294041156769,
"learning_rate": 0.00019825581395348837,
"loss": 0.3894,
"step": 30
},
{
"epoch": 0.46511627906976744,
"grad_norm": 0.07995904237031937,
"learning_rate": 0.00019767441860465116,
"loss": 0.2951,
"step": 40
},
{
"epoch": 0.5813953488372093,
"grad_norm": 0.07135872542858124,
"learning_rate": 0.00019709302325581396,
"loss": 0.223,
"step": 50
},
{
"epoch": 0.6976744186046512,
"grad_norm": 0.0624060332775116,
"learning_rate": 0.00019651162790697676,
"loss": 0.1703,
"step": 60
},
{
"epoch": 0.813953488372093,
"grad_norm": 0.05441118776798248,
"learning_rate": 0.00019593023255813952,
"loss": 0.133,
"step": 70
},
{
"epoch": 0.9302325581395349,
"grad_norm": 0.04721858724951744,
"learning_rate": 0.00019534883720930232,
"loss": 0.1057,
"step": 80
},
{
"epoch": 1.0465116279069768,
"grad_norm": 0.041243478655815125,
"learning_rate": 0.00019476744186046511,
"loss": 0.0866,
"step": 90
},
{
"epoch": 1.1627906976744187,
"grad_norm": 0.036053091287612915,
"learning_rate": 0.0001941860465116279,
"loss": 0.0726,
"step": 100
},
{
"epoch": 1.1627906976744187,
"eval_f1": 0.0,
"eval_loss": 0.06596538424491882,
"eval_runtime": 1.8015,
"eval_samples_per_second": 40.522,
"eval_steps_per_second": 5.551,
"step": 100
},
{
"epoch": 1.2790697674418605,
"grad_norm": 0.03166024759411812,
"learning_rate": 0.0001936046511627907,
"loss": 0.0617,
"step": 110
},
{
"epoch": 1.3953488372093024,
"grad_norm": 0.0280374176800251,
"learning_rate": 0.0001930232558139535,
"loss": 0.0537,
"step": 120
},
{
"epoch": 1.5116279069767442,
"grad_norm": 0.025035331025719643,
"learning_rate": 0.0001924418604651163,
"loss": 0.0469,
"step": 130
},
{
"epoch": 1.627906976744186,
"grad_norm": 0.02311284840106964,
"learning_rate": 0.0001918604651162791,
"loss": 0.042,
"step": 140
},
{
"epoch": 1.744186046511628,
"grad_norm": 0.020456839352846146,
"learning_rate": 0.0001912790697674419,
"loss": 0.0383,
"step": 150
},
{
"epoch": 1.8604651162790697,
"grad_norm": 0.01917686127126217,
"learning_rate": 0.00019069767441860466,
"loss": 0.034,
"step": 160
},
{
"epoch": 1.9767441860465116,
"grad_norm": 0.017376938834786415,
"learning_rate": 0.00019011627906976745,
"loss": 0.0314,
"step": 170
},
{
"epoch": 2.0930232558139537,
"grad_norm": 0.01589462347328663,
"learning_rate": 0.00018953488372093025,
"loss": 0.0292,
"step": 180
},
{
"epoch": 2.2093023255813953,
"grad_norm": 0.014865094795823097,
"learning_rate": 0.00018895348837209304,
"loss": 0.0274,
"step": 190
},
{
"epoch": 2.3255813953488373,
"grad_norm": 0.013694318942725658,
"learning_rate": 0.00018837209302325584,
"loss": 0.0264,
"step": 200
},
{
"epoch": 2.3255813953488373,
"eval_f1": 0.0,
"eval_loss": 0.02468988113105297,
"eval_runtime": 1.6978,
"eval_samples_per_second": 42.996,
"eval_steps_per_second": 5.89,
"step": 200
},
{
"epoch": 2.441860465116279,
"grad_norm": 0.012708564288914204,
"learning_rate": 0.0001877906976744186,
"loss": 0.0244,
"step": 210
},
{
"epoch": 2.558139534883721,
"grad_norm": 0.011777268722653389,
"learning_rate": 0.0001872093023255814,
"loss": 0.023,
"step": 220
},
{
"epoch": 2.6744186046511627,
"grad_norm": 0.011043795384466648,
"learning_rate": 0.0001866279069767442,
"loss": 0.0212,
"step": 230
},
{
"epoch": 2.7906976744186047,
"grad_norm": 0.010540899820625782,
"learning_rate": 0.000186046511627907,
"loss": 0.0206,
"step": 240
},
{
"epoch": 2.9069767441860463,
"grad_norm": 0.0098367715254426,
"learning_rate": 0.00018546511627906976,
"loss": 0.0192,
"step": 250
},
{
"epoch": 3.0232558139534884,
"grad_norm": 0.009457371197640896,
"learning_rate": 0.00018488372093023256,
"loss": 0.0186,
"step": 260
},
{
"epoch": 3.13953488372093,
"grad_norm": 0.0089762257412076,
"learning_rate": 0.00018430232558139535,
"loss": 0.0182,
"step": 270
},
{
"epoch": 3.255813953488372,
"grad_norm": 0.00830694381147623,
"learning_rate": 0.00018372093023255815,
"loss": 0.0177,
"step": 280
},
{
"epoch": 3.3720930232558137,
"grad_norm": 0.008157115429639816,
"learning_rate": 0.00018313953488372094,
"loss": 0.0171,
"step": 290
},
{
"epoch": 3.488372093023256,
"grad_norm": 0.007526726461946964,
"learning_rate": 0.0001825581395348837,
"loss": 0.0161,
"step": 300
},
{
"epoch": 3.488372093023256,
"eval_f1": 0.0,
"eval_loss": 0.016456665471196175,
"eval_runtime": 1.81,
"eval_samples_per_second": 40.33,
"eval_steps_per_second": 5.525,
"step": 300
},
{
"epoch": 3.604651162790698,
"grad_norm": 0.006970840971916914,
"learning_rate": 0.0001819767441860465,
"loss": 0.0171,
"step": 310
},
{
"epoch": 3.7209302325581395,
"grad_norm": 0.006966202985495329,
"learning_rate": 0.0001813953488372093,
"loss": 0.0157,
"step": 320
},
{
"epoch": 3.8372093023255816,
"grad_norm": 0.006569746416062117,
"learning_rate": 0.00018081395348837212,
"loss": 0.0155,
"step": 330
},
{
"epoch": 3.953488372093023,
"grad_norm": 0.006560744717717171,
"learning_rate": 0.0001802325581395349,
"loss": 0.0154,
"step": 340
},
{
"epoch": 4.069767441860465,
"grad_norm": 0.006836111657321453,
"learning_rate": 0.0001796511627906977,
"loss": 0.0144,
"step": 350
},
{
"epoch": 4.186046511627907,
"grad_norm": 0.005944866221398115,
"learning_rate": 0.00017906976744186048,
"loss": 0.0144,
"step": 360
},
{
"epoch": 4.3023255813953485,
"grad_norm": 0.00594041682779789,
"learning_rate": 0.00017848837209302328,
"loss": 0.0142,
"step": 370
},
{
"epoch": 4.4186046511627906,
"grad_norm": 0.005508134141564369,
"learning_rate": 0.00017790697674418605,
"loss": 0.0136,
"step": 380
},
{
"epoch": 4.534883720930233,
"grad_norm": 0.005419102031737566,
"learning_rate": 0.00017732558139534884,
"loss": 0.0135,
"step": 390
},
{
"epoch": 4.651162790697675,
"grad_norm": 0.005490223411470652,
"learning_rate": 0.00017674418604651164,
"loss": 0.0133,
"step": 400
},
{
"epoch": 4.651162790697675,
"eval_f1": 0.0,
"eval_loss": 0.013475539162755013,
"eval_runtime": 1.7409,
"eval_samples_per_second": 41.933,
"eval_steps_per_second": 5.744,
"step": 400
},
{
"epoch": 4.767441860465116,
"grad_norm": 0.005142460577189922,
"learning_rate": 0.00017616279069767443,
"loss": 0.0132,
"step": 410
},
{
"epoch": 4.883720930232558,
"grad_norm": 0.00514647364616394,
"learning_rate": 0.00017558139534883723,
"loss": 0.0137,
"step": 420
},
{
"epoch": 5.0,
"grad_norm": 0.005101104732602835,
"learning_rate": 0.000175,
"loss": 0.0135,
"step": 430
},
{
"epoch": 5.116279069767442,
"grad_norm": 0.005293596535921097,
"learning_rate": 0.0001744186046511628,
"loss": 0.0122,
"step": 440
},
{
"epoch": 5.232558139534884,
"grad_norm": 0.004600506741553545,
"learning_rate": 0.0001738372093023256,
"loss": 0.0123,
"step": 450
},
{
"epoch": 5.348837209302325,
"grad_norm": 0.005292165093123913,
"learning_rate": 0.00017325581395348838,
"loss": 0.0125,
"step": 460
},
{
"epoch": 5.465116279069767,
"grad_norm": 0.004837548825889826,
"learning_rate": 0.00017267441860465118,
"loss": 0.012,
"step": 470
},
{
"epoch": 5.5813953488372094,
"grad_norm": 0.004367106128484011,
"learning_rate": 0.00017209302325581395,
"loss": 0.0127,
"step": 480
},
{
"epoch": 5.6976744186046515,
"grad_norm": 0.004551732446998358,
"learning_rate": 0.00017151162790697674,
"loss": 0.0123,
"step": 490
},
{
"epoch": 5.813953488372093,
"grad_norm": 0.004141143057495356,
"learning_rate": 0.00017093023255813954,
"loss": 0.0124,
"step": 500
},
{
"epoch": 5.813953488372093,
"eval_f1": 0.0,
"eval_loss": 0.012049774639308453,
"eval_runtime": 1.8113,
"eval_samples_per_second": 40.303,
"eval_steps_per_second": 5.521,
"step": 500
},
{
"epoch": 5.930232558139535,
"grad_norm": 0.0041281660087406635,
"learning_rate": 0.00017034883720930233,
"loss": 0.0123,
"step": 510
},
{
"epoch": 6.046511627906977,
"grad_norm": 0.004327772185206413,
"learning_rate": 0.0001697674418604651,
"loss": 0.0114,
"step": 520
},
{
"epoch": 6.162790697674419,
"grad_norm": 0.00425134040415287,
"learning_rate": 0.0001691860465116279,
"loss": 0.0108,
"step": 530
},
{
"epoch": 6.27906976744186,
"grad_norm": 0.004197305999696255,
"learning_rate": 0.00016860465116279072,
"loss": 0.0119,
"step": 540
},
{
"epoch": 6.395348837209302,
"grad_norm": 0.004064254928380251,
"learning_rate": 0.00016802325581395352,
"loss": 0.0117,
"step": 550
},
{
"epoch": 6.511627906976744,
"grad_norm": 0.004467037972062826,
"learning_rate": 0.00016744186046511629,
"loss": 0.012,
"step": 560
},
{
"epoch": 6.627906976744186,
"grad_norm": 0.004210630431771278,
"learning_rate": 0.00016686046511627908,
"loss": 0.0116,
"step": 570
},
{
"epoch": 6.7441860465116275,
"grad_norm": 0.003937269560992718,
"learning_rate": 0.00016627906976744188,
"loss": 0.0116,
"step": 580
},
{
"epoch": 6.8604651162790695,
"grad_norm": 0.003829133929684758,
"learning_rate": 0.00016569767441860467,
"loss": 0.0114,
"step": 590
},
{
"epoch": 6.976744186046512,
"grad_norm": 0.0037258623633533716,
"learning_rate": 0.00016511627906976747,
"loss": 0.011,
"step": 600
},
{
"epoch": 6.976744186046512,
"eval_f1": 0.0,
"eval_loss": 0.011244768276810646,
"eval_runtime": 1.6726,
"eval_samples_per_second": 43.646,
"eval_steps_per_second": 5.979,
"step": 600
},
{
"epoch": 7.093023255813954,
"grad_norm": 0.0038892878219485283,
"learning_rate": 0.00016453488372093024,
"loss": 0.0108,
"step": 610
},
{
"epoch": 7.209302325581396,
"grad_norm": 0.003730091731995344,
"learning_rate": 0.00016395348837209303,
"loss": 0.0108,
"step": 620
},
{
"epoch": 7.325581395348837,
"grad_norm": 0.0035813930444419384,
"learning_rate": 0.00016337209302325583,
"loss": 0.0113,
"step": 630
},
{
"epoch": 7.441860465116279,
"grad_norm": 0.004092409275472164,
"learning_rate": 0.00016279069767441862,
"loss": 0.0105,
"step": 640
},
{
"epoch": 7.558139534883721,
"grad_norm": 0.004152482841163874,
"learning_rate": 0.0001622093023255814,
"loss": 0.0107,
"step": 650
},
{
"epoch": 7.674418604651163,
"grad_norm": 0.0035718800500035286,
"learning_rate": 0.00016162790697674419,
"loss": 0.0106,
"step": 660
},
{
"epoch": 7.790697674418604,
"grad_norm": 0.004181632772088051,
"learning_rate": 0.00016104651162790698,
"loss": 0.0112,
"step": 670
},
{
"epoch": 7.906976744186046,
"grad_norm": 0.003530286019667983,
"learning_rate": 0.00016046511627906978,
"loss": 0.0112,
"step": 680
},
{
"epoch": 8.023255813953488,
"grad_norm": 0.003953536041080952,
"learning_rate": 0.00015988372093023257,
"loss": 0.0107,
"step": 690
},
{
"epoch": 8.13953488372093,
"grad_norm": 0.00364445592276752,
"learning_rate": 0.00015930232558139534,
"loss": 0.0114,
"step": 700
},
{
"epoch": 8.13953488372093,
"eval_f1": 0.0,
"eval_loss": 0.010723751038312912,
"eval_runtime": 1.705,
"eval_samples_per_second": 42.815,
"eval_steps_per_second": 5.865,
"step": 700
},
{
"epoch": 8.255813953488373,
"grad_norm": 0.004072926007211208,
"learning_rate": 0.00015872093023255814,
"loss": 0.0105,
"step": 710
},
{
"epoch": 8.372093023255815,
"grad_norm": 0.003972381353378296,
"learning_rate": 0.00015813953488372093,
"loss": 0.0103,
"step": 720
},
{
"epoch": 8.488372093023255,
"grad_norm": 0.003728296374902129,
"learning_rate": 0.00015755813953488373,
"loss": 0.0107,
"step": 730
},
{
"epoch": 8.604651162790697,
"grad_norm": 0.0036759586073458195,
"learning_rate": 0.00015697674418604652,
"loss": 0.01,
"step": 740
},
{
"epoch": 8.720930232558139,
"grad_norm": 0.0035114786587655544,
"learning_rate": 0.0001563953488372093,
"loss": 0.0105,
"step": 750
},
{
"epoch": 8.837209302325581,
"grad_norm": 0.0033980573061853647,
"learning_rate": 0.0001558139534883721,
"loss": 0.0102,
"step": 760
},
{
"epoch": 8.953488372093023,
"grad_norm": 0.0035892720334231853,
"learning_rate": 0.0001552325581395349,
"loss": 0.0099,
"step": 770
},
{
"epoch": 9.069767441860465,
"grad_norm": 0.0031818547286093235,
"learning_rate": 0.00015465116279069768,
"loss": 0.0098,
"step": 780
},
{
"epoch": 9.186046511627907,
"grad_norm": 0.003511944320052862,
"learning_rate": 0.00015406976744186047,
"loss": 0.0101,
"step": 790
},
{
"epoch": 9.30232558139535,
"grad_norm": 0.0032857146579772234,
"learning_rate": 0.00015348837209302327,
"loss": 0.0109,
"step": 800
},
{
"epoch": 9.30232558139535,
"eval_f1": 0.0,
"eval_loss": 0.010318118147552013,
"eval_runtime": 1.6682,
"eval_samples_per_second": 43.76,
"eval_steps_per_second": 5.995,
"step": 800
},
{
"epoch": 9.418604651162791,
"grad_norm": 0.004180931951850653,
"learning_rate": 0.00015290697674418606,
"loss": 0.0104,
"step": 810
},
{
"epoch": 9.534883720930232,
"grad_norm": 0.0035553001798689365,
"learning_rate": 0.00015232558139534886,
"loss": 0.0107,
"step": 820
},
{
"epoch": 9.651162790697674,
"grad_norm": 0.004481980111449957,
"learning_rate": 0.00015174418604651163,
"loss": 0.0098,
"step": 830
},
{
"epoch": 9.767441860465116,
"grad_norm": 0.004224750213325024,
"learning_rate": 0.00015116279069767442,
"loss": 0.0102,
"step": 840
},
{
"epoch": 9.883720930232558,
"grad_norm": 0.0038431661669164896,
"learning_rate": 0.00015058139534883722,
"loss": 0.0095,
"step": 850
},
{
"epoch": 10.0,
"grad_norm": 0.004212440922856331,
"learning_rate": 0.00015000000000000001,
"loss": 0.0099,
"step": 860
},
{
"epoch": 10.116279069767442,
"grad_norm": 0.003633901011198759,
"learning_rate": 0.0001494186046511628,
"loss": 0.0095,
"step": 870
},
{
"epoch": 10.232558139534884,
"grad_norm": 0.003657746594399214,
"learning_rate": 0.00014883720930232558,
"loss": 0.0096,
"step": 880
},
{
"epoch": 10.348837209302326,
"grad_norm": 0.003825038904324174,
"learning_rate": 0.00014825581395348837,
"loss": 0.0102,
"step": 890
},
{
"epoch": 10.465116279069768,
"grad_norm": 0.0041350796818733215,
"learning_rate": 0.00014767441860465117,
"loss": 0.0096,
"step": 900
},
{
"epoch": 10.465116279069768,
"eval_f1": 0.0,
"eval_loss": 0.010230067186057568,
"eval_runtime": 1.6982,
"eval_samples_per_second": 42.987,
"eval_steps_per_second": 5.889,
"step": 900
},
{
"epoch": 10.581395348837209,
"grad_norm": 0.003846728475764394,
"learning_rate": 0.00014709302325581396,
"loss": 0.0106,
"step": 910
},
{
"epoch": 10.69767441860465,
"grad_norm": 0.0042284755036234856,
"learning_rate": 0.00014651162790697673,
"loss": 0.0102,
"step": 920
},
{
"epoch": 10.813953488372093,
"grad_norm": 0.003174531040713191,
"learning_rate": 0.00014593023255813953,
"loss": 0.009,
"step": 930
},
{
"epoch": 10.930232558139535,
"grad_norm": 0.0035903833340853453,
"learning_rate": 0.00014534883720930232,
"loss": 0.0101,
"step": 940
},
{
"epoch": 11.046511627906977,
"grad_norm": 0.0043946849182248116,
"learning_rate": 0.00014476744186046512,
"loss": 0.0093,
"step": 950
},
{
"epoch": 11.162790697674419,
"grad_norm": 0.0035909838043153286,
"learning_rate": 0.00014418604651162791,
"loss": 0.0093,
"step": 960
},
{
"epoch": 11.279069767441861,
"grad_norm": 0.0036394952330738306,
"learning_rate": 0.0001436046511627907,
"loss": 0.0092,
"step": 970
},
{
"epoch": 11.395348837209303,
"grad_norm": 0.0038749193772673607,
"learning_rate": 0.0001430232558139535,
"loss": 0.009,
"step": 980
},
{
"epoch": 11.511627906976745,
"grad_norm": 0.0044557624496519566,
"learning_rate": 0.0001424418604651163,
"loss": 0.0097,
"step": 990
},
{
"epoch": 11.627906976744185,
"grad_norm": 0.0037609776481986046,
"learning_rate": 0.0001418604651162791,
"loss": 0.0099,
"step": 1000
},
{
"epoch": 11.627906976744185,
"eval_f1": 0.0,
"eval_loss": 0.00977805070579052,
"eval_runtime": 1.7092,
"eval_samples_per_second": 42.71,
"eval_steps_per_second": 5.851,
"step": 1000
},
{
"epoch": 11.744186046511627,
"grad_norm": 0.003938957117497921,
"learning_rate": 0.00014127906976744186,
"loss": 0.0093,
"step": 1010
},
{
"epoch": 11.86046511627907,
"grad_norm": 0.006434954237192869,
"learning_rate": 0.00014069767441860466,
"loss": 0.0095,
"step": 1020
},
{
"epoch": 11.976744186046512,
"grad_norm": 0.007113784551620483,
"learning_rate": 0.00014011627906976746,
"loss": 0.0105,
"step": 1030
},
{
"epoch": 12.093023255813954,
"grad_norm": 0.0040367101319134235,
"learning_rate": 0.00013953488372093025,
"loss": 0.0093,
"step": 1040
},
{
"epoch": 12.209302325581396,
"grad_norm": 0.0035031838342547417,
"learning_rate": 0.00013895348837209302,
"loss": 0.0096,
"step": 1050
},
{
"epoch": 12.325581395348838,
"grad_norm": 0.0033588423393666744,
"learning_rate": 0.00013837209302325582,
"loss": 0.0091,
"step": 1060
},
{
"epoch": 12.44186046511628,
"grad_norm": 0.0037452862598001957,
"learning_rate": 0.0001377906976744186,
"loss": 0.0092,
"step": 1070
},
{
"epoch": 12.55813953488372,
"grad_norm": 0.004363079089671373,
"learning_rate": 0.0001372093023255814,
"loss": 0.0082,
"step": 1080
},
{
"epoch": 12.674418604651162,
"grad_norm": 0.006496877875179052,
"learning_rate": 0.0001366279069767442,
"loss": 0.0092,
"step": 1090
},
{
"epoch": 12.790697674418604,
"grad_norm": 0.00442136637866497,
"learning_rate": 0.00013604651162790697,
"loss": 0.0089,
"step": 1100
},
{
"epoch": 12.790697674418604,
"eval_f1": 0.0,
"eval_loss": 0.009442531503736973,
"eval_runtime": 1.7071,
"eval_samples_per_second": 42.762,
"eval_steps_per_second": 5.858,
"step": 1100
},
{
"epoch": 12.906976744186046,
"grad_norm": 0.004154372029006481,
"learning_rate": 0.00013546511627906977,
"loss": 0.0097,
"step": 1110
},
{
"epoch": 13.023255813953488,
"grad_norm": 0.004947633482515812,
"learning_rate": 0.00013488372093023256,
"loss": 0.0094,
"step": 1120
},
{
"epoch": 13.13953488372093,
"grad_norm": 0.004092765972018242,
"learning_rate": 0.00013430232558139536,
"loss": 0.0087,
"step": 1130
},
{
"epoch": 13.255813953488373,
"grad_norm": 0.003843548009172082,
"learning_rate": 0.00013372093023255815,
"loss": 0.0084,
"step": 1140
},
{
"epoch": 13.372093023255815,
"grad_norm": 0.0034843245521187782,
"learning_rate": 0.00013313953488372092,
"loss": 0.0084,
"step": 1150
},
{
"epoch": 13.488372093023255,
"grad_norm": 0.0045846025459468365,
"learning_rate": 0.00013255813953488372,
"loss": 0.0094,
"step": 1160
},
{
"epoch": 13.604651162790697,
"grad_norm": 0.0040568090043962,
"learning_rate": 0.0001319767441860465,
"loss": 0.0092,
"step": 1170
},
{
"epoch": 13.720930232558139,
"grad_norm": 0.0035901013761758804,
"learning_rate": 0.0001313953488372093,
"loss": 0.0086,
"step": 1180
},
{
"epoch": 13.837209302325581,
"grad_norm": 0.003683890914544463,
"learning_rate": 0.0001308139534883721,
"loss": 0.0092,
"step": 1190
},
{
"epoch": 13.953488372093023,
"grad_norm": 0.003873488400131464,
"learning_rate": 0.0001302325581395349,
"loss": 0.0091,
"step": 1200
},
{
"epoch": 13.953488372093023,
"eval_f1": 0.0,
"eval_loss": 0.009279163554310799,
"eval_runtime": 1.6753,
"eval_samples_per_second": 43.574,
"eval_steps_per_second": 5.969,
"step": 1200
},
{
"epoch": 14.069767441860465,
"grad_norm": 0.0047092861495912075,
"learning_rate": 0.0001296511627906977,
"loss": 0.0078,
"step": 1210
},
{
"epoch": 14.186046511627907,
"grad_norm": 0.005257419776171446,
"learning_rate": 0.0001290697674418605,
"loss": 0.0083,
"step": 1220
},
{
"epoch": 14.30232558139535,
"grad_norm": 0.005552452523261309,
"learning_rate": 0.00012848837209302326,
"loss": 0.0087,
"step": 1230
},
{
"epoch": 14.418604651162791,
"grad_norm": 0.0051083252765238285,
"learning_rate": 0.00012790697674418605,
"loss": 0.0092,
"step": 1240
},
{
"epoch": 14.534883720930232,
"grad_norm": 0.0037808313500136137,
"learning_rate": 0.00012732558139534885,
"loss": 0.0082,
"step": 1250
},
{
"epoch": 14.651162790697674,
"grad_norm": 0.004120196681469679,
"learning_rate": 0.00012674418604651164,
"loss": 0.0082,
"step": 1260
},
{
"epoch": 14.767441860465116,
"grad_norm": 0.004169652238488197,
"learning_rate": 0.00012616279069767444,
"loss": 0.0085,
"step": 1270
},
{
"epoch": 14.883720930232558,
"grad_norm": 0.004740726202726364,
"learning_rate": 0.0001255813953488372,
"loss": 0.0082,
"step": 1280
},
{
"epoch": 15.0,
"grad_norm": 0.00697332015261054,
"learning_rate": 0.000125,
"loss": 0.0087,
"step": 1290
},
{
"epoch": 15.116279069767442,
"grad_norm": 0.004342319909483194,
"learning_rate": 0.0001244186046511628,
"loss": 0.0081,
"step": 1300
},
{
"epoch": 15.116279069767442,
"eval_f1": 0.0,
"eval_loss": 0.008884104900062084,
"eval_runtime": 1.7593,
"eval_samples_per_second": 41.494,
"eval_steps_per_second": 5.684,
"step": 1300
},
{
"epoch": 15.232558139534884,
"grad_norm": 0.00661982037127018,
"learning_rate": 0.0001238372093023256,
"loss": 0.008,
"step": 1310
},
{
"epoch": 15.348837209302326,
"grad_norm": 0.004550145473331213,
"learning_rate": 0.00012325581395348836,
"loss": 0.0082,
"step": 1320
},
{
"epoch": 15.465116279069768,
"grad_norm": 0.00338382157497108,
"learning_rate": 0.00012267441860465116,
"loss": 0.0084,
"step": 1330
},
{
"epoch": 15.581395348837209,
"grad_norm": 0.005567767191678286,
"learning_rate": 0.00012209302325581395,
"loss": 0.008,
"step": 1340
},
{
"epoch": 15.69767441860465,
"grad_norm": 0.0051926677115261555,
"learning_rate": 0.00012151162790697675,
"loss": 0.0071,
"step": 1350
},
{
"epoch": 15.813953488372093,
"grad_norm": 0.004935986362397671,
"learning_rate": 0.00012093023255813953,
"loss": 0.0085,
"step": 1360
},
{
"epoch": 15.930232558139535,
"grad_norm": 0.0048750354908406734,
"learning_rate": 0.00012034883720930233,
"loss": 0.0078,
"step": 1370
},
{
"epoch": 16.046511627906977,
"grad_norm": 0.0055388594046235085,
"learning_rate": 0.00011976744186046511,
"loss": 0.0082,
"step": 1380
},
{
"epoch": 16.162790697674417,
"grad_norm": 0.004739618394523859,
"learning_rate": 0.0001191860465116279,
"loss": 0.0077,
"step": 1390
},
{
"epoch": 16.27906976744186,
"grad_norm": 0.004103164654225111,
"learning_rate": 0.00011860465116279071,
"loss": 0.0073,
"step": 1400
},
{
"epoch": 16.27906976744186,
"eval_f1": 0.0,
"eval_loss": 0.008862593211233616,
"eval_runtime": 1.7176,
"eval_samples_per_second": 42.5,
"eval_steps_per_second": 5.822,
"step": 1400
},
{
"epoch": 16.3953488372093,
"grad_norm": 0.0057744914665818214,
"learning_rate": 0.00011802325581395351,
"loss": 0.0078,
"step": 1410
},
{
"epoch": 16.511627906976745,
"grad_norm": 0.003924832213670015,
"learning_rate": 0.00011744186046511629,
"loss": 0.0076,
"step": 1420
},
{
"epoch": 16.627906976744185,
"grad_norm": 0.004951529670506716,
"learning_rate": 0.00011686046511627909,
"loss": 0.0077,
"step": 1430
},
{
"epoch": 16.74418604651163,
"grad_norm": 0.004139590077102184,
"learning_rate": 0.00011627906976744187,
"loss": 0.0073,
"step": 1440
},
{
"epoch": 16.86046511627907,
"grad_norm": 0.004691829439252615,
"learning_rate": 0.00011569767441860466,
"loss": 0.0078,
"step": 1450
},
{
"epoch": 16.97674418604651,
"grad_norm": 0.011304708197712898,
"learning_rate": 0.00011511627906976746,
"loss": 0.0073,
"step": 1460
},
{
"epoch": 17.093023255813954,
"grad_norm": 0.004497618414461613,
"learning_rate": 0.00011453488372093024,
"loss": 0.0075,
"step": 1470
},
{
"epoch": 17.209302325581394,
"grad_norm": 0.004640204831957817,
"learning_rate": 0.00011395348837209304,
"loss": 0.0067,
"step": 1480
},
{
"epoch": 17.325581395348838,
"grad_norm": 0.008850700221955776,
"learning_rate": 0.00011337209302325582,
"loss": 0.0074,
"step": 1490
},
{
"epoch": 17.441860465116278,
"grad_norm": 0.0063476236537098885,
"learning_rate": 0.00011279069767441861,
"loss": 0.0071,
"step": 1500
},
{
"epoch": 17.441860465116278,
"eval_f1": 0.0,
"eval_loss": 0.008479787036776543,
"eval_runtime": 1.7053,
"eval_samples_per_second": 42.808,
"eval_steps_per_second": 5.864,
"step": 1500
},
{
"epoch": 17.558139534883722,
"grad_norm": 0.005328552797436714,
"learning_rate": 0.0001122093023255814,
"loss": 0.0071,
"step": 1510
},
{
"epoch": 17.674418604651162,
"grad_norm": 0.006423116661608219,
"learning_rate": 0.00011162790697674419,
"loss": 0.0075,
"step": 1520
},
{
"epoch": 17.790697674418606,
"grad_norm": 0.006245277356356382,
"learning_rate": 0.00011104651162790699,
"loss": 0.0074,
"step": 1530
},
{
"epoch": 17.906976744186046,
"grad_norm": 0.006818824913352728,
"learning_rate": 0.00011046511627906977,
"loss": 0.0065,
"step": 1540
},
{
"epoch": 18.023255813953487,
"grad_norm": 0.005169364158064127,
"learning_rate": 0.00010988372093023256,
"loss": 0.0076,
"step": 1550
},
{
"epoch": 18.13953488372093,
"grad_norm": 0.0035659593995660543,
"learning_rate": 0.00010930232558139534,
"loss": 0.0067,
"step": 1560
},
{
"epoch": 18.25581395348837,
"grad_norm": 0.004306168295443058,
"learning_rate": 0.00010872093023255814,
"loss": 0.0066,
"step": 1570
},
{
"epoch": 18.372093023255815,
"grad_norm": 0.004275166429579258,
"learning_rate": 0.00010813953488372092,
"loss": 0.0064,
"step": 1580
},
{
"epoch": 18.488372093023255,
"grad_norm": 0.003919180482625961,
"learning_rate": 0.00010755813953488372,
"loss": 0.0071,
"step": 1590
},
{
"epoch": 18.6046511627907,
"grad_norm": 0.008834786713123322,
"learning_rate": 0.00010697674418604651,
"loss": 0.0068,
"step": 1600
},
{
"epoch": 18.6046511627907,
"eval_f1": 0.0182648401826484,
"eval_loss": 0.008239569142460823,
"eval_runtime": 1.7236,
"eval_samples_per_second": 42.353,
"eval_steps_per_second": 5.802,
"step": 1600
},
{
"epoch": 18.72093023255814,
"grad_norm": 0.004857208579778671,
"learning_rate": 0.0001063953488372093,
"loss": 0.0062,
"step": 1610
},
{
"epoch": 18.837209302325583,
"grad_norm": 0.005715236067771912,
"learning_rate": 0.0001058139534883721,
"loss": 0.0074,
"step": 1620
},
{
"epoch": 18.953488372093023,
"grad_norm": 0.005261226557195187,
"learning_rate": 0.0001052325581395349,
"loss": 0.0065,
"step": 1630
},
{
"epoch": 19.069767441860463,
"grad_norm": 0.005216473713517189,
"learning_rate": 0.00010465116279069768,
"loss": 0.0066,
"step": 1640
},
{
"epoch": 19.186046511627907,
"grad_norm": 0.004990010056644678,
"learning_rate": 0.00010406976744186048,
"loss": 0.0065,
"step": 1650
},
{
"epoch": 19.302325581395348,
"grad_norm": 0.005021219607442617,
"learning_rate": 0.00010348837209302327,
"loss": 0.0064,
"step": 1660
},
{
"epoch": 19.41860465116279,
"grad_norm": 0.006166994571685791,
"learning_rate": 0.00010290697674418605,
"loss": 0.0069,
"step": 1670
},
{
"epoch": 19.53488372093023,
"grad_norm": 0.0043314131908118725,
"learning_rate": 0.00010232558139534885,
"loss": 0.0064,
"step": 1680
},
{
"epoch": 19.651162790697676,
"grad_norm": 0.006082617212086916,
"learning_rate": 0.00010174418604651163,
"loss": 0.006,
"step": 1690
},
{
"epoch": 19.767441860465116,
"grad_norm": 0.008033761754631996,
"learning_rate": 0.00010116279069767443,
"loss": 0.0064,
"step": 1700
},
{
"epoch": 19.767441860465116,
"eval_f1": 0.0365296803652968,
"eval_loss": 0.008163763210177422,
"eval_runtime": 1.7458,
"eval_samples_per_second": 41.815,
"eval_steps_per_second": 5.728,
"step": 1700
},
{
"epoch": 19.88372093023256,
"grad_norm": 0.006994906347244978,
"learning_rate": 0.00010058139534883721,
"loss": 0.0061,
"step": 1710
},
{
"epoch": 20.0,
"grad_norm": 0.006535364780575037,
"learning_rate": 0.0001,
"loss": 0.0061,
"step": 1720
},
{
"epoch": 20.11627906976744,
"grad_norm": 0.004905609879642725,
"learning_rate": 9.94186046511628e-05,
"loss": 0.006,
"step": 1730
},
{
"epoch": 20.232558139534884,
"grad_norm": 0.004114439245313406,
"learning_rate": 9.883720930232558e-05,
"loss": 0.0064,
"step": 1740
},
{
"epoch": 20.348837209302324,
"grad_norm": 0.003827931359410286,
"learning_rate": 9.825581395348838e-05,
"loss": 0.0052,
"step": 1750
},
{
"epoch": 20.46511627906977,
"grad_norm": 0.0036556690465658903,
"learning_rate": 9.767441860465116e-05,
"loss": 0.0059,
"step": 1760
},
{
"epoch": 20.58139534883721,
"grad_norm": 0.00459684245288372,
"learning_rate": 9.709302325581396e-05,
"loss": 0.0053,
"step": 1770
},
{
"epoch": 20.697674418604652,
"grad_norm": 0.0047168247401714325,
"learning_rate": 9.651162790697675e-05,
"loss": 0.0059,
"step": 1780
},
{
"epoch": 20.813953488372093,
"grad_norm": 0.005462776403874159,
"learning_rate": 9.593023255813955e-05,
"loss": 0.006,
"step": 1790
},
{
"epoch": 20.930232558139537,
"grad_norm": 0.004429865162819624,
"learning_rate": 9.534883720930233e-05,
"loss": 0.0061,
"step": 1800
},
{
"epoch": 20.930232558139537,
"eval_f1": 0.0091324200913242,
"eval_loss": 0.008642657659947872,
"eval_runtime": 1.6675,
"eval_samples_per_second": 43.777,
"eval_steps_per_second": 5.997,
"step": 1800
},
{
"epoch": 21.046511627906977,
"grad_norm": 0.008058182895183563,
"learning_rate": 9.476744186046512e-05,
"loss": 0.0062,
"step": 1810
},
{
"epoch": 21.162790697674417,
"grad_norm": 0.0037425195332616568,
"learning_rate": 9.418604651162792e-05,
"loss": 0.0053,
"step": 1820
},
{
"epoch": 21.27906976744186,
"grad_norm": 0.004587921779602766,
"learning_rate": 9.36046511627907e-05,
"loss": 0.006,
"step": 1830
},
{
"epoch": 21.3953488372093,
"grad_norm": 0.006532969884574413,
"learning_rate": 9.30232558139535e-05,
"loss": 0.0056,
"step": 1840
},
{
"epoch": 21.511627906976745,
"grad_norm": 0.008152597583830357,
"learning_rate": 9.244186046511628e-05,
"loss": 0.0061,
"step": 1850
},
{
"epoch": 21.627906976744185,
"grad_norm": 0.005781789310276508,
"learning_rate": 9.186046511627907e-05,
"loss": 0.0059,
"step": 1860
},
{
"epoch": 21.74418604651163,
"grad_norm": 0.004202969837933779,
"learning_rate": 9.127906976744186e-05,
"loss": 0.0055,
"step": 1870
},
{
"epoch": 21.86046511627907,
"grad_norm": 0.0047506955452263355,
"learning_rate": 9.069767441860465e-05,
"loss": 0.0058,
"step": 1880
},
{
"epoch": 21.97674418604651,
"grad_norm": 0.005872243549674749,
"learning_rate": 9.011627906976745e-05,
"loss": 0.0056,
"step": 1890
},
{
"epoch": 22.093023255813954,
"grad_norm": 0.005416091997176409,
"learning_rate": 8.953488372093024e-05,
"loss": 0.0054,
"step": 1900
},
{
"epoch": 22.093023255813954,
"eval_f1": 0.0593607305936073,
"eval_loss": 0.008248222060501575,
"eval_runtime": 1.7364,
"eval_samples_per_second": 42.04,
"eval_steps_per_second": 5.759,
"step": 1900
},
{
"epoch": 22.209302325581394,
"grad_norm": 0.0039680940099060535,
"learning_rate": 8.895348837209302e-05,
"loss": 0.005,
"step": 1910
},
{
"epoch": 22.325581395348838,
"grad_norm": 0.0041343714110553265,
"learning_rate": 8.837209302325582e-05,
"loss": 0.006,
"step": 1920
},
{
"epoch": 22.441860465116278,
"grad_norm": 0.0031660939566791058,
"learning_rate": 8.779069767441861e-05,
"loss": 0.0053,
"step": 1930
},
{
"epoch": 22.558139534883722,
"grad_norm": 0.005090238060802221,
"learning_rate": 8.72093023255814e-05,
"loss": 0.0053,
"step": 1940
},
{
"epoch": 22.674418604651162,
"grad_norm": 0.0037465649656951427,
"learning_rate": 8.662790697674419e-05,
"loss": 0.0048,
"step": 1950
},
{
"epoch": 22.790697674418606,
"grad_norm": 0.0040254793129861355,
"learning_rate": 8.604651162790697e-05,
"loss": 0.0051,
"step": 1960
},
{
"epoch": 22.906976744186046,
"grad_norm": 0.007103138603270054,
"learning_rate": 8.546511627906977e-05,
"loss": 0.0054,
"step": 1970
},
{
"epoch": 23.023255813953487,
"grad_norm": 0.00407650088891387,
"learning_rate": 8.488372093023255e-05,
"loss": 0.0055,
"step": 1980
},
{
"epoch": 23.13953488372093,
"grad_norm": 0.004379500634968281,
"learning_rate": 8.430232558139536e-05,
"loss": 0.0054,
"step": 1990
},
{
"epoch": 23.25581395348837,
"grad_norm": 0.004481241572648287,
"learning_rate": 8.372093023255814e-05,
"loss": 0.0051,
"step": 2000
},
{
"epoch": 23.25581395348837,
"eval_f1": 0.0502283105022831,
"eval_loss": 0.008041327819228172,
"eval_runtime": 1.6921,
"eval_samples_per_second": 43.14,
"eval_steps_per_second": 5.91,
"step": 2000
},
{
"epoch": 23.372093023255815,
"grad_norm": 0.005388026591390371,
"learning_rate": 8.313953488372094e-05,
"loss": 0.0053,
"step": 2010
},
{
"epoch": 23.488372093023255,
"grad_norm": 0.004522989969700575,
"learning_rate": 8.255813953488373e-05,
"loss": 0.0051,
"step": 2020
},
{
"epoch": 23.6046511627907,
"grad_norm": 0.0037742829881608486,
"learning_rate": 8.197674418604652e-05,
"loss": 0.0054,
"step": 2030
},
{
"epoch": 23.72093023255814,
"grad_norm": 0.0024201772175729275,
"learning_rate": 8.139534883720931e-05,
"loss": 0.0047,
"step": 2040
},
{
"epoch": 23.837209302325583,
"grad_norm": 0.010637535713613033,
"learning_rate": 8.081395348837209e-05,
"loss": 0.0053,
"step": 2050
},
{
"epoch": 23.953488372093023,
"grad_norm": 0.0036848525051027536,
"learning_rate": 8.023255813953489e-05,
"loss": 0.0045,
"step": 2060
},
{
"epoch": 24.069767441860463,
"grad_norm": 0.0036426165606826544,
"learning_rate": 7.965116279069767e-05,
"loss": 0.0051,
"step": 2070
},
{
"epoch": 24.186046511627907,
"grad_norm": 0.004131542984396219,
"learning_rate": 7.906976744186047e-05,
"loss": 0.005,
"step": 2080
},
{
"epoch": 24.302325581395348,
"grad_norm": 0.005381637252867222,
"learning_rate": 7.848837209302326e-05,
"loss": 0.0049,
"step": 2090
},
{
"epoch": 24.41860465116279,
"grad_norm": 0.00364381424151361,
"learning_rate": 7.790697674418606e-05,
"loss": 0.0048,
"step": 2100
},
{
"epoch": 24.41860465116279,
"eval_f1": 0.0639269406392694,
"eval_loss": 0.007898622192442417,
"eval_runtime": 1.74,
"eval_samples_per_second": 41.953,
"eval_steps_per_second": 5.747,
"step": 2100
},
{
"epoch": 24.53488372093023,
"grad_norm": 0.0032041827216744423,
"learning_rate": 7.732558139534884e-05,
"loss": 0.0047,
"step": 2110
},
{
"epoch": 24.651162790697676,
"grad_norm": 0.004305603448301554,
"learning_rate": 7.674418604651163e-05,
"loss": 0.0048,
"step": 2120
},
{
"epoch": 24.767441860465116,
"grad_norm": 0.003094649873673916,
"learning_rate": 7.616279069767443e-05,
"loss": 0.0042,
"step": 2130
},
{
"epoch": 24.88372093023256,
"grad_norm": 0.0032246061600744724,
"learning_rate": 7.558139534883721e-05,
"loss": 0.0045,
"step": 2140
},
{
"epoch": 25.0,
"grad_norm": 0.00515501806512475,
"learning_rate": 7.500000000000001e-05,
"loss": 0.0047,
"step": 2150
},
{
"epoch": 25.11627906976744,
"grad_norm": 0.003303398611024022,
"learning_rate": 7.441860465116279e-05,
"loss": 0.0039,
"step": 2160
},
{
"epoch": 25.232558139534884,
"grad_norm": 0.00300194276496768,
"learning_rate": 7.383720930232558e-05,
"loss": 0.0042,
"step": 2170
},
{
"epoch": 25.348837209302324,
"grad_norm": 0.007243013009428978,
"learning_rate": 7.325581395348837e-05,
"loss": 0.0048,
"step": 2180
},
{
"epoch": 25.46511627906977,
"grad_norm": 0.00349716329947114,
"learning_rate": 7.267441860465116e-05,
"loss": 0.005,
"step": 2190
},
{
"epoch": 25.58139534883721,
"grad_norm": 0.003004447091370821,
"learning_rate": 7.209302325581396e-05,
"loss": 0.0045,
"step": 2200
},
{
"epoch": 25.58139534883721,
"eval_f1": 0.0639269406392694,
"eval_loss": 0.008040810003876686,
"eval_runtime": 1.6946,
"eval_samples_per_second": 43.078,
"eval_steps_per_second": 5.901,
"step": 2200
},
{
"epoch": 25.697674418604652,
"grad_norm": 0.0032493805047124624,
"learning_rate": 7.151162790697675e-05,
"loss": 0.0042,
"step": 2210
},
{
"epoch": 25.813953488372093,
"grad_norm": 0.005652161315083504,
"learning_rate": 7.093023255813955e-05,
"loss": 0.0043,
"step": 2220
},
{
"epoch": 25.930232558139537,
"grad_norm": 0.003228959860280156,
"learning_rate": 7.034883720930233e-05,
"loss": 0.004,
"step": 2230
},
{
"epoch": 26.046511627906977,
"grad_norm": 0.0029076021164655685,
"learning_rate": 6.976744186046513e-05,
"loss": 0.0046,
"step": 2240
},
{
"epoch": 26.162790697674417,
"grad_norm": 0.004093965515494347,
"learning_rate": 6.918604651162791e-05,
"loss": 0.0046,
"step": 2250
},
{
"epoch": 26.27906976744186,
"grad_norm": 0.0039006653241813183,
"learning_rate": 6.86046511627907e-05,
"loss": 0.0044,
"step": 2260
},
{
"epoch": 26.3953488372093,
"grad_norm": 0.0035654634702950716,
"learning_rate": 6.802325581395348e-05,
"loss": 0.0043,
"step": 2270
},
{
"epoch": 26.511627906976745,
"grad_norm": 0.0034853783436119556,
"learning_rate": 6.744186046511628e-05,
"loss": 0.0035,
"step": 2280
},
{
"epoch": 26.627906976744185,
"grad_norm": 0.0032556080259382725,
"learning_rate": 6.686046511627908e-05,
"loss": 0.0039,
"step": 2290
},
{
"epoch": 26.74418604651163,
"grad_norm": 0.004529784433543682,
"learning_rate": 6.627906976744186e-05,
"loss": 0.0036,
"step": 2300
},
{
"epoch": 26.74418604651163,
"eval_f1": 0.10273972602739725,
"eval_loss": 0.007886539213359356,
"eval_runtime": 1.7923,
"eval_samples_per_second": 40.73,
"eval_steps_per_second": 5.579,
"step": 2300
},
{
"epoch": 26.86046511627907,
"grad_norm": 0.0036729234270751476,
"learning_rate": 6.569767441860465e-05,
"loss": 0.0041,
"step": 2310
},
{
"epoch": 26.97674418604651,
"grad_norm": 0.003093762556090951,
"learning_rate": 6.511627906976745e-05,
"loss": 0.0043,
"step": 2320
},
{
"epoch": 27.093023255813954,
"grad_norm": 0.0035506237763911486,
"learning_rate": 6.453488372093024e-05,
"loss": 0.0038,
"step": 2330
},
{
"epoch": 27.209302325581394,
"grad_norm": 0.004144872073084116,
"learning_rate": 6.395348837209303e-05,
"loss": 0.0041,
"step": 2340
},
{
"epoch": 27.325581395348838,
"grad_norm": 0.0029570453334599733,
"learning_rate": 6.337209302325582e-05,
"loss": 0.0039,
"step": 2350
},
{
"epoch": 27.441860465116278,
"grad_norm": 0.0029905049595981836,
"learning_rate": 6.27906976744186e-05,
"loss": 0.0038,
"step": 2360
},
{
"epoch": 27.558139534883722,
"grad_norm": 0.0037850120570510626,
"learning_rate": 6.22093023255814e-05,
"loss": 0.0041,
"step": 2370
},
{
"epoch": 27.674418604651162,
"grad_norm": 0.0027285381220281124,
"learning_rate": 6.162790697674418e-05,
"loss": 0.004,
"step": 2380
},
{
"epoch": 27.790697674418606,
"grad_norm": 0.0033172164112329483,
"learning_rate": 6.104651162790698e-05,
"loss": 0.0036,
"step": 2390
},
{
"epoch": 27.906976744186046,
"grad_norm": 0.0038614473305642605,
"learning_rate": 6.0465116279069765e-05,
"loss": 0.0038,
"step": 2400
},
{
"epoch": 27.906976744186046,
"eval_f1": 0.10273972602739725,
"eval_loss": 0.007930786348879337,
"eval_runtime": 1.6954,
"eval_samples_per_second": 43.058,
"eval_steps_per_second": 5.898,
"step": 2400
},
{
"epoch": 28.023255813953487,
"grad_norm": 0.004366340581327677,
"learning_rate": 5.9883720930232554e-05,
"loss": 0.004,
"step": 2410
},
{
"epoch": 28.13953488372093,
"grad_norm": 0.0038022997323423624,
"learning_rate": 5.9302325581395356e-05,
"loss": 0.0038,
"step": 2420
},
{
"epoch": 28.25581395348837,
"grad_norm": 0.0036421448457986116,
"learning_rate": 5.8720930232558145e-05,
"loss": 0.0038,
"step": 2430
},
{
"epoch": 28.372093023255815,
"grad_norm": 0.003124956740066409,
"learning_rate": 5.8139534883720933e-05,
"loss": 0.0038,
"step": 2440
},
{
"epoch": 28.488372093023255,
"grad_norm": 0.0033948852214962244,
"learning_rate": 5.755813953488373e-05,
"loss": 0.0036,
"step": 2450
},
{
"epoch": 28.6046511627907,
"grad_norm": 0.002520653186365962,
"learning_rate": 5.697674418604652e-05,
"loss": 0.0035,
"step": 2460
},
{
"epoch": 28.72093023255814,
"grad_norm": 0.003338318085297942,
"learning_rate": 5.6395348837209306e-05,
"loss": 0.0037,
"step": 2470
},
{
"epoch": 28.837209302325583,
"grad_norm": 0.0030738948844373226,
"learning_rate": 5.5813953488372095e-05,
"loss": 0.0035,
"step": 2480
},
{
"epoch": 28.953488372093023,
"grad_norm": 0.0024965908378362656,
"learning_rate": 5.5232558139534884e-05,
"loss": 0.0037,
"step": 2490
},
{
"epoch": 29.069767441860463,
"grad_norm": 0.0026568674948066473,
"learning_rate": 5.465116279069767e-05,
"loss": 0.0032,
"step": 2500
},
{
"epoch": 29.069767441860463,
"eval_f1": 0.091324200913242,
"eval_loss": 0.007714101579040289,
"eval_runtime": 1.768,
"eval_samples_per_second": 41.289,
"eval_steps_per_second": 5.656,
"step": 2500
},
{
"epoch": 29.186046511627907,
"grad_norm": 0.004428944084793329,
"learning_rate": 5.406976744186046e-05,
"loss": 0.0036,
"step": 2510
},
{
"epoch": 29.302325581395348,
"grad_norm": 0.0083547318354249,
"learning_rate": 5.348837209302326e-05,
"loss": 0.0033,
"step": 2520
},
{
"epoch": 29.41860465116279,
"grad_norm": 0.003402271308004856,
"learning_rate": 5.290697674418605e-05,
"loss": 0.0039,
"step": 2530
},
{
"epoch": 29.53488372093023,
"grad_norm": 0.0033303038217127323,
"learning_rate": 5.232558139534884e-05,
"loss": 0.0035,
"step": 2540
},
{
"epoch": 29.651162790697676,
"grad_norm": 0.03485463559627533,
"learning_rate": 5.1744186046511636e-05,
"loss": 0.0038,
"step": 2550
},
{
"epoch": 29.767441860465116,
"grad_norm": 0.004661730024963617,
"learning_rate": 5.1162790697674425e-05,
"loss": 0.0036,
"step": 2560
},
{
"epoch": 29.88372093023256,
"grad_norm": 0.0038003467489033937,
"learning_rate": 5.0581395348837214e-05,
"loss": 0.0035,
"step": 2570
},
{
"epoch": 30.0,
"grad_norm": 0.004514409229159355,
"learning_rate": 5e-05,
"loss": 0.0033,
"step": 2580
},
{
"epoch": 30.11627906976744,
"grad_norm": 0.004792849998921156,
"learning_rate": 4.941860465116279e-05,
"loss": 0.0035,
"step": 2590
},
{
"epoch": 30.232558139534884,
"grad_norm": 0.004430180415511131,
"learning_rate": 4.883720930232558e-05,
"loss": 0.004,
"step": 2600
},
{
"epoch": 30.232558139534884,
"eval_f1": 0.10273972602739725,
"eval_loss": 0.007851834408938885,
"eval_runtime": 1.6578,
"eval_samples_per_second": 44.034,
"eval_steps_per_second": 6.032,
"step": 2600
},
{
"epoch": 30.348837209302324,
"grad_norm": 0.0030448336619883776,
"learning_rate": 4.8255813953488375e-05,
"loss": 0.0034,
"step": 2610
},
{
"epoch": 30.46511627906977,
"grad_norm": 0.0026765645015984774,
"learning_rate": 4.7674418604651164e-05,
"loss": 0.0033,
"step": 2620
},
{
"epoch": 30.58139534883721,
"grad_norm": 0.002969229593873024,
"learning_rate": 4.709302325581396e-05,
"loss": 0.0033,
"step": 2630
},
{
"epoch": 30.697674418604652,
"grad_norm": 0.005750367883592844,
"learning_rate": 4.651162790697675e-05,
"loss": 0.0029,
"step": 2640
},
{
"epoch": 30.813953488372093,
"grad_norm": 0.0028870333917438984,
"learning_rate": 4.593023255813954e-05,
"loss": 0.0033,
"step": 2650
},
{
"epoch": 30.930232558139537,
"grad_norm": 0.003318176371976733,
"learning_rate": 4.5348837209302326e-05,
"loss": 0.0034,
"step": 2660
},
{
"epoch": 31.046511627906977,
"grad_norm": 0.0030346305575221777,
"learning_rate": 4.476744186046512e-05,
"loss": 0.003,
"step": 2670
},
{
"epoch": 31.162790697674417,
"grad_norm": 0.0031568079721182585,
"learning_rate": 4.418604651162791e-05,
"loss": 0.0033,
"step": 2680
},
{
"epoch": 31.27906976744186,
"grad_norm": 0.0028284024447202682,
"learning_rate": 4.36046511627907e-05,
"loss": 0.0031,
"step": 2690
},
{
"epoch": 31.3953488372093,
"grad_norm": 0.003977891989052296,
"learning_rate": 4.302325581395349e-05,
"loss": 0.003,
"step": 2700
},
{
"epoch": 31.3953488372093,
"eval_f1": 0.09360730593607304,
"eval_loss": 0.008055122569203377,
"eval_runtime": 1.6937,
"eval_samples_per_second": 43.101,
"eval_steps_per_second": 5.904,
"step": 2700
},
{
"epoch": 31.511627906976745,
"grad_norm": 0.0026259180158376694,
"learning_rate": 4.2441860465116276e-05,
"loss": 0.0036,
"step": 2710
},
{
"epoch": 31.627906976744185,
"grad_norm": 0.003417164785787463,
"learning_rate": 4.186046511627907e-05,
"loss": 0.0032,
"step": 2720
},
{
"epoch": 31.74418604651163,
"grad_norm": 0.0038994085043668747,
"learning_rate": 4.127906976744187e-05,
"loss": 0.0032,
"step": 2730
},
{
"epoch": 31.86046511627907,
"grad_norm": 0.0034232349134981632,
"learning_rate": 4.0697674418604655e-05,
"loss": 0.0027,
"step": 2740
},
{
"epoch": 31.97674418604651,
"grad_norm": 0.004144703969359398,
"learning_rate": 4.0116279069767444e-05,
"loss": 0.0035,
"step": 2750
},
{
"epoch": 32.093023255813954,
"grad_norm": 0.0026302810292690992,
"learning_rate": 3.953488372093023e-05,
"loss": 0.0032,
"step": 2760
},
{
"epoch": 32.2093023255814,
"grad_norm": 0.002205599332228303,
"learning_rate": 3.895348837209303e-05,
"loss": 0.0033,
"step": 2770
},
{
"epoch": 32.325581395348834,
"grad_norm": 0.003439367515966296,
"learning_rate": 3.837209302325582e-05,
"loss": 0.0032,
"step": 2780
},
{
"epoch": 32.44186046511628,
"grad_norm": 0.0022263077553361654,
"learning_rate": 3.7790697674418606e-05,
"loss": 0.0029,
"step": 2790
},
{
"epoch": 32.55813953488372,
"grad_norm": 0.003249467583373189,
"learning_rate": 3.7209302325581394e-05,
"loss": 0.0029,
"step": 2800
},
{
"epoch": 32.55813953488372,
"eval_f1": 0.08904109589041094,
"eval_loss": 0.008000507019460201,
"eval_runtime": 1.6639,
"eval_samples_per_second": 43.874,
"eval_steps_per_second": 6.01,
"step": 2800
},
{
"epoch": 32.674418604651166,
"grad_norm": 0.003246536012738943,
"learning_rate": 3.662790697674418e-05,
"loss": 0.0027,
"step": 2810
},
{
"epoch": 32.7906976744186,
"grad_norm": 0.0032464447431266308,
"learning_rate": 3.604651162790698e-05,
"loss": 0.0033,
"step": 2820
},
{
"epoch": 32.906976744186046,
"grad_norm": 0.002286577830091119,
"learning_rate": 3.5465116279069774e-05,
"loss": 0.0029,
"step": 2830
},
{
"epoch": 33.02325581395349,
"grad_norm": 0.0027776581700891256,
"learning_rate": 3.488372093023256e-05,
"loss": 0.003,
"step": 2840
},
{
"epoch": 33.13953488372093,
"grad_norm": 0.0025106158573180437,
"learning_rate": 3.430232558139535e-05,
"loss": 0.0027,
"step": 2850
},
{
"epoch": 33.25581395348837,
"grad_norm": 0.0026736510917544365,
"learning_rate": 3.372093023255814e-05,
"loss": 0.0029,
"step": 2860
},
{
"epoch": 33.372093023255815,
"grad_norm": 0.002507059136405587,
"learning_rate": 3.313953488372093e-05,
"loss": 0.0025,
"step": 2870
},
{
"epoch": 33.48837209302326,
"grad_norm": 0.0028928928077220917,
"learning_rate": 3.2558139534883724e-05,
"loss": 0.0031,
"step": 2880
},
{
"epoch": 33.604651162790695,
"grad_norm": 0.0032647838816046715,
"learning_rate": 3.197674418604651e-05,
"loss": 0.0028,
"step": 2890
},
{
"epoch": 33.72093023255814,
"grad_norm": 0.0033728063572198153,
"learning_rate": 3.13953488372093e-05,
"loss": 0.0033,
"step": 2900
},
{
"epoch": 33.72093023255814,
"eval_f1": 0.08447488584474885,
"eval_loss": 0.008075451478362083,
"eval_runtime": 1.6844,
"eval_samples_per_second": 43.339,
"eval_steps_per_second": 5.937,
"step": 2900
},
{
"epoch": 33.83720930232558,
"grad_norm": 0.0026662382297217846,
"learning_rate": 3.081395348837209e-05,
"loss": 0.0031,
"step": 2910
},
{
"epoch": 33.95348837209303,
"grad_norm": 0.0025568390265107155,
"learning_rate": 3.0232558139534883e-05,
"loss": 0.0028,
"step": 2920
},
{
"epoch": 34.06976744186046,
"grad_norm": 0.002340954029932618,
"learning_rate": 2.9651162790697678e-05,
"loss": 0.0026,
"step": 2930
},
{
"epoch": 34.18604651162791,
"grad_norm": 0.0033266523387283087,
"learning_rate": 2.9069767441860467e-05,
"loss": 0.0031,
"step": 2940
},
{
"epoch": 34.30232558139535,
"grad_norm": 0.0023587134201079607,
"learning_rate": 2.848837209302326e-05,
"loss": 0.0027,
"step": 2950
},
{
"epoch": 34.41860465116279,
"grad_norm": 0.002229692181572318,
"learning_rate": 2.7906976744186048e-05,
"loss": 0.0029,
"step": 2960
},
{
"epoch": 34.53488372093023,
"grad_norm": 0.0029290826059877872,
"learning_rate": 2.7325581395348836e-05,
"loss": 0.0026,
"step": 2970
},
{
"epoch": 34.651162790697676,
"grad_norm": 0.0030265513341873884,
"learning_rate": 2.674418604651163e-05,
"loss": 0.0026,
"step": 2980
},
{
"epoch": 34.76744186046512,
"grad_norm": 0.0028545409440994263,
"learning_rate": 2.616279069767442e-05,
"loss": 0.0032,
"step": 2990
},
{
"epoch": 34.883720930232556,
"grad_norm": 0.002398886950686574,
"learning_rate": 2.5581395348837212e-05,
"loss": 0.0029,
"step": 3000
},
{
"epoch": 34.883720930232556,
"eval_f1": 0.12557077625570776,
"eval_loss": 0.008088447153568268,
"eval_runtime": 1.6856,
"eval_samples_per_second": 43.309,
"eval_steps_per_second": 5.933,
"step": 3000
},
{
"epoch": 35.0,
"grad_norm": 0.00287942448630929,
"learning_rate": 2.5e-05,
"loss": 0.0026,
"step": 3010
},
{
"epoch": 35.116279069767444,
"grad_norm": 0.0025614567566663027,
"learning_rate": 2.441860465116279e-05,
"loss": 0.0024,
"step": 3020
},
{
"epoch": 35.23255813953488,
"grad_norm": 0.002899142215028405,
"learning_rate": 2.3837209302325582e-05,
"loss": 0.0028,
"step": 3030
},
{
"epoch": 35.348837209302324,
"grad_norm": 0.003214885015040636,
"learning_rate": 2.3255813953488374e-05,
"loss": 0.0032,
"step": 3040
},
{
"epoch": 35.46511627906977,
"grad_norm": 0.003339228220283985,
"learning_rate": 2.2674418604651163e-05,
"loss": 0.0027,
"step": 3050
},
{
"epoch": 35.58139534883721,
"grad_norm": 0.0030185177456587553,
"learning_rate": 2.2093023255813955e-05,
"loss": 0.0028,
"step": 3060
},
{
"epoch": 35.69767441860465,
"grad_norm": 0.00310333538800478,
"learning_rate": 2.1511627906976744e-05,
"loss": 0.0026,
"step": 3070
},
{
"epoch": 35.81395348837209,
"grad_norm": 0.0028332043439149857,
"learning_rate": 2.0930232558139536e-05,
"loss": 0.0026,
"step": 3080
},
{
"epoch": 35.93023255813954,
"grad_norm": 0.0021240401547402143,
"learning_rate": 2.0348837209302328e-05,
"loss": 0.0028,
"step": 3090
},
{
"epoch": 36.04651162790697,
"grad_norm": 0.0028554806485772133,
"learning_rate": 1.9767441860465116e-05,
"loss": 0.0025,
"step": 3100
},
{
"epoch": 36.04651162790697,
"eval_f1": 0.13470319634703196,
"eval_loss": 0.008070297539234161,
"eval_runtime": 1.8154,
"eval_samples_per_second": 40.211,
"eval_steps_per_second": 5.508,
"step": 3100
},
{
"epoch": 36.16279069767442,
"grad_norm": 0.0031274755019694567,
"learning_rate": 1.918604651162791e-05,
"loss": 0.0029,
"step": 3110
},
{
"epoch": 36.27906976744186,
"grad_norm": 0.0016608175355941057,
"learning_rate": 1.8604651162790697e-05,
"loss": 0.0026,
"step": 3120
},
{
"epoch": 36.395348837209305,
"grad_norm": 0.0023403004743158817,
"learning_rate": 1.802325581395349e-05,
"loss": 0.0025,
"step": 3130
},
{
"epoch": 36.51162790697674,
"grad_norm": 0.0022207223810255527,
"learning_rate": 1.744186046511628e-05,
"loss": 0.0028,
"step": 3140
},
{
"epoch": 36.627906976744185,
"grad_norm": 0.0019355164840817451,
"learning_rate": 1.686046511627907e-05,
"loss": 0.0025,
"step": 3150
},
{
"epoch": 36.74418604651163,
"grad_norm": 0.00238374387845397,
"learning_rate": 1.6279069767441862e-05,
"loss": 0.0026,
"step": 3160
},
{
"epoch": 36.86046511627907,
"grad_norm": 0.002908985363319516,
"learning_rate": 1.569767441860465e-05,
"loss": 0.0028,
"step": 3170
},
{
"epoch": 36.97674418604651,
"grad_norm": 0.002507247030735016,
"learning_rate": 1.5116279069767441e-05,
"loss": 0.0025,
"step": 3180
},
{
"epoch": 37.093023255813954,
"grad_norm": 0.0024845688603818417,
"learning_rate": 1.4534883720930233e-05,
"loss": 0.0028,
"step": 3190
},
{
"epoch": 37.2093023255814,
"grad_norm": 0.0021229905541986227,
"learning_rate": 1.3953488372093024e-05,
"loss": 0.0027,
"step": 3200
},
{
"epoch": 37.2093023255814,
"eval_f1": 0.1324200913242009,
"eval_loss": 0.008123889565467834,
"eval_runtime": 1.6983,
"eval_samples_per_second": 42.985,
"eval_steps_per_second": 5.888,
"step": 3200
},
{
"epoch": 37.325581395348834,
"grad_norm": 0.0020165506284683943,
"learning_rate": 1.3372093023255814e-05,
"loss": 0.0024,
"step": 3210
},
{
"epoch": 37.44186046511628,
"grad_norm": 0.00225025019608438,
"learning_rate": 1.2790697674418606e-05,
"loss": 0.0024,
"step": 3220
},
{
"epoch": 37.55813953488372,
"grad_norm": 0.002410660730674863,
"learning_rate": 1.2209302325581395e-05,
"loss": 0.0025,
"step": 3230
},
{
"epoch": 37.674418604651166,
"grad_norm": 0.0022259822580963373,
"learning_rate": 1.1627906976744187e-05,
"loss": 0.0026,
"step": 3240
},
{
"epoch": 37.7906976744186,
"grad_norm": 0.002472145715728402,
"learning_rate": 1.1046511627906977e-05,
"loss": 0.0028,
"step": 3250
},
{
"epoch": 37.906976744186046,
"grad_norm": 0.002238726709038019,
"learning_rate": 1.0465116279069768e-05,
"loss": 0.0024,
"step": 3260
},
{
"epoch": 38.02325581395349,
"grad_norm": 0.0025766075123101473,
"learning_rate": 9.883720930232558e-06,
"loss": 0.0027,
"step": 3270
},
{
"epoch": 38.13953488372093,
"grad_norm": 0.002669785637408495,
"learning_rate": 9.302325581395349e-06,
"loss": 0.0027,
"step": 3280
},
{
"epoch": 38.25581395348837,
"grad_norm": 0.0023648098576813936,
"learning_rate": 8.72093023255814e-06,
"loss": 0.0028,
"step": 3290
},
{
"epoch": 38.372093023255815,
"grad_norm": 0.0019695968367159367,
"learning_rate": 8.139534883720931e-06,
"loss": 0.0028,
"step": 3300
},
{
"epoch": 38.372093023255815,
"eval_f1": 0.1324200913242009,
"eval_loss": 0.008158449083566666,
"eval_runtime": 1.8249,
"eval_samples_per_second": 40.002,
"eval_steps_per_second": 5.48,
"step": 3300
},
{
"epoch": 38.48837209302326,
"grad_norm": 0.0023280028253793716,
"learning_rate": 7.558139534883721e-06,
"loss": 0.0024,
"step": 3310
},
{
"epoch": 38.604651162790695,
"grad_norm": 0.0030140073504298925,
"learning_rate": 6.976744186046512e-06,
"loss": 0.0026,
"step": 3320
},
{
"epoch": 38.72093023255814,
"grad_norm": 0.0024626152589917183,
"learning_rate": 6.395348837209303e-06,
"loss": 0.0024,
"step": 3330
},
{
"epoch": 38.83720930232558,
"grad_norm": 0.0027184640057384968,
"learning_rate": 5.8139534883720935e-06,
"loss": 0.0022,
"step": 3340
},
{
"epoch": 38.95348837209303,
"grad_norm": 0.002258691005408764,
"learning_rate": 5.232558139534884e-06,
"loss": 0.0027,
"step": 3350
},
{
"epoch": 39.06976744186046,
"grad_norm": 0.0015635826857760549,
"learning_rate": 4.651162790697674e-06,
"loss": 0.0025,
"step": 3360
},
{
"epoch": 39.18604651162791,
"grad_norm": 0.0014281744370236993,
"learning_rate": 4.0697674418604655e-06,
"loss": 0.0025,
"step": 3370
},
{
"epoch": 39.30232558139535,
"grad_norm": 0.0019795901607722044,
"learning_rate": 3.488372093023256e-06,
"loss": 0.0024,
"step": 3380
},
{
"epoch": 39.41860465116279,
"grad_norm": 0.00274649984203279,
"learning_rate": 2.9069767441860468e-06,
"loss": 0.0028,
"step": 3390
},
{
"epoch": 39.53488372093023,
"grad_norm": 0.0021044183522462845,
"learning_rate": 2.325581395348837e-06,
"loss": 0.0023,
"step": 3400
},
{
"epoch": 39.53488372093023,
"eval_f1": 0.1324200913242009,
"eval_loss": 0.008160348981618881,
"eval_runtime": 1.7392,
"eval_samples_per_second": 41.972,
"eval_steps_per_second": 5.75,
"step": 3400
},
{
"epoch": 39.651162790697676,
"grad_norm": 0.0022693907376378775,
"learning_rate": 1.744186046511628e-06,
"loss": 0.0025,
"step": 3410
},
{
"epoch": 39.76744186046512,
"grad_norm": 0.0020165799651294947,
"learning_rate": 1.1627906976744186e-06,
"loss": 0.0027,
"step": 3420
},
{
"epoch": 39.883720930232556,
"grad_norm": 0.0023625025060027838,
"learning_rate": 5.813953488372093e-07,
"loss": 0.0022,
"step": 3430
},
{
"epoch": 40.0,
"grad_norm": 0.0031698495149612427,
"learning_rate": 0.0,
"loss": 0.0029,
"step": 3440
},
{
"epoch": 40.0,
"step": 3440,
"total_flos": 4.2863407148659507e+18,
"train_loss": 0.015139733321509908,
"train_runtime": 1940.273,
"train_samples_per_second": 28.285,
"train_steps_per_second": 1.773
}
],
"logging_steps": 10,
"max_steps": 3440,
"num_input_tokens_seen": 0,
"num_train_epochs": 40,
"save_steps": 100,
"total_flos": 4.2863407148659507e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}