{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.997084548104956,
"eval_steps": 500,
"global_step": 1542,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.019436345966958212,
"grad_norm": 3.5477320746078096,
"learning_rate": 5e-06,
"loss": 1.0532,
"step": 10
},
{
"epoch": 0.038872691933916424,
"grad_norm": 2.8209235996953512,
"learning_rate": 5e-06,
"loss": 0.9684,
"step": 20
},
{
"epoch": 0.05830903790087463,
"grad_norm": 1.1257852692164845,
"learning_rate": 5e-06,
"loss": 0.9294,
"step": 30
},
{
"epoch": 0.07774538386783285,
"grad_norm": 0.888481539231949,
"learning_rate": 5e-06,
"loss": 0.9201,
"step": 40
},
{
"epoch": 0.09718172983479106,
"grad_norm": 0.6469537614046023,
"learning_rate": 5e-06,
"loss": 0.8971,
"step": 50
},
{
"epoch": 0.11661807580174927,
"grad_norm": 0.592544408615531,
"learning_rate": 5e-06,
"loss": 0.8837,
"step": 60
},
{
"epoch": 0.1360544217687075,
"grad_norm": 0.6433313667438364,
"learning_rate": 5e-06,
"loss": 0.8749,
"step": 70
},
{
"epoch": 0.1554907677356657,
"grad_norm": 1.344429915972531,
"learning_rate": 5e-06,
"loss": 0.867,
"step": 80
},
{
"epoch": 0.1749271137026239,
"grad_norm": 0.8707935340603853,
"learning_rate": 5e-06,
"loss": 0.8623,
"step": 90
},
{
"epoch": 0.19436345966958213,
"grad_norm": 0.7828161719097383,
"learning_rate": 5e-06,
"loss": 0.8624,
"step": 100
},
{
"epoch": 0.21379980563654033,
"grad_norm": 0.682126021781355,
"learning_rate": 5e-06,
"loss": 0.8611,
"step": 110
},
{
"epoch": 0.23323615160349853,
"grad_norm": 0.8081328723422389,
"learning_rate": 5e-06,
"loss": 0.8551,
"step": 120
},
{
"epoch": 0.25267249757045673,
"grad_norm": 0.6691994077909735,
"learning_rate": 5e-06,
"loss": 0.8583,
"step": 130
},
{
"epoch": 0.272108843537415,
"grad_norm": 0.586140300081327,
"learning_rate": 5e-06,
"loss": 0.8523,
"step": 140
},
{
"epoch": 0.2915451895043732,
"grad_norm": 0.8552001050583659,
"learning_rate": 5e-06,
"loss": 0.8497,
"step": 150
},
{
"epoch": 0.3109815354713314,
"grad_norm": 0.501523965918074,
"learning_rate": 5e-06,
"loss": 0.8464,
"step": 160
},
{
"epoch": 0.3304178814382896,
"grad_norm": 0.6785445153206255,
"learning_rate": 5e-06,
"loss": 0.848,
"step": 170
},
{
"epoch": 0.3498542274052478,
"grad_norm": 0.6494060293856031,
"learning_rate": 5e-06,
"loss": 0.8459,
"step": 180
},
{
"epoch": 0.369290573372206,
"grad_norm": 0.6020737926390343,
"learning_rate": 5e-06,
"loss": 0.8484,
"step": 190
},
{
"epoch": 0.38872691933916426,
"grad_norm": 0.48912875825316915,
"learning_rate": 5e-06,
"loss": 0.8449,
"step": 200
},
{
"epoch": 0.40816326530612246,
"grad_norm": 0.8781137047011839,
"learning_rate": 5e-06,
"loss": 0.8395,
"step": 210
},
{
"epoch": 0.42759961127308066,
"grad_norm": 0.5879468826056136,
"learning_rate": 5e-06,
"loss": 0.8387,
"step": 220
},
{
"epoch": 0.44703595724003886,
"grad_norm": 0.6017675792916065,
"learning_rate": 5e-06,
"loss": 0.8363,
"step": 230
},
{
"epoch": 0.46647230320699706,
"grad_norm": 0.616647981494789,
"learning_rate": 5e-06,
"loss": 0.8336,
"step": 240
},
{
"epoch": 0.4859086491739553,
"grad_norm": 0.855268617382177,
"learning_rate": 5e-06,
"loss": 0.8336,
"step": 250
},
{
"epoch": 0.5053449951409135,
"grad_norm": 0.780188445457583,
"learning_rate": 5e-06,
"loss": 0.831,
"step": 260
},
{
"epoch": 0.5247813411078717,
"grad_norm": 0.5840816004625115,
"learning_rate": 5e-06,
"loss": 0.8319,
"step": 270
},
{
"epoch": 0.54421768707483,
"grad_norm": 0.6535257947752856,
"learning_rate": 5e-06,
"loss": 0.833,
"step": 280
},
{
"epoch": 0.5636540330417882,
"grad_norm": 0.6710102563759031,
"learning_rate": 5e-06,
"loss": 0.8354,
"step": 290
},
{
"epoch": 0.5830903790087464,
"grad_norm": 0.5947252085113186,
"learning_rate": 5e-06,
"loss": 0.8315,
"step": 300
},
{
"epoch": 0.6025267249757046,
"grad_norm": 0.5902161171422673,
"learning_rate": 5e-06,
"loss": 0.83,
"step": 310
},
{
"epoch": 0.6219630709426628,
"grad_norm": 0.5757165816873938,
"learning_rate": 5e-06,
"loss": 0.8299,
"step": 320
},
{
"epoch": 0.641399416909621,
"grad_norm": 0.6751753868245474,
"learning_rate": 5e-06,
"loss": 0.827,
"step": 330
},
{
"epoch": 0.6608357628765792,
"grad_norm": 0.535560817394151,
"learning_rate": 5e-06,
"loss": 0.8264,
"step": 340
},
{
"epoch": 0.6802721088435374,
"grad_norm": 0.6183377886108462,
"learning_rate": 5e-06,
"loss": 0.8243,
"step": 350
},
{
"epoch": 0.6997084548104956,
"grad_norm": 0.6200501825686097,
"learning_rate": 5e-06,
"loss": 0.828,
"step": 360
},
{
"epoch": 0.7191448007774538,
"grad_norm": 0.5622254912052161,
"learning_rate": 5e-06,
"loss": 0.8287,
"step": 370
},
{
"epoch": 0.738581146744412,
"grad_norm": 0.6720527159909909,
"learning_rate": 5e-06,
"loss": 0.8234,
"step": 380
},
{
"epoch": 0.7580174927113703,
"grad_norm": 0.5315560166276624,
"learning_rate": 5e-06,
"loss": 0.824,
"step": 390
},
{
"epoch": 0.7774538386783285,
"grad_norm": 0.6413527148042328,
"learning_rate": 5e-06,
"loss": 0.8194,
"step": 400
},
{
"epoch": 0.7968901846452867,
"grad_norm": 0.6402327795437167,
"learning_rate": 5e-06,
"loss": 0.8243,
"step": 410
},
{
"epoch": 0.8163265306122449,
"grad_norm": 0.6290935177044384,
"learning_rate": 5e-06,
"loss": 0.8201,
"step": 420
},
{
"epoch": 0.8357628765792031,
"grad_norm": 0.6137598310285064,
"learning_rate": 5e-06,
"loss": 0.8187,
"step": 430
},
{
"epoch": 0.8551992225461613,
"grad_norm": 0.6137894354862566,
"learning_rate": 5e-06,
"loss": 0.8217,
"step": 440
},
{
"epoch": 0.8746355685131195,
"grad_norm": 0.7376542092364302,
"learning_rate": 5e-06,
"loss": 0.8191,
"step": 450
},
{
"epoch": 0.8940719144800777,
"grad_norm": 0.5443940007812901,
"learning_rate": 5e-06,
"loss": 0.8179,
"step": 460
},
{
"epoch": 0.9135082604470359,
"grad_norm": 0.6778023088897194,
"learning_rate": 5e-06,
"loss": 0.8158,
"step": 470
},
{
"epoch": 0.9329446064139941,
"grad_norm": 0.6040677193471313,
"learning_rate": 5e-06,
"loss": 0.8176,
"step": 480
},
{
"epoch": 0.9523809523809523,
"grad_norm": 0.5741967517130403,
"learning_rate": 5e-06,
"loss": 0.8215,
"step": 490
},
{
"epoch": 0.9718172983479106,
"grad_norm": 0.5184977635424605,
"learning_rate": 5e-06,
"loss": 0.8218,
"step": 500
},
{
"epoch": 0.9912536443148688,
"grad_norm": 0.5484601778792662,
"learning_rate": 5e-06,
"loss": 0.8158,
"step": 510
},
{
"epoch": 0.9990281827016521,
"eval_loss": 0.8152287006378174,
"eval_runtime": 547.9504,
"eval_samples_per_second": 25.3,
"eval_steps_per_second": 0.396,
"step": 514
},
{
"epoch": 1.010689990281827,
"grad_norm": 0.6821565020818955,
"learning_rate": 5e-06,
"loss": 0.8355,
"step": 520
},
{
"epoch": 1.0301263362487851,
"grad_norm": 0.6277981964271326,
"learning_rate": 5e-06,
"loss": 0.7749,
"step": 530
},
{
"epoch": 1.0495626822157433,
"grad_norm": 0.6114756472079457,
"learning_rate": 5e-06,
"loss": 0.7751,
"step": 540
},
{
"epoch": 1.0689990281827018,
"grad_norm": 0.8259615966427586,
"learning_rate": 5e-06,
"loss": 0.7756,
"step": 550
},
{
"epoch": 1.08843537414966,
"grad_norm": 0.6113352198439804,
"learning_rate": 5e-06,
"loss": 0.7789,
"step": 560
},
{
"epoch": 1.1078717201166182,
"grad_norm": 0.5269512429419262,
"learning_rate": 5e-06,
"loss": 0.7784,
"step": 570
},
{
"epoch": 1.1273080660835764,
"grad_norm": 0.5792710933468033,
"learning_rate": 5e-06,
"loss": 0.7757,
"step": 580
},
{
"epoch": 1.1467444120505346,
"grad_norm": 0.5467198449481481,
"learning_rate": 5e-06,
"loss": 0.7757,
"step": 590
},
{
"epoch": 1.1661807580174928,
"grad_norm": 0.6190447420364188,
"learning_rate": 5e-06,
"loss": 0.7754,
"step": 600
},
{
"epoch": 1.185617103984451,
"grad_norm": 0.7074708178383962,
"learning_rate": 5e-06,
"loss": 0.7738,
"step": 610
},
{
"epoch": 1.2050534499514092,
"grad_norm": 0.5708793884696434,
"learning_rate": 5e-06,
"loss": 0.7675,
"step": 620
},
{
"epoch": 1.2244897959183674,
"grad_norm": 0.5278424041049065,
"learning_rate": 5e-06,
"loss": 0.7724,
"step": 630
},
{
"epoch": 1.2439261418853256,
"grad_norm": 0.48269223376284837,
"learning_rate": 5e-06,
"loss": 0.7678,
"step": 640
},
{
"epoch": 1.2633624878522838,
"grad_norm": 0.6628438951702088,
"learning_rate": 5e-06,
"loss": 0.772,
"step": 650
},
{
"epoch": 1.282798833819242,
"grad_norm": 0.49956285734450795,
"learning_rate": 5e-06,
"loss": 0.78,
"step": 660
},
{
"epoch": 1.3022351797862002,
"grad_norm": 0.4482989241465936,
"learning_rate": 5e-06,
"loss": 0.7712,
"step": 670
},
{
"epoch": 1.3216715257531584,
"grad_norm": 0.515252608881534,
"learning_rate": 5e-06,
"loss": 0.7712,
"step": 680
},
{
"epoch": 1.3411078717201166,
"grad_norm": 0.5392787594093453,
"learning_rate": 5e-06,
"loss": 0.7733,
"step": 690
},
{
"epoch": 1.3605442176870748,
"grad_norm": 0.5239288581769422,
"learning_rate": 5e-06,
"loss": 0.7744,
"step": 700
},
{
"epoch": 1.379980563654033,
"grad_norm": 0.5368087860350439,
"learning_rate": 5e-06,
"loss": 0.7721,
"step": 710
},
{
"epoch": 1.3994169096209912,
"grad_norm": 0.5331498843832938,
"learning_rate": 5e-06,
"loss": 0.7702,
"step": 720
},
{
"epoch": 1.4188532555879494,
"grad_norm": 0.5840718481917428,
"learning_rate": 5e-06,
"loss": 0.7755,
"step": 730
},
{
"epoch": 1.4382896015549078,
"grad_norm": 0.537679455083028,
"learning_rate": 5e-06,
"loss": 0.7719,
"step": 740
},
{
"epoch": 1.4577259475218658,
"grad_norm": 0.7948828701245976,
"learning_rate": 5e-06,
"loss": 0.7717,
"step": 750
},
{
"epoch": 1.4771622934888242,
"grad_norm": 0.5813227807421696,
"learning_rate": 5e-06,
"loss": 0.7763,
"step": 760
},
{
"epoch": 1.4965986394557822,
"grad_norm": 0.6049608273143411,
"learning_rate": 5e-06,
"loss": 0.7719,
"step": 770
},
{
"epoch": 1.5160349854227406,
"grad_norm": 0.6057676712274179,
"learning_rate": 5e-06,
"loss": 0.7742,
"step": 780
},
{
"epoch": 1.5354713313896986,
"grad_norm": 0.625042201984692,
"learning_rate": 5e-06,
"loss": 0.7767,
"step": 790
},
{
"epoch": 1.554907677356657,
"grad_norm": 0.5502470811006085,
"learning_rate": 5e-06,
"loss": 0.769,
"step": 800
},
{
"epoch": 1.574344023323615,
"grad_norm": 0.5857965121292225,
"learning_rate": 5e-06,
"loss": 0.7709,
"step": 810
},
{
"epoch": 1.5937803692905734,
"grad_norm": 0.613513782295781,
"learning_rate": 5e-06,
"loss": 0.7755,
"step": 820
},
{
"epoch": 1.6132167152575316,
"grad_norm": 0.583240417199926,
"learning_rate": 5e-06,
"loss": 0.7738,
"step": 830
},
{
"epoch": 1.6326530612244898,
"grad_norm": 0.5231701163074167,
"learning_rate": 5e-06,
"loss": 0.7731,
"step": 840
},
{
"epoch": 1.652089407191448,
"grad_norm": 0.4914609295920534,
"learning_rate": 5e-06,
"loss": 0.7714,
"step": 850
},
{
"epoch": 1.6715257531584062,
"grad_norm": 0.5250445862469549,
"learning_rate": 5e-06,
"loss": 0.7747,
"step": 860
},
{
"epoch": 1.6909620991253644,
"grad_norm": 0.5415316148110199,
"learning_rate": 5e-06,
"loss": 0.7685,
"step": 870
},
{
"epoch": 1.7103984450923226,
"grad_norm": 0.5426871063126633,
"learning_rate": 5e-06,
"loss": 0.7687,
"step": 880
},
{
"epoch": 1.7298347910592808,
"grad_norm": 0.5573890854917875,
"learning_rate": 5e-06,
"loss": 0.7732,
"step": 890
},
{
"epoch": 1.749271137026239,
"grad_norm": 0.5966240890058521,
"learning_rate": 5e-06,
"loss": 0.7695,
"step": 900
},
{
"epoch": 1.7687074829931972,
"grad_norm": 0.48852508189672406,
"learning_rate": 5e-06,
"loss": 0.7716,
"step": 910
},
{
"epoch": 1.7881438289601554,
"grad_norm": 0.510423049422432,
"learning_rate": 5e-06,
"loss": 0.7695,
"step": 920
},
{
"epoch": 1.8075801749271136,
"grad_norm": 0.5128362713697912,
"learning_rate": 5e-06,
"loss": 0.768,
"step": 930
},
{
"epoch": 1.8270165208940718,
"grad_norm": 0.5100892708497722,
"learning_rate": 5e-06,
"loss": 0.7721,
"step": 940
},
{
"epoch": 1.8464528668610303,
"grad_norm": 0.5050841552954286,
"learning_rate": 5e-06,
"loss": 0.7703,
"step": 950
},
{
"epoch": 1.8658892128279883,
"grad_norm": 0.5100122636839687,
"learning_rate": 5e-06,
"loss": 0.7675,
"step": 960
},
{
"epoch": 1.8853255587949467,
"grad_norm": 0.5858103448907649,
"learning_rate": 5e-06,
"loss": 0.7719,
"step": 970
},
{
"epoch": 1.9047619047619047,
"grad_norm": 0.7186164442084766,
"learning_rate": 5e-06,
"loss": 0.7704,
"step": 980
},
{
"epoch": 1.924198250728863,
"grad_norm": 0.49187325414902705,
"learning_rate": 5e-06,
"loss": 0.7744,
"step": 990
},
{
"epoch": 1.943634596695821,
"grad_norm": 0.48132179189009133,
"learning_rate": 5e-06,
"loss": 0.7666,
"step": 1000
},
{
"epoch": 1.9630709426627795,
"grad_norm": 0.6042950330363083,
"learning_rate": 5e-06,
"loss": 0.7682,
"step": 1010
},
{
"epoch": 1.9825072886297375,
"grad_norm": 0.5107563561120851,
"learning_rate": 5e-06,
"loss": 0.7688,
"step": 1020
},
{
"epoch": 2.0,
"eval_loss": 0.8017936944961548,
"eval_runtime": 549.3546,
"eval_samples_per_second": 25.235,
"eval_steps_per_second": 0.395,
"step": 1029
},
{
"epoch": 2.001943634596696,
"grad_norm": 0.8520236880105687,
"learning_rate": 5e-06,
"loss": 0.8065,
"step": 1030
},
{
"epoch": 2.021379980563654,
"grad_norm": 0.6759369335284802,
"learning_rate": 5e-06,
"loss": 0.7247,
"step": 1040
},
{
"epoch": 2.0408163265306123,
"grad_norm": 0.6810404313004184,
"learning_rate": 5e-06,
"loss": 0.7257,
"step": 1050
},
{
"epoch": 2.0602526724975703,
"grad_norm": 0.5976865962159595,
"learning_rate": 5e-06,
"loss": 0.7266,
"step": 1060
},
{
"epoch": 2.0796890184645287,
"grad_norm": 0.5170299538250477,
"learning_rate": 5e-06,
"loss": 0.7262,
"step": 1070
},
{
"epoch": 2.0991253644314867,
"grad_norm": 0.636746039057963,
"learning_rate": 5e-06,
"loss": 0.7271,
"step": 1080
},
{
"epoch": 2.118561710398445,
"grad_norm": 0.7347207445428233,
"learning_rate": 5e-06,
"loss": 0.7221,
"step": 1090
},
{
"epoch": 2.1379980563654035,
"grad_norm": 0.6669755018146,
"learning_rate": 5e-06,
"loss": 0.7277,
"step": 1100
},
{
"epoch": 2.1574344023323615,
"grad_norm": 0.5653238227925762,
"learning_rate": 5e-06,
"loss": 0.726,
"step": 1110
},
{
"epoch": 2.17687074829932,
"grad_norm": 0.5720425000083328,
"learning_rate": 5e-06,
"loss": 0.7271,
"step": 1120
},
{
"epoch": 2.196307094266278,
"grad_norm": 0.5494809428119856,
"learning_rate": 5e-06,
"loss": 0.727,
"step": 1130
},
{
"epoch": 2.2157434402332363,
"grad_norm": 0.6708852933316355,
"learning_rate": 5e-06,
"loss": 0.7286,
"step": 1140
},
{
"epoch": 2.2351797862001943,
"grad_norm": 0.5649904350477953,
"learning_rate": 5e-06,
"loss": 0.7253,
"step": 1150
},
{
"epoch": 2.2546161321671527,
"grad_norm": 0.6681152322447659,
"learning_rate": 5e-06,
"loss": 0.7227,
"step": 1160
},
{
"epoch": 2.2740524781341107,
"grad_norm": 0.7452957502891413,
"learning_rate": 5e-06,
"loss": 0.7271,
"step": 1170
},
{
"epoch": 2.293488824101069,
"grad_norm": 0.730891816162587,
"learning_rate": 5e-06,
"loss": 0.7297,
"step": 1180
},
{
"epoch": 2.312925170068027,
"grad_norm": 0.5422066002537126,
"learning_rate": 5e-06,
"loss": 0.7291,
"step": 1190
},
{
"epoch": 2.3323615160349855,
"grad_norm": 0.5368279161848004,
"learning_rate": 5e-06,
"loss": 0.7291,
"step": 1200
},
{
"epoch": 2.3517978620019435,
"grad_norm": 0.5831776325357405,
"learning_rate": 5e-06,
"loss": 0.7299,
"step": 1210
},
{
"epoch": 2.371234207968902,
"grad_norm": 0.4988880812457934,
"learning_rate": 5e-06,
"loss": 0.7286,
"step": 1220
},
{
"epoch": 2.39067055393586,
"grad_norm": 0.6745820300056689,
"learning_rate": 5e-06,
"loss": 0.7246,
"step": 1230
},
{
"epoch": 2.4101068999028183,
"grad_norm": 0.6502142374990822,
"learning_rate": 5e-06,
"loss": 0.7274,
"step": 1240
},
{
"epoch": 2.4295432458697763,
"grad_norm": 0.6686318012888572,
"learning_rate": 5e-06,
"loss": 0.7273,
"step": 1250
},
{
"epoch": 2.4489795918367347,
"grad_norm": 0.5959741098147713,
"learning_rate": 5e-06,
"loss": 0.7296,
"step": 1260
},
{
"epoch": 2.4684159378036927,
"grad_norm": 0.6933996442905096,
"learning_rate": 5e-06,
"loss": 0.7265,
"step": 1270
},
{
"epoch": 2.487852283770651,
"grad_norm": 0.5220203781381132,
"learning_rate": 5e-06,
"loss": 0.7279,
"step": 1280
},
{
"epoch": 2.5072886297376096,
"grad_norm": 0.5382425321528858,
"learning_rate": 5e-06,
"loss": 0.7305,
"step": 1290
},
{
"epoch": 2.5267249757045676,
"grad_norm": 0.5181218910911854,
"learning_rate": 5e-06,
"loss": 0.7249,
"step": 1300
},
{
"epoch": 2.5461613216715255,
"grad_norm": 0.6478067615615305,
"learning_rate": 5e-06,
"loss": 0.7319,
"step": 1310
},
{
"epoch": 2.565597667638484,
"grad_norm": 0.5078942293566884,
"learning_rate": 5e-06,
"loss": 0.7288,
"step": 1320
},
{
"epoch": 2.5850340136054424,
"grad_norm": 0.6268137880948265,
"learning_rate": 5e-06,
"loss": 0.7299,
"step": 1330
},
{
"epoch": 2.6044703595724004,
"grad_norm": 0.7996921164519973,
"learning_rate": 5e-06,
"loss": 0.7332,
"step": 1340
},
{
"epoch": 2.6239067055393583,
"grad_norm": 0.6835765316105765,
"learning_rate": 5e-06,
"loss": 0.7332,
"step": 1350
},
{
"epoch": 2.6433430515063168,
"grad_norm": 0.6208677881375628,
"learning_rate": 5e-06,
"loss": 0.7255,
"step": 1360
},
{
"epoch": 2.662779397473275,
"grad_norm": 0.5685215567462071,
"learning_rate": 5e-06,
"loss": 0.7311,
"step": 1370
},
{
"epoch": 2.682215743440233,
"grad_norm": 0.502556515949076,
"learning_rate": 5e-06,
"loss": 0.7288,
"step": 1380
},
{
"epoch": 2.7016520894071916,
"grad_norm": 0.5557319329244653,
"learning_rate": 5e-06,
"loss": 0.7252,
"step": 1390
},
{
"epoch": 2.7210884353741496,
"grad_norm": 0.5681474343567127,
"learning_rate": 5e-06,
"loss": 0.7324,
"step": 1400
},
{
"epoch": 2.740524781341108,
"grad_norm": 0.5718856742663174,
"learning_rate": 5e-06,
"loss": 0.7314,
"step": 1410
},
{
"epoch": 2.759961127308066,
"grad_norm": 0.5207890225289823,
"learning_rate": 5e-06,
"loss": 0.7327,
"step": 1420
},
{
"epoch": 2.7793974732750244,
"grad_norm": 0.5610775317663003,
"learning_rate": 5e-06,
"loss": 0.7325,
"step": 1430
},
{
"epoch": 2.7988338192419824,
"grad_norm": 0.49435740403271333,
"learning_rate": 5e-06,
"loss": 0.7306,
"step": 1440
},
{
"epoch": 2.818270165208941,
"grad_norm": 0.5548340643212172,
"learning_rate": 5e-06,
"loss": 0.7248,
"step": 1450
},
{
"epoch": 2.837706511175899,
"grad_norm": 0.8162943266425523,
"learning_rate": 5e-06,
"loss": 0.728,
"step": 1460
},
{
"epoch": 2.857142857142857,
"grad_norm": 0.5605722072050826,
"learning_rate": 5e-06,
"loss": 0.7321,
"step": 1470
},
{
"epoch": 2.8765792031098156,
"grad_norm": 0.5811312094740239,
"learning_rate": 5e-06,
"loss": 0.7312,
"step": 1480
},
{
"epoch": 2.8960155490767736,
"grad_norm": 0.581575144969956,
"learning_rate": 5e-06,
"loss": 0.7321,
"step": 1490
},
{
"epoch": 2.9154518950437316,
"grad_norm": 0.5099095695816087,
"learning_rate": 5e-06,
"loss": 0.7334,
"step": 1500
},
{
"epoch": 2.93488824101069,
"grad_norm": 0.5572721217938963,
"learning_rate": 5e-06,
"loss": 0.7308,
"step": 1510
},
{
"epoch": 2.9543245869776484,
"grad_norm": 0.522930114991094,
"learning_rate": 5e-06,
"loss": 0.7295,
"step": 1520
},
{
"epoch": 2.9737609329446064,
"grad_norm": 0.6436319992243297,
"learning_rate": 5e-06,
"loss": 0.7331,
"step": 1530
},
{
"epoch": 2.9931972789115644,
"grad_norm": 0.6235339876579238,
"learning_rate": 5e-06,
"loss": 0.7263,
"step": 1540
},
{
"epoch": 2.997084548104956,
"eval_loss": 0.8005240559577942,
"eval_runtime": 550.1116,
"eval_samples_per_second": 25.2,
"eval_steps_per_second": 0.394,
"step": 1542
},
{
"epoch": 2.997084548104956,
"step": 1542,
"total_flos": 2582698052812800.0,
"train_loss": 0.7834810720498151,
"train_runtime": 91136.0835,
"train_samples_per_second": 8.67,
"train_steps_per_second": 0.017
}
],
"logging_steps": 10,
"max_steps": 1542,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2582698052812800.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}