{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.997084548104956,
  "eval_steps": 500,
  "global_step": 1542,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.019436345966958212,
      "grad_norm": 3.5477320746078096,
      "learning_rate": 5e-06,
      "loss": 1.0532,
      "step": 10
    },
    {
      "epoch": 0.038872691933916424,
      "grad_norm": 2.8209235996953512,
      "learning_rate": 5e-06,
      "loss": 0.9684,
      "step": 20
    },
    {
      "epoch": 0.05830903790087463,
      "grad_norm": 1.1257852692164845,
      "learning_rate": 5e-06,
      "loss": 0.9294,
      "step": 30
    },
    {
      "epoch": 0.07774538386783285,
      "grad_norm": 0.888481539231949,
      "learning_rate": 5e-06,
      "loss": 0.9201,
      "step": 40
    },
    {
      "epoch": 0.09718172983479106,
      "grad_norm": 0.6469537614046023,
      "learning_rate": 5e-06,
      "loss": 0.8971,
      "step": 50
    },
    {
      "epoch": 0.11661807580174927,
      "grad_norm": 0.592544408615531,
      "learning_rate": 5e-06,
      "loss": 0.8837,
      "step": 60
    },
    {
      "epoch": 0.1360544217687075,
      "grad_norm": 0.6433313667438364,
      "learning_rate": 5e-06,
      "loss": 0.8749,
      "step": 70
    },
    {
      "epoch": 0.1554907677356657,
      "grad_norm": 1.344429915972531,
      "learning_rate": 5e-06,
      "loss": 0.867,
      "step": 80
    },
    {
      "epoch": 0.1749271137026239,
      "grad_norm": 0.8707935340603853,
      "learning_rate": 5e-06,
      "loss": 0.8623,
      "step": 90
    },
    {
      "epoch": 0.19436345966958213,
      "grad_norm": 0.7828161719097383,
      "learning_rate": 5e-06,
      "loss": 0.8624,
      "step": 100
    },
    {
      "epoch": 0.21379980563654033,
      "grad_norm": 0.682126021781355,
      "learning_rate": 5e-06,
      "loss": 0.8611,
      "step": 110
    },
    {
      "epoch": 0.23323615160349853,
      "grad_norm": 0.8081328723422389,
      "learning_rate": 5e-06,
      "loss": 0.8551,
      "step": 120
    },
    {
      "epoch": 0.25267249757045673,
      "grad_norm": 0.6691994077909735,
      "learning_rate": 5e-06,
      "loss": 0.8583,
      "step": 130
    },
    {
      "epoch": 0.272108843537415,
      "grad_norm": 0.586140300081327,
      "learning_rate": 5e-06,
      "loss": 0.8523,
      "step": 140
    },
    {
      "epoch": 0.2915451895043732,
      "grad_norm": 0.8552001050583659,
      "learning_rate": 5e-06,
      "loss": 0.8497,
      "step": 150
    },
    {
      "epoch": 0.3109815354713314,
      "grad_norm": 0.501523965918074,
      "learning_rate": 5e-06,
      "loss": 0.8464,
      "step": 160
    },
    {
      "epoch": 0.3304178814382896,
      "grad_norm": 0.6785445153206255,
      "learning_rate": 5e-06,
      "loss": 0.848,
      "step": 170
    },
    {
      "epoch": 0.3498542274052478,
      "grad_norm": 0.6494060293856031,
      "learning_rate": 5e-06,
      "loss": 0.8459,
      "step": 180
    },
    {
      "epoch": 0.369290573372206,
      "grad_norm": 0.6020737926390343,
      "learning_rate": 5e-06,
      "loss": 0.8484,
      "step": 190
    },
    {
      "epoch": 0.38872691933916426,
      "grad_norm": 0.48912875825316915,
      "learning_rate": 5e-06,
      "loss": 0.8449,
      "step": 200
    },
    {
      "epoch": 0.40816326530612246,
      "grad_norm": 0.8781137047011839,
      "learning_rate": 5e-06,
      "loss": 0.8395,
      "step": 210
    },
    {
      "epoch": 0.42759961127308066,
      "grad_norm": 0.5879468826056136,
      "learning_rate": 5e-06,
      "loss": 0.8387,
      "step": 220
    },
    {
      "epoch": 0.44703595724003886,
      "grad_norm": 0.6017675792916065,
      "learning_rate": 5e-06,
      "loss": 0.8363,
      "step": 230
    },
    {
      "epoch": 0.46647230320699706,
      "grad_norm": 0.616647981494789,
      "learning_rate": 5e-06,
      "loss": 0.8336,
      "step": 240
    },
    {
      "epoch": 0.4859086491739553,
      "grad_norm": 0.855268617382177,
      "learning_rate": 5e-06,
      "loss": 0.8336,
      "step": 250
    },
    {
      "epoch": 0.5053449951409135,
      "grad_norm": 0.780188445457583,
      "learning_rate": 5e-06,
      "loss": 0.831,
      "step": 260
    },
    {
      "epoch": 0.5247813411078717,
      "grad_norm": 0.5840816004625115,
      "learning_rate": 5e-06,
      "loss": 0.8319,
      "step": 270
    },
    {
      "epoch": 0.54421768707483,
      "grad_norm": 0.6535257947752856,
      "learning_rate": 5e-06,
      "loss": 0.833,
      "step": 280
    },
    {
      "epoch": 0.5636540330417882,
      "grad_norm": 0.6710102563759031,
      "learning_rate": 5e-06,
      "loss": 0.8354,
      "step": 290
    },
    {
      "epoch": 0.5830903790087464,
      "grad_norm": 0.5947252085113186,
      "learning_rate": 5e-06,
      "loss": 0.8315,
      "step": 300
    },
    {
      "epoch": 0.6025267249757046,
      "grad_norm": 0.5902161171422673,
      "learning_rate": 5e-06,
      "loss": 0.83,
      "step": 310
    },
    {
      "epoch": 0.6219630709426628,
      "grad_norm": 0.5757165816873938,
      "learning_rate": 5e-06,
      "loss": 0.8299,
      "step": 320
    },
    {
      "epoch": 0.641399416909621,
      "grad_norm": 0.6751753868245474,
      "learning_rate": 5e-06,
      "loss": 0.827,
      "step": 330
    },
    {
      "epoch": 0.6608357628765792,
      "grad_norm": 0.535560817394151,
      "learning_rate": 5e-06,
      "loss": 0.8264,
      "step": 340
    },
    {
      "epoch": 0.6802721088435374,
      "grad_norm": 0.6183377886108462,
      "learning_rate": 5e-06,
      "loss": 0.8243,
      "step": 350
    },
    {
      "epoch": 0.6997084548104956,
      "grad_norm": 0.6200501825686097,
      "learning_rate": 5e-06,
      "loss": 0.828,
      "step": 360
    },
    {
      "epoch": 0.7191448007774538,
      "grad_norm": 0.5622254912052161,
      "learning_rate": 5e-06,
      "loss": 0.8287,
      "step": 370
    },
    {
      "epoch": 0.738581146744412,
      "grad_norm": 0.6720527159909909,
      "learning_rate": 5e-06,
      "loss": 0.8234,
      "step": 380
    },
    {
      "epoch": 0.7580174927113703,
      "grad_norm": 0.5315560166276624,
      "learning_rate": 5e-06,
      "loss": 0.824,
      "step": 390
    },
    {
      "epoch": 0.7774538386783285,
      "grad_norm": 0.6413527148042328,
      "learning_rate": 5e-06,
      "loss": 0.8194,
      "step": 400
    },
    {
      "epoch": 0.7968901846452867,
      "grad_norm": 0.6402327795437167,
      "learning_rate": 5e-06,
      "loss": 0.8243,
      "step": 410
    },
    {
      "epoch": 0.8163265306122449,
      "grad_norm": 0.6290935177044384,
      "learning_rate": 5e-06,
      "loss": 0.8201,
      "step": 420
    },
    {
      "epoch": 0.8357628765792031,
      "grad_norm": 0.6137598310285064,
      "learning_rate": 5e-06,
      "loss": 0.8187,
      "step": 430
    },
    {
      "epoch": 0.8551992225461613,
      "grad_norm": 0.6137894354862566,
      "learning_rate": 5e-06,
      "loss": 0.8217,
      "step": 440
    },
    {
      "epoch": 0.8746355685131195,
      "grad_norm": 0.7376542092364302,
      "learning_rate": 5e-06,
      "loss": 0.8191,
      "step": 450
    },
    {
      "epoch": 0.8940719144800777,
      "grad_norm": 0.5443940007812901,
      "learning_rate": 5e-06,
      "loss": 0.8179,
      "step": 460
    },
    {
      "epoch": 0.9135082604470359,
      "grad_norm": 0.6778023088897194,
      "learning_rate": 5e-06,
      "loss": 0.8158,
      "step": 470
    },
    {
      "epoch": 0.9329446064139941,
      "grad_norm": 0.6040677193471313,
      "learning_rate": 5e-06,
      "loss": 0.8176,
      "step": 480
    },
    {
      "epoch": 0.9523809523809523,
      "grad_norm": 0.5741967517130403,
      "learning_rate": 5e-06,
      "loss": 0.8215,
      "step": 490
    },
    {
      "epoch": 0.9718172983479106,
      "grad_norm": 0.5184977635424605,
      "learning_rate": 5e-06,
      "loss": 0.8218,
      "step": 500
    },
    {
      "epoch": 0.9912536443148688,
      "grad_norm": 0.5484601778792662,
      "learning_rate": 5e-06,
      "loss": 0.8158,
      "step": 510
    },
    {
      "epoch": 0.9990281827016521,
      "eval_loss": 0.8152287006378174,
      "eval_runtime": 547.9504,
      "eval_samples_per_second": 25.3,
      "eval_steps_per_second": 0.396,
      "step": 514
    },
    {
      "epoch": 1.010689990281827,
      "grad_norm": 0.6821565020818955,
      "learning_rate": 5e-06,
      "loss": 0.8355,
      "step": 520
    },
    {
      "epoch": 1.0301263362487851,
      "grad_norm": 0.6277981964271326,
      "learning_rate": 5e-06,
      "loss": 0.7749,
      "step": 530
    },
    {
      "epoch": 1.0495626822157433,
      "grad_norm": 0.6114756472079457,
      "learning_rate": 5e-06,
      "loss": 0.7751,
      "step": 540
    },
    {
      "epoch": 1.0689990281827018,
      "grad_norm": 0.8259615966427586,
      "learning_rate": 5e-06,
      "loss": 0.7756,
      "step": 550
    },
    {
      "epoch": 1.08843537414966,
      "grad_norm": 0.6113352198439804,
      "learning_rate": 5e-06,
      "loss": 0.7789,
      "step": 560
    },
    {
      "epoch": 1.1078717201166182,
      "grad_norm": 0.5269512429419262,
      "learning_rate": 5e-06,
      "loss": 0.7784,
      "step": 570
    },
    {
      "epoch": 1.1273080660835764,
      "grad_norm": 0.5792710933468033,
      "learning_rate": 5e-06,
      "loss": 0.7757,
      "step": 580
    },
    {
      "epoch": 1.1467444120505346,
      "grad_norm": 0.5467198449481481,
      "learning_rate": 5e-06,
      "loss": 0.7757,
      "step": 590
    },
    {
      "epoch": 1.1661807580174928,
      "grad_norm": 0.6190447420364188,
      "learning_rate": 5e-06,
      "loss": 0.7754,
      "step": 600
    },
    {
      "epoch": 1.185617103984451,
      "grad_norm": 0.7074708178383962,
      "learning_rate": 5e-06,
      "loss": 0.7738,
      "step": 610
    },
    {
      "epoch": 1.2050534499514092,
      "grad_norm": 0.5708793884696434,
      "learning_rate": 5e-06,
      "loss": 0.7675,
      "step": 620
    },
    {
      "epoch": 1.2244897959183674,
      "grad_norm": 0.5278424041049065,
      "learning_rate": 5e-06,
      "loss": 0.7724,
      "step": 630
    },
    {
      "epoch": 1.2439261418853256,
      "grad_norm": 0.48269223376284837,
      "learning_rate": 5e-06,
      "loss": 0.7678,
      "step": 640
    },
    {
      "epoch": 1.2633624878522838,
      "grad_norm": 0.6628438951702088,
      "learning_rate": 5e-06,
      "loss": 0.772,
      "step": 650
    },
    {
      "epoch": 1.282798833819242,
      "grad_norm": 0.49956285734450795,
      "learning_rate": 5e-06,
      "loss": 0.78,
      "step": 660
    },
    {
      "epoch": 1.3022351797862002,
      "grad_norm": 0.4482989241465936,
      "learning_rate": 5e-06,
      "loss": 0.7712,
      "step": 670
    },
    {
      "epoch": 1.3216715257531584,
      "grad_norm": 0.515252608881534,
      "learning_rate": 5e-06,
      "loss": 0.7712,
      "step": 680
    },
    {
      "epoch": 1.3411078717201166,
      "grad_norm": 0.5392787594093453,
      "learning_rate": 5e-06,
      "loss": 0.7733,
      "step": 690
    },
    {
      "epoch": 1.3605442176870748,
      "grad_norm": 0.5239288581769422,
      "learning_rate": 5e-06,
      "loss": 0.7744,
      "step": 700
    },
    {
      "epoch": 1.379980563654033,
      "grad_norm": 0.5368087860350439,
      "learning_rate": 5e-06,
      "loss": 0.7721,
      "step": 710
    },
    {
      "epoch": 1.3994169096209912,
      "grad_norm": 0.5331498843832938,
      "learning_rate": 5e-06,
      "loss": 0.7702,
      "step": 720
    },
    {
      "epoch": 1.4188532555879494,
      "grad_norm": 0.5840718481917428,
      "learning_rate": 5e-06,
      "loss": 0.7755,
      "step": 730
    },
    {
      "epoch": 1.4382896015549078,
      "grad_norm": 0.537679455083028,
      "learning_rate": 5e-06,
      "loss": 0.7719,
      "step": 740
    },
    {
      "epoch": 1.4577259475218658,
      "grad_norm": 0.7948828701245976,
      "learning_rate": 5e-06,
      "loss": 0.7717,
      "step": 750
    },
    {
      "epoch": 1.4771622934888242,
      "grad_norm": 0.5813227807421696,
      "learning_rate": 5e-06,
      "loss": 0.7763,
      "step": 760
    },
    {
      "epoch": 1.4965986394557822,
      "grad_norm": 0.6049608273143411,
      "learning_rate": 5e-06,
      "loss": 0.7719,
      "step": 770
    },
    {
      "epoch": 1.5160349854227406,
      "grad_norm": 0.6057676712274179,
      "learning_rate": 5e-06,
      "loss": 0.7742,
      "step": 780
    },
    {
      "epoch": 1.5354713313896986,
      "grad_norm": 0.625042201984692,
      "learning_rate": 5e-06,
      "loss": 0.7767,
      "step": 790
    },
    {
      "epoch": 1.554907677356657,
      "grad_norm": 0.5502470811006085,
      "learning_rate": 5e-06,
      "loss": 0.769,
      "step": 800
    },
    {
      "epoch": 1.574344023323615,
      "grad_norm": 0.5857965121292225,
      "learning_rate": 5e-06,
      "loss": 0.7709,
      "step": 810
    },
    {
      "epoch": 1.5937803692905734,
      "grad_norm": 0.613513782295781,
      "learning_rate": 5e-06,
      "loss": 0.7755,
      "step": 820
    },
    {
      "epoch": 1.6132167152575316,
      "grad_norm": 0.583240417199926,
      "learning_rate": 5e-06,
      "loss": 0.7738,
      "step": 830
    },
    {
      "epoch": 1.6326530612244898,
      "grad_norm": 0.5231701163074167,
      "learning_rate": 5e-06,
      "loss": 0.7731,
      "step": 840
    },
    {
      "epoch": 1.652089407191448,
      "grad_norm": 0.4914609295920534,
      "learning_rate": 5e-06,
      "loss": 0.7714,
      "step": 850
    },
    {
      "epoch": 1.6715257531584062,
      "grad_norm": 0.5250445862469549,
      "learning_rate": 5e-06,
      "loss": 0.7747,
      "step": 860
    },
    {
      "epoch": 1.6909620991253644,
      "grad_norm": 0.5415316148110199,
      "learning_rate": 5e-06,
      "loss": 0.7685,
      "step": 870
    },
    {
      "epoch": 1.7103984450923226,
      "grad_norm": 0.5426871063126633,
      "learning_rate": 5e-06,
      "loss": 0.7687,
      "step": 880
    },
    {
      "epoch": 1.7298347910592808,
      "grad_norm": 0.5573890854917875,
      "learning_rate": 5e-06,
      "loss": 0.7732,
      "step": 890
    },
    {
      "epoch": 1.749271137026239,
      "grad_norm": 0.5966240890058521,
      "learning_rate": 5e-06,
      "loss": 0.7695,
      "step": 900
    },
    {
      "epoch": 1.7687074829931972,
      "grad_norm": 0.48852508189672406,
      "learning_rate": 5e-06,
      "loss": 0.7716,
      "step": 910
    },
    {
      "epoch": 1.7881438289601554,
      "grad_norm": 0.510423049422432,
      "learning_rate": 5e-06,
      "loss": 0.7695,
      "step": 920
    },
    {
      "epoch": 1.8075801749271136,
      "grad_norm": 0.5128362713697912,
      "learning_rate": 5e-06,
      "loss": 0.768,
      "step": 930
    },
    {
      "epoch": 1.8270165208940718,
      "grad_norm": 0.5100892708497722,
      "learning_rate": 5e-06,
      "loss": 0.7721,
      "step": 940
    },
    {
      "epoch": 1.8464528668610303,
      "grad_norm": 0.5050841552954286,
      "learning_rate": 5e-06,
      "loss": 0.7703,
      "step": 950
    },
    {
      "epoch": 1.8658892128279883,
      "grad_norm": 0.5100122636839687,
      "learning_rate": 5e-06,
      "loss": 0.7675,
      "step": 960
    },
    {
      "epoch": 1.8853255587949467,
      "grad_norm": 0.5858103448907649,
      "learning_rate": 5e-06,
      "loss": 0.7719,
      "step": 970
    },
    {
      "epoch": 1.9047619047619047,
      "grad_norm": 0.7186164442084766,
      "learning_rate": 5e-06,
      "loss": 0.7704,
      "step": 980
    },
    {
      "epoch": 1.924198250728863,
      "grad_norm": 0.49187325414902705,
      "learning_rate": 5e-06,
      "loss": 0.7744,
      "step": 990
    },
    {
      "epoch": 1.943634596695821,
      "grad_norm": 0.48132179189009133,
      "learning_rate": 5e-06,
      "loss": 0.7666,
      "step": 1000
    },
    {
      "epoch": 1.9630709426627795,
      "grad_norm": 0.6042950330363083,
      "learning_rate": 5e-06,
      "loss": 0.7682,
      "step": 1010
    },
    {
      "epoch": 1.9825072886297375,
      "grad_norm": 0.5107563561120851,
      "learning_rate": 5e-06,
      "loss": 0.7688,
      "step": 1020
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.8017936944961548,
      "eval_runtime": 549.3546,
      "eval_samples_per_second": 25.235,
      "eval_steps_per_second": 0.395,
      "step": 1029
    },
    {
      "epoch": 2.001943634596696,
      "grad_norm": 0.8520236880105687,
      "learning_rate": 5e-06,
      "loss": 0.8065,
      "step": 1030
    },
    {
      "epoch": 2.021379980563654,
      "grad_norm": 0.6759369335284802,
      "learning_rate": 5e-06,
      "loss": 0.7247,
      "step": 1040
    },
    {
      "epoch": 2.0408163265306123,
      "grad_norm": 0.6810404313004184,
      "learning_rate": 5e-06,
      "loss": 0.7257,
      "step": 1050
    },
    {
      "epoch": 2.0602526724975703,
      "grad_norm": 0.5976865962159595,
      "learning_rate": 5e-06,
      "loss": 0.7266,
      "step": 1060
    },
    {
      "epoch": 2.0796890184645287,
      "grad_norm": 0.5170299538250477,
      "learning_rate": 5e-06,
      "loss": 0.7262,
      "step": 1070
    },
    {
      "epoch": 2.0991253644314867,
      "grad_norm": 0.636746039057963,
      "learning_rate": 5e-06,
      "loss": 0.7271,
      "step": 1080
    },
    {
      "epoch": 2.118561710398445,
      "grad_norm": 0.7347207445428233,
      "learning_rate": 5e-06,
      "loss": 0.7221,
      "step": 1090
    },
    {
      "epoch": 2.1379980563654035,
      "grad_norm": 0.6669755018146,
      "learning_rate": 5e-06,
      "loss": 0.7277,
      "step": 1100
    },
    {
      "epoch": 2.1574344023323615,
      "grad_norm": 0.5653238227925762,
      "learning_rate": 5e-06,
      "loss": 0.726,
      "step": 1110
    },
    {
      "epoch": 2.17687074829932,
      "grad_norm": 0.5720425000083328,
      "learning_rate": 5e-06,
      "loss": 0.7271,
      "step": 1120
    },
    {
      "epoch": 2.196307094266278,
      "grad_norm": 0.5494809428119856,
      "learning_rate": 5e-06,
      "loss": 0.727,
      "step": 1130
    },
    {
      "epoch": 2.2157434402332363,
      "grad_norm": 0.6708852933316355,
      "learning_rate": 5e-06,
      "loss": 0.7286,
      "step": 1140
    },
    {
      "epoch": 2.2351797862001943,
      "grad_norm": 0.5649904350477953,
      "learning_rate": 5e-06,
      "loss": 0.7253,
      "step": 1150
    },
    {
      "epoch": 2.2546161321671527,
      "grad_norm": 0.6681152322447659,
      "learning_rate": 5e-06,
      "loss": 0.7227,
      "step": 1160
    },
    {
      "epoch": 2.2740524781341107,
      "grad_norm": 0.7452957502891413,
      "learning_rate": 5e-06,
      "loss": 0.7271,
      "step": 1170
    },
    {
      "epoch": 2.293488824101069,
      "grad_norm": 0.730891816162587,
      "learning_rate": 5e-06,
      "loss": 0.7297,
      "step": 1180
    },
    {
      "epoch": 2.312925170068027,
      "grad_norm": 0.5422066002537126,
      "learning_rate": 5e-06,
      "loss": 0.7291,
      "step": 1190
    },
    {
      "epoch": 2.3323615160349855,
      "grad_norm": 0.5368279161848004,
      "learning_rate": 5e-06,
      "loss": 0.7291,
      "step": 1200
    },
    {
      "epoch": 2.3517978620019435,
      "grad_norm": 0.5831776325357405,
      "learning_rate": 5e-06,
      "loss": 0.7299,
      "step": 1210
    },
    {
      "epoch": 2.371234207968902,
      "grad_norm": 0.4988880812457934,
      "learning_rate": 5e-06,
      "loss": 0.7286,
      "step": 1220
    },
    {
      "epoch": 2.39067055393586,
      "grad_norm": 0.6745820300056689,
      "learning_rate": 5e-06,
      "loss": 0.7246,
      "step": 1230
    },
    {
      "epoch": 2.4101068999028183,
      "grad_norm": 0.6502142374990822,
      "learning_rate": 5e-06,
      "loss": 0.7274,
      "step": 1240
    },
    {
      "epoch": 2.4295432458697763,
      "grad_norm": 0.6686318012888572,
      "learning_rate": 5e-06,
      "loss": 0.7273,
      "step": 1250
    },
    {
      "epoch": 2.4489795918367347,
      "grad_norm": 0.5959741098147713,
      "learning_rate": 5e-06,
      "loss": 0.7296,
      "step": 1260
    },
    {
      "epoch": 2.4684159378036927,
      "grad_norm": 0.6933996442905096,
      "learning_rate": 5e-06,
      "loss": 0.7265,
      "step": 1270
    },
    {
      "epoch": 2.487852283770651,
      "grad_norm": 0.5220203781381132,
      "learning_rate": 5e-06,
      "loss": 0.7279,
      "step": 1280
    },
    {
      "epoch": 2.5072886297376096,
      "grad_norm": 0.5382425321528858,
      "learning_rate": 5e-06,
      "loss": 0.7305,
      "step": 1290
    },
    {
      "epoch": 2.5267249757045676,
      "grad_norm": 0.5181218910911854,
      "learning_rate": 5e-06,
      "loss": 0.7249,
      "step": 1300
    },
    {
      "epoch": 2.5461613216715255,
      "grad_norm": 0.6478067615615305,
      "learning_rate": 5e-06,
      "loss": 0.7319,
      "step": 1310
    },
    {
      "epoch": 2.565597667638484,
      "grad_norm": 0.5078942293566884,
      "learning_rate": 5e-06,
      "loss": 0.7288,
      "step": 1320
    },
    {
      "epoch": 2.5850340136054424,
      "grad_norm": 0.6268137880948265,
      "learning_rate": 5e-06,
      "loss": 0.7299,
      "step": 1330
    },
    {
      "epoch": 2.6044703595724004,
      "grad_norm": 0.7996921164519973,
      "learning_rate": 5e-06,
      "loss": 0.7332,
      "step": 1340
    },
    {
      "epoch": 2.6239067055393583,
      "grad_norm": 0.6835765316105765,
      "learning_rate": 5e-06,
      "loss": 0.7332,
      "step": 1350
    },
    {
      "epoch": 2.6433430515063168,
      "grad_norm": 0.6208677881375628,
      "learning_rate": 5e-06,
      "loss": 0.7255,
      "step": 1360
    },
    {
      "epoch": 2.662779397473275,
      "grad_norm": 0.5685215567462071,
      "learning_rate": 5e-06,
      "loss": 0.7311,
      "step": 1370
    },
    {
      "epoch": 2.682215743440233,
      "grad_norm": 0.502556515949076,
      "learning_rate": 5e-06,
      "loss": 0.7288,
      "step": 1380
    },
    {
      "epoch": 2.7016520894071916,
      "grad_norm": 0.5557319329244653,
      "learning_rate": 5e-06,
      "loss": 0.7252,
      "step": 1390
    },
    {
      "epoch": 2.7210884353741496,
      "grad_norm": 0.5681474343567127,
      "learning_rate": 5e-06,
      "loss": 0.7324,
      "step": 1400
    },
    {
      "epoch": 2.740524781341108,
      "grad_norm": 0.5718856742663174,
      "learning_rate": 5e-06,
      "loss": 0.7314,
      "step": 1410
    },
    {
      "epoch": 2.759961127308066,
      "grad_norm": 0.5207890225289823,
      "learning_rate": 5e-06,
      "loss": 0.7327,
      "step": 1420
    },
    {
      "epoch": 2.7793974732750244,
      "grad_norm": 0.5610775317663003,
      "learning_rate": 5e-06,
      "loss": 0.7325,
      "step": 1430
    },
    {
      "epoch": 2.7988338192419824,
      "grad_norm": 0.49435740403271333,
      "learning_rate": 5e-06,
      "loss": 0.7306,
      "step": 1440
    },
    {
      "epoch": 2.818270165208941,
      "grad_norm": 0.5548340643212172,
      "learning_rate": 5e-06,
      "loss": 0.7248,
      "step": 1450
    },
    {
      "epoch": 2.837706511175899,
      "grad_norm": 0.8162943266425523,
      "learning_rate": 5e-06,
      "loss": 0.728,
      "step": 1460
    },
    {
      "epoch": 2.857142857142857,
      "grad_norm": 0.5605722072050826,
      "learning_rate": 5e-06,
      "loss": 0.7321,
      "step": 1470
    },
    {
      "epoch": 2.8765792031098156,
      "grad_norm": 0.5811312094740239,
      "learning_rate": 5e-06,
      "loss": 0.7312,
      "step": 1480
    },
    {
      "epoch": 2.8960155490767736,
      "grad_norm": 0.581575144969956,
      "learning_rate": 5e-06,
      "loss": 0.7321,
      "step": 1490
    },
    {
      "epoch": 2.9154518950437316,
      "grad_norm": 0.5099095695816087,
      "learning_rate": 5e-06,
      "loss": 0.7334,
      "step": 1500
    },
    {
      "epoch": 2.93488824101069,
      "grad_norm": 0.5572721217938963,
      "learning_rate": 5e-06,
      "loss": 0.7308,
      "step": 1510
    },
    {
      "epoch": 2.9543245869776484,
      "grad_norm": 0.522930114991094,
      "learning_rate": 5e-06,
      "loss": 0.7295,
      "step": 1520
    },
    {
      "epoch": 2.9737609329446064,
      "grad_norm": 0.6436319992243297,
      "learning_rate": 5e-06,
      "loss": 0.7331,
      "step": 1530
    },
    {
      "epoch": 2.9931972789115644,
      "grad_norm": 0.6235339876579238,
      "learning_rate": 5e-06,
      "loss": 0.7263,
      "step": 1540
    },
    {
      "epoch": 2.997084548104956,
      "eval_loss": 0.8005240559577942,
      "eval_runtime": 550.1116,
      "eval_samples_per_second": 25.2,
      "eval_steps_per_second": 0.394,
      "step": 1542
    },
    {
      "epoch": 2.997084548104956,
      "step": 1542,
      "total_flos": 2582698052812800.0,
      "train_loss": 0.7834810720498151,
      "train_runtime": 91136.0835,
      "train_samples_per_second": 8.67,
      "train_steps_per_second": 0.017
    }
  ],
  "logging_steps": 10,
  "max_steps": 1542,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2582698052812800.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}