|
{ |
|
"best_metric": 0.007947824895381927, |
|
"best_model_checkpoint": "./beans_outputs/checkpoint-3250", |
|
"epoch": 50.0, |
|
"eval_steps": 500, |
|
"global_step": 3250, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.07692307692307693, |
|
"grad_norm": 2.198366403579712, |
|
"learning_rate": 1.9692307692307696e-05, |
|
"loss": 1.0245, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.15384615384615385, |
|
"grad_norm": 1.8460220098495483, |
|
"learning_rate": 1.9384615384615386e-05, |
|
"loss": 0.9453, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.23076923076923078, |
|
"grad_norm": 2.117300271987915, |
|
"learning_rate": 1.907692307692308e-05, |
|
"loss": 0.8406, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.3076923076923077, |
|
"grad_norm": 2.645448684692383, |
|
"learning_rate": 1.876923076923077e-05, |
|
"loss": 0.6951, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.38461538461538464, |
|
"grad_norm": 4.24210262298584, |
|
"learning_rate": 1.8461538461538465e-05, |
|
"loss": 0.662, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.46153846153846156, |
|
"grad_norm": 2.095567464828491, |
|
"learning_rate": 1.8153846153846155e-05, |
|
"loss": 0.5811, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.5384615384615384, |
|
"grad_norm": 2.079636573791504, |
|
"learning_rate": 1.784615384615385e-05, |
|
"loss": 0.5193, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.6153846153846154, |
|
"grad_norm": 1.218340277671814, |
|
"learning_rate": 1.753846153846154e-05, |
|
"loss": 0.3945, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.6923076923076923, |
|
"grad_norm": 2.4416184425354004, |
|
"learning_rate": 1.7230769230769234e-05, |
|
"loss": 0.3528, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.7692307692307693, |
|
"grad_norm": 2.8852720260620117, |
|
"learning_rate": 1.6923076923076924e-05, |
|
"loss": 0.3689, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.8461538461538461, |
|
"grad_norm": 1.9503966569900513, |
|
"learning_rate": 1.6615384615384618e-05, |
|
"loss": 0.3368, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.9230769230769231, |
|
"grad_norm": 2.335829496383667, |
|
"learning_rate": 1.630769230769231e-05, |
|
"loss": 0.2468, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 7.0948638916015625, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 0.2859, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.9624060150375939, |
|
"eval_loss": 0.21892516314983368, |
|
"eval_runtime": 0.9855, |
|
"eval_samples_per_second": 134.957, |
|
"eval_steps_per_second": 17.25, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.0769230769230769, |
|
"grad_norm": 1.765810251235962, |
|
"learning_rate": 1.5692307692307693e-05, |
|
"loss": 0.2006, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.1538461538461537, |
|
"grad_norm": 3.1403791904449463, |
|
"learning_rate": 1.5384615384615387e-05, |
|
"loss": 0.2243, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.2307692307692308, |
|
"grad_norm": 1.4933677911758423, |
|
"learning_rate": 1.5076923076923078e-05, |
|
"loss": 0.2035, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.3076923076923077, |
|
"grad_norm": 0.5171657800674438, |
|
"learning_rate": 1.4769230769230772e-05, |
|
"loss": 0.2208, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.3846153846153846, |
|
"grad_norm": 0.600874662399292, |
|
"learning_rate": 1.4461538461538462e-05, |
|
"loss": 0.211, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.4615384615384617, |
|
"grad_norm": 3.5654282569885254, |
|
"learning_rate": 1.4153846153846156e-05, |
|
"loss": 0.1607, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.5384615384615383, |
|
"grad_norm": 0.47061973810195923, |
|
"learning_rate": 1.3846153846153847e-05, |
|
"loss": 0.2526, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.6153846153846154, |
|
"grad_norm": 0.6342479586601257, |
|
"learning_rate": 1.353846153846154e-05, |
|
"loss": 0.1804, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.6923076923076923, |
|
"grad_norm": 2.391136884689331, |
|
"learning_rate": 1.3230769230769231e-05, |
|
"loss": 0.2121, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.7692307692307692, |
|
"grad_norm": 1.8535972833633423, |
|
"learning_rate": 1.2923076923076925e-05, |
|
"loss": 0.1678, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.8461538461538463, |
|
"grad_norm": 11.256393432617188, |
|
"learning_rate": 1.2615384615384616e-05, |
|
"loss": 0.2574, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.9230769230769231, |
|
"grad_norm": 0.4076194167137146, |
|
"learning_rate": 1.230769230769231e-05, |
|
"loss": 0.1442, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.3490784168243408, |
|
"learning_rate": 1.2e-05, |
|
"loss": 0.1316, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.9699248120300752, |
|
"eval_loss": 0.13337133824825287, |
|
"eval_runtime": 0.9938, |
|
"eval_samples_per_second": 133.827, |
|
"eval_steps_per_second": 17.106, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 2.076923076923077, |
|
"grad_norm": 0.5328232645988464, |
|
"learning_rate": 1.1692307692307694e-05, |
|
"loss": 0.0897, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 2.1538461538461537, |
|
"grad_norm": 0.3375113904476166, |
|
"learning_rate": 1.1384615384615385e-05, |
|
"loss": 0.1475, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 2.230769230769231, |
|
"grad_norm": 7.195676326751709, |
|
"learning_rate": 1.1076923076923079e-05, |
|
"loss": 0.1321, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 2.3076923076923075, |
|
"grad_norm": 8.226975440979004, |
|
"learning_rate": 1.076923076923077e-05, |
|
"loss": 0.1375, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.3846153846153846, |
|
"grad_norm": 5.051274299621582, |
|
"learning_rate": 1.0461538461538463e-05, |
|
"loss": 0.1755, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 2.4615384615384617, |
|
"grad_norm": 0.30559489130973816, |
|
"learning_rate": 1.0153846153846154e-05, |
|
"loss": 0.1542, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 2.5384615384615383, |
|
"grad_norm": 6.551879405975342, |
|
"learning_rate": 9.846153846153848e-06, |
|
"loss": 0.1872, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 2.6153846153846154, |
|
"grad_norm": 1.2321629524230957, |
|
"learning_rate": 9.53846153846154e-06, |
|
"loss": 0.0714, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 2.6923076923076925, |
|
"grad_norm": 0.3684569299221039, |
|
"learning_rate": 9.230769230769232e-06, |
|
"loss": 0.1325, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.769230769230769, |
|
"grad_norm": 9.62742805480957, |
|
"learning_rate": 8.923076923076925e-06, |
|
"loss": 0.1203, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 2.8461538461538463, |
|
"grad_norm": 0.3335096836090088, |
|
"learning_rate": 8.615384615384617e-06, |
|
"loss": 0.154, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 2.9230769230769234, |
|
"grad_norm": 1.1392183303833008, |
|
"learning_rate": 8.307692307692309e-06, |
|
"loss": 0.1306, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.29587361216545105, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 0.1438, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.9699248120300752, |
|
"eval_loss": 0.09808181971311569, |
|
"eval_runtime": 0.9838, |
|
"eval_samples_per_second": 135.192, |
|
"eval_steps_per_second": 17.28, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 3.076923076923077, |
|
"grad_norm": 2.2563395500183105, |
|
"learning_rate": 7.692307692307694e-06, |
|
"loss": 0.1325, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 3.1538461538461537, |
|
"grad_norm": 6.758065223693848, |
|
"learning_rate": 7.384615384615386e-06, |
|
"loss": 0.154, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 3.230769230769231, |
|
"grad_norm": 8.712298393249512, |
|
"learning_rate": 7.076923076923078e-06, |
|
"loss": 0.1422, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 3.3076923076923075, |
|
"grad_norm": 0.23161759972572327, |
|
"learning_rate": 6.76923076923077e-06, |
|
"loss": 0.0768, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 3.3846153846153846, |
|
"grad_norm": 5.164632797241211, |
|
"learning_rate": 6.461538461538463e-06, |
|
"loss": 0.1136, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 3.4615384615384617, |
|
"grad_norm": 12.590032577514648, |
|
"learning_rate": 6.153846153846155e-06, |
|
"loss": 0.0854, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 3.5384615384615383, |
|
"grad_norm": 4.483827590942383, |
|
"learning_rate": 5.846153846153847e-06, |
|
"loss": 0.098, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 3.6153846153846154, |
|
"grad_norm": 7.863719940185547, |
|
"learning_rate": 5.538461538461539e-06, |
|
"loss": 0.0778, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 3.6923076923076925, |
|
"grad_norm": 0.21892791986465454, |
|
"learning_rate": 5.230769230769232e-06, |
|
"loss": 0.0957, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 3.769230769230769, |
|
"grad_norm": 0.2107536494731903, |
|
"learning_rate": 4.923076923076924e-06, |
|
"loss": 0.0775, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 3.8461538461538463, |
|
"grad_norm": 0.9603075981140137, |
|
"learning_rate": 4.615384615384616e-06, |
|
"loss": 0.1357, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 3.9230769230769234, |
|
"grad_norm": 0.21214883029460907, |
|
"learning_rate": 4.307692307692308e-06, |
|
"loss": 0.1225, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 0.2996317744255066, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 0.0833, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.9924812030075187, |
|
"eval_loss": 0.06558172404766083, |
|
"eval_runtime": 1.0412, |
|
"eval_samples_per_second": 127.735, |
|
"eval_steps_per_second": 16.327, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 4.076923076923077, |
|
"grad_norm": 9.374862670898438, |
|
"learning_rate": 3.692307692307693e-06, |
|
"loss": 0.1009, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 4.153846153846154, |
|
"grad_norm": 1.5623271465301514, |
|
"learning_rate": 3.384615384615385e-06, |
|
"loss": 0.1658, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 4.230769230769231, |
|
"grad_norm": 0.3211674094200134, |
|
"learning_rate": 3.0769230769230774e-06, |
|
"loss": 0.1534, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 4.3076923076923075, |
|
"grad_norm": 1.208674430847168, |
|
"learning_rate": 2.7692307692307697e-06, |
|
"loss": 0.0504, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 4.384615384615385, |
|
"grad_norm": 0.3158506751060486, |
|
"learning_rate": 2.461538461538462e-06, |
|
"loss": 0.1345, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 4.461538461538462, |
|
"grad_norm": 0.2690170705318451, |
|
"learning_rate": 2.153846153846154e-06, |
|
"loss": 0.0816, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 4.538461538461538, |
|
"grad_norm": 9.395990371704102, |
|
"learning_rate": 1.8461538461538465e-06, |
|
"loss": 0.0714, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 4.615384615384615, |
|
"grad_norm": 3.067613124847412, |
|
"learning_rate": 1.5384615384615387e-06, |
|
"loss": 0.0892, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 4.6923076923076925, |
|
"grad_norm": 5.167181491851807, |
|
"learning_rate": 1.230769230769231e-06, |
|
"loss": 0.087, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 4.769230769230769, |
|
"grad_norm": 0.2610948383808136, |
|
"learning_rate": 9.230769230769232e-07, |
|
"loss": 0.0807, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 4.846153846153846, |
|
"grad_norm": 0.22187209129333496, |
|
"learning_rate": 6.153846153846155e-07, |
|
"loss": 0.0668, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 4.923076923076923, |
|
"grad_norm": 0.23086942732334137, |
|
"learning_rate": 3.0769230769230774e-07, |
|
"loss": 0.0784, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 0.6445406675338745, |
|
"learning_rate": 0.0, |
|
"loss": 0.1107, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.9774436090225563, |
|
"eval_loss": 0.0816693976521492, |
|
"eval_runtime": 1.9096, |
|
"eval_samples_per_second": 69.648, |
|
"eval_steps_per_second": 8.902, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 10.153846153846153, |
|
"grad_norm": 2.079591751098633, |
|
"learning_rate": 1.593846153846154e-05, |
|
"loss": 0.0956, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 10.307692307692308, |
|
"grad_norm": 0.22365273535251617, |
|
"learning_rate": 1.587692307692308e-05, |
|
"loss": 0.0704, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 10.461538461538462, |
|
"grad_norm": 1.2620346546173096, |
|
"learning_rate": 1.5815384615384616e-05, |
|
"loss": 0.0927, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 10.615384615384615, |
|
"grad_norm": 0.17169518768787384, |
|
"learning_rate": 1.5753846153846154e-05, |
|
"loss": 0.1318, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 10.76923076923077, |
|
"grad_norm": 1.01598060131073, |
|
"learning_rate": 1.5692307692307693e-05, |
|
"loss": 0.1102, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 10.923076923076923, |
|
"grad_norm": 1.3003897666931152, |
|
"learning_rate": 1.5630769230769232e-05, |
|
"loss": 0.098, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_accuracy": 0.9924812030075187, |
|
"eval_loss": 0.056982800364494324, |
|
"eval_runtime": 0.6404, |
|
"eval_samples_per_second": 207.697, |
|
"eval_steps_per_second": 14.055, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 11.076923076923077, |
|
"grad_norm": 2.616173505783081, |
|
"learning_rate": 1.556923076923077e-05, |
|
"loss": 0.1061, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 11.23076923076923, |
|
"grad_norm": 0.20168966054916382, |
|
"learning_rate": 1.550769230769231e-05, |
|
"loss": 0.1275, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 11.384615384615385, |
|
"grad_norm": 2.059192419052124, |
|
"learning_rate": 1.544615384615385e-05, |
|
"loss": 0.0463, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 11.538461538461538, |
|
"grad_norm": 0.1697787493467331, |
|
"learning_rate": 1.5384615384615387e-05, |
|
"loss": 0.0759, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 11.692307692307692, |
|
"grad_norm": 0.8658211827278137, |
|
"learning_rate": 1.5323076923076926e-05, |
|
"loss": 0.1983, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 11.846153846153847, |
|
"grad_norm": 0.4407813251018524, |
|
"learning_rate": 1.5261538461538465e-05, |
|
"loss": 0.1181, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"grad_norm": 6.601933479309082, |
|
"learning_rate": 1.5200000000000002e-05, |
|
"loss": 0.0935, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_accuracy": 1.0, |
|
"eval_loss": 0.04177865758538246, |
|
"eval_runtime": 0.6316, |
|
"eval_samples_per_second": 210.569, |
|
"eval_steps_per_second": 14.249, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 12.153846153846153, |
|
"grad_norm": 1.3233654499053955, |
|
"learning_rate": 1.5138461538461539e-05, |
|
"loss": 0.0949, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 12.307692307692308, |
|
"grad_norm": 10.496395111083984, |
|
"learning_rate": 1.5076923076923078e-05, |
|
"loss": 0.1141, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 12.461538461538462, |
|
"grad_norm": 0.40288811922073364, |
|
"learning_rate": 1.5015384615384617e-05, |
|
"loss": 0.0662, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 12.615384615384615, |
|
"grad_norm": 1.3594541549682617, |
|
"learning_rate": 1.4953846153846154e-05, |
|
"loss": 0.0672, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 12.76923076923077, |
|
"grad_norm": 0.15957416594028473, |
|
"learning_rate": 1.4892307692307692e-05, |
|
"loss": 0.1198, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 12.923076923076923, |
|
"grad_norm": 3.9073538780212402, |
|
"learning_rate": 1.4830769230769233e-05, |
|
"loss": 0.0907, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_accuracy": 0.9699248120300752, |
|
"eval_loss": 0.10930211842060089, |
|
"eval_runtime": 0.6314, |
|
"eval_samples_per_second": 210.658, |
|
"eval_steps_per_second": 14.255, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 13.076923076923077, |
|
"grad_norm": 4.819633483886719, |
|
"learning_rate": 1.4769230769230772e-05, |
|
"loss": 0.0678, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 13.23076923076923, |
|
"grad_norm": 0.1750306636095047, |
|
"learning_rate": 1.4707692307692309e-05, |
|
"loss": 0.0498, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 13.384615384615385, |
|
"grad_norm": 0.7398102879524231, |
|
"learning_rate": 1.4646153846153848e-05, |
|
"loss": 0.112, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 13.538461538461538, |
|
"grad_norm": 1.1426323652267456, |
|
"learning_rate": 1.4584615384615386e-05, |
|
"loss": 0.061, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 13.692307692307692, |
|
"grad_norm": 0.16307789087295532, |
|
"learning_rate": 1.4523076923076923e-05, |
|
"loss": 0.0405, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 13.846153846153847, |
|
"grad_norm": 3.81508207321167, |
|
"learning_rate": 1.4461538461538462e-05, |
|
"loss": 0.0768, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"grad_norm": 0.482637494802475, |
|
"learning_rate": 1.4400000000000001e-05, |
|
"loss": 0.0947, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_accuracy": 1.0, |
|
"eval_loss": 0.03473825752735138, |
|
"eval_runtime": 0.6324, |
|
"eval_samples_per_second": 210.312, |
|
"eval_steps_per_second": 14.232, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 14.153846153846153, |
|
"grad_norm": 0.18653550744056702, |
|
"learning_rate": 1.4338461538461538e-05, |
|
"loss": 0.0853, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 14.307692307692308, |
|
"grad_norm": 1.822533130645752, |
|
"learning_rate": 1.4276923076923077e-05, |
|
"loss": 0.1008, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 14.461538461538462, |
|
"grad_norm": 0.32205572724342346, |
|
"learning_rate": 1.4215384615384617e-05, |
|
"loss": 0.0638, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 14.615384615384615, |
|
"grad_norm": 0.5714139342308044, |
|
"learning_rate": 1.4153846153846156e-05, |
|
"loss": 0.1159, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 14.76923076923077, |
|
"grad_norm": 5.901656627655029, |
|
"learning_rate": 1.4092307692307693e-05, |
|
"loss": 0.091, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 14.923076923076923, |
|
"grad_norm": 2.119544744491577, |
|
"learning_rate": 1.4030769230769232e-05, |
|
"loss": 0.1259, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_accuracy": 0.9849624060150376, |
|
"eval_loss": 0.07099475711584091, |
|
"eval_runtime": 0.6277, |
|
"eval_samples_per_second": 211.88, |
|
"eval_steps_per_second": 14.338, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 15.076923076923077, |
|
"grad_norm": 0.14739356935024261, |
|
"learning_rate": 1.3969230769230771e-05, |
|
"loss": 0.082, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 15.23076923076923, |
|
"grad_norm": 0.12192496657371521, |
|
"learning_rate": 1.3907692307692308e-05, |
|
"loss": 0.0558, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 15.384615384615385, |
|
"grad_norm": 0.11841657012701035, |
|
"learning_rate": 1.3846153846153847e-05, |
|
"loss": 0.0852, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 15.538461538461538, |
|
"grad_norm": 0.8062542080879211, |
|
"learning_rate": 1.3784615384615386e-05, |
|
"loss": 0.0793, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 15.692307692307692, |
|
"grad_norm": 0.11781653016805649, |
|
"learning_rate": 1.3723076923076923e-05, |
|
"loss": 0.0436, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 15.846153846153847, |
|
"grad_norm": 0.4233933389186859, |
|
"learning_rate": 1.3661538461538461e-05, |
|
"loss": 0.0448, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"grad_norm": 0.3999457061290741, |
|
"learning_rate": 1.3600000000000002e-05, |
|
"loss": 0.0325, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_accuracy": 0.9774436090225563, |
|
"eval_loss": 0.05867745727300644, |
|
"eval_runtime": 0.6418, |
|
"eval_samples_per_second": 207.244, |
|
"eval_steps_per_second": 14.024, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 16.153846153846153, |
|
"grad_norm": 0.36126333475112915, |
|
"learning_rate": 1.353846153846154e-05, |
|
"loss": 0.0373, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 16.307692307692307, |
|
"grad_norm": 2.5870630741119385, |
|
"learning_rate": 1.3476923076923078e-05, |
|
"loss": 0.0746, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 16.46153846153846, |
|
"grad_norm": 10.347090721130371, |
|
"learning_rate": 1.3415384615384617e-05, |
|
"loss": 0.047, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 16.615384615384617, |
|
"grad_norm": 0.5827314257621765, |
|
"learning_rate": 1.3353846153846155e-05, |
|
"loss": 0.0832, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 16.76923076923077, |
|
"grad_norm": 0.1550850123167038, |
|
"learning_rate": 1.3292307692307692e-05, |
|
"loss": 0.1211, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 16.923076923076923, |
|
"grad_norm": 0.8602961897850037, |
|
"learning_rate": 1.3230769230769231e-05, |
|
"loss": 0.1397, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"eval_accuracy": 0.9924812030075187, |
|
"eval_loss": 0.049453094601631165, |
|
"eval_runtime": 0.6291, |
|
"eval_samples_per_second": 211.407, |
|
"eval_steps_per_second": 14.306, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 17.076923076923077, |
|
"grad_norm": 1.7802873849868774, |
|
"learning_rate": 1.316923076923077e-05, |
|
"loss": 0.0908, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 17.23076923076923, |
|
"grad_norm": 0.10594117641448975, |
|
"learning_rate": 1.3107692307692307e-05, |
|
"loss": 0.0698, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 17.384615384615383, |
|
"grad_norm": 0.21712522208690643, |
|
"learning_rate": 1.3046153846153846e-05, |
|
"loss": 0.1116, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 17.53846153846154, |
|
"grad_norm": 6.615525245666504, |
|
"learning_rate": 1.2984615384615386e-05, |
|
"loss": 0.0767, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 17.692307692307693, |
|
"grad_norm": 0.10564947128295898, |
|
"learning_rate": 1.2923076923076925e-05, |
|
"loss": 0.0522, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 17.846153846153847, |
|
"grad_norm": 1.2303924560546875, |
|
"learning_rate": 1.2861538461538462e-05, |
|
"loss": 0.0558, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"grad_norm": 0.10486020892858505, |
|
"learning_rate": 1.2800000000000001e-05, |
|
"loss": 0.0456, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_accuracy": 0.9774436090225563, |
|
"eval_loss": 0.051864467561244965, |
|
"eval_runtime": 0.6367, |
|
"eval_samples_per_second": 208.899, |
|
"eval_steps_per_second": 14.136, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 18.153846153846153, |
|
"grad_norm": 0.09572620689868927, |
|
"learning_rate": 1.273846153846154e-05, |
|
"loss": 0.0446, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 18.307692307692307, |
|
"grad_norm": 0.12069143354892731, |
|
"learning_rate": 1.2676923076923077e-05, |
|
"loss": 0.0612, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 18.46153846153846, |
|
"grad_norm": 3.15175724029541, |
|
"learning_rate": 1.2615384615384616e-05, |
|
"loss": 0.1213, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 18.615384615384617, |
|
"grad_norm": 0.6527738571166992, |
|
"learning_rate": 1.2553846153846155e-05, |
|
"loss": 0.1027, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 18.76923076923077, |
|
"grad_norm": 0.6189332604408264, |
|
"learning_rate": 1.2492307692307692e-05, |
|
"loss": 0.0262, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 18.923076923076923, |
|
"grad_norm": 0.1209394633769989, |
|
"learning_rate": 1.243076923076923e-05, |
|
"loss": 0.0439, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"eval_accuracy": 1.0, |
|
"eval_loss": 0.021639494225382805, |
|
"eval_runtime": 0.6265, |
|
"eval_samples_per_second": 212.296, |
|
"eval_steps_per_second": 14.366, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 19.076923076923077, |
|
"grad_norm": 6.580589771270752, |
|
"learning_rate": 1.2369230769230771e-05, |
|
"loss": 0.0469, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 19.23076923076923, |
|
"grad_norm": 8.734590530395508, |
|
"learning_rate": 1.230769230769231e-05, |
|
"loss": 0.0297, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 19.384615384615383, |
|
"grad_norm": 1.7616609334945679, |
|
"learning_rate": 1.2246153846153847e-05, |
|
"loss": 0.1147, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 19.53846153846154, |
|
"grad_norm": 0.3767222464084625, |
|
"learning_rate": 1.2184615384615386e-05, |
|
"loss": 0.0499, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 19.692307692307693, |
|
"grad_norm": 0.1071651503443718, |
|
"learning_rate": 1.2123076923076924e-05, |
|
"loss": 0.0623, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 19.846153846153847, |
|
"grad_norm": 4.712902545928955, |
|
"learning_rate": 1.2061538461538462e-05, |
|
"loss": 0.0365, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"grad_norm": 0.08926769345998764, |
|
"learning_rate": 1.2e-05, |
|
"loss": 0.0484, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_accuracy": 0.9924812030075187, |
|
"eval_loss": 0.03160810098052025, |
|
"eval_runtime": 0.6412, |
|
"eval_samples_per_second": 207.432, |
|
"eval_steps_per_second": 14.037, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 20.153846153846153, |
|
"grad_norm": 1.210523009300232, |
|
"learning_rate": 1.1938461538461539e-05, |
|
"loss": 0.0789, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 20.307692307692307, |
|
"grad_norm": 0.21540193259716034, |
|
"learning_rate": 1.1876923076923076e-05, |
|
"loss": 0.0214, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 20.46153846153846, |
|
"grad_norm": 0.08566620200872421, |
|
"learning_rate": 1.1815384615384617e-05, |
|
"loss": 0.0308, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 20.615384615384617, |
|
"grad_norm": 1.3403387069702148, |
|
"learning_rate": 1.1753846153846155e-05, |
|
"loss": 0.0656, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 20.76923076923077, |
|
"grad_norm": 1.3895039558410645, |
|
"learning_rate": 1.1692307692307694e-05, |
|
"loss": 0.0651, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 20.923076923076923, |
|
"grad_norm": 2.5756139755249023, |
|
"learning_rate": 1.1630769230769231e-05, |
|
"loss": 0.0276, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 21.0, |
|
"eval_accuracy": 1.0, |
|
"eval_loss": 0.019228629767894745, |
|
"eval_runtime": 0.6415, |
|
"eval_samples_per_second": 207.316, |
|
"eval_steps_per_second": 14.029, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 21.076923076923077, |
|
"grad_norm": 0.11552825570106506, |
|
"learning_rate": 1.156923076923077e-05, |
|
"loss": 0.0494, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 21.23076923076923, |
|
"grad_norm": 0.08301220834255219, |
|
"learning_rate": 1.1507692307692309e-05, |
|
"loss": 0.1079, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 21.384615384615383, |
|
"grad_norm": 0.08622205257415771, |
|
"learning_rate": 1.1446153846153846e-05, |
|
"loss": 0.0397, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 21.53846153846154, |
|
"grad_norm": 1.3071388006210327, |
|
"learning_rate": 1.1384615384615385e-05, |
|
"loss": 0.1094, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 21.692307692307693, |
|
"grad_norm": 17.1097469329834, |
|
"learning_rate": 1.1323076923076924e-05, |
|
"loss": 0.0298, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 21.846153846153847, |
|
"grad_norm": 0.08082272112369537, |
|
"learning_rate": 1.126153846153846e-05, |
|
"loss": 0.0196, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 22.0, |
|
"grad_norm": 0.08441055566072464, |
|
"learning_rate": 1.1200000000000001e-05, |
|
"loss": 0.0348, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 22.0, |
|
"eval_accuracy": 1.0, |
|
"eval_loss": 0.0177127867937088, |
|
"eval_runtime": 0.5805, |
|
"eval_samples_per_second": 229.097, |
|
"eval_steps_per_second": 15.503, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 22.153846153846153, |
|
"grad_norm": 0.6167936325073242, |
|
"learning_rate": 1.113846153846154e-05, |
|
"loss": 0.0675, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 22.307692307692307, |
|
"grad_norm": 6.9912638664245605, |
|
"learning_rate": 1.1076923076923079e-05, |
|
"loss": 0.0644, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 22.46153846153846, |
|
"grad_norm": 0.07652874290943146, |
|
"learning_rate": 1.1015384615384616e-05, |
|
"loss": 0.0572, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 22.615384615384617, |
|
"grad_norm": 0.08351747691631317, |
|
"learning_rate": 1.0953846153846155e-05, |
|
"loss": 0.051, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 22.76923076923077, |
|
"grad_norm": 0.11051066219806671, |
|
"learning_rate": 1.0892307692307693e-05, |
|
"loss": 0.032, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 22.923076923076923, |
|
"grad_norm": 10.532815933227539, |
|
"learning_rate": 1.083076923076923e-05, |
|
"loss": 0.0326, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 23.0, |
|
"eval_accuracy": 1.0, |
|
"eval_loss": 0.01754232682287693, |
|
"eval_runtime": 0.6431, |
|
"eval_samples_per_second": 206.82, |
|
"eval_steps_per_second": 13.995, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 23.076923076923077, |
|
"grad_norm": 0.08222197741270065, |
|
"learning_rate": 1.076923076923077e-05, |
|
"loss": 0.0462, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 23.23076923076923, |
|
"grad_norm": 0.0753277987241745, |
|
"learning_rate": 1.0707692307692308e-05, |
|
"loss": 0.0516, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 23.384615384615383, |
|
"grad_norm": 0.9446002840995789, |
|
"learning_rate": 1.0646153846153845e-05, |
|
"loss": 0.0214, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 23.53846153846154, |
|
"grad_norm": 0.0864008441567421, |
|
"learning_rate": 1.0584615384615386e-05, |
|
"loss": 0.0185, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 23.692307692307693, |
|
"grad_norm": 7.382317543029785, |
|
"learning_rate": 1.0523076923076924e-05, |
|
"loss": 0.0674, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 23.846153846153847, |
|
"grad_norm": 0.18953648209571838, |
|
"learning_rate": 1.0461538461538463e-05, |
|
"loss": 0.02, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"grad_norm": 2.4876315593719482, |
|
"learning_rate": 1.04e-05, |
|
"loss": 0.1014, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"eval_accuracy": 0.9924812030075187, |
|
"eval_loss": 0.02354033850133419, |
|
"eval_runtime": 0.6442, |
|
"eval_samples_per_second": 206.452, |
|
"eval_steps_per_second": 13.97, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 24.153846153846153, |
|
"grad_norm": 0.08115736395120621, |
|
"learning_rate": 1.033846153846154e-05, |
|
"loss": 0.014, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 24.307692307692307, |
|
"grad_norm": 0.06984913349151611, |
|
"learning_rate": 1.0276923076923078e-05, |
|
"loss": 0.0328, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 24.46153846153846, |
|
"grad_norm": 7.471628665924072, |
|
"learning_rate": 1.0215384615384615e-05, |
|
"loss": 0.0321, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 24.615384615384617, |
|
"grad_norm": 0.0844321921467781, |
|
"learning_rate": 1.0153846153846154e-05, |
|
"loss": 0.051, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 24.76923076923077, |
|
"grad_norm": 0.0683414489030838, |
|
"learning_rate": 1.0092307692307693e-05, |
|
"loss": 0.0428, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 24.923076923076923, |
|
"grad_norm": 0.07370961457490921, |
|
"learning_rate": 1.0030769230769231e-05, |
|
"loss": 0.0395, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 25.0, |
|
"eval_accuracy": 0.9849624060150376, |
|
"eval_loss": 0.04511820524930954, |
|
"eval_runtime": 0.6443, |
|
"eval_samples_per_second": 206.419, |
|
"eval_steps_per_second": 13.968, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 25.076923076923077, |
|
"grad_norm": 0.7060135006904602, |
|
"learning_rate": 9.96923076923077e-06, |
|
"loss": 0.0197, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 25.23076923076923, |
|
"grad_norm": 0.5647442936897278, |
|
"learning_rate": 9.907692307692309e-06, |
|
"loss": 0.0636, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 25.384615384615383, |
|
"grad_norm": 0.06951487809419632, |
|
"learning_rate": 9.846153846153848e-06, |
|
"loss": 0.0338, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 25.53846153846154, |
|
"grad_norm": 0.9578651785850525, |
|
"learning_rate": 9.784615384615387e-06, |
|
"loss": 0.049, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 25.692307692307693, |
|
"grad_norm": 0.1774640828371048, |
|
"learning_rate": 9.723076923076924e-06, |
|
"loss": 0.0135, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 25.846153846153847, |
|
"grad_norm": 0.06652193516492844, |
|
"learning_rate": 9.661538461538462e-06, |
|
"loss": 0.046, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 26.0, |
|
"grad_norm": 0.06518968194723129, |
|
"learning_rate": 9.600000000000001e-06, |
|
"loss": 0.0265, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 26.0, |
|
"eval_accuracy": 0.9924812030075187, |
|
"eval_loss": 0.0296646561473608, |
|
"eval_runtime": 0.5911, |
|
"eval_samples_per_second": 225.018, |
|
"eval_steps_per_second": 15.227, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 26.153846153846153, |
|
"grad_norm": 0.07430905103683472, |
|
"learning_rate": 9.53846153846154e-06, |
|
"loss": 0.0326, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 26.307692307692307, |
|
"grad_norm": 0.07437779009342194, |
|
"learning_rate": 9.476923076923079e-06, |
|
"loss": 0.0205, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 26.46153846153846, |
|
"grad_norm": 5.995608329772949, |
|
"learning_rate": 9.415384615384616e-06, |
|
"loss": 0.0725, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 26.615384615384617, |
|
"grad_norm": 0.09473489224910736, |
|
"learning_rate": 9.353846153846155e-06, |
|
"loss": 0.0142, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 26.76923076923077, |
|
"grad_norm": 0.05937571823596954, |
|
"learning_rate": 9.292307692307694e-06, |
|
"loss": 0.0212, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 26.923076923076923, |
|
"grad_norm": 19.040122985839844, |
|
"learning_rate": 9.230769230769232e-06, |
|
"loss": 0.0569, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 27.0, |
|
"eval_accuracy": 0.9924812030075187, |
|
"eval_loss": 0.026294343173503876, |
|
"eval_runtime": 0.642, |
|
"eval_samples_per_second": 207.155, |
|
"eval_steps_per_second": 14.018, |
|
"step": 1755 |
|
}, |
|
{ |
|
"epoch": 27.076923076923077, |
|
"grad_norm": 0.5836367011070251, |
|
"learning_rate": 9.169230769230771e-06, |
|
"loss": 0.1035, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 27.23076923076923, |
|
"grad_norm": 0.06087055802345276, |
|
"learning_rate": 9.107692307692308e-06, |
|
"loss": 0.0518, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 27.384615384615383, |
|
"grad_norm": 0.06178651750087738, |
|
"learning_rate": 9.046153846153847e-06, |
|
"loss": 0.0477, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 27.53846153846154, |
|
"grad_norm": 0.0936984047293663, |
|
"learning_rate": 8.984615384615386e-06, |
|
"loss": 0.012, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 27.692307692307693, |
|
"grad_norm": 0.056670334190130234, |
|
"learning_rate": 8.923076923076925e-06, |
|
"loss": 0.0363, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 27.846153846153847, |
|
"grad_norm": 0.13299456238746643, |
|
"learning_rate": 8.861538461538463e-06, |
|
"loss": 0.013, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 28.0, |
|
"grad_norm": 0.0713280737400055, |
|
"learning_rate": 8.8e-06, |
|
"loss": 0.0666, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 28.0, |
|
"eval_accuracy": 0.9849624060150376, |
|
"eval_loss": 0.02451479434967041, |
|
"eval_runtime": 0.6311, |
|
"eval_samples_per_second": 210.727, |
|
"eval_steps_per_second": 14.26, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 28.153846153846153, |
|
"grad_norm": 0.08556320518255234, |
|
"learning_rate": 8.73846153846154e-06, |
|
"loss": 0.0119, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 28.307692307692307, |
|
"grad_norm": 0.05891846865415573, |
|
"learning_rate": 8.676923076923078e-06, |
|
"loss": 0.0456, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 28.46153846153846, |
|
"grad_norm": 0.05940578132867813, |
|
"learning_rate": 8.615384615384617e-06, |
|
"loss": 0.0541, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 28.615384615384617, |
|
"grad_norm": 0.06775704026222229, |
|
"learning_rate": 8.553846153846156e-06, |
|
"loss": 0.0162, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 28.76923076923077, |
|
"grad_norm": 0.058256104588508606, |
|
"learning_rate": 8.492307692307693e-06, |
|
"loss": 0.0354, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 28.923076923076923, |
|
"grad_norm": 1.286210060119629, |
|
"learning_rate": 8.430769230769231e-06, |
|
"loss": 0.0285, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 29.0, |
|
"eval_accuracy": 0.9774436090225563, |
|
"eval_loss": 0.041793130338191986, |
|
"eval_runtime": 0.6391, |
|
"eval_samples_per_second": 208.111, |
|
"eval_steps_per_second": 14.083, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 29.076923076923077, |
|
"grad_norm": 2.135648727416992, |
|
"learning_rate": 8.36923076923077e-06, |
|
"loss": 0.0197, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 29.23076923076923, |
|
"grad_norm": 1.5485310554504395, |
|
"learning_rate": 8.307692307692309e-06, |
|
"loss": 0.0129, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 29.384615384615383, |
|
"grad_norm": 1.2594960927963257, |
|
"learning_rate": 8.246153846153848e-06, |
|
"loss": 0.0964, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 29.53846153846154, |
|
"grad_norm": 0.13048137724399567, |
|
"learning_rate": 8.184615384615385e-06, |
|
"loss": 0.0111, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 29.692307692307693, |
|
"grad_norm": 0.38255247473716736, |
|
"learning_rate": 8.123076923076924e-06, |
|
"loss": 0.0292, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 29.846153846153847, |
|
"grad_norm": 2.1401822566986084, |
|
"learning_rate": 8.061538461538463e-06, |
|
"loss": 0.0417, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 30.0, |
|
"grad_norm": 0.2889564633369446, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 0.0892, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 30.0, |
|
"eval_accuracy": 0.9924812030075187, |
|
"eval_loss": 0.020448315888643265, |
|
"eval_runtime": 0.5776, |
|
"eval_samples_per_second": 230.273, |
|
"eval_steps_per_second": 15.582, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 30.153846153846153, |
|
"grad_norm": 3.4330976009368896, |
|
"learning_rate": 7.93846153846154e-06, |
|
"loss": 0.0466, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 30.307692307692307, |
|
"grad_norm": 0.05678678676486015, |
|
"learning_rate": 7.876923076923077e-06, |
|
"loss": 0.0701, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 30.46153846153846, |
|
"grad_norm": 0.05310628563165665, |
|
"learning_rate": 7.815384615384616e-06, |
|
"loss": 0.0118, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 30.615384615384617, |
|
"grad_norm": 0.07134439796209335, |
|
"learning_rate": 7.753846153846155e-06, |
|
"loss": 0.0412, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 30.76923076923077, |
|
"grad_norm": 0.06206020340323448, |
|
"learning_rate": 7.692307692307694e-06, |
|
"loss": 0.0254, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 30.923076923076923, |
|
"grad_norm": 0.0549427792429924, |
|
"learning_rate": 7.630769230769232e-06, |
|
"loss": 0.0371, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 31.0, |
|
"eval_accuracy": 0.9849624060150376, |
|
"eval_loss": 0.03390338271856308, |
|
"eval_runtime": 0.6391, |
|
"eval_samples_per_second": 208.12, |
|
"eval_steps_per_second": 14.083, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 31.076923076923077, |
|
"grad_norm": 0.05257405340671539, |
|
"learning_rate": 7.5692307692307695e-06, |
|
"loss": 0.0119, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 31.23076923076923, |
|
"grad_norm": 1.0072262287139893, |
|
"learning_rate": 7.507692307692308e-06, |
|
"loss": 0.0131, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 31.384615384615383, |
|
"grad_norm": 0.054051704704761505, |
|
"learning_rate": 7.446153846153846e-06, |
|
"loss": 0.0309, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 31.53846153846154, |
|
"grad_norm": 0.059842586517333984, |
|
"learning_rate": 7.384615384615386e-06, |
|
"loss": 0.0699, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 31.692307692307693, |
|
"grad_norm": 0.4310505986213684, |
|
"learning_rate": 7.323076923076924e-06, |
|
"loss": 0.0105, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 31.846153846153847, |
|
"grad_norm": 0.05525004491209984, |
|
"learning_rate": 7.261538461538462e-06, |
|
"loss": 0.0654, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 32.0, |
|
"grad_norm": 0.06051107123494148, |
|
"learning_rate": 7.2000000000000005e-06, |
|
"loss": 0.0105, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 32.0, |
|
"eval_accuracy": 1.0, |
|
"eval_loss": 0.01434730738401413, |
|
"eval_runtime": 0.5855, |
|
"eval_samples_per_second": 227.144, |
|
"eval_steps_per_second": 15.371, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 32.15384615384615, |
|
"grad_norm": 0.04868883639574051, |
|
"learning_rate": 7.1384615384615385e-06, |
|
"loss": 0.032, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 32.30769230769231, |
|
"grad_norm": 0.10859151929616928, |
|
"learning_rate": 7.076923076923078e-06, |
|
"loss": 0.084, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 32.46153846153846, |
|
"grad_norm": 0.05709298700094223, |
|
"learning_rate": 7.015384615384616e-06, |
|
"loss": 0.0189, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 32.61538461538461, |
|
"grad_norm": 0.08583523333072662, |
|
"learning_rate": 6.953846153846154e-06, |
|
"loss": 0.0124, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 32.76923076923077, |
|
"grad_norm": 0.7491576671600342, |
|
"learning_rate": 6.892307692307693e-06, |
|
"loss": 0.0213, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 32.92307692307692, |
|
"grad_norm": 0.3305934965610504, |
|
"learning_rate": 6.830769230769231e-06, |
|
"loss": 0.0563, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 33.0, |
|
"eval_accuracy": 1.0, |
|
"eval_loss": 0.014035705476999283, |
|
"eval_runtime": 0.6445, |
|
"eval_samples_per_second": 206.373, |
|
"eval_steps_per_second": 13.965, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 33.07692307692308, |
|
"grad_norm": 0.049748744815588, |
|
"learning_rate": 6.76923076923077e-06, |
|
"loss": 0.0104, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 33.23076923076923, |
|
"grad_norm": 0.05033630132675171, |
|
"learning_rate": 6.707692307692308e-06, |
|
"loss": 0.0548, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 33.38461538461539, |
|
"grad_norm": 0.054612692445516586, |
|
"learning_rate": 6.646153846153846e-06, |
|
"loss": 0.0356, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 33.53846153846154, |
|
"grad_norm": 0.05341866612434387, |
|
"learning_rate": 6.584615384615385e-06, |
|
"loss": 0.0133, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 33.69230769230769, |
|
"grad_norm": 0.04895515367388725, |
|
"learning_rate": 6.523076923076923e-06, |
|
"loss": 0.0213, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 33.84615384615385, |
|
"grad_norm": 0.063107430934906, |
|
"learning_rate": 6.461538461538463e-06, |
|
"loss": 0.0112, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 34.0, |
|
"grad_norm": 7.929271221160889, |
|
"learning_rate": 6.4000000000000006e-06, |
|
"loss": 0.0573, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 34.0, |
|
"eval_accuracy": 1.0, |
|
"eval_loss": 0.010156131349503994, |
|
"eval_runtime": 0.6308, |
|
"eval_samples_per_second": 210.855, |
|
"eval_steps_per_second": 14.268, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 34.15384615384615, |
|
"grad_norm": 0.11311448365449905, |
|
"learning_rate": 6.3384615384615385e-06, |
|
"loss": 0.0271, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 34.30769230769231, |
|
"grad_norm": 0.17935284972190857, |
|
"learning_rate": 6.276923076923077e-06, |
|
"loss": 0.0444, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 34.46153846153846, |
|
"grad_norm": 0.07903819531202316, |
|
"learning_rate": 6.215384615384615e-06, |
|
"loss": 0.039, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 34.61538461538461, |
|
"grad_norm": 0.07042822986841202, |
|
"learning_rate": 6.153846153846155e-06, |
|
"loss": 0.0617, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 34.76923076923077, |
|
"grad_norm": 0.05035420507192612, |
|
"learning_rate": 6.092307692307693e-06, |
|
"loss": 0.0505, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 34.92307692307692, |
|
"grad_norm": 0.04776820167899132, |
|
"learning_rate": 6.030769230769231e-06, |
|
"loss": 0.0409, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 35.0, |
|
"eval_accuracy": 1.0, |
|
"eval_loss": 0.009572061710059643, |
|
"eval_runtime": 0.6399, |
|
"eval_samples_per_second": 207.852, |
|
"eval_steps_per_second": 14.065, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 35.07692307692308, |
|
"grad_norm": 20.547210693359375, |
|
"learning_rate": 5.9692307692307695e-06, |
|
"loss": 0.0212, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 35.23076923076923, |
|
"grad_norm": 16.86725616455078, |
|
"learning_rate": 5.907692307692308e-06, |
|
"loss": 0.0421, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 35.38461538461539, |
|
"grad_norm": 1.5179917812347412, |
|
"learning_rate": 5.846153846153847e-06, |
|
"loss": 0.1226, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 35.53846153846154, |
|
"grad_norm": 0.045909151434898376, |
|
"learning_rate": 5.784615384615385e-06, |
|
"loss": 0.0092, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 35.69230769230769, |
|
"grad_norm": 0.04946780204772949, |
|
"learning_rate": 5.723076923076923e-06, |
|
"loss": 0.0505, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 35.84615384615385, |
|
"grad_norm": 0.055896684527397156, |
|
"learning_rate": 5.661538461538462e-06, |
|
"loss": 0.0159, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 36.0, |
|
"grad_norm": 0.04570392891764641, |
|
"learning_rate": 5.600000000000001e-06, |
|
"loss": 0.0523, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 36.0, |
|
"eval_accuracy": 0.9924812030075187, |
|
"eval_loss": 0.01487450860440731, |
|
"eval_runtime": 0.6368, |
|
"eval_samples_per_second": 208.852, |
|
"eval_steps_per_second": 14.133, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 36.15384615384615, |
|
"grad_norm": 0.06980939954519272, |
|
"learning_rate": 5.538461538461539e-06, |
|
"loss": 0.0219, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 36.30769230769231, |
|
"grad_norm": 0.05014393478631973, |
|
"learning_rate": 5.476923076923077e-06, |
|
"loss": 0.0352, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 36.46153846153846, |
|
"grad_norm": 0.04635272175073624, |
|
"learning_rate": 5.415384615384615e-06, |
|
"loss": 0.0328, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 36.61538461538461, |
|
"grad_norm": 0.04787183925509453, |
|
"learning_rate": 5.353846153846154e-06, |
|
"loss": 0.0106, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 36.76923076923077, |
|
"grad_norm": 0.05444607138633728, |
|
"learning_rate": 5.292307692307693e-06, |
|
"loss": 0.0443, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 36.92307692307692, |
|
"grad_norm": 0.046256501227617264, |
|
"learning_rate": 5.230769230769232e-06, |
|
"loss": 0.0131, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 37.0, |
|
"eval_accuracy": 0.9924812030075187, |
|
"eval_loss": 0.0196556244045496, |
|
"eval_runtime": 0.641, |
|
"eval_samples_per_second": 207.498, |
|
"eval_steps_per_second": 14.041, |
|
"step": 2405 |
|
}, |
|
{ |
|
"epoch": 37.07692307692308, |
|
"grad_norm": 0.045623164623975754, |
|
"learning_rate": 5.16923076923077e-06, |
|
"loss": 0.0112, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 37.23076923076923, |
|
"grad_norm": 0.04630188271403313, |
|
"learning_rate": 5.1076923076923075e-06, |
|
"loss": 0.0129, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 37.38461538461539, |
|
"grad_norm": 1.0303974151611328, |
|
"learning_rate": 5.046153846153846e-06, |
|
"loss": 0.0523, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 37.53846153846154, |
|
"grad_norm": 0.8218058347702026, |
|
"learning_rate": 4.984615384615385e-06, |
|
"loss": 0.0532, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 37.69230769230769, |
|
"grad_norm": 1.5203208923339844, |
|
"learning_rate": 4.923076923076924e-06, |
|
"loss": 0.0345, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 37.84615384615385, |
|
"grad_norm": 0.05802327021956444, |
|
"learning_rate": 4.861538461538462e-06, |
|
"loss": 0.0251, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 38.0, |
|
"grad_norm": 0.06536999344825745, |
|
"learning_rate": 4.800000000000001e-06, |
|
"loss": 0.0329, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 38.0, |
|
"eval_accuracy": 1.0, |
|
"eval_loss": 0.010934116318821907, |
|
"eval_runtime": 0.6397, |
|
"eval_samples_per_second": 207.921, |
|
"eval_steps_per_second": 14.07, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 38.15384615384615, |
|
"grad_norm": 0.07708264887332916, |
|
"learning_rate": 4.738461538461539e-06, |
|
"loss": 0.0339, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 38.30769230769231, |
|
"grad_norm": 0.05018337070941925, |
|
"learning_rate": 4.676923076923077e-06, |
|
"loss": 0.0371, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 38.46153846153846, |
|
"grad_norm": 2.005122423171997, |
|
"learning_rate": 4.615384615384616e-06, |
|
"loss": 0.0493, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 38.61538461538461, |
|
"grad_norm": 0.04191539064049721, |
|
"learning_rate": 4.553846153846154e-06, |
|
"loss": 0.056, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 38.76923076923077, |
|
"grad_norm": 0.08912540227174759, |
|
"learning_rate": 4.492307692307693e-06, |
|
"loss": 0.0675, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 38.92307692307692, |
|
"grad_norm": 4.123304843902588, |
|
"learning_rate": 4.430769230769232e-06, |
|
"loss": 0.0577, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 39.0, |
|
"eval_accuracy": 1.0, |
|
"eval_loss": 0.00963473692536354, |
|
"eval_runtime": 0.6269, |
|
"eval_samples_per_second": 212.155, |
|
"eval_steps_per_second": 14.356, |
|
"step": 2535 |
|
}, |
|
{ |
|
"epoch": 39.07692307692308, |
|
"grad_norm": 0.12956681847572327, |
|
"learning_rate": 4.36923076923077e-06, |
|
"loss": 0.0348, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 39.23076923076923, |
|
"grad_norm": 0.0469551756978035, |
|
"learning_rate": 4.307692307692308e-06, |
|
"loss": 0.047, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 39.38461538461539, |
|
"grad_norm": 0.0566897876560688, |
|
"learning_rate": 4.246153846153846e-06, |
|
"loss": 0.0305, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 39.53846153846154, |
|
"grad_norm": 0.04538924619555473, |
|
"learning_rate": 4.184615384615385e-06, |
|
"loss": 0.0083, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 39.69230769230769, |
|
"grad_norm": 0.1393657773733139, |
|
"learning_rate": 4.123076923076924e-06, |
|
"loss": 0.0087, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 39.84615384615385, |
|
"grad_norm": 0.04170211777091026, |
|
"learning_rate": 4.061538461538462e-06, |
|
"loss": 0.008, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"grad_norm": 0.04205217584967613, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 0.0085, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"eval_accuracy": 0.9924812030075187, |
|
"eval_loss": 0.014666187576949596, |
|
"eval_runtime": 0.5786, |
|
"eval_samples_per_second": 229.849, |
|
"eval_steps_per_second": 15.554, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 40.15384615384615, |
|
"grad_norm": 0.04466895014047623, |
|
"learning_rate": 3.938461538461539e-06, |
|
"loss": 0.0376, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 40.30769230769231, |
|
"grad_norm": 0.04949569329619408, |
|
"learning_rate": 3.876923076923077e-06, |
|
"loss": 0.0621, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 40.46153846153846, |
|
"grad_norm": 0.0461997464299202, |
|
"learning_rate": 3.815384615384616e-06, |
|
"loss": 0.0107, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 40.61538461538461, |
|
"grad_norm": 0.048004575073719025, |
|
"learning_rate": 3.753846153846154e-06, |
|
"loss": 0.0093, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 40.76923076923077, |
|
"grad_norm": 0.04209740087389946, |
|
"learning_rate": 3.692307692307693e-06, |
|
"loss": 0.0342, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 40.92307692307692, |
|
"grad_norm": 0.15323477983474731, |
|
"learning_rate": 3.630769230769231e-06, |
|
"loss": 0.0618, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 41.0, |
|
"eval_accuracy": 1.0, |
|
"eval_loss": 0.009433195926249027, |
|
"eval_runtime": 0.6376, |
|
"eval_samples_per_second": 208.608, |
|
"eval_steps_per_second": 14.116, |
|
"step": 2665 |
|
}, |
|
{ |
|
"epoch": 41.07692307692308, |
|
"grad_norm": 0.040465518832206726, |
|
"learning_rate": 3.5692307692307692e-06, |
|
"loss": 0.0079, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 41.23076923076923, |
|
"grad_norm": 0.06956275552511215, |
|
"learning_rate": 3.507692307692308e-06, |
|
"loss": 0.0278, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 41.38461538461539, |
|
"grad_norm": 0.04409582540392876, |
|
"learning_rate": 3.4461538461538464e-06, |
|
"loss": 0.0079, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 41.53846153846154, |
|
"grad_norm": 13.665828704833984, |
|
"learning_rate": 3.384615384615385e-06, |
|
"loss": 0.0187, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 41.69230769230769, |
|
"grad_norm": 1.3187448978424072, |
|
"learning_rate": 3.323076923076923e-06, |
|
"loss": 0.1204, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 41.84615384615385, |
|
"grad_norm": 0.04300126060843468, |
|
"learning_rate": 3.2615384615384615e-06, |
|
"loss": 0.0198, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 42.0, |
|
"grad_norm": 0.0438438281416893, |
|
"learning_rate": 3.2000000000000003e-06, |
|
"loss": 0.0847, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 42.0, |
|
"eval_accuracy": 0.9924812030075187, |
|
"eval_loss": 0.019689705222845078, |
|
"eval_runtime": 0.634, |
|
"eval_samples_per_second": 209.767, |
|
"eval_steps_per_second": 14.195, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 42.15384615384615, |
|
"grad_norm": 0.055344920605421066, |
|
"learning_rate": 3.1384615384615386e-06, |
|
"loss": 0.0343, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 42.30769230769231, |
|
"grad_norm": 0.04840540513396263, |
|
"learning_rate": 3.0769230769230774e-06, |
|
"loss": 0.0091, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 42.46153846153846, |
|
"grad_norm": 0.0416325181722641, |
|
"learning_rate": 3.0153846153846154e-06, |
|
"loss": 0.0379, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 42.61538461538461, |
|
"grad_norm": 0.04047630727291107, |
|
"learning_rate": 2.953846153846154e-06, |
|
"loss": 0.0344, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 42.76923076923077, |
|
"grad_norm": 0.039694271981716156, |
|
"learning_rate": 2.8923076923076925e-06, |
|
"loss": 0.0556, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 42.92307692307692, |
|
"grad_norm": 0.0425509512424469, |
|
"learning_rate": 2.830769230769231e-06, |
|
"loss": 0.0291, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 43.0, |
|
"eval_accuracy": 1.0, |
|
"eval_loss": 0.008893251419067383, |
|
"eval_runtime": 0.6271, |
|
"eval_samples_per_second": 212.079, |
|
"eval_steps_per_second": 14.351, |
|
"step": 2795 |
|
}, |
|
{ |
|
"epoch": 43.07692307692308, |
|
"grad_norm": 0.04988468438386917, |
|
"learning_rate": 2.7692307692307697e-06, |
|
"loss": 0.0369, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 43.23076923076923, |
|
"grad_norm": 0.07137361913919449, |
|
"learning_rate": 2.7076923076923076e-06, |
|
"loss": 0.0434, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 43.38461538461539, |
|
"grad_norm": 7.0051679611206055, |
|
"learning_rate": 2.6461538461538464e-06, |
|
"loss": 0.0291, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 43.53846153846154, |
|
"grad_norm": 0.045469850301742554, |
|
"learning_rate": 2.584615384615385e-06, |
|
"loss": 0.0338, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 43.69230769230769, |
|
"grad_norm": 0.08003593236207962, |
|
"learning_rate": 2.523076923076923e-06, |
|
"loss": 0.0099, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 43.84615384615385, |
|
"grad_norm": 0.04380409047007561, |
|
"learning_rate": 2.461538461538462e-06, |
|
"loss": 0.0111, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 44.0, |
|
"grad_norm": 13.31029224395752, |
|
"learning_rate": 2.4000000000000003e-06, |
|
"loss": 0.0568, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 44.0, |
|
"eval_accuracy": 1.0, |
|
"eval_loss": 0.008692615665495396, |
|
"eval_runtime": 0.585, |
|
"eval_samples_per_second": 227.347, |
|
"eval_steps_per_second": 15.384, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 44.15384615384615, |
|
"grad_norm": 0.04654600843787193, |
|
"learning_rate": 2.3384615384615387e-06, |
|
"loss": 0.0087, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 44.30769230769231, |
|
"grad_norm": 7.452500343322754, |
|
"learning_rate": 2.276923076923077e-06, |
|
"loss": 0.0108, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 44.46153846153846, |
|
"grad_norm": 13.458589553833008, |
|
"learning_rate": 2.215384615384616e-06, |
|
"loss": 0.0274, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 44.61538461538461, |
|
"grad_norm": 0.044014327228069305, |
|
"learning_rate": 2.153846153846154e-06, |
|
"loss": 0.0299, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 44.76923076923077, |
|
"grad_norm": 0.041860181838274, |
|
"learning_rate": 2.0923076923076926e-06, |
|
"loss": 0.0112, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 44.92307692307692, |
|
"grad_norm": 0.04078350216150284, |
|
"learning_rate": 2.030769230769231e-06, |
|
"loss": 0.0077, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 45.0, |
|
"eval_accuracy": 1.0, |
|
"eval_loss": 0.010402214713394642, |
|
"eval_runtime": 0.6383, |
|
"eval_samples_per_second": 208.376, |
|
"eval_steps_per_second": 14.101, |
|
"step": 2925 |
|
}, |
|
{ |
|
"epoch": 45.07692307692308, |
|
"grad_norm": 0.7966273427009583, |
|
"learning_rate": 1.9692307692307693e-06, |
|
"loss": 0.0432, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 45.23076923076923, |
|
"grad_norm": 0.04070662334561348, |
|
"learning_rate": 1.907692307692308e-06, |
|
"loss": 0.0549, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 45.38461538461539, |
|
"grad_norm": 0.042289331555366516, |
|
"learning_rate": 1.8461538461538465e-06, |
|
"loss": 0.0364, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 45.53846153846154, |
|
"grad_norm": 0.04655339941382408, |
|
"learning_rate": 1.7846153846153846e-06, |
|
"loss": 0.0114, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 45.69230769230769, |
|
"grad_norm": 0.04026507958769798, |
|
"learning_rate": 1.7230769230769232e-06, |
|
"loss": 0.0078, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 45.84615384615385, |
|
"grad_norm": 0.048073675483465195, |
|
"learning_rate": 1.6615384615384616e-06, |
|
"loss": 0.0155, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 46.0, |
|
"grad_norm": 0.1250167340040207, |
|
"learning_rate": 1.6000000000000001e-06, |
|
"loss": 0.008, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 46.0, |
|
"eval_accuracy": 1.0, |
|
"eval_loss": 0.013788605108857155, |
|
"eval_runtime": 0.5799, |
|
"eval_samples_per_second": 229.355, |
|
"eval_steps_per_second": 15.52, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 46.15384615384615, |
|
"grad_norm": 0.3202461302280426, |
|
"learning_rate": 1.5384615384615387e-06, |
|
"loss": 0.0352, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 46.30769230769231, |
|
"grad_norm": 2.8946588039398193, |
|
"learning_rate": 1.476923076923077e-06, |
|
"loss": 0.0196, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 46.46153846153846, |
|
"grad_norm": 0.0777769535779953, |
|
"learning_rate": 1.4153846153846155e-06, |
|
"loss": 0.0079, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 46.61538461538461, |
|
"grad_norm": 1.6607468128204346, |
|
"learning_rate": 1.3538461538461538e-06, |
|
"loss": 0.0102, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 46.76923076923077, |
|
"grad_norm": 0.04541005194187164, |
|
"learning_rate": 1.2923076923076924e-06, |
|
"loss": 0.0085, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 46.92307692307692, |
|
"grad_norm": 0.041475191712379456, |
|
"learning_rate": 1.230769230769231e-06, |
|
"loss": 0.0272, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 47.0, |
|
"eval_accuracy": 1.0, |
|
"eval_loss": 0.00810349639505148, |
|
"eval_runtime": 0.6353, |
|
"eval_samples_per_second": 209.365, |
|
"eval_steps_per_second": 14.168, |
|
"step": 3055 |
|
}, |
|
{ |
|
"epoch": 47.07692307692308, |
|
"grad_norm": 0.048281800001859665, |
|
"learning_rate": 1.1692307692307693e-06, |
|
"loss": 0.008, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 47.23076923076923, |
|
"grad_norm": 0.03975387290120125, |
|
"learning_rate": 1.107692307692308e-06, |
|
"loss": 0.0431, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 47.38461538461539, |
|
"grad_norm": 0.040405042469501495, |
|
"learning_rate": 1.0461538461538463e-06, |
|
"loss": 0.0355, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 47.53846153846154, |
|
"grad_norm": 0.04081344977021217, |
|
"learning_rate": 9.846153846153847e-07, |
|
"loss": 0.0078, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 47.69230769230769, |
|
"grad_norm": 0.045139458030462265, |
|
"learning_rate": 9.230769230769232e-07, |
|
"loss": 0.0096, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 47.84615384615385, |
|
"grad_norm": 0.06699339300394058, |
|
"learning_rate": 8.615384615384616e-07, |
|
"loss": 0.0402, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 48.0, |
|
"grad_norm": 0.04077847674489021, |
|
"learning_rate": 8.000000000000001e-07, |
|
"loss": 0.008, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 48.0, |
|
"eval_accuracy": 1.0, |
|
"eval_loss": 0.008442863821983337, |
|
"eval_runtime": 0.6316, |
|
"eval_samples_per_second": 210.576, |
|
"eval_steps_per_second": 14.25, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 48.15384615384615, |
|
"grad_norm": 0.14330124855041504, |
|
"learning_rate": 7.384615384615385e-07, |
|
"loss": 0.0396, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 48.30769230769231, |
|
"grad_norm": 0.04051917791366577, |
|
"learning_rate": 6.769230769230769e-07, |
|
"loss": 0.0613, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 48.46153846153846, |
|
"grad_norm": 0.03945121914148331, |
|
"learning_rate": 6.153846153846155e-07, |
|
"loss": 0.0092, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 48.61538461538461, |
|
"grad_norm": 0.04850227013230324, |
|
"learning_rate": 5.53846153846154e-07, |
|
"loss": 0.0395, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 48.76923076923077, |
|
"grad_norm": 0.611132800579071, |
|
"learning_rate": 4.923076923076923e-07, |
|
"loss": 0.015, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 48.92307692307692, |
|
"grad_norm": 0.039736129343509674, |
|
"learning_rate": 4.307692307692308e-07, |
|
"loss": 0.0112, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 49.0, |
|
"eval_accuracy": 1.0, |
|
"eval_loss": 0.008192874491214752, |
|
"eval_runtime": 0.594, |
|
"eval_samples_per_second": 223.907, |
|
"eval_steps_per_second": 15.152, |
|
"step": 3185 |
|
}, |
|
{ |
|
"epoch": 49.07692307692308, |
|
"grad_norm": 0.0747324600815773, |
|
"learning_rate": 3.6923076923076927e-07, |
|
"loss": 0.018, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 49.23076923076923, |
|
"grad_norm": 0.03965931013226509, |
|
"learning_rate": 3.0769230769230774e-07, |
|
"loss": 0.0948, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 49.38461538461539, |
|
"grad_norm": 3.6046483516693115, |
|
"learning_rate": 2.4615384615384616e-07, |
|
"loss": 0.0302, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 49.53846153846154, |
|
"grad_norm": 0.03949074074625969, |
|
"learning_rate": 1.8461538461538464e-07, |
|
"loss": 0.0094, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 49.69230769230769, |
|
"grad_norm": 0.03980456292629242, |
|
"learning_rate": 1.2307692307692308e-07, |
|
"loss": 0.0388, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 49.84615384615385, |
|
"grad_norm": 8.264440536499023, |
|
"learning_rate": 6.153846153846154e-08, |
|
"loss": 0.0143, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 50.0, |
|
"grad_norm": 0.043733663856983185, |
|
"learning_rate": 0.0, |
|
"loss": 0.013, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 50.0, |
|
"eval_accuracy": 1.0, |
|
"eval_loss": 0.007947824895381927, |
|
"eval_runtime": 0.6285, |
|
"eval_samples_per_second": 211.608, |
|
"eval_steps_per_second": 14.319, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 50.0, |
|
"step": 3250, |
|
"total_flos": 3.6243328994998477e+18, |
|
"train_loss": 0.0, |
|
"train_runtime": 0.0791, |
|
"train_samples_per_second": 65335.775, |
|
"train_steps_per_second": 4107.181 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 3250,
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 50,
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.6243328994998477e+18, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|