{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.996777029006739,
  "eval_steps": 500,
  "global_step": 1278,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.023439789041898623,
      "grad_norm": 9.967006175549518,
      "learning_rate": 5e-06,
      "loss": 0.6132,
      "step": 10
    },
    {
      "epoch": 0.046879578083797245,
      "grad_norm": 1.708901732081446,
      "learning_rate": 5e-06,
      "loss": 0.5268,
      "step": 20
    },
    {
      "epoch": 0.07031936712569586,
      "grad_norm": 1.6601249126276376,
      "learning_rate": 5e-06,
      "loss": 0.4939,
      "step": 30
    },
    {
      "epoch": 0.09375915616759449,
      "grad_norm": 0.9058674995172575,
      "learning_rate": 5e-06,
      "loss": 0.4729,
      "step": 40
    },
    {
      "epoch": 0.11719894520949312,
      "grad_norm": 1.1831753366659354,
      "learning_rate": 5e-06,
      "loss": 0.4606,
      "step": 50
    },
    {
      "epoch": 0.14063873425139173,
      "grad_norm": 0.6857017193079774,
      "learning_rate": 5e-06,
      "loss": 0.4448,
      "step": 60
    },
    {
      "epoch": 0.16407852329329037,
      "grad_norm": 0.665824933523209,
      "learning_rate": 5e-06,
      "loss": 0.4416,
      "step": 70
    },
    {
      "epoch": 0.18751831233518898,
      "grad_norm": 1.5098116393031806,
      "learning_rate": 5e-06,
      "loss": 0.436,
      "step": 80
    },
    {
      "epoch": 0.2109581013770876,
      "grad_norm": 0.8814351421187862,
      "learning_rate": 5e-06,
      "loss": 0.4249,
      "step": 90
    },
    {
      "epoch": 0.23439789041898623,
      "grad_norm": 0.5818056460875761,
      "learning_rate": 5e-06,
      "loss": 0.416,
      "step": 100
    },
    {
      "epoch": 0.25783767946088487,
      "grad_norm": 0.6332178617430071,
      "learning_rate": 5e-06,
      "loss": 0.4143,
      "step": 110
    },
    {
      "epoch": 0.28127746850278346,
      "grad_norm": 0.6385133089285979,
      "learning_rate": 5e-06,
      "loss": 0.4142,
      "step": 120
    },
    {
      "epoch": 0.3047172575446821,
      "grad_norm": 0.5190122301041274,
      "learning_rate": 5e-06,
      "loss": 0.4054,
      "step": 130
    },
    {
      "epoch": 0.32815704658658074,
      "grad_norm": 0.72677604074983,
      "learning_rate": 5e-06,
      "loss": 0.4061,
      "step": 140
    },
    {
      "epoch": 0.3515968356284793,
      "grad_norm": 0.5862867262610982,
      "learning_rate": 5e-06,
      "loss": 0.4002,
      "step": 150
    },
    {
      "epoch": 0.37503662467037796,
      "grad_norm": 0.4943669370299919,
      "learning_rate": 5e-06,
      "loss": 0.4078,
      "step": 160
    },
    {
      "epoch": 0.3984764137122766,
      "grad_norm": 0.617733036177013,
      "learning_rate": 5e-06,
      "loss": 0.396,
      "step": 170
    },
    {
      "epoch": 0.4219162027541752,
      "grad_norm": 0.501523217408356,
      "learning_rate": 5e-06,
      "loss": 0.398,
      "step": 180
    },
    {
      "epoch": 0.4453559917960738,
      "grad_norm": 0.5513765393124738,
      "learning_rate": 5e-06,
      "loss": 0.3915,
      "step": 190
    },
    {
      "epoch": 0.46879578083797246,
      "grad_norm": 0.48423414375531915,
      "learning_rate": 5e-06,
      "loss": 0.3928,
      "step": 200
    },
    {
      "epoch": 0.4922355698798711,
      "grad_norm": 0.6668034319234166,
      "learning_rate": 5e-06,
      "loss": 0.3897,
      "step": 210
    },
    {
      "epoch": 0.5156753589217697,
      "grad_norm": 0.6844513719684475,
      "learning_rate": 5e-06,
      "loss": 0.3943,
      "step": 220
    },
    {
      "epoch": 0.5391151479636683,
      "grad_norm": 0.5634978239261383,
      "learning_rate": 5e-06,
      "loss": 0.3867,
      "step": 230
    },
    {
      "epoch": 0.5625549370055669,
      "grad_norm": 0.48253201050555344,
      "learning_rate": 5e-06,
      "loss": 0.3951,
      "step": 240
    },
    {
      "epoch": 0.5859947260474656,
      "grad_norm": 0.5583859071799139,
      "learning_rate": 5e-06,
      "loss": 0.3896,
      "step": 250
    },
    {
      "epoch": 0.6094345150893642,
      "grad_norm": 0.5725388703107269,
      "learning_rate": 5e-06,
      "loss": 0.3888,
      "step": 260
    },
    {
      "epoch": 0.6328743041312628,
      "grad_norm": 0.8156082811133835,
      "learning_rate": 5e-06,
      "loss": 0.379,
      "step": 270
    },
    {
      "epoch": 0.6563140931731615,
      "grad_norm": 0.5966575531920639,
      "learning_rate": 5e-06,
      "loss": 0.3864,
      "step": 280
    },
    {
      "epoch": 0.6797538822150601,
      "grad_norm": 0.48850438172794114,
      "learning_rate": 5e-06,
      "loss": 0.3873,
      "step": 290
    },
    {
      "epoch": 0.7031936712569586,
      "grad_norm": 0.5962793861443083,
      "learning_rate": 5e-06,
      "loss": 0.3869,
      "step": 300
    },
    {
      "epoch": 0.7266334602988573,
      "grad_norm": 0.5171258822649074,
      "learning_rate": 5e-06,
      "loss": 0.3818,
      "step": 310
    },
    {
      "epoch": 0.7500732493407559,
      "grad_norm": 0.5592073208479841,
      "learning_rate": 5e-06,
      "loss": 0.3755,
      "step": 320
    },
    {
      "epoch": 0.7735130383826545,
      "grad_norm": 0.533904690846359,
      "learning_rate": 5e-06,
      "loss": 0.3745,
      "step": 330
    },
    {
      "epoch": 0.7969528274245532,
      "grad_norm": 0.548421936851489,
      "learning_rate": 5e-06,
      "loss": 0.3794,
      "step": 340
    },
    {
      "epoch": 0.8203926164664518,
      "grad_norm": 0.5826483187799658,
      "learning_rate": 5e-06,
      "loss": 0.3729,
      "step": 350
    },
    {
      "epoch": 0.8438324055083504,
      "grad_norm": 0.65088471853365,
      "learning_rate": 5e-06,
      "loss": 0.3728,
      "step": 360
    },
    {
      "epoch": 0.8672721945502491,
      "grad_norm": 0.7534193430854838,
      "learning_rate": 5e-06,
      "loss": 0.3763,
      "step": 370
    },
    {
      "epoch": 0.8907119835921476,
      "grad_norm": 0.6782662978614515,
      "learning_rate": 5e-06,
      "loss": 0.3794,
      "step": 380
    },
    {
      "epoch": 0.9141517726340463,
      "grad_norm": 0.4771544648840299,
      "learning_rate": 5e-06,
      "loss": 0.3702,
      "step": 390
    },
    {
      "epoch": 0.9375915616759449,
      "grad_norm": 0.5535379054118112,
      "learning_rate": 5e-06,
      "loss": 0.3768,
      "step": 400
    },
    {
      "epoch": 0.9610313507178435,
      "grad_norm": 0.4608571118764034,
      "learning_rate": 5e-06,
      "loss": 0.3788,
      "step": 410
    },
    {
      "epoch": 0.9844711397597422,
      "grad_norm": 0.5668677920657946,
      "learning_rate": 5e-06,
      "loss": 0.3673,
      "step": 420
    },
    {
      "epoch": 0.9985350131848814,
      "eval_loss": 0.37145182490348816,
      "eval_runtime": 299.8322,
      "eval_samples_per_second": 38.341,
      "eval_steps_per_second": 0.6,
      "step": 426
    },
    {
      "epoch": 1.0084969235276882,
      "grad_norm": 0.6984558481581081,
      "learning_rate": 5e-06,
      "loss": 0.3801,
      "step": 430
    },
    {
      "epoch": 1.0319367125695869,
      "grad_norm": 0.5239390968899379,
      "learning_rate": 5e-06,
      "loss": 0.3221,
      "step": 440
    },
    {
      "epoch": 1.0553765016114856,
      "grad_norm": 0.5695148407791144,
      "learning_rate": 5e-06,
      "loss": 0.3257,
      "step": 450
    },
    {
      "epoch": 1.078816290653384,
      "grad_norm": 0.5709165769711674,
      "learning_rate": 5e-06,
      "loss": 0.3205,
      "step": 460
    },
    {
      "epoch": 1.1022560796952827,
      "grad_norm": 0.5285237377857552,
      "learning_rate": 5e-06,
      "loss": 0.3267,
      "step": 470
    },
    {
      "epoch": 1.1256958687371814,
      "grad_norm": 0.5164839735723259,
      "learning_rate": 5e-06,
      "loss": 0.3269,
      "step": 480
    },
    {
      "epoch": 1.1491356577790799,
      "grad_norm": 0.4651823010378352,
      "learning_rate": 5e-06,
      "loss": 0.3299,
      "step": 490
    },
    {
      "epoch": 1.1725754468209786,
      "grad_norm": 0.4611493940935449,
      "learning_rate": 5e-06,
      "loss": 0.3307,
      "step": 500
    },
    {
      "epoch": 1.1960152358628773,
      "grad_norm": 0.5290456767213961,
      "learning_rate": 5e-06,
      "loss": 0.3338,
      "step": 510
    },
    {
      "epoch": 1.219455024904776,
      "grad_norm": 0.49326918332510095,
      "learning_rate": 5e-06,
      "loss": 0.3253,
      "step": 520
    },
    {
      "epoch": 1.2428948139466744,
      "grad_norm": 0.551961171965543,
      "learning_rate": 5e-06,
      "loss": 0.3296,
      "step": 530
    },
    {
      "epoch": 1.2663346029885731,
      "grad_norm": 0.49768749823200925,
      "learning_rate": 5e-06,
      "loss": 0.3256,
      "step": 540
    },
    {
      "epoch": 1.2897743920304716,
      "grad_norm": 0.5626038647037341,
      "learning_rate": 5e-06,
      "loss": 0.3255,
      "step": 550
    },
    {
      "epoch": 1.3132141810723703,
      "grad_norm": 0.5103540222464168,
      "learning_rate": 5e-06,
      "loss": 0.3225,
      "step": 560
    },
    {
      "epoch": 1.336653970114269,
      "grad_norm": 0.540424196440726,
      "learning_rate": 5e-06,
      "loss": 0.3264,
      "step": 570
    },
    {
      "epoch": 1.3600937591561677,
      "grad_norm": 0.5745930638703757,
      "learning_rate": 5e-06,
      "loss": 0.3196,
      "step": 580
    },
    {
      "epoch": 1.3835335481980662,
      "grad_norm": 0.6445462088875488,
      "learning_rate": 5e-06,
      "loss": 0.3244,
      "step": 590
    },
    {
      "epoch": 1.4069733372399649,
      "grad_norm": 0.481096562270643,
      "learning_rate": 5e-06,
      "loss": 0.3224,
      "step": 600
    },
    {
      "epoch": 1.4304131262818633,
      "grad_norm": 0.5092132313164293,
      "learning_rate": 5e-06,
      "loss": 0.3251,
      "step": 610
    },
    {
      "epoch": 1.453852915323762,
      "grad_norm": 0.5067387262720331,
      "learning_rate": 5e-06,
      "loss": 0.3265,
      "step": 620
    },
    {
      "epoch": 1.4772927043656607,
      "grad_norm": 0.48943368667359805,
      "learning_rate": 5e-06,
      "loss": 0.3279,
      "step": 630
    },
    {
      "epoch": 1.5007324934075594,
      "grad_norm": 0.5183176032348761,
      "learning_rate": 5e-06,
      "loss": 0.3227,
      "step": 640
    },
    {
      "epoch": 1.524172282449458,
      "grad_norm": 0.5247150397228669,
      "learning_rate": 5e-06,
      "loss": 0.3181,
      "step": 650
    },
    {
      "epoch": 1.5476120714913566,
      "grad_norm": 0.5298290612904054,
      "learning_rate": 5e-06,
      "loss": 0.3275,
      "step": 660
    },
    {
      "epoch": 1.571051860533255,
      "grad_norm": 0.5335846082176007,
      "learning_rate": 5e-06,
      "loss": 0.3228,
      "step": 670
    },
    {
      "epoch": 1.5944916495751538,
      "grad_norm": 0.49588298624982247,
      "learning_rate": 5e-06,
      "loss": 0.3199,
      "step": 680
    },
    {
      "epoch": 1.6179314386170525,
      "grad_norm": 0.5142614839866478,
      "learning_rate": 5e-06,
      "loss": 0.3317,
      "step": 690
    },
    {
      "epoch": 1.6413712276589512,
      "grad_norm": 0.4957666564649026,
      "learning_rate": 5e-06,
      "loss": 0.3239,
      "step": 700
    },
    {
      "epoch": 1.6648110167008499,
      "grad_norm": 0.5027765035894468,
      "learning_rate": 5e-06,
      "loss": 0.3208,
      "step": 710
    },
    {
      "epoch": 1.6882508057427483,
      "grad_norm": 0.6791514332148134,
      "learning_rate": 5e-06,
      "loss": 0.3184,
      "step": 720
    },
    {
      "epoch": 1.7116905947846468,
      "grad_norm": 0.4925105147109606,
      "learning_rate": 5e-06,
      "loss": 0.3252,
      "step": 730
    },
    {
      "epoch": 1.7351303838265455,
      "grad_norm": 0.5049447941188098,
      "learning_rate": 5e-06,
      "loss": 0.3201,
      "step": 740
    },
    {
      "epoch": 1.7585701728684442,
      "grad_norm": 0.544693187707417,
      "learning_rate": 5e-06,
      "loss": 0.3258,
      "step": 750
    },
    {
      "epoch": 1.7820099619103429,
      "grad_norm": 0.4756047714296392,
      "learning_rate": 5e-06,
      "loss": 0.3231,
      "step": 760
    },
    {
      "epoch": 1.8054497509522416,
      "grad_norm": 0.5599548628855956,
      "learning_rate": 5e-06,
      "loss": 0.3246,
      "step": 770
    },
    {
      "epoch": 1.82888953999414,
      "grad_norm": 0.4756090546668893,
      "learning_rate": 5e-06,
      "loss": 0.3219,
      "step": 780
    },
    {
      "epoch": 1.8523293290360385,
      "grad_norm": 0.6124115056884903,
      "learning_rate": 5e-06,
      "loss": 0.3217,
      "step": 790
    },
    {
      "epoch": 1.8757691180779372,
      "grad_norm": 0.46580289937046976,
      "learning_rate": 5e-06,
      "loss": 0.3259,
      "step": 800
    },
    {
      "epoch": 1.899208907119836,
      "grad_norm": 0.5943245513187589,
      "learning_rate": 5e-06,
      "loss": 0.3197,
      "step": 810
    },
    {
      "epoch": 1.9226486961617346,
      "grad_norm": 0.49042860120039294,
      "learning_rate": 5e-06,
      "loss": 0.3226,
      "step": 820
    },
    {
      "epoch": 1.9460884852036333,
      "grad_norm": 0.5267257339110174,
      "learning_rate": 5e-06,
      "loss": 0.3221,
      "step": 830
    },
    {
      "epoch": 1.9695282742455318,
      "grad_norm": 0.717669514593216,
      "learning_rate": 5e-06,
      "loss": 0.3282,
      "step": 840
    },
    {
      "epoch": 1.9929680632874303,
      "grad_norm": 0.46905846352186026,
      "learning_rate": 5e-06,
      "loss": 0.319,
      "step": 850
    },
    {
      "epoch": 1.99765602109581,
      "eval_loss": 0.36132651567459106,
      "eval_runtime": 296.4899,
      "eval_samples_per_second": 38.774,
      "eval_steps_per_second": 0.607,
      "step": 852
    },
    {
      "epoch": 2.0169938470553763,
      "grad_norm": 0.6240807456372731,
      "learning_rate": 5e-06,
      "loss": 0.3065,
      "step": 860
    },
    {
      "epoch": 2.040433636097275,
      "grad_norm": 0.6288943822511728,
      "learning_rate": 5e-06,
      "loss": 0.2738,
      "step": 870
    },
    {
      "epoch": 2.0638734251391737,
      "grad_norm": 0.5311356208759552,
      "learning_rate": 5e-06,
      "loss": 0.2731,
      "step": 880
    },
    {
      "epoch": 2.0873132141810724,
      "grad_norm": 0.56075413432555,
      "learning_rate": 5e-06,
      "loss": 0.2744,
      "step": 890
    },
    {
      "epoch": 2.110753003222971,
      "grad_norm": 0.5639112603035239,
      "learning_rate": 5e-06,
      "loss": 0.2751,
      "step": 900
    },
    {
      "epoch": 2.13419279226487,
      "grad_norm": 0.5591523409924748,
      "learning_rate": 5e-06,
      "loss": 0.2765,
      "step": 910
    },
    {
      "epoch": 2.157632581306768,
      "grad_norm": 0.5663817052316904,
      "learning_rate": 5e-06,
      "loss": 0.2737,
      "step": 920
    },
    {
      "epoch": 2.1810723703486667,
      "grad_norm": 0.623581090691007,
      "learning_rate": 5e-06,
      "loss": 0.2741,
      "step": 930
    },
    {
      "epoch": 2.2045121593905654,
      "grad_norm": 0.47856596919111055,
      "learning_rate": 5e-06,
      "loss": 0.2756,
      "step": 940
    },
    {
      "epoch": 2.227951948432464,
      "grad_norm": 0.48512193272255605,
      "learning_rate": 5e-06,
      "loss": 0.2737,
      "step": 950
    },
    {
      "epoch": 2.251391737474363,
      "grad_norm": 0.5099859245096886,
      "learning_rate": 5e-06,
      "loss": 0.2796,
      "step": 960
    },
    {
      "epoch": 2.2748315265162615,
      "grad_norm": 0.5305125040779604,
      "learning_rate": 5e-06,
      "loss": 0.273,
      "step": 970
    },
    {
      "epoch": 2.2982713155581598,
      "grad_norm": 0.557582967315411,
      "learning_rate": 5e-06,
      "loss": 0.273,
      "step": 980
    },
    {
      "epoch": 2.3217111046000585,
      "grad_norm": 0.5807484141453545,
      "learning_rate": 5e-06,
      "loss": 0.2755,
      "step": 990
    },
    {
      "epoch": 2.345150893641957,
      "grad_norm": 0.49544753860231366,
      "learning_rate": 5e-06,
      "loss": 0.281,
      "step": 1000
    },
    {
      "epoch": 2.368590682683856,
      "grad_norm": 0.5035116546897159,
      "learning_rate": 5e-06,
      "loss": 0.275,
      "step": 1010
    },
    {
      "epoch": 2.3920304717257546,
      "grad_norm": 0.5229341109864724,
      "learning_rate": 5e-06,
      "loss": 0.2778,
      "step": 1020
    },
    {
      "epoch": 2.4154702607676533,
      "grad_norm": 0.5788744389040322,
      "learning_rate": 5e-06,
      "loss": 0.2843,
      "step": 1030
    },
    {
      "epoch": 2.438910049809552,
      "grad_norm": 0.587705401996759,
      "learning_rate": 5e-06,
      "loss": 0.278,
      "step": 1040
    },
    {
      "epoch": 2.46234983885145,
      "grad_norm": 0.526667348496211,
      "learning_rate": 5e-06,
      "loss": 0.2758,
      "step": 1050
    },
    {
      "epoch": 2.485789627893349,
      "grad_norm": 0.5557320228470611,
      "learning_rate": 5e-06,
      "loss": 0.2749,
      "step": 1060
    },
    {
      "epoch": 2.5092294169352476,
      "grad_norm": 0.5455334258984845,
      "learning_rate": 5e-06,
      "loss": 0.2829,
      "step": 1070
    },
    {
      "epoch": 2.5326692059771463,
      "grad_norm": 0.6049156598816975,
      "learning_rate": 5e-06,
      "loss": 0.2792,
      "step": 1080
    },
    {
      "epoch": 2.556108995019045,
      "grad_norm": 0.6063878997150756,
      "learning_rate": 5e-06,
      "loss": 0.2768,
      "step": 1090
    },
    {
      "epoch": 2.5795487840609432,
      "grad_norm": 0.5807051957808216,
      "learning_rate": 5e-06,
      "loss": 0.2837,
      "step": 1100
    },
    {
      "epoch": 2.602988573102842,
      "grad_norm": 0.48690825278368993,
      "learning_rate": 5e-06,
      "loss": 0.2732,
      "step": 1110
    },
    {
      "epoch": 2.6264283621447406,
      "grad_norm": 0.47298208429795785,
      "learning_rate": 5e-06,
      "loss": 0.2838,
      "step": 1120
    },
    {
      "epoch": 2.6498681511866393,
      "grad_norm": 0.5117446887050897,
      "learning_rate": 5e-06,
      "loss": 0.2799,
      "step": 1130
    },
    {
      "epoch": 2.673307940228538,
      "grad_norm": 0.5788651437910809,
      "learning_rate": 5e-06,
      "loss": 0.2831,
      "step": 1140
    },
    {
      "epoch": 2.6967477292704367,
      "grad_norm": 0.48115847246690635,
      "learning_rate": 5e-06,
      "loss": 0.2795,
      "step": 1150
    },
    {
      "epoch": 2.7201875183123354,
      "grad_norm": 0.4752829940802366,
      "learning_rate": 5e-06,
      "loss": 0.282,
      "step": 1160
    },
    {
      "epoch": 2.7436273073542337,
      "grad_norm": 0.4817048174863979,
      "learning_rate": 5e-06,
      "loss": 0.2798,
      "step": 1170
    },
    {
      "epoch": 2.7670670963961324,
      "grad_norm": 0.5163232884368958,
      "learning_rate": 5e-06,
      "loss": 0.2785,
      "step": 1180
    },
    {
      "epoch": 2.790506885438031,
      "grad_norm": 0.49724121376113645,
      "learning_rate": 5e-06,
      "loss": 0.2871,
      "step": 1190
    },
    {
      "epoch": 2.8139466744799297,
      "grad_norm": 0.575214274563183,
      "learning_rate": 5e-06,
      "loss": 0.277,
      "step": 1200
    },
    {
      "epoch": 2.8373864635218284,
      "grad_norm": 0.49524252636931043,
      "learning_rate": 5e-06,
      "loss": 0.284,
      "step": 1210
    },
    {
      "epoch": 2.8608262525637267,
      "grad_norm": 0.5731018313488945,
      "learning_rate": 5e-06,
      "loss": 0.279,
      "step": 1220
    },
    {
      "epoch": 2.8842660416056254,
      "grad_norm": 0.5359950242882849,
      "learning_rate": 5e-06,
      "loss": 0.2805,
      "step": 1230
    },
    {
      "epoch": 2.907705830647524,
      "grad_norm": 0.5534733032853985,
      "learning_rate": 5e-06,
      "loss": 0.2819,
      "step": 1240
    },
    {
      "epoch": 2.9311456196894228,
      "grad_norm": 0.5653588586425916,
      "learning_rate": 5e-06,
      "loss": 0.276,
      "step": 1250
    },
    {
      "epoch": 2.9545854087313215,
      "grad_norm": 0.5400026090208886,
      "learning_rate": 5e-06,
      "loss": 0.2795,
      "step": 1260
    },
    {
      "epoch": 2.97802519777322,
      "grad_norm": 0.5145229483499503,
      "learning_rate": 5e-06,
      "loss": 0.2817,
      "step": 1270
    },
    {
      "epoch": 2.996777029006739,
      "eval_loss": 0.3670854866504669,
      "eval_runtime": 285.3181,
      "eval_samples_per_second": 40.292,
      "eval_steps_per_second": 0.631,
      "step": 1278
    },
    {
      "epoch": 2.996777029006739,
      "step": 1278,
      "total_flos": 2140488220016640.0,
      "train_loss": 0.33715206393986613,
      "train_runtime": 42576.9653,
      "train_samples_per_second": 15.39,
      "train_steps_per_second": 0.03
    }
  ],
  "logging_steps": 10,
  "max_steps": 1278,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2140488220016640.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}