|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.9871244635193133, |
|
"eval_steps": 500, |
|
"global_step": 522, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.02861230329041488, |
|
"grad_norm": 2827.3015022702225, |
|
"learning_rate": 9.766490138119515e-06, |
|
"loss": 6.1313, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.05722460658082976, |
|
"grad_norm": 516.297994115725, |
|
"learning_rate": 1.3972688495262568e-05, |
|
"loss": 3.18, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.08583690987124463, |
|
"grad_norm": 10.64334537255521, |
|
"learning_rate": 1.6433156804786183e-05, |
|
"loss": 0.6962, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.11444921316165951, |
|
"grad_norm": 21.79989086330797, |
|
"learning_rate": 1.8178886852405614e-05, |
|
"loss": 0.4631, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.1430615164520744, |
|
"grad_norm": 8.407923664935195, |
|
"learning_rate": 1.953298027623903e-05, |
|
"loss": 0.3384, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.17167381974248927, |
|
"grad_norm": 4.980763069359495, |
|
"learning_rate": 2e-05, |
|
"loss": 0.2112, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.20028612303290416, |
|
"grad_norm": 3.3050323983887084, |
|
"learning_rate": 2e-05, |
|
"loss": 0.1508, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.22889842632331903, |
|
"grad_norm": 2.747487969978562, |
|
"learning_rate": 2e-05, |
|
"loss": 0.1331, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.2575107296137339, |
|
"grad_norm": 3.8930367082100776, |
|
"learning_rate": 2e-05, |
|
"loss": 0.1279, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.2861230329041488, |
|
"grad_norm": 2.262054738771732, |
|
"learning_rate": 2e-05, |
|
"loss": 0.1245, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.3147353361945637, |
|
"grad_norm": 2.1250650940368487, |
|
"learning_rate": 2e-05, |
|
"loss": 0.1133, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.34334763948497854, |
|
"grad_norm": 1.770036809289755, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0945, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.3719599427753934, |
|
"grad_norm": 1.7938876086769961, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0866, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.4005722460658083, |
|
"grad_norm": 1.4764321868179242, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0827, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.4291845493562232, |
|
"grad_norm": 1.9128984486999376, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0798, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.45779685264663805, |
|
"grad_norm": 1.46483598523261, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0804, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.4864091559370529, |
|
"grad_norm": 1.6523142945768603, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0758, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.5150214592274678, |
|
"grad_norm": 1.5333568395438437, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0709, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.5436337625178826, |
|
"grad_norm": 1.5031573664657636, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0687, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.5722460658082976, |
|
"grad_norm": 1.4132321325139292, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0574, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.6008583690987125, |
|
"grad_norm": 1.3745064149711035, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0572, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.6294706723891274, |
|
"grad_norm": 1.3470725333478024, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0611, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.6580829756795422, |
|
"grad_norm": 1.1315562008981583, |
|
"learning_rate": 2e-05, |
|
"loss": 0.05, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.6866952789699571, |
|
"grad_norm": 1.2282177741629088, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0525, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.7153075822603719, |
|
"grad_norm": 1.3933198446492454, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0519, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.7439198855507868, |
|
"grad_norm": 1.7829406158054193, |
|
"learning_rate": 2e-05, |
|
"loss": 0.051, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.7725321888412017, |
|
"grad_norm": 1.1948798936363785, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0514, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.8011444921316166, |
|
"grad_norm": 1.0816375150345683, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0512, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.8297567954220315, |
|
"grad_norm": 1.469354846951377, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0465, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.8583690987124464, |
|
"grad_norm": 1.2522970466753844, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0493, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.8869814020028612, |
|
"grad_norm": 1.162286189716735, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0474, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.9155937052932761, |
|
"grad_norm": 1.0718851830181713, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0415, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.944206008583691, |
|
"grad_norm": 1.0733174430217316, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0528, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.9728183118741058, |
|
"grad_norm": 1.0644789712973826, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0456, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.0014306151645207, |
|
"grad_norm": 1.1496891631410193, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0432, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.0300429184549356, |
|
"grad_norm": 1.0200369998966563, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0241, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.0586552217453504, |
|
"grad_norm": 1.3931472581826994, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0225, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 1.0872675250357653, |
|
"grad_norm": 1.0114502707229882, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0239, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.1158798283261802, |
|
"grad_norm": 0.7510204564859241, |
|
"learning_rate": 2e-05, |
|
"loss": 0.027, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 1.144492131616595, |
|
"grad_norm": 0.7295063245482636, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0235, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.17310443490701, |
|
"grad_norm": 0.856671906186716, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0247, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 1.201716738197425, |
|
"grad_norm": 0.7255694844652782, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0258, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.2303290414878398, |
|
"grad_norm": 0.9184490713297652, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0276, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 1.2589413447782547, |
|
"grad_norm": 0.818205379161712, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0219, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.2875536480686696, |
|
"grad_norm": 0.6654369429209799, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0261, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 1.3161659513590844, |
|
"grad_norm": 0.5829250860946364, |
|
"learning_rate": 2e-05, |
|
"loss": 0.017, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.3447782546494993, |
|
"grad_norm": 0.6661015288674467, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0228, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 1.3733905579399142, |
|
"grad_norm": 0.7177595155125043, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0178, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.402002861230329, |
|
"grad_norm": 0.8848450057764279, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0147, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 1.4306151645207439, |
|
"grad_norm": 0.6571566830429797, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0216, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.4592274678111588, |
|
"grad_norm": 0.812436234834659, |
|
"learning_rate": 2e-05, |
|
"loss": 0.023, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 1.4878397711015736, |
|
"grad_norm": 0.7840598860015469, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0209, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.5164520743919887, |
|
"grad_norm": 0.7844249253805873, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0205, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 1.5450643776824036, |
|
"grad_norm": 0.7640044613122257, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0238, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.5736766809728184, |
|
"grad_norm": 1.0261564863265702, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0271, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 1.6022889842632333, |
|
"grad_norm": 0.6603554019675723, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0224, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.6309012875536482, |
|
"grad_norm": 0.6112434008888443, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0201, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 1.659513590844063, |
|
"grad_norm": 0.6941562759172227, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0209, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.688125894134478, |
|
"grad_norm": 0.920122539331784, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0224, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 1.7167381974248928, |
|
"grad_norm": 0.784663743006703, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0204, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.7453505007153076, |
|
"grad_norm": 0.5371735092585386, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0175, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 1.7739628040057225, |
|
"grad_norm": 0.4569754495971157, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0191, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.8025751072961373, |
|
"grad_norm": 0.5809346149070659, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0163, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 1.8311874105865522, |
|
"grad_norm": 0.9416876917379606, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0186, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.859799713876967, |
|
"grad_norm": 0.9128407546360238, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0177, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 1.888412017167382, |
|
"grad_norm": 0.6404265787090032, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0142, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.9170243204577968, |
|
"grad_norm": 0.8659777729110113, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0188, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 1.9456366237482117, |
|
"grad_norm": 0.7389926990941214, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0205, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.9742489270386265, |
|
"grad_norm": 0.5010753341589564, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0173, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 2.0028612303290414, |
|
"grad_norm": 0.44585995494505515, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0176, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.0314735336194563, |
|
"grad_norm": 0.3372647662374521, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0114, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 2.060085836909871, |
|
"grad_norm": 0.5724492338755197, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0144, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 2.088698140200286, |
|
"grad_norm": 0.5187760378661088, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0137, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 2.117310443490701, |
|
"grad_norm": 0.5106154674345083, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0116, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 2.1459227467811157, |
|
"grad_norm": 0.5354245824814097, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0122, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 2.1745350500715306, |
|
"grad_norm": 0.5465913593959528, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0135, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 2.2031473533619454, |
|
"grad_norm": 0.7553346807762494, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0139, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 2.2317596566523603, |
|
"grad_norm": 0.47986647806349203, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0125, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 2.260371959942775, |
|
"grad_norm": 0.6926460223156353, |
|
"learning_rate": 2e-05, |
|
"loss": 0.012, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 2.28898426323319, |
|
"grad_norm": 0.7721328467064986, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0104, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.317596566523605, |
|
"grad_norm": 0.3938351782885143, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0094, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 2.34620886981402, |
|
"grad_norm": 0.7956446454256582, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0127, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 2.374821173104435, |
|
"grad_norm": 0.7163390976077646, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0117, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 2.40343347639485, |
|
"grad_norm": 0.40417151839328475, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0116, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 2.432045779685265, |
|
"grad_norm": 0.7406033829214401, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0115, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 2.4606580829756797, |
|
"grad_norm": 1.0372948520488305, |
|
"learning_rate": 2e-05, |
|
"loss": 0.013, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 2.4892703862660945, |
|
"grad_norm": 0.48303247551117084, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0121, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 2.5178826895565094, |
|
"grad_norm": 0.822531770752665, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0091, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 2.5464949928469243, |
|
"grad_norm": 0.5751055353850153, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0117, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 2.575107296137339, |
|
"grad_norm": 0.8111046011909318, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0138, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 2.603719599427754, |
|
"grad_norm": 0.5529988693729204, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0137, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 2.632331902718169, |
|
"grad_norm": 0.6426046706622803, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0128, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 2.6609442060085837, |
|
"grad_norm": 0.5842059243112792, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0097, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 2.6895565092989986, |
|
"grad_norm": 0.9462035289351468, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0111, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 2.7181688125894135, |
|
"grad_norm": 0.47730280851213186, |
|
"learning_rate": 2e-05, |
|
"loss": 0.01, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 2.7467811158798283, |
|
"grad_norm": 0.7829145546901836, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0147, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 2.775393419170243, |
|
"grad_norm": 0.41532702346006606, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0106, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 2.804005722460658, |
|
"grad_norm": 0.42916154288878555, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0135, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 2.832618025751073, |
|
"grad_norm": 0.5117471137135019, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0095, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 2.8612303290414878, |
|
"grad_norm": 0.42607884594383777, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0105, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.8898426323319026, |
|
"grad_norm": 0.5078330866142711, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0111, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 2.9184549356223175, |
|
"grad_norm": 0.872560256021644, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0096, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 2.9470672389127324, |
|
"grad_norm": 0.5473131169980906, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0119, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 2.9756795422031472, |
|
"grad_norm": 0.33829761962434335, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0094, |
|
"step": 520 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 522, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 10457554665472.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|