|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.9955423476968797, |
|
"eval_steps": 500, |
|
"global_step": 756, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0396235760277365, |
|
"grad_norm": 27.41085149828123, |
|
"learning_rate": 5e-06, |
|
"loss": 1.0725, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.079247152055473, |
|
"grad_norm": 21.426346851591315, |
|
"learning_rate": 5e-06, |
|
"loss": 0.9867, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.1188707280832095, |
|
"grad_norm": 0.8214529838984242, |
|
"learning_rate": 5e-06, |
|
"loss": 0.9489, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.158494304110946, |
|
"grad_norm": 0.8934541351618693, |
|
"learning_rate": 5e-06, |
|
"loss": 0.9168, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.1981178801386825, |
|
"grad_norm": 0.6396922311734878, |
|
"learning_rate": 5e-06, |
|
"loss": 0.903, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.237741456166419, |
|
"grad_norm": 0.8318834132473311, |
|
"learning_rate": 5e-06, |
|
"loss": 0.884, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.27736503219415554, |
|
"grad_norm": 0.6283992782157446, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8732, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.316988608221892, |
|
"grad_norm": 0.6596849388425453, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8701, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.35661218424962854, |
|
"grad_norm": 0.6758187675410737, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8641, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.396235760277365, |
|
"grad_norm": 1.0513265622155463, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8597, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.43585933630510154, |
|
"grad_norm": 0.5790780273693242, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8538, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.475482912332838, |
|
"grad_norm": 0.5821582037832116, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8493, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.5151064883605745, |
|
"grad_norm": 0.6354797019513801, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8491, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.5547300643883111, |
|
"grad_norm": 0.74047379799972, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8519, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.5943536404160475, |
|
"grad_norm": 0.6926741472374109, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8388, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.633977216443784, |
|
"grad_norm": 0.5805929955993835, |
|
"learning_rate": 5e-06, |
|
"loss": 0.842, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.6736007924715206, |
|
"grad_norm": 0.6028836834161021, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8381, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.7132243684992571, |
|
"grad_norm": 0.5654340386019846, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8341, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.7528479445269936, |
|
"grad_norm": 0.5723158352099399, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8375, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.79247152055473, |
|
"grad_norm": 0.5198688688930553, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8381, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.8320950965824666, |
|
"grad_norm": 0.6050739761187648, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8342, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.8717186726102031, |
|
"grad_norm": 0.5397219218505803, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8308, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.9113422486379396, |
|
"grad_norm": 0.5613324111744644, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8302, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.950965824665676, |
|
"grad_norm": 0.5372566937252206, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8272, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.9905894006934126, |
|
"grad_norm": 0.4784741594645362, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8269, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.9985141158989599, |
|
"eval_loss": 0.8291837573051453, |
|
"eval_runtime": 271.0608, |
|
"eval_samples_per_second": 25.083, |
|
"eval_steps_per_second": 0.395, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 1.030212976721149, |
|
"grad_norm": 0.87740197973562, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8491, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.0698365527488856, |
|
"grad_norm": 0.5678119260366427, |
|
"learning_rate": 5e-06, |
|
"loss": 0.785, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.1094601287766221, |
|
"grad_norm": 0.6337004334606812, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7887, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.1490837048043585, |
|
"grad_norm": 0.6986139436792874, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7862, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.188707280832095, |
|
"grad_norm": 0.5874593683473379, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7793, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.2283308568598317, |
|
"grad_norm": 0.7975693973340302, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7795, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.267954432887568, |
|
"grad_norm": 0.5846939921280547, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7798, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.3075780089153046, |
|
"grad_norm": 0.7654979422005525, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7832, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.3472015849430412, |
|
"grad_norm": 0.6496190339072363, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7835, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.3868251609707776, |
|
"grad_norm": 0.7019021685342436, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7841, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.4264487369985142, |
|
"grad_norm": 0.697904784471384, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7809, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.4660723130262507, |
|
"grad_norm": 0.6140987998837893, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7839, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.505695889053987, |
|
"grad_norm": 0.6176502513567036, |
|
"learning_rate": 5e-06, |
|
"loss": 0.78, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.5453194650817235, |
|
"grad_norm": 0.6195463728584585, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7812, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.5849430411094603, |
|
"grad_norm": 0.5860977422231922, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7855, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.6245666171371966, |
|
"grad_norm": 0.6058978073304615, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7826, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.664190193164933, |
|
"grad_norm": 0.5726428056331896, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7767, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.7038137691926698, |
|
"grad_norm": 0.6255990916494661, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7776, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.7434373452204062, |
|
"grad_norm": 0.621833549178633, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7786, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.7830609212481425, |
|
"grad_norm": 0.5435896897432534, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7775, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.8226844972758791, |
|
"grad_norm": 0.7495004498554898, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7759, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.8623080733036157, |
|
"grad_norm": 0.5987267949701414, |
|
"learning_rate": 5e-06, |
|
"loss": 0.778, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.901931649331352, |
|
"grad_norm": 0.6388662201965061, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7728, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.9415552253590886, |
|
"grad_norm": 0.5711439995135582, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7817, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.9811788013868252, |
|
"grad_norm": 0.617975937550618, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7778, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.9970282317979198, |
|
"eval_loss": 0.8146935701370239, |
|
"eval_runtime": 271.9619, |
|
"eval_samples_per_second": 25.0, |
|
"eval_steps_per_second": 0.393, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 2.0208023774145616, |
|
"grad_norm": 0.6930015338867856, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8016, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 2.060425953442298, |
|
"grad_norm": 0.6358377946223995, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7316, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 2.1000495294700348, |
|
"grad_norm": 0.8278609254484581, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7265, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 2.139673105497771, |
|
"grad_norm": 0.6962674479421791, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7311, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 2.1792966815255075, |
|
"grad_norm": 0.6727207101976576, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7294, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 2.2189202575532443, |
|
"grad_norm": 0.5817825946787325, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7307, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 2.2585438335809807, |
|
"grad_norm": 0.6436375631599169, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7325, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 2.298167409608717, |
|
"grad_norm": 0.9292364267574976, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7352, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 2.337790985636454, |
|
"grad_norm": 0.9489749212265489, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7317, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 2.37741456166419, |
|
"grad_norm": 0.7240317017944226, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7274, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 2.4170381376919265, |
|
"grad_norm": 0.6501433881507601, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7301, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 2.4566617137196634, |
|
"grad_norm": 0.8099751855974757, |
|
"learning_rate": 5e-06, |
|
"loss": 0.734, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 2.4962852897473997, |
|
"grad_norm": 0.6496005537182832, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7361, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 2.535908865775136, |
|
"grad_norm": 0.5493051299286916, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7327, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 2.5755324418028724, |
|
"grad_norm": 0.5890719222816667, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7346, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 2.6151560178306092, |
|
"grad_norm": 0.5341897077649945, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7314, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 2.6547795938583456, |
|
"grad_norm": 0.5921523513579053, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7352, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 2.6944031698860824, |
|
"grad_norm": 0.5890344658716773, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7328, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 2.734026745913819, |
|
"grad_norm": 0.5886927133745706, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7353, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 2.773650321941555, |
|
"grad_norm": 0.7084337230865917, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7359, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 2.8132738979692915, |
|
"grad_norm": 0.6320863625704357, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7289, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 2.8528974739970283, |
|
"grad_norm": 0.6406803421798857, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7321, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 2.8925210500247647, |
|
"grad_norm": 0.5985842168545841, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7363, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 2.9321446260525015, |
|
"grad_norm": 0.5878885552435451, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7317, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 2.971768202080238, |
|
"grad_norm": 0.7738327584141713, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7327, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 2.9955423476968797, |
|
"eval_loss": 0.8128942847251892, |
|
"eval_runtime": 272.1686, |
|
"eval_samples_per_second": 24.981, |
|
"eval_steps_per_second": 0.393, |
|
"step": 756 |
|
}, |
|
{ |
|
"epoch": 2.9955423476968797, |
|
"step": 756, |
|
"total_flos": 1266118777896960.0, |
|
"train_loss": 0.7958845669630343, |
|
"train_runtime": 44860.7812, |
|
"train_samples_per_second": 8.638, |
|
"train_steps_per_second": 0.017 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 756, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1266118777896960.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|