|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.6416295353278166, |
|
"eval_steps": 32, |
|
"global_step": 378, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.001697432633142372, |
|
"eval_loss": 2.4177019596099854, |
|
"eval_runtime": 13.7303, |
|
"eval_samples_per_second": 18.135, |
|
"eval_steps_per_second": 18.135, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.016974326331423723, |
|
"grad_norm": 0.5538016557693481, |
|
"learning_rate": 1e-05, |
|
"loss": 2.2236, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.033948652662847446, |
|
"grad_norm": 0.7852675318717957, |
|
"learning_rate": 9.989726963751683e-06, |
|
"loss": 2.4165, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.050922978994271166, |
|
"grad_norm": 0.4371705651283264, |
|
"learning_rate": 9.95895006911623e-06, |
|
"loss": 2.5564, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.05431784426055591, |
|
"eval_loss": 2.3971197605133057, |
|
"eval_runtime": 14.2456, |
|
"eval_samples_per_second": 17.479, |
|
"eval_steps_per_second": 17.479, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.06789730532569489, |
|
"grad_norm": 0.8462035655975342, |
|
"learning_rate": 9.907795784955327e-06, |
|
"loss": 2.3432, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.08487163165711861, |
|
"grad_norm": 0.9292763471603394, |
|
"learning_rate": 9.836474315195148e-06, |
|
"loss": 2.276, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.10184595798854233, |
|
"grad_norm": 0.9029847979545593, |
|
"learning_rate": 9.745278735053345e-06, |
|
"loss": 2.3619, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.10863568852111181, |
|
"eval_loss": 2.334070920944214, |
|
"eval_runtime": 14.5899, |
|
"eval_samples_per_second": 17.067, |
|
"eval_steps_per_second": 17.067, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.11882028431996605, |
|
"grad_norm": 0.857122004032135, |
|
"learning_rate": 9.63458378673011e-06, |
|
"loss": 2.2851, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.13579461065138979, |
|
"grad_norm": 0.931951642036438, |
|
"learning_rate": 9.504844339512096e-06, |
|
"loss": 2.3657, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.1527689369828135, |
|
"grad_norm": 1.322995662689209, |
|
"learning_rate": 9.356593520616948e-06, |
|
"loss": 2.3112, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.16295353278166771, |
|
"eval_loss": 2.282419443130493, |
|
"eval_runtime": 14.7626, |
|
"eval_samples_per_second": 16.867, |
|
"eval_steps_per_second": 16.867, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.16974326331423722, |
|
"grad_norm": 0.8094339966773987, |
|
"learning_rate": 9.190440524459203e-06, |
|
"loss": 2.3695, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.18671758964566093, |
|
"grad_norm": 0.723338782787323, |
|
"learning_rate": 9.007068109339783e-06, |
|
"loss": 2.2478, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.20369191597708466, |
|
"grad_norm": 0.7135640978813171, |
|
"learning_rate": 8.807229791845673e-06, |
|
"loss": 2.1921, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.21727137704222363, |
|
"eval_loss": 2.2477967739105225, |
|
"eval_runtime": 14.6761, |
|
"eval_samples_per_second": 16.966, |
|
"eval_steps_per_second": 16.966, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.22066624230850837, |
|
"grad_norm": 0.772994339466095, |
|
"learning_rate": 8.591746750488639e-06, |
|
"loss": 2.1553, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.2376405686399321, |
|
"grad_norm": 1.070949673652649, |
|
"learning_rate": 8.361504451306585e-06, |
|
"loss": 2.1614, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.2546148949713558, |
|
"grad_norm": 0.636782705783844, |
|
"learning_rate": 8.117449009293668e-06, |
|
"loss": 2.0752, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.27158922130277957, |
|
"grad_norm": 0.9331526756286621, |
|
"learning_rate": 7.860583300610849e-06, |
|
"loss": 2.2286, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.27158922130277957, |
|
"eval_loss": 2.2229602336883545, |
|
"eval_runtime": 14.5584, |
|
"eval_samples_per_second": 17.104, |
|
"eval_steps_per_second": 17.104, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.2885635476342033, |
|
"grad_norm": 0.8503024578094482, |
|
"learning_rate": 7.591962841552627e-06, |
|
"loss": 1.9789, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.305537873965627, |
|
"grad_norm": 0.9481040239334106, |
|
"learning_rate": 7.312691451204178e-06, |
|
"loss": 2.1179, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.3225122002970507, |
|
"grad_norm": 0.854963481426239, |
|
"learning_rate": 7.023916715611969e-06, |
|
"loss": 2.297, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.32590706556333543, |
|
"eval_loss": 2.204796552658081, |
|
"eval_runtime": 14.6745, |
|
"eval_samples_per_second": 16.968, |
|
"eval_steps_per_second": 16.968, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.33948652662847445, |
|
"grad_norm": 0.70894455909729, |
|
"learning_rate": 6.726825272106539e-06, |
|
"loss": 2.1591, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.35646085295989816, |
|
"grad_norm": 0.7739590406417847, |
|
"learning_rate": 6.4226379331551625e-06, |
|
"loss": 2.1628, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.37343517929132186, |
|
"grad_norm": 1.1449049711227417, |
|
"learning_rate": 6.112604669781572e-06, |
|
"loss": 2.196, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.38022490982389134, |
|
"eval_loss": 2.1902246475219727, |
|
"eval_runtime": 14.9629, |
|
"eval_samples_per_second": 16.641, |
|
"eval_steps_per_second": 16.641, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.3904095056227456, |
|
"grad_norm": 0.9429897665977478, |
|
"learning_rate": 5.797999475166897e-06, |
|
"loss": 2.1209, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.40738383195416933, |
|
"grad_norm": 0.6817293167114258, |
|
"learning_rate": 5.480115129538409e-06, |
|
"loss": 2.078, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.42435815828559303, |
|
"grad_norm": 1.0392802953720093, |
|
"learning_rate": 5.160257887858278e-06, |
|
"loss": 2.019, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.43454275408444726, |
|
"eval_loss": 2.18066143989563, |
|
"eval_runtime": 14.9609, |
|
"eval_samples_per_second": 16.643, |
|
"eval_steps_per_second": 16.643, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.44133248461701674, |
|
"grad_norm": 1.4284169673919678, |
|
"learning_rate": 4.839742112141725e-06, |
|
"loss": 2.1248, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.4583068109484405, |
|
"grad_norm": 1.103446364402771, |
|
"learning_rate": 4.5198848704615915e-06, |
|
"loss": 2.2042, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.4752811372798642, |
|
"grad_norm": 0.9654069542884827, |
|
"learning_rate": 4.2020005248331056e-06, |
|
"loss": 2.0661, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.48886059834500317, |
|
"eval_loss": 2.1737821102142334, |
|
"eval_runtime": 14.6586, |
|
"eval_samples_per_second": 16.987, |
|
"eval_steps_per_second": 16.987, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.4922554636112879, |
|
"grad_norm": 1.2110482454299927, |
|
"learning_rate": 3.887395330218429e-06, |
|
"loss": 2.1245, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.5092297899427116, |
|
"grad_norm": 0.9131014943122864, |
|
"learning_rate": 3.5773620668448384e-06, |
|
"loss": 1.9854, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.5262041162741353, |
|
"grad_norm": 0.8336049914360046, |
|
"learning_rate": 3.273174727893463e-06, |
|
"loss": 2.0522, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.5431784426055591, |
|
"grad_norm": 0.8388031721115112, |
|
"learning_rate": 2.976083284388031e-06, |
|
"loss": 2.2818, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.5431784426055591, |
|
"eval_loss": 2.169279098510742, |
|
"eval_runtime": 14.6962, |
|
"eval_samples_per_second": 16.943, |
|
"eval_steps_per_second": 16.943, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.5601527689369828, |
|
"grad_norm": 1.1344929933547974, |
|
"learning_rate": 2.687308548795825e-06, |
|
"loss": 2.153, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.5771270952684066, |
|
"grad_norm": 0.5925095081329346, |
|
"learning_rate": 2.408037158447375e-06, |
|
"loss": 2.1837, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.5941014215998303, |
|
"grad_norm": 0.7718944549560547, |
|
"learning_rate": 2.139416699389153e-06, |
|
"loss": 2.1624, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.597496286866115, |
|
"eval_loss": 2.1667089462280273, |
|
"eval_runtime": 14.721, |
|
"eval_samples_per_second": 16.915, |
|
"eval_steps_per_second": 16.915, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.611075747931254, |
|
"grad_norm": 0.9826621413230896, |
|
"learning_rate": 1.8825509907063328e-06, |
|
"loss": 2.1559, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.6280500742626777, |
|
"grad_norm": 1.0323383808135986, |
|
"learning_rate": 1.6384955486934157e-06, |
|
"loss": 2.0834, |
|
"step": 370 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 63, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.20029415211008e+16, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|