{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.6416295353278166, "eval_steps": 32, "global_step": 378, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001697432633142372, "eval_loss": 2.4177019596099854, "eval_runtime": 13.7303, "eval_samples_per_second": 18.135, "eval_steps_per_second": 18.135, "step": 1 }, { "epoch": 0.016974326331423723, "grad_norm": 0.5538016557693481, "learning_rate": 1e-05, "loss": 2.2236, "step": 10 }, { "epoch": 0.033948652662847446, "grad_norm": 0.7852675318717957, "learning_rate": 9.989726963751683e-06, "loss": 2.4165, "step": 20 }, { "epoch": 0.050922978994271166, "grad_norm": 0.4371705651283264, "learning_rate": 9.95895006911623e-06, "loss": 2.5564, "step": 30 }, { "epoch": 0.05431784426055591, "eval_loss": 2.3971197605133057, "eval_runtime": 14.2456, "eval_samples_per_second": 17.479, "eval_steps_per_second": 17.479, "step": 32 }, { "epoch": 0.06789730532569489, "grad_norm": 0.8462035655975342, "learning_rate": 9.907795784955327e-06, "loss": 2.3432, "step": 40 }, { "epoch": 0.08487163165711861, "grad_norm": 0.9292763471603394, "learning_rate": 9.836474315195148e-06, "loss": 2.276, "step": 50 }, { "epoch": 0.10184595798854233, "grad_norm": 0.9029847979545593, "learning_rate": 9.745278735053345e-06, "loss": 2.3619, "step": 60 }, { "epoch": 0.10863568852111181, "eval_loss": 2.334070920944214, "eval_runtime": 14.5899, "eval_samples_per_second": 17.067, "eval_steps_per_second": 17.067, "step": 64 }, { "epoch": 0.11882028431996605, "grad_norm": 0.857122004032135, "learning_rate": 9.63458378673011e-06, "loss": 2.2851, "step": 70 }, { "epoch": 0.13579461065138979, "grad_norm": 0.931951642036438, "learning_rate": 9.504844339512096e-06, "loss": 2.3657, "step": 80 }, { "epoch": 0.1527689369828135, "grad_norm": 1.322995662689209, "learning_rate": 9.356593520616948e-06, "loss": 2.3112, "step": 90 }, { "epoch": 0.16295353278166771, "eval_loss": 2.282419443130493, "eval_runtime": 14.7626, "eval_samples_per_second": 16.867, "eval_steps_per_second": 16.867, "step": 96 }, { "epoch": 0.16974326331423722, "grad_norm": 0.8094339966773987, "learning_rate": 9.190440524459203e-06, "loss": 2.3695, "step": 100 }, { "epoch": 0.18671758964566093, "grad_norm": 0.723338782787323, "learning_rate": 9.007068109339783e-06, "loss": 2.2478, "step": 110 }, { "epoch": 0.20369191597708466, "grad_norm": 0.7135640978813171, "learning_rate": 8.807229791845673e-06, "loss": 2.1921, "step": 120 }, { "epoch": 0.21727137704222363, "eval_loss": 2.2477967739105225, "eval_runtime": 14.6761, "eval_samples_per_second": 16.966, "eval_steps_per_second": 16.966, "step": 128 }, { "epoch": 0.22066624230850837, "grad_norm": 0.772994339466095, "learning_rate": 8.591746750488639e-06, "loss": 2.1553, "step": 130 }, { "epoch": 0.2376405686399321, "grad_norm": 1.070949673652649, "learning_rate": 8.361504451306585e-06, "loss": 2.1614, "step": 140 }, { "epoch": 0.2546148949713558, "grad_norm": 0.636782705783844, "learning_rate": 8.117449009293668e-06, "loss": 2.0752, "step": 150 }, { "epoch": 0.27158922130277957, "grad_norm": 0.9331526756286621, "learning_rate": 7.860583300610849e-06, "loss": 2.2286, "step": 160 }, { "epoch": 0.27158922130277957, "eval_loss": 2.2229602336883545, "eval_runtime": 14.5584, "eval_samples_per_second": 17.104, "eval_steps_per_second": 17.104, "step": 160 }, { "epoch": 0.2885635476342033, "grad_norm": 0.8503024578094482, "learning_rate": 7.591962841552627e-06, "loss": 1.9789, "step": 170 }, { "epoch": 0.305537873965627, "grad_norm": 0.9481040239334106, "learning_rate": 7.312691451204178e-06, "loss": 2.1179, "step": 180 }, { "epoch": 0.3225122002970507, "grad_norm": 0.854963481426239, "learning_rate": 7.023916715611969e-06, "loss": 2.297, "step": 190 }, { "epoch": 0.32590706556333543, "eval_loss": 2.204796552658081, "eval_runtime": 14.6745, "eval_samples_per_second": 16.968, "eval_steps_per_second": 16.968, "step": 192 }, { "epoch": 0.33948652662847445, "grad_norm": 0.70894455909729, "learning_rate": 6.726825272106539e-06, "loss": 2.1591, "step": 200 }, { "epoch": 0.35646085295989816, "grad_norm": 0.7739590406417847, "learning_rate": 6.4226379331551625e-06, "loss": 2.1628, "step": 210 }, { "epoch": 0.37343517929132186, "grad_norm": 1.1449049711227417, "learning_rate": 6.112604669781572e-06, "loss": 2.196, "step": 220 }, { "epoch": 0.38022490982389134, "eval_loss": 2.1902246475219727, "eval_runtime": 14.9629, "eval_samples_per_second": 16.641, "eval_steps_per_second": 16.641, "step": 224 }, { "epoch": 0.3904095056227456, "grad_norm": 0.9429897665977478, "learning_rate": 5.797999475166897e-06, "loss": 2.1209, "step": 230 }, { "epoch": 0.40738383195416933, "grad_norm": 0.6817293167114258, "learning_rate": 5.480115129538409e-06, "loss": 2.078, "step": 240 }, { "epoch": 0.42435815828559303, "grad_norm": 1.0392802953720093, "learning_rate": 5.160257887858278e-06, "loss": 2.019, "step": 250 }, { "epoch": 0.43454275408444726, "eval_loss": 2.18066143989563, "eval_runtime": 14.9609, "eval_samples_per_second": 16.643, "eval_steps_per_second": 16.643, "step": 256 }, { "epoch": 0.44133248461701674, "grad_norm": 1.4284169673919678, "learning_rate": 4.839742112141725e-06, "loss": 2.1248, "step": 260 }, { "epoch": 0.4583068109484405, "grad_norm": 1.103446364402771, "learning_rate": 4.5198848704615915e-06, "loss": 2.2042, "step": 270 }, { "epoch": 0.4752811372798642, "grad_norm": 0.9654069542884827, "learning_rate": 4.2020005248331056e-06, "loss": 2.0661, "step": 280 }, { "epoch": 0.48886059834500317, "eval_loss": 2.1737821102142334, "eval_runtime": 14.6586, "eval_samples_per_second": 16.987, "eval_steps_per_second": 16.987, "step": 288 }, { "epoch": 0.4922554636112879, "grad_norm": 1.2110482454299927, "learning_rate": 3.887395330218429e-06, "loss": 2.1245, "step": 290 }, { "epoch": 0.5092297899427116, "grad_norm": 0.9131014943122864, "learning_rate": 3.5773620668448384e-06, "loss": 1.9854, "step": 300 }, { "epoch": 0.5262041162741353, "grad_norm": 0.8336049914360046, "learning_rate": 3.273174727893463e-06, "loss": 2.0522, "step": 310 }, { "epoch": 0.5431784426055591, "grad_norm": 0.8388031721115112, "learning_rate": 2.976083284388031e-06, "loss": 2.2818, "step": 320 }, { "epoch": 0.5431784426055591, "eval_loss": 2.169279098510742, "eval_runtime": 14.6962, "eval_samples_per_second": 16.943, "eval_steps_per_second": 16.943, "step": 320 }, { "epoch": 0.5601527689369828, "grad_norm": 1.1344929933547974, "learning_rate": 2.687308548795825e-06, "loss": 2.153, "step": 330 }, { "epoch": 0.5771270952684066, "grad_norm": 0.5925095081329346, "learning_rate": 2.408037158447375e-06, "loss": 2.1837, "step": 340 }, { "epoch": 0.5941014215998303, "grad_norm": 0.7718944549560547, "learning_rate": 2.139416699389153e-06, "loss": 2.1624, "step": 350 }, { "epoch": 0.597496286866115, "eval_loss": 2.1667089462280273, "eval_runtime": 14.721, "eval_samples_per_second": 16.915, "eval_steps_per_second": 16.915, "step": 352 }, { "epoch": 0.611075747931254, "grad_norm": 0.9826621413230896, "learning_rate": 1.8825509907063328e-06, "loss": 2.1559, "step": 360 }, { "epoch": 0.6280500742626777, "grad_norm": 1.0323383808135986, "learning_rate": 1.6384955486934157e-06, "loss": 2.0834, "step": 370 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 63, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.20029415211008e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }