|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.2987481434330575, |
|
"eval_steps": 32, |
|
"global_step": 176, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.001697432633142372, |
|
"eval_loss": 2.4177019596099854, |
|
"eval_runtime": 13.7299, |
|
"eval_samples_per_second": 18.136, |
|
"eval_steps_per_second": 18.136, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.008487163165711862, |
|
"grad_norm": 0.5580189228057861, |
|
"learning_rate": 5e-06, |
|
"loss": 2.0835, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.016974326331423723, |
|
"grad_norm": 0.5617932081222534, |
|
"learning_rate": 1e-05, |
|
"loss": 2.3638, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.025461489497135583, |
|
"grad_norm": 0.637174129486084, |
|
"learning_rate": 9.99743108100344e-06, |
|
"loss": 2.3443, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.033948652662847446, |
|
"grad_norm": 0.7906777858734131, |
|
"learning_rate": 9.989726963751683e-06, |
|
"loss": 2.4875, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.042435815828559306, |
|
"grad_norm": 0.7220119833946228, |
|
"learning_rate": 9.976895564745993e-06, |
|
"loss": 2.2905, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.050922978994271166, |
|
"grad_norm": 0.4569860100746155, |
|
"learning_rate": 9.95895006911623e-06, |
|
"loss": 2.8207, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.05431784426055591, |
|
"eval_loss": 2.396263599395752, |
|
"eval_runtime": 13.7534, |
|
"eval_samples_per_second": 18.105, |
|
"eval_steps_per_second": 18.105, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.059410142159983026, |
|
"grad_norm": 0.5562223196029663, |
|
"learning_rate": 9.935908917072253e-06, |
|
"loss": 2.3774, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.06789730532569489, |
|
"grad_norm": 0.8851813077926636, |
|
"learning_rate": 9.907795784955327e-06, |
|
"loss": 2.3059, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.07638446849140675, |
|
"grad_norm": 0.8263425827026367, |
|
"learning_rate": 9.874639560909118e-06, |
|
"loss": 2.2858, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.08487163165711861, |
|
"grad_norm": 0.9496198296546936, |
|
"learning_rate": 9.836474315195148e-06, |
|
"loss": 2.2606, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.09335879482283047, |
|
"grad_norm": 0.8389888405799866, |
|
"learning_rate": 9.793339265183303e-06, |
|
"loss": 2.4757, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.10184595798854233, |
|
"grad_norm": 0.9090803861618042, |
|
"learning_rate": 9.745278735053345e-06, |
|
"loss": 2.2428, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.10863568852111181, |
|
"eval_loss": 2.3315982818603516, |
|
"eval_runtime": 13.995, |
|
"eval_samples_per_second": 17.792, |
|
"eval_steps_per_second": 17.792, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.11033312115425419, |
|
"grad_norm": 0.8944710493087769, |
|
"learning_rate": 9.692342110248802e-06, |
|
"loss": 2.361, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.11882028431996605, |
|
"grad_norm": 0.8705277442932129, |
|
"learning_rate": 9.63458378673011e-06, |
|
"loss": 2.2061, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.1273074474856779, |
|
"grad_norm": 1.0183981657028198, |
|
"learning_rate": 9.572063115079063e-06, |
|
"loss": 2.3014, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.13579461065138979, |
|
"grad_norm": 0.9694010615348816, |
|
"learning_rate": 9.504844339512096e-06, |
|
"loss": 2.4273, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.14428177381710164, |
|
"grad_norm": 0.6600094437599182, |
|
"learning_rate": 9.432996531865001e-06, |
|
"loss": 2.2039, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.1527689369828135, |
|
"grad_norm": 1.437016487121582, |
|
"learning_rate": 9.356593520616948e-06, |
|
"loss": 2.4129, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.16125610014852534, |
|
"grad_norm": 1.1358604431152344, |
|
"learning_rate": 9.275713815026732e-06, |
|
"loss": 2.2346, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.16295353278166771, |
|
"eval_loss": 2.2801592350006104, |
|
"eval_runtime": 14.3531, |
|
"eval_samples_per_second": 17.348, |
|
"eval_steps_per_second": 17.348, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.16974326331423722, |
|
"grad_norm": 0.8347494006156921, |
|
"learning_rate": 9.190440524459203e-06, |
|
"loss": 2.5003, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.17823042647994908, |
|
"grad_norm": 0.9528422355651855, |
|
"learning_rate": 9.10086127298478e-06, |
|
"loss": 2.2398, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.18671758964566093, |
|
"grad_norm": 0.7451781630516052, |
|
"learning_rate": 9.007068109339783e-06, |
|
"loss": 2.253, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.1952047528113728, |
|
"grad_norm": 0.6891763210296631, |
|
"learning_rate": 8.90915741234015e-06, |
|
"loss": 2.0703, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.20369191597708466, |
|
"grad_norm": 0.7363041639328003, |
|
"learning_rate": 8.807229791845673e-06, |
|
"loss": 2.3083, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.21217907914279652, |
|
"grad_norm": 0.7747501730918884, |
|
"learning_rate": 8.701389985376578e-06, |
|
"loss": 2.2058, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.21727137704222363, |
|
"eval_loss": 2.245945692062378, |
|
"eval_runtime": 14.3131, |
|
"eval_samples_per_second": 17.397, |
|
"eval_steps_per_second": 17.397, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.22066624230850837, |
|
"grad_norm": 0.8016952276229858, |
|
"learning_rate": 8.591746750488639e-06, |
|
"loss": 2.1023, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.22915340547422025, |
|
"grad_norm": 1.4869773387908936, |
|
"learning_rate": 8.478412753017433e-06, |
|
"loss": 2.2433, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.2376405686399321, |
|
"grad_norm": 1.111100673675537, |
|
"learning_rate": 8.361504451306585e-06, |
|
"loss": 2.0758, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.24612773180564396, |
|
"grad_norm": 0.7958012819290161, |
|
"learning_rate": 8.241141976538944e-06, |
|
"loss": 2.1178, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.2546148949713558, |
|
"grad_norm": 0.6610277891159058, |
|
"learning_rate": 8.117449009293668e-06, |
|
"loss": 2.0289, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.26310205813706766, |
|
"grad_norm": 0.9238812923431396, |
|
"learning_rate": 7.99055265245608e-06, |
|
"loss": 2.3871, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.27158922130277957, |
|
"grad_norm": 0.9540156126022339, |
|
"learning_rate": 7.860583300610849e-06, |
|
"loss": 2.069, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.27158922130277957, |
|
"eval_loss": 2.2213292121887207, |
|
"eval_runtime": 14.5462, |
|
"eval_samples_per_second": 17.118, |
|
"eval_steps_per_second": 17.118, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.2800763844684914, |
|
"grad_norm": 1.1984089612960815, |
|
"learning_rate": 7.727674506052744e-06, |
|
"loss": 1.9745, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.2885635476342033, |
|
"grad_norm": 0.8632123470306396, |
|
"learning_rate": 7.591962841552627e-06, |
|
"loss": 1.9814, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.29705071079991513, |
|
"grad_norm": 0.6884729266166687, |
|
"learning_rate": 7.453587760019691e-06, |
|
"loss": 2.1216, |
|
"step": 175 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 16, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5596099588915200.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|