|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9990610328638497, |
|
"eval_steps": 50, |
|
"global_step": 266, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.03755868544600939, |
|
"grad_norm": 238.26105857074086, |
|
"learning_rate": 3.7037037037037037e-06, |
|
"loss": 2.184, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.07511737089201878, |
|
"grad_norm": 11.851124100947331, |
|
"learning_rate": 7.4074074074074075e-06, |
|
"loss": 0.9583, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.11267605633802817, |
|
"grad_norm": 13.23709111000931, |
|
"learning_rate": 9.996112860009689e-06, |
|
"loss": 0.6666, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.15023474178403756, |
|
"grad_norm": 10.595263707766286, |
|
"learning_rate": 9.9271761563539e-06, |
|
"loss": 0.6073, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.18779342723004694, |
|
"grad_norm": 10.974358931394494, |
|
"learning_rate": 9.773228160797187e-06, |
|
"loss": 0.4888, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.18779342723004694, |
|
"eval_loss": 0.40262532234191895, |
|
"eval_runtime": 84.8046, |
|
"eval_samples_per_second": 11.155, |
|
"eval_steps_per_second": 1.403, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.22535211267605634, |
|
"grad_norm": 6.999547671281271, |
|
"learning_rate": 9.536925023144742e-06, |
|
"loss": 0.332, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.26291079812206575, |
|
"grad_norm": 7.934709674256903, |
|
"learning_rate": 9.222343811959694e-06, |
|
"loss": 0.3059, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.3004694835680751, |
|
"grad_norm": 4.216247268345636, |
|
"learning_rate": 8.834912170647102e-06, |
|
"loss": 0.2967, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.3380281690140845, |
|
"grad_norm": 3.8491049580742125, |
|
"learning_rate": 8.38131467132416e-06, |
|
"loss": 0.3126, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.3755868544600939, |
|
"grad_norm": 4.86769045072678, |
|
"learning_rate": 7.869377482205042e-06, |
|
"loss": 0.2158, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.3755868544600939, |
|
"eval_loss": 0.2777794301509857, |
|
"eval_runtime": 84.5196, |
|
"eval_samples_per_second": 11.193, |
|
"eval_steps_per_second": 1.408, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.4131455399061033, |
|
"grad_norm": 4.811792495045984, |
|
"learning_rate": 7.307933338397667e-06, |
|
"loss": 0.2894, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.4507042253521127, |
|
"grad_norm": 4.159739012234102, |
|
"learning_rate": 6.706669145845863e-06, |
|
"loss": 0.2692, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.48826291079812206, |
|
"grad_norm": 4.52034631499569, |
|
"learning_rate": 6.075958847790262e-06, |
|
"loss": 0.2533, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.5258215962441315, |
|
"grad_norm": 3.976938645835112, |
|
"learning_rate": 5.426684437395196e-06, |
|
"loss": 0.2145, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.5633802816901409, |
|
"grad_norm": 5.272788226812684, |
|
"learning_rate": 4.770048204709648e-06, |
|
"loss": 0.2311, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.5633802816901409, |
|
"eval_loss": 0.21654193103313446, |
|
"eval_runtime": 84.4941, |
|
"eval_samples_per_second": 11.196, |
|
"eval_steps_per_second": 1.408, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.6009389671361502, |
|
"grad_norm": 4.509328497992535, |
|
"learning_rate": 4.1173794573691e-06, |
|
"loss": 0.2118, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.6384976525821596, |
|
"grad_norm": 3.367119100278668, |
|
"learning_rate": 3.479939049792817e-06, |
|
"loss": 0.2152, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.676056338028169, |
|
"grad_norm": 2.4731294441327822, |
|
"learning_rate": 2.8687250934422774e-06, |
|
"loss": 0.2025, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.7136150234741784, |
|
"grad_norm": 3.5116971601898213, |
|
"learning_rate": 2.2942832003289823e-06, |
|
"loss": 0.1917, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.7511737089201878, |
|
"grad_norm": 2.8679194958187146, |
|
"learning_rate": 1.7665245337452368e-06, |
|
"loss": 0.1806, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.7511737089201878, |
|
"eval_loss": 0.18289822340011597, |
|
"eval_runtime": 84.4823, |
|
"eval_samples_per_second": 11.198, |
|
"eval_steps_per_second": 1.409, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.7887323943661971, |
|
"grad_norm": 8.060687194002183, |
|
"learning_rate": 1.2945548054891322e-06, |
|
"loss": 0.1808, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.8262910798122066, |
|
"grad_norm": 2.8917862492734168, |
|
"learning_rate": 8.865171699890835e-07, |
|
"loss": 0.1439, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.863849765258216, |
|
"grad_norm": 2.7979916522519823, |
|
"learning_rate": 5.494517259623478e-07, |
|
"loss": 0.1977, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.9014084507042254, |
|
"grad_norm": 3.1531141963354, |
|
"learning_rate": 2.8917404970305096e-07, |
|
"loss": 0.1996, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.9389671361502347, |
|
"grad_norm": 4.235066375032124, |
|
"learning_rate": 1.1017485573197151e-07, |
|
"loss": 0.183, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.9389671361502347, |
|
"eval_loss": 0.1692400723695755, |
|
"eval_runtime": 84.3942, |
|
"eval_samples_per_second": 11.209, |
|
"eval_steps_per_second": 1.41, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.9765258215962441, |
|
"grad_norm": 3.024511496452823, |
|
"learning_rate": 1.554251601833201e-08, |
|
"loss": 0.154, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.9990610328638497, |
|
"step": 266, |
|
"total_flos": 47252837498880.0, |
|
"train_loss": 0.36927685298417745, |
|
"train_runtime": 7961.1782, |
|
"train_samples_per_second": 1.069, |
|
"train_steps_per_second": 0.033 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 266, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 47252837498880.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|