|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.992412746585736, |
|
"eval_steps": 500, |
|
"global_step": 246, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.12139605462822459, |
|
"grad_norm": 1.9068916730195116, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7721, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.24279210925644917, |
|
"grad_norm": 5.44810384092203, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6638, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.36418816388467373, |
|
"grad_norm": 0.8162292861303865, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6289, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.48558421851289835, |
|
"grad_norm": 0.724722400146013, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6062, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.6069802731411229, |
|
"grad_norm": 0.6325755456980601, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5886, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.7283763277693475, |
|
"grad_norm": 0.5102096530669636, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5763, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.849772382397572, |
|
"grad_norm": 0.6134528530146113, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5635, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.9711684370257967, |
|
"grad_norm": 0.6520975040339092, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5578, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.9954476479514416, |
|
"eval_loss": 0.5475569367408752, |
|
"eval_runtime": 31.1849, |
|
"eval_samples_per_second": 71.156, |
|
"eval_steps_per_second": 1.122, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 1.095599393019727, |
|
"grad_norm": 0.9030012716394636, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5739, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.2169954476479514, |
|
"grad_norm": 0.5546978323548724, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5098, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.338391502276176, |
|
"grad_norm": 0.7373972665017838, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5099, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.4597875569044005, |
|
"grad_norm": 0.8335652060900699, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5086, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.5811836115326252, |
|
"grad_norm": 0.9482742766383457, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4988, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.7025796661608497, |
|
"grad_norm": 0.7107692585969188, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4973, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.8239757207890743, |
|
"grad_norm": 0.6956293579831972, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5006, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.945371775417299, |
|
"grad_norm": 0.5537353905530825, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4962, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.9939301972685888, |
|
"eval_loss": 0.5237926244735718, |
|
"eval_runtime": 31.2236, |
|
"eval_samples_per_second": 71.068, |
|
"eval_steps_per_second": 1.121, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 2.069802731411229, |
|
"grad_norm": 0.9676846382246841, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5181, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 2.191198786039454, |
|
"grad_norm": 1.0604432739536909, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4483, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 2.3125948406676784, |
|
"grad_norm": 0.9072874578025836, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4481, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 2.433990895295903, |
|
"grad_norm": 0.93397296136386, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4503, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 2.5553869499241273, |
|
"grad_norm": 0.5608137627707893, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4475, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 2.676783004552352, |
|
"grad_norm": 0.7216376866533744, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4491, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 2.7981790591805766, |
|
"grad_norm": 0.7502162152741092, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4475, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 2.919575113808801, |
|
"grad_norm": 0.6902724753233441, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4505, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 2.992412746585736, |
|
"eval_loss": 0.5186718702316284, |
|
"eval_runtime": 31.6067, |
|
"eval_samples_per_second": 70.207, |
|
"eval_steps_per_second": 1.107, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 2.992412746585736, |
|
"step": 246, |
|
"total_flos": 411849782722560.0, |
|
"train_loss": 0.5282489497487138, |
|
"train_runtime": 6430.2484, |
|
"train_samples_per_second": 19.666, |
|
"train_steps_per_second": 0.038 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 246, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 411849782722560.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|