|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9982668977469671, |
|
"eval_steps": 18, |
|
"global_step": 360, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.04991334488734835, |
|
"grad_norm": 0.4083130955696106, |
|
"learning_rate": 0.002, |
|
"loss": 1.2589, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.0998266897746967, |
|
"grad_norm": 0.3094785809516907, |
|
"learning_rate": 0.0008944271909999159, |
|
"loss": 1.0636, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.14974003466204505, |
|
"grad_norm": 0.37615087628364563, |
|
"learning_rate": 0.0006488856845230502, |
|
"loss": 0.8574, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.1996533795493934, |
|
"grad_norm": 0.2839493453502655, |
|
"learning_rate": 0.0005345224838248488, |
|
"loss": 0.8292, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.24956672443674177, |
|
"grad_norm": 0.2941131889820099, |
|
"learning_rate": 0.00046499055497527714, |
|
"loss": 0.7934, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.2994800693240901, |
|
"grad_norm": 0.3659161627292633, |
|
"learning_rate": 0.0004170288281141495, |
|
"loss": 0.7727, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.3493934142114385, |
|
"grad_norm": 0.3437303304672241, |
|
"learning_rate": 0.00038138503569823694, |
|
"loss": 0.7557, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.3993067590987868, |
|
"grad_norm": 0.2811639904975891, |
|
"learning_rate": 0.00035355339059327376, |
|
"loss": 0.7285, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.44922010398613516, |
|
"grad_norm": 0.35479724407196045, |
|
"learning_rate": 0.00033104235544094716, |
|
"loss": 0.7107, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.49913344887348354, |
|
"grad_norm": 0.3011772036552429, |
|
"learning_rate": 0.0003123475237772121, |
|
"loss": 0.7186, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.5490467937608319, |
|
"grad_norm": 0.29623347520828247, |
|
"learning_rate": 0.00029649972666444046, |
|
"loss": 0.6818, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.5989601386481802, |
|
"grad_norm": 0.3092997074127197, |
|
"learning_rate": 0.000282842712474619, |
|
"loss": 0.6701, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.6488734835355287, |
|
"grad_norm": 0.32858991622924805, |
|
"learning_rate": 0.00027091418459143857, |
|
"loss": 0.6733, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.698786828422877, |
|
"grad_norm": 0.3046702742576599, |
|
"learning_rate": 0.0002603778219616478, |
|
"loss": 0.6523, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.7487001733102253, |
|
"grad_norm": 0.41049444675445557, |
|
"learning_rate": 0.00025098232205526344, |
|
"loss": 0.6473, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.7986135181975736, |
|
"grad_norm": 0.35009312629699707, |
|
"learning_rate": 0.00024253562503633296, |
|
"loss": 0.6309, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.848526863084922, |
|
"grad_norm": 0.3388204276561737, |
|
"learning_rate": 0.0002348880878058814, |
|
"loss": 0.6264, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.8984402079722703, |
|
"grad_norm": 0.28809213638305664, |
|
"learning_rate": 0.0002279211529192759, |
|
"loss": 0.6046, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.9483535528596188, |
|
"grad_norm": 0.3333686292171478, |
|
"learning_rate": 0.0002215395102486845, |
|
"loss": 0.5891, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.9982668977469671, |
|
"grad_norm": 0.3479894697666168, |
|
"learning_rate": 0.00021566554640687683, |
|
"loss": 0.5952, |
|
"step": 360 |
|
} |
|
], |
|
"logging_steps": 18, |
|
"max_steps": 360, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 18, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.656326962122588e+18, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|