|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.0, |
|
"eval_steps": 500, |
|
"global_step": 32, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0625, |
|
"grad_norm": 0.03922202742278759, |
|
"learning_rate": 1.25e-05, |
|
"loss": 0.1908, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.125, |
|
"grad_norm": 0.03600727500414045, |
|
"learning_rate": 2.5e-05, |
|
"loss": 0.1872, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.1875, |
|
"grad_norm": 0.041458677460302705, |
|
"learning_rate": 3.7500000000000003e-05, |
|
"loss": 0.1789, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.042477239921338006, |
|
"learning_rate": 5e-05, |
|
"loss": 0.191, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.3125, |
|
"grad_norm": 0.04699963432811002, |
|
"learning_rate": 4.984280524733107e-05, |
|
"loss": 0.1832, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.375, |
|
"grad_norm": 0.04547329936830524, |
|
"learning_rate": 4.937319780454559e-05, |
|
"loss": 0.1806, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.4375, |
|
"grad_norm": 0.04367593014445298, |
|
"learning_rate": 4.8597083257709194e-05, |
|
"loss": 0.169, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.03429260122917999, |
|
"learning_rate": 4.752422169756048e-05, |
|
"loss": 0.1523, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.5625, |
|
"grad_norm": 0.025555649134774323, |
|
"learning_rate": 4.6168104980707107e-05, |
|
"loss": 0.1608, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.625, |
|
"grad_norm": 0.035622860476729544, |
|
"learning_rate": 4.454578706170075e-05, |
|
"loss": 0.1505, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.6875, |
|
"grad_norm": 0.0459250933312251, |
|
"learning_rate": 4.267766952966369e-05, |
|
"loss": 0.1613, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.044900129344464294, |
|
"learning_rate": 4.058724504646834e-05, |
|
"loss": 0.1597, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.8125, |
|
"grad_norm": 0.03285936274496835, |
|
"learning_rate": 3.830080191288342e-05, |
|
"loss": 0.1295, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.875, |
|
"grad_norm": 0.03104349497374127, |
|
"learning_rate": 3.5847093477938956e-05, |
|
"loss": 0.1582, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.9375, |
|
"grad_norm": 0.024404726332163458, |
|
"learning_rate": 3.3256976548879184e-05, |
|
"loss": 0.137, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.022789243213683145, |
|
"learning_rate": 3.056302334890786e-05, |
|
"loss": 0.1367, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 1.0625, |
|
"grad_norm": 0.017586552406833603, |
|
"learning_rate": 2.7799111902582696e-05, |
|
"loss": 0.1292, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 1.125, |
|
"grad_norm": 0.016792838194229127, |
|
"learning_rate": 2.5e-05, |
|
"loss": 0.1372, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 1.1875, |
|
"grad_norm": 0.019142947911569803, |
|
"learning_rate": 2.2200888097417307e-05, |
|
"loss": 0.1274, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 0.019220489564682752, |
|
"learning_rate": 1.9436976651092144e-05, |
|
"loss": 0.1303, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 1.3125, |
|
"grad_norm": 0.02264252406960103, |
|
"learning_rate": 1.6743023451120832e-05, |
|
"loss": 0.1362, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 1.375, |
|
"grad_norm": 0.020883245611579733, |
|
"learning_rate": 1.4152906522061048e-05, |
|
"loss": 0.1352, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 1.4375, |
|
"grad_norm": 0.021254829840046948, |
|
"learning_rate": 1.1699198087116589e-05, |
|
"loss": 0.1193, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.022396775591741604, |
|
"learning_rate": 9.412754953531663e-06, |
|
"loss": 0.1193, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 1.5625, |
|
"grad_norm": 0.01930808848159821, |
|
"learning_rate": 7.3223304703363135e-06, |
|
"loss": 0.1366, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 1.625, |
|
"grad_norm": 0.018929254567235987, |
|
"learning_rate": 5.454212938299255e-06, |
|
"loss": 0.1257, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 1.6875, |
|
"grad_norm": 0.02122756340453363, |
|
"learning_rate": 3.831895019292897e-06, |
|
"loss": 0.1256, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 0.017855075491123307, |
|
"learning_rate": 2.475778302439524e-06, |
|
"loss": 0.1273, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 1.8125, |
|
"grad_norm": 0.0181714053917743, |
|
"learning_rate": 1.4029167422908107e-06, |
|
"loss": 0.1231, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 1.875, |
|
"grad_norm": 0.01977447479487821, |
|
"learning_rate": 6.268021954544096e-07, |
|
"loss": 0.1352, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 1.9375, |
|
"grad_norm": 0.022144566953258688, |
|
"learning_rate": 1.571947526689349e-07, |
|
"loss": 0.1342, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.020972476400823264, |
|
"learning_rate": 0.0, |
|
"loss": 0.117, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"step": 32, |
|
"total_flos": 600196704632832.0, |
|
"train_loss": 0.14642456639558077, |
|
"train_runtime": 1046.7446, |
|
"train_samples_per_second": 0.478, |
|
"train_steps_per_second": 0.031 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 32, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 600196704632832.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|