|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.0, |
|
"eval_steps": 500, |
|
"global_step": 252, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 114440.2421875, |
|
"learning_rate": 0.0002988358809900258, |
|
"loss": 1.5517, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 86295.7265625, |
|
"learning_rate": 0.00029536159293436166, |
|
"loss": 1.0216, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 93327.2890625, |
|
"learning_rate": 0.00028963106229663063, |
|
"loss": 0.938, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 90729.6796875, |
|
"learning_rate": 0.0002817332360055343, |
|
"loss": 1.0426, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 77492.8203125, |
|
"learning_rate": 0.0002717907008573785, |
|
"loss": 0.9682, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 71191.8359375, |
|
"learning_rate": 0.0002599577807744739, |
|
"loss": 0.9422, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 81620.265625, |
|
"learning_rate": 0.0002464181414529809, |
|
"loss": 1.0024, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 68561.46875, |
|
"learning_rate": 0.0002313819395798639, |
|
"loss": 0.9483, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 0.9816237688064575, |
|
"eval_runtime": 80.5903, |
|
"eval_samples_per_second": 1.377, |
|
"eval_steps_per_second": 0.347, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 79334.9375, |
|
"learning_rate": 0.00021508256086763368, |
|
"loss": 0.8421, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 100762.734375, |
|
"learning_rate": 0.00019777299753775265, |
|
"loss": 0.6904, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 88126.5390625, |
|
"learning_rate": 0.0001797219214799096, |
|
"loss": 0.7075, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 98623.3359375, |
|
"learning_rate": 0.00016120951403796364, |
|
"loss": 0.765, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 93532.2890625, |
|
"learning_rate": 0.0001425231171508954, |
|
"loss": 0.7921, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 92075.6484375, |
|
"learning_rate": 0.00012395277334996044, |
|
"loss": 0.6924, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 93960.328125, |
|
"learning_rate": 0.00010578672383836435, |
|
"loss": 0.677, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 71849.8125, |
|
"learning_rate": 8.830693453040829e-05, |
|
"loss": 0.7375, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 0.9629082679748535, |
|
"eval_runtime": 80.2418, |
|
"eval_samples_per_second": 1.383, |
|
"eval_steps_per_second": 0.349, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 2.02, |
|
"grad_norm": 68938.421875, |
|
"learning_rate": 7.17847194930753e-05, |
|
"loss": 0.659, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 2.14, |
|
"grad_norm": 141907.765625, |
|
"learning_rate": 5.6476529721189974e-05, |
|
"loss": 0.425, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 2.26, |
|
"grad_norm": 96978.578125, |
|
"learning_rate": 4.261997261104223e-05, |
|
"loss": 0.4418, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 2.38, |
|
"grad_norm": 142711.125, |
|
"learning_rate": 3.0430123916561672e-05, |
|
"loss": 0.415, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 90807.7421875, |
|
"learning_rate": 2.009618943233419e-05, |
|
"loss": 0.4457, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 2.62, |
|
"grad_norm": 105124.25, |
|
"learning_rate": 1.1778568219438839e-05, |
|
"loss": 0.4444, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"grad_norm": 92137.609375, |
|
"learning_rate": 5.606362957498195e-06, |
|
"loss": 0.4239, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 2.86, |
|
"grad_norm": 96611.28125, |
|
"learning_rate": 1.6753760662307215e-06, |
|
"loss": 0.4227, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 2.98, |
|
"grad_norm": 100055.1953125, |
|
"learning_rate": 4.662269987756317e-08, |
|
"loss": 0.4589, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 1.0281459093093872, |
|
"eval_runtime": 80.2237, |
|
"eval_samples_per_second": 1.384, |
|
"eval_steps_per_second": 0.349, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"step": 252, |
|
"total_flos": 2.458929889338163e+16, |
|
"train_loss": 0.735804313705081, |
|
"train_runtime": 2220.224, |
|
"train_samples_per_second": 0.45, |
|
"train_steps_per_second": 0.114 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 252, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 10, |
|
"total_flos": 2.458929889338163e+16, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|