|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 9.473684210526315, |
|
"eval_steps": 500, |
|
"global_step": 90, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.10526315789473684, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 2.2222222222222223e-05, |
|
"loss": 3.0858, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.5263157894736842, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 0.00011111111111111112, |
|
"loss": 3.0546, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.9473684210526315, |
|
"eval_loss": 2.956000804901123, |
|
"eval_runtime": 0.5468, |
|
"eval_samples_per_second": 18.288, |
|
"eval_steps_per_second": 1.829, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 1.0526315789473684, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 0.00019992479525042303, |
|
"loss": 2.6964, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 1.5789473684210527, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 0.00019730448705798239, |
|
"loss": 2.2821, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 2.696643829345703, |
|
"eval_runtime": 0.5537, |
|
"eval_samples_per_second": 18.061, |
|
"eval_steps_per_second": 1.806, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 2.1052631578947367, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 0.0001910362940966147, |
|
"loss": 2.0545, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 2.6315789473684212, |
|
"grad_norm": 0.9921875, |
|
"learning_rate": 0.00018135520702629675, |
|
"loss": 1.8527, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 2.9473684210526314, |
|
"eval_loss": 2.6030924320220947, |
|
"eval_runtime": 0.5536, |
|
"eval_samples_per_second": 18.063, |
|
"eval_steps_per_second": 1.806, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 3.1578947368421053, |
|
"grad_norm": 0.431640625, |
|
"learning_rate": 0.0001686241637868734, |
|
"loss": 1.6976, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 3.6842105263157894, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.00015332044328016914, |
|
"loss": 1.5789, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 2.58021879196167, |
|
"eval_runtime": 0.5505, |
|
"eval_samples_per_second": 18.165, |
|
"eval_steps_per_second": 1.817, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 4.2105263157894735, |
|
"grad_norm": 0.43359375, |
|
"learning_rate": 0.00013601777248047105, |
|
"loss": 1.5127, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 4.7368421052631575, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 0.00011736481776669306, |
|
"loss": 1.4487, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 4.947368421052632, |
|
"eval_loss": 2.5688371658325195, |
|
"eval_runtime": 0.555, |
|
"eval_samples_per_second": 18.017, |
|
"eval_steps_per_second": 1.802, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 5.2631578947368425, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 9.806086682281758e-05, |
|
"loss": 1.4087, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 5.7894736842105265, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 7.882961277705895e-05, |
|
"loss": 1.3708, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_loss": 2.5638375282287598, |
|
"eval_runtime": 0.5463, |
|
"eval_samples_per_second": 18.305, |
|
"eval_steps_per_second": 1.831, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 6.315789473684211, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 6.039202339608432e-05, |
|
"loss": 1.3437, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 6.842105263157895, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 4.343931245134616e-05, |
|
"loss": 1.3286, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 6.947368421052632, |
|
"eval_loss": 2.5667476654052734, |
|
"eval_runtime": 0.5585, |
|
"eval_samples_per_second": 17.905, |
|
"eval_steps_per_second": 1.79, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 7.368421052631579, |
|
"grad_norm": 0.2275390625, |
|
"learning_rate": 2.8607026544210114e-05, |
|
"loss": 1.3139, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 7.894736842105263, |
|
"grad_norm": 0.2216796875, |
|
"learning_rate": 1.6451218858706374e-05, |
|
"loss": 1.3104, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_loss": 2.5672240257263184, |
|
"eval_runtime": 0.5487, |
|
"eval_samples_per_second": 18.224, |
|
"eval_steps_per_second": 1.822, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 8.421052631578947, |
|
"grad_norm": 0.208984375, |
|
"learning_rate": 7.427603073110967e-06, |
|
"loss": 1.3054, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 8.947368421052632, |
|
"grad_norm": 0.2138671875, |
|
"learning_rate": 1.874468937261531e-06, |
|
"loss": 1.3046, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 8.947368421052632, |
|
"eval_loss": 2.567439556121826, |
|
"eval_runtime": 0.561, |
|
"eval_samples_per_second": 17.827, |
|
"eval_steps_per_second": 1.783, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 9.473684210526315, |
|
"grad_norm": 0.212890625, |
|
"learning_rate": 0.0, |
|
"loss": 1.3032, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 9.473684210526315, |
|
"eval_loss": 2.5678367614746094, |
|
"eval_runtime": 0.5511, |
|
"eval_samples_per_second": 18.145, |
|
"eval_steps_per_second": 1.814, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 9.473684210526315, |
|
"step": 90, |
|
"total_flos": 5.286360054444851e+16, |
|
"train_loss": 1.6763220760557387, |
|
"train_runtime": 490.2927, |
|
"train_samples_per_second": 8.933, |
|
"train_steps_per_second": 0.184 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 90, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5.286360054444851e+16, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|