|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 4.0, |
|
"eval_steps": 500, |
|
"global_step": 13336, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 5.067293643951416, |
|
"learning_rate": 9.625074985003e-06, |
|
"loss": 1.3508, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 6.026179313659668, |
|
"learning_rate": 9.250149970005999e-06, |
|
"loss": 1.3055, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 6.276480674743652, |
|
"learning_rate": 8.875224955008999e-06, |
|
"loss": 1.2235, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 5.172286510467529, |
|
"learning_rate": 8.500299940011997e-06, |
|
"loss": 1.2637, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 5.915102958679199, |
|
"learning_rate": 8.125374925014997e-06, |
|
"loss": 1.2232, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 5.094091415405273, |
|
"learning_rate": 7.750449910017997e-06, |
|
"loss": 1.1814, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 4.9657793045043945, |
|
"learning_rate": 7.375524895020996e-06, |
|
"loss": 1.1449, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 5.007996082305908, |
|
"learning_rate": 7.000599880023996e-06, |
|
"loss": 1.0923, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 5.494340896606445, |
|
"learning_rate": 6.6256748650269955e-06, |
|
"loss": 1.0908, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 4.9410271644592285, |
|
"learning_rate": 6.250749850029995e-06, |
|
"loss": 1.0804, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 5.13407564163208, |
|
"learning_rate": 5.875824835032994e-06, |
|
"loss": 1.0556, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 4.397137641906738, |
|
"learning_rate": 5.500899820035993e-06, |
|
"loss": 1.0886, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 5.924018383026123, |
|
"learning_rate": 5.125974805038992e-06, |
|
"loss": 1.0694, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"grad_norm": 3.952533006668091, |
|
"learning_rate": 4.751049790041992e-06, |
|
"loss": 1.0158, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 6.275745868682861, |
|
"learning_rate": 4.376124775044991e-06, |
|
"loss": 1.0082, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 7.6413116455078125, |
|
"learning_rate": 4.001199760047991e-06, |
|
"loss": 0.9991, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 2.55, |
|
"grad_norm": 5.2266387939453125, |
|
"learning_rate": 3.6262747450509898e-06, |
|
"loss": 0.976, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 2.7, |
|
"grad_norm": 4.824028968811035, |
|
"learning_rate": 3.2513497300539893e-06, |
|
"loss": 0.9826, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 2.85, |
|
"grad_norm": 7.193837642669678, |
|
"learning_rate": 2.876424715056989e-06, |
|
"loss": 0.9731, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 6.571595191955566, |
|
"learning_rate": 2.5014997000599884e-06, |
|
"loss": 1.0054, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 3.15, |
|
"grad_norm": 4.61974573135376, |
|
"learning_rate": 2.1265746850629876e-06, |
|
"loss": 0.9583, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 3.3, |
|
"grad_norm": 5.337657451629639, |
|
"learning_rate": 1.751649670065987e-06, |
|
"loss": 0.9296, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 3.45, |
|
"grad_norm": 5.575818061828613, |
|
"learning_rate": 1.3767246550689864e-06, |
|
"loss": 0.9576, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"grad_norm": 6.3436431884765625, |
|
"learning_rate": 1.0017996400719856e-06, |
|
"loss": 0.9241, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"grad_norm": 3.276711940765381, |
|
"learning_rate": 6.26874625074985e-07, |
|
"loss": 0.9307, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 3.9, |
|
"grad_norm": 5.67604923248291, |
|
"learning_rate": 2.519496100779844e-07, |
|
"loss": 0.9255, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"step": 13336, |
|
"total_flos": 1.66894187000832e+16, |
|
"train_loss": 1.0640255470939504, |
|
"train_runtime": 2426.9638, |
|
"train_samples_per_second": 16.481, |
|
"train_steps_per_second": 5.495 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 13336, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 4000, |
|
"total_flos": 1.66894187000832e+16, |
|
"train_batch_size": 3, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|