|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.993680884676145, |
|
"eval_steps": 100, |
|
"global_step": 632, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.5669772624969482, |
|
"learning_rate": 4.9888580822471086e-05, |
|
"loss": 1.4429, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.5426832437515259, |
|
"learning_rate": 4.953167838259285e-05, |
|
"loss": 1.142, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.5566983819007874, |
|
"learning_rate": 4.893250847214369e-05, |
|
"loss": 1.0441, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.5931661128997803, |
|
"learning_rate": 4.8096988312782174e-05, |
|
"loss": 1.0139, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.7013124227523804, |
|
"learning_rate": 4.703336925007311e-05, |
|
"loss": 0.9813, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"eval_loss": 0.9671486616134644, |
|
"eval_runtime": 276.2617, |
|
"eval_samples_per_second": 10.03, |
|
"eval_steps_per_second": 1.256, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.6544580459594727, |
|
"learning_rate": 4.575215526568278e-05, |
|
"loss": 0.9546, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.6307847499847412, |
|
"learning_rate": 4.426599924321815e-05, |
|
"loss": 0.9498, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 0.6634028553962708, |
|
"learning_rate": 4.2589578012157426e-05, |
|
"loss": 0.9497, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 0.6686026453971863, |
|
"learning_rate": 4.073944740390061e-05, |
|
"loss": 0.9203, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 0.7216710448265076, |
|
"learning_rate": 3.873387875136252e-05, |
|
"loss": 0.9108, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"eval_loss": 0.9249987602233887, |
|
"eval_runtime": 276.209, |
|
"eval_samples_per_second": 10.032, |
|
"eval_steps_per_second": 1.256, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 0.6702993512153625, |
|
"learning_rate": 3.6592678446789516e-05, |
|
"loss": 0.908, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 0.7004181146621704, |
|
"learning_rate": 3.433699233979222e-05, |
|
"loss": 0.9077, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 0.6908763647079468, |
|
"learning_rate": 3.198909690730063e-05, |
|
"loss": 0.903, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 0.6947716474533081, |
|
"learning_rate": 2.9572179257784215e-05, |
|
"loss": 0.8984, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 0.6954981088638306, |
|
"learning_rate": 2.711010814234896e-05, |
|
"loss": 0.8976, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"eval_loss": 0.9090741276741028, |
|
"eval_runtime": 276.2477, |
|
"eval_samples_per_second": 10.031, |
|
"eval_steps_per_second": 1.256, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.02, |
|
"grad_norm": 0.6500376462936401, |
|
"learning_rate": 2.462719823413707e-05, |
|
"loss": 0.8959, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 2.15, |
|
"grad_norm": 0.7603716850280762, |
|
"learning_rate": 2.214797000393479e-05, |
|
"loss": 0.8837, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 2.27, |
|
"grad_norm": 0.820634126663208, |
|
"learning_rate": 1.9696907563384687e-05, |
|
"loss": 0.8655, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 0.7582141757011414, |
|
"learning_rate": 1.7298216867269906e-05, |
|
"loss": 0.8695, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 2.53, |
|
"grad_norm": 0.8109495043754578, |
|
"learning_rate": 1.4975586662791783e-05, |
|
"loss": 0.8687, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.53, |
|
"eval_loss": 0.9004740118980408, |
|
"eval_runtime": 276.3126, |
|
"eval_samples_per_second": 10.028, |
|
"eval_steps_per_second": 1.256, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.65, |
|
"grad_norm": 0.7401316165924072, |
|
"learning_rate": 1.2751954546633871e-05, |
|
"loss": 0.8688, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 2.78, |
|
"grad_norm": 0.7914406657218933, |
|
"learning_rate": 1.0649280440162326e-05, |
|
"loss": 0.8565, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 2.91, |
|
"grad_norm": 0.7221879363059998, |
|
"learning_rate": 8.68832971985347e-06, |
|
"loss": 0.8672, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 3.03, |
|
"grad_norm": 0.8058634400367737, |
|
"learning_rate": 6.8884681446869105e-06, |
|
"loss": 0.8644, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 3.16, |
|
"grad_norm": 0.8008745908737183, |
|
"learning_rate": 5.267470605739952e-06, |
|
"loss": 0.8548, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 3.16, |
|
"eval_loss": 0.8958437442779541, |
|
"eval_runtime": 276.5086, |
|
"eval_samples_per_second": 10.021, |
|
"eval_steps_per_second": 1.255, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 3.29, |
|
"grad_norm": 0.8349476456642151, |
|
"learning_rate": 3.841345586714251e-06, |
|
"loss": 0.8522, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 3.41, |
|
"grad_norm": 0.8091734051704407, |
|
"learning_rate": 2.624177068970124e-06, |
|
"loss": 0.8547, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 3.54, |
|
"grad_norm": 0.7861022353172302, |
|
"learning_rate": 1.6279854423664697e-06, |
|
"loss": 0.8387, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 3.67, |
|
"grad_norm": 0.7953125834465027, |
|
"learning_rate": 8.62608795509276e-07, |
|
"loss": 0.8471, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 3.79, |
|
"grad_norm": 0.8621749877929688, |
|
"learning_rate": 3.3560575775019864e-07, |
|
"loss": 0.8468, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 3.79, |
|
"eval_loss": 0.895173192024231, |
|
"eval_runtime": 276.743, |
|
"eval_samples_per_second": 10.013, |
|
"eval_steps_per_second": 1.254, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 3.92, |
|
"grad_norm": 0.7905401587486267, |
|
"learning_rate": 5.218085243859638e-08, |
|
"loss": 0.8597, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 3.99, |
|
"step": 632, |
|
"total_flos": 1.7866046476474122e+18, |
|
"train_loss": 0.9217187543458576, |
|
"train_runtime": 18676.6384, |
|
"train_samples_per_second": 2.169, |
|
"train_steps_per_second": 0.034 |
|
} |
|
], |
|
"logging_steps": 20, |
|
"max_steps": 632, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 100, |
|
"total_flos": 1.7866046476474122e+18, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|