|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 10.0, |
|
"eval_steps": 500, |
|
"global_step": 1580, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.15822784810126583, |
|
"grad_norm": 2.233290195465088, |
|
"learning_rate": 0.0002, |
|
"loss": 3.0567, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.31645569620253167, |
|
"grad_norm": 1.9256885051727295, |
|
"learning_rate": 0.0002, |
|
"loss": 2.6434, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.47468354430379744, |
|
"grad_norm": 4.249744415283203, |
|
"learning_rate": 0.0002, |
|
"loss": 2.0778, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.6329113924050633, |
|
"grad_norm": 1.954801082611084, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1352, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.7911392405063291, |
|
"grad_norm": 1.9269670248031616, |
|
"learning_rate": 0.0002, |
|
"loss": 1.8299, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.9493670886075949, |
|
"grad_norm": 4.059688091278076, |
|
"learning_rate": 0.0002, |
|
"loss": 1.6206, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.1075949367088607, |
|
"grad_norm": 2.5162670612335205, |
|
"learning_rate": 0.0002, |
|
"loss": 1.7301, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.2658227848101267, |
|
"grad_norm": 2.2635657787323, |
|
"learning_rate": 0.0002, |
|
"loss": 1.3103, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.4240506329113924, |
|
"grad_norm": 2.5782394409179688, |
|
"learning_rate": 0.0002, |
|
"loss": 1.2166, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 1.5822784810126582, |
|
"grad_norm": 2.443361282348633, |
|
"learning_rate": 0.0002, |
|
"loss": 1.4792, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.740506329113924, |
|
"grad_norm": 4.522688388824463, |
|
"learning_rate": 0.0002, |
|
"loss": 1.2199, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 1.8987341772151898, |
|
"grad_norm": 3.9393839836120605, |
|
"learning_rate": 0.0002, |
|
"loss": 1.3172, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.0569620253164556, |
|
"grad_norm": 1.763312816619873, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1909, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 2.2151898734177213, |
|
"grad_norm": 2.383930206298828, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9682, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.3734177215189876, |
|
"grad_norm": 3.6665306091308594, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1693, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 2.5316455696202533, |
|
"grad_norm": 1.7745016813278198, |
|
"learning_rate": 0.0002, |
|
"loss": 1.0193, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.689873417721519, |
|
"grad_norm": 1.569421410560608, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9753, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 2.848101265822785, |
|
"grad_norm": 2.2681877613067627, |
|
"learning_rate": 0.0002, |
|
"loss": 1.0567, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 3.0063291139240507, |
|
"grad_norm": 1.752241849899292, |
|
"learning_rate": 0.0002, |
|
"loss": 1.0343, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 3.1645569620253164, |
|
"grad_norm": 0.7529569268226624, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8624, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 3.3227848101265822, |
|
"grad_norm": 2.013693332672119, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9729, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 3.481012658227848, |
|
"grad_norm": 2.212862730026245, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8433, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 3.6392405063291138, |
|
"grad_norm": 2.6525330543518066, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9046, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 3.7974683544303796, |
|
"grad_norm": 1.9108997583389282, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9368, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 3.9556962025316453, |
|
"grad_norm": 1.4593428373336792, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8079, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 4.113924050632911, |
|
"grad_norm": 1.0320943593978882, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8961, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 4.272151898734177, |
|
"grad_norm": 2.041616439819336, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7348, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 4.430379746835443, |
|
"grad_norm": 2.494473457336426, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7822, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 4.588607594936709, |
|
"grad_norm": 1.134831428527832, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8666, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 4.746835443037975, |
|
"grad_norm": 1.860443353652954, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7721, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 4.905063291139241, |
|
"grad_norm": 3.339151620864868, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8407, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 5.063291139240507, |
|
"grad_norm": 1.3228943347930908, |
|
"learning_rate": 0.0002, |
|
"loss": 0.833, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 5.2215189873417724, |
|
"grad_norm": 2.0199851989746094, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6558, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 5.379746835443038, |
|
"grad_norm": 1.0233032703399658, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7571, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 5.537974683544304, |
|
"grad_norm": 1.8455493450164795, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7673, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 5.69620253164557, |
|
"grad_norm": 1.3019192218780518, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6765, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 5.8544303797468356, |
|
"grad_norm": 1.6968228816986084, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8249, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 6.012658227848101, |
|
"grad_norm": 1.5166069269180298, |
|
"learning_rate": 0.0002, |
|
"loss": 0.765, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 6.170886075949367, |
|
"grad_norm": 1.438341498374939, |
|
"learning_rate": 0.0002, |
|
"loss": 0.628, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 6.329113924050633, |
|
"grad_norm": 1.4135054349899292, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7128, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 6.487341772151899, |
|
"grad_norm": 1.8510311841964722, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6726, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 6.6455696202531644, |
|
"grad_norm": 0.8984973430633545, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7169, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 6.80379746835443, |
|
"grad_norm": 1.762295126914978, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7315, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 6.962025316455696, |
|
"grad_norm": 1.3354698419570923, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6275, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 7.120253164556962, |
|
"grad_norm": 1.680066466331482, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6706, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 7.2784810126582276, |
|
"grad_norm": 1.5245403051376343, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6232, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 7.436708860759493, |
|
"grad_norm": 1.4877965450286865, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5902, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 7.594936708860759, |
|
"grad_norm": 0.7956791520118713, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6998, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 7.753164556962025, |
|
"grad_norm": 2.1762688159942627, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7275, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 7.911392405063291, |
|
"grad_norm": 1.2218317985534668, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6267, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 8.069620253164556, |
|
"grad_norm": 1.339480996131897, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6799, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 8.227848101265822, |
|
"grad_norm": 1.3387433290481567, |
|
"learning_rate": 0.0002, |
|
"loss": 0.577, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 8.386075949367088, |
|
"grad_norm": 1.0354127883911133, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6526, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 8.544303797468354, |
|
"grad_norm": 1.4868078231811523, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6638, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 8.70253164556962, |
|
"grad_norm": 0.7492271065711975, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5833, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 8.860759493670885, |
|
"grad_norm": 1.3193756341934204, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6851, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 9.018987341772151, |
|
"grad_norm": 1.924387812614441, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6335, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 9.177215189873417, |
|
"grad_norm": 1.1999796628952026, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4827, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 9.335443037974684, |
|
"grad_norm": 1.647176742553711, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6423, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 9.49367088607595, |
|
"grad_norm": 1.3660459518432617, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6176, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 9.651898734177216, |
|
"grad_norm": 0.9778301119804382, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5802, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 9.810126582278482, |
|
"grad_norm": 1.5528557300567627, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6645, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 9.968354430379748, |
|
"grad_norm": 1.8788762092590332, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5932, |
|
"step": 1575 |
|
} |
|
], |
|
"logging_steps": 25, |
|
"max_steps": 1580, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 25, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5358729709043712.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|