{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 1580, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.15822784810126583, "grad_norm": 2.233290195465088, "learning_rate": 0.0002, "loss": 3.0567, "step": 25 }, { "epoch": 0.31645569620253167, "grad_norm": 1.9256885051727295, "learning_rate": 0.0002, "loss": 2.6434, "step": 50 }, { "epoch": 0.47468354430379744, "grad_norm": 4.249744415283203, "learning_rate": 0.0002, "loss": 2.0778, "step": 75 }, { "epoch": 0.6329113924050633, "grad_norm": 1.954801082611084, "learning_rate": 0.0002, "loss": 2.1352, "step": 100 }, { "epoch": 0.7911392405063291, "grad_norm": 1.9269670248031616, "learning_rate": 0.0002, "loss": 1.8299, "step": 125 }, { "epoch": 0.9493670886075949, "grad_norm": 4.059688091278076, "learning_rate": 0.0002, "loss": 1.6206, "step": 150 }, { "epoch": 1.1075949367088607, "grad_norm": 2.5162670612335205, "learning_rate": 0.0002, "loss": 1.7301, "step": 175 }, { "epoch": 1.2658227848101267, "grad_norm": 2.2635657787323, "learning_rate": 0.0002, "loss": 1.3103, "step": 200 }, { "epoch": 1.4240506329113924, "grad_norm": 2.5782394409179688, "learning_rate": 0.0002, "loss": 1.2166, "step": 225 }, { "epoch": 1.5822784810126582, "grad_norm": 2.443361282348633, "learning_rate": 0.0002, "loss": 1.4792, "step": 250 }, { "epoch": 1.740506329113924, "grad_norm": 4.522688388824463, "learning_rate": 0.0002, "loss": 1.2199, "step": 275 }, { "epoch": 1.8987341772151898, "grad_norm": 3.9393839836120605, "learning_rate": 0.0002, "loss": 1.3172, "step": 300 }, { "epoch": 2.0569620253164556, "grad_norm": 1.763312816619873, "learning_rate": 0.0002, "loss": 1.1909, "step": 325 }, { "epoch": 2.2151898734177213, "grad_norm": 2.383930206298828, "learning_rate": 0.0002, "loss": 0.9682, "step": 350 }, { "epoch": 2.3734177215189876, "grad_norm": 3.6665306091308594, "learning_rate": 0.0002, "loss": 1.1693, "step": 375 }, { "epoch": 2.5316455696202533, "grad_norm": 1.7745016813278198, "learning_rate": 0.0002, "loss": 1.0193, "step": 400 }, { "epoch": 2.689873417721519, "grad_norm": 1.569421410560608, "learning_rate": 0.0002, "loss": 0.9753, "step": 425 }, { "epoch": 2.848101265822785, "grad_norm": 2.2681877613067627, "learning_rate": 0.0002, "loss": 1.0567, "step": 450 }, { "epoch": 3.0063291139240507, "grad_norm": 1.752241849899292, "learning_rate": 0.0002, "loss": 1.0343, "step": 475 }, { "epoch": 3.1645569620253164, "grad_norm": 0.7529569268226624, "learning_rate": 0.0002, "loss": 0.8624, "step": 500 }, { "epoch": 3.3227848101265822, "grad_norm": 2.013693332672119, "learning_rate": 0.0002, "loss": 0.9729, "step": 525 }, { "epoch": 3.481012658227848, "grad_norm": 2.212862730026245, "learning_rate": 0.0002, "loss": 0.8433, "step": 550 }, { "epoch": 3.6392405063291138, "grad_norm": 2.6525330543518066, "learning_rate": 0.0002, "loss": 0.9046, "step": 575 }, { "epoch": 3.7974683544303796, "grad_norm": 1.9108997583389282, "learning_rate": 0.0002, "loss": 0.9368, "step": 600 }, { "epoch": 3.9556962025316453, "grad_norm": 1.4593428373336792, "learning_rate": 0.0002, "loss": 0.8079, "step": 625 }, { "epoch": 4.113924050632911, "grad_norm": 1.0320943593978882, "learning_rate": 0.0002, "loss": 0.8961, "step": 650 }, { "epoch": 4.272151898734177, "grad_norm": 2.041616439819336, "learning_rate": 0.0002, "loss": 0.7348, "step": 675 }, { "epoch": 4.430379746835443, "grad_norm": 2.494473457336426, "learning_rate": 0.0002, "loss": 0.7822, "step": 700 }, { "epoch": 4.588607594936709, "grad_norm": 1.134831428527832, "learning_rate": 0.0002, "loss": 0.8666, "step": 725 }, { "epoch": 4.746835443037975, "grad_norm": 1.860443353652954, "learning_rate": 0.0002, "loss": 0.7721, "step": 750 }, { "epoch": 4.905063291139241, "grad_norm": 3.339151620864868, "learning_rate": 0.0002, "loss": 0.8407, "step": 775 }, { "epoch": 5.063291139240507, "grad_norm": 1.3228943347930908, "learning_rate": 0.0002, "loss": 0.833, "step": 800 }, { "epoch": 5.2215189873417724, "grad_norm": 2.0199851989746094, "learning_rate": 0.0002, "loss": 0.6558, "step": 825 }, { "epoch": 5.379746835443038, "grad_norm": 1.0233032703399658, "learning_rate": 0.0002, "loss": 0.7571, "step": 850 }, { "epoch": 5.537974683544304, "grad_norm": 1.8455493450164795, "learning_rate": 0.0002, "loss": 0.7673, "step": 875 }, { "epoch": 5.69620253164557, "grad_norm": 1.3019192218780518, "learning_rate": 0.0002, "loss": 0.6765, "step": 900 }, { "epoch": 5.8544303797468356, "grad_norm": 1.6968228816986084, "learning_rate": 0.0002, "loss": 0.8249, "step": 925 }, { "epoch": 6.012658227848101, "grad_norm": 1.5166069269180298, "learning_rate": 0.0002, "loss": 0.765, "step": 950 }, { "epoch": 6.170886075949367, "grad_norm": 1.438341498374939, "learning_rate": 0.0002, "loss": 0.628, "step": 975 }, { "epoch": 6.329113924050633, "grad_norm": 1.4135054349899292, "learning_rate": 0.0002, "loss": 0.7128, "step": 1000 }, { "epoch": 6.487341772151899, "grad_norm": 1.8510311841964722, "learning_rate": 0.0002, "loss": 0.6726, "step": 1025 }, { "epoch": 6.6455696202531644, "grad_norm": 0.8984973430633545, "learning_rate": 0.0002, "loss": 0.7169, "step": 1050 }, { "epoch": 6.80379746835443, "grad_norm": 1.762295126914978, "learning_rate": 0.0002, "loss": 0.7315, "step": 1075 }, { "epoch": 6.962025316455696, "grad_norm": 1.3354698419570923, "learning_rate": 0.0002, "loss": 0.6275, "step": 1100 }, { "epoch": 7.120253164556962, "grad_norm": 1.680066466331482, "learning_rate": 0.0002, "loss": 0.6706, "step": 1125 }, { "epoch": 7.2784810126582276, "grad_norm": 1.5245403051376343, "learning_rate": 0.0002, "loss": 0.6232, "step": 1150 }, { "epoch": 7.436708860759493, "grad_norm": 1.4877965450286865, "learning_rate": 0.0002, "loss": 0.5902, "step": 1175 }, { "epoch": 7.594936708860759, "grad_norm": 0.7956791520118713, "learning_rate": 0.0002, "loss": 0.6998, "step": 1200 }, { "epoch": 7.753164556962025, "grad_norm": 2.1762688159942627, "learning_rate": 0.0002, "loss": 0.7275, "step": 1225 }, { "epoch": 7.911392405063291, "grad_norm": 1.2218317985534668, "learning_rate": 0.0002, "loss": 0.6267, "step": 1250 }, { "epoch": 8.069620253164556, "grad_norm": 1.339480996131897, "learning_rate": 0.0002, "loss": 0.6799, "step": 1275 }, { "epoch": 8.227848101265822, "grad_norm": 1.3387433290481567, "learning_rate": 0.0002, "loss": 0.577, "step": 1300 }, { "epoch": 8.386075949367088, "grad_norm": 1.0354127883911133, "learning_rate": 0.0002, "loss": 0.6526, "step": 1325 }, { "epoch": 8.544303797468354, "grad_norm": 1.4868078231811523, "learning_rate": 0.0002, "loss": 0.6638, "step": 1350 }, { "epoch": 8.70253164556962, "grad_norm": 0.7492271065711975, "learning_rate": 0.0002, "loss": 0.5833, "step": 1375 }, { "epoch": 8.860759493670885, "grad_norm": 1.3193756341934204, "learning_rate": 0.0002, "loss": 0.6851, "step": 1400 }, { "epoch": 9.018987341772151, "grad_norm": 1.924387812614441, "learning_rate": 0.0002, "loss": 0.6335, "step": 1425 }, { "epoch": 9.177215189873417, "grad_norm": 1.1999796628952026, "learning_rate": 0.0002, "loss": 0.4827, "step": 1450 }, { "epoch": 9.335443037974684, "grad_norm": 1.647176742553711, "learning_rate": 0.0002, "loss": 0.6423, "step": 1475 }, { "epoch": 9.49367088607595, "grad_norm": 1.3660459518432617, "learning_rate": 0.0002, "loss": 0.6176, "step": 1500 }, { "epoch": 9.651898734177216, "grad_norm": 0.9778301119804382, "learning_rate": 0.0002, "loss": 0.5802, "step": 1525 }, { "epoch": 9.810126582278482, "grad_norm": 1.5528557300567627, "learning_rate": 0.0002, "loss": 0.6645, "step": 1550 }, { "epoch": 9.968354430379748, "grad_norm": 1.8788762092590332, "learning_rate": 0.0002, "loss": 0.5932, "step": 1575 } ], "logging_steps": 25, "max_steps": 1580, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5358729709043712.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }