{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.15929908403026682, "eval_steps": 500, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 10.092560768127441, "learning_rate": 1.0000000000000002e-06, "loss": 1.9497, "step": 1 }, { "epoch": 0.01, "grad_norm": 8.963287353515625, "learning_rate": 2.0000000000000003e-06, "loss": 1.951, "step": 2 }, { "epoch": 0.01, "grad_norm": 9.27600383758545, "learning_rate": 3e-06, "loss": 1.9345, "step": 3 }, { "epoch": 0.01, "grad_norm": 5.632405757904053, "learning_rate": 4.000000000000001e-06, "loss": 1.9157, "step": 4 }, { "epoch": 0.02, "grad_norm": 5.668580055236816, "learning_rate": 5e-06, "loss": 1.9026, "step": 5 }, { "epoch": 0.02, "grad_norm": 3.8046553134918213, "learning_rate": 6e-06, "loss": 1.8923, "step": 6 }, { "epoch": 0.02, "grad_norm": 4.357985973358154, "learning_rate": 7e-06, "loss": 1.8241, "step": 7 }, { "epoch": 0.03, "grad_norm": 4.685062885284424, "learning_rate": 8.000000000000001e-06, "loss": 1.8467, "step": 8 }, { "epoch": 0.03, "grad_norm": 4.768229961395264, "learning_rate": 9e-06, "loss": 1.8199, "step": 9 }, { "epoch": 0.03, "grad_norm": 4.796407699584961, "learning_rate": 1e-05, "loss": 1.8374, "step": 10 }, { "epoch": 0.04, "grad_norm": 5.7536139488220215, "learning_rate": 9.999731248679734e-06, "loss": 1.779, "step": 11 }, { "epoch": 0.04, "grad_norm": 4.202663898468018, "learning_rate": 9.99892502360984e-06, "loss": 1.7579, "step": 12 }, { "epoch": 0.04, "grad_norm": 2.9114131927490234, "learning_rate": 9.99758141145994e-06, "loss": 1.7433, "step": 13 }, { "epoch": 0.04, "grad_norm": 2.3823723793029785, "learning_rate": 9.995700556669052e-06, "loss": 1.7212, "step": 14 }, { "epoch": 0.05, "grad_norm": 2.3254876136779785, "learning_rate": 9.993282661430058e-06, "loss": 1.7218, "step": 15 }, { "epoch": 0.05, "grad_norm": 2.053166151046753, "learning_rate": 9.990327985667972e-06, "loss": 1.7256, "step": 16 }, { "epoch": 0.05, "grad_norm": 2.3782012462615967, "learning_rate": 9.986836847012001e-06, "loss": 1.713, "step": 17 }, { "epoch": 0.06, "grad_norm": 2.1311683654785156, "learning_rate": 9.98280962076139e-06, "loss": 1.6785, "step": 18 }, { "epoch": 0.06, "grad_norm": 2.325747489929199, "learning_rate": 9.978246739845095e-06, "loss": 1.7167, "step": 19 }, { "epoch": 0.06, "grad_norm": 2.1330366134643555, "learning_rate": 9.973148694775217e-06, "loss": 1.676, "step": 20 }, { "epoch": 0.07, "grad_norm": 2.2632806301116943, "learning_rate": 9.967516033594295e-06, "loss": 1.7033, "step": 21 }, { "epoch": 0.07, "grad_norm": 2.6582744121551514, "learning_rate": 9.961349361816384e-06, "loss": 1.6957, "step": 22 }, { "epoch": 0.07, "grad_norm": 1.9663337469100952, "learning_rate": 9.954649342361952e-06, "loss": 1.6729, "step": 23 }, { "epoch": 0.08, "grad_norm": 2.2557435035705566, "learning_rate": 9.947416695486633e-06, "loss": 1.6399, "step": 24 }, { "epoch": 0.08, "grad_norm": 2.090054512023926, "learning_rate": 9.939652198703785e-06, "loss": 1.6792, "step": 25 }, { "epoch": 0.08, "grad_norm": 2.1648776531219482, "learning_rate": 9.93135668670091e-06, "loss": 1.6625, "step": 26 }, { "epoch": 0.09, "grad_norm": 2.1209850311279297, "learning_rate": 9.92253105124993e-06, "loss": 1.7057, "step": 27 }, { "epoch": 0.09, "grad_norm": 2.2777392864227295, "learning_rate": 9.91317624111132e-06, "loss": 1.6052, "step": 28 }, { "epoch": 0.09, "grad_norm": 2.1739561557769775, "learning_rate": 9.903293261932106e-06, "loss": 1.6139, "step": 29 }, { "epoch": 0.1, "grad_norm": 2.4518871307373047, "learning_rate": 9.89288317613777e-06, "loss": 1.6759, "step": 30 }, { "epoch": 0.1, "grad_norm": 1.795519471168518, "learning_rate": 9.881947102818036e-06, "loss": 1.7036, "step": 31 }, { "epoch": 0.1, "grad_norm": 2.150223731994629, "learning_rate": 9.870486217606557e-06, "loss": 1.6469, "step": 32 }, { "epoch": 0.11, "grad_norm": 1.91805899143219, "learning_rate": 9.858501752554548e-06, "loss": 1.6353, "step": 33 }, { "epoch": 0.11, "grad_norm": 1.8129810094833374, "learning_rate": 9.845994995998332e-06, "loss": 1.6551, "step": 34 }, { "epoch": 0.11, "grad_norm": 2.1308679580688477, "learning_rate": 9.83296729242084e-06, "loss": 1.617, "step": 35 }, { "epoch": 0.11, "grad_norm": 1.8321012258529663, "learning_rate": 9.819420042307091e-06, "loss": 1.6213, "step": 36 }, { "epoch": 0.12, "grad_norm": 1.9112164974212646, "learning_rate": 9.805354701993624e-06, "loss": 1.6245, "step": 37 }, { "epoch": 0.12, "grad_norm": 2.1160471439361572, "learning_rate": 9.79077278351195e-06, "loss": 1.6405, "step": 38 }, { "epoch": 0.12, "grad_norm": 2.6318371295928955, "learning_rate": 9.77567585442601e-06, "loss": 1.6234, "step": 39 }, { "epoch": 0.13, "grad_norm": 3.238373279571533, "learning_rate": 9.76006553766365e-06, "loss": 1.6452, "step": 40 }, { "epoch": 0.13, "grad_norm": 2.056736469268799, "learning_rate": 9.743943511342168e-06, "loss": 1.679, "step": 41 }, { "epoch": 0.13, "grad_norm": 2.000826358795166, "learning_rate": 9.727311508587907e-06, "loss": 1.6904, "step": 42 }, { "epoch": 0.14, "grad_norm": 2.089482069015503, "learning_rate": 9.710171317349946e-06, "loss": 1.62, "step": 43 }, { "epoch": 0.14, "grad_norm": 2.5748705863952637, "learning_rate": 9.692524780207897e-06, "loss": 1.6669, "step": 44 }, { "epoch": 0.14, "grad_norm": 1.9039987325668335, "learning_rate": 9.674373794173818e-06, "loss": 1.6489, "step": 45 }, { "epoch": 0.15, "grad_norm": 1.8047181367874146, "learning_rate": 9.655720310488298e-06, "loss": 1.6227, "step": 46 }, { "epoch": 0.15, "grad_norm": 2.0552868843078613, "learning_rate": 9.636566334410682e-06, "loss": 1.5898, "step": 47 }, { "epoch": 0.15, "grad_norm": 2.252218008041382, "learning_rate": 9.616913925003514e-06, "loss": 1.6667, "step": 48 }, { "epoch": 0.16, "grad_norm": 2.015887498855591, "learning_rate": 9.596765194911182e-06, "loss": 1.6668, "step": 49 }, { "epoch": 0.16, "grad_norm": 2.582007884979248, "learning_rate": 9.576122310132814e-06, "loss": 1.6542, "step": 50 } ], "logging_steps": 1.0, "max_steps": 313, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "total_flos": 49711364571136.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }