{ "best_metric": null, "best_model_checkpoint": null, "epoch": 14.0, "eval_steps": 500, "global_step": 153300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.46, "grad_norm": 1.4694527387619019, "learning_rate": 0.0002, "loss": 3.6175, "step": 5000 }, { "epoch": 0.91, "grad_norm": 1.7512956857681274, "learning_rate": 0.0002, "loss": 3.409, "step": 10000 }, { "epoch": 1.37, "grad_norm": 1.6375586986541748, "learning_rate": 0.0002, "loss": 3.3163, "step": 15000 }, { "epoch": 1.83, "grad_norm": 1.5933928489685059, "learning_rate": 0.0002, "loss": 3.2832, "step": 20000 }, { "epoch": 2.28, "grad_norm": 1.7562757730484009, "learning_rate": 0.0002, "loss": 3.2418, "step": 25000 }, { "epoch": 2.74, "grad_norm": 1.7487740516662598, "learning_rate": 0.0002, "loss": 3.2303, "step": 30000 }, { "epoch": 3.2, "grad_norm": 2.1667191982269287, "learning_rate": 0.0002, "loss": 3.2035, "step": 35000 }, { "epoch": 3.65, "grad_norm": 1.7021046876907349, "learning_rate": 0.0002, "loss": 3.1944, "step": 40000 }, { "epoch": 4.11, "grad_norm": 1.7821632623672485, "learning_rate": 0.0002, "loss": 3.1868, "step": 45000 }, { "epoch": 4.57, "grad_norm": 1.7171430587768555, "learning_rate": 0.0002, "loss": 3.1735, "step": 50000 }, { "epoch": 5.02, "grad_norm": 1.8709882497787476, "learning_rate": 0.0002, "loss": 3.1753, "step": 55000 }, { "epoch": 5.48, "grad_norm": 1.7582167387008667, "learning_rate": 0.0002, "loss": 3.1544, "step": 60000 }, { "epoch": 5.94, "grad_norm": 1.7653158903121948, "learning_rate": 0.0002, "loss": 3.1575, "step": 65000 }, { "epoch": 6.39, "grad_norm": 1.7473134994506836, "learning_rate": 0.0002, "loss": 3.1399, "step": 70000 }, { "epoch": 6.85, "grad_norm": 1.813683271408081, "learning_rate": 0.0002, "loss": 3.1516, "step": 75000 }, { "epoch": 7.31, "grad_norm": 1.8709701299667358, "learning_rate": 0.0002, "loss": 3.1346, "step": 80000 }, { "epoch": 7.76, "grad_norm": 1.7610187530517578, "learning_rate": 0.0002, "loss": 3.1373, "step": 85000 }, { "epoch": 8.22, "grad_norm": 1.851116418838501, "learning_rate": 0.0002, "loss": 3.129, "step": 90000 }, { "epoch": 8.68, "grad_norm": 1.9145632982254028, "learning_rate": 0.0002, "loss": 3.1299, "step": 95000 }, { "epoch": 9.13, "grad_norm": 1.8567790985107422, "learning_rate": 0.0002, "loss": 3.1257, "step": 100000 }, { "epoch": 9.59, "grad_norm": 1.803861379623413, "learning_rate": 0.0002, "loss": 3.1208, "step": 105000 }, { "epoch": 10.05, "grad_norm": 1.8530014753341675, "learning_rate": 0.0002, "loss": 3.1219, "step": 110000 }, { "epoch": 10.5, "grad_norm": 1.8452036380767822, "learning_rate": 0.0002, "loss": 3.1081, "step": 115000 }, { "epoch": 10.96, "grad_norm": 1.8687958717346191, "learning_rate": 0.0002, "loss": 3.123, "step": 120000 }, { "epoch": 11.42, "grad_norm": 2.0348446369171143, "learning_rate": 0.0002, "loss": 3.1025, "step": 125000 }, { "epoch": 11.87, "grad_norm": 1.7902649641036987, "learning_rate": 0.0002, "loss": 3.1172, "step": 130000 }, { "epoch": 12.33, "grad_norm": 1.935986876487732, "learning_rate": 0.0002, "loss": 3.1013, "step": 135000 }, { "epoch": 12.79, "grad_norm": 1.9082064628601074, "learning_rate": 0.0002, "loss": 3.1085, "step": 140000 }, { "epoch": 13.24, "grad_norm": 1.9145407676696777, "learning_rate": 0.0002, "loss": 3.0992, "step": 145000 }, { "epoch": 13.7, "grad_norm": 2.0970780849456787, "learning_rate": 0.0002, "loss": 3.1061, "step": 150000 } ], "logging_steps": 5000, "max_steps": 164250, "num_input_tokens_seen": 0, 
"num_train_epochs": 15, "save_steps": 500, "total_flos": 8.154160201367232e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }