{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.12, "eval_steps": 4, "global_step": 39, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.08, "grad_norm": 17.890169143676758, "learning_rate": 2e-05, "loss": 6.6299, "step": 1 }, { "epoch": 0.08, "eval_loss": 6.932005405426025, "eval_runtime": 5.2841, "eval_samples_per_second": 2.271, "eval_steps_per_second": 1.135, "step": 1 }, { "epoch": 0.16, "grad_norm": 17.0613956451416, "learning_rate": 4e-05, "loss": 6.8062, "step": 2 }, { "epoch": 0.24, "grad_norm": 19.356693267822266, "learning_rate": 6e-05, "loss": 7.0697, "step": 3 }, { "epoch": 0.32, "grad_norm": 12.688406944274902, "learning_rate": 8e-05, "loss": 5.9686, "step": 4 }, { "epoch": 0.32, "eval_loss": 4.446321964263916, "eval_runtime": 5.3077, "eval_samples_per_second": 2.261, "eval_steps_per_second": 1.13, "step": 4 }, { "epoch": 0.4, "grad_norm": 11.240882873535156, "learning_rate": 0.0001, "loss": 4.5161, "step": 5 }, { "epoch": 0.48, "grad_norm": 14.287125587463379, "learning_rate": 0.00012, "loss": 2.5736, "step": 6 }, { "epoch": 0.56, "grad_norm": 4.680168628692627, "learning_rate": 0.00014, "loss": 0.9119, "step": 7 }, { "epoch": 0.64, "grad_norm": 3.75272536277771, "learning_rate": 0.00016, "loss": 0.5956, "step": 8 }, { "epoch": 0.64, "eval_loss": 0.5577284693717957, "eval_runtime": 5.323, "eval_samples_per_second": 2.254, "eval_steps_per_second": 1.127, "step": 8 }, { "epoch": 0.72, "grad_norm": 7.569386959075928, "learning_rate": 0.00018, "loss": 0.6499, "step": 9 }, { "epoch": 0.8, "grad_norm": 14.337709426879883, "learning_rate": 0.0002, "loss": 1.6675, "step": 10 }, { "epoch": 0.88, "grad_norm": 3.6579396724700928, "learning_rate": 0.00019981755542233177, "loss": 0.5612, "step": 11 }, { "epoch": 0.96, "grad_norm": 2.5605533123016357, "learning_rate": 0.0001992708874098054, "loss": 0.4848, "step": 12 }, { "epoch": 0.96, "eval_loss": 0.8369883894920349, "eval_runtime": 5.327, "eval_samples_per_second": 2.253, "eval_steps_per_second": 1.126, "step": 12 }, { "epoch": 1.04, "grad_norm": 8.139214515686035, "learning_rate": 0.00019836199069471437, "loss": 0.9289, "step": 13 }, { "epoch": 1.12, "grad_norm": 2.3758115768432617, "learning_rate": 0.0001970941817426052, "loss": 0.5154, "step": 14 }, { "epoch": 1.2, "grad_norm": 2.3552050590515137, "learning_rate": 0.00019547208665085457, "loss": 0.5028, "step": 15 }, { "epoch": 1.28, "grad_norm": 2.0085458755493164, "learning_rate": 0.0001935016242685415, "loss": 0.4913, "step": 16 }, { "epoch": 1.28, "eval_loss": 0.4895554482936859, "eval_runtime": 5.325, "eval_samples_per_second": 2.254, "eval_steps_per_second": 1.127, "step": 16 }, { "epoch": 1.36, "grad_norm": 3.799877643585205, "learning_rate": 0.00019118998459920902, "loss": 0.3925, "step": 17 }, { "epoch": 1.44, "grad_norm": 6.162234783172607, "learning_rate": 0.000188545602565321, "loss": 1.0242, "step": 18 }, { "epoch": 1.52, "grad_norm": 4.3233642578125, "learning_rate": 0.00018557812723014476, "loss": 0.7239, "step": 19 }, { "epoch": 1.6, "grad_norm": 4.661636829376221, "learning_rate": 0.00018229838658936564, "loss": 0.671, "step": 20 }, { "epoch": 1.6, "eval_loss": 0.5175419449806213, "eval_runtime": 5.3241, "eval_samples_per_second": 2.254, "eval_steps_per_second": 1.127, "step": 20 }, { "epoch": 1.68, "grad_norm": 1.8649834394454956, "learning_rate": 0.00017871834806090501, "loss": 0.3227, "step": 21 }, { "epoch": 1.76, "grad_norm": 18.722929000854492, "learning_rate": 0.00017485107481711012, "loss": 1.5111, "step": 22 }, { "epoch": 1.84, "grad_norm": 8.523069381713867, "learning_rate": 0.00017071067811865476, "loss": 0.5847, "step": 23 }, { "epoch": 1.92, "grad_norm": 125.49153900146484, "learning_rate": 0.00016631226582407952, "loss": 2.6136, "step": 24 }, { "epoch": 1.92, "eval_loss": 2.3445661067962646, "eval_runtime": 5.3223, "eval_samples_per_second": 2.255, "eval_steps_per_second": 1.127, "step": 24 }, { "epoch": 2.0, "grad_norm": 92.8938980102539, "learning_rate": 0.00016167188726285434, "loss": 2.5399, "step": 25 }, { "epoch": 2.08, "grad_norm": 18.453845977783203, "learning_rate": 0.00015680647467311557, "loss": 0.6644, "step": 26 }, { "epoch": 2.16, "grad_norm": 7.2362141609191895, "learning_rate": 0.00015173378141776568, "loss": 0.3526, "step": 27 }, { "epoch": 2.24, "grad_norm": 8.623927116394043, "learning_rate": 0.00014647231720437686, "loss": 0.6383, "step": 28 }, { "epoch": 2.24, "eval_loss": 0.5194393992424011, "eval_runtime": 5.3225, "eval_samples_per_second": 2.255, "eval_steps_per_second": 1.127, "step": 28 }, { "epoch": 2.32, "grad_norm": 4.0871782302856445, "learning_rate": 0.0001410412805452757, "loss": 0.5128, "step": 29 }, { "epoch": 2.4, "grad_norm": 4.378565311431885, "learning_rate": 0.00013546048870425356, "loss": 0.4853, "step": 30 }, { "epoch": 2.48, "grad_norm": 2.0163638591766357, "learning_rate": 0.00012975030538552032, "loss": 0.4083, "step": 31 }, { "epoch": 2.56, "grad_norm": 12.063375473022461, "learning_rate": 0.0001239315664287558, "loss": 0.5776, "step": 32 }, { "epoch": 2.56, "eval_loss": 0.5652761459350586, "eval_runtime": 5.3235, "eval_samples_per_second": 2.254, "eval_steps_per_second": 1.127, "step": 32 }, { "epoch": 2.64, "grad_norm": 3.3182127475738525, "learning_rate": 0.0001180255037813906, "loss": 0.5234, "step": 33 }, { "epoch": 2.72, "grad_norm": 2.9432356357574463, "learning_rate": 0.0001120536680255323, "loss": 0.4264, "step": 34 }, { "epoch": 2.8, "grad_norm": 1.9505524635314941, "learning_rate": 0.00010603784974222861, "loss": 0.4241, "step": 35 }, { "epoch": 2.88, "grad_norm": 3.3271913528442383, "learning_rate": 0.0001, "loss": 0.4913, "step": 36 }, { "epoch": 2.88, "eval_loss": 0.47906360030174255, "eval_runtime": 5.323, "eval_samples_per_second": 2.254, "eval_steps_per_second": 1.127, "step": 36 }, { "epoch": 2.96, "grad_norm": 12.475479125976562, "learning_rate": 9.396215025777139e-05, "loss": 0.5668, "step": 37 }, { "epoch": 3.04, "grad_norm": 3.1950840950012207, "learning_rate": 8.79463319744677e-05, "loss": 0.3439, "step": 38 }, { "epoch": 3.12, "grad_norm": 12.156915664672852, "learning_rate": 8.197449621860943e-05, "loss": 0.4084, "step": 39 } ], "logging_steps": 1, "max_steps": 62, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 13, "total_flos": 2.539161524581171e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }