{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9945, "eval_steps": 500, "global_step": 153, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 17.363519218688417, "learning_rate": 3.125e-08, "logits/chosen": -2.205641031265259, "logits/rejected": -2.2929024696350098, "logps/chosen": -215.50050354003906, "logps/rejected": -237.99966430664062, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.07, "grad_norm": 18.34878510832685, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -2.382091522216797, "logits/rejected": -2.295259952545166, "logps/chosen": -222.111328125, "logps/rejected": -210.6314697265625, "loss": 0.693, "rewards/accuracies": 0.4615384638309479, "rewards/chosen": 0.0012403662549331784, "rewards/margins": 0.0005746442475356162, "rewards/rejected": 0.0006657222402282059, "step": 10 }, { "epoch": 0.13, "grad_norm": 18.337158741008302, "learning_rate": 4.989490450759331e-07, "logits/chosen": -2.383176803588867, "logits/rejected": -2.422689199447632, "logps/chosen": -212.94821166992188, "logps/rejected": -232.4334259033203, "loss": 0.6895, "rewards/accuracies": 0.5384615659713745, "rewards/chosen": -0.08582816272974014, "rewards/margins": 0.0013778842985630035, "rewards/rejected": -0.08720605075359344, "step": 20 }, { "epoch": 0.2, "grad_norm": 22.542054212888594, "learning_rate": 4.872270441827174e-07, "logits/chosen": -2.3914377689361572, "logits/rejected": -2.3520281314849854, "logps/chosen": -215.9875030517578, "logps/rejected": -227.70399475097656, "loss": 0.6861, "rewards/accuracies": 0.5384615659713745, "rewards/chosen": -0.25440388917922974, "rewards/margins": 0.03694874048233032, "rewards/rejected": -0.29135259985923767, "step": 30 }, { "epoch": 0.26, "grad_norm": 18.6076942310058, "learning_rate": 4.6308512113530063e-07, "logits/chosen": -2.4768216609954834, "logits/rejected": -2.4233009815216064, "logps/chosen": -229.41555786132812, "logps/rejected": -242.26214599609375, "loss": 0.6925, "rewards/accuracies": 0.5384615659713745, "rewards/chosen": -0.19061818718910217, "rewards/margins": 0.01797662116587162, "rewards/rejected": -0.20859479904174805, "step": 40 }, { "epoch": 0.33, "grad_norm": 15.540618809394221, "learning_rate": 4.277872161641681e-07, "logits/chosen": -2.5903377532958984, "logits/rejected": -2.5592682361602783, "logps/chosen": -225.72836303710938, "logps/rejected": -240.36595153808594, "loss": 0.6897, "rewards/accuracies": 0.5307692289352417, "rewards/chosen": -0.033847782760858536, "rewards/margins": 0.001607205718755722, "rewards/rejected": -0.03545498102903366, "step": 50 }, { "epoch": 0.39, "grad_norm": 24.02478451571149, "learning_rate": 3.8318133624280046e-07, "logits/chosen": -2.6142632961273193, "logits/rejected": -2.6310534477233887, "logps/chosen": -223.6808624267578, "logps/rejected": -255.2255401611328, "loss": 0.6903, "rewards/accuracies": 0.5423076748847961, "rewards/chosen": -0.10960451513528824, "rewards/margins": 0.02808019518852234, "rewards/rejected": -0.13768470287322998, "step": 60 }, { "epoch": 0.46, "grad_norm": 20.288409772533704, "learning_rate": 3.316028034595861e-07, "logits/chosen": -2.5127460956573486, "logits/rejected": -2.5037307739257812, "logps/chosen": -230.75833129882812, "logps/rejected": -256.0094909667969, "loss": 0.6832, "rewards/accuracies": 0.4961538314819336, "rewards/chosen": -0.2965443730354309, "rewards/margins": 0.03154058754444122, "rewards/rejected": -0.3280849754810333, "step": 70 }, { "epoch": 0.52, "grad_norm": 22.680428500041128, "learning_rate": 2.7575199021178855e-07, "logits/chosen": -2.5435211658477783, "logits/rejected": -2.5064070224761963, "logps/chosen": -251.8400421142578, "logps/rejected": -273.2138671875, "loss": 0.6778, "rewards/accuracies": 0.5538461804389954, "rewards/chosen": -0.28813430666923523, "rewards/margins": 0.06685086339712143, "rewards/rejected": -0.3549851179122925, "step": 80 }, { "epoch": 0.58, "grad_norm": 18.86897478499211, "learning_rate": 2.1855294234408068e-07, "logits/chosen": -2.5077600479125977, "logits/rejected": -2.503957986831665, "logps/chosen": -251.26556396484375, "logps/rejected": -242.37310791015625, "loss": 0.6842, "rewards/accuracies": 0.5307692289352417, "rewards/chosen": -0.24721869826316833, "rewards/margins": 0.009251880459487438, "rewards/rejected": -0.25647059082984924, "step": 90 }, { "epoch": 0.65, "grad_norm": 20.56846597020704, "learning_rate": 1.6300029195778453e-07, "logits/chosen": -2.492692708969116, "logits/rejected": -2.3162038326263428, "logps/chosen": -253.9385223388672, "logps/rejected": -268.18414306640625, "loss": 0.6821, "rewards/accuracies": 0.5538461804389954, "rewards/chosen": -0.4339679181575775, "rewards/margins": 0.06273461133241653, "rewards/rejected": -0.49670252203941345, "step": 100 }, { "epoch": 0.71, "grad_norm": 29.852940409061166, "learning_rate": 1.1200247470632392e-07, "logits/chosen": -2.409569501876831, "logits/rejected": -2.3867456912994385, "logps/chosen": -297.3172607421875, "logps/rejected": -285.6565246582031, "loss": 0.6823, "rewards/accuracies": 0.4923076927661896, "rewards/chosen": -0.73952317237854, "rewards/margins": 0.020068956539034843, "rewards/rejected": -0.7595921754837036, "step": 110 }, { "epoch": 0.78, "grad_norm": 20.239291104683563, "learning_rate": 6.822945986946385e-08, "logits/chosen": -2.3316874504089355, "logits/rejected": -2.2675819396972656, "logps/chosen": -275.67767333984375, "logps/rejected": -291.9703674316406, "loss": 0.6858, "rewards/accuracies": 0.5384615659713745, "rewards/chosen": -0.5500468611717224, "rewards/margins": 0.06261468678712845, "rewards/rejected": -0.6126615405082703, "step": 120 }, { "epoch": 0.84, "grad_norm": 24.27835409910822, "learning_rate": 3.397296523427806e-08, "logits/chosen": -2.4436075687408447, "logits/rejected": -2.309699296951294, "logps/chosen": -233.41680908203125, "logps/rejected": -262.0289611816406, "loss": 0.6782, "rewards/accuracies": 0.5807692408561707, "rewards/chosen": -0.36947229504585266, "rewards/margins": 0.0935312956571579, "rewards/rejected": -0.46300360560417175, "step": 130 }, { "epoch": 0.91, "grad_norm": 24.790097756991855, "learning_rate": 1.1026475173977978e-08, "logits/chosen": -2.3928282260894775, "logits/rejected": -2.382559299468994, "logps/chosen": -244.34410095214844, "logps/rejected": -248.87876892089844, "loss": 0.6769, "rewards/accuracies": 0.5615384578704834, "rewards/chosen": -0.27959996461868286, "rewards/margins": 0.04778864234685898, "rewards/rejected": -0.32738858461380005, "step": 140 }, { "epoch": 0.97, "grad_norm": 24.029709323211232, "learning_rate": 5.913435276374834e-10, "logits/chosen": -2.4940154552459717, "logits/rejected": -2.3921005725860596, "logps/chosen": -241.1715545654297, "logps/rejected": -262.944580078125, "loss": 0.6737, "rewards/accuracies": 0.6192307472229004, "rewards/chosen": -0.2436859905719757, "rewards/margins": 0.09733694791793823, "rewards/rejected": -0.34102290868759155, "step": 150 }, { "epoch": 0.99, "step": 153, "total_flos": 0.0, "train_loss": 0.6839175197034101, "train_runtime": 39806.5992, "train_samples_per_second": 0.502, "train_steps_per_second": 0.004 } ], "logging_steps": 10, "max_steps": 153, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }