{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 313, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003194888178913738, "grad_norm": 106.83801953543872, "learning_rate": 5.3191489361702125e-09, "logits/chosen": -3.578125, "logits/rejected": -3.421875, "logps/chosen": -282.0, "logps/rejected": -290.0, "loss": 0.6914, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03194888178913738, "grad_norm": 98.60082387787162, "learning_rate": 5.3191489361702123e-08, "logits/chosen": -3.484375, "logits/rejected": -3.390625, "logps/chosen": -274.0, "logps/rejected": -278.0, "loss": 0.6868, "rewards/accuracies": 0.1388888955116272, "rewards/chosen": -0.00970458984375, "rewards/margins": 6.794929504394531e-06, "rewards/rejected": -0.009765625, "step": 10 }, { "epoch": 0.06389776357827476, "grad_norm": 116.57506460735225, "learning_rate": 1.0638297872340425e-07, "logits/chosen": -3.421875, "logits/rejected": -3.421875, "logps/chosen": -280.0, "logps/rejected": -266.0, "loss": 0.6537, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.03125, "rewards/margins": 0.12158203125, "rewards/rejected": -0.15234375, "step": 20 }, { "epoch": 0.09584664536741214, "grad_norm": 78.04564900745318, "learning_rate": 1.5957446808510638e-07, "logits/chosen": -3.34375, "logits/rejected": -3.484375, "logps/chosen": -276.0, "logps/rejected": -268.0, "loss": 0.6477, "rewards/accuracies": 0.625, "rewards/chosen": -0.1552734375, "rewards/margins": 0.1826171875, "rewards/rejected": -0.337890625, "step": 30 }, { "epoch": 0.12779552715654952, "grad_norm": 104.09831779598169, "learning_rate": 2.127659574468085e-07, "logits/chosen": -3.46875, "logits/rejected": -3.421875, "logps/chosen": -272.0, "logps/rejected": -276.0, "loss": 0.5849, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.1923828125, "rewards/margins": 0.341796875, "rewards/rejected": -0.53515625, "step": 40 }, { "epoch": 0.1597444089456869, "grad_norm": 83.06087118508516, "learning_rate": 2.659574468085106e-07, "logits/chosen": -3.375, "logits/rejected": -3.40625, "logps/chosen": -278.0, "logps/rejected": -274.0, "loss": 0.6062, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.34375, "rewards/margins": 0.244140625, "rewards/rejected": -0.5859375, "step": 50 }, { "epoch": 0.19169329073482427, "grad_norm": 85.2326419243345, "learning_rate": 3.1914893617021275e-07, "logits/chosen": -3.28125, "logits/rejected": -3.3125, "logps/chosen": -280.0, "logps/rejected": -276.0, "loss": 0.5718, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.314453125, "rewards/margins": 0.5546875, "rewards/rejected": -0.87109375, "step": 60 }, { "epoch": 0.22364217252396165, "grad_norm": 85.35363076925235, "learning_rate": 3.7234042553191484e-07, "logits/chosen": -3.28125, "logits/rejected": -3.296875, "logps/chosen": -278.0, "logps/rejected": -278.0, "loss": 0.5627, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.244140625, "rewards/margins": 0.451171875, "rewards/rejected": -0.6953125, "step": 70 }, { "epoch": 0.25559105431309903, "grad_norm": 80.13630527816748, "learning_rate": 4.25531914893617e-07, "logits/chosen": -3.328125, "logits/rejected": -3.296875, "logps/chosen": -280.0, "logps/rejected": -274.0, "loss": 0.5984, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.3203125, "rewards/margins": 0.546875, "rewards/rejected": -0.8671875, "step": 80 }, { "epoch": 0.28753993610223644, "grad_norm": 64.74916812090929, "learning_rate": 4.787234042553192e-07, "logits/chosen": -3.375, "logits/rejected": -3.296875, "logps/chosen": -274.0, "logps/rejected": -278.0, "loss": 0.5594, "rewards/accuracies": 0.6875, "rewards/chosen": -0.224609375, "rewards/margins": 0.69921875, "rewards/rejected": -0.921875, "step": 90 }, { "epoch": 0.3194888178913738, "grad_norm": 62.874343395319244, "learning_rate": 4.964497041420119e-07, "logits/chosen": -3.34375, "logits/rejected": -3.34375, "logps/chosen": -280.0, "logps/rejected": -268.0, "loss": 0.5807, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.3203125, "rewards/margins": 0.55078125, "rewards/rejected": -0.87109375, "step": 100 }, { "epoch": 0.3514376996805112, "grad_norm": 65.70150692167816, "learning_rate": 4.905325443786982e-07, "logits/chosen": -3.28125, "logits/rejected": -3.328125, "logps/chosen": -276.0, "logps/rejected": -286.0, "loss": 0.5878, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.1162109375, "rewards/margins": 0.75, "rewards/rejected": -0.8671875, "step": 110 }, { "epoch": 0.38338658146964855, "grad_norm": 66.45326771352913, "learning_rate": 4.846153846153846e-07, "logits/chosen": -3.296875, "logits/rejected": -3.375, "logps/chosen": -280.0, "logps/rejected": -276.0, "loss": 0.5633, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1328125, "rewards/margins": 0.67578125, "rewards/rejected": -0.80859375, "step": 120 }, { "epoch": 0.41533546325878595, "grad_norm": 83.00130651053203, "learning_rate": 4.78698224852071e-07, "logits/chosen": -3.3125, "logits/rejected": -3.296875, "logps/chosen": -282.0, "logps/rejected": -276.0, "loss": 0.5831, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.05224609375, "rewards/margins": 1.0625, "rewards/rejected": -1.0078125, "step": 130 }, { "epoch": 0.4472843450479233, "grad_norm": 65.14922163628061, "learning_rate": 4.727810650887574e-07, "logits/chosen": -3.328125, "logits/rejected": -3.34375, "logps/chosen": -276.0, "logps/rejected": -284.0, "loss": 0.5408, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.02001953125, "rewards/margins": 0.97265625, "rewards/rejected": -0.953125, "step": 140 }, { "epoch": 0.4792332268370607, "grad_norm": 69.5805163425697, "learning_rate": 4.668639053254438e-07, "logits/chosen": -3.3125, "logits/rejected": -3.328125, "logps/chosen": -266.0, "logps/rejected": -278.0, "loss": 0.5394, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.047607421875, "rewards/margins": 0.890625, "rewards/rejected": -0.84375, "step": 150 }, { "epoch": 0.5111821086261981, "grad_norm": 67.64968360037635, "learning_rate": 4.6094674556213014e-07, "logits/chosen": -3.328125, "logits/rejected": -3.171875, "logps/chosen": -270.0, "logps/rejected": -288.0, "loss": 0.6143, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.26171875, "rewards/margins": 0.83984375, "rewards/rejected": -1.1015625, "step": 160 }, { "epoch": 0.5431309904153354, "grad_norm": 75.9892459922825, "learning_rate": 4.5502958579881655e-07, "logits/chosen": -3.265625, "logits/rejected": -3.171875, "logps/chosen": -278.0, "logps/rejected": -290.0, "loss": 0.5603, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.44140625, "rewards/margins": 0.796875, "rewards/rejected": -1.234375, "step": 170 }, { "epoch": 0.5750798722044729, "grad_norm": 90.65062968036268, "learning_rate": 4.491124260355029e-07, "logits/chosen": -3.203125, "logits/rejected": -3.1875, "logps/chosen": -278.0, "logps/rejected": -282.0, "loss": 0.6629, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.306640625, "rewards/margins": 0.75, "rewards/rejected": -1.0546875, "step": 180 }, { "epoch": 0.6070287539936102, "grad_norm": 108.28762791772537, "learning_rate": 4.4319526627218936e-07, "logits/chosen": -3.234375, "logits/rejected": -3.28125, "logps/chosen": -280.0, "logps/rejected": -280.0, "loss": 0.5251, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.208984375, "rewards/margins": 1.0390625, "rewards/rejected": -1.25, "step": 190 }, { "epoch": 0.6389776357827476, "grad_norm": 78.98605565885474, "learning_rate": 4.372781065088757e-07, "logits/chosen": -3.3125, "logits/rejected": -3.171875, "logps/chosen": -278.0, "logps/rejected": -288.0, "loss": 0.5118, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.296875, "rewards/margins": 1.015625, "rewards/rejected": -1.3125, "step": 200 }, { "epoch": 0.670926517571885, "grad_norm": 77.91455442318774, "learning_rate": 4.313609467455621e-07, "logits/chosen": -3.234375, "logits/rejected": -3.1875, "logps/chosen": -272.0, "logps/rejected": -288.0, "loss": 0.5157, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.14453125, "rewards/margins": 1.015625, "rewards/rejected": -1.15625, "step": 210 }, { "epoch": 0.7028753993610224, "grad_norm": 63.1172852867484, "learning_rate": 4.2544378698224847e-07, "logits/chosen": -3.1875, "logits/rejected": -3.28125, "logps/chosen": -268.0, "logps/rejected": -280.0, "loss": 0.5155, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0301513671875, "rewards/margins": 0.9375, "rewards/rejected": -0.96875, "step": 220 }, { "epoch": 0.7348242811501597, "grad_norm": 75.1179190067346, "learning_rate": 4.195266272189349e-07, "logits/chosen": -3.21875, "logits/rejected": -3.3125, "logps/chosen": -280.0, "logps/rejected": -280.0, "loss": 0.5115, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.072265625, "rewards/margins": 1.1953125, "rewards/rejected": -1.265625, "step": 230 }, { "epoch": 0.7667731629392971, "grad_norm": 68.71250790991336, "learning_rate": 4.1360946745562133e-07, "logits/chosen": -3.265625, "logits/rejected": -3.203125, "logps/chosen": -272.0, "logps/rejected": -290.0, "loss": 0.4746, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.029052734375, "rewards/margins": 1.6796875, "rewards/rejected": -1.6484375, "step": 240 }, { "epoch": 0.7987220447284346, "grad_norm": 55.46878046949881, "learning_rate": 4.076923076923077e-07, "logits/chosen": -3.265625, "logits/rejected": -3.328125, "logps/chosen": -280.0, "logps/rejected": -278.0, "loss": 0.4711, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.150390625, "rewards/margins": 1.140625, "rewards/rejected": -1.2890625, "step": 250 }, { "epoch": 0.8306709265175719, "grad_norm": 60.69436763385765, "learning_rate": 4.017751479289941e-07, "logits/chosen": -3.25, "logits/rejected": -3.28125, "logps/chosen": -274.0, "logps/rejected": -286.0, "loss": 0.5292, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.1650390625, "rewards/margins": 1.25, "rewards/rejected": -1.4140625, "step": 260 }, { "epoch": 0.8626198083067093, "grad_norm": 61.755235473426524, "learning_rate": 3.9585798816568044e-07, "logits/chosen": -3.28125, "logits/rejected": -3.3125, "logps/chosen": -274.0, "logps/rejected": -282.0, "loss": 0.4873, "rewards/accuracies": 0.8125, "rewards/chosen": -0.044921875, "rewards/margins": 1.4375, "rewards/rejected": -1.484375, "step": 270 }, { "epoch": 0.8945686900958466, "grad_norm": 64.8584152127513, "learning_rate": 3.8994082840236685e-07, "logits/chosen": -3.328125, "logits/rejected": -3.265625, "logps/chosen": -284.0, "logps/rejected": -280.0, "loss": 0.5342, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.1640625, "rewards/margins": 0.87109375, "rewards/rejected": -1.03125, "step": 280 }, { "epoch": 0.9265175718849841, "grad_norm": 71.13313796916569, "learning_rate": 3.840236686390532e-07, "logits/chosen": -3.296875, "logits/rejected": -3.28125, "logps/chosen": -268.0, "logps/rejected": -284.0, "loss": 0.455, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.0546875, "rewards/margins": 1.65625, "rewards/rejected": -1.7109375, "step": 290 }, { "epoch": 0.9584664536741214, "grad_norm": 147.24265223128356, "learning_rate": 3.7810650887573966e-07, "logits/chosen": -3.296875, "logits/rejected": -3.234375, "logps/chosen": -284.0, "logps/rejected": -284.0, "loss": 0.5612, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.27734375, "rewards/margins": 1.140625, "rewards/rejected": -1.421875, "step": 300 }, { "epoch": 0.9904153354632588, "grad_norm": 57.83608472539185, "learning_rate": 3.72189349112426e-07, "logits/chosen": -3.25, "logits/rejected": -3.25, "logps/chosen": -274.0, "logps/rejected": -282.0, "loss": 0.4624, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.05712890625, "rewards/margins": 1.40625, "rewards/rejected": -1.3515625, "step": 310 }, { "epoch": 1.0, "eval_logits/chosen": -3.3125, "eval_logits/rejected": -3.34375, "eval_logps/chosen": -272.0, "eval_logps/rejected": -276.0, "eval_loss": 0.5753124952316284, "eval_rewards/accuracies": 0.75, "eval_rewards/chosen": -0.039306640625, "eval_rewards/margins": 0.703125, "eval_rewards/rejected": -0.7421875, "eval_runtime": 12.4952, "eval_samples_per_second": 16.006, "eval_steps_per_second": 0.56, "step": 313 } ], "logging_steps": 10, "max_steps": 939, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }