{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984301412872841, "eval_steps": 500, "global_step": 159, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006279434850863423, "grad_norm": 16.698454749053152, "learning_rate": 1.875e-08, "logits/chosen": 0.13163629174232483, "logits/rejected": 0.7037353515625, "logps/chosen": -296.6709289550781, "logps/pi_response": -123.40753173828125, "logps/ref_response": -123.40753173828125, "logps/rejected": -325.5771484375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.06279434850863422, "grad_norm": 24.513917430946538, "learning_rate": 1.875e-07, "logits/chosen": 0.6406950354576111, "logits/rejected": 0.8759365081787109, "logps/chosen": -260.0070495605469, "logps/pi_response": -114.28534698486328, "logps/ref_response": -114.47286224365234, "logps/rejected": -385.5276184082031, "loss": 0.6924, "rewards/accuracies": 0.4097222089767456, "rewards/chosen": 0.00021976388234179467, "rewards/margins": -2.7502783268573694e-05, "rewards/rejected": 0.000247266492806375, "step": 10 }, { "epoch": 0.12558869701726844, "grad_norm": 20.98441253721937, "learning_rate": 2.9942119880575817e-07, "logits/chosen": 0.5097376704216003, "logits/rejected": 0.8540347814559937, "logps/chosen": -268.75433349609375, "logps/pi_response": -118.2417221069336, "logps/ref_response": -118.39286041259766, "logps/rejected": -400.633544921875, "loss": 0.6749, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.03052676096558571, "rewards/margins": 0.040752165019512177, "rewards/rejected": -0.07127892971038818, "step": 20 }, { "epoch": 0.18838304552590268, "grad_norm": 13.999591521451507, "learning_rate": 2.929608750821129e-07, "logits/chosen": 0.5199416875839233, "logits/rejected": 0.992133617401123, "logps/chosen": -295.43292236328125, "logps/pi_response": -119.1610336303711, "logps/ref_response": -118.39522552490234, "logps/rejected": -421.61041259765625, "loss": 0.6174, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.17318478226661682, "rewards/margins": 0.18207214772701263, "rewards/rejected": -0.35525694489479065, "step": 30 }, { "epoch": 0.25117739403453687, "grad_norm": 14.705995187750815, "learning_rate": 2.7962832564252725e-07, "logits/chosen": 0.5350409746170044, "logits/rejected": 0.9762212634086609, "logps/chosen": -292.2400207519531, "logps/pi_response": -122.87149810791016, "logps/ref_response": -120.0985336303711, "logps/rejected": -484.0975646972656, "loss": 0.5739, "rewards/accuracies": 0.75, "rewards/chosen": -0.30772843956947327, "rewards/margins": 0.45096302032470703, "rewards/rejected": -0.7586914300918579, "step": 40 }, { "epoch": 0.3139717425431711, "grad_norm": 20.925472606748368, "learning_rate": 2.6006445513357056e-07, "logits/chosen": 0.6897233724594116, "logits/rejected": 1.0123343467712402, "logps/chosen": -320.56976318359375, "logps/pi_response": -124.61143493652344, "logps/ref_response": -115.71650695800781, "logps/rejected": -523.7175903320312, "loss": 0.5629, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.6995627284049988, "rewards/margins": 0.6693333387374878, "rewards/rejected": -1.3688960075378418, "step": 50 }, { "epoch": 0.37676609105180536, "grad_norm": 13.008934683020064, "learning_rate": 2.3520971200967334e-07, "logits/chosen": 0.6137208938598633, "logits/rejected": 1.0412781238555908, "logps/chosen": -332.25738525390625, "logps/pi_response": -126.09577941894531, "logps/ref_response": -118.1528549194336, "logps/rejected": -495.4088439941406, "loss": 0.555, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5905637145042419, "rewards/margins": 0.6077089309692383, "rewards/rejected": -1.198272705078125, "step": 60 }, { "epoch": 0.43956043956043955, "grad_norm": 12.777487677582881, "learning_rate": 2.0625888054143427e-07, "logits/chosen": 0.593045175075531, "logits/rejected": 0.9839151501655579, "logps/chosen": -273.0267333984375, "logps/pi_response": -126.1861801147461, "logps/ref_response": -120.62638854980469, "logps/rejected": -531.7461547851562, "loss": 0.5431, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.3775605261325836, "rewards/margins": 0.8122557401657104, "rewards/rejected": -1.1898162364959717, "step": 70 }, { "epoch": 0.5023547880690737, "grad_norm": 10.709251992827037, "learning_rate": 1.7460364672965327e-07, "logits/chosen": 0.6686810255050659, "logits/rejected": 1.0736128091812134, "logps/chosen": -280.6498107910156, "logps/pi_response": -112.1661376953125, "logps/ref_response": -106.67897033691406, "logps/rejected": -489.26556396484375, "loss": 0.5321, "rewards/accuracies": 0.75, "rewards/chosen": -0.4469337463378906, "rewards/margins": 0.6373990774154663, "rewards/rejected": -1.084332823753357, "step": 80 }, { "epoch": 0.565149136577708, "grad_norm": 11.68215452300686, "learning_rate": 1.4176569902035086e-07, "logits/chosen": 0.6378764510154724, "logits/rejected": 1.0353127717971802, "logps/chosen": -339.03973388671875, "logps/pi_response": -119.71498107910156, "logps/ref_response": -111.9307861328125, "logps/rejected": -513.7333984375, "loss": 0.5119, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.5252664685249329, "rewards/margins": 0.7039340734481812, "rewards/rejected": -1.2292006015777588, "step": 90 }, { "epoch": 0.6279434850863422, "grad_norm": 11.96502204484806, "learning_rate": 1.0932357971453743e-07, "logits/chosen": 0.7395003437995911, "logits/rejected": 1.0328724384307861, "logps/chosen": -301.8004455566406, "logps/pi_response": -116.29144287109375, "logps/ref_response": -108.0909423828125, "logps/rejected": -524.360107421875, "loss": 0.547, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.5083015561103821, "rewards/margins": 0.7502217888832092, "rewards/rejected": -1.2585232257843018, "step": 100 }, { "epoch": 0.6907378335949764, "grad_norm": 11.40631298798362, "learning_rate": 7.883680337481599e-08, "logits/chosen": 0.7460795640945435, "logits/rejected": 0.9838323593139648, "logps/chosen": -305.3519592285156, "logps/pi_response": -125.8452377319336, "logps/ref_response": -117.07008361816406, "logps/rejected": -504.4964294433594, "loss": 0.5349, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.4922094941139221, "rewards/margins": 0.6827653646469116, "rewards/rejected": -1.174974799156189, "step": 110 }, { "epoch": 0.7535321821036107, "grad_norm": 10.671563097729658, "learning_rate": 5.177088990820725e-08, "logits/chosen": 0.5097354650497437, "logits/rejected": 0.8302543759346008, "logps/chosen": -327.8287353515625, "logps/pi_response": -134.05953979492188, "logps/ref_response": -125.61170959472656, "logps/rejected": -554.1288452148438, "loss": 0.524, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.5032998323440552, "rewards/margins": 0.7790510058403015, "rewards/rejected": -1.282350778579712, "step": 120 }, { "epoch": 0.8163265306122449, "grad_norm": 13.393410138993277, "learning_rate": 2.942691603548416e-08, "logits/chosen": 0.5484687089920044, "logits/rejected": 1.0104806423187256, "logps/chosen": -329.3583984375, "logps/pi_response": -133.8960723876953, "logps/ref_response": -126.83935546875, "logps/rejected": -530.1004028320312, "loss": 0.5048, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.4904448986053467, "rewards/margins": 0.7761750221252441, "rewards/rejected": -1.2666199207305908, "step": 130 }, { "epoch": 0.8791208791208791, "grad_norm": 12.267577779535525, "learning_rate": 1.2878971655412513e-08, "logits/chosen": 0.5720739364624023, "logits/rejected": 0.9325042963027954, "logps/chosen": -310.2190856933594, "logps/pi_response": -136.25198364257812, "logps/ref_response": -126.86582946777344, "logps/rejected": -564.576171875, "loss": 0.5164, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.481649786233902, "rewards/margins": 0.8537376523017883, "rewards/rejected": -1.3353874683380127, "step": 140 }, { "epoch": 0.9419152276295133, "grad_norm": 11.331362077198552, "learning_rate": 2.922527618666465e-09, "logits/chosen": 0.5811373591423035, "logits/rejected": 0.9567831158638, "logps/chosen": -304.1393737792969, "logps/pi_response": -123.76485443115234, "logps/ref_response": -114.90129089355469, "logps/rejected": -522.5949096679688, "loss": 0.5227, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.5134440064430237, "rewards/margins": 0.7166833281517029, "rewards/rejected": -1.2301273345947266, "step": 150 }, { "epoch": 0.9984301412872841, "step": 159, "total_flos": 0.0, "train_loss": 0.5592624436384477, "train_runtime": 4324.4895, "train_samples_per_second": 4.712, "train_steps_per_second": 0.037 } ], "logging_steps": 10, "max_steps": 159, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }