{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9905956112852664, "eval_steps": 500, "global_step": 79, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.012539184952978056, "grad_norm": 9.394233371670921, "learning_rate": 6.25e-08, "logits/chosen": -2.9077322483062744, "logits/rejected": -2.8318910598754883, "logps/chosen": -351.8885498046875, "logps/pi_response": -76.32845306396484, "logps/ref_response": -76.32845306396484, "logps/rejected": -169.29762268066406, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.12539184952978055, "grad_norm": 8.360621703019095, "learning_rate": 4.990217055187362e-07, "logits/chosen": -2.7861883640289307, "logits/rejected": -2.7629709243774414, "logps/chosen": -234.14410400390625, "logps/pi_response": -70.11377716064453, "logps/ref_response": -70.02328491210938, "logps/rejected": -167.95562744140625, "loss": 0.6904, "rewards/accuracies": 0.4861111044883728, "rewards/chosen": 0.006910877302289009, "rewards/margins": 0.0037490064278244972, "rewards/rejected": 0.0031618711072951555, "step": 10 }, { "epoch": 0.2507836990595611, "grad_norm": 6.69893642100669, "learning_rate": 4.655786431300069e-07, "logits/chosen": -2.741486072540283, "logits/rejected": -2.690483331680298, "logps/chosen": -245.74948120117188, "logps/pi_response": -75.77564239501953, "logps/ref_response": -67.40553283691406, "logps/rejected": -170.17709350585938, "loss": 0.6608, "rewards/accuracies": 0.715624988079071, "rewards/chosen": 0.04696048051118851, "rewards/margins": 0.0809074118733406, "rewards/rejected": -0.0339469388127327, "step": 20 }, { "epoch": 0.3761755485893417, "grad_norm": 6.686646881580658, "learning_rate": 3.9061232191019517e-07, "logits/chosen": -2.6538798809051514, "logits/rejected": -2.620079517364502, "logps/chosen": -233.30227661132812, "logps/pi_response": -102.32685852050781, "logps/ref_response": -65.888427734375, "logps/rejected": -186.7948455810547, "loss": 0.6253, "rewards/accuracies": 0.734375, "rewards/chosen": -0.09834689646959305, "rewards/margins": 0.18449249863624573, "rewards/rejected": -0.2828393876552582, "step": 30 }, { "epoch": 0.5015673981191222, "grad_norm": 7.858132087597293, "learning_rate": 2.8856223324132555e-07, "logits/chosen": -2.6627352237701416, "logits/rejected": -2.6404871940612793, "logps/chosen": -254.8513641357422, "logps/pi_response": -137.86434936523438, "logps/ref_response": -70.97199249267578, "logps/rejected": -225.2699737548828, "loss": 0.5782, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.2209537923336029, "rewards/margins": 0.34001588821411133, "rewards/rejected": -0.5609697103500366, "step": 40 }, { "epoch": 0.6269592476489029, "grad_norm": 10.578475411451166, "learning_rate": 1.7908455541642582e-07, "logits/chosen": -2.65732479095459, "logits/rejected": -2.624019145965576, "logps/chosen": -284.77923583984375, "logps/pi_response": -154.8039093017578, "logps/ref_response": -69.12784576416016, "logps/rejected": -249.51736450195312, "loss": 0.5471, "rewards/accuracies": 0.796875, "rewards/chosen": -0.34653979539871216, "rewards/margins": 0.46229037642478943, "rewards/rejected": -0.8088302612304688, "step": 50 }, { "epoch": 0.7523510971786834, "grad_norm": 10.61093460315722, "learning_rate": 8.32661172908373e-08, "logits/chosen": -2.6534972190856934, "logits/rejected": -2.6186232566833496, "logps/chosen": -255.72732543945312, "logps/pi_response": -160.81385803222656, "logps/ref_response": -62.94016647338867, "logps/rejected": -265.20989990234375, "loss": 0.5272, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.4366573691368103, "rewards/margins": 0.5785155892372131, "rewards/rejected": -1.0151729583740234, "step": 60 }, { "epoch": 0.877742946708464, "grad_norm": 10.972518174316015, "learning_rate": 1.956279997278043e-08, "logits/chosen": -2.6517553329467773, "logits/rejected": -2.6147875785827637, "logps/chosen": -301.74700927734375, "logps/pi_response": -181.74948120117188, "logps/ref_response": -70.71024322509766, "logps/rejected": -276.01605224609375, "loss": 0.5138, "rewards/accuracies": 0.778124988079071, "rewards/chosen": -0.4930971562862396, "rewards/margins": 0.6322487592697144, "rewards/rejected": -1.1253459453582764, "step": 70 }, { "epoch": 0.9905956112852664, "step": 79, "total_flos": 0.0, "train_loss": 0.5845865599716766, "train_runtime": 3518.9349, "train_samples_per_second": 5.791, "train_steps_per_second": 0.022 } ], "logging_steps": 10, "max_steps": 79, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }