{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9992429977289932, "eval_steps": 500, "global_step": 165, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "eta": 0.0010000000474974513, "grad_norm": 19.93140330088016, "learning_rate": 2.941176470588235e-08, "logits/chosen": -2.397038221359253, "logits/rejected": -2.213353395462036, "logps/chosen": -180.87660217285156, "logps/pi_response": -160.3468780517578, "logps/ref_response": -160.3468780517578, "logps/rejected": -188.15975952148438, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.06, "eta": 0.0010000000474974513, "grad_norm": 19.32986244730549, "learning_rate": 2.941176470588235e-07, "logits/chosen": -2.334710121154785, "logits/rejected": -2.268401861190796, "logps/chosen": -204.74749755859375, "logps/pi_response": -172.67669677734375, "logps/ref_response": -171.70980834960938, "logps/rejected": -210.5995330810547, "loss": 0.6931, "rewards/accuracies": 0.4930555522441864, "rewards/chosen": -0.009038920514285564, "rewards/margins": 0.0021236613392829895, "rewards/rejected": -0.011162581853568554, "step": 10 }, { "epoch": 0.12, "eta": 0.0010000000474974513, "grad_norm": 38.14550506704795, "learning_rate": 4.994932636402031e-07, "logits/chosen": -2.1406025886535645, "logits/rejected": -2.2008185386657715, "logps/chosen": -219.163330078125, "logps/pi_response": -189.20216369628906, "logps/ref_response": -168.5287628173828, "logps/rejected": -222.94271850585938, "loss": 0.6888, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.19716469943523407, "rewards/margins": 0.0258896853774786, "rewards/rejected": -0.22305437922477722, "step": 20 }, { "epoch": 0.18, "eta": 0.0010000000474974513, "grad_norm": 17.675340945525093, "learning_rate": 4.905416503522123e-07, "logits/chosen": -2.074061870574951, "logits/rejected": -2.080552101135254, "logps/chosen": -198.05816650390625, "logps/pi_response": -164.6136932373047, "logps/ref_response": -160.591552734375, "logps/rejected": -201.87464904785156, "loss": 0.6934, "rewards/accuracies": 0.515625, "rewards/chosen": -0.06703463941812515, "rewards/margins": 0.023267237469553947, "rewards/rejected": -0.09030187875032425, "step": 30 }, { "epoch": 0.24, "eta": 0.0010000000474974513, "grad_norm": 20.77937618350303, "learning_rate": 4.707922373336523e-07, "logits/chosen": -2.1749043464660645, "logits/rejected": -2.1247994899749756, "logps/chosen": -209.81790161132812, "logps/pi_response": -183.0597381591797, "logps/ref_response": -177.06118774414062, "logps/rejected": -218.8212890625, "loss": 0.6946, "rewards/accuracies": 0.53125, "rewards/chosen": -0.03284044936299324, "rewards/margins": 0.03580809757113457, "rewards/rejected": -0.06864854693412781, "step": 40 }, { "epoch": 0.3, "eta": 0.0010000000474974513, "grad_norm": 21.485794770779655, "learning_rate": 4.4113156629677313e-07, "logits/chosen": -2.161895990371704, "logits/rejected": -2.047234058380127, "logps/chosen": -253.85107421875, "logps/pi_response": -220.5885467529297, "logps/ref_response": -183.68719482421875, "logps/rejected": -255.03494262695312, "loss": 0.6961, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.3808112144470215, "rewards/margins": 0.016062479466199875, "rewards/rejected": -0.3968736529350281, "step": 50 }, { "epoch": 0.36, "eta": 0.0010000000474974513, "grad_norm": 20.861550744842482, "learning_rate": 4.0289109058972283e-07, "logits/chosen": -2.0308499336242676, "logits/rejected": -2.0642848014831543, "logps/chosen": -249.21365356445312, "logps/pi_response": -218.06405639648438, "logps/ref_response": -171.6521453857422, "logps/rejected": -256.102294921875, "loss": 0.6918, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.45084279775619507, "rewards/margins": 0.007402978837490082, "rewards/rejected": -0.45824581384658813, "step": 60 }, { "epoch": 0.42, "eta": 0.0010000000474974513, "grad_norm": 28.00866705635057, "learning_rate": 3.577874068920446e-07, "logits/chosen": -1.9535369873046875, "logits/rejected": -1.9607006311416626, "logps/chosen": -277.5329284667969, "logps/pi_response": -250.90200805664062, "logps/ref_response": -169.15725708007812, "logps/rejected": -291.10015869140625, "loss": 0.6856, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.7873853445053101, "rewards/margins": 0.044238921254873276, "rewards/rejected": -0.8316243290901184, "step": 70 }, { "epoch": 0.48, "eta": 0.0010000000474974513, "grad_norm": 30.93214776345345, "learning_rate": 3.078451980100854e-07, "logits/chosen": -1.9619076251983643, "logits/rejected": -1.8502511978149414, "logps/chosen": -281.6351318359375, "logps/pi_response": -248.10684204101562, "logps/ref_response": -168.71682739257812, "logps/rejected": -294.75604248046875, "loss": 0.6782, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.7786952257156372, "rewards/margins": 0.08077356964349747, "rewards/rejected": -0.8594688177108765, "step": 80 }, { "epoch": 0.55, "eta": 0.0010000000474974513, "grad_norm": 25.352348701139025, "learning_rate": 2.553063458334059e-07, "logits/chosen": -1.952636957168579, "logits/rejected": -2.0428998470306396, "logps/chosen": -253.1057586669922, "logps/pi_response": -229.44277954101562, "logps/ref_response": -163.27528381347656, "logps/rejected": -273.68780517578125, "loss": 0.6755, "rewards/accuracies": 0.546875, "rewards/chosen": -0.6530182361602783, "rewards/margins": 0.06835106015205383, "rewards/rejected": -0.7213693261146545, "step": 90 }, { "epoch": 0.61, "eta": 0.0010000000474974513, "grad_norm": 30.010292143430778, "learning_rate": 2.0252929432814287e-07, "logits/chosen": -2.0059685707092285, "logits/rejected": -1.9757779836654663, "logps/chosen": -277.2453308105469, "logps/pi_response": -250.5970458984375, "logps/ref_response": -177.19424438476562, "logps/rejected": -285.8126525878906, "loss": 0.6861, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.7217534780502319, "rewards/margins": 0.028179144486784935, "rewards/rejected": -0.7499326467514038, "step": 100 }, { "epoch": 0.67, "eta": 0.0010000000474974513, "grad_norm": 24.160814982479724, "learning_rate": 1.5188318011445906e-07, "logits/chosen": -1.880448579788208, "logits/rejected": -1.9354912042617798, "logps/chosen": -272.9020080566406, "logps/pi_response": -249.7644805908203, "logps/ref_response": -170.73162841796875, "logps/rejected": -296.87628173828125, "loss": 0.6738, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.7538261413574219, "rewards/margins": 0.10054464638233185, "rewards/rejected": -0.8543707132339478, "step": 110 }, { "epoch": 0.73, "eta": 0.0010000000474974513, "grad_norm": 21.79427862950612, "learning_rate": 1.0564148305586295e-07, "logits/chosen": -1.8592274188995361, "logits/rejected": -1.8245065212249756, "logps/chosen": -286.49853515625, "logps/pi_response": -259.8140869140625, "logps/ref_response": -175.83364868164062, "logps/rejected": -298.8387756347656, "loss": 0.6773, "rewards/accuracies": 0.546875, "rewards/chosen": -0.8258917927742004, "rewards/margins": 0.05054790526628494, "rewards/rejected": -0.8764396905899048, "step": 120 }, { "epoch": 0.79, "eta": 0.0010000000474974513, "grad_norm": 19.644675719528685, "learning_rate": 6.587997083462196e-08, "logits/chosen": -2.0082201957702637, "logits/rejected": -1.852614164352417, "logps/chosen": -262.72113037109375, "logps/pi_response": -234.92135620117188, "logps/ref_response": -170.3947296142578, "logps/rejected": -271.8548278808594, "loss": 0.6733, "rewards/accuracies": 0.546875, "rewards/chosen": -0.6122658848762512, "rewards/margins": 0.0654686689376831, "rewards/rejected": -0.6777344942092896, "step": 130 }, { "epoch": 0.85, "eta": 0.0010000000474974513, "grad_norm": 23.231586788018557, "learning_rate": 3.438351873250492e-08, "logits/chosen": -2.007563829421997, "logits/rejected": -1.9652036428451538, "logps/chosen": -268.14642333984375, "logps/pi_response": -241.33349609375, "logps/ref_response": -177.7176971435547, "logps/rejected": -281.2812805175781, "loss": 0.6679, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.6138588190078735, "rewards/margins": 0.07413015514612198, "rewards/rejected": -0.6879889369010925, "step": 140 }, { "epoch": 0.91, "eta": 0.0010000000474974513, "grad_norm": 22.182591656592486, "learning_rate": 1.256598743236703e-08, "logits/chosen": -1.9013135433197021, "logits/rejected": -1.9633811712265015, "logps/chosen": -258.21563720703125, "logps/pi_response": -229.17837524414062, "logps/ref_response": -161.02003479003906, "logps/rejected": -270.0240478515625, "loss": 0.6754, "rewards/accuracies": 0.5718749761581421, "rewards/chosen": -0.6737670302391052, "rewards/margins": 0.05831047147512436, "rewards/rejected": -0.7320775985717773, "step": 150 }, { "epoch": 0.97, "eta": 0.0010000000474974513, "grad_norm": 28.927536436184628, "learning_rate": 1.406755487774386e-09, "logits/chosen": -1.9876502752304077, "logits/rejected": -1.8038742542266846, "logps/chosen": -269.11968994140625, "logps/pi_response": -239.73095703125, "logps/ref_response": -171.18202209472656, "logps/rejected": -278.70245361328125, "loss": 0.6711, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.6629125475883484, "rewards/margins": 0.0806988924741745, "rewards/rejected": -0.7436113953590393, "step": 160 }, { "epoch": 1.0, "step": 165, "total_flos": 0.0, "train_loss": 0.6822745196747058, "train_runtime": 33626.4283, "train_samples_per_second": 0.629, "train_steps_per_second": 0.005 } ], "logging_steps": 10, "max_steps": 165, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }