{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984, "eval_steps": 500, "global_step": 156, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "eta": 0.0010000000474974513, "grad_norm": 21.712186498887842, "learning_rate": 3.125e-08, "logits/chosen": -1.4551665782928467, "logits/rejected": -1.606083869934082, "logps/chosen": -144.822265625, "logps/pi_response": -243.71868896484375, "logps/ref_response": -243.71868896484375, "logps/rejected": -162.54443359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.06, "eta": 0.0010000000474974513, "grad_norm": 15.364437403667353, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -1.8610674142837524, "logits/rejected": -1.8439589738845825, "logps/chosen": -181.88343811035156, "logps/pi_response": -265.68951416015625, "logps/ref_response": -263.9686584472656, "logps/rejected": -183.3703155517578, "loss": 0.693, "rewards/accuracies": 0.4652777910232544, "rewards/chosen": -0.005265166517347097, "rewards/margins": -0.0010041914647445083, "rewards/rejected": -0.004260974936187267, "step": 10 }, { "epoch": 0.13, "eta": 0.0010000000474974513, "grad_norm": 18.345421633966485, "learning_rate": 4.989935734988097e-07, "logits/chosen": -1.6329383850097656, "logits/rejected": -1.6391578912734985, "logps/chosen": -178.48463439941406, "logps/pi_response": -288.0892639160156, "logps/ref_response": -259.4861755371094, "logps/rejected": -181.0362091064453, "loss": 0.6907, "rewards/accuracies": 0.53125, "rewards/chosen": -0.13044723868370056, "rewards/margins": 0.01964881829917431, "rewards/rejected": -0.1500960886478424, "step": 20 }, { "epoch": 0.19, "eta": 0.0010000000474974513, "grad_norm": 17.13627597968646, "learning_rate": 4.877641290737883e-07, "logits/chosen": -1.431341290473938, "logits/rejected": -1.421644687652588, "logps/chosen": -222.0276641845703, "logps/pi_response": -336.2035827636719, "logps/ref_response": -273.8411865234375, "logps/rejected": -218.7247314453125, "loss": 0.6957, "rewards/accuracies": 0.5406249761581421, "rewards/chosen": -0.4222361147403717, "rewards/margins": 0.015909332782030106, "rewards/rejected": -0.43814539909362793, "step": 30 }, { "epoch": 0.26, "eta": 0.0010000000474974513, "grad_norm": 14.253311206515136, "learning_rate": 4.646121984004665e-07, "logits/chosen": -1.4934971332550049, "logits/rejected": -1.447249412536621, "logps/chosen": -192.37478637695312, "logps/pi_response": -272.644775390625, "logps/ref_response": -250.92996215820312, "logps/rejected": -193.36312866210938, "loss": 0.6915, "rewards/accuracies": 0.534375011920929, "rewards/chosen": -0.19741061329841614, "rewards/margins": 0.010645559057593346, "rewards/rejected": -0.20805618166923523, "step": 40 }, { "epoch": 0.32, "eta": 0.0010000000474974513, "grad_norm": 16.796841718291038, "learning_rate": 4.3069871595684787e-07, "logits/chosen": -1.7099899053573608, "logits/rejected": -1.7535244226455688, "logps/chosen": -194.2742462158203, "logps/pi_response": -277.76318359375, "logps/ref_response": -266.2118835449219, "logps/rejected": -201.6314239501953, "loss": 0.6895, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.14295700192451477, "rewards/margins": 0.017321351915597916, "rewards/rejected": -0.1602783501148224, "step": 50 }, { "epoch": 0.38, "eta": 0.0010000000474974513, "grad_norm": 14.319074470267418, "learning_rate": 3.877242453630256e-07, "logits/chosen": -1.2345731258392334, "logits/rejected": -1.2892169952392578, "logps/chosen": -206.3093719482422, "logps/pi_response": -289.80902099609375, "logps/ref_response": -256.5821533203125, "logps/rejected": -212.19924926757812, "loss": 0.6896, "rewards/accuracies": 0.503125011920929, "rewards/chosen": -0.30671426653862, "rewards/margins": 0.010099029168486595, "rewards/rejected": -0.31681329011917114, "step": 60 }, { "epoch": 0.45, "eta": 0.0010000000474974513, "grad_norm": 24.50253677275901, "learning_rate": 3.378437060203357e-07, "logits/chosen": -1.03169584274292, "logits/rejected": -1.021615743637085, "logps/chosen": -221.443359375, "logps/pi_response": -319.42242431640625, "logps/ref_response": -262.21759033203125, "logps/rejected": -225.0926971435547, "loss": 0.6847, "rewards/accuracies": 0.534375011920929, "rewards/chosen": -0.44103240966796875, "rewards/margins": 0.028266970068216324, "rewards/rejected": -0.4692993760108948, "step": 70 }, { "epoch": 0.51, "eta": 0.0010000000474974513, "grad_norm": 19.526479759830394, "learning_rate": 2.8355831645441387e-07, "logits/chosen": -0.8754630088806152, "logits/rejected": -0.9458340406417847, "logps/chosen": -225.5301971435547, "logps/pi_response": -346.0572204589844, "logps/ref_response": -269.8856201171875, "logps/rejected": -233.25045776367188, "loss": 0.689, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.513852596282959, "rewards/margins": 0.01622053235769272, "rewards/rejected": -0.5300731658935547, "step": 80 }, { "epoch": 0.58, "eta": 0.0010000000474974513, "grad_norm": 21.868311725000435, "learning_rate": 2.2759017277414164e-07, "logits/chosen": -0.814942479133606, "logits/rejected": -0.8241022825241089, "logps/chosen": -234.712646484375, "logps/pi_response": -357.76568603515625, "logps/ref_response": -260.8216552734375, "logps/rejected": -250.9972686767578, "loss": 0.6791, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": -0.6007817983627319, "rewards/margins": 0.07638835906982422, "rewards/rejected": -0.6771702170372009, "step": 90 }, { "epoch": 0.64, "eta": 0.0010000000474974513, "grad_norm": 20.43299106630643, "learning_rate": 1.7274575140626315e-07, "logits/chosen": -0.897548496723175, "logits/rejected": -0.932096004486084, "logps/chosen": -236.4069061279297, "logps/pi_response": -367.8551940917969, "logps/ref_response": -274.1803283691406, "logps/rejected": -245.63119506835938, "loss": 0.6895, "rewards/accuracies": 0.565625011920929, "rewards/chosen": -0.5783185958862305, "rewards/margins": 0.0737391859292984, "rewards/rejected": -0.6520577669143677, "step": 100 }, { "epoch": 0.7, "eta": 0.0010000000474974513, "grad_norm": 20.07983919936344, "learning_rate": 1.2177518064852348e-07, "logits/chosen": -0.7010077834129333, "logits/rejected": -0.7780792713165283, "logps/chosen": -239.2875518798828, "logps/pi_response": -375.39691162109375, "logps/ref_response": -266.8659362792969, "logps/rejected": -249.2735595703125, "loss": 0.6817, "rewards/accuracies": 0.578125, "rewards/chosen": -0.6955354809761047, "rewards/margins": 0.06171605736017227, "rewards/rejected": -0.7572515606880188, "step": 110 }, { "epoch": 0.77, "eta": 0.0010000000474974513, "grad_norm": 18.440065783036147, "learning_rate": 7.723433775328384e-08, "logits/chosen": -0.7951020002365112, "logits/rejected": -0.8204092979431152, "logps/chosen": -261.5052795410156, "logps/pi_response": -384.1082458496094, "logps/ref_response": -273.2558288574219, "logps/rejected": -263.2559509277344, "loss": 0.678, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.7765410542488098, "rewards/margins": 0.03736639395356178, "rewards/rejected": -0.8139075040817261, "step": 120 }, { "epoch": 0.83, "eta": 0.0010000000474974513, "grad_norm": 20.63429522696927, "learning_rate": 4.1356686569674335e-08, "logits/chosen": -0.6491920351982117, "logits/rejected": -0.6609460711479187, "logps/chosen": -245.2913055419922, "logps/pi_response": -372.6864929199219, "logps/ref_response": -255.30258178710938, "logps/rejected": -250.04989624023438, "loss": 0.6865, "rewards/accuracies": 0.515625, "rewards/chosen": -0.7680090665817261, "rewards/margins": 0.01968952640891075, "rewards/rejected": -0.78769850730896, "step": 130 }, { "epoch": 0.9, "eta": 0.0010000000474974513, "grad_norm": 18.194304704086697, "learning_rate": 1.5941282340065697e-08, "logits/chosen": -0.8394004702568054, "logits/rejected": -0.8077179789543152, "logps/chosen": -243.73538208007812, "logps/pi_response": -391.2145690917969, "logps/ref_response": -279.54559326171875, "logps/rejected": -255.0926513671875, "loss": 0.6744, "rewards/accuracies": 0.578125, "rewards/chosen": -0.700167179107666, "rewards/margins": 0.07673807442188263, "rewards/rejected": -0.7769052386283875, "step": 140 }, { "epoch": 0.96, "eta": 0.0010000000474974513, "grad_norm": 17.724132159919495, "learning_rate": 2.2625595580163247e-09, "logits/chosen": -0.7538624405860901, "logits/rejected": -0.7211672067642212, "logps/chosen": -247.0625457763672, "logps/pi_response": -371.68939208984375, "logps/ref_response": -256.93328857421875, "logps/rejected": -252.12765502929688, "loss": 0.6781, "rewards/accuracies": 0.5093749761581421, "rewards/chosen": -0.7719146609306335, "rewards/margins": 0.03903389722108841, "rewards/rejected": -0.8109486699104309, "step": 150 }, { "epoch": 1.0, "step": 156, "total_flos": 0.0, "train_loss": 0.6867941472774897, "train_runtime": 22649.1506, "train_samples_per_second": 0.883, "train_steps_per_second": 0.007 } ], "logging_steps": 10, "max_steps": 156, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }