{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 60, "global_step": 1800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011111111111111112, "grad_norm": 2.362602949142456, "learning_rate": 2.7777777777777776e-07, "logits/chosen": -1.400684118270874, "logits/rejected": -1.4005341529846191, "logps/chosen": -174.8197021484375, "logps/rejected": -174.18280029296875, "loss": 0.6981, "rewards/accuracies": 0.3700000047683716, "rewards/chosen": -0.017464280128479004, "rewards/margins": -0.00935516320168972, "rewards/rejected": -0.00810911599546671, "step": 10 }, { "epoch": 0.022222222222222223, "grad_norm": 2.706902027130127, "learning_rate": 5.555555555555555e-07, "logits/chosen": -1.401512622833252, "logits/rejected": -1.4014896154403687, "logps/chosen": -172.8441162109375, "logps/rejected": -176.39537048339844, "loss": 0.6945, "rewards/accuracies": 0.4599999785423279, "rewards/chosen": -0.015734069049358368, "rewards/margins": -0.0022257084492594004, "rewards/rejected": -0.01350836269557476, "step": 20 }, { "epoch": 0.03333333333333333, "grad_norm": 2.120821714401245, "learning_rate": 8.333333333333333e-07, "logits/chosen": -1.3998275995254517, "logits/rejected": -1.3999087810516357, "logps/chosen": -173.80712890625, "logps/rejected": -175.36126708984375, "loss": 0.6927, "rewards/accuracies": 0.5099999904632568, "rewards/chosen": -0.00933685339987278, "rewards/margins": 0.0013576654018834233, "rewards/rejected": -0.010694518685340881, "step": 30 }, { "epoch": 0.044444444444444446, "grad_norm": 0.6226487159729004, "learning_rate": 1.111111111111111e-06, "logits/chosen": -1.4011458158493042, "logits/rejected": -1.4012081623077393, "logps/chosen": -173.29324340820312, "logps/rejected": -175.90345764160156, "loss": 0.6926, "rewards/accuracies": 0.5099999904632568, "rewards/chosen": -0.02281300537288189, "rewards/margins": 0.0015505983028560877, "rewards/rejected": -0.024363603442907333, "step": 40 }, { "epoch": 0.05555555555555555, "grad_norm": 2.68591046333313, "learning_rate": 1.3888888888888892e-06, "logits/chosen": -1.4008080959320068, "logits/rejected": -1.4006825685501099, "logps/chosen": -175.80612182617188, "logps/rejected": -173.04119873046875, "loss": 0.6942, "rewards/accuracies": 0.5000000596046448, "rewards/chosen": -0.014659256674349308, "rewards/margins": -0.0015078135766088963, "rewards/rejected": -0.013151444494724274, "step": 50 }, { "epoch": 0.06666666666666667, "grad_norm": 0.6941749453544617, "learning_rate": 1.6666666666666667e-06, "logits/chosen": -1.4003050327301025, "logits/rejected": -1.4006407260894775, "logps/chosen": -174.0802001953125, "logps/rejected": -175.01547241210938, "loss": 0.6939, "rewards/accuracies": 0.5100000500679016, "rewards/chosen": -0.026361756026744843, "rewards/margins": -0.0008379966020584106, "rewards/rejected": -0.025523759424686432, "step": 60 }, { "epoch": 0.06666666666666667, "eval_logits/chosen": -1.4009861946105957, "eval_logits/rejected": -1.4008183479309082, "eval_logps/chosen": -175.24819946289062, "eval_logps/rejected": -173.85289001464844, "eval_loss": 0.6920965313911438, "eval_rewards/accuracies": 0.5189999938011169, "eval_rewards/chosen": -0.021925970911979675, "eval_rewards/margins": 0.0026434571482241154, "eval_rewards/rejected": -0.024569429457187653, "eval_runtime": 318.9511, "eval_samples_per_second": 3.135, "eval_steps_per_second": 0.314, "step": 60 }, { "epoch": 0.07777777777777778, "grad_norm": 1.3399503231048584, "learning_rate": 1.944444444444445e-06, "logits/chosen": -1.4007337093353271, "logits/rejected": -1.4006619453430176, "logps/chosen": -173.1317138671875, "logps/rejected": -175.83157348632812, "loss": 0.6926, "rewards/accuracies": 0.5200000405311584, "rewards/chosen": -0.02405247837305069, "rewards/margins": 0.001808380475267768, "rewards/rejected": -0.025860857218503952, "step": 70 }, { "epoch": 0.08888888888888889, "grad_norm": 4.030770778656006, "learning_rate": 2.222222222222222e-06, "logits/chosen": -1.400660753250122, "logits/rejected": -1.4007993936538696, "logps/chosen": -172.63229370117188, "logps/rejected": -176.5906524658203, "loss": 0.6849, "rewards/accuracies": 0.5900000333786011, "rewards/chosen": -0.013674546033143997, "rewards/margins": 0.017781419679522514, "rewards/rejected": -0.03145596385002136, "step": 80 }, { "epoch": 0.1, "grad_norm": 7.06594181060791, "learning_rate": 2.5e-06, "logits/chosen": -1.4002556800842285, "logits/rejected": -1.400156021118164, "logps/chosen": -176.54403686523438, "logps/rejected": -172.20162963867188, "loss": 0.6946, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.01780758798122406, "rewards/margins": -0.0011917415540665388, "rewards/rejected": -0.016615845263004303, "step": 90 }, { "epoch": 0.1111111111111111, "grad_norm": 4.663311004638672, "learning_rate": 2.7777777777777783e-06, "logits/chosen": -1.40169358253479, "logits/rejected": -1.4018887281417847, "logps/chosen": -174.86729431152344, "logps/rejected": -174.28994750976562, "loss": 0.6925, "rewards/accuracies": 0.5099999904632568, "rewards/chosen": -0.020927399396896362, "rewards/margins": 0.002672073431313038, "rewards/rejected": -0.023599475622177124, "step": 100 }, { "epoch": 0.12222222222222222, "grad_norm": 2.7771716117858887, "learning_rate": 3.055555555555556e-06, "logits/chosen": -1.4018511772155762, "logits/rejected": -1.401686668395996, "logps/chosen": -175.4040069580078, "logps/rejected": -173.77352905273438, "loss": 0.6917, "rewards/accuracies": 0.5, "rewards/chosen": -0.02241549640893936, "rewards/margins": 0.004419571254402399, "rewards/rejected": -0.026835069060325623, "step": 110 }, { "epoch": 0.13333333333333333, "grad_norm": 2.225400686264038, "learning_rate": 3.3333333333333333e-06, "logits/chosen": -1.4029500484466553, "logits/rejected": -1.4027996063232422, "logps/chosen": -175.29742431640625, "logps/rejected": -174.22561645507812, "loss": 0.6871, "rewards/accuracies": 0.5600000023841858, "rewards/chosen": -0.03488890081644058, "rewards/margins": 0.013112092390656471, "rewards/rejected": -0.0480009950697422, "step": 120 }, { "epoch": 0.13333333333333333, "eval_logits/chosen": -1.403046727180481, "eval_logits/rejected": -1.4029061794281006, "eval_logps/chosen": -175.306884765625, "eval_logps/rejected": -174.10104370117188, "eval_loss": 0.6829859018325806, "eval_rewards/accuracies": 0.6079999804496765, "eval_rewards/chosen": -0.027797138318419456, "eval_rewards/margins": 0.021586475893855095, "eval_rewards/rejected": -0.04938361421227455, "eval_runtime": 319.5591, "eval_samples_per_second": 3.129, "eval_steps_per_second": 0.313, "step": 120 }, { "epoch": 0.14444444444444443, "grad_norm": 4.428592205047607, "learning_rate": 3.6111111111111115e-06, "logits/chosen": -1.4035028219223022, "logits/rejected": -1.403373122215271, "logps/chosen": -175.11550903320312, "logps/rejected": -174.84075927734375, "loss": 0.6805, "rewards/accuracies": 0.6200000643730164, "rewards/chosen": -0.05135633796453476, "rewards/margins": 0.027625277638435364, "rewards/rejected": -0.07898162305355072, "step": 130 }, { "epoch": 0.15555555555555556, "grad_norm": 1.5452574491500854, "learning_rate": 3.88888888888889e-06, "logits/chosen": -1.4023932218551636, "logits/rejected": -1.402073621749878, "logps/chosen": -174.4642791748047, "logps/rejected": -176.83168029785156, "loss": 0.6804, "rewards/accuracies": 0.6299999952316284, "rewards/chosen": -0.11876146495342255, "rewards/margins": 0.02933622896671295, "rewards/rejected": -0.1480976939201355, "step": 140 }, { "epoch": 0.16666666666666666, "grad_norm": 0.9253703951835632, "learning_rate": 4.166666666666667e-06, "logits/chosen": -1.4006946086883545, "logits/rejected": -1.400911808013916, "logps/chosen": -176.50845336914062, "logps/rejected": -175.89736938476562, "loss": 0.6765, "rewards/accuracies": 0.6399999856948853, "rewards/chosen": -0.17112146317958832, "rewards/margins": 0.03798893839120865, "rewards/rejected": -0.20911039412021637, "step": 150 }, { "epoch": 0.17777777777777778, "grad_norm": 4.935380935668945, "learning_rate": 4.444444444444444e-06, "logits/chosen": -1.399414300918579, "logits/rejected": -1.399838924407959, "logps/chosen": -176.39724731445312, "logps/rejected": -178.42300415039062, "loss": 0.6537, "rewards/accuracies": 0.7100000381469727, "rewards/chosen": -0.25529032945632935, "rewards/margins": 0.08817656338214874, "rewards/rejected": -0.3434668779373169, "step": 160 }, { "epoch": 0.18888888888888888, "grad_norm": 1.3383221626281738, "learning_rate": 4.722222222222222e-06, "logits/chosen": -1.3981242179870605, "logits/rejected": -1.398409128189087, "logps/chosen": -179.46847534179688, "logps/rejected": -177.86688232421875, "loss": 0.684, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.40547820925712585, "rewards/margins": 0.04951518028974533, "rewards/rejected": -0.4549933969974518, "step": 170 }, { "epoch": 0.2, "grad_norm": 6.545588493347168, "learning_rate": 5e-06, "logits/chosen": -1.3984978199005127, "logits/rejected": -1.3985638618469238, "logps/chosen": -180.9668426513672, "logps/rejected": -178.43746948242188, "loss": 0.6159, "rewards/accuracies": 0.7599999904632568, "rewards/chosen": -0.4513840079307556, "rewards/margins": 0.18196940422058105, "rewards/rejected": -0.6333533525466919, "step": 180 }, { "epoch": 0.2, "eval_logits/chosen": -1.4020743370056152, "eval_logits/rejected": -1.402461051940918, "eval_logps/chosen": -180.4278564453125, "eval_logps/rejected": -180.83172607421875, "eval_loss": 0.6382298469543457, "eval_rewards/accuracies": 0.5610000491142273, "eval_rewards/chosen": -0.5398944616317749, "eval_rewards/margins": 0.18255746364593506, "eval_rewards/rejected": -0.72245192527771, "eval_runtime": 319.2836, "eval_samples_per_second": 3.132, "eval_steps_per_second": 0.313, "step": 180 }, { "epoch": 0.2111111111111111, "grad_norm": 2.0203661918640137, "learning_rate": 4.999529926121254e-06, "logits/chosen": -1.396078109741211, "logits/rejected": -1.3954544067382812, "logps/chosen": -180.74969482421875, "logps/rejected": -182.64613342285156, "loss": 0.6337, "rewards/accuracies": 0.5700000524520874, "rewards/chosen": -0.6385375261306763, "rewards/margins": 0.19739526510238647, "rewards/rejected": -0.8359327912330627, "step": 190 }, { "epoch": 0.2222222222222222, "grad_norm": 5.894029140472412, "learning_rate": 4.998119881260576e-06, "logits/chosen": -1.390157699584961, "logits/rejected": -1.3912606239318848, "logps/chosen": -181.57754516601562, "logps/rejected": -183.00576782226562, "loss": 0.5749, "rewards/accuracies": 0.8199999928474426, "rewards/chosen": -0.6484101414680481, "rewards/margins": 0.3046451807022095, "rewards/rejected": -0.9530552625656128, "step": 200 }, { "epoch": 0.23333333333333334, "grad_norm": 4.795431613922119, "learning_rate": 4.995770395678171e-06, "logits/chosen": -1.390209436416626, "logits/rejected": -1.3919038772583008, "logps/chosen": -181.8658447265625, "logps/rejected": -183.79417419433594, "loss": 0.5556, "rewards/accuracies": 0.75, "rewards/chosen": -0.6416223049163818, "rewards/margins": 0.4046136736869812, "rewards/rejected": -1.0462360382080078, "step": 210 }, { "epoch": 0.24444444444444444, "grad_norm": 8.91357421875, "learning_rate": 4.99248235291948e-06, "logits/chosen": -1.3888887166976929, "logits/rejected": -1.3894532918930054, "logps/chosen": -179.56829833984375, "logps/rejected": -189.20083618164062, "loss": 0.4952, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.6512977480888367, "rewards/margins": 0.7006000876426697, "rewards/rejected": -1.3518978357315063, "step": 220 }, { "epoch": 0.25555555555555554, "grad_norm": 14.271614074707031, "learning_rate": 4.9882569894829146e-06, "logits/chosen": -1.3921380043029785, "logits/rejected": -1.393751859664917, "logps/chosen": -185.2764892578125, "logps/rejected": -192.3001708984375, "loss": 0.5098, "rewards/accuracies": 0.7599999904632568, "rewards/chosen": -1.0151185989379883, "rewards/margins": 0.8646041750907898, "rewards/rejected": -1.8797227144241333, "step": 230 }, { "epoch": 0.26666666666666666, "grad_norm": 2.420156240463257, "learning_rate": 4.983095894354858e-06, "logits/chosen": -1.39105224609375, "logits/rejected": -1.392564296722412, "logps/chosen": -186.03451538085938, "logps/rejected": -201.23435974121094, "loss": 0.368, "rewards/accuracies": 0.8300000429153442, "rewards/chosen": -1.2737812995910645, "rewards/margins": 1.303347110748291, "rewards/rejected": -2.5771284103393555, "step": 240 }, { "epoch": 0.26666666666666666, "eval_logits/chosen": -1.3971052169799805, "eval_logits/rejected": -1.3996238708496094, "eval_logps/chosen": -188.56735229492188, "eval_logps/rejected": -201.0563201904297, "eval_loss": 0.3848608434200287, "eval_rewards/accuracies": 0.8309999704360962, "eval_rewards/chosen": -1.3538421392440796, "eval_rewards/margins": 1.3910682201385498, "eval_rewards/rejected": -2.74491024017334, "eval_runtime": 319.0097, "eval_samples_per_second": 3.135, "eval_steps_per_second": 0.313, "step": 240 }, { "epoch": 0.2777777777777778, "grad_norm": 14.02056884765625, "learning_rate": 4.977001008412113e-06, "logits/chosen": -1.3970434665679932, "logits/rejected": -1.400298833847046, "logps/chosen": -185.9792022705078, "logps/rejected": -203.23114013671875, "loss": 0.324, "rewards/accuracies": 0.8600000143051147, "rewards/chosen": -1.1137562990188599, "rewards/margins": 1.8328487873077393, "rewards/rejected": -2.9466049671173096, "step": 250 }, { "epoch": 0.28888888888888886, "grad_norm": 3.589820146560669, "learning_rate": 4.969974623692023e-06, "logits/chosen": -1.4056309461593628, "logits/rejected": -1.4085218906402588, "logps/chosen": -185.17918395996094, "logps/rejected": -209.30335998535156, "loss": 0.2772, "rewards/accuracies": 0.8800000548362732, "rewards/chosen": -1.051544189453125, "rewards/margins": 2.4677376747131348, "rewards/rejected": -3.5192818641662598, "step": 260 }, { "epoch": 0.3, "grad_norm": 4.202933311462402, "learning_rate": 4.962019382530521e-06, "logits/chosen": -1.4178866147994995, "logits/rejected": -1.4198402166366577, "logps/chosen": -191.2581329345703, "logps/rejected": -217.56085205078125, "loss": 0.2959, "rewards/accuracies": 0.8700000047683716, "rewards/chosen": -1.7226934432983398, "rewards/margins": 2.5767905712127686, "rewards/rejected": -4.2994842529296875, "step": 270 }, { "epoch": 0.3111111111111111, "grad_norm": 4.351930141448975, "learning_rate": 4.953138276568462e-06, "logits/chosen": -1.4250727891921997, "logits/rejected": -1.427567720413208, "logps/chosen": -200.7665557861328, "logps/rejected": -221.02357482910156, "loss": 0.4344, "rewards/accuracies": 0.7900000214576721, "rewards/chosen": -2.566577434539795, "rewards/margins": 2.1860404014587402, "rewards/rejected": -4.752617835998535, "step": 280 }, { "epoch": 0.32222222222222224, "grad_norm": 9.703364372253418, "learning_rate": 4.943334645626589e-06, "logits/chosen": -1.4243228435516357, "logits/rejected": -1.4278262853622437, "logps/chosen": -197.0714111328125, "logps/rejected": -221.6966552734375, "loss": 0.3466, "rewards/accuracies": 0.8199999928474426, "rewards/chosen": -2.2351460456848145, "rewards/margins": 2.5265071392059326, "rewards/rejected": -4.761653900146484, "step": 290 }, { "epoch": 0.3333333333333333, "grad_norm": 14.332489967346191, "learning_rate": 4.93261217644956e-06, "logits/chosen": -1.4260220527648926, "logits/rejected": -1.4290738105773926, "logps/chosen": -194.31724548339844, "logps/rejected": -221.0859832763672, "loss": 0.3234, "rewards/accuracies": 0.8800000548362732, "rewards/chosen": -2.019387722015381, "rewards/margins": 2.6289873123168945, "rewards/rejected": -4.648375034332275, "step": 300 }, { "epoch": 0.3333333333333333, "eval_logits/chosen": -1.4247881174087524, "eval_logits/rejected": -1.4282124042510986, "eval_logps/chosen": -196.38650512695312, "eval_logps/rejected": -219.71144104003906, "eval_loss": 0.3633359372615814, "eval_rewards/accuracies": 0.8229999542236328, "eval_rewards/chosen": -2.135758876800537, "eval_rewards/margins": 2.4746649265289307, "eval_rewards/rejected": -4.610424041748047, "eval_runtime": 319.0479, "eval_samples_per_second": 3.134, "eval_steps_per_second": 0.313, "step": 300 }, { "epoch": 0.34444444444444444, "grad_norm": 26.149131774902344, "learning_rate": 4.9209749013195155e-06, "logits/chosen": -1.4286975860595703, "logits/rejected": -1.43110990524292, "logps/chosen": -191.86825561523438, "logps/rejected": -218.36767578125, "loss": 0.3799, "rewards/accuracies": 0.8300000429153442, "rewards/chosen": -1.8485496044158936, "rewards/margins": 2.443417549133301, "rewards/rejected": -4.291967391967773, "step": 310 }, { "epoch": 0.35555555555555557, "grad_norm": 18.254680633544922, "learning_rate": 4.908427196539701e-06, "logits/chosen": -1.4264110326766968, "logits/rejected": -1.4311984777450562, "logps/chosen": -196.54238891601562, "logps/rejected": -215.0438232421875, "loss": 0.3149, "rewards/accuracies": 0.8399999737739563, "rewards/chosen": -2.009295701980591, "rewards/margins": 2.2745771408081055, "rewards/rejected": -4.283872604370117, "step": 320 }, { "epoch": 0.36666666666666664, "grad_norm": 20.668800354003906, "learning_rate": 4.894973780788722e-06, "logits/chosen": -1.4264931678771973, "logits/rejected": -1.4278137683868408, "logps/chosen": -198.57382202148438, "logps/rejected": -217.05438232421875, "loss": 0.4159, "rewards/accuracies": 0.8100000023841858, "rewards/chosen": -2.5315957069396973, "rewards/margins": 1.5949325561523438, "rewards/rejected": -4.126528739929199, "step": 330 }, { "epoch": 0.37777777777777777, "grad_norm": 4.467871189117432, "learning_rate": 4.8806197133460385e-06, "logits/chosen": -1.4277429580688477, "logits/rejected": -1.4302550554275513, "logps/chosen": -204.53775024414062, "logps/rejected": -220.16055297851562, "loss": 0.3476, "rewards/accuracies": 0.8499999642372131, "rewards/chosen": -2.966139316558838, "rewards/margins": 1.6775035858154297, "rewards/rejected": -4.643642425537109, "step": 340 }, { "epoch": 0.3888888888888889, "grad_norm": 7.6644816398620605, "learning_rate": 4.865370392189377e-06, "logits/chosen": -1.43019437789917, "logits/rejected": -1.4324309825897217, "logps/chosen": -203.60850524902344, "logps/rejected": -224.7152862548828, "loss": 0.2798, "rewards/accuracies": 0.8700000047683716, "rewards/chosen": -2.942948818206787, "rewards/margins": 2.063199996948242, "rewards/rejected": -5.006148338317871, "step": 350 }, { "epoch": 0.4, "grad_norm": 13.925436019897461, "learning_rate": 4.849231551964771e-06, "logits/chosen": -1.4413893222808838, "logits/rejected": -1.4455211162567139, "logps/chosen": -205.908447265625, "logps/rejected": -226.8455810546875, "loss": 0.2649, "rewards/accuracies": 0.8700000643730164, "rewards/chosen": -3.027750253677368, "rewards/margins": 2.361143112182617, "rewards/rejected": -5.388893127441406, "step": 360 }, { "epoch": 0.4, "eval_logits/chosen": -1.4411193132400513, "eval_logits/rejected": -1.4450273513793945, "eval_logps/chosen": -208.10166931152344, "eval_logps/rejected": -233.96986389160156, "eval_loss": 0.3037000298500061, "eval_rewards/accuracies": 0.8799999952316284, "eval_rewards/chosen": -3.3072755336761475, "eval_rewards/margins": 2.7289905548095703, "eval_rewards/rejected": -6.036265850067139, "eval_runtime": 318.9985, "eval_samples_per_second": 3.135, "eval_steps_per_second": 0.313, "step": 360 }, { "epoch": 0.4111111111111111, "grad_norm": 4.158270835876465, "learning_rate": 4.832209261830002e-06, "logits/chosen": -1.4424656629562378, "logits/rejected": -1.4435977935791016, "logps/chosen": -208.0481414794922, "logps/rejected": -236.48324584960938, "loss": 0.353, "rewards/accuracies": 0.8700000047683716, "rewards/chosen": -3.5577611923217773, "rewards/margins": 2.4773597717285156, "rewards/rejected": -6.035120964050293, "step": 370 }, { "epoch": 0.4222222222222222, "grad_norm": 3.5422561168670654, "learning_rate": 4.814309923172227e-06, "logits/chosen": -1.4452104568481445, "logits/rejected": -1.4488377571105957, "logps/chosen": -204.97947692871094, "logps/rejected": -231.20712280273438, "loss": 0.3429, "rewards/accuracies": 0.8300000429153442, "rewards/chosen": -3.058133125305176, "rewards/margins": 2.625974178314209, "rewards/rejected": -5.684107780456543, "step": 380 }, { "epoch": 0.43333333333333335, "grad_norm": 16.114534378051758, "learning_rate": 4.7955402672006855e-06, "logits/chosen": -1.440530776977539, "logits/rejected": -1.4443151950836182, "logps/chosen": -205.27835083007812, "logps/rejected": -236.82347106933594, "loss": 0.2045, "rewards/accuracies": 0.9300000071525574, "rewards/chosen": -3.113431215286255, "rewards/margins": 3.1173110008239746, "rewards/rejected": -6.23074197769165, "step": 390 }, { "epoch": 0.4444444444444444, "grad_norm": 22.146488189697266, "learning_rate": 4.775907352415367e-06, "logits/chosen": -1.4472781419754028, "logits/rejected": -1.4499727487564087, "logps/chosen": -199.02243041992188, "logps/rejected": -242.6939697265625, "loss": 0.2361, "rewards/accuracies": 0.9100000262260437, "rewards/chosen": -2.694483757019043, "rewards/margins": 3.9368107318878174, "rewards/rejected": -6.631294250488281, "step": 400 }, { "epoch": 0.45555555555555555, "grad_norm": 16.819496154785156, "learning_rate": 4.755418561952595e-06, "logits/chosen": -1.4456830024719238, "logits/rejected": -1.4492114782333374, "logps/chosen": -207.8698272705078, "logps/rejected": -238.99583435058594, "loss": 0.2863, "rewards/accuracies": 0.9100000858306885, "rewards/chosen": -3.3276515007019043, "rewards/margins": 3.1782994270324707, "rewards/rejected": -6.505950927734375, "step": 410 }, { "epoch": 0.4666666666666667, "grad_norm": 15.385212898254395, "learning_rate": 4.734081600808531e-06, "logits/chosen": -1.448960542678833, "logits/rejected": -1.4532960653305054, "logps/chosen": -210.46075439453125, "logps/rejected": -245.5928955078125, "loss": 0.1784, "rewards/accuracies": 0.9300000667572021, "rewards/chosen": -3.5726406574249268, "rewards/margins": 3.5739850997924805, "rewards/rejected": -7.146625995635986, "step": 420 }, { "epoch": 0.4666666666666667, "eval_logits/chosen": -1.4469826221466064, "eval_logits/rejected": -1.4523011445999146, "eval_logps/chosen": -213.9627685546875, "eval_logps/rejected": -244.39593505859375, "eval_loss": 0.2159292995929718, "eval_rewards/accuracies": 0.9099999666213989, "eval_rewards/chosen": -3.8933866024017334, "eval_rewards/margins": 3.185485601425171, "eval_rewards/rejected": -7.078872203826904, "eval_runtime": 319.0594, "eval_samples_per_second": 3.134, "eval_steps_per_second": 0.313, "step": 420 }, { "epoch": 0.4777777777777778, "grad_norm": 35.55814743041992, "learning_rate": 4.711904492941644e-06, "logits/chosen": -1.4515868425369263, "logits/rejected": -1.4541680812835693, "logps/chosen": -207.43453979492188, "logps/rejected": -246.10247802734375, "loss": 0.2279, "rewards/accuracies": 0.9099999666213989, "rewards/chosen": -3.5352389812469482, "rewards/margins": 3.39831805229187, "rewards/rejected": -6.933557033538818, "step": 430 }, { "epoch": 0.4888888888888889, "grad_norm": 18.41891098022461, "learning_rate": 4.688895578255228e-06, "logits/chosen": -1.4477709531784058, "logits/rejected": -1.4553776979446411, "logps/chosen": -215.75033569335938, "logps/rejected": -245.9658203125, "loss": 0.2779, "rewards/accuracies": 0.8600000143051147, "rewards/chosen": -3.823634147644043, "rewards/margins": 3.665213108062744, "rewards/rejected": -7.488846778869629, "step": 440 }, { "epoch": 0.5, "grad_norm": 15.392614364624023, "learning_rate": 4.665063509461098e-06, "logits/chosen": -1.4473040103912354, "logits/rejected": -1.4520621299743652, "logps/chosen": -212.28256225585938, "logps/rejected": -245.33755493164062, "loss": 0.2924, "rewards/accuracies": 0.89000004529953, "rewards/chosen": -3.778430461883545, "rewards/margins": 3.308140277862549, "rewards/rejected": -7.086570739746094, "step": 450 }, { "epoch": 0.5111111111111111, "grad_norm": 19.698705673217773, "learning_rate": 4.640417248825667e-06, "logits/chosen": -1.4431393146514893, "logits/rejected": -1.4465763568878174, "logps/chosen": -209.155517578125, "logps/rejected": -247.68649291992188, "loss": 0.1966, "rewards/accuracies": 0.9200000166893005, "rewards/chosen": -3.5632858276367188, "rewards/margins": 3.6926655769348145, "rewards/rejected": -7.255951404571533, "step": 460 }, { "epoch": 0.5222222222222223, "grad_norm": 3.17411208152771, "learning_rate": 4.614966064799603e-06, "logits/chosen": -1.4454569816589355, "logits/rejected": -1.4508020877838135, "logps/chosen": -214.06642150878906, "logps/rejected": -249.29022216796875, "loss": 0.1576, "rewards/accuracies": 0.9399999976158142, "rewards/chosen": -3.891676902770996, "rewards/margins": 3.6903645992279053, "rewards/rejected": -7.5820417404174805, "step": 470 }, { "epoch": 0.5333333333333333, "grad_norm": 3.511045455932617, "learning_rate": 4.588719528532342e-06, "logits/chosen": -1.4526777267456055, "logits/rejected": -1.4565974473953247, "logps/chosen": -209.6256103515625, "logps/rejected": -252.88116455078125, "loss": 0.2608, "rewards/accuracies": 0.8700000643730164, "rewards/chosen": -3.6902856826782227, "rewards/margins": 4.007488250732422, "rewards/rejected": -7.6977739334106445, "step": 480 }, { "epoch": 0.5333333333333333, "eval_logits/chosen": -1.4509010314941406, "eval_logits/rejected": -1.4571257829666138, "eval_logps/chosen": -213.10494995117188, "eval_logps/rejected": -252.49603271484375, "eval_loss": 0.20726382732391357, "eval_rewards/accuracies": 0.9099999666213989, "eval_rewards/chosen": -3.8076045513153076, "eval_rewards/margins": 4.0812788009643555, "eval_rewards/rejected": -7.888883590698242, "eval_runtime": 319.0436, "eval_samples_per_second": 3.134, "eval_steps_per_second": 0.313, "step": 480 }, { "epoch": 0.5444444444444444, "grad_norm": 35.65738296508789, "learning_rate": 4.561687510272767e-06, "logits/chosen": -1.4541469812393188, "logits/rejected": -1.4597184658050537, "logps/chosen": -213.66517639160156, "logps/rejected": -254.37350463867188, "loss": 0.2904, "rewards/accuracies": 0.8899999856948853, "rewards/chosen": -4.016324043273926, "rewards/margins": 3.9200973510742188, "rewards/rejected": -7.9364213943481445, "step": 490 }, { "epoch": 0.5555555555555556, "grad_norm": 15.376676559448242, "learning_rate": 4.533880175657419e-06, "logits/chosen": -1.4524576663970947, "logits/rejected": -1.4585695266723633, "logps/chosen": -218.01429748535156, "logps/rejected": -257.30328369140625, "loss": 0.2261, "rewards/accuracies": 0.9100000262260437, "rewards/chosen": -4.428624153137207, "rewards/margins": 3.822225332260132, "rewards/rejected": -8.250848770141602, "step": 500 }, { "epoch": 0.5666666666666667, "grad_norm": 25.499267578125, "learning_rate": 4.50530798188761e-06, "logits/chosen": -1.451499342918396, "logits/rejected": -1.4615750312805176, "logps/chosen": -223.37664794921875, "logps/rejected": -253.57177734375, "loss": 0.2516, "rewards/accuracies": 0.9000000357627869, "rewards/chosen": -4.594554424285889, "rewards/margins": 3.6208624839782715, "rewards/rejected": -8.215417861938477, "step": 510 }, { "epoch": 0.5777777777777777, "grad_norm": 42.641754150390625, "learning_rate": 4.475981673796899e-06, "logits/chosen": -1.4456167221069336, "logits/rejected": -1.4504668712615967, "logps/chosen": -213.45851135253906, "logps/rejected": -259.6695251464844, "loss": 0.2521, "rewards/accuracies": 0.9200000762939453, "rewards/chosen": -4.051717281341553, "rewards/margins": 4.357028484344482, "rewards/rejected": -8.408745765686035, "step": 520 }, { "epoch": 0.5888888888888889, "grad_norm": 26.318056106567383, "learning_rate": 4.445912279810401e-06, "logits/chosen": -1.4452048540115356, "logits/rejected": -1.4490594863891602, "logps/chosen": -211.29248046875, "logps/rejected": -264.21600341796875, "loss": 0.2038, "rewards/accuracies": 0.9000000357627869, "rewards/chosen": -3.8537445068359375, "rewards/margins": 4.980400085449219, "rewards/rejected": -8.834144592285156, "step": 530 }, { "epoch": 0.6, "grad_norm": 46.37030792236328, "learning_rate": 4.415111107797445e-06, "logits/chosen": -1.4452967643737793, "logits/rejected": -1.448035478591919, "logps/chosen": -221.65042114257812, "logps/rejected": -268.7168273925781, "loss": 0.2459, "rewards/accuracies": 0.8399999737739563, "rewards/chosen": -4.855015754699707, "rewards/margins": 4.4556379318237305, "rewards/rejected": -9.310652732849121, "step": 540 }, { "epoch": 0.6, "eval_logits/chosen": -1.4478332996368408, "eval_logits/rejected": -1.4528884887695312, "eval_logps/chosen": -222.76666259765625, "eval_logps/rejected": -269.6318664550781, "eval_loss": 0.21725089848041534, "eval_rewards/accuracies": 0.8889999389648438, "eval_rewards/chosen": -4.773774147033691, "eval_rewards/margins": 4.828692436218262, "eval_rewards/rejected": -9.602466583251953, "eval_runtime": 319.0307, "eval_samples_per_second": 3.134, "eval_steps_per_second": 0.313, "step": 540 }, { "epoch": 0.6111111111111112, "grad_norm": 37.16395568847656, "learning_rate": 4.3835897408191515e-06, "logits/chosen": -1.450826644897461, "logits/rejected": -1.4534823894500732, "logps/chosen": -222.22439575195312, "logps/rejected": -270.947998046875, "loss": 0.1905, "rewards/accuracies": 0.9300000071525574, "rewards/chosen": -4.919099807739258, "rewards/margins": 4.604528427124023, "rewards/rejected": -9.523628234863281, "step": 550 }, { "epoch": 0.6222222222222222, "grad_norm": 26.3408260345459, "learning_rate": 4.351360032772512e-06, "logits/chosen": -1.4518877267837524, "logits/rejected": -1.4572858810424805, "logps/chosen": -215.63409423828125, "logps/rejected": -271.2196044921875, "loss": 0.1935, "rewards/accuracies": 0.940000057220459, "rewards/chosen": -4.156116962432861, "rewards/margins": 5.512393951416016, "rewards/rejected": -9.668511390686035, "step": 560 }, { "epoch": 0.6333333333333333, "grad_norm": 30.472354888916016, "learning_rate": 4.318434103932622e-06, "logits/chosen": -1.4471065998077393, "logits/rejected": -1.45332932472229, "logps/chosen": -217.19085693359375, "logps/rejected": -264.91046142578125, "loss": 0.3623, "rewards/accuracies": 0.8700000047683716, "rewards/chosen": -4.126136779785156, "rewards/margins": 5.096201419830322, "rewards/rejected": -9.22233772277832, "step": 570 }, { "epoch": 0.6444444444444445, "grad_norm": 17.42032814025879, "learning_rate": 4.284824336394748e-06, "logits/chosen": -1.4501465559005737, "logits/rejected": -1.4535834789276123, "logps/chosen": -216.29188537597656, "logps/rejected": -262.982421875, "loss": 0.2146, "rewards/accuracies": 0.9100000262260437, "rewards/chosen": -4.250003814697266, "rewards/margins": 4.562039852142334, "rewards/rejected": -8.812044143676758, "step": 580 }, { "epoch": 0.6555555555555556, "grad_norm": 8.025737762451172, "learning_rate": 4.250543369417921e-06, "logits/chosen": -1.4417762756347656, "logits/rejected": -1.445784568786621, "logps/chosen": -210.0897216796875, "logps/rejected": -259.0534973144531, "loss": 0.2008, "rewards/accuracies": 0.9000000357627869, "rewards/chosen": -3.6182351112365723, "rewards/margins": 4.80393123626709, "rewards/rejected": -8.42216682434082, "step": 590 }, { "epoch": 0.6666666666666666, "grad_norm": 47.6915397644043, "learning_rate": 4.215604094671835e-06, "logits/chosen": -1.4405059814453125, "logits/rejected": -1.4476011991500854, "logps/chosen": -208.40203857421875, "logps/rejected": -262.4669189453125, "loss": 0.1729, "rewards/accuracies": 0.9300000071525574, "rewards/chosen": -3.2039127349853516, "rewards/margins": 5.8355712890625, "rewards/rejected": -9.039484024047852, "step": 600 }, { "epoch": 0.6666666666666666, "eval_logits/chosen": -1.4379254579544067, "eval_logits/rejected": -1.4430339336395264, "eval_logps/chosen": -211.66957092285156, "eval_logps/rejected": -264.79345703125, "eval_loss": 0.22635750472545624, "eval_rewards/accuracies": 0.9199999570846558, "eval_rewards/chosen": -3.664064407348633, "eval_rewards/margins": 5.454564094543457, "eval_rewards/rejected": -9.118627548217773, "eval_runtime": 319.005, "eval_samples_per_second": 3.135, "eval_steps_per_second": 0.313, "step": 600 }, { "epoch": 0.6777777777777778, "grad_norm": 9.863251686096191, "learning_rate": 4.180019651388807e-06, "logits/chosen": -1.4420831203460693, "logits/rejected": -1.4478440284729004, "logps/chosen": -215.6461181640625, "logps/rejected": -264.3682861328125, "loss": 0.1723, "rewards/accuracies": 0.9100000858306885, "rewards/chosen": -3.9336395263671875, "rewards/margins": 5.252224922180176, "rewards/rejected": -9.185864448547363, "step": 610 }, { "epoch": 0.6888888888888889, "grad_norm": 26.010082244873047, "learning_rate": 4.14380342142266e-06, "logits/chosen": -1.4423331022262573, "logits/rejected": -1.4474163055419922, "logps/chosen": -207.67831420898438, "logps/rejected": -265.69677734375, "loss": 0.214, "rewards/accuracies": 0.9099999666213989, "rewards/chosen": -3.3267159461975098, "rewards/margins": 5.816192626953125, "rewards/rejected": -9.142909049987793, "step": 620 }, { "epoch": 0.7, "grad_norm": 23.913930892944336, "learning_rate": 4.106969024216348e-06, "logits/chosen": -1.43362557888031, "logits/rejected": -1.4401135444641113, "logps/chosen": -211.0988311767578, "logps/rejected": -265.10693359375, "loss": 0.4388, "rewards/accuracies": 0.8899999856948853, "rewards/chosen": -3.5427446365356445, "rewards/margins": 5.675654411315918, "rewards/rejected": -9.218399047851562, "step": 630 }, { "epoch": 0.7111111111111111, "grad_norm": 26.446819305419922, "learning_rate": 4.069530311680247e-06, "logits/chosen": -1.4354360103607178, "logits/rejected": -1.442990779876709, "logps/chosen": -204.5161590576172, "logps/rejected": -251.73101806640625, "loss": 0.2555, "rewards/accuracies": 0.9200000166893005, "rewards/chosen": -2.7997024059295654, "rewards/margins": 5.167999267578125, "rewards/rejected": -7.967701435089111, "step": 640 }, { "epoch": 0.7222222222222222, "grad_norm": 2.0295379161834717, "learning_rate": 4.031501362983007e-06, "logits/chosen": -1.4334403276443481, "logits/rejected": -1.4392154216766357, "logps/chosen": -205.815673828125, "logps/rejected": -249.6090087890625, "loss": 0.3747, "rewards/accuracies": 0.8800000548362732, "rewards/chosen": -3.0156917572021484, "rewards/margins": 4.648188591003418, "rewards/rejected": -7.663880348205566, "step": 650 }, { "epoch": 0.7333333333333333, "grad_norm": 22.82501792907715, "learning_rate": 3.992896479256966e-06, "logits/chosen": -1.4355220794677734, "logits/rejected": -1.4445066452026367, "logps/chosen": -205.87745666503906, "logps/rejected": -252.21890258789062, "loss": 0.2136, "rewards/accuracies": 0.9500000476837158, "rewards/chosen": -2.8590097427368164, "rewards/margins": 5.230529308319092, "rewards/rejected": -8.08953857421875, "step": 660 }, { "epoch": 0.7333333333333333, "eval_logits/chosen": -1.4456157684326172, "eval_logits/rejected": -1.451847791671753, "eval_logps/chosen": -206.54913330078125, "eval_logps/rejected": -253.787353515625, "eval_loss": 0.19935038685798645, "eval_rewards/accuracies": 0.918999969959259, "eval_rewards/chosen": -3.1520204544067383, "eval_rewards/margins": 4.865995407104492, "eval_rewards/rejected": -8.01801586151123, "eval_runtime": 319.1328, "eval_samples_per_second": 3.133, "eval_steps_per_second": 0.313, "step": 660 }, { "epoch": 0.7444444444444445, "grad_norm": 37.078155517578125, "learning_rate": 3.953730178220067e-06, "logits/chosen": -1.4451912641525269, "logits/rejected": -1.4504950046539307, "logps/chosen": -208.33489990234375, "logps/rejected": -255.33157348632812, "loss": 0.2289, "rewards/accuracies": 0.9199999570846558, "rewards/chosen": -3.3780035972595215, "rewards/margins": 4.752861976623535, "rewards/rejected": -8.130865097045898, "step": 670 }, { "epoch": 0.7555555555555555, "grad_norm": 14.792739868164062, "learning_rate": 3.914017188716347e-06, "logits/chosen": -1.446117877960205, "logits/rejected": -1.4537690877914429, "logps/chosen": -207.12896728515625, "logps/rejected": -261.03814697265625, "loss": 0.1755, "rewards/accuracies": 0.9399999976158142, "rewards/chosen": -3.137814998626709, "rewards/margins": 5.663388252258301, "rewards/rejected": -8.801202774047852, "step": 680 }, { "epoch": 0.7666666666666667, "grad_norm": 9.229610443115234, "learning_rate": 3.8737724451770155e-06, "logits/chosen": -1.4443621635437012, "logits/rejected": -1.4512722492218018, "logps/chosen": -215.41629028320312, "logps/rejected": -255.59149169921875, "loss": 0.2433, "rewards/accuracies": 0.8800000548362732, "rewards/chosen": -3.9103140830993652, "rewards/margins": 4.392501354217529, "rewards/rejected": -8.302814483642578, "step": 690 }, { "epoch": 0.7777777777777778, "grad_norm": 4.114097595214844, "learning_rate": 3.833011082004229e-06, "logits/chosen": -1.4504740238189697, "logits/rejected": -1.4539170265197754, "logps/chosen": -208.27923583984375, "logps/rejected": -259.309326171875, "loss": 0.1322, "rewards/accuracies": 0.940000057220459, "rewards/chosen": -3.5451531410217285, "rewards/margins": 4.7936835289001465, "rewards/rejected": -8.338837623596191, "step": 700 }, { "epoch": 0.7888888888888889, "grad_norm": 14.269043922424316, "learning_rate": 3.7917484278796578e-06, "logits/chosen": -1.4536712169647217, "logits/rejected": -1.4596309661865234, "logps/chosen": -212.81170654296875, "logps/rejected": -259.4583435058594, "loss": 0.2778, "rewards/accuracies": 0.9100000858306885, "rewards/chosen": -3.7558376789093018, "rewards/margins": 4.881363868713379, "rewards/rejected": -8.637201309204102, "step": 710 }, { "epoch": 0.8, "grad_norm": 2.647397756576538, "learning_rate": 3.7500000000000005e-06, "logits/chosen": -1.4511842727661133, "logits/rejected": -1.456930160522461, "logps/chosen": -208.67654418945312, "logps/rejected": -263.60205078125, "loss": 0.2148, "rewards/accuracies": 0.940000057220459, "rewards/chosen": -3.4297666549682617, "rewards/margins": 5.496917724609375, "rewards/rejected": -8.926685333251953, "step": 720 }, { "epoch": 0.8, "eval_logits/chosen": -1.4526758193969727, "eval_logits/rejected": -1.4588308334350586, "eval_logps/chosen": -208.24917602539062, "eval_logps/rejected": -259.9820251464844, "eval_loss": 0.26233014464378357, "eval_rewards/accuracies": 0.9039999842643738, "eval_rewards/chosen": -3.3220245838165283, "eval_rewards/margins": 5.315458297729492, "eval_rewards/rejected": -8.637483596801758, "eval_runtime": 319.0745, "eval_samples_per_second": 3.134, "eval_steps_per_second": 0.313, "step": 720 }, { "epoch": 0.8111111111111111, "grad_norm": 27.4842472076416, "learning_rate": 3.7077814982415966e-06, "logits/chosen": -1.4542248249053955, "logits/rejected": -1.4581375122070312, "logps/chosen": -201.25257873535156, "logps/rejected": -267.01409912109375, "loss": 0.1524, "rewards/accuracies": 0.9300000071525574, "rewards/chosen": -2.901744842529297, "rewards/margins": 6.153472900390625, "rewards/rejected": -9.055217742919922, "step": 730 }, { "epoch": 0.8222222222222222, "grad_norm": 17.44131851196289, "learning_rate": 3.665108799256348e-06, "logits/chosen": -1.4501639604568481, "logits/rejected": -1.4550120830535889, "logps/chosen": -215.76513671875, "logps/rejected": -265.45428466796875, "loss": 0.1982, "rewards/accuracies": 0.9200000166893005, "rewards/chosen": -4.081113815307617, "rewards/margins": 5.071871757507324, "rewards/rejected": -9.152984619140625, "step": 740 }, { "epoch": 0.8333333333333334, "grad_norm": 58.25971221923828, "learning_rate": 3.621997950501156e-06, "logits/chosen": -1.4513449668884277, "logits/rejected": -1.4563398361206055, "logps/chosen": -208.85487365722656, "logps/rejected": -267.5930480957031, "loss": 0.2564, "rewards/accuracies": 0.89000004529953, "rewards/chosen": -3.607893466949463, "rewards/margins": 5.560456275939941, "rewards/rejected": -9.168350219726562, "step": 750 }, { "epoch": 0.8444444444444444, "grad_norm": 30.51304054260254, "learning_rate": 3.578465164203134e-06, "logits/chosen": -1.454546332359314, "logits/rejected": -1.457871913909912, "logps/chosen": -204.0816650390625, "logps/rejected": -271.85711669921875, "loss": 0.169, "rewards/accuracies": 0.9500000476837158, "rewards/chosen": -3.2631070613861084, "rewards/margins": 6.1964874267578125, "rewards/rejected": -9.4595947265625, "step": 760 }, { "epoch": 0.8555555555555555, "grad_norm": 28.097698211669922, "learning_rate": 3.5345268112628485e-06, "logits/chosen": -1.4505870342254639, "logits/rejected": -1.457573652267456, "logps/chosen": -215.683349609375, "logps/rejected": -270.27252197265625, "loss": 0.2219, "rewards/accuracies": 0.9300000071525574, "rewards/chosen": -4.015974998474121, "rewards/margins": 5.678750038146973, "rewards/rejected": -9.694725036621094, "step": 770 }, { "epoch": 0.8666666666666667, "grad_norm": 36.97835159301758, "learning_rate": 3.4901994150978926e-06, "logits/chosen": -1.4549884796142578, "logits/rejected": -1.4569082260131836, "logps/chosen": -204.8563995361328, "logps/rejected": -270.4274597167969, "loss": 0.151, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -3.443523406982422, "rewards/margins": 5.77408504486084, "rewards/rejected": -9.217609405517578, "step": 780 }, { "epoch": 0.8666666666666667, "eval_logits/chosen": -1.455579400062561, "eval_logits/rejected": -1.462104320526123, "eval_logps/chosen": -212.8717041015625, "eval_logps/rejected": -266.91241455078125, "eval_loss": 0.26282998919487, "eval_rewards/accuracies": 0.8830000162124634, "eval_rewards/chosen": -3.78427791595459, "eval_rewards/margins": 5.546243190765381, "eval_rewards/rejected": -9.330520629882812, "eval_runtime": 319.1792, "eval_samples_per_second": 3.133, "eval_steps_per_second": 0.313, "step": 780 }, { "epoch": 0.8777777777777778, "grad_norm": 4.409013748168945, "learning_rate": 3.4454996454291066e-06, "logits/chosen": -1.454880952835083, "logits/rejected": -1.4608569145202637, "logps/chosen": -213.51556396484375, "logps/rejected": -270.1238708496094, "loss": 0.2572, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.862175464630127, "rewards/margins": 5.772583961486816, "rewards/rejected": -9.634759902954102, "step": 790 }, { "epoch": 0.8888888888888888, "grad_norm": 29.155506134033203, "learning_rate": 3.400444312011776e-06, "logits/chosen": -1.4549602270126343, "logits/rejected": -1.4602875709533691, "logps/chosen": -212.6188201904297, "logps/rejected": -274.49560546875, "loss": 0.1285, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -3.8824949264526367, "rewards/margins": 6.083772659301758, "rewards/rejected": -9.966266632080078, "step": 800 }, { "epoch": 0.9, "grad_norm": 28.179977416992188, "learning_rate": 3.3550503583141726e-06, "logits/chosen": -1.4578851461410522, "logits/rejected": -1.4644014835357666, "logps/chosen": -214.60816955566406, "logps/rejected": -270.767822265625, "loss": 0.3057, "rewards/accuracies": 0.8899999856948853, "rewards/chosen": -3.949023723602295, "rewards/margins": 5.775270462036133, "rewards/rejected": -9.724294662475586, "step": 810 }, { "epoch": 0.9111111111111111, "grad_norm": 22.016096115112305, "learning_rate": 3.3093348551458033e-06, "logits/chosen": -1.4591329097747803, "logits/rejected": -1.464478850364685, "logps/chosen": -206.40281677246094, "logps/rejected": -272.22930908203125, "loss": 0.1286, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -3.3459863662719727, "rewards/margins": 6.317253112792969, "rewards/rejected": -9.663239479064941, "step": 820 }, { "epoch": 0.9222222222222223, "grad_norm": 24.308671951293945, "learning_rate": 3.2633149942377835e-06, "logits/chosen": -1.4574294090270996, "logits/rejected": -1.4642754793167114, "logps/chosen": -213.82862854003906, "logps/rejected": -266.60675048828125, "loss": 0.2728, "rewards/accuracies": 0.9000000357627869, "rewards/chosen": -3.8955249786376953, "rewards/margins": 5.386727809906006, "rewards/rejected": -9.282252311706543, "step": 830 }, { "epoch": 0.9333333333333333, "grad_norm": 18.76812171936035, "learning_rate": 3.217008081777726e-06, "logits/chosen": -1.4542195796966553, "logits/rejected": -1.461412787437439, "logps/chosen": -212.99435424804688, "logps/rejected": -267.50958251953125, "loss": 0.1759, "rewards/accuracies": 0.940000057220459, "rewards/chosen": -3.8036112785339355, "rewards/margins": 5.557330131530762, "rewards/rejected": -9.360941886901855, "step": 840 }, { "epoch": 0.9333333333333333, "eval_logits/chosen": -1.4564862251281738, "eval_logits/rejected": -1.463136911392212, "eval_logps/chosen": -212.54718017578125, "eval_logps/rejected": -267.1683349609375, "eval_loss": 0.17360562086105347, "eval_rewards/accuracies": 0.9269999861717224, "eval_rewards/chosen": -3.751824378967285, "eval_rewards/margins": 5.604288101196289, "eval_rewards/rejected": -9.35611343383789, "eval_runtime": 319.0169, "eval_samples_per_second": 3.135, "eval_steps_per_second": 0.313, "step": 840 }, { "epoch": 0.9444444444444444, "grad_norm": 7.19240665435791, "learning_rate": 3.1704315319015936e-06, "logits/chosen": -1.4580819606781006, "logits/rejected": -1.46415114402771, "logps/chosen": -211.7685546875, "logps/rejected": -267.0213623046875, "loss": 0.2128, "rewards/accuracies": 0.9100000262260437, "rewards/chosen": -3.7857413291931152, "rewards/margins": 5.433224678039551, "rewards/rejected": -9.218965530395508, "step": 850 }, { "epoch": 0.9555555555555556, "grad_norm": 36.987693786621094, "learning_rate": 3.1236028601449534e-06, "logits/chosen": -1.457148551940918, "logits/rejected": -1.4629095792770386, "logps/chosen": -213.85028076171875, "logps/rejected": -263.3716735839844, "loss": 0.2345, "rewards/accuracies": 0.8800000548362732, "rewards/chosen": -3.9159281253814697, "rewards/margins": 5.010843276977539, "rewards/rejected": -8.92677116394043, "step": 860 }, { "epoch": 0.9666666666666667, "grad_norm": 3.213857889175415, "learning_rate": 3.0765396768561005e-06, "logits/chosen": -1.4600489139556885, "logits/rejected": -1.4643452167510986, "logps/chosen": -207.65179443359375, "logps/rejected": -265.60382080078125, "loss": 0.1257, "rewards/accuracies": 0.940000057220459, "rewards/chosen": -3.5244479179382324, "rewards/margins": 5.42505407333374, "rewards/rejected": -8.949502944946289, "step": 870 }, { "epoch": 0.9777777777777777, "grad_norm": 2.7685673236846924, "learning_rate": 3.0292596805735275e-06, "logits/chosen": -1.4531805515289307, "logits/rejected": -1.4613621234893799, "logps/chosen": -207.08041381835938, "logps/rejected": -272.2119140625, "loss": 0.0729, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -3.164515495300293, "rewards/margins": 6.724908351898193, "rewards/rejected": -9.889423370361328, "step": 880 }, { "epoch": 0.9888888888888889, "grad_norm": 32.784828186035156, "learning_rate": 2.9817806513702247e-06, "logits/chosen": -1.4549615383148193, "logits/rejected": -1.4622005224227905, "logps/chosen": -208.28564453125, "logps/rejected": -271.87994384765625, "loss": 0.261, "rewards/accuracies": 0.9000000357627869, "rewards/chosen": -3.400259494781494, "rewards/margins": 6.355001449584961, "rewards/rejected": -9.755260467529297, "step": 890 }, { "epoch": 1.0, "grad_norm": 19.346893310546875, "learning_rate": 2.9341204441673267e-06, "logits/chosen": -1.4544117450714111, "logits/rejected": -1.4625937938690186, "logps/chosen": -213.2257537841797, "logps/rejected": -273.80535888671875, "loss": 0.1455, "rewards/accuracies": 0.9500000476837158, "rewards/chosen": -3.6717934608459473, "rewards/margins": 6.4866108894348145, "rewards/rejected": -10.158405303955078, "step": 900 }, { "epoch": 1.0, "eval_logits/chosen": -1.4550888538360596, "eval_logits/rejected": -1.4625444412231445, "eval_logps/chosen": -209.57638549804688, "eval_logps/rejected": -274.5335388183594, "eval_loss": 0.19673706591129303, "eval_rewards/accuracies": 0.9290000200271606, "eval_rewards/chosen": -3.454745292663574, "eval_rewards/margins": 6.637889862060547, "eval_rewards/rejected": -10.092636108398438, "eval_runtime": 319.1955, "eval_samples_per_second": 3.133, "eval_steps_per_second": 0.313, "step": 900 }, { "epoch": 1.011111111111111, "grad_norm": 5.159682273864746, "learning_rate": 2.8862969820196017e-06, "logits/chosen": -1.453148603439331, "logits/rejected": -1.460700511932373, "logps/chosen": -207.94732666015625, "logps/rejected": -275.78265380859375, "loss": 0.1197, "rewards/accuracies": 0.9500000476837158, "rewards/chosen": -3.277247428894043, "rewards/margins": 6.945833683013916, "rewards/rejected": -10.223081588745117, "step": 910 }, { "epoch": 1.0222222222222221, "grad_norm": 39.852725982666016, "learning_rate": 2.8383282493753282e-06, "logits/chosen": -1.4552119970321655, "logits/rejected": -1.4620335102081299, "logps/chosen": -205.69607543945312, "logps/rejected": -279.0772705078125, "loss": 0.1646, "rewards/accuracies": 0.9500000476837158, "rewards/chosen": -3.194272041320801, "rewards/margins": 7.236158847808838, "rewards/rejected": -10.430431365966797, "step": 920 }, { "epoch": 1.0333333333333334, "grad_norm": 0.4127664268016815, "learning_rate": 2.7902322853130758e-06, "logits/chosen": -1.4518330097198486, "logits/rejected": -1.4583864212036133, "logps/chosen": -208.1166229248047, "logps/rejected": -273.89801025390625, "loss": 0.1935, "rewards/accuracies": 0.9300000071525574, "rewards/chosen": -3.375034809112549, "rewards/margins": 6.5843119621276855, "rewards/rejected": -9.959346771240234, "step": 930 }, { "epoch": 1.0444444444444445, "grad_norm": 55.90793991088867, "learning_rate": 2.742027176757948e-06, "logits/chosen": -1.4538707733154297, "logits/rejected": -1.4589080810546875, "logps/chosen": -207.4318389892578, "logps/rejected": -275.7708740234375, "loss": 0.2136, "rewards/accuracies": 0.9100000262260437, "rewards/chosen": -3.4339537620544434, "rewards/margins": 6.580141544342041, "rewards/rejected": -10.014095306396484, "step": 940 }, { "epoch": 1.0555555555555556, "grad_norm": 27.653209686279297, "learning_rate": 2.6937310516798276e-06, "logits/chosen": -1.4511687755584717, "logits/rejected": -1.4569811820983887, "logps/chosen": -213.1746368408203, "logps/rejected": -274.05364990234375, "loss": 0.3442, "rewards/accuracies": 0.8800000548362732, "rewards/chosen": -3.9909844398498535, "rewards/margins": 5.839582443237305, "rewards/rejected": -9.83056640625, "step": 950 }, { "epoch": 1.0666666666666667, "grad_norm": 17.936847686767578, "learning_rate": 2.6453620722761897e-06, "logits/chosen": -1.4525644779205322, "logits/rejected": -1.4593393802642822, "logps/chosen": -210.91744995117188, "logps/rejected": -276.6822814941406, "loss": 0.1456, "rewards/accuracies": 0.9500000476837158, "rewards/chosen": -3.682964324951172, "rewards/margins": 6.5234174728393555, "rewards/rejected": -10.206380844116211, "step": 960 }, { "epoch": 1.0666666666666667, "eval_logits/chosen": -1.4538413286209106, "eval_logits/rejected": -1.461044430732727, "eval_logps/chosen": -214.53591918945312, "eval_logps/rejected": -277.791259765625, "eval_loss": 0.2036525309085846, "eval_rewards/accuracies": 0.9289999604225159, "eval_rewards/chosen": -3.950699806213379, "eval_rewards/margins": 6.467706680297852, "eval_rewards/rejected": -10.418405532836914, "eval_runtime": 319.0271, "eval_samples_per_second": 3.135, "eval_steps_per_second": 0.313, "step": 960 }, { "epoch": 1.0777777777777777, "grad_norm": 53.1196403503418, "learning_rate": 2.5969384281420425e-06, "logits/chosen": -1.452633023262024, "logits/rejected": -1.4589219093322754, "logps/chosen": -213.8082275390625, "logps/rejected": -272.00054931640625, "loss": 0.2095, "rewards/accuracies": 0.9199999570846558, "rewards/chosen": -3.940258264541626, "rewards/margins": 5.825028896331787, "rewards/rejected": -9.765287399291992, "step": 970 }, { "epoch": 1.0888888888888888, "grad_norm": 9.145478248596191, "learning_rate": 2.548478329429561e-06, "logits/chosen": -1.4536033868789673, "logits/rejected": -1.4611570835113525, "logps/chosen": -206.41012573242188, "logps/rejected": -274.23272705078125, "loss": 0.2283, "rewards/accuracies": 0.9200000166893005, "rewards/chosen": -3.157097578048706, "rewards/margins": 6.88623046875, "rewards/rejected": -10.043328285217285, "step": 980 }, { "epoch": 1.1, "grad_norm": 26.69437026977539, "learning_rate": 2.5e-06, "logits/chosen": -1.452072024345398, "logits/rejected": -1.460184097290039, "logps/chosen": -215.56329345703125, "logps/rejected": -278.34051513671875, "loss": 0.2056, "rewards/accuracies": 0.9099999666213989, "rewards/chosen": -4.010292053222656, "rewards/margins": 6.502901077270508, "rewards/rejected": -10.513193130493164, "step": 990 }, { "epoch": 1.1111111111111112, "grad_norm": 26.09144401550293, "learning_rate": 2.4515216705704396e-06, "logits/chosen": -1.4517230987548828, "logits/rejected": -1.4599707126617432, "logps/chosen": -214.8649444580078, "logps/rejected": -274.9700927734375, "loss": 0.2523, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.9132699966430664, "rewards/margins": 6.279613494873047, "rewards/rejected": -10.192882537841797, "step": 1000 }, { "epoch": 1.1222222222222222, "grad_norm": 25.6414852142334, "learning_rate": 2.403061571857958e-06, "logits/chosen": -1.4531188011169434, "logits/rejected": -1.460578441619873, "logps/chosen": -206.95849609375, "logps/rejected": -270.173583984375, "loss": 0.1519, "rewards/accuracies": 0.9500000476837158, "rewards/chosen": -3.1788997650146484, "rewards/margins": 6.496912002563477, "rewards/rejected": -9.675811767578125, "step": 1010 }, { "epoch": 1.1333333333333333, "grad_norm": 18.907466888427734, "learning_rate": 2.3546379277238107e-06, "logits/chosen": -1.4472074508666992, "logits/rejected": -1.4559324979782104, "logps/chosen": -208.98306274414062, "logps/rejected": -275.3448486328125, "loss": 0.1276, "rewards/accuracies": 0.940000057220459, "rewards/chosen": -3.3369410037994385, "rewards/margins": 6.897830486297607, "rewards/rejected": -10.234771728515625, "step": 1020 }, { "epoch": 1.1333333333333333, "eval_logits/chosen": -1.4493515491485596, "eval_logits/rejected": -1.4567832946777344, "eval_logps/chosen": -212.98690795898438, "eval_logps/rejected": -277.5372619628906, "eval_loss": 0.20899365842342377, "eval_rewards/accuracies": 0.9240000247955322, "eval_rewards/chosen": -3.7957983016967773, "eval_rewards/margins": 6.597206115722656, "eval_rewards/rejected": -10.39300537109375, "eval_runtime": 319.0554, "eval_samples_per_second": 3.134, "eval_steps_per_second": 0.313, "step": 1020 }, { "epoch": 1.1444444444444444, "grad_norm": 6.1386847496032715, "learning_rate": 2.3062689483201732e-06, "logits/chosen": -1.449528694152832, "logits/rejected": -1.4584475755691528, "logps/chosen": -210.4915771484375, "logps/rejected": -280.85870361328125, "loss": 0.0659, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -3.4778530597686768, "rewards/margins": 7.316442966461182, "rewards/rejected": -10.794296264648438, "step": 1030 }, { "epoch": 1.1555555555555554, "grad_norm": 0.5934897065162659, "learning_rate": 2.2579728232420524e-06, "logits/chosen": -1.450500726699829, "logits/rejected": -1.4581060409545898, "logps/chosen": -204.58624267578125, "logps/rejected": -282.1773681640625, "loss": 0.0935, "rewards/accuracies": 0.9500000476837158, "rewards/chosen": -3.075925350189209, "rewards/margins": 7.654919624328613, "rewards/rejected": -10.73084545135498, "step": 1040 }, { "epoch": 1.1666666666666667, "grad_norm": 36.4771614074707, "learning_rate": 2.2097677146869242e-06, "logits/chosen": -1.452951431274414, "logits/rejected": -1.4590579271316528, "logps/chosen": -204.48294067382812, "logps/rejected": -279.20623779296875, "loss": 0.3168, "rewards/accuracies": 0.9100000858306885, "rewards/chosen": -3.147733449935913, "rewards/margins": 7.201406002044678, "rewards/rejected": -10.349140167236328, "step": 1050 }, { "epoch": 1.1777777777777778, "grad_norm": 3.764472007751465, "learning_rate": 2.161671750624673e-06, "logits/chosen": -1.453149437904358, "logits/rejected": -1.4607138633728027, "logps/chosen": -213.9553985595703, "logps/rejected": -279.18658447265625, "loss": 0.0952, "rewards/accuracies": 0.9500000476837158, "rewards/chosen": -3.8566091060638428, "rewards/margins": 6.743518829345703, "rewards/rejected": -10.600127220153809, "step": 1060 }, { "epoch": 1.1888888888888889, "grad_norm": 42.325042724609375, "learning_rate": 2.113703017980399e-06, "logits/chosen": -1.4516856670379639, "logits/rejected": -1.4586374759674072, "logps/chosen": -212.69400024414062, "logps/rejected": -279.00396728515625, "loss": 0.1345, "rewards/accuracies": 0.9300000667572021, "rewards/chosen": -3.775832176208496, "rewards/margins": 6.73020076751709, "rewards/rejected": -10.506032943725586, "step": 1070 }, { "epoch": 1.2, "grad_norm": 11.120525360107422, "learning_rate": 2.0658795558326745e-06, "logits/chosen": -1.4495866298675537, "logits/rejected": -1.4552946090698242, "logps/chosen": -208.491943359375, "logps/rejected": -277.58441162109375, "loss": 0.1768, "rewards/accuracies": 0.9300000667572021, "rewards/chosen": -3.599666118621826, "rewards/margins": 6.546569347381592, "rewards/rejected": -10.146235466003418, "step": 1080 }, { "epoch": 1.2, "eval_logits/chosen": -1.4486984014511108, "eval_logits/rejected": -1.456477403640747, "eval_logps/chosen": -212.42550659179688, "eval_logps/rejected": -281.87176513671875, "eval_loss": 0.17441098392009735, "eval_rewards/accuracies": 0.934999942779541, "eval_rewards/chosen": -3.739656925201416, "eval_rewards/margins": 7.086799621582031, "eval_rewards/rejected": -10.826456069946289, "eval_runtime": 319.0538, "eval_samples_per_second": 3.134, "eval_steps_per_second": 0.313, "step": 1080 }, { "epoch": 1.211111111111111, "grad_norm": 0.504275918006897, "learning_rate": 2.0182193486297757e-06, "logits/chosen": -1.4507848024368286, "logits/rejected": -1.457758903503418, "logps/chosen": -210.86541748046875, "logps/rejected": -281.8276062011719, "loss": 0.2981, "rewards/accuracies": 0.8899999856948853, "rewards/chosen": -3.7087669372558594, "rewards/margins": 6.992186546325684, "rewards/rejected": -10.700953483581543, "step": 1090 }, { "epoch": 1.2222222222222223, "grad_norm": 38.076751708984375, "learning_rate": 1.970740319426474e-06, "logits/chosen": -1.4482132196426392, "logits/rejected": -1.4552950859069824, "logps/chosen": -210.63734436035156, "logps/rejected": -281.1439514160156, "loss": 0.1044, "rewards/accuracies": 0.9500000476837158, "rewards/chosen": -3.595562219619751, "rewards/margins": 7.117644786834717, "rewards/rejected": -10.713207244873047, "step": 1100 }, { "epoch": 1.2333333333333334, "grad_norm": 33.685394287109375, "learning_rate": 1.9234603231439e-06, "logits/chosen": -1.4512310028076172, "logits/rejected": -1.4559197425842285, "logps/chosen": -212.38308715820312, "logps/rejected": -278.6757507324219, "loss": 0.1273, "rewards/accuracies": 0.9300000667572021, "rewards/chosen": -3.874723434448242, "rewards/margins": 6.4788641929626465, "rewards/rejected": -10.353588104248047, "step": 1110 }, { "epoch": 1.2444444444444445, "grad_norm": 3.557368755340576, "learning_rate": 1.876397139855047e-06, "logits/chosen": -1.4465047121047974, "logits/rejected": -1.4536259174346924, "logps/chosen": -216.92788696289062, "logps/rejected": -283.7412109375, "loss": 0.1901, "rewards/accuracies": 0.9000000357627869, "rewards/chosen": -4.245588302612305, "rewards/margins": 6.73915958404541, "rewards/rejected": -10.984746932983398, "step": 1120 }, { "epoch": 1.2555555555555555, "grad_norm": 7.008020401000977, "learning_rate": 1.8295684680984064e-06, "logits/chosen": -1.4479541778564453, "logits/rejected": -1.4568493366241455, "logps/chosen": -216.7061309814453, "logps/rejected": -287.44635009765625, "loss": 0.1335, "rewards/accuracies": 0.9300000071525574, "rewards/chosen": -4.068508148193359, "rewards/margins": 7.39614725112915, "rewards/rejected": -11.464654922485352, "step": 1130 }, { "epoch": 1.2666666666666666, "grad_norm": 0.6432875394821167, "learning_rate": 1.7829919182222752e-06, "logits/chosen": -1.4469690322875977, "logits/rejected": -1.4537835121154785, "logps/chosen": -218.46372985839844, "logps/rejected": -278.9936828613281, "loss": 0.2379, "rewards/accuracies": 0.9200000166893005, "rewards/chosen": -4.34934139251709, "rewards/margins": 6.168898105621338, "rewards/rejected": -10.518239974975586, "step": 1140 }, { "epoch": 1.2666666666666666, "eval_logits/chosen": -1.4457852840423584, "eval_logits/rejected": -1.4531958103179932, "eval_logps/chosen": -218.02694702148438, "eval_logps/rejected": -284.6993103027344, "eval_loss": 0.1678517907857895, "eval_rewards/accuracies": 0.9259999990463257, "eval_rewards/chosen": -4.299802780151367, "eval_rewards/margins": 6.809408664703369, "eval_rewards/rejected": -11.109211921691895, "eval_runtime": 319.0708, "eval_samples_per_second": 3.134, "eval_steps_per_second": 0.313, "step": 1140 }, { "epoch": 1.2777777777777777, "grad_norm": 2.2014999389648438, "learning_rate": 1.7366850057622176e-06, "logits/chosen": -1.4449026584625244, "logits/rejected": -1.4521173238754272, "logps/chosen": -222.32373046875, "logps/rejected": -287.82470703125, "loss": 0.124, "rewards/accuracies": 0.9200000166893005, "rewards/chosen": -4.7135515213012695, "rewards/margins": 6.7242326736450195, "rewards/rejected": -11.437784194946289, "step": 1150 }, { "epoch": 1.2888888888888888, "grad_norm": 7.591695308685303, "learning_rate": 1.6906651448541977e-06, "logits/chosen": -1.44550621509552, "logits/rejected": -1.452260971069336, "logps/chosen": -216.9026641845703, "logps/rejected": -290.4498291015625, "loss": 0.1115, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -4.313070774078369, "rewards/margins": 7.246844291687012, "rewards/rejected": -11.559915542602539, "step": 1160 }, { "epoch": 1.3, "grad_norm": 0.5335530042648315, "learning_rate": 1.6449496416858285e-06, "logits/chosen": -1.4446995258331299, "logits/rejected": -1.4526121616363525, "logps/chosen": -217.58978271484375, "logps/rejected": -297.33953857421875, "loss": 0.1353, "rewards/accuracies": 0.9300000667572021, "rewards/chosen": -4.29036283493042, "rewards/margins": 8.026262283325195, "rewards/rejected": -12.316625595092773, "step": 1170 }, { "epoch": 1.3111111111111111, "grad_norm": 11.03783130645752, "learning_rate": 1.5995556879882246e-06, "logits/chosen": -1.44581937789917, "logits/rejected": -1.45389986038208, "logps/chosen": -220.682373046875, "logps/rejected": -293.34466552734375, "loss": 0.1417, "rewards/accuracies": 0.9500000476837158, "rewards/chosen": -4.592069625854492, "rewards/margins": 7.323507308959961, "rewards/rejected": -11.915576934814453, "step": 1180 }, { "epoch": 1.3222222222222222, "grad_norm": 47.34039306640625, "learning_rate": 1.5545003545708942e-06, "logits/chosen": -1.443078875541687, "logits/rejected": -1.4502229690551758, "logps/chosen": -221.423583984375, "logps/rejected": -293.61309814453125, "loss": 0.1844, "rewards/accuracies": 0.9200000166893005, "rewards/chosen": -4.658720016479492, "rewards/margins": 7.329789161682129, "rewards/rejected": -11.988508224487305, "step": 1190 }, { "epoch": 1.3333333333333333, "grad_norm": 7.4648566246032715, "learning_rate": 1.509800584902108e-06, "logits/chosen": -1.445673942565918, "logits/rejected": -1.4527684450149536, "logps/chosen": -214.28665161132812, "logps/rejected": -296.835205078125, "loss": 0.0571, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -4.044223308563232, "rewards/margins": 8.145161628723145, "rewards/rejected": -12.189384460449219, "step": 1200 }, { "epoch": 1.3333333333333333, "eval_logits/chosen": -1.4334654808044434, "eval_logits/rejected": -1.4415010213851929, "eval_logps/chosen": -220.21426391601562, "eval_logps/rejected": -297.70947265625, "eval_loss": 0.16259507834911346, "eval_rewards/accuracies": 0.9420000314712524, "eval_rewards/chosen": -4.518533706665039, "eval_rewards/margins": 7.891695022583008, "eval_rewards/rejected": -12.410228729248047, "eval_runtime": 319.1244, "eval_samples_per_second": 3.134, "eval_steps_per_second": 0.313, "step": 1200 }, { "epoch": 1.3444444444444446, "grad_norm": 55.29027557373047, "learning_rate": 1.4654731887371524e-06, "logits/chosen": -1.4422087669372559, "logits/rejected": -1.447311520576477, "logps/chosen": -219.22817993164062, "logps/rejected": -296.27703857421875, "loss": 0.2174, "rewards/accuracies": 0.9100000262260437, "rewards/chosen": -4.672391414642334, "rewards/margins": 7.32711124420166, "rewards/rejected": -11.999502182006836, "step": 1210 }, { "epoch": 1.3555555555555556, "grad_norm": 4.867663860321045, "learning_rate": 1.421534835796867e-06, "logits/chosen": -1.441540241241455, "logits/rejected": -1.4484856128692627, "logps/chosen": -218.04440307617188, "logps/rejected": -293.49969482421875, "loss": 0.1568, "rewards/accuracies": 0.9500000476837158, "rewards/chosen": -4.3516693115234375, "rewards/margins": 7.608930587768555, "rewards/rejected": -11.960600852966309, "step": 1220 }, { "epoch": 1.3666666666666667, "grad_norm": 20.99898910522461, "learning_rate": 1.3780020494988447e-06, "logits/chosen": -1.4409953355789185, "logits/rejected": -1.448107361793518, "logps/chosen": -211.9335174560547, "logps/rejected": -297.6470947265625, "loss": 0.0491, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -3.8174076080322266, "rewards/margins": 8.474346160888672, "rewards/rejected": -12.291754722595215, "step": 1230 }, { "epoch": 1.3777777777777778, "grad_norm": 110.80073547363281, "learning_rate": 1.3348912007436538e-06, "logits/chosen": -1.4384217262268066, "logits/rejected": -1.4469711780548096, "logps/chosen": -221.45703125, "logps/rejected": -297.884521484375, "loss": 0.2962, "rewards/accuracies": 0.9000000357627869, "rewards/chosen": -4.6395368576049805, "rewards/margins": 7.799114227294922, "rewards/rejected": -12.438650131225586, "step": 1240 }, { "epoch": 1.3888888888888888, "grad_norm": 11.451473236083984, "learning_rate": 1.2922185017584038e-06, "logits/chosen": -1.4418102502822876, "logits/rejected": -1.450040340423584, "logps/chosen": -224.67489624023438, "logps/rejected": -294.7509460449219, "loss": 0.2274, "rewards/accuracies": 0.9300000667572021, "rewards/chosen": -4.944138526916504, "rewards/margins": 7.171684265136719, "rewards/rejected": -12.115822792053223, "step": 1250 }, { "epoch": 1.4, "grad_norm": 49.401031494140625, "learning_rate": 1.2500000000000007e-06, "logits/chosen": -1.4403979778289795, "logits/rejected": -1.4471863508224487, "logps/chosen": -212.93165588378906, "logps/rejected": -292.7206726074219, "loss": 0.1644, "rewards/accuracies": 0.9300000667572021, "rewards/chosen": -3.9882044792175293, "rewards/margins": 7.7179412841796875, "rewards/rejected": -11.706144332885742, "step": 1260 }, { "epoch": 1.4, "eval_logits/chosen": -1.4410432577133179, "eval_logits/rejected": -1.4497298002243042, "eval_logps/chosen": -218.07644653320312, "eval_logps/rejected": -295.89495849609375, "eval_loss": 0.1613789200782776, "eval_rewards/accuracies": 0.9399999976158142, "eval_rewards/chosen": -4.304754734039307, "eval_rewards/margins": 7.924018383026123, "eval_rewards/rejected": -12.228774070739746, "eval_runtime": 319.1171, "eval_samples_per_second": 3.134, "eval_steps_per_second": 0.313, "step": 1260 }, { "epoch": 1.411111111111111, "grad_norm": 3.6920242309570312, "learning_rate": 1.2082515721203429e-06, "logits/chosen": -1.439995527267456, "logits/rejected": -1.446855068206787, "logps/chosen": -213.84771728515625, "logps/rejected": -295.51678466796875, "loss": 0.1399, "rewards/accuracies": 0.9300000667572021, "rewards/chosen": -4.055336952209473, "rewards/margins": 7.958105087280273, "rewards/rejected": -12.01344108581543, "step": 1270 }, { "epoch": 1.4222222222222223, "grad_norm": 52.6472282409668, "learning_rate": 1.1669889179957725e-06, "logits/chosen": -1.4401957988739014, "logits/rejected": -1.4491486549377441, "logps/chosen": -213.06893920898438, "logps/rejected": -299.53369140625, "loss": 0.1568, "rewards/accuracies": 0.9500000476837158, "rewards/chosen": -3.898712396621704, "rewards/margins": 8.592363357543945, "rewards/rejected": -12.49107551574707, "step": 1280 }, { "epoch": 1.4333333333333333, "grad_norm": 29.101463317871094, "learning_rate": 1.1262275548229852e-06, "logits/chosen": -1.441546082496643, "logits/rejected": -1.4492754936218262, "logps/chosen": -215.4649200439453, "logps/rejected": -297.18133544921875, "loss": 0.1692, "rewards/accuracies": 0.9500000476837158, "rewards/chosen": -4.13901424407959, "rewards/margins": 8.12405776977539, "rewards/rejected": -12.263072967529297, "step": 1290 }, { "epoch": 1.4444444444444444, "grad_norm": 34.07436752319336, "learning_rate": 1.085982811283654e-06, "logits/chosen": -1.4384413957595825, "logits/rejected": -1.448547601699829, "logps/chosen": -223.9049530029297, "logps/rejected": -299.6859436035156, "loss": 0.2549, "rewards/accuracies": 0.9100000262260437, "rewards/chosen": -4.689306259155273, "rewards/margins": 8.127752304077148, "rewards/rejected": -12.817058563232422, "step": 1300 }, { "epoch": 1.4555555555555555, "grad_norm": 1.7382193803787231, "learning_rate": 1.0462698217799333e-06, "logits/chosen": -1.4337615966796875, "logits/rejected": -1.4433856010437012, "logps/chosen": -220.635009765625, "logps/rejected": -301.463623046875, "loss": 0.0488, "rewards/accuracies": 0.9900000095367432, "rewards/chosen": -4.439496994018555, "rewards/margins": 8.465967178344727, "rewards/rejected": -12.905464172363281, "step": 1310 }, { "epoch": 1.4666666666666668, "grad_norm": 24.08198356628418, "learning_rate": 1.0071035207430352e-06, "logits/chosen": -1.4395148754119873, "logits/rejected": -1.4469318389892578, "logps/chosen": -217.70321655273438, "logps/rejected": -301.6554260253906, "loss": 0.3264, "rewards/accuracies": 0.9199999570846558, "rewards/chosen": -4.422707557678223, "rewards/margins": 8.232616424560547, "rewards/rejected": -12.655323028564453, "step": 1320 }, { "epoch": 1.4666666666666668, "eval_logits/chosen": -1.4390203952789307, "eval_logits/rejected": -1.447505235671997, "eval_logps/chosen": -220.72488403320312, "eval_logps/rejected": -299.20281982421875, "eval_loss": 0.14269497990608215, "eval_rewards/accuracies": 0.9470000267028809, "eval_rewards/chosen": -4.569596290588379, "eval_rewards/margins": 7.98996639251709, "eval_rewards/rejected": -12.559562683105469, "eval_runtime": 319.0786, "eval_samples_per_second": 3.134, "eval_steps_per_second": 0.313, "step": 1320 }, { "epoch": 1.4777777777777779, "grad_norm": 33.78030014038086, "learning_rate": 9.68498637016993e-07, "logits/chosen": -1.4397677183151245, "logits/rejected": -1.447797417640686, "logps/chosen": -215.69036865234375, "logps/rejected": -304.0943603515625, "loss": 0.0708, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -4.21912145614624, "rewards/margins": 8.63876724243164, "rewards/rejected": -12.857889175415039, "step": 1330 }, { "epoch": 1.488888888888889, "grad_norm": 12.473185539245605, "learning_rate": 9.304696883197542e-07, "logits/chosen": -1.4403018951416016, "logits/rejected": -1.4468640089035034, "logps/chosen": -221.52301025390625, "logps/rejected": -301.3069763183594, "loss": 0.1152, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -4.801990509033203, "rewards/margins": 7.815496921539307, "rewards/rejected": -12.617486953735352, "step": 1340 }, { "epoch": 1.5, "grad_norm": 8.766846656799316, "learning_rate": 8.930309757836517e-07, "logits/chosen": -1.4371258020401, "logits/rejected": -1.4433132410049438, "logps/chosen": -213.47586059570312, "logps/rejected": -302.5564270019531, "loss": 0.1309, "rewards/accuracies": 0.940000057220459, "rewards/chosen": -4.185782432556152, "rewards/margins": 8.38406753540039, "rewards/rejected": -12.569849014282227, "step": 1350 }, { "epoch": 1.511111111111111, "grad_norm": 2.927105665206909, "learning_rate": 8.561965785773413e-07, "logits/chosen": -1.4389441013336182, "logits/rejected": -1.4470137357711792, "logps/chosen": -220.51834106445312, "logps/rejected": -297.39044189453125, "loss": 0.1829, "rewards/accuracies": 0.9300000071525574, "rewards/chosen": -4.525027751922607, "rewards/margins": 7.858166694641113, "rewards/rejected": -12.383193969726562, "step": 1360 }, { "epoch": 1.5222222222222221, "grad_norm": 18.771081924438477, "learning_rate": 8.19980348611194e-07, "logits/chosen": -1.437861680984497, "logits/rejected": -1.4455113410949707, "logps/chosen": -220.85772705078125, "logps/rejected": -301.1795654296875, "loss": 0.2494, "rewards/accuracies": 0.9200000166893005, "rewards/chosen": -4.576181411743164, "rewards/margins": 8.187376022338867, "rewards/rejected": -12.763558387756348, "step": 1370 }, { "epoch": 1.5333333333333332, "grad_norm": 1.1617432832717896, "learning_rate": 7.843959053281663e-07, "logits/chosen": -1.434956669807434, "logits/rejected": -1.4432401657104492, "logps/chosen": -211.97430419921875, "logps/rejected": -305.6387023925781, "loss": 0.1088, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -3.911351203918457, "rewards/margins": 9.050506591796875, "rewards/rejected": -12.961858749389648, "step": 1380 }, { "epoch": 1.5333333333333332, "eval_logits/chosen": -1.4380238056182861, "eval_logits/rejected": -1.4465129375457764, "eval_logps/chosen": -221.4553680419922, "eval_logps/rejected": -301.4556884765625, "eval_loss": 0.1381780505180359, "eval_rewards/accuracies": 0.9509999752044678, "eval_rewards/chosen": -4.642644882202148, "eval_rewards/margins": 8.142204284667969, "eval_rewards/rejected": -12.784847259521484, "eval_runtime": 319.0431, "eval_samples_per_second": 3.134, "eval_steps_per_second": 0.313, "step": 1380 }, { "epoch": 1.5444444444444443, "grad_norm": 0.9792585968971252, "learning_rate": 7.494566305820788e-07, "logits/chosen": -1.4381271600723267, "logits/rejected": -1.447120189666748, "logps/chosen": -219.04547119140625, "logps/rejected": -302.42169189453125, "loss": 0.0939, "rewards/accuracies": 0.9500000476837158, "rewards/chosen": -4.438849925994873, "rewards/margins": 8.397099494934082, "rewards/rejected": -12.835948944091797, "step": 1390 }, { "epoch": 1.5555555555555556, "grad_norm": 75.47477722167969, "learning_rate": 7.151756636052529e-07, "logits/chosen": -1.4314817190170288, "logits/rejected": -1.441815972328186, "logps/chosen": -225.7080078125, "logps/rejected": -303.7359313964844, "loss": 0.2658, "rewards/accuracies": 0.9399999976158142, "rewards/chosen": -4.849039554595947, "rewards/margins": 8.38930892944336, "rewards/rejected": -13.238348007202148, "step": 1400 }, { "epoch": 1.5666666666666667, "grad_norm": 14.793700218200684, "learning_rate": 6.815658960673782e-07, "logits/chosen": -1.4330942630767822, "logits/rejected": -1.4421098232269287, "logps/chosen": -215.4171142578125, "logps/rejected": -309.46636962890625, "loss": 0.1275, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -4.201850891113281, "rewards/margins": 9.197854995727539, "rewards/rejected": -13.39970588684082, "step": 1410 }, { "epoch": 1.5777777777777777, "grad_norm": 44.33953857421875, "learning_rate": 6.48639967227489e-07, "logits/chosen": -1.43377685546875, "logits/rejected": -1.4425432682037354, "logps/chosen": -223.57232666015625, "logps/rejected": -300.73687744140625, "loss": 0.1086, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -4.74373722076416, "rewards/margins": 8.091646194458008, "rewards/rejected": -12.835383415222168, "step": 1420 }, { "epoch": 1.588888888888889, "grad_norm": 57.546730041503906, "learning_rate": 6.164102591808482e-07, "logits/chosen": -1.436528205871582, "logits/rejected": -1.4442325830459595, "logps/chosen": -223.30726623535156, "logps/rejected": -298.614501953125, "loss": 0.1289, "rewards/accuracies": 0.9500000476837158, "rewards/chosen": -4.813453197479248, "rewards/margins": 7.699684143066406, "rewards/rejected": -12.513137817382812, "step": 1430 }, { "epoch": 1.6, "grad_norm": 24.603193283081055, "learning_rate": 5.848888922025553e-07, "logits/chosen": -1.435572624206543, "logits/rejected": -1.4410022497177124, "logps/chosen": -222.442626953125, "logps/rejected": -297.5535888671875, "loss": 0.1853, "rewards/accuracies": 0.9399999976158142, "rewards/chosen": -5.006618022918701, "rewards/margins": 7.103509902954102, "rewards/rejected": -12.110126495361328, "step": 1440 }, { "epoch": 1.6, "eval_logits/chosen": -1.434856653213501, "eval_logits/rejected": -1.4433155059814453, "eval_logps/chosen": -225.01356506347656, "eval_logps/rejected": -305.67608642578125, "eval_loss": 0.1416788399219513, "eval_rewards/accuracies": 0.9490000009536743, "eval_rewards/chosen": -4.998464584350586, "eval_rewards/margins": 8.208425521850586, "eval_rewards/rejected": -13.206890106201172, "eval_runtime": 319.0443, "eval_samples_per_second": 3.134, "eval_steps_per_second": 0.313, "step": 1440 }, { "epoch": 1.6111111111111112, "grad_norm": 11.052775382995605, "learning_rate": 5.540877201896e-07, "logits/chosen": -1.4346046447753906, "logits/rejected": -1.441970705986023, "logps/chosen": -220.8565216064453, "logps/rejected": -309.455078125, "loss": 0.036, "rewards/accuracies": 1.0, "rewards/chosen": -4.751629829406738, "rewards/margins": 8.649368286132812, "rewards/rejected": -13.40099811553955, "step": 1450 }, { "epoch": 1.6222222222222222, "grad_norm": 155.28163146972656, "learning_rate": 5.240183262031021e-07, "logits/chosen": -1.4323256015777588, "logits/rejected": -1.4386875629425049, "logps/chosen": -223.56954956054688, "logps/rejected": -303.56964111328125, "loss": 0.1732, "rewards/accuracies": 0.9399999976158142, "rewards/chosen": -5.084565162658691, "rewards/margins": 7.685075759887695, "rewards/rejected": -12.76963996887207, "step": 1460 }, { "epoch": 1.6333333333333333, "grad_norm": 0.09031402319669724, "learning_rate": 4.946920181123904e-07, "logits/chosen": -1.4354331493377686, "logits/rejected": -1.4433681964874268, "logps/chosen": -218.1959228515625, "logps/rejected": -311.41815185546875, "loss": 0.0411, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -4.469109535217285, "rewards/margins": 9.170130729675293, "rewards/rejected": -13.639240264892578, "step": 1470 }, { "epoch": 1.6444444444444444, "grad_norm": 4.468040943145752, "learning_rate": 4.661198243425813e-07, "logits/chosen": -1.4358713626861572, "logits/rejected": -1.4435877799987793, "logps/chosen": -220.62884521484375, "logps/rejected": -308.4977722167969, "loss": 0.1746, "rewards/accuracies": 0.9100000262260437, "rewards/chosen": -4.726615905761719, "rewards/margins": 8.57591724395752, "rewards/rejected": -13.302533149719238, "step": 1480 }, { "epoch": 1.6555555555555554, "grad_norm": 24.597213745117188, "learning_rate": 4.383124897272331e-07, "logits/chosen": -1.4311268329620361, "logits/rejected": -1.4438539743423462, "logps/chosen": -223.80532836914062, "logps/rejected": -316.3698425292969, "loss": 0.1137, "rewards/accuracies": 0.9500000476837158, "rewards/chosen": -4.6827216148376465, "rewards/margins": 9.79144287109375, "rewards/rejected": -14.474164009094238, "step": 1490 }, { "epoch": 1.6666666666666665, "grad_norm": 11.660173416137695, "learning_rate": 4.1128047146765936e-07, "logits/chosen": -1.4334840774536133, "logits/rejected": -1.4423227310180664, "logps/chosen": -222.2215118408203, "logps/rejected": -310.5578308105469, "loss": 0.1406, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -4.7878241539001465, "rewards/margins": 8.839967727661133, "rewards/rejected": -13.627790451049805, "step": 1500 }, { "epoch": 1.6666666666666665, "eval_logits/chosen": -1.4283111095428467, "eval_logits/rejected": -1.4372782707214355, "eval_logps/chosen": -226.19561767578125, "eval_logps/rejected": -312.00286865234375, "eval_loss": 0.17411097884178162, "eval_rewards/accuracies": 0.9409999847412109, "eval_rewards/chosen": -5.116670608520508, "eval_rewards/margins": 8.722896575927734, "eval_rewards/rejected": -13.839567184448242, "eval_runtime": 319.1464, "eval_samples_per_second": 3.133, "eval_steps_per_second": 0.313, "step": 1500 }, { "epoch": 1.6777777777777778, "grad_norm": 34.12993240356445, "learning_rate": 3.8503393520039734e-07, "logits/chosen": -1.4328773021697998, "logits/rejected": -1.441019058227539, "logps/chosen": -226.4792022705078, "logps/rejected": -311.11376953125, "loss": 0.1039, "rewards/accuracies": 0.9500000476837158, "rewards/chosen": -5.133286476135254, "rewards/margins": 8.623987197875977, "rewards/rejected": -13.757274627685547, "step": 1510 }, { "epoch": 1.6888888888888889, "grad_norm": 3.2780518531799316, "learning_rate": 3.595827511743341e-07, "logits/chosen": -1.4375367164611816, "logits/rejected": -1.4466478824615479, "logps/chosen": -228.24720764160156, "logps/rejected": -313.73443603515625, "loss": 0.1373, "rewards/accuracies": 0.940000057220459, "rewards/chosen": -5.353361129760742, "rewards/margins": 8.607213973999023, "rewards/rejected": -13.960576057434082, "step": 1520 }, { "epoch": 1.7, "grad_norm": 0.9572333097457886, "learning_rate": 3.3493649053890325e-07, "logits/chosen": -1.4340400695800781, "logits/rejected": -1.4402492046356201, "logps/chosen": -219.64596557617188, "logps/rejected": -312.9942626953125, "loss": 0.0528, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -4.749961853027344, "rewards/margins": 8.902398109436035, "rewards/rejected": -13.652359962463379, "step": 1530 }, { "epoch": 1.7111111111111112, "grad_norm": 50.73735046386719, "learning_rate": 3.111044217447731e-07, "logits/chosen": -1.4347453117370605, "logits/rejected": -1.4433064460754395, "logps/chosen": -226.47991943359375, "logps/rejected": -311.9306945800781, "loss": 0.145, "rewards/accuracies": 0.9399999976158142, "rewards/chosen": -5.080777168273926, "rewards/margins": 8.817042350769043, "rewards/rejected": -13.897819519042969, "step": 1540 }, { "epoch": 1.7222222222222223, "grad_norm": 59.58679962158203, "learning_rate": 2.880955070583555e-07, "logits/chosen": -1.4339215755462646, "logits/rejected": -1.441694736480713, "logps/chosen": -221.3302001953125, "logps/rejected": -311.9345703125, "loss": 0.068, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -4.724796772003174, "rewards/margins": 9.014541625976562, "rewards/rejected": -13.739337921142578, "step": 1550 }, { "epoch": 1.7333333333333334, "grad_norm": 0.05962331220507622, "learning_rate": 2.6591839919146963e-07, "logits/chosen": -1.4334750175476074, "logits/rejected": -1.4436006546020508, "logps/chosen": -222.41143798828125, "logps/rejected": -309.032470703125, "loss": 0.1751, "rewards/accuracies": 0.940000057220459, "rewards/chosen": -4.5645952224731445, "rewards/margins": 9.154894828796387, "rewards/rejected": -13.719490051269531, "step": 1560 }, { "epoch": 1.7333333333333334, "eval_logits/chosen": -1.4309327602386475, "eval_logits/rejected": -1.4397170543670654, "eval_logps/chosen": -224.7161102294922, "eval_logps/rejected": -310.6195373535156, "eval_loss": 0.14331386983394623, "eval_rewards/accuracies": 0.9479999542236328, "eval_rewards/chosen": -4.968718528747559, "eval_rewards/margins": 8.732516288757324, "eval_rewards/rejected": -13.701233863830566, "eval_runtime": 319.0216, "eval_samples_per_second": 3.135, "eval_steps_per_second": 0.313, "step": 1560 }, { "epoch": 1.7444444444444445, "grad_norm": 13.6820707321167, "learning_rate": 2.445814380474057e-07, "logits/chosen": -1.431359052658081, "logits/rejected": -1.4409172534942627, "logps/chosen": -223.4319610595703, "logps/rejected": -311.96630859375, "loss": 0.1306, "rewards/accuracies": 0.940000057220459, "rewards/chosen": -4.801632881164551, "rewards/margins": 9.068136215209961, "rewards/rejected": -13.869770050048828, "step": 1570 }, { "epoch": 1.7555555555555555, "grad_norm": 34.89216613769531, "learning_rate": 2.240926475846336e-07, "logits/chosen": -1.4330511093139648, "logits/rejected": -1.440633773803711, "logps/chosen": -220.532470703125, "logps/rejected": -311.3044738769531, "loss": 0.1008, "rewards/accuracies": 0.940000057220459, "rewards/chosen": -4.673737049102783, "rewards/margins": 8.975007057189941, "rewards/rejected": -13.648744583129883, "step": 1580 }, { "epoch": 1.7666666666666666, "grad_norm": 8.211335182189941, "learning_rate": 2.044597327993153e-07, "logits/chosen": -1.4353344440460205, "logits/rejected": -1.443753719329834, "logps/chosen": -228.2718505859375, "logps/rejected": -309.8863830566406, "loss": 0.0899, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -5.20648193359375, "rewards/margins": 8.540419578552246, "rewards/rejected": -13.74690055847168, "step": 1590 }, { "epoch": 1.7777777777777777, "grad_norm": 28.57567024230957, "learning_rate": 1.8569007682777417e-07, "logits/chosen": -1.4354259967803955, "logits/rejected": -1.4432419538497925, "logps/chosen": -234.33609008789062, "logps/rejected": -307.1394348144531, "loss": 0.2102, "rewards/accuracies": 0.9300000667572021, "rewards/chosen": -5.7801055908203125, "rewards/margins": 7.755853176116943, "rewards/rejected": -13.535958290100098, "step": 1600 }, { "epoch": 1.7888888888888888, "grad_norm": 0.20096662640571594, "learning_rate": 1.6779073816999864e-07, "logits/chosen": -1.4322645664215088, "logits/rejected": -1.440659523010254, "logps/chosen": -229.00491333007812, "logps/rejected": -311.02471923828125, "loss": 0.2204, "rewards/accuracies": 0.9300000667572021, "rewards/chosen": -5.322908401489258, "rewards/margins": 8.485345840454102, "rewards/rejected": -13.808255195617676, "step": 1610 }, { "epoch": 1.8, "grad_norm": 1.0877524614334106, "learning_rate": 1.507684480352292e-07, "logits/chosen": -1.4340641498565674, "logits/rejected": -1.4434683322906494, "logps/chosen": -224.53636169433594, "logps/rejected": -313.3722839355469, "loss": 0.1648, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -4.935386657714844, "rewards/margins": 9.045793533325195, "rewards/rejected": -13.981179237365723, "step": 1620 }, { "epoch": 1.8, "eval_logits/chosen": -1.43349289894104, "eval_logits/rejected": -1.4424097537994385, "eval_logps/chosen": -224.81407165527344, "eval_logps/rejected": -310.5035400390625, "eval_loss": 0.13682803511619568, "eval_rewards/accuracies": 0.9499999284744263, "eval_rewards/chosen": -4.978516101837158, "eval_rewards/margins": 8.7111177444458, "eval_rewards/rejected": -13.689634323120117, "eval_runtime": 319.0475, "eval_samples_per_second": 3.134, "eval_steps_per_second": 0.313, "step": 1620 }, { "epoch": 1.8111111111111111, "grad_norm": 72.70378112792969, "learning_rate": 1.3462960781062433e-07, "logits/chosen": -1.435239553451538, "logits/rejected": -1.443226933479309, "logps/chosen": -224.82131958007812, "logps/rejected": -309.9866027832031, "loss": 0.2082, "rewards/accuracies": 0.9300000667572021, "rewards/chosen": -5.088614463806152, "rewards/margins": 8.422065734863281, "rewards/rejected": -13.5106782913208, "step": 1630 }, { "epoch": 1.8222222222222222, "grad_norm": 2.4910500049591064, "learning_rate": 1.1938028665396172e-07, "logits/chosen": -1.433123230934143, "logits/rejected": -1.4404329061508179, "logps/chosen": -219.64083862304688, "logps/rejected": -309.46173095703125, "loss": 0.1179, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -4.623630523681641, "rewards/margins": 8.780423164367676, "rewards/rejected": -13.404053688049316, "step": 1640 }, { "epoch": 1.8333333333333335, "grad_norm": 69.12033081054688, "learning_rate": 1.0502621921127776e-07, "logits/chosen": -1.4359033107757568, "logits/rejected": -1.4434657096862793, "logps/chosen": -218.10604858398438, "logps/rejected": -313.9632568359375, "loss": 0.0828, "rewards/accuracies": 0.9500000476837158, "rewards/chosen": -4.456075668334961, "rewards/margins": 9.42856216430664, "rewards/rejected": -13.884637832641602, "step": 1650 }, { "epoch": 1.8444444444444446, "grad_norm": 13.434600830078125, "learning_rate": 9.157280346029918e-08, "logits/chosen": -1.4327411651611328, "logits/rejected": -1.4432756900787354, "logps/chosen": -235.7450714111328, "logps/rejected": -309.2312927246094, "loss": 0.1124, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -5.710514068603516, "rewards/margins": 8.197881698608398, "rewards/rejected": -13.90839672088623, "step": 1660 }, { "epoch": 1.8555555555555556, "grad_norm": 12.185674667358398, "learning_rate": 7.902509868048552e-08, "logits/chosen": -1.4359219074249268, "logits/rejected": -1.4436171054840088, "logps/chosen": -221.72747802734375, "logps/rejected": -315.0428466796875, "loss": 0.0918, "rewards/accuracies": 0.9500000476837158, "rewards/chosen": -4.850347995758057, "rewards/margins": 9.118718147277832, "rewards/rejected": -13.969064712524414, "step": 1670 }, { "epoch": 1.8666666666666667, "grad_norm": 69.89892578125, "learning_rate": 6.738782355044048e-08, "logits/chosen": -1.4331374168395996, "logits/rejected": -1.4413154125213623, "logps/chosen": -223.36399841308594, "logps/rejected": -310.390625, "loss": 0.1109, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -4.924819469451904, "rewards/margins": 8.662806510925293, "rewards/rejected": -13.587625503540039, "step": 1680 }, { "epoch": 1.8666666666666667, "eval_logits/chosen": -1.4341464042663574, "eval_logits/rejected": -1.4429900646209717, "eval_logps/chosen": -225.63760375976562, "eval_logps/rejected": -311.9776611328125, "eval_loss": 0.136677548289299, "eval_rewards/accuracies": 0.9479999542236328, "eval_rewards/chosen": -5.060867786407471, "eval_rewards/margins": 8.776179313659668, "eval_rewards/rejected": -13.83704662322998, "eval_runtime": 319.0963, "eval_samples_per_second": 3.134, "eval_steps_per_second": 0.313, "step": 1680 }, { "epoch": 1.8777777777777778, "grad_norm": 8.003738403320312, "learning_rate": 5.6665354373411085e-08, "logits/chosen": -1.4322279691696167, "logits/rejected": -1.4386062622070312, "logps/chosen": -219.65518188476562, "logps/rejected": -312.4093017578125, "loss": 0.0945, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -4.767041206359863, "rewards/margins": 8.800390243530273, "rewards/rejected": -13.567432403564453, "step": 1690 }, { "epoch": 1.8888888888888888, "grad_norm": 3.392204761505127, "learning_rate": 4.6861723431538273e-08, "logits/chosen": -1.4348361492156982, "logits/rejected": -1.4400594234466553, "logps/chosen": -214.8970947265625, "logps/rejected": -313.658447265625, "loss": 0.1059, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -4.507518768310547, "rewards/margins": 8.9714937210083, "rewards/rejected": -13.479012489318848, "step": 1700 }, { "epoch": 1.9, "grad_norm": 62.22904586791992, "learning_rate": 3.798061746947995e-08, "logits/chosen": -1.4324069023132324, "logits/rejected": -1.441604733467102, "logps/chosen": -221.77166748046875, "logps/rejected": -318.039794921875, "loss": 0.0776, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -4.684864044189453, "rewards/margins": 9.731212615966797, "rewards/rejected": -14.41607666015625, "step": 1710 }, { "epoch": 1.911111111111111, "grad_norm": 30.089298248291016, "learning_rate": 3.0025376307977474e-08, "logits/chosen": -1.4305298328399658, "logits/rejected": -1.4401195049285889, "logps/chosen": -223.77716064453125, "logps/rejected": -311.95184326171875, "loss": 0.0941, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -4.831788063049316, "rewards/margins": 9.036359786987305, "rewards/rejected": -13.868147850036621, "step": 1720 }, { "epoch": 1.9222222222222223, "grad_norm": 25.641223907470703, "learning_rate": 2.299899158788671e-08, "logits/chosen": -1.4364161491394043, "logits/rejected": -1.444923996925354, "logps/chosen": -225.68487548828125, "logps/rejected": -313.0198974609375, "loss": 0.0718, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -5.079021453857422, "rewards/margins": 8.841741561889648, "rewards/rejected": -13.92076301574707, "step": 1730 }, { "epoch": 1.9333333333333333, "grad_norm": 26.901592254638672, "learning_rate": 1.6904105645142443e-08, "logits/chosen": -1.4377158880233765, "logits/rejected": -1.4454920291900635, "logps/chosen": -224.06100463867188, "logps/rejected": -312.62640380859375, "loss": 0.1875, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -4.969162464141846, "rewards/margins": 8.860335350036621, "rewards/rejected": -13.829497337341309, "step": 1740 }, { "epoch": 1.9333333333333333, "eval_logits/chosen": -1.4356236457824707, "eval_logits/rejected": -1.444490671157837, "eval_logps/chosen": -225.332763671875, "eval_logps/rejected": -311.51763916015625, "eval_loss": 0.13875921070575714, "eval_rewards/accuracies": 0.949999988079071, "eval_rewards/chosen": -5.030385971069336, "eval_rewards/margins": 8.760656356811523, "eval_rewards/rejected": -13.79104232788086, "eval_runtime": 319.0041, "eval_samples_per_second": 3.135, "eval_steps_per_second": 0.313, "step": 1740 }, { "epoch": 1.9444444444444444, "grad_norm": 14.615069389343262, "learning_rate": 1.1743010517085428e-08, "logits/chosen": -1.4370002746582031, "logits/rejected": -1.4407825469970703, "logps/chosen": -220.3291015625, "logps/rejected": -311.1973876953125, "loss": 0.2078, "rewards/accuracies": 0.9500000476837158, "rewards/chosen": -4.969962120056152, "rewards/margins": 8.346500396728516, "rewards/rejected": -13.316461563110352, "step": 1750 }, { "epoch": 1.9555555555555557, "grad_norm": 4.708809852600098, "learning_rate": 7.517647080519941e-09, "logits/chosen": -1.4301917552947998, "logits/rejected": -1.4405816793441772, "logps/chosen": -222.86749267578125, "logps/rejected": -314.6806945800781, "loss": 0.0517, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -4.636650085449219, "rewards/margins": 9.621713638305664, "rewards/rejected": -14.258363723754883, "step": 1760 }, { "epoch": 1.9666666666666668, "grad_norm": 81.20974731445312, "learning_rate": 4.229604321829561e-09, "logits/chosen": -1.4317975044250488, "logits/rejected": -1.440091609954834, "logps/chosen": -226.40724182128906, "logps/rejected": -307.99505615234375, "loss": 0.1122, "rewards/accuracies": 0.9500000476837158, "rewards/chosen": -5.121713638305664, "rewards/margins": 8.330584526062012, "rewards/rejected": -13.452299118041992, "step": 1770 }, { "epoch": 1.9777777777777779, "grad_norm": 1.9576209783554077, "learning_rate": 1.8801187394248966e-09, "logits/chosen": -1.4311833381652832, "logits/rejected": -1.4387831687927246, "logps/chosen": -217.8499755859375, "logps/rejected": -313.7250061035156, "loss": 0.2065, "rewards/accuracies": 0.9399999976158142, "rewards/chosen": -4.587855815887451, "rewards/margins": 9.110170364379883, "rewards/rejected": -13.698026657104492, "step": 1780 }, { "epoch": 1.988888888888889, "grad_norm": 15.842097282409668, "learning_rate": 4.700738787466463e-10, "logits/chosen": -1.434556245803833, "logits/rejected": -1.4411146640777588, "logps/chosen": -226.14297485351562, "logps/rejected": -307.7271423339844, "loss": 0.2763, "rewards/accuracies": 0.9000000357627869, "rewards/chosen": -5.291313648223877, "rewards/margins": 7.9369354248046875, "rewards/rejected": -13.228248596191406, "step": 1790 }, { "epoch": 2.0, "grad_norm": 42.78350830078125, "learning_rate": 0.0, "logits/chosen": -1.434855580329895, "logits/rejected": -1.4434187412261963, "logps/chosen": -225.7928009033203, "logps/rejected": -312.3804016113281, "loss": 0.0947, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -5.068363666534424, "rewards/margins": 8.823846817016602, "rewards/rejected": -13.892210006713867, "step": 1800 }, { "epoch": 2.0, "eval_logits/chosen": -1.4384021759033203, "eval_logits/rejected": -1.4473795890808105, "eval_logps/chosen": -224.70401000976562, "eval_logps/rejected": -310.91900634765625, "eval_loss": 0.1330825537443161, "eval_rewards/accuracies": 0.9479999542236328, "eval_rewards/chosen": -4.9675092697143555, "eval_rewards/margins": 8.763671875, "eval_rewards/rejected": -13.731181144714355, "eval_runtime": 318.9855, "eval_samples_per_second": 3.135, "eval_steps_per_second": 0.313, "step": 1800 }, { "epoch": 2.0, "step": 1800, "total_flos": 2.867691724430377e+18, "train_loss": 0.25175013176269, "train_runtime": 22554.8302, "train_samples_per_second": 0.798, "train_steps_per_second": 0.08 } ], "logging_steps": 10, "max_steps": 1800, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.867691724430377e+18, "train_batch_size": 5, "trial_name": null, "trial_params": null }