{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1911, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 2.6041666666666664e-09, "logits/chosen": -2.463043451309204, "logits/rejected": -2.288743019104004, "logps/chosen": -301.1433410644531, "logps/rejected": -128.25608825683594, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.01, "learning_rate": 2.6041666666666667e-08, "logits/chosen": -2.5560247898101807, "logits/rejected": -2.555253028869629, "logps/chosen": -286.1558837890625, "logps/rejected": -256.28131103515625, "loss": 0.697, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.0025797931011766195, "rewards/margins": -0.008782301098108292, "rewards/rejected": 0.006202507298439741, "step": 10 }, { "epoch": 0.01, "learning_rate": 5.208333333333333e-08, "logits/chosen": -2.648404598236084, "logits/rejected": -2.6275510787963867, "logps/chosen": -316.64373779296875, "logps/rejected": -314.99212646484375, "loss": 0.6895, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.0072450353763997555, "rewards/margins": 0.0013799279695376754, "rewards/rejected": -0.008624963462352753, "step": 20 }, { "epoch": 0.02, "learning_rate": 7.812499999999999e-08, "logits/chosen": -2.683773994445801, "logits/rejected": -2.550048828125, "logps/chosen": -306.83282470703125, "logps/rejected": -266.6906433105469, "loss": 0.6823, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.011949767358601093, "rewards/margins": 0.020141970366239548, "rewards/rejected": -0.008192205801606178, "step": 30 }, { "epoch": 0.02, "learning_rate": 1.0416666666666667e-07, "logits/chosen": -2.634948968887329, "logits/rejected": -2.5988709926605225, "logps/chosen": -265.98968505859375, "logps/rejected": -252.77835083007812, "loss": 0.6707, "rewards/accuracies": 0.625, "rewards/chosen": 0.08117702603340149, "rewards/margins": 0.05083204060792923, "rewards/rejected": 0.03034498728811741, "step": 40 }, { "epoch": 0.03, "learning_rate": 1.3020833333333334e-07, "logits/chosen": -2.644986152648926, "logits/rejected": -2.589542865753174, "logps/chosen": -323.46881103515625, "logps/rejected": -284.9610900878906, "loss": 0.6542, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.2217252552509308, "rewards/margins": 0.12405480444431305, "rewards/rejected": 0.09767045080661774, "step": 50 }, { "epoch": 0.03, "learning_rate": 1.5624999999999999e-07, "logits/chosen": -2.599961280822754, "logits/rejected": -2.491401195526123, "logps/chosen": -305.62152099609375, "logps/rejected": -236.73916625976562, "loss": 0.6107, "rewards/accuracies": 0.6875, "rewards/chosen": 0.29835107922554016, "rewards/margins": 0.23799380660057068, "rewards/rejected": 0.06035725399851799, "step": 60 }, { "epoch": 0.04, "learning_rate": 1.8229166666666666e-07, "logits/chosen": -2.5403127670288086, "logits/rejected": -2.491334915161133, "logps/chosen": -281.8306884765625, "logps/rejected": -264.59130859375, "loss": 0.6257, "rewards/accuracies": 0.6875, "rewards/chosen": 0.37535926699638367, "rewards/margins": 0.173495814204216, "rewards/rejected": 0.20186343789100647, "step": 70 }, { "epoch": 0.04, "learning_rate": 2.0833333333333333e-07, "logits/chosen": -2.6033613681793213, "logits/rejected": -2.548762083053589, "logps/chosen": -270.06597900390625, "logps/rejected": -262.29693603515625, "loss": 0.6014, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.4671238958835602, "rewards/margins": 0.144499734044075, "rewards/rejected": 0.3226241171360016, "step": 80 }, { "epoch": 0.05, "learning_rate": 2.3437499999999998e-07, "logits/chosen": -2.5922348499298096, "logits/rejected": -2.584137439727783, "logps/chosen": -278.80865478515625, "logps/rejected": -279.4120178222656, "loss": 0.5943, "rewards/accuracies": 0.625, "rewards/chosen": 0.295165479183197, "rewards/margins": 0.24514034390449524, "rewards/rejected": 0.05002513527870178, "step": 90 }, { "epoch": 0.05, "learning_rate": 2.604166666666667e-07, "logits/chosen": -2.5200092792510986, "logits/rejected": -2.4520034790039062, "logps/chosen": -240.3234405517578, "logps/rejected": -248.67428588867188, "loss": 0.5838, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.4579353332519531, "rewards/margins": 0.31523841619491577, "rewards/rejected": 0.14269694685935974, "step": 100 }, { "epoch": 0.06, "learning_rate": 2.864583333333333e-07, "logits/chosen": -2.5587222576141357, "logits/rejected": -2.5158984661102295, "logps/chosen": -334.97784423828125, "logps/rejected": -290.39935302734375, "loss": 0.5736, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.39039498567581177, "rewards/margins": 0.49432238936424255, "rewards/rejected": -0.10392741858959198, "step": 110 }, { "epoch": 0.06, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -2.5714714527130127, "logits/rejected": -2.503490924835205, "logps/chosen": -287.0711364746094, "logps/rejected": -279.7371520996094, "loss": 0.5872, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.7893859148025513, "rewards/margins": 0.6542637944221497, "rewards/rejected": 0.13512210547924042, "step": 120 }, { "epoch": 0.07, "learning_rate": 3.3854166666666667e-07, "logits/chosen": -2.6581666469573975, "logits/rejected": -2.558701753616333, "logps/chosen": -318.14178466796875, "logps/rejected": -277.52838134765625, "loss": 0.5618, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.3024521768093109, "rewards/margins": 0.6436794400215149, "rewards/rejected": -0.3412272334098816, "step": 130 }, { "epoch": 0.07, "learning_rate": 3.645833333333333e-07, "logits/chosen": -2.6539013385772705, "logits/rejected": -2.6091341972351074, "logps/chosen": -307.194091796875, "logps/rejected": -285.154541015625, "loss": 0.5479, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.1616998016834259, "rewards/margins": 0.4194749891757965, "rewards/rejected": -0.5811747908592224, "step": 140 }, { "epoch": 0.08, "learning_rate": 3.9062499999999997e-07, "logits/chosen": -2.4873158931732178, "logits/rejected": -2.504676342010498, "logps/chosen": -300.03448486328125, "logps/rejected": -318.03668212890625, "loss": 0.6838, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.03602635860443115, "rewards/margins": -0.06406474858522415, "rewards/rejected": 0.028038373216986656, "step": 150 }, { "epoch": 0.08, "learning_rate": 4.1666666666666667e-07, "logits/chosen": -2.6210286617279053, "logits/rejected": -2.6042556762695312, "logps/chosen": -286.2755432128906, "logps/rejected": -299.8374328613281, "loss": 0.5376, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1890409290790558, "rewards/margins": 0.8602690696716309, "rewards/rejected": -0.6712281703948975, "step": 160 }, { "epoch": 0.09, "learning_rate": 4.427083333333333e-07, "logits/chosen": -2.663649320602417, "logits/rejected": -2.611992835998535, "logps/chosen": -312.26580810546875, "logps/rejected": -271.50897216796875, "loss": 0.5935, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.08057786524295807, "rewards/margins": 0.7988370656967163, "rewards/rejected": -0.8794149160385132, "step": 170 }, { "epoch": 0.09, "learning_rate": 4.6874999999999996e-07, "logits/chosen": -2.6343648433685303, "logits/rejected": -2.5704476833343506, "logps/chosen": -338.59124755859375, "logps/rejected": -286.7225341796875, "loss": 0.5162, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.006776231341063976, "rewards/margins": 0.9450720548629761, "rewards/rejected": -0.9518482089042664, "step": 180 }, { "epoch": 0.1, "learning_rate": 4.947916666666667e-07, "logits/chosen": -2.6279683113098145, "logits/rejected": -2.5350871086120605, "logps/chosen": -288.70184326171875, "logps/rejected": -262.1229248046875, "loss": 0.6639, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.2994307279586792, "rewards/margins": 0.9815131425857544, "rewards/rejected": -0.6820824146270752, "step": 190 }, { "epoch": 0.1, "learning_rate": 4.976730657358929e-07, "logits/chosen": -2.5379586219787598, "logits/rejected": -2.49354887008667, "logps/chosen": -253.19580078125, "logps/rejected": -223.9005584716797, "loss": 0.8012, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.20366668701171875, "rewards/margins": -0.2523919641971588, "rewards/rejected": 0.04872531443834305, "step": 200 }, { "epoch": 0.11, "learning_rate": 4.947643979057592e-07, "logits/chosen": -2.543163299560547, "logits/rejected": -2.5506227016448975, "logps/chosen": -285.0458984375, "logps/rejected": -312.3334045410156, "loss": 0.5666, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.23225872218608856, "rewards/margins": 0.626153826713562, "rewards/rejected": -0.39389508962631226, "step": 210 }, { "epoch": 0.12, "learning_rate": 4.918557300756254e-07, "logits/chosen": -2.4932289123535156, "logits/rejected": -2.464277744293213, "logps/chosen": -292.6415100097656, "logps/rejected": -273.45648193359375, "loss": 0.5932, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.0871698409318924, "rewards/margins": 0.7229372262954712, "rewards/rejected": -0.6357674598693848, "step": 220 }, { "epoch": 0.12, "learning_rate": 4.889470622454916e-07, "logits/chosen": -2.579195261001587, "logits/rejected": -2.464480400085449, "logps/chosen": -310.99090576171875, "logps/rejected": -287.1470031738281, "loss": 0.5378, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.05043240636587143, "rewards/margins": 0.7342410087585449, "rewards/rejected": -0.6838085651397705, "step": 230 }, { "epoch": 0.13, "learning_rate": 4.860383944153578e-07, "logits/chosen": -2.4707000255584717, "logits/rejected": -2.3995609283447266, "logps/chosen": -292.8463439941406, "logps/rejected": -262.3541259765625, "loss": 0.5575, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.05063791200518608, "rewards/margins": 0.8500161170959473, "rewards/rejected": -0.9006540179252625, "step": 240 }, { "epoch": 0.13, "learning_rate": 4.83129726585224e-07, "logits/chosen": -2.4374887943267822, "logits/rejected": -2.4220387935638428, "logps/chosen": -266.37774658203125, "logps/rejected": -283.33978271484375, "loss": 0.6727, "rewards/accuracies": 0.6875, "rewards/chosen": -0.49987784028053284, "rewards/margins": 0.5630615949630737, "rewards/rejected": -1.0629395246505737, "step": 250 }, { "epoch": 0.14, "learning_rate": 4.802210587550902e-07, "logits/chosen": -2.405151844024658, "logits/rejected": -2.373051643371582, "logps/chosen": -323.4302062988281, "logps/rejected": -306.20367431640625, "loss": 0.7053, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.31794676184654236, "rewards/margins": 0.897225558757782, "rewards/rejected": -1.215172290802002, "step": 260 }, { "epoch": 0.14, "learning_rate": 4.773123909249563e-07, "logits/chosen": -2.432044506072998, "logits/rejected": -2.363980770111084, "logps/chosen": -278.9121398925781, "logps/rejected": -302.1050109863281, "loss": 0.6258, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.047296181321144104, "rewards/margins": 0.663731038570404, "rewards/rejected": -0.711027204990387, "step": 270 }, { "epoch": 0.15, "learning_rate": 4.7440372309482255e-07, "logits/chosen": -2.437777042388916, "logits/rejected": -2.3604378700256348, "logps/chosen": -301.4665222167969, "logps/rejected": -265.2036437988281, "loss": 0.7178, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.1992354393005371, "rewards/margins": 0.936697781085968, "rewards/rejected": -1.13593327999115, "step": 280 }, { "epoch": 0.15, "learning_rate": 4.7149505526468876e-07, "logits/chosen": -2.397371530532837, "logits/rejected": -2.3492469787597656, "logps/chosen": -296.2301940917969, "logps/rejected": -297.2955322265625, "loss": 0.5162, "rewards/accuracies": 0.625, "rewards/chosen": 0.45999306440353394, "rewards/margins": 0.6742507815361023, "rewards/rejected": -0.21425779163837433, "step": 290 }, { "epoch": 0.16, "learning_rate": 4.6858638743455497e-07, "logits/chosen": -2.4259443283081055, "logits/rejected": -2.3763365745544434, "logps/chosen": -349.0486755371094, "logps/rejected": -336.35894775390625, "loss": 0.5774, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4317784905433655, "rewards/margins": 0.9294607043266296, "rewards/rejected": -0.49768227338790894, "step": 300 }, { "epoch": 0.16, "learning_rate": 4.656777196044212e-07, "logits/chosen": -2.4149937629699707, "logits/rejected": -2.299372911453247, "logps/chosen": -296.93731689453125, "logps/rejected": -269.88836669921875, "loss": 0.614, "rewards/accuracies": 0.8125, "rewards/chosen": 0.13364040851593018, "rewards/margins": 1.1540381908416748, "rewards/rejected": -1.020397663116455, "step": 310 }, { "epoch": 0.17, "learning_rate": 4.627690517742874e-07, "logits/chosen": -2.3246355056762695, "logits/rejected": -2.31697940826416, "logps/chosen": -246.38571166992188, "logps/rejected": -243.2635955810547, "loss": 0.5455, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3853219151496887, "rewards/margins": 1.1966570615768433, "rewards/rejected": -0.8113352060317993, "step": 320 }, { "epoch": 0.17, "learning_rate": 4.5986038394415354e-07, "logits/chosen": -2.377342939376831, "logits/rejected": -2.266279697418213, "logps/chosen": -291.89923095703125, "logps/rejected": -252.13156127929688, "loss": 0.7187, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.3700163662433624, "rewards/margins": 1.021303415298462, "rewards/rejected": -1.3913196325302124, "step": 330 }, { "epoch": 0.18, "learning_rate": 4.569517161140198e-07, "logits/chosen": -2.4199347496032715, "logits/rejected": -2.4432175159454346, "logps/chosen": -294.2767333984375, "logps/rejected": -317.6234130859375, "loss": 0.5903, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.04284637048840523, "rewards/margins": 0.6986137628555298, "rewards/rejected": -0.6557673811912537, "step": 340 }, { "epoch": 0.18, "learning_rate": 4.5404304828388595e-07, "logits/chosen": -2.488062858581543, "logits/rejected": -2.420581340789795, "logps/chosen": -316.81109619140625, "logps/rejected": -355.86297607421875, "loss": 0.5148, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.16860897839069366, "rewards/margins": 1.251904010772705, "rewards/rejected": -1.4205129146575928, "step": 350 }, { "epoch": 0.19, "learning_rate": 4.5113438045375216e-07, "logits/chosen": -2.372753858566284, "logits/rejected": -2.3147406578063965, "logps/chosen": -287.305419921875, "logps/rejected": -256.7542724609375, "loss": 0.526, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.243671253323555, "rewards/margins": 0.9904806017875671, "rewards/rejected": -0.7468093633651733, "step": 360 }, { "epoch": 0.19, "learning_rate": 4.4822571262361837e-07, "logits/chosen": -2.446676731109619, "logits/rejected": -2.414977550506592, "logps/chosen": -242.71127319335938, "logps/rejected": -246.08462524414062, "loss": 0.6242, "rewards/accuracies": 0.5625, "rewards/chosen": -0.48434001207351685, "rewards/margins": 0.3427623510360718, "rewards/rejected": -0.8271023631095886, "step": 370 }, { "epoch": 0.2, "learning_rate": 4.453170447934846e-07, "logits/chosen": -2.3129241466522217, "logits/rejected": -2.2660763263702393, "logps/chosen": -325.49517822265625, "logps/rejected": -257.01812744140625, "loss": 0.6439, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.7354428768157959, "rewards/margins": 0.5178254842758179, "rewards/rejected": -1.2532682418823242, "step": 380 }, { "epoch": 0.2, "learning_rate": 4.424083769633508e-07, "logits/chosen": -2.407226085662842, "logits/rejected": -2.3878862857818604, "logps/chosen": -321.883056640625, "logps/rejected": -329.2529296875, "loss": 0.6696, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.6276373863220215, "rewards/margins": 0.8156329989433289, "rewards/rejected": -1.4432705640792847, "step": 390 }, { "epoch": 0.21, "learning_rate": 4.39499709133217e-07, "logits/chosen": -2.5650746822357178, "logits/rejected": -2.5109097957611084, "logps/chosen": -315.965576171875, "logps/rejected": -329.1568298339844, "loss": 0.5373, "rewards/accuracies": 0.6875, "rewards/chosen": -0.22043700516223907, "rewards/margins": 0.7228590250015259, "rewards/rejected": -0.9432960748672485, "step": 400 }, { "epoch": 0.21, "learning_rate": 4.3659104130308314e-07, "logits/chosen": -2.3766233921051025, "logits/rejected": -2.342545747756958, "logps/chosen": -270.418701171875, "logps/rejected": -277.3287048339844, "loss": 0.4702, "rewards/accuracies": 0.75, "rewards/chosen": -0.33475637435913086, "rewards/margins": 1.0519344806671143, "rewards/rejected": -1.3866908550262451, "step": 410 }, { "epoch": 0.22, "learning_rate": 4.336823734729494e-07, "logits/chosen": -2.5363078117370605, "logits/rejected": -2.420921802520752, "logps/chosen": -339.0617370605469, "logps/rejected": -314.2167663574219, "loss": 0.5078, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.20719392597675323, "rewards/margins": 1.039120078086853, "rewards/rejected": -1.2463139295578003, "step": 420 }, { "epoch": 0.23, "learning_rate": 4.3077370564281556e-07, "logits/chosen": -2.4507744312286377, "logits/rejected": -2.3617775440216064, "logps/chosen": -305.3289794921875, "logps/rejected": -270.58843994140625, "loss": 0.5647, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5011049509048462, "rewards/margins": 0.8858562707901001, "rewards/rejected": -1.3869613409042358, "step": 430 }, { "epoch": 0.23, "learning_rate": 4.278650378126818e-07, "logits/chosen": -2.327728033065796, "logits/rejected": -2.2920546531677246, "logps/chosen": -289.3099365234375, "logps/rejected": -333.728271484375, "loss": 0.5112, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.1836535632610321, "rewards/margins": 1.081466794013977, "rewards/rejected": -0.8978131413459778, "step": 440 }, { "epoch": 0.24, "learning_rate": 4.24956369982548e-07, "logits/chosen": -2.4250426292419434, "logits/rejected": -2.3737356662750244, "logps/chosen": -308.11236572265625, "logps/rejected": -295.8037109375, "loss": 0.5505, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.22618088126182556, "rewards/margins": 0.990483283996582, "rewards/rejected": -0.7643024325370789, "step": 450 }, { "epoch": 0.24, "learning_rate": 4.220477021524142e-07, "logits/chosen": -2.4373867511749268, "logits/rejected": -2.400118112564087, "logps/chosen": -290.0523681640625, "logps/rejected": -255.9022674560547, "loss": 0.5815, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.12436334788799286, "rewards/margins": 0.7458918690681458, "rewards/rejected": -0.6215284466743469, "step": 460 }, { "epoch": 0.25, "learning_rate": 4.191390343222804e-07, "logits/chosen": -2.506086826324463, "logits/rejected": -2.4103667736053467, "logps/chosen": -305.6959533691406, "logps/rejected": -305.922607421875, "loss": 0.505, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.27497127652168274, "rewards/margins": 1.060005784034729, "rewards/rejected": -0.7850344181060791, "step": 470 }, { "epoch": 0.25, "learning_rate": 4.162303664921466e-07, "logits/chosen": -2.460691213607788, "logits/rejected": -2.3934152126312256, "logps/chosen": -310.1314697265625, "logps/rejected": -297.3235168457031, "loss": 0.5853, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.14773890376091003, "rewards/margins": 0.7825302481651306, "rewards/rejected": -0.6347913146018982, "step": 480 }, { "epoch": 0.26, "learning_rate": 4.1332169866201275e-07, "logits/chosen": -2.4345953464508057, "logits/rejected": -2.4074747562408447, "logps/chosen": -290.62579345703125, "logps/rejected": -277.2015380859375, "loss": 0.5713, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.5942241549491882, "rewards/margins": 0.8701769113540649, "rewards/rejected": -0.2759527266025543, "step": 490 }, { "epoch": 0.26, "learning_rate": 4.10413030831879e-07, "logits/chosen": -2.491701602935791, "logits/rejected": -2.3788347244262695, "logps/chosen": -327.69097900390625, "logps/rejected": -256.1454162597656, "loss": 0.5509, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.3082984387874603, "rewards/margins": 0.7658659219741821, "rewards/rejected": -0.4575675129890442, "step": 500 }, { "epoch": 0.26, "eval_logits/chosen": -2.4869401454925537, "eval_logits/rejected": -2.4248592853546143, "eval_logps/chosen": -292.5533447265625, "eval_logps/rejected": -277.5867614746094, "eval_loss": 0.6311370134353638, "eval_rewards/accuracies": 0.761904776096344, "eval_rewards/chosen": 0.8113744854927063, "eval_rewards/margins": 0.9294369220733643, "eval_rewards/rejected": -0.11806251108646393, "eval_runtime": 615.8679, "eval_samples_per_second": 3.247, "eval_steps_per_second": 0.102, "step": 500 }, { "epoch": 0.27, "learning_rate": 4.0750436300174517e-07, "logits/chosen": -2.397420883178711, "logits/rejected": -2.372736692428589, "logps/chosen": -287.88897705078125, "logps/rejected": -266.78594970703125, "loss": 0.5289, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.6826907396316528, "rewards/margins": 1.225263237953186, "rewards/rejected": -0.542572557926178, "step": 510 }, { "epoch": 0.27, "learning_rate": 4.0459569517161143e-07, "logits/chosen": -2.448612928390503, "logits/rejected": -2.335453748703003, "logps/chosen": -289.3345947265625, "logps/rejected": -280.9012145996094, "loss": 0.9872, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.408169686794281, "rewards/margins": 1.0372252464294434, "rewards/rejected": -0.6290556192398071, "step": 520 }, { "epoch": 0.28, "learning_rate": 4.016870273414776e-07, "logits/chosen": -2.4662892818450928, "logits/rejected": -2.425401449203491, "logps/chosen": -260.6138000488281, "logps/rejected": -284.0682373046875, "loss": 0.5718, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.2825961709022522, "rewards/margins": 1.0750539302825928, "rewards/rejected": -0.7924576997756958, "step": 530 }, { "epoch": 0.28, "learning_rate": 3.987783595113438e-07, "logits/chosen": -2.498586416244507, "logits/rejected": -2.421823501586914, "logps/chosen": -339.8152160644531, "logps/rejected": -266.1904602050781, "loss": 0.5823, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.5165579915046692, "rewards/margins": 1.7178938388824463, "rewards/rejected": -1.2013359069824219, "step": 540 }, { "epoch": 0.29, "learning_rate": 3.9586969168121e-07, "logits/chosen": -2.491562604904175, "logits/rejected": -2.4570322036743164, "logps/chosen": -348.4375, "logps/rejected": -306.64697265625, "loss": 0.5497, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.10518566519021988, "rewards/margins": 0.8593299984931946, "rewards/rejected": -0.7541443705558777, "step": 550 }, { "epoch": 0.29, "learning_rate": 3.929610238510762e-07, "logits/chosen": -2.3990979194641113, "logits/rejected": -2.39212965965271, "logps/chosen": -289.9490661621094, "logps/rejected": -269.9928894042969, "loss": 0.6319, "rewards/accuracies": 0.625, "rewards/chosen": 0.04818814992904663, "rewards/margins": 0.6179074048995972, "rewards/rejected": -0.569719135761261, "step": 560 }, { "epoch": 0.3, "learning_rate": 3.900523560209424e-07, "logits/chosen": -2.4815762042999268, "logits/rejected": -2.425726890563965, "logps/chosen": -312.6693420410156, "logps/rejected": -273.46728515625, "loss": 0.5107, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.3141094148159027, "rewards/margins": 1.199885606765747, "rewards/rejected": -0.8857762217521667, "step": 570 }, { "epoch": 0.3, "learning_rate": 3.871436881908086e-07, "logits/chosen": -2.5039753913879395, "logits/rejected": -2.414092779159546, "logps/chosen": -284.1502990722656, "logps/rejected": -240.26083374023438, "loss": 0.523, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.05462036281824112, "rewards/margins": 0.9068329930305481, "rewards/rejected": -0.8522126078605652, "step": 580 }, { "epoch": 0.31, "learning_rate": 3.842350203606748e-07, "logits/chosen": -2.487733840942383, "logits/rejected": -2.4456124305725098, "logps/chosen": -264.53790283203125, "logps/rejected": -287.30718994140625, "loss": 0.5847, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.48793086409568787, "rewards/margins": 1.0478885173797607, "rewards/rejected": -1.535819411277771, "step": 590 }, { "epoch": 0.31, "learning_rate": 3.8132635253054103e-07, "logits/chosen": -2.4525985717773438, "logits/rejected": -2.419036865234375, "logps/chosen": -292.17840576171875, "logps/rejected": -305.83258056640625, "loss": 0.5466, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.03021824359893799, "rewards/margins": 1.0404767990112305, "rewards/rejected": -1.070695161819458, "step": 600 }, { "epoch": 0.32, "learning_rate": 3.784176847004072e-07, "logits/chosen": -2.494528293609619, "logits/rejected": -2.455793857574463, "logps/chosen": -276.6438903808594, "logps/rejected": -284.5893249511719, "loss": 0.4859, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.11220204830169678, "rewards/margins": 1.3604462146759033, "rewards/rejected": -1.2482441663742065, "step": 610 }, { "epoch": 0.32, "learning_rate": 3.755090168702734e-07, "logits/chosen": -2.4499683380126953, "logits/rejected": -2.4269161224365234, "logps/chosen": -319.4291076660156, "logps/rejected": -330.466552734375, "loss": 0.5209, "rewards/accuracies": 0.75, "rewards/chosen": -0.1047038584947586, "rewards/margins": 1.4205875396728516, "rewards/rejected": -1.5252914428710938, "step": 620 }, { "epoch": 0.33, "learning_rate": 3.726003490401396e-07, "logits/chosen": -2.4178521633148193, "logits/rejected": -2.3652777671813965, "logps/chosen": -301.78399658203125, "logps/rejected": -255.4466094970703, "loss": 0.6127, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.73511803150177, "rewards/margins": 0.9625850915908813, "rewards/rejected": -1.6977031230926514, "step": 630 }, { "epoch": 0.33, "learning_rate": 3.696916812100058e-07, "logits/chosen": -2.414977788925171, "logits/rejected": -2.2954540252685547, "logps/chosen": -309.4778747558594, "logps/rejected": -232.9076385498047, "loss": 0.563, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.36078378558158875, "rewards/margins": 1.096639633178711, "rewards/rejected": -1.457423448562622, "step": 640 }, { "epoch": 0.34, "learning_rate": 3.66783013379872e-07, "logits/chosen": -2.4305996894836426, "logits/rejected": -2.3547043800354004, "logps/chosen": -303.08221435546875, "logps/rejected": -249.44003295898438, "loss": 0.5194, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.32232993841171265, "rewards/margins": 1.2769782543182373, "rewards/rejected": -1.5993082523345947, "step": 650 }, { "epoch": 0.35, "learning_rate": 3.6387434554973823e-07, "logits/chosen": -2.4183077812194824, "logits/rejected": -2.3758342266082764, "logps/chosen": -326.63739013671875, "logps/rejected": -319.60357666015625, "loss": 0.5133, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7344697117805481, "rewards/margins": 1.0054800510406494, "rewards/rejected": -1.7399498224258423, "step": 660 }, { "epoch": 0.35, "learning_rate": 3.609656777196044e-07, "logits/chosen": -2.4492297172546387, "logits/rejected": -2.3460309505462646, "logps/chosen": -334.344970703125, "logps/rejected": -272.79498291015625, "loss": 0.5318, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.6380278468132019, "rewards/margins": 1.2398582696914673, "rewards/rejected": -1.877886414527893, "step": 670 }, { "epoch": 0.36, "learning_rate": 3.5805700988947064e-07, "logits/chosen": -2.37317156791687, "logits/rejected": -2.287410259246826, "logps/chosen": -259.41107177734375, "logps/rejected": -246.9707794189453, "loss": 0.6405, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.004200550727546215, "rewards/margins": 1.685315728187561, "rewards/rejected": -1.689516305923462, "step": 680 }, { "epoch": 0.36, "learning_rate": 3.551483420593368e-07, "logits/chosen": -2.424691677093506, "logits/rejected": -2.3706610202789307, "logps/chosen": -337.8548889160156, "logps/rejected": -300.6462707519531, "loss": 0.4984, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.4729352593421936, "rewards/margins": 1.0950307846069336, "rewards/rejected": -1.567966103553772, "step": 690 }, { "epoch": 0.37, "learning_rate": 3.5223967422920306e-07, "logits/chosen": -2.4483962059020996, "logits/rejected": -2.3456740379333496, "logps/chosen": -355.38446044921875, "logps/rejected": -313.3027038574219, "loss": 0.7153, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.5108211636543274, "rewards/margins": 1.1368777751922607, "rewards/rejected": -1.647698998451233, "step": 700 }, { "epoch": 0.37, "learning_rate": 3.493310063990692e-07, "logits/chosen": -2.4961283206939697, "logits/rejected": -2.4019272327423096, "logps/chosen": -258.533447265625, "logps/rejected": -235.88912963867188, "loss": 0.6123, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.2880368232727051, "rewards/margins": 0.940460205078125, "rewards/rejected": -1.22849702835083, "step": 710 }, { "epoch": 0.38, "learning_rate": 3.464223385689354e-07, "logits/chosen": -2.4469172954559326, "logits/rejected": -2.442150354385376, "logps/chosen": -263.5059814453125, "logps/rejected": -278.46112060546875, "loss": 0.5192, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.03521687909960747, "rewards/margins": 1.297978162765503, "rewards/rejected": -1.2627613544464111, "step": 720 }, { "epoch": 0.38, "learning_rate": 3.4351367073880163e-07, "logits/chosen": -2.567368745803833, "logits/rejected": -2.5503649711608887, "logps/chosen": -263.9495544433594, "logps/rejected": -274.8172302246094, "loss": 0.4936, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.18824487924575806, "rewards/margins": 1.1973415613174438, "rewards/rejected": -1.3855865001678467, "step": 730 }, { "epoch": 0.39, "learning_rate": 3.4060500290866783e-07, "logits/chosen": -2.5890707969665527, "logits/rejected": -2.497729539871216, "logps/chosen": -357.546875, "logps/rejected": -316.70361328125, "loss": 0.4672, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6199100017547607, "rewards/margins": 1.5381364822387695, "rewards/rejected": -2.158046245574951, "step": 740 }, { "epoch": 0.39, "learning_rate": 3.37696335078534e-07, "logits/chosen": -2.6034646034240723, "logits/rejected": -2.5190954208374023, "logps/chosen": -307.06573486328125, "logps/rejected": -264.7110900878906, "loss": 0.5857, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.9259228706359863, "rewards/margins": 1.1956923007965088, "rewards/rejected": -2.121615409851074, "step": 750 }, { "epoch": 0.4, "learning_rate": 3.3478766724840025e-07, "logits/chosen": -2.5817999839782715, "logits/rejected": -2.532043695449829, "logps/chosen": -291.2391662597656, "logps/rejected": -286.722900390625, "loss": 0.4794, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.822766900062561, "rewards/margins": 1.2887961864471436, "rewards/rejected": -2.111563205718994, "step": 760 }, { "epoch": 0.4, "learning_rate": 3.318789994182664e-07, "logits/chosen": -2.596311092376709, "logits/rejected": -2.5423972606658936, "logps/chosen": -271.95330810546875, "logps/rejected": -298.80975341796875, "loss": 0.5103, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.4673972725868225, "rewards/margins": 1.1896220445632935, "rewards/rejected": -1.6570192575454712, "step": 770 }, { "epoch": 0.41, "learning_rate": 3.2897033158813266e-07, "logits/chosen": -2.5218849182128906, "logits/rejected": -2.5217957496643066, "logps/chosen": -309.8044738769531, "logps/rejected": -282.1851806640625, "loss": 0.584, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8506789207458496, "rewards/margins": 0.7243373990058899, "rewards/rejected": -1.5750162601470947, "step": 780 }, { "epoch": 0.41, "learning_rate": 3.260616637579988e-07, "logits/chosen": -2.621936082839966, "logits/rejected": -2.5529212951660156, "logps/chosen": -321.58624267578125, "logps/rejected": -306.6531677246094, "loss": 0.5215, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.5436080694198608, "rewards/margins": 1.283302903175354, "rewards/rejected": -1.8269107341766357, "step": 790 }, { "epoch": 0.42, "learning_rate": 3.2315299592786503e-07, "logits/chosen": -2.537576913833618, "logits/rejected": -2.497835636138916, "logps/chosen": -290.63714599609375, "logps/rejected": -284.85650634765625, "loss": 0.5535, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.6990116834640503, "rewards/margins": 1.495788812637329, "rewards/rejected": -2.194800615310669, "step": 800 }, { "epoch": 0.42, "learning_rate": 3.2024432809773123e-07, "logits/chosen": -2.486382246017456, "logits/rejected": -2.403653860092163, "logps/chosen": -286.6750183105469, "logps/rejected": -267.35723876953125, "loss": 0.5029, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.0650520324707031, "rewards/margins": 1.3269193172454834, "rewards/rejected": -2.3919713497161865, "step": 810 }, { "epoch": 0.43, "learning_rate": 3.1733566026759744e-07, "logits/chosen": -2.5125479698181152, "logits/rejected": -2.461850643157959, "logps/chosen": -289.32379150390625, "logps/rejected": -280.98785400390625, "loss": 0.4962, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.6875102519989014, "rewards/margins": 1.0723689794540405, "rewards/rejected": -1.7598793506622314, "step": 820 }, { "epoch": 0.43, "learning_rate": 3.144269924374636e-07, "logits/chosen": -2.495903491973877, "logits/rejected": -2.4473767280578613, "logps/chosen": -277.972900390625, "logps/rejected": -305.8498840332031, "loss": 0.5665, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.6580140590667725, "rewards/margins": 0.9823113679885864, "rewards/rejected": -1.6403253078460693, "step": 830 }, { "epoch": 0.44, "learning_rate": 3.1151832460732986e-07, "logits/chosen": -2.4640355110168457, "logits/rejected": -2.3490617275238037, "logps/chosen": -321.24688720703125, "logps/rejected": -300.0312805175781, "loss": 0.7062, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.7375732660293579, "rewards/margins": 1.0734766721725464, "rewards/rejected": -1.8110501766204834, "step": 840 }, { "epoch": 0.44, "learning_rate": 3.08609656777196e-07, "logits/chosen": -2.4589717388153076, "logits/rejected": -2.4391770362854004, "logps/chosen": -276.55450439453125, "logps/rejected": -291.0248107910156, "loss": 0.5562, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.9284421801567078, "rewards/margins": 1.0043559074401855, "rewards/rejected": -1.9327980279922485, "step": 850 }, { "epoch": 0.45, "learning_rate": 3.0570098894706227e-07, "logits/chosen": -2.4961347579956055, "logits/rejected": -2.4345571994781494, "logps/chosen": -305.3291015625, "logps/rejected": -354.2350769042969, "loss": 0.5703, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.7021960616111755, "rewards/margins": 1.1666209697723389, "rewards/rejected": -1.8688170909881592, "step": 860 }, { "epoch": 0.46, "learning_rate": 3.0279232111692843e-07, "logits/chosen": -2.366995096206665, "logits/rejected": -2.3412668704986572, "logps/chosen": -269.6289978027344, "logps/rejected": -294.22308349609375, "loss": 0.5518, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.0025119720958173275, "rewards/margins": 1.3077478408813477, "rewards/rejected": -1.3102598190307617, "step": 870 }, { "epoch": 0.46, "learning_rate": 2.9988365328679463e-07, "logits/chosen": -2.5405781269073486, "logits/rejected": -2.5027382373809814, "logps/chosen": -310.4794006347656, "logps/rejected": -311.45599365234375, "loss": 0.4918, "rewards/accuracies": 0.8125, "rewards/chosen": -0.14495661854743958, "rewards/margins": 1.4426281452178955, "rewards/rejected": -1.5875847339630127, "step": 880 }, { "epoch": 0.47, "learning_rate": 2.9697498545666084e-07, "logits/chosen": -2.406571388244629, "logits/rejected": -2.3264360427856445, "logps/chosen": -347.81866455078125, "logps/rejected": -290.57080078125, "loss": 0.5883, "rewards/accuracies": 0.75, "rewards/chosen": -0.660466730594635, "rewards/margins": 1.2641503810882568, "rewards/rejected": -1.924617052078247, "step": 890 }, { "epoch": 0.47, "learning_rate": 2.9406631762652705e-07, "logits/chosen": -2.4709229469299316, "logits/rejected": -2.382078170776367, "logps/chosen": -340.4996032714844, "logps/rejected": -315.2070617675781, "loss": 0.515, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6704212427139282, "rewards/margins": 1.4002532958984375, "rewards/rejected": -2.0706748962402344, "step": 900 }, { "epoch": 0.48, "learning_rate": 2.9115764979639326e-07, "logits/chosen": -2.506743907928467, "logits/rejected": -2.4725024700164795, "logps/chosen": -340.2807922363281, "logps/rejected": -301.91082763671875, "loss": 0.4824, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.5034831762313843, "rewards/margins": 1.6047446727752686, "rewards/rejected": -2.1082279682159424, "step": 910 }, { "epoch": 0.48, "learning_rate": 2.8824898196625947e-07, "logits/chosen": -2.446906805038452, "logits/rejected": -2.392117977142334, "logps/chosen": -301.62506103515625, "logps/rejected": -307.46258544921875, "loss": 0.4731, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.13553249835968018, "rewards/margins": 1.2748148441314697, "rewards/rejected": -1.4103472232818604, "step": 920 }, { "epoch": 0.49, "learning_rate": 2.853403141361256e-07, "logits/chosen": -2.418687343597412, "logits/rejected": -2.3500733375549316, "logps/chosen": -301.64056396484375, "logps/rejected": -261.1069030761719, "loss": 0.5395, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.5969820022583008, "rewards/margins": 0.8396062850952148, "rewards/rejected": -1.4365884065628052, "step": 930 }, { "epoch": 0.49, "learning_rate": 2.824316463059919e-07, "logits/chosen": -2.4292044639587402, "logits/rejected": -2.3925364017486572, "logps/chosen": -310.83258056640625, "logps/rejected": -304.67486572265625, "loss": 0.484, "rewards/accuracies": 0.75, "rewards/chosen": -0.4251963198184967, "rewards/margins": 1.4573180675506592, "rewards/rejected": -1.8825145959854126, "step": 940 }, { "epoch": 0.5, "learning_rate": 2.7952297847585803e-07, "logits/chosen": -2.417309284210205, "logits/rejected": -2.3521695137023926, "logps/chosen": -278.1476745605469, "logps/rejected": -281.66656494140625, "loss": 0.5149, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.28509074449539185, "rewards/margins": 1.083017110824585, "rewards/rejected": -1.3681080341339111, "step": 950 }, { "epoch": 0.5, "learning_rate": 2.766143106457243e-07, "logits/chosen": -2.3309614658355713, "logits/rejected": -2.3364787101745605, "logps/chosen": -259.80975341796875, "logps/rejected": -250.3785400390625, "loss": 0.5875, "rewards/accuracies": 0.625, "rewards/chosen": -0.18767808377742767, "rewards/margins": 1.1901742219924927, "rewards/rejected": -1.377852201461792, "step": 960 }, { "epoch": 0.51, "learning_rate": 2.7370564281559045e-07, "logits/chosen": -2.5228657722473145, "logits/rejected": -2.431464672088623, "logps/chosen": -346.1999816894531, "logps/rejected": -306.3692932128906, "loss": 0.5132, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.02729633077979088, "rewards/margins": 1.0325729846954346, "rewards/rejected": -1.0598691701889038, "step": 970 }, { "epoch": 0.51, "learning_rate": 2.7079697498545666e-07, "logits/chosen": -2.458400249481201, "logits/rejected": -2.4137959480285645, "logps/chosen": -322.91864013671875, "logps/rejected": -321.85699462890625, "loss": 0.477, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.24051785469055176, "rewards/margins": 1.2779700756072998, "rewards/rejected": -1.5184879302978516, "step": 980 }, { "epoch": 0.52, "learning_rate": 2.6788830715532287e-07, "logits/chosen": -2.5085196495056152, "logits/rejected": -2.441279888153076, "logps/chosen": -266.5360412597656, "logps/rejected": -265.48980712890625, "loss": 0.58, "rewards/accuracies": 0.6875, "rewards/chosen": -0.10649283230304718, "rewards/margins": 1.1308716535568237, "rewards/rejected": -1.2373645305633545, "step": 990 }, { "epoch": 0.52, "learning_rate": 2.6497963932518907e-07, "logits/chosen": -2.3860020637512207, "logits/rejected": -2.3790476322174072, "logps/chosen": -304.46270751953125, "logps/rejected": -309.70697021484375, "loss": 0.6191, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.43433743715286255, "rewards/margins": 0.8789669275283813, "rewards/rejected": -1.3133043050765991, "step": 1000 }, { "epoch": 0.52, "eval_logits/chosen": -2.490722894668579, "eval_logits/rejected": -2.427964210510254, "eval_logps/chosen": -300.2858581542969, "eval_logps/rejected": -286.9600524902344, "eval_loss": 0.5983877778053284, "eval_rewards/accuracies": 0.7599206566810608, "eval_rewards/chosen": 0.03812364488840103, "eval_rewards/margins": 1.0935115814208984, "eval_rewards/rejected": -1.0553878545761108, "eval_runtime": 614.9522, "eval_samples_per_second": 3.252, "eval_steps_per_second": 0.102, "step": 1000 }, { "epoch": 0.53, "learning_rate": 2.6207097149505523e-07, "logits/chosen": -2.511011838912964, "logits/rejected": -2.427821159362793, "logps/chosen": -329.0254821777344, "logps/rejected": -263.37030029296875, "loss": 0.529, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.06932397931814194, "rewards/margins": 1.145455002784729, "rewards/rejected": -1.0761311054229736, "step": 1010 }, { "epoch": 0.53, "learning_rate": 2.591623036649215e-07, "logits/chosen": -2.5254805088043213, "logits/rejected": -2.4318673610687256, "logps/chosen": -304.84466552734375, "logps/rejected": -272.1488952636719, "loss": 0.4663, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.25631338357925415, "rewards/margins": 1.2234864234924316, "rewards/rejected": -1.4797999858856201, "step": 1020 }, { "epoch": 0.54, "learning_rate": 2.5625363583478764e-07, "logits/chosen": -2.4320321083068848, "logits/rejected": -2.407522201538086, "logps/chosen": -285.1934814453125, "logps/rejected": -312.1936950683594, "loss": 0.4631, "rewards/accuracies": 0.75, "rewards/chosen": -0.12434335052967072, "rewards/margins": 1.4510531425476074, "rewards/rejected": -1.5753967761993408, "step": 1030 }, { "epoch": 0.54, "learning_rate": 2.533449680046539e-07, "logits/chosen": -2.480064630508423, "logits/rejected": -2.439182758331299, "logps/chosen": -309.5289001464844, "logps/rejected": -296.3352355957031, "loss": 0.5437, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.2124924659729004, "rewards/margins": 1.4312546253204346, "rewards/rejected": -1.643747091293335, "step": 1040 }, { "epoch": 0.55, "learning_rate": 2.5043630017452006e-07, "logits/chosen": -2.41044020652771, "logits/rejected": -2.3936314582824707, "logps/chosen": -284.94268798828125, "logps/rejected": -283.9359436035156, "loss": 0.5227, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.03999950736761093, "rewards/margins": 1.149482011795044, "rewards/rejected": -1.1094822883605957, "step": 1050 }, { "epoch": 0.55, "learning_rate": 2.4752763234438627e-07, "logits/chosen": -2.430999279022217, "logits/rejected": -2.3697829246520996, "logps/chosen": -253.0732421875, "logps/rejected": -247.62130737304688, "loss": 0.4938, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.35475850105285645, "rewards/margins": 1.3529800176620483, "rewards/rejected": -0.9982213973999023, "step": 1060 }, { "epoch": 0.56, "learning_rate": 2.4461896451425247e-07, "logits/chosen": -2.4376707077026367, "logits/rejected": -2.4240877628326416, "logps/chosen": -269.42938232421875, "logps/rejected": -270.54644775390625, "loss": 0.4648, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.1738893836736679, "rewards/margins": 1.2686713933944702, "rewards/rejected": -1.4425609111785889, "step": 1070 }, { "epoch": 0.57, "learning_rate": 2.417102966841187e-07, "logits/chosen": -2.458761692047119, "logits/rejected": -2.3944101333618164, "logps/chosen": -271.7908935546875, "logps/rejected": -296.3941345214844, "loss": 0.5304, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.14974147081375122, "rewards/margins": 1.2848742008209229, "rewards/rejected": -1.4346158504486084, "step": 1080 }, { "epoch": 0.57, "learning_rate": 2.3880162885398483e-07, "logits/chosen": -2.4826645851135254, "logits/rejected": -2.375396251678467, "logps/chosen": -299.49560546875, "logps/rejected": -252.5529327392578, "loss": 0.532, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.49217844009399414, "rewards/margins": 1.3745396137237549, "rewards/rejected": -1.8667182922363281, "step": 1090 }, { "epoch": 0.58, "learning_rate": 2.3589296102385107e-07, "logits/chosen": -2.499211072921753, "logits/rejected": -2.417027235031128, "logps/chosen": -299.8446044921875, "logps/rejected": -286.61322021484375, "loss": 0.4399, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.06040573865175247, "rewards/margins": 1.4158607721328735, "rewards/rejected": -1.4762665033340454, "step": 1100 }, { "epoch": 0.58, "learning_rate": 2.3298429319371725e-07, "logits/chosen": -2.5318305492401123, "logits/rejected": -2.441438674926758, "logps/chosen": -301.87762451171875, "logps/rejected": -303.8865051269531, "loss": 0.4871, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.34866777062416077, "rewards/margins": 1.076101541519165, "rewards/rejected": -1.4247692823410034, "step": 1110 }, { "epoch": 0.59, "learning_rate": 2.3007562536358346e-07, "logits/chosen": -2.478132486343384, "logits/rejected": -2.420320749282837, "logps/chosen": -304.8691711425781, "logps/rejected": -313.31951904296875, "loss": 0.5539, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.0543731153011322, "rewards/margins": 1.4534409046173096, "rewards/rejected": -1.507813811302185, "step": 1120 }, { "epoch": 0.59, "learning_rate": 2.2716695753344967e-07, "logits/chosen": -2.452592372894287, "logits/rejected": -2.422971487045288, "logps/chosen": -301.95721435546875, "logps/rejected": -311.69268798828125, "loss": 0.5074, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.30631425976753235, "rewards/margins": 1.4355709552764893, "rewards/rejected": -1.7418854236602783, "step": 1130 }, { "epoch": 0.6, "learning_rate": 2.2425828970331587e-07, "logits/chosen": -2.5267157554626465, "logits/rejected": -2.4898681640625, "logps/chosen": -267.6355285644531, "logps/rejected": -277.17376708984375, "loss": 0.5519, "rewards/accuracies": 0.75, "rewards/chosen": -0.5024998784065247, "rewards/margins": 1.3192273378372192, "rewards/rejected": -1.8217273950576782, "step": 1140 }, { "epoch": 0.6, "learning_rate": 2.2134962187318208e-07, "logits/chosen": -2.405796527862549, "logits/rejected": -2.327157497406006, "logps/chosen": -305.180908203125, "logps/rejected": -281.836669921875, "loss": 0.5686, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.9660147428512573, "rewards/margins": 1.525781512260437, "rewards/rejected": -2.4917960166931152, "step": 1150 }, { "epoch": 0.61, "learning_rate": 2.1844095404304826e-07, "logits/chosen": -2.4776437282562256, "logits/rejected": -2.430321455001831, "logps/chosen": -330.463623046875, "logps/rejected": -324.64324951171875, "loss": 0.6433, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.0749146938323975, "rewards/margins": 0.4202847480773926, "rewards/rejected": -1.4951995611190796, "step": 1160 }, { "epoch": 0.61, "learning_rate": 2.1553228621291447e-07, "logits/chosen": -2.4651365280151367, "logits/rejected": -2.394700527191162, "logps/chosen": -286.38763427734375, "logps/rejected": -294.4740295410156, "loss": 0.5214, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9769517779350281, "rewards/margins": 0.8809939622879028, "rewards/rejected": -1.857945442199707, "step": 1170 }, { "epoch": 0.62, "learning_rate": 2.1262361838278068e-07, "logits/chosen": -2.4773755073547363, "logits/rejected": -2.4409890174865723, "logps/chosen": -308.72369384765625, "logps/rejected": -298.4217224121094, "loss": 0.4998, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.7563016414642334, "rewards/margins": 1.2209903001785278, "rewards/rejected": -1.9772918224334717, "step": 1180 }, { "epoch": 0.62, "learning_rate": 2.0971495055264688e-07, "logits/chosen": -2.4952621459960938, "logits/rejected": -2.4689629077911377, "logps/chosen": -291.19232177734375, "logps/rejected": -306.8577575683594, "loss": 0.5187, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.24586816132068634, "rewards/margins": 1.1213955879211426, "rewards/rejected": -1.3672637939453125, "step": 1190 }, { "epoch": 0.63, "learning_rate": 2.0680628272251307e-07, "logits/chosen": -2.509241819381714, "logits/rejected": -2.4504573345184326, "logps/chosen": -285.4134826660156, "logps/rejected": -277.93658447265625, "loss": 0.5123, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.3394593298435211, "rewards/margins": 1.0499706268310547, "rewards/rejected": -1.3894299268722534, "step": 1200 }, { "epoch": 0.63, "learning_rate": 2.0389761489237927e-07, "logits/chosen": -2.5037548542022705, "logits/rejected": -2.4983646869659424, "logps/chosen": -298.67108154296875, "logps/rejected": -290.48687744140625, "loss": 0.5539, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.12395872920751572, "rewards/margins": 1.0312166213989258, "rewards/rejected": -1.1551753282546997, "step": 1210 }, { "epoch": 0.64, "learning_rate": 2.0098894706224548e-07, "logits/chosen": -2.514279842376709, "logits/rejected": -2.473151683807373, "logps/chosen": -255.2734375, "logps/rejected": -252.98855590820312, "loss": 0.4852, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.47876453399658203, "rewards/margins": 1.0464788675308228, "rewards/rejected": -1.5252434015274048, "step": 1220 }, { "epoch": 0.64, "learning_rate": 1.980802792321117e-07, "logits/chosen": -2.471043825149536, "logits/rejected": -2.437826156616211, "logps/chosen": -336.73712158203125, "logps/rejected": -337.2440490722656, "loss": 0.5463, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4187423586845398, "rewards/margins": 1.326642394065857, "rewards/rejected": -1.7453848123550415, "step": 1230 }, { "epoch": 0.65, "learning_rate": 1.9517161140197787e-07, "logits/chosen": -2.5959396362304688, "logits/rejected": -2.5564401149749756, "logps/chosen": -309.3214111328125, "logps/rejected": -300.98138427734375, "loss": 0.5264, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.3669760823249817, "rewards/margins": 1.1876031160354614, "rewards/rejected": -1.5545791387557983, "step": 1240 }, { "epoch": 0.65, "learning_rate": 1.9226294357184408e-07, "logits/chosen": -2.498528480529785, "logits/rejected": -2.4267897605895996, "logps/chosen": -316.4427185058594, "logps/rejected": -322.12493896484375, "loss": 0.5139, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.0588008277118206, "rewards/margins": 1.484725832939148, "rewards/rejected": -1.5435266494750977, "step": 1250 }, { "epoch": 0.66, "learning_rate": 1.8935427574171028e-07, "logits/chosen": -2.542093515396118, "logits/rejected": -2.4682421684265137, "logps/chosen": -337.3454284667969, "logps/rejected": -299.1844482421875, "loss": 0.5489, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.02595728635787964, "rewards/margins": 1.281512975692749, "rewards/rejected": -1.307470440864563, "step": 1260 }, { "epoch": 0.66, "learning_rate": 1.864456079115765e-07, "logits/chosen": -2.477086067199707, "logits/rejected": -2.457650661468506, "logps/chosen": -268.6798095703125, "logps/rejected": -257.23358154296875, "loss": 0.4939, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.1567353755235672, "rewards/margins": 1.2585152387619019, "rewards/rejected": -1.415250539779663, "step": 1270 }, { "epoch": 0.67, "learning_rate": 1.835369400814427e-07, "logits/chosen": -2.544891595840454, "logits/rejected": -2.4903788566589355, "logps/chosen": -289.84234619140625, "logps/rejected": -263.73297119140625, "loss": 0.4819, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.07888557016849518, "rewards/margins": 1.23806893825531, "rewards/rejected": -1.3169543743133545, "step": 1280 }, { "epoch": 0.68, "learning_rate": 1.8062827225130888e-07, "logits/chosen": -2.4458956718444824, "logits/rejected": -2.36130428314209, "logps/chosen": -296.7621765136719, "logps/rejected": -268.7115173339844, "loss": 0.5193, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.21038198471069336, "rewards/margins": 0.9879549741744995, "rewards/rejected": -1.1983369588851929, "step": 1290 }, { "epoch": 0.68, "learning_rate": 1.777196044211751e-07, "logits/chosen": -2.422639846801758, "logits/rejected": -2.3881583213806152, "logps/chosen": -267.2961730957031, "logps/rejected": -258.95159912109375, "loss": 0.5247, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0472625195980072, "rewards/margins": 1.1689708232879639, "rewards/rejected": -1.216233491897583, "step": 1300 }, { "epoch": 0.69, "learning_rate": 1.748109365910413e-07, "logits/chosen": -2.4238734245300293, "logits/rejected": -2.3859925270080566, "logps/chosen": -241.4824981689453, "logps/rejected": -256.94635009765625, "loss": 0.471, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.039008866995573044, "rewards/margins": 1.5137543678283691, "rewards/rejected": -1.474745512008667, "step": 1310 }, { "epoch": 0.69, "learning_rate": 1.719022687609075e-07, "logits/chosen": -2.5132765769958496, "logits/rejected": -2.449302911758423, "logps/chosen": -363.523681640625, "logps/rejected": -352.0708923339844, "loss": 0.518, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.5855492353439331, "rewards/margins": 1.2664353847503662, "rewards/rejected": -1.8519846200942993, "step": 1320 }, { "epoch": 0.7, "learning_rate": 1.6899360093077368e-07, "logits/chosen": -2.4553329944610596, "logits/rejected": -2.4112277030944824, "logps/chosen": -280.7616882324219, "logps/rejected": -236.74765014648438, "loss": 0.5061, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.48466506600379944, "rewards/margins": 1.0938060283660889, "rewards/rejected": -1.578471064567566, "step": 1330 }, { "epoch": 0.7, "learning_rate": 1.660849331006399e-07, "logits/chosen": -2.47243070602417, "logits/rejected": -2.448956251144409, "logps/chosen": -347.1180114746094, "logps/rejected": -303.9145202636719, "loss": 0.5364, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.2242584526538849, "rewards/margins": 1.3924510478973389, "rewards/rejected": -1.6167097091674805, "step": 1340 }, { "epoch": 0.71, "learning_rate": 1.631762652705061e-07, "logits/chosen": -2.430530071258545, "logits/rejected": -2.4229390621185303, "logps/chosen": -245.01284790039062, "logps/rejected": -268.6424560546875, "loss": 0.5447, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5738204121589661, "rewards/margins": 1.0651990175247192, "rewards/rejected": -1.6390196084976196, "step": 1350 }, { "epoch": 0.71, "learning_rate": 1.602675974403723e-07, "logits/chosen": -2.509038209915161, "logits/rejected": -2.489039897918701, "logps/chosen": -279.32379150390625, "logps/rejected": -294.4464416503906, "loss": 0.5021, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5361760854721069, "rewards/margins": 1.5468828678131104, "rewards/rejected": -2.0830588340759277, "step": 1360 }, { "epoch": 0.72, "learning_rate": 1.573589296102385e-07, "logits/chosen": -2.47121262550354, "logits/rejected": -2.3763961791992188, "logps/chosen": -295.9562072753906, "logps/rejected": -312.29022216796875, "loss": 0.5798, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.7061088681221008, "rewards/margins": 0.6994223594665527, "rewards/rejected": -1.4055311679840088, "step": 1370 }, { "epoch": 0.72, "learning_rate": 1.544502617801047e-07, "logits/chosen": -2.4963533878326416, "logits/rejected": -2.4046132564544678, "logps/chosen": -306.95379638671875, "logps/rejected": -269.7267150878906, "loss": 0.5349, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.14952921867370605, "rewards/margins": 1.085407018661499, "rewards/rejected": -1.2349363565444946, "step": 1380 }, { "epoch": 0.73, "learning_rate": 1.515415939499709e-07, "logits/chosen": -2.3877298831939697, "logits/rejected": -2.3528621196746826, "logps/chosen": -306.18292236328125, "logps/rejected": -281.7127990722656, "loss": 0.4883, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.3554263710975647, "rewards/margins": 1.3783906698226929, "rewards/rejected": -1.7338171005249023, "step": 1390 }, { "epoch": 0.73, "learning_rate": 1.486329261198371e-07, "logits/chosen": -2.4248931407928467, "logits/rejected": -2.3428282737731934, "logps/chosen": -265.8216247558594, "logps/rejected": -254.1871795654297, "loss": 0.5616, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.4705156683921814, "rewards/margins": 1.1333085298538208, "rewards/rejected": -1.6038239002227783, "step": 1400 }, { "epoch": 0.74, "learning_rate": 1.4572425828970332e-07, "logits/chosen": -2.5125279426574707, "logits/rejected": -2.4069325923919678, "logps/chosen": -317.7018127441406, "logps/rejected": -303.3476867675781, "loss": 0.4968, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4776553511619568, "rewards/margins": 1.0890066623687744, "rewards/rejected": -1.566662073135376, "step": 1410 }, { "epoch": 0.74, "learning_rate": 1.428155904595695e-07, "logits/chosen": -2.402710437774658, "logits/rejected": -2.35219407081604, "logps/chosen": -308.2895202636719, "logps/rejected": -298.3793640136719, "loss": 0.5636, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.43602848052978516, "rewards/margins": 1.1804325580596924, "rewards/rejected": -1.6164608001708984, "step": 1420 }, { "epoch": 0.75, "learning_rate": 1.399069226294357e-07, "logits/chosen": -2.325125217437744, "logits/rejected": -2.265918254852295, "logps/chosen": -255.18252563476562, "logps/rejected": -242.65451049804688, "loss": 0.5112, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.38456660509109497, "rewards/margins": 1.3007227182388306, "rewards/rejected": -1.6852893829345703, "step": 1430 }, { "epoch": 0.75, "learning_rate": 1.3699825479930191e-07, "logits/chosen": -2.444197177886963, "logits/rejected": -2.3448662757873535, "logps/chosen": -335.58575439453125, "logps/rejected": -257.6363525390625, "loss": 0.4915, "rewards/accuracies": 0.75, "rewards/chosen": -0.1254938691854477, "rewards/margins": 1.4525253772735596, "rewards/rejected": -1.578019380569458, "step": 1440 }, { "epoch": 0.76, "learning_rate": 1.3408958696916812e-07, "logits/chosen": -2.432386875152588, "logits/rejected": -2.3468270301818848, "logps/chosen": -264.3630065917969, "logps/rejected": -253.1586456298828, "loss": 0.4865, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.010800900869071484, "rewards/margins": 1.3583967685699463, "rewards/rejected": -1.3691976070404053, "step": 1450 }, { "epoch": 0.76, "learning_rate": 1.311809191390343e-07, "logits/chosen": -2.3925037384033203, "logits/rejected": -2.3687844276428223, "logps/chosen": -298.83160400390625, "logps/rejected": -321.2250671386719, "loss": 0.5522, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.13408958911895752, "rewards/margins": 1.2063826322555542, "rewards/rejected": -1.3404719829559326, "step": 1460 }, { "epoch": 0.77, "learning_rate": 1.282722513089005e-07, "logits/chosen": -2.5323238372802734, "logits/rejected": -2.450490713119507, "logps/chosen": -303.0655517578125, "logps/rejected": -273.21636962890625, "loss": 0.4542, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.07529834657907486, "rewards/margins": 1.215362787246704, "rewards/rejected": -1.290661096572876, "step": 1470 }, { "epoch": 0.77, "learning_rate": 1.2536358347876672e-07, "logits/chosen": -2.4639523029327393, "logits/rejected": -2.451842784881592, "logps/chosen": -273.89410400390625, "logps/rejected": -288.24395751953125, "loss": 0.5013, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.1511283814907074, "rewards/margins": 1.046019196510315, "rewards/rejected": -1.1971476078033447, "step": 1480 }, { "epoch": 0.78, "learning_rate": 1.2245491564863293e-07, "logits/chosen": -2.5214879512786865, "logits/rejected": -2.3725059032440186, "logps/chosen": -365.9342041015625, "logps/rejected": -288.50860595703125, "loss": 0.4655, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.24411487579345703, "rewards/margins": 1.6229133605957031, "rewards/rejected": -1.3787983655929565, "step": 1490 }, { "epoch": 0.78, "learning_rate": 1.195462478184991e-07, "logits/chosen": -2.4407520294189453, "logits/rejected": -2.403568744659424, "logps/chosen": -266.7708740234375, "logps/rejected": -256.9794006347656, "loss": 0.5342, "rewards/accuracies": 0.6875, "rewards/chosen": 0.008794727735221386, "rewards/margins": 1.0573989152908325, "rewards/rejected": -1.0486040115356445, "step": 1500 }, { "epoch": 0.78, "eval_logits/chosen": -2.479328155517578, "eval_logits/rejected": -2.420834541320801, "eval_logps/chosen": -299.6731872558594, "eval_logps/rejected": -287.96630859375, "eval_loss": 0.5819299817085266, "eval_rewards/accuracies": 0.7559523582458496, "eval_rewards/chosen": 0.09938977658748627, "eval_rewards/margins": 1.2554062604904175, "eval_rewards/rejected": -1.1560163497924805, "eval_runtime": 614.7519, "eval_samples_per_second": 3.253, "eval_steps_per_second": 0.102, "step": 1500 }, { "epoch": 0.79, "learning_rate": 1.1663757998836531e-07, "logits/chosen": -2.4385485649108887, "logits/rejected": -2.414762496948242, "logps/chosen": -282.46331787109375, "logps/rejected": -271.0342712402344, "loss": 0.5393, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.030192693695425987, "rewards/margins": 1.1163215637207031, "rewards/rejected": -1.0861289501190186, "step": 1510 }, { "epoch": 0.8, "learning_rate": 1.1372891215823152e-07, "logits/chosen": -2.475036144256592, "logits/rejected": -2.431018352508545, "logps/chosen": -273.4059753417969, "logps/rejected": -279.97149658203125, "loss": 0.4813, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.07051552832126617, "rewards/margins": 1.225347638130188, "rewards/rejected": -1.295863151550293, "step": 1520 }, { "epoch": 0.8, "learning_rate": 1.1082024432809772e-07, "logits/chosen": -2.434101104736328, "logits/rejected": -2.3855373859405518, "logps/chosen": -270.7154235839844, "logps/rejected": -257.188720703125, "loss": 0.4726, "rewards/accuracies": 0.8125, "rewards/chosen": -0.03814787417650223, "rewards/margins": 1.2096832990646362, "rewards/rejected": -1.2478312253952026, "step": 1530 }, { "epoch": 0.81, "learning_rate": 1.0791157649796392e-07, "logits/chosen": -2.509958267211914, "logits/rejected": -2.5057830810546875, "logps/chosen": -268.060791015625, "logps/rejected": -294.9040832519531, "loss": 0.6052, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.09406580030918121, "rewards/margins": 0.9256828427314758, "rewards/rejected": -1.019748568534851, "step": 1540 }, { "epoch": 0.81, "learning_rate": 1.0500290866783013e-07, "logits/chosen": -2.391065835952759, "logits/rejected": -2.3701300621032715, "logps/chosen": -280.59686279296875, "logps/rejected": -243.96798706054688, "loss": 0.5019, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.04815078154206276, "rewards/margins": 0.9994159936904907, "rewards/rejected": -0.9512651562690735, "step": 1550 }, { "epoch": 0.82, "learning_rate": 1.0209424083769633e-07, "logits/chosen": -2.428290367126465, "logits/rejected": -2.394292116165161, "logps/chosen": -284.85968017578125, "logps/rejected": -287.145263671875, "loss": 0.505, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.22867484390735626, "rewards/margins": 1.033279299736023, "rewards/rejected": -1.2619540691375732, "step": 1560 }, { "epoch": 0.82, "learning_rate": 9.918557300756253e-08, "logits/chosen": -2.4378373622894287, "logits/rejected": -2.369236946105957, "logps/chosen": -293.5215759277344, "logps/rejected": -246.56997680664062, "loss": 0.4905, "rewards/accuracies": 0.75, "rewards/chosen": -0.13024893403053284, "rewards/margins": 1.4804332256317139, "rewards/rejected": -1.6106821298599243, "step": 1570 }, { "epoch": 0.83, "learning_rate": 9.627690517742873e-08, "logits/chosen": -2.4206690788269043, "logits/rejected": -2.409813165664673, "logps/chosen": -333.97467041015625, "logps/rejected": -314.40960693359375, "loss": 0.5211, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.2985494136810303, "rewards/margins": 1.2202610969543457, "rewards/rejected": -0.9217117428779602, "step": 1580 }, { "epoch": 0.83, "learning_rate": 9.336823734729494e-08, "logits/chosen": -2.426513671875, "logits/rejected": -2.4083962440490723, "logps/chosen": -312.02655029296875, "logps/rejected": -314.4944152832031, "loss": 0.5172, "rewards/accuracies": 0.75, "rewards/chosen": -0.1636694371700287, "rewards/margins": 1.20877206325531, "rewards/rejected": -1.372441291809082, "step": 1590 }, { "epoch": 0.84, "learning_rate": 9.045956951716113e-08, "logits/chosen": -2.4410054683685303, "logits/rejected": -2.421079158782959, "logps/chosen": -268.4046630859375, "logps/rejected": -259.6822509765625, "loss": 0.5011, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.018908673897385597, "rewards/margins": 1.4655755758285522, "rewards/rejected": -1.446666955947876, "step": 1600 }, { "epoch": 0.84, "learning_rate": 8.755090168702734e-08, "logits/chosen": -2.4191842079162598, "logits/rejected": -2.3796703815460205, "logps/chosen": -282.9931945800781, "logps/rejected": -297.3874206542969, "loss": 0.488, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.26676809787750244, "rewards/margins": 1.4654661417007446, "rewards/rejected": -1.732234239578247, "step": 1610 }, { "epoch": 0.85, "learning_rate": 8.464223385689353e-08, "logits/chosen": -2.474801540374756, "logits/rejected": -2.439415693283081, "logps/chosen": -316.919921875, "logps/rejected": -286.6101989746094, "loss": 0.5092, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.14554055035114288, "rewards/margins": 1.1523851156234741, "rewards/rejected": -1.2979257106781006, "step": 1620 }, { "epoch": 0.85, "learning_rate": 8.173356602675974e-08, "logits/chosen": -2.501138210296631, "logits/rejected": -2.423609495162964, "logps/chosen": -267.6105651855469, "logps/rejected": -268.2532653808594, "loss": 0.5253, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.7160999178886414, "rewards/margins": 0.7922013998031616, "rewards/rejected": -1.5083013772964478, "step": 1630 }, { "epoch": 0.86, "learning_rate": 7.882489819662593e-08, "logits/chosen": -2.4721310138702393, "logits/rejected": -2.4814186096191406, "logps/chosen": -304.85931396484375, "logps/rejected": -421.44427490234375, "loss": 0.4832, "rewards/accuracies": 0.75, "rewards/chosen": -0.4838688373565674, "rewards/margins": 1.3697353601455688, "rewards/rejected": -1.8536040782928467, "step": 1640 }, { "epoch": 0.86, "learning_rate": 7.591623036649214e-08, "logits/chosen": -2.331878185272217, "logits/rejected": -2.3361971378326416, "logps/chosen": -296.43096923828125, "logps/rejected": -293.191650390625, "loss": 0.5881, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.4644225537776947, "rewards/margins": 1.1996773481369019, "rewards/rejected": -1.664099931716919, "step": 1650 }, { "epoch": 0.87, "learning_rate": 7.300756253635834e-08, "logits/chosen": -2.472486972808838, "logits/rejected": -2.420685291290283, "logps/chosen": -268.98712158203125, "logps/rejected": -287.7394714355469, "loss": 0.4996, "rewards/accuracies": 0.75, "rewards/chosen": -0.22886808216571808, "rewards/margins": 1.0310173034667969, "rewards/rejected": -1.259885311126709, "step": 1660 }, { "epoch": 0.87, "learning_rate": 7.009889470622454e-08, "logits/chosen": -2.480839967727661, "logits/rejected": -2.405022144317627, "logps/chosen": -327.4227600097656, "logps/rejected": -299.2853088378906, "loss": 0.4532, "rewards/accuracies": 0.75, "rewards/chosen": -0.21133089065551758, "rewards/margins": 1.356836199760437, "rewards/rejected": -1.568166971206665, "step": 1670 }, { "epoch": 0.88, "learning_rate": 6.719022687609075e-08, "logits/chosen": -2.4610321521759033, "logits/rejected": -2.3211312294006348, "logps/chosen": -367.6954650878906, "logps/rejected": -312.0871887207031, "loss": 0.4964, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.6737295389175415, "rewards/margins": 1.504465103149414, "rewards/rejected": -2.178194522857666, "step": 1680 }, { "epoch": 0.88, "learning_rate": 6.428155904595695e-08, "logits/chosen": -2.350032091140747, "logits/rejected": -2.3311705589294434, "logps/chosen": -309.73712158203125, "logps/rejected": -270.7311096191406, "loss": 0.4952, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.18244963884353638, "rewards/margins": 1.2423228025436401, "rewards/rejected": -1.4247725009918213, "step": 1690 }, { "epoch": 0.89, "learning_rate": 6.137289121582315e-08, "logits/chosen": -2.5109975337982178, "logits/rejected": -2.3817737102508545, "logps/chosen": -335.9524841308594, "logps/rejected": -272.06829833984375, "loss": 0.4946, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.4245755672454834, "rewards/margins": 1.40361487865448, "rewards/rejected": -1.828190565109253, "step": 1700 }, { "epoch": 0.89, "learning_rate": 5.846422338568935e-08, "logits/chosen": -2.4695639610290527, "logits/rejected": -2.414325714111328, "logps/chosen": -304.23468017578125, "logps/rejected": -285.8380432128906, "loss": 0.4495, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.3581593334674835, "rewards/margins": 1.1380908489227295, "rewards/rejected": -1.4962502717971802, "step": 1710 }, { "epoch": 0.9, "learning_rate": 5.555555555555555e-08, "logits/chosen": -2.439877986907959, "logits/rejected": -2.3664920330047607, "logps/chosen": -350.97137451171875, "logps/rejected": -327.41302490234375, "loss": 0.5355, "rewards/accuracies": 0.625, "rewards/chosen": -0.5186697244644165, "rewards/margins": 1.1480152606964111, "rewards/rejected": -1.6666851043701172, "step": 1720 }, { "epoch": 0.91, "learning_rate": 5.264688772542175e-08, "logits/chosen": -2.455570697784424, "logits/rejected": -2.409554958343506, "logps/chosen": -292.89715576171875, "logps/rejected": -287.56573486328125, "loss": 0.4603, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.08683060109615326, "rewards/margins": 0.9011920690536499, "rewards/rejected": -0.98802250623703, "step": 1730 }, { "epoch": 0.91, "learning_rate": 4.973821989528795e-08, "logits/chosen": -2.502526044845581, "logits/rejected": -2.46052885055542, "logps/chosen": -266.27947998046875, "logps/rejected": -262.73822021484375, "loss": 0.5074, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.008827775716781616, "rewards/margins": 1.2833327054977417, "rewards/rejected": -1.2921605110168457, "step": 1740 }, { "epoch": 0.92, "learning_rate": 4.682955206515416e-08, "logits/chosen": -2.494965076446533, "logits/rejected": -2.4605846405029297, "logps/chosen": -316.22882080078125, "logps/rejected": -368.8150939941406, "loss": 0.5432, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.25845271348953247, "rewards/margins": 1.4049198627471924, "rewards/rejected": -1.6633726358413696, "step": 1750 }, { "epoch": 0.92, "learning_rate": 4.392088423502036e-08, "logits/chosen": -2.481264352798462, "logits/rejected": -2.5107004642486572, "logps/chosen": -274.2122802734375, "logps/rejected": -280.37933349609375, "loss": 0.5557, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5907906293869019, "rewards/margins": 1.1111408472061157, "rewards/rejected": -1.701931357383728, "step": 1760 }, { "epoch": 0.93, "learning_rate": 4.101221640488656e-08, "logits/chosen": -2.4728500843048096, "logits/rejected": -2.408026695251465, "logps/chosen": -308.61956787109375, "logps/rejected": -288.31170654296875, "loss": 0.4805, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.6883036494255066, "rewards/margins": 1.1805965900421143, "rewards/rejected": -1.8689002990722656, "step": 1770 }, { "epoch": 0.93, "learning_rate": 3.810354857475276e-08, "logits/chosen": -2.3637709617614746, "logits/rejected": -2.3329529762268066, "logps/chosen": -324.7041320800781, "logps/rejected": -270.0013122558594, "loss": 0.5471, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.544477105140686, "rewards/margins": 1.064997673034668, "rewards/rejected": -1.6094748973846436, "step": 1780 }, { "epoch": 0.94, "learning_rate": 3.519488074461896e-08, "logits/chosen": -2.447537660598755, "logits/rejected": -2.386204242706299, "logps/chosen": -268.69378662109375, "logps/rejected": -276.32843017578125, "loss": 0.3955, "rewards/accuracies": 0.75, "rewards/chosen": -0.3514997661113739, "rewards/margins": 1.253027319908142, "rewards/rejected": -1.6045271158218384, "step": 1790 }, { "epoch": 0.94, "learning_rate": 3.228621291448516e-08, "logits/chosen": -2.504725694656372, "logits/rejected": -2.4065499305725098, "logps/chosen": -324.8893127441406, "logps/rejected": -262.2546081542969, "loss": 0.4898, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.5291447043418884, "rewards/margins": 1.2341575622558594, "rewards/rejected": -1.763302206993103, "step": 1800 }, { "epoch": 0.95, "learning_rate": 2.9377545084351366e-08, "logits/chosen": -2.4359753131866455, "logits/rejected": -2.3973264694213867, "logps/chosen": -380.8512878417969, "logps/rejected": -345.4535827636719, "loss": 0.4781, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.39544662833213806, "rewards/margins": 1.4492541551589966, "rewards/rejected": -1.844700813293457, "step": 1810 }, { "epoch": 0.95, "learning_rate": 2.6468877254217567e-08, "logits/chosen": -2.455008029937744, "logits/rejected": -2.406989574432373, "logps/chosen": -321.1400451660156, "logps/rejected": -326.90203857421875, "loss": 0.5332, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.45269957184791565, "rewards/margins": 1.45340895652771, "rewards/rejected": -1.9061084985733032, "step": 1820 }, { "epoch": 0.96, "learning_rate": 2.3560209424083768e-08, "logits/chosen": -2.451988458633423, "logits/rejected": -2.390340805053711, "logps/chosen": -319.3875427246094, "logps/rejected": -321.97186279296875, "loss": 0.5775, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6661805510520935, "rewards/margins": 0.7991847395896912, "rewards/rejected": -1.4653651714324951, "step": 1830 }, { "epoch": 0.96, "learning_rate": 2.065154159394997e-08, "logits/chosen": -2.439758539199829, "logits/rejected": -2.419706106185913, "logps/chosen": -330.24810791015625, "logps/rejected": -308.7535095214844, "loss": 0.5889, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.5779649019241333, "rewards/margins": 0.9780583381652832, "rewards/rejected": -1.5560232400894165, "step": 1840 }, { "epoch": 0.97, "learning_rate": 1.7742873763816174e-08, "logits/chosen": -2.4510066509246826, "logits/rejected": -2.406909704208374, "logps/chosen": -338.01861572265625, "logps/rejected": -291.27923583984375, "loss": 0.5524, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.372661292552948, "rewards/margins": 1.7962923049926758, "rewards/rejected": -2.1689534187316895, "step": 1850 }, { "epoch": 0.97, "learning_rate": 1.4834205933682373e-08, "logits/chosen": -2.5106232166290283, "logits/rejected": -2.4458587169647217, "logps/chosen": -300.896728515625, "logps/rejected": -278.838623046875, "loss": 0.5813, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4824046492576599, "rewards/margins": 1.491176962852478, "rewards/rejected": -1.9735815525054932, "step": 1860 }, { "epoch": 0.98, "learning_rate": 1.1925538103548575e-08, "logits/chosen": -2.4239859580993652, "logits/rejected": -2.373213768005371, "logps/chosen": -280.8854675292969, "logps/rejected": -301.01092529296875, "loss": 0.5219, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.43918657302856445, "rewards/margins": 1.376401662826538, "rewards/rejected": -1.8155882358551025, "step": 1870 }, { "epoch": 0.98, "learning_rate": 9.016870273414776e-09, "logits/chosen": -2.4339070320129395, "logits/rejected": -2.3967947959899902, "logps/chosen": -354.99462890625, "logps/rejected": -309.2855529785156, "loss": 0.4764, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.24851647019386292, "rewards/margins": 1.5067845582962036, "rewards/rejected": -1.7553011178970337, "step": 1880 }, { "epoch": 0.99, "learning_rate": 6.1082024432809765e-09, "logits/chosen": -2.4029338359832764, "logits/rejected": -2.4133036136627197, "logps/chosen": -268.3423767089844, "logps/rejected": -301.10321044921875, "loss": 0.5074, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.8797990083694458, "rewards/margins": 0.8209834098815918, "rewards/rejected": -1.7007821798324585, "step": 1890 }, { "epoch": 0.99, "learning_rate": 3.1995346131471783e-09, "logits/chosen": -2.4465489387512207, "logits/rejected": -2.4295449256896973, "logps/chosen": -268.50006103515625, "logps/rejected": -266.83685302734375, "loss": 0.479, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.589029848575592, "rewards/margins": 1.1926295757293701, "rewards/rejected": -1.781659483909607, "step": 1900 }, { "epoch": 1.0, "learning_rate": 2.9086678301337986e-10, "logits/chosen": -2.482949733734131, "logits/rejected": -2.426959991455078, "logps/chosen": -303.0064392089844, "logps/rejected": -287.6535339355469, "loss": 0.5333, "rewards/accuracies": 0.6875, "rewards/chosen": -0.44241657853126526, "rewards/margins": 0.9881950616836548, "rewards/rejected": -1.4306116104125977, "step": 1910 }, { "epoch": 1.0, "step": 1911, "total_flos": 0.0, "train_loss": 0.5469513680846325, "train_runtime": 31987.9403, "train_samples_per_second": 1.911, "train_steps_per_second": 0.06 } ], "logging_steps": 10, "max_steps": 1911, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }