{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9997382884061764, "eval_steps": 100, "global_step": 955, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0010468463752944255, "grad_norm": 13.816486358642578, "learning_rate": 5.208333333333333e-08, "logits/chosen": -2.9122443199157715, "logits/rejected": -2.8823766708374023, "logps/chosen": -276.3387451171875, "logps/rejected": -242.270751953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.010468463752944255, "grad_norm": 14.41542911529541, "learning_rate": 5.208333333333334e-07, "logits/chosen": -2.8143603801727295, "logits/rejected": -2.7806081771850586, "logps/chosen": -269.4888610839844, "logps/rejected": -283.96014404296875, "loss": 0.6927, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0007840646430850029, "rewards/margins": -0.0032457474153488874, "rewards/rejected": 0.00246168184094131, "step": 10 }, { "epoch": 0.02093692750588851, "grad_norm": 14.011621475219727, "learning_rate": 1.0416666666666667e-06, "logits/chosen": -2.8383383750915527, "logits/rejected": -2.778038740158081, "logps/chosen": -289.6752624511719, "logps/rejected": -246.4437255859375, "loss": 0.6903, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0002006460854317993, "rewards/margins": 0.0035436502657830715, "rewards/rejected": -0.0033430042676627636, "step": 20 }, { "epoch": 0.031405391258832765, "grad_norm": 13.931632995605469, "learning_rate": 1.5625e-06, "logits/chosen": -2.8146023750305176, "logits/rejected": -2.8039023876190186, "logps/chosen": -259.8967590332031, "logps/rejected": -239.6516571044922, "loss": 0.6843, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.011000868864357471, "rewards/margins": 0.01744804158806801, "rewards/rejected": -0.006447173655033112, "step": 30 }, { "epoch": 0.04187385501177702, "grad_norm": 13.794343948364258, "learning_rate": 2.0833333333333334e-06, "logits/chosen": -2.823273181915283, "logits/rejected": -2.7753758430480957, "logps/chosen": -267.2137451171875, "logps/rejected": -260.3147277832031, "loss": 0.6693, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.014138467609882355, "rewards/margins": 0.059490323066711426, "rewards/rejected": -0.04535185918211937, "step": 40 }, { "epoch": 0.05234231876472128, "grad_norm": 13.499931335449219, "learning_rate": 2.604166666666667e-06, "logits/chosen": -2.756686210632324, "logits/rejected": -2.7536370754241943, "logps/chosen": -224.6168670654297, "logps/rejected": -232.76455688476562, "loss": 0.6412, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.026402924209833145, "rewards/margins": 0.13416479527950287, "rewards/rejected": -0.10776187479496002, "step": 50 }, { "epoch": 0.06281078251766553, "grad_norm": 11.923255920410156, "learning_rate": 3.125e-06, "logits/chosen": -2.7912814617156982, "logits/rejected": -2.7667124271392822, "logps/chosen": -244.9199676513672, "logps/rejected": -246.4986114501953, "loss": 0.6221, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.030213266611099243, "rewards/margins": 0.2403673380613327, "rewards/rejected": -0.21015405654907227, "step": 60 }, { "epoch": 0.07327924627060979, "grad_norm": 12.119612693786621, "learning_rate": 3.6458333333333333e-06, "logits/chosen": -2.8213372230529785, "logits/rejected": -2.7878777980804443, "logps/chosen": -299.50091552734375, "logps/rejected": -263.40069580078125, "loss": 0.5969, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.06017110496759415, "rewards/margins": 0.3069015145301819, "rewards/rejected": -0.24673044681549072, "step": 70 }, { "epoch": 0.08374771002355404, "grad_norm": 15.060250282287598, "learning_rate": 4.166666666666667e-06, "logits/chosen": -2.776968479156494, "logits/rejected": -2.747732639312744, "logps/chosen": -274.90594482421875, "logps/rejected": -258.62103271484375, "loss": 0.6066, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.034456029534339905, "rewards/margins": 0.39508965611457825, "rewards/rejected": -0.42954570055007935, "step": 80 }, { "epoch": 0.0942161737764983, "grad_norm": 12.425506591796875, "learning_rate": 4.6875000000000004e-06, "logits/chosen": -2.781564950942993, "logits/rejected": -2.756075620651245, "logps/chosen": -250.3762664794922, "logps/rejected": -221.0802459716797, "loss": 0.5766, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.05863947793841362, "rewards/margins": 0.37909096479415894, "rewards/rejected": -0.43773046135902405, "step": 90 }, { "epoch": 0.10468463752944256, "grad_norm": 12.826536178588867, "learning_rate": 4.9997324926814375e-06, "logits/chosen": -2.775822401046753, "logits/rejected": -2.7732839584350586, "logps/chosen": -267.44232177734375, "logps/rejected": -250.16455078125, "loss": 0.5795, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.06354306638240814, "rewards/margins": 0.5622893571853638, "rewards/rejected": -0.4987463057041168, "step": 100 }, { "epoch": 0.10468463752944256, "eval_logits/chosen": -2.7899813652038574, "eval_logits/rejected": -2.768789529800415, "eval_logps/chosen": -271.6063232421875, "eval_logps/rejected": -271.3592529296875, "eval_loss": 0.5875207781791687, "eval_rewards/accuracies": 0.682539701461792, "eval_rewards/chosen": 0.0264796894043684, "eval_rewards/margins": 0.3985615074634552, "eval_rewards/rejected": -0.37208184599876404, "eval_runtime": 354.4273, "eval_samples_per_second": 5.643, "eval_steps_per_second": 0.178, "step": 100 }, { "epoch": 0.11515310128238682, "grad_norm": 11.797159194946289, "learning_rate": 4.996723692767927e-06, "logits/chosen": -2.7566726207733154, "logits/rejected": -2.727776527404785, "logps/chosen": -275.2736511230469, "logps/rejected": -252.49301147460938, "loss": 0.5617, "rewards/accuracies": 0.6875, "rewards/chosen": 0.06772824376821518, "rewards/margins": 0.48222237825393677, "rewards/rejected": -0.414494127035141, "step": 110 }, { "epoch": 0.12562156503533106, "grad_norm": 13.538057327270508, "learning_rate": 4.9903757462135984e-06, "logits/chosen": -2.80414080619812, "logits/rejected": -2.7789015769958496, "logps/chosen": -260.9825744628906, "logps/rejected": -256.66082763671875, "loss": 0.5598, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.07279430329799652, "rewards/margins": 0.5211631655693054, "rewards/rejected": -0.4483688771724701, "step": 120 }, { "epoch": 0.1360900287882753, "grad_norm": 17.772846221923828, "learning_rate": 4.980697142834315e-06, "logits/chosen": -2.7555739879608154, "logits/rejected": -2.7485146522521973, "logps/chosen": -259.66021728515625, "logps/rejected": -250.82589721679688, "loss": 0.5679, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.04345005005598068, "rewards/margins": 0.595272958278656, "rewards/rejected": -0.5518229603767395, "step": 130 }, { "epoch": 0.14655849254121958, "grad_norm": 14.183941841125488, "learning_rate": 4.967700826904229e-06, "logits/chosen": -2.789041042327881, "logits/rejected": -2.7646267414093018, "logps/chosen": -251.831298828125, "logps/rejected": -259.8060607910156, "loss": 0.5732, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.012771936133503914, "rewards/margins": 0.5387409925460815, "rewards/rejected": -0.5515128970146179, "step": 140 }, { "epoch": 0.15702695629416383, "grad_norm": 13.352206230163574, "learning_rate": 4.951404179843963e-06, "logits/chosen": -2.7706031799316406, "logits/rejected": -2.777296543121338, "logps/chosen": -260.20172119140625, "logps/rejected": -244.833984375, "loss": 0.5414, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.30398789048194885, "rewards/margins": 0.5834565162658691, "rewards/rejected": -0.2794686257839203, "step": 150 }, { "epoch": 0.16749542004710807, "grad_norm": 14.198497772216797, "learning_rate": 4.931828996974498e-06, "logits/chosen": -2.8031933307647705, "logits/rejected": -2.7727789878845215, "logps/chosen": -282.6216735839844, "logps/rejected": -249.09408569335938, "loss": 0.5264, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.7594842314720154, "rewards/margins": 0.7169784903526306, "rewards/rejected": 0.04250572994351387, "step": 160 }, { "epoch": 0.17796388380005235, "grad_norm": 14.190126419067383, "learning_rate": 4.909001458367867e-06, "logits/chosen": -2.821536064147949, "logits/rejected": -2.7903571128845215, "logps/chosen": -268.6175537109375, "logps/rejected": -254.8057098388672, "loss": 0.5492, "rewards/accuracies": 0.71875, "rewards/chosen": 0.7267504930496216, "rewards/margins": 0.6977671980857849, "rewards/rejected": 0.028983300551772118, "step": 170 }, { "epoch": 0.1884323475529966, "grad_norm": 14.373480796813965, "learning_rate": 4.882952093833628e-06, "logits/chosen": -2.828423023223877, "logits/rejected": -2.818983554840088, "logps/chosen": -246.63040161132812, "logps/rejected": -241.3114471435547, "loss": 0.5687, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4481576979160309, "rewards/margins": 0.6290773153305054, "rewards/rejected": -0.18091967701911926, "step": 180 }, { "epoch": 0.19890081130594087, "grad_norm": 12.725995063781738, "learning_rate": 4.853715742087947e-06, "logits/chosen": -2.8280279636383057, "logits/rejected": -2.793407917022705, "logps/chosen": -296.0823669433594, "logps/rejected": -275.6176452636719, "loss": 0.538, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.4707435667514801, "rewards/margins": 0.7004532814025879, "rewards/rejected": -0.22970974445343018, "step": 190 }, { "epoch": 0.2093692750588851, "grad_norm": 16.12042236328125, "learning_rate": 4.821331504159906e-06, "logits/chosen": -2.7835636138916016, "logits/rejected": -2.7702224254608154, "logps/chosen": -249.65725708007812, "logps/rejected": -258.82049560546875, "loss": 0.5449, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.3579314351081848, "rewards/margins": 0.6976887583732605, "rewards/rejected": -0.3397572934627533, "step": 200 }, { "epoch": 0.2093692750588851, "eval_logits/chosen": -2.7981207370758057, "eval_logits/rejected": -2.779163122177124, "eval_logps/chosen": -271.27044677734375, "eval_logps/rejected": -273.3644714355469, "eval_loss": 0.5519838333129883, "eval_rewards/accuracies": 0.7103174328804016, "eval_rewards/chosen": 0.0600733757019043, "eval_rewards/margins": 0.6326771974563599, "eval_rewards/rejected": -0.5726038217544556, "eval_runtime": 353.4661, "eval_samples_per_second": 5.658, "eval_steps_per_second": 0.178, "step": 200 }, { "epoch": 0.21983773881182936, "grad_norm": 15.229165077209473, "learning_rate": 4.7858426910973435e-06, "logits/chosen": -2.8110532760620117, "logits/rejected": -2.785770893096924, "logps/chosen": -290.8441467285156, "logps/rejected": -283.64971923828125, "loss": 0.5339, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.17869320511817932, "rewards/margins": 0.780803382396698, "rewards/rejected": -0.6021102070808411, "step": 210 }, { "epoch": 0.23030620256477363, "grad_norm": 11.427948951721191, "learning_rate": 4.747296766042161e-06, "logits/chosen": -2.761380434036255, "logits/rejected": -2.7642910480499268, "logps/chosen": -280.3062744140625, "logps/rejected": -276.4909973144531, "loss": 0.5184, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.01649661734700203, "rewards/margins": 0.7951753735542297, "rewards/rejected": -0.7786787748336792, "step": 220 }, { "epoch": 0.24077466631771788, "grad_norm": 15.31595230102539, "learning_rate": 4.705745280752586e-06, "logits/chosen": -2.8096513748168945, "logits/rejected": -2.7831692695617676, "logps/chosen": -272.4754943847656, "logps/rejected": -266.40240478515625, "loss": 0.524, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.3214190900325775, "rewards/margins": 0.831048846244812, "rewards/rejected": -0.5096298456192017, "step": 230 }, { "epoch": 0.2512431300706621, "grad_norm": 16.855459213256836, "learning_rate": 4.661243806657256e-06, "logits/chosen": -2.790187120437622, "logits/rejected": -2.751081705093384, "logps/chosen": -297.11236572265625, "logps/rejected": -265.01123046875, "loss": 0.5584, "rewards/accuracies": 0.71875, "rewards/chosen": 0.5351754426956177, "rewards/margins": 0.7664622664451599, "rewards/rejected": -0.23128685355186462, "step": 240 }, { "epoch": 0.26171159382360637, "grad_norm": 13.651627540588379, "learning_rate": 4.613851860533367e-06, "logits/chosen": -2.788170576095581, "logits/rejected": -2.7668204307556152, "logps/chosen": -260.80865478515625, "logps/rejected": -255.9954376220703, "loss": 0.5141, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.4738442301750183, "rewards/margins": 1.0687077045440674, "rewards/rejected": -0.5948633551597595, "step": 250 }, { "epoch": 0.2721800575765506, "grad_norm": 15.22368049621582, "learning_rate": 4.563632824908252e-06, "logits/chosen": -2.8234317302703857, "logits/rejected": -2.7849972248077393, "logps/chosen": -297.3622741699219, "logps/rejected": -288.5556945800781, "loss": 0.5374, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.3837077021598816, "rewards/margins": 0.7442808747291565, "rewards/rejected": -0.3605732023715973, "step": 260 }, { "epoch": 0.2826485213294949, "grad_norm": 13.450421333312988, "learning_rate": 4.510653863290871e-06, "logits/chosen": -2.7682323455810547, "logits/rejected": -2.7710976600646973, "logps/chosen": -264.67218017578125, "logps/rejected": -262.8077697753906, "loss": 0.5469, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.5086392164230347, "rewards/margins": 0.8164563179016113, "rewards/rejected": -0.3078171908855438, "step": 270 }, { "epoch": 0.29311698508243916, "grad_norm": 13.034257888793945, "learning_rate": 4.454985830346574e-06, "logits/chosen": -2.7635130882263184, "logits/rejected": -2.730128049850464, "logps/chosen": -263.2830810546875, "logps/rejected": -227.2650604248047, "loss": 0.5155, "rewards/accuracies": 0.78125, "rewards/chosen": 0.6867417693138123, "rewards/margins": 0.9631470441818237, "rewards/rejected": -0.2764051854610443, "step": 280 }, { "epoch": 0.3035854488353834, "grad_norm": 11.6800537109375, "learning_rate": 4.396703177135262e-06, "logits/chosen": -2.741833209991455, "logits/rejected": -2.7163052558898926, "logps/chosen": -271.12530517578125, "logps/rejected": -264.3458557128906, "loss": 0.5064, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 0.34893399477005005, "rewards/margins": 0.8408756256103516, "rewards/rejected": -0.49194154143333435, "step": 290 }, { "epoch": 0.31405391258832765, "grad_norm": 15.404385566711426, "learning_rate": 4.335883851539693e-06, "logits/chosen": -2.7640957832336426, "logits/rejected": -2.7771594524383545, "logps/chosen": -268.81573486328125, "logps/rejected": -250.0668182373047, "loss": 0.545, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.04025740176439285, "rewards/margins": 0.6916046738624573, "rewards/rejected": -0.651347279548645, "step": 300 }, { "epoch": 0.31405391258832765, "eval_logits/chosen": -2.780329942703247, "eval_logits/rejected": -2.7615909576416016, "eval_logps/chosen": -272.0685729980469, "eval_logps/rejected": -275.2751159667969, "eval_loss": 0.5320433378219604, "eval_rewards/accuracies": 0.704365074634552, "eval_rewards/chosen": -0.019741566851735115, "eval_rewards/margins": 0.7439272999763489, "eval_rewards/rejected": -0.7636688351631165, "eval_runtime": 353.3156, "eval_samples_per_second": 5.661, "eval_steps_per_second": 0.178, "step": 300 }, { "epoch": 0.3245223763412719, "grad_norm": 15.563763618469238, "learning_rate": 4.2726091940171055e-06, "logits/chosen": -2.829540729522705, "logits/rejected": -2.7938830852508545, "logps/chosen": -332.7828674316406, "logps/rejected": -280.2691955566406, "loss": 0.5152, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.09140340238809586, "rewards/margins": 0.7919255495071411, "rewards/rejected": -0.7005220651626587, "step": 310 }, { "epoch": 0.33499084009421615, "grad_norm": 16.867467880249023, "learning_rate": 4.206963828813555e-06, "logits/chosen": -2.788259983062744, "logits/rejected": -2.7637429237365723, "logps/chosen": -271.119140625, "logps/rejected": -276.2804260253906, "loss": 0.5178, "rewards/accuracies": 0.75, "rewards/chosen": -0.10287600755691528, "rewards/margins": 0.8704935908317566, "rewards/rejected": -0.9733695983886719, "step": 320 }, { "epoch": 0.34545930384716045, "grad_norm": 14.005449295043945, "learning_rate": 4.139035550786495e-06, "logits/chosen": -2.7821526527404785, "logits/rejected": -2.754249095916748, "logps/chosen": -268.0732421875, "logps/rejected": -225.94900512695312, "loss": 0.51, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.022264836356043816, "rewards/margins": 0.8124101758003235, "rewards/rejected": -0.7901453971862793, "step": 330 }, { "epoch": 0.3559277676001047, "grad_norm": 17.381444931030273, "learning_rate": 4.068915207986931e-06, "logits/chosen": -2.771470069885254, "logits/rejected": -2.7284042835235596, "logps/chosen": -257.30120849609375, "logps/rejected": -253.9716339111328, "loss": 0.5152, "rewards/accuracies": 0.6875, "rewards/chosen": 0.1510663479566574, "rewards/margins": 0.8377777338027954, "rewards/rejected": -0.686711311340332, "step": 340 }, { "epoch": 0.36639623135304894, "grad_norm": 13.891199111938477, "learning_rate": 3.996696580158211e-06, "logits/chosen": -2.793546676635742, "logits/rejected": -2.7672958374023438, "logps/chosen": -294.7528076171875, "logps/rejected": -245.27969360351562, "loss": 0.5194, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.3805600702762604, "rewards/margins": 0.7021139860153198, "rewards/rejected": -0.32155394554138184, "step": 350 }, { "epoch": 0.3768646951059932, "grad_norm": 14.259988784790039, "learning_rate": 3.922476253313921e-06, "logits/chosen": -2.741086483001709, "logits/rejected": -2.7274608612060547, "logps/chosen": -274.81207275390625, "logps/rejected": -242.241943359375, "loss": 0.5059, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.2537211775779724, "rewards/margins": 0.7949485778808594, "rewards/rejected": -0.5412274599075317, "step": 360 }, { "epoch": 0.38733315885893743, "grad_norm": 19.930835723876953, "learning_rate": 3.846353490562664e-06, "logits/chosen": -2.715400218963623, "logits/rejected": -2.6755480766296387, "logps/chosen": -266.83074951171875, "logps/rejected": -271.99041748046875, "loss": 0.5134, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.2603386640548706, "rewards/margins": 1.0200598239898682, "rewards/rejected": -1.2803986072540283, "step": 370 }, { "epoch": 0.39780162261188173, "grad_norm": 15.25166130065918, "learning_rate": 3.768430099352445e-06, "logits/chosen": -2.769425868988037, "logits/rejected": -2.723906993865967, "logps/chosen": -267.3630065917969, "logps/rejected": -261.64910888671875, "loss": 0.5334, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.059519242495298386, "rewards/margins": 0.8974593877792358, "rewards/rejected": -0.9569786787033081, "step": 380 }, { "epoch": 0.408270086364826, "grad_norm": 13.465699195861816, "learning_rate": 3.6888102953122307e-06, "logits/chosen": -2.772050619125366, "logits/rejected": -2.7556827068328857, "logps/chosen": -254.5989532470703, "logps/rejected": -253.4079132080078, "loss": 0.5046, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.2735213041305542, "rewards/margins": 0.8997882008552551, "rewards/rejected": -0.6262668967247009, "step": 390 }, { "epoch": 0.4187385501177702, "grad_norm": 17.0921573638916, "learning_rate": 3.607600562872785e-06, "logits/chosen": -2.7273311614990234, "logits/rejected": -2.7061333656311035, "logps/chosen": -275.65447998046875, "logps/rejected": -268.2806396484375, "loss": 0.4747, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": 0.17162422835826874, "rewards/margins": 1.1053993701934814, "rewards/rejected": -0.9337752461433411, "step": 400 }, { "epoch": 0.4187385501177702, "eval_logits/chosen": -2.7731950283050537, "eval_logits/rejected": -2.753185510635376, "eval_logps/chosen": -273.5996398925781, "eval_logps/rejected": -277.1651306152344, "eval_loss": 0.5228143334388733, "eval_rewards/accuracies": 0.7003968358039856, "eval_rewards/chosen": -0.1728479415178299, "eval_rewards/margins": 0.7798227071762085, "eval_rewards/rejected": -0.952670693397522, "eval_runtime": 353.2859, "eval_samples_per_second": 5.661, "eval_steps_per_second": 0.178, "step": 400 }, { "epoch": 0.42920701387071447, "grad_norm": 14.339557647705078, "learning_rate": 3.5249095128531863e-06, "logits/chosen": -2.664485454559326, "logits/rejected": -2.6545228958129883, "logps/chosen": -247.5968780517578, "logps/rejected": -247.5493621826172, "loss": 0.4833, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.0953296571969986, "rewards/margins": 1.0826025009155273, "rewards/rejected": -1.1779320240020752, "step": 410 }, { "epoch": 0.4396754776236587, "grad_norm": 17.868024826049805, "learning_rate": 3.4408477372034743e-06, "logits/chosen": -2.716871738433838, "logits/rejected": -2.7016210556030273, "logps/chosen": -246.01736450195312, "logps/rejected": -259.8745422363281, "loss": 0.4955, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.052067507058382034, "rewards/margins": 1.0310100317001343, "rewards/rejected": -1.0830775499343872, "step": 420 }, { "epoch": 0.45014394137660296, "grad_norm": 16.81452178955078, "learning_rate": 3.355527661097728e-06, "logits/chosen": -2.75317645072937, "logits/rejected": -2.7509617805480957, "logps/chosen": -250.8342742919922, "logps/rejected": -248.368408203125, "loss": 0.529, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.0890086218714714, "rewards/margins": 1.041926622390747, "rewards/rejected": -0.9529180526733398, "step": 430 }, { "epoch": 0.46061240512954726, "grad_norm": 14.235637664794922, "learning_rate": 3.269063392575352e-06, "logits/chosen": -2.807570695877075, "logits/rejected": -2.744551420211792, "logps/chosen": -296.353759765625, "logps/rejected": -257.65374755859375, "loss": 0.4776, "rewards/accuracies": 0.71875, "rewards/chosen": 0.2943809926509857, "rewards/margins": 1.0381492376327515, "rewards/rejected": -0.7437682747840881, "step": 440 }, { "epoch": 0.4710808688824915, "grad_norm": 11.64111042022705, "learning_rate": 3.181570569931697e-06, "logits/chosen": -2.8272039890289307, "logits/rejected": -2.7998225688934326, "logps/chosen": -298.7302551269531, "logps/rejected": -261.15606689453125, "loss": 0.4891, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.18123969435691833, "rewards/margins": 0.7455132007598877, "rewards/rejected": -0.564273476600647, "step": 450 }, { "epoch": 0.48154933263543576, "grad_norm": 14.514245986938477, "learning_rate": 3.09316620706208e-06, "logits/chosen": -2.803687334060669, "logits/rejected": -2.788764715194702, "logps/chosen": -294.7230224609375, "logps/rejected": -290.67449951171875, "loss": 0.5102, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.22936221957206726, "rewards/margins": 1.060276985168457, "rewards/rejected": -0.8309147953987122, "step": 460 }, { "epoch": 0.49201779638838, "grad_norm": 19.26064682006836, "learning_rate": 3.0039685369660785e-06, "logits/chosen": -2.745448589324951, "logits/rejected": -2.7057714462280273, "logps/chosen": -275.9302978515625, "logps/rejected": -271.31964111328125, "loss": 0.5009, "rewards/accuracies": 0.84375, "rewards/chosen": 0.40928229689598083, "rewards/margins": 1.3523207902908325, "rewards/rejected": -0.9430384635925293, "step": 470 }, { "epoch": 0.5024862601413242, "grad_norm": 15.569584846496582, "learning_rate": 2.91409685362137e-06, "logits/chosen": -2.728668689727783, "logits/rejected": -2.7024617195129395, "logps/chosen": -280.735595703125, "logps/rejected": -284.4299621582031, "loss": 0.5371, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.218542218208313, "rewards/margins": 0.9496285319328308, "rewards/rejected": -0.7310863137245178, "step": 480 }, { "epoch": 0.5129547238942685, "grad_norm": 13.441338539123535, "learning_rate": 2.8236713524386085e-06, "logits/chosen": -2.780989408493042, "logits/rejected": -2.7283778190612793, "logps/chosen": -251.0142364501953, "logps/rejected": -233.73703002929688, "loss": 0.5264, "rewards/accuracies": 0.71875, "rewards/chosen": 0.3006051182746887, "rewards/margins": 1.0492786169052124, "rewards/rejected": -0.7486735582351685, "step": 490 }, { "epoch": 0.5234231876472127, "grad_norm": 15.913731575012207, "learning_rate": 2.7328129695107205e-06, "logits/chosen": -2.6493608951568604, "logits/rejected": -2.6593687534332275, "logps/chosen": -285.948974609375, "logps/rejected": -264.4158630371094, "loss": 0.5367, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.08400087058544159, "rewards/margins": 0.9130845069885254, "rewards/rejected": -0.997085452079773, "step": 500 }, { "epoch": 0.5234231876472127, "eval_logits/chosen": -2.753965139389038, "eval_logits/rejected": -2.733893632888794, "eval_logps/chosen": -274.0134582519531, "eval_logps/rejected": -278.07373046875, "eval_loss": 0.5174898505210876, "eval_rewards/accuracies": 0.7142857313156128, "eval_rewards/chosen": -0.21423013508319855, "eval_rewards/margins": 0.8292967677116394, "eval_rewards/rejected": -1.0435270071029663, "eval_runtime": 353.1034, "eval_samples_per_second": 5.664, "eval_steps_per_second": 0.178, "step": 500 }, { "epoch": 0.533891651400157, "grad_norm": 15.676944732666016, "learning_rate": 2.641643219871597e-06, "logits/chosen": -2.719494581222534, "logits/rejected": -2.7037150859832764, "logps/chosen": -294.9654235839844, "logps/rejected": -266.7233581542969, "loss": 0.5713, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.23952999711036682, "rewards/margins": 0.886762261390686, "rewards/rejected": -1.126292109489441, "step": 510 }, { "epoch": 0.5443601151531012, "grad_norm": 13.938344955444336, "learning_rate": 2.5502840349805074e-06, "logits/chosen": -2.696988821029663, "logits/rejected": -2.683870792388916, "logps/chosen": -269.9395446777344, "logps/rejected": -262.0766906738281, "loss": 0.49, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.01678340695798397, "rewards/margins": 0.9905561208724976, "rewards/rejected": -1.0073394775390625, "step": 520 }, { "epoch": 0.5548285789060455, "grad_norm": 15.948507308959961, "learning_rate": 2.4588575996495797e-06, "logits/chosen": -2.7424683570861816, "logits/rejected": -2.694587230682373, "logps/chosen": -284.1721496582031, "logps/rejected": -278.7875671386719, "loss": 0.5118, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.3610132336616516, "rewards/margins": 0.8961461186408997, "rewards/rejected": -0.5351330041885376, "step": 530 }, { "epoch": 0.5652970426589898, "grad_norm": 12.358839988708496, "learning_rate": 2.367486188632446e-06, "logits/chosen": -2.6885437965393066, "logits/rejected": -2.6670284271240234, "logps/chosen": -256.28900146484375, "logps/rejected": -221.53140258789062, "loss": 0.4942, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.3476331830024719, "rewards/margins": 0.8925381898880005, "rewards/rejected": -0.5449050068855286, "step": 540 }, { "epoch": 0.575765506411934, "grad_norm": 15.99252986907959, "learning_rate": 2.276292003092593e-06, "logits/chosen": -2.733283281326294, "logits/rejected": -2.7206578254699707, "logps/chosen": -258.4396667480469, "logps/rejected": -267.3887634277344, "loss": 0.5156, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.19142115116119385, "rewards/margins": 0.8478537797927856, "rewards/rejected": -0.656432569026947, "step": 550 }, { "epoch": 0.5862339701648783, "grad_norm": 17.259794235229492, "learning_rate": 2.1853970071701415e-06, "logits/chosen": -2.7236075401306152, "logits/rejected": -2.697274923324585, "logps/chosen": -275.6216735839844, "logps/rejected": -264.6971740722656, "loss": 0.5283, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.0764341875910759, "rewards/margins": 0.8573528528213501, "rewards/rejected": -0.7809187173843384, "step": 560 }, { "epoch": 0.5967024339178225, "grad_norm": 15.168867111206055, "learning_rate": 2.0949227648656194e-06, "logits/chosen": -2.74639630317688, "logits/rejected": -2.7061123847961426, "logps/chosen": -275.2660217285156, "logps/rejected": -243.61441040039062, "loss": 0.4809, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.16716930270195007, "rewards/margins": 1.0621994733810425, "rewards/rejected": -0.89503014087677, "step": 570 }, { "epoch": 0.6071708976707668, "grad_norm": 16.580690383911133, "learning_rate": 2.00499027745888e-06, "logits/chosen": -2.6987080574035645, "logits/rejected": -2.6702322959899902, "logps/chosen": -251.2617950439453, "logps/rejected": -249.33554077148438, "loss": 0.4982, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.08427687734365463, "rewards/margins": 1.002518653869629, "rewards/rejected": -0.9182417988777161, "step": 580 }, { "epoch": 0.6176393614237111, "grad_norm": 13.715599060058594, "learning_rate": 1.915719821680624e-06, "logits/chosen": -2.7376370429992676, "logits/rejected": -2.695758819580078, "logps/chosen": -288.4046325683594, "logps/rejected": -244.7723388671875, "loss": 0.5487, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.006659612059593201, "rewards/margins": 0.9512457847595215, "rewards/rejected": -0.9579054117202759, "step": 590 }, { "epoch": 0.6281078251766553, "grad_norm": 19.28061294555664, "learning_rate": 1.8272307888529276e-06, "logits/chosen": -2.720449924468994, "logits/rejected": -2.7100579738616943, "logps/chosen": -256.0718994140625, "logps/rejected": -238.8487548828125, "loss": 0.5031, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.2969541549682617, "rewards/margins": 0.761928915977478, "rewards/rejected": -1.0588830709457397, "step": 600 }, { "epoch": 0.6281078251766553, "eval_logits/chosen": -2.726771831512451, "eval_logits/rejected": -2.7070980072021484, "eval_logps/chosen": -274.8104553222656, "eval_logps/rejected": -278.967041015625, "eval_loss": 0.513893723487854, "eval_rewards/accuracies": 0.7023809552192688, "eval_rewards/chosen": -0.2939308285713196, "eval_rewards/margins": 0.8389276266098022, "eval_rewards/rejected": -1.132858395576477, "eval_runtime": 353.2373, "eval_samples_per_second": 5.662, "eval_steps_per_second": 0.178, "step": 600 }, { "epoch": 0.6385762889295996, "grad_norm": 15.231009483337402, "learning_rate": 1.739641525213929e-06, "logits/chosen": -2.688774347305298, "logits/rejected": -2.6713051795959473, "logps/chosen": -264.80462646484375, "logps/rejected": -323.32769775390625, "loss": 0.4765, "rewards/accuracies": 0.78125, "rewards/chosen": -0.2533731758594513, "rewards/margins": 1.022083044052124, "rewards/rejected": -1.275456190109253, "step": 610 }, { "epoch": 0.6490447526825438, "grad_norm": 13.739964485168457, "learning_rate": 1.6530691736402317e-06, "logits/chosen": -2.752707004547119, "logits/rejected": -2.7096076011657715, "logps/chosen": -272.6566467285156, "logps/rejected": -246.8385467529297, "loss": 0.4911, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.008082455024123192, "rewards/margins": 0.9637606739997864, "rewards/rejected": -0.9718431234359741, "step": 620 }, { "epoch": 0.6595132164354881, "grad_norm": 15.237521171569824, "learning_rate": 1.5676295169786864e-06, "logits/chosen": -2.6328330039978027, "logits/rejected": -2.6349148750305176, "logps/chosen": -243.0568389892578, "logps/rejected": -257.5806884765625, "loss": 0.4978, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.11530411243438721, "rewards/margins": 1.1866023540496826, "rewards/rejected": -1.071298360824585, "step": 630 }, { "epoch": 0.6699816801884323, "grad_norm": 14.393755912780762, "learning_rate": 1.4834368231970922e-06, "logits/chosen": -2.745518922805786, "logits/rejected": -2.7395710945129395, "logps/chosen": -261.69219970703125, "logps/rejected": -273.5394592285156, "loss": 0.4615, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.13604871928691864, "rewards/margins": 1.1850334405899048, "rewards/rejected": -1.0489846467971802, "step": 640 }, { "epoch": 0.6804501439413766, "grad_norm": 16.45931625366211, "learning_rate": 1.4006036925609245e-06, "logits/chosen": -2.7178597450256348, "logits/rejected": -2.701998472213745, "logps/chosen": -257.53363037109375, "logps/rejected": -260.20428466796875, "loss": 0.5055, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": 0.378938764333725, "rewards/margins": 1.3896540403366089, "rewards/rejected": -1.0107152462005615, "step": 650 }, { "epoch": 0.6909186076943209, "grad_norm": 16.44599151611328, "learning_rate": 1.3192409070404582e-06, "logits/chosen": -2.668501615524292, "logits/rejected": -2.657517671585083, "logps/chosen": -266.4493408203125, "logps/rejected": -272.10491943359375, "loss": 0.5126, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.3282933533191681, "rewards/margins": 1.1049590110778809, "rewards/rejected": -0.7766658067703247, "step": 660 }, { "epoch": 0.7013870714472651, "grad_norm": 16.747392654418945, "learning_rate": 1.2394572821496953e-06, "logits/chosen": -2.720128059387207, "logits/rejected": -2.704305648803711, "logps/chosen": -274.4294738769531, "logps/rejected": -278.8267822265625, "loss": 0.5283, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.37905776500701904, "rewards/margins": 1.1339893341064453, "rewards/rejected": -0.7549317479133606, "step": 670 }, { "epoch": 0.7118555352002094, "grad_norm": 14.092031478881836, "learning_rate": 1.1613595214152713e-06, "logits/chosen": -2.725722074508667, "logits/rejected": -2.7022783756256104, "logps/chosen": -272.71630859375, "logps/rejected": -235.8934783935547, "loss": 0.5253, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.28024324774742126, "rewards/margins": 1.039801001548767, "rewards/rejected": -0.7595577836036682, "step": 680 }, { "epoch": 0.7223239989531536, "grad_norm": 12.211508750915527, "learning_rate": 1.0850520736699362e-06, "logits/chosen": -2.758209705352783, "logits/rejected": -2.7124698162078857, "logps/chosen": -298.98974609375, "logps/rejected": -265.95648193359375, "loss": 0.4786, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.3048846125602722, "rewards/margins": 1.1615703105926514, "rewards/rejected": -0.8566857576370239, "step": 690 }, { "epoch": 0.7327924627060979, "grad_norm": 17.241064071655273, "learning_rate": 1.0106369933615043e-06, "logits/chosen": -2.682668924331665, "logits/rejected": -2.677841901779175, "logps/chosen": -243.40737915039062, "logps/rejected": -249.0398406982422, "loss": 0.5057, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.09061723947525024, "rewards/margins": 0.9352877736091614, "rewards/rejected": -0.8446704745292664, "step": 700 }, { "epoch": 0.7327924627060979, "eval_logits/chosen": -2.740434408187866, "eval_logits/rejected": -2.7206947803497314, "eval_logps/chosen": -271.9793701171875, "eval_logps/rejected": -276.6876220703125, "eval_loss": 0.5084052085876465, "eval_rewards/accuracies": 0.7202380895614624, "eval_rewards/chosen": -0.010822229087352753, "eval_rewards/margins": 0.8940958976745605, "eval_rewards/rejected": -0.9049180150032043, "eval_runtime": 353.4167, "eval_samples_per_second": 5.659, "eval_steps_per_second": 0.178, "step": 700 }, { "epoch": 0.7432609264590422, "grad_norm": 14.434347152709961, "learning_rate": 9.382138040640714e-07, "logits/chosen": -2.7248692512512207, "logits/rejected": -2.7028727531433105, "logps/chosen": -248.70083618164062, "logps/rejected": -249.9818878173828, "loss": 0.523, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.020344754680991173, "rewards/margins": 0.7711048722267151, "rewards/rejected": -0.7914497256278992, "step": 710 }, { "epoch": 0.7537293902119864, "grad_norm": 18.781707763671875, "learning_rate": 8.678793653740633e-07, "logits/chosen": -2.6842763423919678, "logits/rejected": -2.655897378921509, "logps/chosen": -227.59765625, "logps/rejected": -242.20462036132812, "loss": 0.5119, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.029393959790468216, "rewards/margins": 1.0940216779708862, "rewards/rejected": -1.1234157085418701, "step": 720 }, { "epoch": 0.7641978539649307, "grad_norm": 14.272910118103027, "learning_rate": 7.997277433690984e-07, "logits/chosen": -2.763821601867676, "logits/rejected": -2.7315735816955566, "logps/chosen": -260.8923034667969, "logps/rejected": -242.97708129882812, "loss": 0.4847, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.01939038559794426, "rewards/margins": 1.0031776428222656, "rewards/rejected": -0.9837873578071594, "step": 730 }, { "epoch": 0.7746663177178749, "grad_norm": 14.98263931274414, "learning_rate": 7.338500848029603e-07, "logits/chosen": -2.6752095222473145, "logits/rejected": -2.657548427581787, "logps/chosen": -277.3648376464844, "logps/rejected": -270.6456604003906, "loss": 0.5273, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.2628101706504822, "rewards/margins": 1.2527306079864502, "rewards/rejected": -0.9899206161499023, "step": 740 }, { "epoch": 0.7851347814708192, "grad_norm": 13.564722061157227, "learning_rate": 6.70334495204884e-07, "logits/chosen": -2.6619467735290527, "logits/rejected": -2.678673267364502, "logps/chosen": -261.7742919921875, "logps/rejected": -279.1748046875, "loss": 0.5415, "rewards/accuracies": 0.71875, "rewards/chosen": 0.09365497529506683, "rewards/margins": 0.9422794580459595, "rewards/rejected": -0.8486245274543762, "step": 750 }, { "epoch": 0.7956032452237635, "grad_norm": 15.380537986755371, "learning_rate": 6.092659210462232e-07, "logits/chosen": -2.7603864669799805, "logits/rejected": -2.740291118621826, "logps/chosen": -277.1561279296875, "logps/rejected": -247.03018188476562, "loss": 0.5041, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.11306103318929672, "rewards/margins": 1.0758002996444702, "rewards/rejected": -0.9627392888069153, "step": 760 }, { "epoch": 0.8060717089767077, "grad_norm": 12.876290321350098, "learning_rate": 5.507260361320738e-07, "logits/chosen": -2.766484498977661, "logits/rejected": -2.7405753135681152, "logps/chosen": -275.5859069824219, "logps/rejected": -261.8198547363281, "loss": 0.5393, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.04547576978802681, "rewards/margins": 0.9839900732040405, "rewards/rejected": -0.9385143518447876, "step": 770 }, { "epoch": 0.816540172729652, "grad_norm": 14.665273666381836, "learning_rate": 4.947931323697983e-07, "logits/chosen": -2.7336275577545166, "logits/rejected": -2.702479362487793, "logps/chosen": -280.28045654296875, "logps/rejected": -260.28778076171875, "loss": 0.4583, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.07684344798326492, "rewards/margins": 0.9824104309082031, "rewards/rejected": -1.0592539310455322, "step": 780 }, { "epoch": 0.8270086364825961, "grad_norm": 13.474715232849121, "learning_rate": 4.4154201506053985e-07, "logits/chosen": -2.698451519012451, "logits/rejected": -2.6745803356170654, "logps/chosen": -273.3094482421875, "logps/rejected": -273.69232177734375, "loss": 0.5363, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.16311880946159363, "rewards/margins": 0.8299474716186523, "rewards/rejected": -0.9930663108825684, "step": 790 }, { "epoch": 0.8374771002355405, "grad_norm": 18.379581451416016, "learning_rate": 3.910439028537638e-07, "logits/chosen": -2.688326358795166, "logits/rejected": -2.650494337081909, "logps/chosen": -279.04498291015625, "logps/rejected": -262.01470947265625, "loss": 0.5172, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.11311912536621094, "rewards/margins": 1.0069270133972168, "rewards/rejected": -0.8938078880310059, "step": 800 }, { "epoch": 0.8374771002355405, "eval_logits/chosen": -2.7371084690093994, "eval_logits/rejected": -2.717339038848877, "eval_logps/chosen": -272.1710510253906, "eval_logps/rejected": -276.8697204589844, "eval_loss": 0.5089952945709229, "eval_rewards/accuracies": 0.7182539701461792, "eval_rewards/chosen": -0.02998838946223259, "eval_rewards/margins": 0.8931422829627991, "eval_rewards/rejected": -0.9231306910514832, "eval_runtime": 353.2424, "eval_samples_per_second": 5.662, "eval_steps_per_second": 0.178, "step": 800 }, { "epoch": 0.8479455639884846, "grad_norm": 14.40329647064209, "learning_rate": 3.4336633249862084e-07, "logits/chosen": -2.6875698566436768, "logits/rejected": -2.688908100128174, "logps/chosen": -262.572998046875, "logps/rejected": -263.0613708496094, "loss": 0.5058, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.08515429496765137, "rewards/margins": 0.9841421246528625, "rewards/rejected": -0.8989877700805664, "step": 810 }, { "epoch": 0.8584140277414289, "grad_norm": 13.108388900756836, "learning_rate": 2.98573068519539e-07, "logits/chosen": -2.733366012573242, "logits/rejected": -2.6849071979522705, "logps/chosen": -281.99212646484375, "logps/rejected": -253.0430145263672, "loss": 0.4671, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.06205648183822632, "rewards/margins": 1.0704829692840576, "rewards/rejected": -1.0084264278411865, "step": 820 }, { "epoch": 0.8688824914943732, "grad_norm": 12.790390014648438, "learning_rate": 2.5672401793681854e-07, "logits/chosen": -2.754135847091675, "logits/rejected": -2.7304673194885254, "logps/chosen": -255.24227905273438, "logps/rejected": -233.9311981201172, "loss": 0.4682, "rewards/accuracies": 0.78125, "rewards/chosen": 0.1760888546705246, "rewards/margins": 1.146909236907959, "rewards/rejected": -0.970820426940918, "step": 830 }, { "epoch": 0.8793509552473174, "grad_norm": 12.2880220413208, "learning_rate": 2.178751501463036e-07, "logits/chosen": -2.703233480453491, "logits/rejected": -2.676057815551758, "logps/chosen": -288.25238037109375, "logps/rejected": -258.7244567871094, "loss": 0.4983, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.12756434082984924, "rewards/margins": 1.043333649635315, "rewards/rejected": -0.9157692193984985, "step": 840 }, { "epoch": 0.8898194190002617, "grad_norm": 18.457061767578125, "learning_rate": 1.820784220652766e-07, "logits/chosen": -2.769766330718994, "logits/rejected": -2.749628782272339, "logps/chosen": -275.16302490234375, "logps/rejected": -264.35260009765625, "loss": 0.5438, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.1516454517841339, "rewards/margins": 1.0686285495758057, "rewards/rejected": -0.9169832468032837, "step": 850 }, { "epoch": 0.9002878827532059, "grad_norm": 20.659849166870117, "learning_rate": 1.4938170864468636e-07, "logits/chosen": -2.7188377380371094, "logits/rejected": -2.7116434574127197, "logps/chosen": -265.40045166015625, "logps/rejected": -279.65716552734375, "loss": 0.4835, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.013210540637373924, "rewards/margins": 1.0902684926986694, "rewards/rejected": -1.0770580768585205, "step": 860 }, { "epoch": 0.9107563465061502, "grad_norm": 15.20347785949707, "learning_rate": 1.1982873884064466e-07, "logits/chosen": -2.6989123821258545, "logits/rejected": -2.668877124786377, "logps/chosen": -265.70562744140625, "logps/rejected": -227.8814239501953, "loss": 0.5241, "rewards/accuracies": 0.6875, "rewards/chosen": -0.006173081696033478, "rewards/margins": 0.8927028775215149, "rewards/rejected": -0.8988760113716125, "step": 870 }, { "epoch": 0.9212248102590945, "grad_norm": 13.230890274047852, "learning_rate": 9.345903713082305e-08, "logits/chosen": -2.7578580379486084, "logits/rejected": -2.700892210006714, "logps/chosen": -257.47833251953125, "logps/rejected": -217.13882446289062, "loss": 0.4941, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.035571496933698654, "rewards/margins": 1.0173650979995728, "rewards/rejected": -1.0529365539550781, "step": 880 }, { "epoch": 0.9316932740120387, "grad_norm": 13.502944946289062, "learning_rate": 7.030787065396866e-08, "logits/chosen": -2.709721088409424, "logits/rejected": -2.6791884899139404, "logps/chosen": -248.15966796875, "logps/rejected": -225.49337768554688, "loss": 0.5095, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.09806522727012634, "rewards/margins": 0.9047859907150269, "rewards/rejected": -0.8067208528518677, "step": 890 }, { "epoch": 0.942161737764983, "grad_norm": 13.781939506530762, "learning_rate": 5.0406202043228604e-08, "logits/chosen": -2.698310375213623, "logits/rejected": -2.6684579849243164, "logps/chosen": -279.3674621582031, "logps/rejected": -252.4850311279297, "loss": 0.5173, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.18393948674201965, "rewards/margins": 0.9908119440078735, "rewards/rejected": -0.8068723678588867, "step": 900 }, { "epoch": 0.942161737764983, "eval_logits/chosen": -2.737166404724121, "eval_logits/rejected": -2.7174251079559326, "eval_logps/chosen": -271.87890625, "eval_logps/rejected": -276.7181396484375, "eval_loss": 0.5083790421485901, "eval_rewards/accuracies": 0.7222222089767456, "eval_rewards/chosen": -0.0007740448345430195, "eval_rewards/margins": 0.9071968197822571, "eval_rewards/rejected": -0.9079708456993103, "eval_runtime": 353.3783, "eval_samples_per_second": 5.66, "eval_steps_per_second": 0.178, "step": 900 }, { "epoch": 0.9526302015179272, "grad_norm": 12.07172679901123, "learning_rate": 3.378064801637687e-08, "logits/chosen": -2.7228991985321045, "logits/rejected": -2.7141544818878174, "logps/chosen": -273.98095703125, "logps/rejected": -259.7334899902344, "loss": 0.4743, "rewards/accuracies": 0.8125, "rewards/chosen": 0.030942853540182114, "rewards/margins": 1.1606462001800537, "rewards/rejected": -1.1297032833099365, "step": 910 }, { "epoch": 0.9630986652708715, "grad_norm": 13.161744117736816, "learning_rate": 2.0453443778310766e-08, "logits/chosen": -2.7474637031555176, "logits/rejected": -2.731945753097534, "logps/chosen": -264.48785400390625, "logps/rejected": -259.0505065917969, "loss": 0.515, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.009974095039069653, "rewards/margins": 1.0056250095367432, "rewards/rejected": -0.9956509470939636, "step": 920 }, { "epoch": 0.9735671290238157, "grad_norm": 13.763269424438477, "learning_rate": 1.0442413283435759e-08, "logits/chosen": -2.752173900604248, "logits/rejected": -2.7249786853790283, "logps/chosen": -276.56597900390625, "logps/rejected": -276.8932800292969, "loss": 0.516, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.10977672040462494, "rewards/margins": 0.9989693760871887, "rewards/rejected": -0.8891927003860474, "step": 930 }, { "epoch": 0.98403559277676, "grad_norm": 13.344006538391113, "learning_rate": 3.760945397705828e-09, "logits/chosen": -2.750119209289551, "logits/rejected": -2.7376656532287598, "logps/chosen": -282.4700012207031, "logps/rejected": -263.9110107421875, "loss": 0.5178, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.0567372627556324, "rewards/margins": 1.0393383502960205, "rewards/rejected": -0.9826010465621948, "step": 940 }, { "epoch": 0.9945040565297043, "grad_norm": 16.06287956237793, "learning_rate": 4.1797599220405605e-10, "logits/chosen": -2.705085515975952, "logits/rejected": -2.6794345378875732, "logps/chosen": -257.38623046875, "logps/rejected": -263.4278259277344, "loss": 0.505, "rewards/accuracies": 0.6875, "rewards/chosen": -0.005693820305168629, "rewards/margins": 0.9306586384773254, "rewards/rejected": -0.936352550983429, "step": 950 }, { "epoch": 0.9997382884061764, "step": 955, "total_flos": 0.0, "train_loss": 0.5278735879828168, "train_runtime": 28474.7268, "train_samples_per_second": 2.147, "train_steps_per_second": 0.034 } ], "logging_steps": 10, "max_steps": 955, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }