zephyr-7b-dpo-lora / trainer_state.json
Wenboz's picture
Model save
7182753 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9997382884061764,
"eval_steps": 100,
"global_step": 955,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0010468463752944255,
"grad_norm": 13.816486358642578,
"learning_rate": 5.208333333333333e-08,
"logits/chosen": -2.9122443199157715,
"logits/rejected": -2.8823766708374023,
"logps/chosen": -276.3387451171875,
"logps/rejected": -242.270751953125,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.010468463752944255,
"grad_norm": 14.41542911529541,
"learning_rate": 5.208333333333334e-07,
"logits/chosen": -2.8143603801727295,
"logits/rejected": -2.7806081771850586,
"logps/chosen": -269.4888610839844,
"logps/rejected": -283.96014404296875,
"loss": 0.6927,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.0007840646430850029,
"rewards/margins": -0.0032457474153488874,
"rewards/rejected": 0.00246168184094131,
"step": 10
},
{
"epoch": 0.02093692750588851,
"grad_norm": 14.011621475219727,
"learning_rate": 1.0416666666666667e-06,
"logits/chosen": -2.8383383750915527,
"logits/rejected": -2.778038740158081,
"logps/chosen": -289.6752624511719,
"logps/rejected": -246.4437255859375,
"loss": 0.6903,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.0002006460854317993,
"rewards/margins": 0.0035436502657830715,
"rewards/rejected": -0.0033430042676627636,
"step": 20
},
{
"epoch": 0.031405391258832765,
"grad_norm": 13.931632995605469,
"learning_rate": 1.5625e-06,
"logits/chosen": -2.8146023750305176,
"logits/rejected": -2.8039023876190186,
"logps/chosen": -259.8967590332031,
"logps/rejected": -239.6516571044922,
"loss": 0.6843,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.011000868864357471,
"rewards/margins": 0.01744804158806801,
"rewards/rejected": -0.006447173655033112,
"step": 30
},
{
"epoch": 0.04187385501177702,
"grad_norm": 13.794343948364258,
"learning_rate": 2.0833333333333334e-06,
"logits/chosen": -2.823273181915283,
"logits/rejected": -2.7753758430480957,
"logps/chosen": -267.2137451171875,
"logps/rejected": -260.3147277832031,
"loss": 0.6693,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 0.014138467609882355,
"rewards/margins": 0.059490323066711426,
"rewards/rejected": -0.04535185918211937,
"step": 40
},
{
"epoch": 0.05234231876472128,
"grad_norm": 13.499931335449219,
"learning_rate": 2.604166666666667e-06,
"logits/chosen": -2.756686210632324,
"logits/rejected": -2.7536370754241943,
"logps/chosen": -224.6168670654297,
"logps/rejected": -232.76455688476562,
"loss": 0.6412,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": 0.026402924209833145,
"rewards/margins": 0.13416479527950287,
"rewards/rejected": -0.10776187479496002,
"step": 50
},
{
"epoch": 0.06281078251766553,
"grad_norm": 11.923255920410156,
"learning_rate": 3.125e-06,
"logits/chosen": -2.7912814617156982,
"logits/rejected": -2.7667124271392822,
"logps/chosen": -244.9199676513672,
"logps/rejected": -246.4986114501953,
"loss": 0.6221,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": 0.030213266611099243,
"rewards/margins": 0.2403673380613327,
"rewards/rejected": -0.21015405654907227,
"step": 60
},
{
"epoch": 0.07327924627060979,
"grad_norm": 12.119612693786621,
"learning_rate": 3.6458333333333333e-06,
"logits/chosen": -2.8213372230529785,
"logits/rejected": -2.7878777980804443,
"logps/chosen": -299.50091552734375,
"logps/rejected": -263.40069580078125,
"loss": 0.5969,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.06017110496759415,
"rewards/margins": 0.3069015145301819,
"rewards/rejected": -0.24673044681549072,
"step": 70
},
{
"epoch": 0.08374771002355404,
"grad_norm": 15.060250282287598,
"learning_rate": 4.166666666666667e-06,
"logits/chosen": -2.776968479156494,
"logits/rejected": -2.747732639312744,
"logps/chosen": -274.90594482421875,
"logps/rejected": -258.62103271484375,
"loss": 0.6066,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.034456029534339905,
"rewards/margins": 0.39508965611457825,
"rewards/rejected": -0.42954570055007935,
"step": 80
},
{
"epoch": 0.0942161737764983,
"grad_norm": 12.425506591796875,
"learning_rate": 4.6875000000000004e-06,
"logits/chosen": -2.781564950942993,
"logits/rejected": -2.756075620651245,
"logps/chosen": -250.3762664794922,
"logps/rejected": -221.0802459716797,
"loss": 0.5766,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.05863947793841362,
"rewards/margins": 0.37909096479415894,
"rewards/rejected": -0.43773046135902405,
"step": 90
},
{
"epoch": 0.10468463752944256,
"grad_norm": 12.826536178588867,
"learning_rate": 4.9997324926814375e-06,
"logits/chosen": -2.775822401046753,
"logits/rejected": -2.7732839584350586,
"logps/chosen": -267.44232177734375,
"logps/rejected": -250.16455078125,
"loss": 0.5795,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 0.06354306638240814,
"rewards/margins": 0.5622893571853638,
"rewards/rejected": -0.4987463057041168,
"step": 100
},
{
"epoch": 0.10468463752944256,
"eval_logits/chosen": -2.7899813652038574,
"eval_logits/rejected": -2.768789529800415,
"eval_logps/chosen": -271.6063232421875,
"eval_logps/rejected": -271.3592529296875,
"eval_loss": 0.5875207781791687,
"eval_rewards/accuracies": 0.682539701461792,
"eval_rewards/chosen": 0.0264796894043684,
"eval_rewards/margins": 0.3985615074634552,
"eval_rewards/rejected": -0.37208184599876404,
"eval_runtime": 354.4273,
"eval_samples_per_second": 5.643,
"eval_steps_per_second": 0.178,
"step": 100
},
{
"epoch": 0.11515310128238682,
"grad_norm": 11.797159194946289,
"learning_rate": 4.996723692767927e-06,
"logits/chosen": -2.7566726207733154,
"logits/rejected": -2.727776527404785,
"logps/chosen": -275.2736511230469,
"logps/rejected": -252.49301147460938,
"loss": 0.5617,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.06772824376821518,
"rewards/margins": 0.48222237825393677,
"rewards/rejected": -0.414494127035141,
"step": 110
},
{
"epoch": 0.12562156503533106,
"grad_norm": 13.538057327270508,
"learning_rate": 4.9903757462135984e-06,
"logits/chosen": -2.80414080619812,
"logits/rejected": -2.7789015769958496,
"logps/chosen": -260.9825744628906,
"logps/rejected": -256.66082763671875,
"loss": 0.5598,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": 0.07279430329799652,
"rewards/margins": 0.5211631655693054,
"rewards/rejected": -0.4483688771724701,
"step": 120
},
{
"epoch": 0.1360900287882753,
"grad_norm": 17.772846221923828,
"learning_rate": 4.980697142834315e-06,
"logits/chosen": -2.7555739879608154,
"logits/rejected": -2.7485146522521973,
"logps/chosen": -259.66021728515625,
"logps/rejected": -250.82589721679688,
"loss": 0.5679,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 0.04345005005598068,
"rewards/margins": 0.595272958278656,
"rewards/rejected": -0.5518229603767395,
"step": 130
},
{
"epoch": 0.14655849254121958,
"grad_norm": 14.183941841125488,
"learning_rate": 4.967700826904229e-06,
"logits/chosen": -2.789041042327881,
"logits/rejected": -2.7646267414093018,
"logps/chosen": -251.831298828125,
"logps/rejected": -259.8060607910156,
"loss": 0.5732,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.012771936133503914,
"rewards/margins": 0.5387409925460815,
"rewards/rejected": -0.5515128970146179,
"step": 140
},
{
"epoch": 0.15702695629416383,
"grad_norm": 13.352206230163574,
"learning_rate": 4.951404179843963e-06,
"logits/chosen": -2.7706031799316406,
"logits/rejected": -2.777296543121338,
"logps/chosen": -260.20172119140625,
"logps/rejected": -244.833984375,
"loss": 0.5414,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.30398789048194885,
"rewards/margins": 0.5834565162658691,
"rewards/rejected": -0.2794686257839203,
"step": 150
},
{
"epoch": 0.16749542004710807,
"grad_norm": 14.198497772216797,
"learning_rate": 4.931828996974498e-06,
"logits/chosen": -2.8031933307647705,
"logits/rejected": -2.7727789878845215,
"logps/chosen": -282.6216735839844,
"logps/rejected": -249.09408569335938,
"loss": 0.5264,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": 0.7594842314720154,
"rewards/margins": 0.7169784903526306,
"rewards/rejected": 0.04250572994351387,
"step": 160
},
{
"epoch": 0.17796388380005235,
"grad_norm": 14.190126419067383,
"learning_rate": 4.909001458367867e-06,
"logits/chosen": -2.821536064147949,
"logits/rejected": -2.7903571128845215,
"logps/chosen": -268.6175537109375,
"logps/rejected": -254.8057098388672,
"loss": 0.5492,
"rewards/accuracies": 0.71875,
"rewards/chosen": 0.7267504930496216,
"rewards/margins": 0.6977671980857849,
"rewards/rejected": 0.028983300551772118,
"step": 170
},
{
"epoch": 0.1884323475529966,
"grad_norm": 14.373480796813965,
"learning_rate": 4.882952093833628e-06,
"logits/chosen": -2.828423023223877,
"logits/rejected": -2.818983554840088,
"logps/chosen": -246.63040161132812,
"logps/rejected": -241.3114471435547,
"loss": 0.5687,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.4481576979160309,
"rewards/margins": 0.6290773153305054,
"rewards/rejected": -0.18091967701911926,
"step": 180
},
{
"epoch": 0.19890081130594087,
"grad_norm": 12.725995063781738,
"learning_rate": 4.853715742087947e-06,
"logits/chosen": -2.8280279636383057,
"logits/rejected": -2.793407917022705,
"logps/chosen": -296.0823669433594,
"logps/rejected": -275.6176452636719,
"loss": 0.538,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.4707435667514801,
"rewards/margins": 0.7004532814025879,
"rewards/rejected": -0.22970974445343018,
"step": 190
},
{
"epoch": 0.2093692750588851,
"grad_norm": 16.12042236328125,
"learning_rate": 4.821331504159906e-06,
"logits/chosen": -2.7835636138916016,
"logits/rejected": -2.7702224254608154,
"logps/chosen": -249.65725708007812,
"logps/rejected": -258.82049560546875,
"loss": 0.5449,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 0.3579314351081848,
"rewards/margins": 0.6976887583732605,
"rewards/rejected": -0.3397572934627533,
"step": 200
},
{
"epoch": 0.2093692750588851,
"eval_logits/chosen": -2.7981207370758057,
"eval_logits/rejected": -2.779163122177124,
"eval_logps/chosen": -271.27044677734375,
"eval_logps/rejected": -273.3644714355469,
"eval_loss": 0.5519838333129883,
"eval_rewards/accuracies": 0.7103174328804016,
"eval_rewards/chosen": 0.0600733757019043,
"eval_rewards/margins": 0.6326771974563599,
"eval_rewards/rejected": -0.5726038217544556,
"eval_runtime": 353.4661,
"eval_samples_per_second": 5.658,
"eval_steps_per_second": 0.178,
"step": 200
},
{
"epoch": 0.21983773881182936,
"grad_norm": 15.229165077209473,
"learning_rate": 4.7858426910973435e-06,
"logits/chosen": -2.8110532760620117,
"logits/rejected": -2.785770893096924,
"logps/chosen": -290.8441467285156,
"logps/rejected": -283.64971923828125,
"loss": 0.5339,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": 0.17869320511817932,
"rewards/margins": 0.780803382396698,
"rewards/rejected": -0.6021102070808411,
"step": 210
},
{
"epoch": 0.23030620256477363,
"grad_norm": 11.427948951721191,
"learning_rate": 4.747296766042161e-06,
"logits/chosen": -2.761380434036255,
"logits/rejected": -2.7642910480499268,
"logps/chosen": -280.3062744140625,
"logps/rejected": -276.4909973144531,
"loss": 0.5184,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": 0.01649661734700203,
"rewards/margins": 0.7951753735542297,
"rewards/rejected": -0.7786787748336792,
"step": 220
},
{
"epoch": 0.24077466631771788,
"grad_norm": 15.31595230102539,
"learning_rate": 4.705745280752586e-06,
"logits/chosen": -2.8096513748168945,
"logits/rejected": -2.7831692695617676,
"logps/chosen": -272.4754943847656,
"logps/rejected": -266.40240478515625,
"loss": 0.524,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": 0.3214190900325775,
"rewards/margins": 0.831048846244812,
"rewards/rejected": -0.5096298456192017,
"step": 230
},
{
"epoch": 0.2512431300706621,
"grad_norm": 16.855459213256836,
"learning_rate": 4.661243806657256e-06,
"logits/chosen": -2.790187120437622,
"logits/rejected": -2.751081705093384,
"logps/chosen": -297.11236572265625,
"logps/rejected": -265.01123046875,
"loss": 0.5584,
"rewards/accuracies": 0.71875,
"rewards/chosen": 0.5351754426956177,
"rewards/margins": 0.7664622664451599,
"rewards/rejected": -0.23128685355186462,
"step": 240
},
{
"epoch": 0.26171159382360637,
"grad_norm": 13.651627540588379,
"learning_rate": 4.613851860533367e-06,
"logits/chosen": -2.788170576095581,
"logits/rejected": -2.7668204307556152,
"logps/chosen": -260.80865478515625,
"logps/rejected": -255.9954376220703,
"loss": 0.5141,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": 0.4738442301750183,
"rewards/margins": 1.0687077045440674,
"rewards/rejected": -0.5948633551597595,
"step": 250
},
{
"epoch": 0.2721800575765506,
"grad_norm": 15.22368049621582,
"learning_rate": 4.563632824908252e-06,
"logits/chosen": -2.8234317302703857,
"logits/rejected": -2.7849972248077393,
"logps/chosen": -297.3622741699219,
"logps/rejected": -288.5556945800781,
"loss": 0.5374,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": 0.3837077021598816,
"rewards/margins": 0.7442808747291565,
"rewards/rejected": -0.3605732023715973,
"step": 260
},
{
"epoch": 0.2826485213294949,
"grad_norm": 13.450421333312988,
"learning_rate": 4.510653863290871e-06,
"logits/chosen": -2.7682323455810547,
"logits/rejected": -2.7710976600646973,
"logps/chosen": -264.67218017578125,
"logps/rejected": -262.8077697753906,
"loss": 0.5469,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.5086392164230347,
"rewards/margins": 0.8164563179016113,
"rewards/rejected": -0.3078171908855438,
"step": 270
},
{
"epoch": 0.29311698508243916,
"grad_norm": 13.034257888793945,
"learning_rate": 4.454985830346574e-06,
"logits/chosen": -2.7635130882263184,
"logits/rejected": -2.730128049850464,
"logps/chosen": -263.2830810546875,
"logps/rejected": -227.2650604248047,
"loss": 0.5155,
"rewards/accuracies": 0.78125,
"rewards/chosen": 0.6867417693138123,
"rewards/margins": 0.9631470441818237,
"rewards/rejected": -0.2764051854610443,
"step": 280
},
{
"epoch": 0.3035854488353834,
"grad_norm": 11.6800537109375,
"learning_rate": 4.396703177135262e-06,
"logits/chosen": -2.741833209991455,
"logits/rejected": -2.7163052558898926,
"logps/chosen": -271.12530517578125,
"logps/rejected": -264.3458557128906,
"loss": 0.5064,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": 0.34893399477005005,
"rewards/margins": 0.8408756256103516,
"rewards/rejected": -0.49194154143333435,
"step": 290
},
{
"epoch": 0.31405391258832765,
"grad_norm": 15.404385566711426,
"learning_rate": 4.335883851539693e-06,
"logits/chosen": -2.7640957832336426,
"logits/rejected": -2.7771594524383545,
"logps/chosen": -268.81573486328125,
"logps/rejected": -250.0668182373047,
"loss": 0.545,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.04025740176439285,
"rewards/margins": 0.6916046738624573,
"rewards/rejected": -0.651347279548645,
"step": 300
},
{
"epoch": 0.31405391258832765,
"eval_logits/chosen": -2.780329942703247,
"eval_logits/rejected": -2.7615909576416016,
"eval_logps/chosen": -272.0685729980469,
"eval_logps/rejected": -275.2751159667969,
"eval_loss": 0.5320433378219604,
"eval_rewards/accuracies": 0.704365074634552,
"eval_rewards/chosen": -0.019741566851735115,
"eval_rewards/margins": 0.7439272999763489,
"eval_rewards/rejected": -0.7636688351631165,
"eval_runtime": 353.3156,
"eval_samples_per_second": 5.661,
"eval_steps_per_second": 0.178,
"step": 300
},
{
"epoch": 0.3245223763412719,
"grad_norm": 15.563763618469238,
"learning_rate": 4.2726091940171055e-06,
"logits/chosen": -2.829540729522705,
"logits/rejected": -2.7938830852508545,
"logps/chosen": -332.7828674316406,
"logps/rejected": -280.2691955566406,
"loss": 0.5152,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.09140340238809586,
"rewards/margins": 0.7919255495071411,
"rewards/rejected": -0.7005220651626587,
"step": 310
},
{
"epoch": 0.33499084009421615,
"grad_norm": 16.867467880249023,
"learning_rate": 4.206963828813555e-06,
"logits/chosen": -2.788259983062744,
"logits/rejected": -2.7637429237365723,
"logps/chosen": -271.119140625,
"logps/rejected": -276.2804260253906,
"loss": 0.5178,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.10287600755691528,
"rewards/margins": 0.8704935908317566,
"rewards/rejected": -0.9733695983886719,
"step": 320
},
{
"epoch": 0.34545930384716045,
"grad_norm": 14.005449295043945,
"learning_rate": 4.139035550786495e-06,
"logits/chosen": -2.7821526527404785,
"logits/rejected": -2.754249095916748,
"logps/chosen": -268.0732421875,
"logps/rejected": -225.94900512695312,
"loss": 0.51,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.022264836356043816,
"rewards/margins": 0.8124101758003235,
"rewards/rejected": -0.7901453971862793,
"step": 330
},
{
"epoch": 0.3559277676001047,
"grad_norm": 17.381444931030273,
"learning_rate": 4.068915207986931e-06,
"logits/chosen": -2.771470069885254,
"logits/rejected": -2.7284042835235596,
"logps/chosen": -257.30120849609375,
"logps/rejected": -253.9716339111328,
"loss": 0.5152,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.1510663479566574,
"rewards/margins": 0.8377777338027954,
"rewards/rejected": -0.686711311340332,
"step": 340
},
{
"epoch": 0.36639623135304894,
"grad_norm": 13.891199111938477,
"learning_rate": 3.996696580158211e-06,
"logits/chosen": -2.793546676635742,
"logits/rejected": -2.7672958374023438,
"logps/chosen": -294.7528076171875,
"logps/rejected": -245.27969360351562,
"loss": 0.5194,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.3805600702762604,
"rewards/margins": 0.7021139860153198,
"rewards/rejected": -0.32155394554138184,
"step": 350
},
{
"epoch": 0.3768646951059932,
"grad_norm": 14.259988784790039,
"learning_rate": 3.922476253313921e-06,
"logits/chosen": -2.741086483001709,
"logits/rejected": -2.7274608612060547,
"logps/chosen": -274.81207275390625,
"logps/rejected": -242.241943359375,
"loss": 0.5059,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.2537211775779724,
"rewards/margins": 0.7949485778808594,
"rewards/rejected": -0.5412274599075317,
"step": 360
},
{
"epoch": 0.38733315885893743,
"grad_norm": 19.930835723876953,
"learning_rate": 3.846353490562664e-06,
"logits/chosen": -2.715400218963623,
"logits/rejected": -2.6755480766296387,
"logps/chosen": -266.83074951171875,
"logps/rejected": -271.99041748046875,
"loss": 0.5134,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.2603386640548706,
"rewards/margins": 1.0200598239898682,
"rewards/rejected": -1.2803986072540283,
"step": 370
},
{
"epoch": 0.39780162261188173,
"grad_norm": 15.25166130065918,
"learning_rate": 3.768430099352445e-06,
"logits/chosen": -2.769425868988037,
"logits/rejected": -2.723906993865967,
"logps/chosen": -267.3630065917969,
"logps/rejected": -261.64910888671875,
"loss": 0.5334,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.059519242495298386,
"rewards/margins": 0.8974593877792358,
"rewards/rejected": -0.9569786787033081,
"step": 380
},
{
"epoch": 0.408270086364826,
"grad_norm": 13.465699195861816,
"learning_rate": 3.6888102953122307e-06,
"logits/chosen": -2.772050619125366,
"logits/rejected": -2.7556827068328857,
"logps/chosen": -254.5989532470703,
"logps/rejected": -253.4079132080078,
"loss": 0.5046,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": 0.2735213041305542,
"rewards/margins": 0.8997882008552551,
"rewards/rejected": -0.6262668967247009,
"step": 390
},
{
"epoch": 0.4187385501177702,
"grad_norm": 17.0921573638916,
"learning_rate": 3.607600562872785e-06,
"logits/chosen": -2.7273311614990234,
"logits/rejected": -2.7061333656311035,
"logps/chosen": -275.65447998046875,
"logps/rejected": -268.2806396484375,
"loss": 0.4747,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": 0.17162422835826874,
"rewards/margins": 1.1053993701934814,
"rewards/rejected": -0.9337752461433411,
"step": 400
},
{
"epoch": 0.4187385501177702,
"eval_logits/chosen": -2.7731950283050537,
"eval_logits/rejected": -2.753185510635376,
"eval_logps/chosen": -273.5996398925781,
"eval_logps/rejected": -277.1651306152344,
"eval_loss": 0.5228143334388733,
"eval_rewards/accuracies": 0.7003968358039856,
"eval_rewards/chosen": -0.1728479415178299,
"eval_rewards/margins": 0.7798227071762085,
"eval_rewards/rejected": -0.952670693397522,
"eval_runtime": 353.2859,
"eval_samples_per_second": 5.661,
"eval_steps_per_second": 0.178,
"step": 400
},
{
"epoch": 0.42920701387071447,
"grad_norm": 14.339557647705078,
"learning_rate": 3.5249095128531863e-06,
"logits/chosen": -2.664485454559326,
"logits/rejected": -2.6545228958129883,
"logps/chosen": -247.5968780517578,
"logps/rejected": -247.5493621826172,
"loss": 0.4833,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -0.0953296571969986,
"rewards/margins": 1.0826025009155273,
"rewards/rejected": -1.1779320240020752,
"step": 410
},
{
"epoch": 0.4396754776236587,
"grad_norm": 17.868024826049805,
"learning_rate": 3.4408477372034743e-06,
"logits/chosen": -2.716871738433838,
"logits/rejected": -2.7016210556030273,
"logps/chosen": -246.01736450195312,
"logps/rejected": -259.8745422363281,
"loss": 0.4955,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -0.052067507058382034,
"rewards/margins": 1.0310100317001343,
"rewards/rejected": -1.0830775499343872,
"step": 420
},
{
"epoch": 0.45014394137660296,
"grad_norm": 16.81452178955078,
"learning_rate": 3.355527661097728e-06,
"logits/chosen": -2.75317645072937,
"logits/rejected": -2.7509617805480957,
"logps/chosen": -250.8342742919922,
"logps/rejected": -248.368408203125,
"loss": 0.529,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 0.0890086218714714,
"rewards/margins": 1.041926622390747,
"rewards/rejected": -0.9529180526733398,
"step": 430
},
{
"epoch": 0.46061240512954726,
"grad_norm": 14.235637664794922,
"learning_rate": 3.269063392575352e-06,
"logits/chosen": -2.807570695877075,
"logits/rejected": -2.744551420211792,
"logps/chosen": -296.353759765625,
"logps/rejected": -257.65374755859375,
"loss": 0.4776,
"rewards/accuracies": 0.71875,
"rewards/chosen": 0.2943809926509857,
"rewards/margins": 1.0381492376327515,
"rewards/rejected": -0.7437682747840881,
"step": 440
},
{
"epoch": 0.4710808688824915,
"grad_norm": 11.64111042022705,
"learning_rate": 3.181570569931697e-06,
"logits/chosen": -2.8272039890289307,
"logits/rejected": -2.7998225688934326,
"logps/chosen": -298.7302551269531,
"logps/rejected": -261.15606689453125,
"loss": 0.4891,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": 0.18123969435691833,
"rewards/margins": 0.7455132007598877,
"rewards/rejected": -0.564273476600647,
"step": 450
},
{
"epoch": 0.48154933263543576,
"grad_norm": 14.514245986938477,
"learning_rate": 3.09316620706208e-06,
"logits/chosen": -2.803687334060669,
"logits/rejected": -2.788764715194702,
"logps/chosen": -294.7230224609375,
"logps/rejected": -290.67449951171875,
"loss": 0.5102,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": 0.22936221957206726,
"rewards/margins": 1.060276985168457,
"rewards/rejected": -0.8309147953987122,
"step": 460
},
{
"epoch": 0.49201779638838,
"grad_norm": 19.26064682006836,
"learning_rate": 3.0039685369660785e-06,
"logits/chosen": -2.745448589324951,
"logits/rejected": -2.7057714462280273,
"logps/chosen": -275.9302978515625,
"logps/rejected": -271.31964111328125,
"loss": 0.5009,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.40928229689598083,
"rewards/margins": 1.3523207902908325,
"rewards/rejected": -0.9430384635925293,
"step": 470
},
{
"epoch": 0.5024862601413242,
"grad_norm": 15.569584846496582,
"learning_rate": 2.91409685362137e-06,
"logits/chosen": -2.728668689727783,
"logits/rejected": -2.7024617195129395,
"logps/chosen": -280.735595703125,
"logps/rejected": -284.4299621582031,
"loss": 0.5371,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.218542218208313,
"rewards/margins": 0.9496285319328308,
"rewards/rejected": -0.7310863137245178,
"step": 480
},
{
"epoch": 0.5129547238942685,
"grad_norm": 13.441338539123535,
"learning_rate": 2.8236713524386085e-06,
"logits/chosen": -2.780989408493042,
"logits/rejected": -2.7283778190612793,
"logps/chosen": -251.0142364501953,
"logps/rejected": -233.73703002929688,
"loss": 0.5264,
"rewards/accuracies": 0.71875,
"rewards/chosen": 0.3006051182746887,
"rewards/margins": 1.0492786169052124,
"rewards/rejected": -0.7486735582351685,
"step": 490
},
{
"epoch": 0.5234231876472127,
"grad_norm": 15.913731575012207,
"learning_rate": 2.7328129695107205e-06,
"logits/chosen": -2.6493608951568604,
"logits/rejected": -2.6593687534332275,
"logps/chosen": -285.948974609375,
"logps/rejected": -264.4158630371094,
"loss": 0.5367,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.08400087058544159,
"rewards/margins": 0.9130845069885254,
"rewards/rejected": -0.997085452079773,
"step": 500
},
{
"epoch": 0.5234231876472127,
"eval_logits/chosen": -2.753965139389038,
"eval_logits/rejected": -2.733893632888794,
"eval_logps/chosen": -274.0134582519531,
"eval_logps/rejected": -278.07373046875,
"eval_loss": 0.5174898505210876,
"eval_rewards/accuracies": 0.7142857313156128,
"eval_rewards/chosen": -0.21423013508319855,
"eval_rewards/margins": 0.8292967677116394,
"eval_rewards/rejected": -1.0435270071029663,
"eval_runtime": 353.1034,
"eval_samples_per_second": 5.664,
"eval_steps_per_second": 0.178,
"step": 500
},
{
"epoch": 0.533891651400157,
"grad_norm": 15.676944732666016,
"learning_rate": 2.641643219871597e-06,
"logits/chosen": -2.719494581222534,
"logits/rejected": -2.7037150859832764,
"logps/chosen": -294.9654235839844,
"logps/rejected": -266.7233581542969,
"loss": 0.5713,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.23952999711036682,
"rewards/margins": 0.886762261390686,
"rewards/rejected": -1.126292109489441,
"step": 510
},
{
"epoch": 0.5443601151531012,
"grad_norm": 13.938344955444336,
"learning_rate": 2.5502840349805074e-06,
"logits/chosen": -2.696988821029663,
"logits/rejected": -2.683870792388916,
"logps/chosen": -269.9395446777344,
"logps/rejected": -262.0766906738281,
"loss": 0.49,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.01678340695798397,
"rewards/margins": 0.9905561208724976,
"rewards/rejected": -1.0073394775390625,
"step": 520
},
{
"epoch": 0.5548285789060455,
"grad_norm": 15.948507308959961,
"learning_rate": 2.4588575996495797e-06,
"logits/chosen": -2.7424683570861816,
"logits/rejected": -2.694587230682373,
"logps/chosen": -284.1721496582031,
"logps/rejected": -278.7875671386719,
"loss": 0.5118,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": 0.3610132336616516,
"rewards/margins": 0.8961461186408997,
"rewards/rejected": -0.5351330041885376,
"step": 530
},
{
"epoch": 0.5652970426589898,
"grad_norm": 12.358839988708496,
"learning_rate": 2.367486188632446e-06,
"logits/chosen": -2.6885437965393066,
"logits/rejected": -2.6670284271240234,
"logps/chosen": -256.28900146484375,
"logps/rejected": -221.53140258789062,
"loss": 0.4942,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": 0.3476331830024719,
"rewards/margins": 0.8925381898880005,
"rewards/rejected": -0.5449050068855286,
"step": 540
},
{
"epoch": 0.575765506411934,
"grad_norm": 15.99252986907959,
"learning_rate": 2.276292003092593e-06,
"logits/chosen": -2.733283281326294,
"logits/rejected": -2.7206578254699707,
"logps/chosen": -258.4396667480469,
"logps/rejected": -267.3887634277344,
"loss": 0.5156,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": 0.19142115116119385,
"rewards/margins": 0.8478537797927856,
"rewards/rejected": -0.656432569026947,
"step": 550
},
{
"epoch": 0.5862339701648783,
"grad_norm": 17.259794235229492,
"learning_rate": 2.1853970071701415e-06,
"logits/chosen": -2.7236075401306152,
"logits/rejected": -2.697274923324585,
"logps/chosen": -275.6216735839844,
"logps/rejected": -264.6971740722656,
"loss": 0.5283,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.0764341875910759,
"rewards/margins": 0.8573528528213501,
"rewards/rejected": -0.7809187173843384,
"step": 560
},
{
"epoch": 0.5967024339178225,
"grad_norm": 15.168867111206055,
"learning_rate": 2.0949227648656194e-06,
"logits/chosen": -2.74639630317688,
"logits/rejected": -2.7061123847961426,
"logps/chosen": -275.2660217285156,
"logps/rejected": -243.61441040039062,
"loss": 0.4809,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": 0.16716930270195007,
"rewards/margins": 1.0621994733810425,
"rewards/rejected": -0.89503014087677,
"step": 570
},
{
"epoch": 0.6071708976707668,
"grad_norm": 16.580690383911133,
"learning_rate": 2.00499027745888e-06,
"logits/chosen": -2.6987080574035645,
"logits/rejected": -2.6702322959899902,
"logps/chosen": -251.2617950439453,
"logps/rejected": -249.33554077148438,
"loss": 0.4982,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": 0.08427687734365463,
"rewards/margins": 1.002518653869629,
"rewards/rejected": -0.9182417988777161,
"step": 580
},
{
"epoch": 0.6176393614237111,
"grad_norm": 13.715599060058594,
"learning_rate": 1.915719821680624e-06,
"logits/chosen": -2.7376370429992676,
"logits/rejected": -2.695758819580078,
"logps/chosen": -288.4046325683594,
"logps/rejected": -244.7723388671875,
"loss": 0.5487,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.006659612059593201,
"rewards/margins": 0.9512457847595215,
"rewards/rejected": -0.9579054117202759,
"step": 590
},
{
"epoch": 0.6281078251766553,
"grad_norm": 19.28061294555664,
"learning_rate": 1.8272307888529276e-06,
"logits/chosen": -2.720449924468994,
"logits/rejected": -2.7100579738616943,
"logps/chosen": -256.0718994140625,
"logps/rejected": -238.8487548828125,
"loss": 0.5031,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.2969541549682617,
"rewards/margins": 0.761928915977478,
"rewards/rejected": -1.0588830709457397,
"step": 600
},
{
"epoch": 0.6281078251766553,
"eval_logits/chosen": -2.726771831512451,
"eval_logits/rejected": -2.7070980072021484,
"eval_logps/chosen": -274.8104553222656,
"eval_logps/rejected": -278.967041015625,
"eval_loss": 0.513893723487854,
"eval_rewards/accuracies": 0.7023809552192688,
"eval_rewards/chosen": -0.2939308285713196,
"eval_rewards/margins": 0.8389276266098022,
"eval_rewards/rejected": -1.132858395576477,
"eval_runtime": 353.2373,
"eval_samples_per_second": 5.662,
"eval_steps_per_second": 0.178,
"step": 600
},
{
"epoch": 0.6385762889295996,
"grad_norm": 15.231009483337402,
"learning_rate": 1.739641525213929e-06,
"logits/chosen": -2.688774347305298,
"logits/rejected": -2.6713051795959473,
"logps/chosen": -264.80462646484375,
"logps/rejected": -323.32769775390625,
"loss": 0.4765,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.2533731758594513,
"rewards/margins": 1.022083044052124,
"rewards/rejected": -1.275456190109253,
"step": 610
},
{
"epoch": 0.6490447526825438,
"grad_norm": 13.739964485168457,
"learning_rate": 1.6530691736402317e-06,
"logits/chosen": -2.752707004547119,
"logits/rejected": -2.7096076011657715,
"logps/chosen": -272.6566467285156,
"logps/rejected": -246.8385467529297,
"loss": 0.4911,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.008082455024123192,
"rewards/margins": 0.9637606739997864,
"rewards/rejected": -0.9718431234359741,
"step": 620
},
{
"epoch": 0.6595132164354881,
"grad_norm": 15.237521171569824,
"learning_rate": 1.5676295169786864e-06,
"logits/chosen": -2.6328330039978027,
"logits/rejected": -2.6349148750305176,
"logps/chosen": -243.0568389892578,
"logps/rejected": -257.5806884765625,
"loss": 0.4978,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": 0.11530411243438721,
"rewards/margins": 1.1866023540496826,
"rewards/rejected": -1.071298360824585,
"step": 630
},
{
"epoch": 0.6699816801884323,
"grad_norm": 14.393755912780762,
"learning_rate": 1.4834368231970922e-06,
"logits/chosen": -2.745518922805786,
"logits/rejected": -2.7395710945129395,
"logps/chosen": -261.69219970703125,
"logps/rejected": -273.5394592285156,
"loss": 0.4615,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": 0.13604871928691864,
"rewards/margins": 1.1850334405899048,
"rewards/rejected": -1.0489846467971802,
"step": 640
},
{
"epoch": 0.6804501439413766,
"grad_norm": 16.45931625366211,
"learning_rate": 1.4006036925609245e-06,
"logits/chosen": -2.7178597450256348,
"logits/rejected": -2.701998472213745,
"logps/chosen": -257.53363037109375,
"logps/rejected": -260.20428466796875,
"loss": 0.5055,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": 0.378938764333725,
"rewards/margins": 1.3896540403366089,
"rewards/rejected": -1.0107152462005615,
"step": 650
},
{
"epoch": 0.6909186076943209,
"grad_norm": 16.44599151611328,
"learning_rate": 1.3192409070404582e-06,
"logits/chosen": -2.668501615524292,
"logits/rejected": -2.657517671585083,
"logps/chosen": -266.4493408203125,
"logps/rejected": -272.10491943359375,
"loss": 0.5126,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": 0.3282933533191681,
"rewards/margins": 1.1049590110778809,
"rewards/rejected": -0.7766658067703247,
"step": 660
},
{
"epoch": 0.7013870714472651,
"grad_norm": 16.747392654418945,
"learning_rate": 1.2394572821496953e-06,
"logits/chosen": -2.720128059387207,
"logits/rejected": -2.704305648803711,
"logps/chosen": -274.4294738769531,
"logps/rejected": -278.8267822265625,
"loss": 0.5283,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": 0.37905776500701904,
"rewards/margins": 1.1339893341064453,
"rewards/rejected": -0.7549317479133606,
"step": 670
},
{
"epoch": 0.7118555352002094,
"grad_norm": 14.092031478881836,
"learning_rate": 1.1613595214152713e-06,
"logits/chosen": -2.725722074508667,
"logits/rejected": -2.7022783756256104,
"logps/chosen": -272.71630859375,
"logps/rejected": -235.8934783935547,
"loss": 0.5253,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 0.28024324774742126,
"rewards/margins": 1.039801001548767,
"rewards/rejected": -0.7595577836036682,
"step": 680
},
{
"epoch": 0.7223239989531536,
"grad_norm": 12.211508750915527,
"learning_rate": 1.0850520736699362e-06,
"logits/chosen": -2.758209705352783,
"logits/rejected": -2.7124698162078857,
"logps/chosen": -298.98974609375,
"logps/rejected": -265.95648193359375,
"loss": 0.4786,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": 0.3048846125602722,
"rewards/margins": 1.1615703105926514,
"rewards/rejected": -0.8566857576370239,
"step": 690
},
{
"epoch": 0.7327924627060979,
"grad_norm": 17.241064071655273,
"learning_rate": 1.0106369933615043e-06,
"logits/chosen": -2.682668924331665,
"logits/rejected": -2.677841901779175,
"logps/chosen": -243.40737915039062,
"logps/rejected": -249.0398406982422,
"loss": 0.5057,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.09061723947525024,
"rewards/margins": 0.9352877736091614,
"rewards/rejected": -0.8446704745292664,
"step": 700
},
{
"epoch": 0.7327924627060979,
"eval_logits/chosen": -2.740434408187866,
"eval_logits/rejected": -2.7206947803497314,
"eval_logps/chosen": -271.9793701171875,
"eval_logps/rejected": -276.6876220703125,
"eval_loss": 0.5084052085876465,
"eval_rewards/accuracies": 0.7202380895614624,
"eval_rewards/chosen": -0.010822229087352753,
"eval_rewards/margins": 0.8940958976745605,
"eval_rewards/rejected": -0.9049180150032043,
"eval_runtime": 353.4167,
"eval_samples_per_second": 5.659,
"eval_steps_per_second": 0.178,
"step": 700
},
{
"epoch": 0.7432609264590422,
"grad_norm": 14.434347152709961,
"learning_rate": 9.382138040640714e-07,
"logits/chosen": -2.7248692512512207,
"logits/rejected": -2.7028727531433105,
"logps/chosen": -248.70083618164062,
"logps/rejected": -249.9818878173828,
"loss": 0.523,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.020344754680991173,
"rewards/margins": 0.7711048722267151,
"rewards/rejected": -0.7914497256278992,
"step": 710
},
{
"epoch": 0.7537293902119864,
"grad_norm": 18.781707763671875,
"learning_rate": 8.678793653740633e-07,
"logits/chosen": -2.6842763423919678,
"logits/rejected": -2.655897378921509,
"logps/chosen": -227.59765625,
"logps/rejected": -242.20462036132812,
"loss": 0.5119,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.029393959790468216,
"rewards/margins": 1.0940216779708862,
"rewards/rejected": -1.1234157085418701,
"step": 720
},
{
"epoch": 0.7641978539649307,
"grad_norm": 14.272910118103027,
"learning_rate": 7.997277433690984e-07,
"logits/chosen": -2.763821601867676,
"logits/rejected": -2.7315735816955566,
"logps/chosen": -260.8923034667969,
"logps/rejected": -242.97708129882812,
"loss": 0.4847,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.01939038559794426,
"rewards/margins": 1.0031776428222656,
"rewards/rejected": -0.9837873578071594,
"step": 730
},
{
"epoch": 0.7746663177178749,
"grad_norm": 14.98263931274414,
"learning_rate": 7.338500848029603e-07,
"logits/chosen": -2.6752095222473145,
"logits/rejected": -2.657548427581787,
"logps/chosen": -277.3648376464844,
"logps/rejected": -270.6456604003906,
"loss": 0.5273,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": 0.2628101706504822,
"rewards/margins": 1.2527306079864502,
"rewards/rejected": -0.9899206161499023,
"step": 740
},
{
"epoch": 0.7851347814708192,
"grad_norm": 13.564722061157227,
"learning_rate": 6.70334495204884e-07,
"logits/chosen": -2.6619467735290527,
"logits/rejected": -2.678673267364502,
"logps/chosen": -261.7742919921875,
"logps/rejected": -279.1748046875,
"loss": 0.5415,
"rewards/accuracies": 0.71875,
"rewards/chosen": 0.09365497529506683,
"rewards/margins": 0.9422794580459595,
"rewards/rejected": -0.8486245274543762,
"step": 750
},
{
"epoch": 0.7956032452237635,
"grad_norm": 15.380537986755371,
"learning_rate": 6.092659210462232e-07,
"logits/chosen": -2.7603864669799805,
"logits/rejected": -2.740291118621826,
"logps/chosen": -277.1561279296875,
"logps/rejected": -247.03018188476562,
"loss": 0.5041,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": 0.11306103318929672,
"rewards/margins": 1.0758002996444702,
"rewards/rejected": -0.9627392888069153,
"step": 760
},
{
"epoch": 0.8060717089767077,
"grad_norm": 12.876290321350098,
"learning_rate": 5.507260361320738e-07,
"logits/chosen": -2.766484498977661,
"logits/rejected": -2.7405753135681152,
"logps/chosen": -275.5859069824219,
"logps/rejected": -261.8198547363281,
"loss": 0.5393,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": 0.04547576978802681,
"rewards/margins": 0.9839900732040405,
"rewards/rejected": -0.9385143518447876,
"step": 770
},
{
"epoch": 0.816540172729652,
"grad_norm": 14.665273666381836,
"learning_rate": 4.947931323697983e-07,
"logits/chosen": -2.7336275577545166,
"logits/rejected": -2.702479362487793,
"logps/chosen": -280.28045654296875,
"logps/rejected": -260.28778076171875,
"loss": 0.4583,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.07684344798326492,
"rewards/margins": 0.9824104309082031,
"rewards/rejected": -1.0592539310455322,
"step": 780
},
{
"epoch": 0.8270086364825961,
"grad_norm": 13.474715232849121,
"learning_rate": 4.4154201506053985e-07,
"logits/chosen": -2.698451519012451,
"logits/rejected": -2.6745803356170654,
"logps/chosen": -273.3094482421875,
"logps/rejected": -273.69232177734375,
"loss": 0.5363,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.16311880946159363,
"rewards/margins": 0.8299474716186523,
"rewards/rejected": -0.9930663108825684,
"step": 790
},
{
"epoch": 0.8374771002355405,
"grad_norm": 18.379581451416016,
"learning_rate": 3.910439028537638e-07,
"logits/chosen": -2.688326358795166,
"logits/rejected": -2.650494337081909,
"logps/chosen": -279.04498291015625,
"logps/rejected": -262.01470947265625,
"loss": 0.5172,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.11311912536621094,
"rewards/margins": 1.0069270133972168,
"rewards/rejected": -0.8938078880310059,
"step": 800
},
{
"epoch": 0.8374771002355405,
"eval_logits/chosen": -2.7371084690093994,
"eval_logits/rejected": -2.717339038848877,
"eval_logps/chosen": -272.1710510253906,
"eval_logps/rejected": -276.8697204589844,
"eval_loss": 0.5089952945709229,
"eval_rewards/accuracies": 0.7182539701461792,
"eval_rewards/chosen": -0.02998838946223259,
"eval_rewards/margins": 0.8931422829627991,
"eval_rewards/rejected": -0.9231306910514832,
"eval_runtime": 353.2424,
"eval_samples_per_second": 5.662,
"eval_steps_per_second": 0.178,
"step": 800
},
{
"epoch": 0.8479455639884846,
"grad_norm": 14.40329647064209,
"learning_rate": 3.4336633249862084e-07,
"logits/chosen": -2.6875698566436768,
"logits/rejected": -2.688908100128174,
"logps/chosen": -262.572998046875,
"logps/rejected": -263.0613708496094,
"loss": 0.5058,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": 0.08515429496765137,
"rewards/margins": 0.9841421246528625,
"rewards/rejected": -0.8989877700805664,
"step": 810
},
{
"epoch": 0.8584140277414289,
"grad_norm": 13.108388900756836,
"learning_rate": 2.98573068519539e-07,
"logits/chosen": -2.733366012573242,
"logits/rejected": -2.6849071979522705,
"logps/chosen": -281.99212646484375,
"logps/rejected": -253.0430145263672,
"loss": 0.4671,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": 0.06205648183822632,
"rewards/margins": 1.0704829692840576,
"rewards/rejected": -1.0084264278411865,
"step": 820
},
{
"epoch": 0.8688824914943732,
"grad_norm": 12.790390014648438,
"learning_rate": 2.5672401793681854e-07,
"logits/chosen": -2.754135847091675,
"logits/rejected": -2.7304673194885254,
"logps/chosen": -255.24227905273438,
"logps/rejected": -233.9311981201172,
"loss": 0.4682,
"rewards/accuracies": 0.78125,
"rewards/chosen": 0.1760888546705246,
"rewards/margins": 1.146909236907959,
"rewards/rejected": -0.970820426940918,
"step": 830
},
{
"epoch": 0.8793509552473174,
"grad_norm": 12.2880220413208,
"learning_rate": 2.178751501463036e-07,
"logits/chosen": -2.703233480453491,
"logits/rejected": -2.676057815551758,
"logps/chosen": -288.25238037109375,
"logps/rejected": -258.7244567871094,
"loss": 0.4983,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": 0.12756434082984924,
"rewards/margins": 1.043333649635315,
"rewards/rejected": -0.9157692193984985,
"step": 840
},
{
"epoch": 0.8898194190002617,
"grad_norm": 18.457061767578125,
"learning_rate": 1.820784220652766e-07,
"logits/chosen": -2.769766330718994,
"logits/rejected": -2.749628782272339,
"logps/chosen": -275.16302490234375,
"logps/rejected": -264.35260009765625,
"loss": 0.5438,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": 0.1516454517841339,
"rewards/margins": 1.0686285495758057,
"rewards/rejected": -0.9169832468032837,
"step": 850
},
{
"epoch": 0.9002878827532059,
"grad_norm": 20.659849166870117,
"learning_rate": 1.4938170864468636e-07,
"logits/chosen": -2.7188377380371094,
"logits/rejected": -2.7116434574127197,
"logps/chosen": -265.40045166015625,
"logps/rejected": -279.65716552734375,
"loss": 0.4835,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": 0.013210540637373924,
"rewards/margins": 1.0902684926986694,
"rewards/rejected": -1.0770580768585205,
"step": 860
},
{
"epoch": 0.9107563465061502,
"grad_norm": 15.20347785949707,
"learning_rate": 1.1982873884064466e-07,
"logits/chosen": -2.6989123821258545,
"logits/rejected": -2.668877124786377,
"logps/chosen": -265.70562744140625,
"logps/rejected": -227.8814239501953,
"loss": 0.5241,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.006173081696033478,
"rewards/margins": 0.8927028775215149,
"rewards/rejected": -0.8988760113716125,
"step": 870
},
{
"epoch": 0.9212248102590945,
"grad_norm": 13.230890274047852,
"learning_rate": 9.345903713082305e-08,
"logits/chosen": -2.7578580379486084,
"logits/rejected": -2.700892210006714,
"logps/chosen": -257.47833251953125,
"logps/rejected": -217.13882446289062,
"loss": 0.4941,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.035571496933698654,
"rewards/margins": 1.0173650979995728,
"rewards/rejected": -1.0529365539550781,
"step": 880
},
{
"epoch": 0.9316932740120387,
"grad_norm": 13.502944946289062,
"learning_rate": 7.030787065396866e-08,
"logits/chosen": -2.709721088409424,
"logits/rejected": -2.6791884899139404,
"logps/chosen": -248.15966796875,
"logps/rejected": -225.49337768554688,
"loss": 0.5095,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": 0.09806522727012634,
"rewards/margins": 0.9047859907150269,
"rewards/rejected": -0.8067208528518677,
"step": 890
},
{
"epoch": 0.942161737764983,
"grad_norm": 13.781939506530762,
"learning_rate": 5.0406202043228604e-08,
"logits/chosen": -2.698310375213623,
"logits/rejected": -2.6684579849243164,
"logps/chosen": -279.3674621582031,
"logps/rejected": -252.4850311279297,
"loss": 0.5173,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": 0.18393948674201965,
"rewards/margins": 0.9908119440078735,
"rewards/rejected": -0.8068723678588867,
"step": 900
},
{
"epoch": 0.942161737764983,
"eval_logits/chosen": -2.737166404724121,
"eval_logits/rejected": -2.7174251079559326,
"eval_logps/chosen": -271.87890625,
"eval_logps/rejected": -276.7181396484375,
"eval_loss": 0.5083790421485901,
"eval_rewards/accuracies": 0.7222222089767456,
"eval_rewards/chosen": -0.0007740448345430195,
"eval_rewards/margins": 0.9071968197822571,
"eval_rewards/rejected": -0.9079708456993103,
"eval_runtime": 353.3783,
"eval_samples_per_second": 5.66,
"eval_steps_per_second": 0.178,
"step": 900
},
{
"epoch": 0.9526302015179272,
"grad_norm": 12.07172679901123,
"learning_rate": 3.378064801637687e-08,
"logits/chosen": -2.7228991985321045,
"logits/rejected": -2.7141544818878174,
"logps/chosen": -273.98095703125,
"logps/rejected": -259.7334899902344,
"loss": 0.4743,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.030942853540182114,
"rewards/margins": 1.1606462001800537,
"rewards/rejected": -1.1297032833099365,
"step": 910
},
{
"epoch": 0.9630986652708715,
"grad_norm": 13.161744117736816,
"learning_rate": 2.0453443778310766e-08,
"logits/chosen": -2.7474637031555176,
"logits/rejected": -2.731945753097534,
"logps/chosen": -264.48785400390625,
"logps/rejected": -259.0505065917969,
"loss": 0.515,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": 0.009974095039069653,
"rewards/margins": 1.0056250095367432,
"rewards/rejected": -0.9956509470939636,
"step": 920
},
{
"epoch": 0.9735671290238157,
"grad_norm": 13.763269424438477,
"learning_rate": 1.0442413283435759e-08,
"logits/chosen": -2.752173900604248,
"logits/rejected": -2.7249786853790283,
"logps/chosen": -276.56597900390625,
"logps/rejected": -276.8932800292969,
"loss": 0.516,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": 0.10977672040462494,
"rewards/margins": 0.9989693760871887,
"rewards/rejected": -0.8891927003860474,
"step": 930
},
{
"epoch": 0.98403559277676,
"grad_norm": 13.344006538391113,
"learning_rate": 3.760945397705828e-09,
"logits/chosen": -2.750119209289551,
"logits/rejected": -2.7376656532287598,
"logps/chosen": -282.4700012207031,
"logps/rejected": -263.9110107421875,
"loss": 0.5178,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": 0.0567372627556324,
"rewards/margins": 1.0393383502960205,
"rewards/rejected": -0.9826010465621948,
"step": 940
},
{
"epoch": 0.9945040565297043,
"grad_norm": 16.06287956237793,
"learning_rate": 4.1797599220405605e-10,
"logits/chosen": -2.705085515975952,
"logits/rejected": -2.6794345378875732,
"logps/chosen": -257.38623046875,
"logps/rejected": -263.4278259277344,
"loss": 0.505,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.005693820305168629,
"rewards/margins": 0.9306586384773254,
"rewards/rejected": -0.936352550983429,
"step": 950
},
{
"epoch": 0.9997382884061764,
"step": 955,
"total_flos": 0.0,
"train_loss": 0.5278735879828168,
"train_runtime": 28474.7268,
"train_samples_per_second": 2.147,
"train_steps_per_second": 0.034
}
],
"logging_steps": 10,
"max_steps": 955,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}